From 02b055a96b27c1d3dd601631ca279cd9d8b656c1 Mon Sep 17 00:00:00 2001
From: Tatiana Villa Ema <tatiana@tecnologia-facil.es>
Date: Sat, 25 Apr 2026 17:42:24 +0200
Subject: [PATCH] fix: parser PDF formato Mercadona sin espacio qty-nombre +
 productos por peso

---
 app.py         | 47 +++++++++++++++++++++++++-------
 autocompra7.py | 73 +++++++++++++++++++++++++++++++++++---------------
 2 files changed, 90 insertions(+), 30 deletions(-)

diff --git a/app.py b/app.py
index 6c1d4b7..43d5e1a 100644
--- a/app.py
+++ b/app.py
@@ -71,14 +71,37 @@ _OCR_EXCLUDE = [
 def parsear_texto_ticket(texto):
     """Extrae productos de texto OCR con el mismo formato que los tickets Mercadona."""
     productos = []
-    for line in texto.splitlines():
-        linea = line.strip()
-        if not linea:
+    lines = texto.splitlines()
+    i = 0
+    while i < len(lines):
+        linea = lines[i].strip()
+        if not linea or any(kw in linea.upper() for kw in _OCR_EXCLUDE):
+            i += 1
             continue
-        if any(kw in linea.upper() for kw in _OCR_EXCLUDE):
-            continue
-        # "2 ROLLO HOGAR DOBLE 2,35 4,70"
-        m = re.match(r"^(\d+)\s+(.+?)\s+(\d+[,\.]\d{2})\s+(\d+[,\.]\d{2})$", linea)
+
+        # Weighed product (2 lines): "1BROCOLI" + "1,048 kg 2,60 €/kg 2,72"
+        if i + 1 < len(lines):
+            next_line = lines[i + 1].strip()
+            weight_m = re.search(r"kg.*?(\d+[,.]\d{2})\s*$", next_line)
+            if weight_m and re.match(r"^\d+[A-ZÁÉÍÓÚÑa-záéíóúñ+]", linea):
+                name_m = re.match(r"^(\d+)(.+)$", linea)
+                if name_m:
+                    try:
+                        cantidad = int(name_m.group(1))
+                        precio_total = round(float(weight_m.group(1).replace(",", ".")), 2)
+                        productos.append({
+                            "cantidad":        cantidad,
+                            "producto":        name_m.group(2).strip().upper(),
+                            "precio_unitario": round(precio_total / cantidad, 2),
+                            "precio_total":    precio_total,
+                        })
+                    except (ValueError, ZeroDivisionError):
+                        pass  # e.g. quantity 0: drop the pair instead of crashing
+                    i += 2  # always consume both lines so the weight line is not re-parsed
+                    continue
+
+        # "2ARROZ SOS 1,88 3,76" -> quantity, name, unit price, total
+        m = re.match(r"^(\d+)(.+?)\s+(\d+[,\.]\d{2})\s+(\d+[,\.]\d{2})$", linea)
         if m:
             try:
                 productos.append({
@@ -87,11 +110,13 @@ def parsear_texto_ticket(texto):
                     "precio_unitario": round(float(m.group(3).replace(',', '.')), 2),
                     "precio_total":    round(float(m.group(4).replace(',', '.')), 2),
                 })
+                i += 1
                 continue
             except ValueError:
                 pass
-        # "1 CROISSANT RELL CACAO 1,90"
-        m = re.match(r"^(\d+)\s+(.+?)\s+(\d+[,\.]\d{2})$", linea)
+
+        # "1CROISSANT RELL CACAO 1,90"; a zero quantity is rejected (it is the divisor)
+        m = re.match(r"^(0*[1-9]\d*)(.+?)\s+(\d+[,\.]\d{2})$", linea)
         if m:
             try:
                 cantidad = int(m.group(1))
@@ -102,8 +127,12 @@ def parsear_texto_ticket(texto):
                     "precio_unitario": round(precio / cantidad, 2),
                     "precio_total":    precio,
                 })
+                i += 1
+                continue
             except ValueError:
                 pass
+
+        i += 1
     return productos
 
 # -----------------------------------------------------------------------
diff --git a/autocompra7.py b/autocompra7.py
index cbe74c3..8fc17da 100644
--- a/autocompra7.py
+++ b/autocompra7.py
@@ -36,30 +36,61 @@ def extract_data_from_pdf(file_path):
     fecha = datetime.strptime(date_match.group(1), "%d/%m/%Y") if date_match else None
 
     products = []
+    lines = text.splitlines()
+    i = 0
+    while i < len(lines):
+        line = lines[i].strip()
 
-    for line in text.splitlines():
-        if any(keyword in line for keyword in exclude_keywords):
-            continue
-        
-        # Coincide con líneas tipo: "2 ROLLO HOGAR DOBLE 2,35 4,70"
-        match = re.match(r"(\d+)\s+(.*?)\s+(\d+,\d{2})\s+(\d+,\d{2})$", line)
-        if match:
-            cantidad = int(match.group(1))
-            producto = match.group(2).strip().upper()
-            precio_unitario = float(match.group(3).replace(",", "."))
-            precio_total = float(match.group(4).replace(",", "."))
-            products.append((fecha, cantidad, producto, precio_unitario, precio_total))
+        if not line or any(kw in line for kw in exclude_keywords):
+            i += 1
             continue
 
-        # Coincide con líneas tipo: "1 CROISSANT RELL CACAO 1,90"
-        match_simple = re.match(r"(\d+)\s+(.*?)\s+(\d+,\d{2})$", line)
-        if match_simple:
-            cantidad = int(match_simple.group(1))
-            producto = match_simple.group(2).strip().upper()
-            precio_total = float(match_simple.group(3).replace(",", "."))
-            precio_unitario = precio_total / cantidad
-            products.append((fecha, cantidad, producto, round(precio_unitario, 2), precio_total))
-            continue
+        # Weighed product (2 lines): "1BROCOLI" then "1,048 kg 2,60 €/kg 2,72".
+        # Both lines are consumed even when the pair is dropped (quantity 0).
+        if i + 1 < len(lines):
+            next_line = lines[i + 1].strip()
+            weight_m = re.search(r"kg.*?(\d+[,.]\d{2})\s*$", next_line)
+            if weight_m and re.match(r"^\d+[A-ZÁÉÍÓÚÑa-záéíóúñ+]", line):
+                name_m = re.match(r"^(\d+)(.+)$", line)
+                if name_m:
+                    cantidad = int(name_m.group(1))
+                    producto = name_m.group(2).strip().upper()
+                    precio_total = float(weight_m.group(1).replace(",", "."))
+                    if cantidad > 0:  # the unit price divides by the quantity
+                        products.append((fecha, cantidad, producto,
+                                         round(precio_total / cantidad, 2), precio_total))
+                    i += 2
+                    continue
+
+        # Format: "2ARROZ SOS 1,88 3,76"  →  quantity / name / unit price / total
+        m = re.match(r"^(\d+)(.+?)\s+(\d+,\d{2})\s+(\d+,\d{2})$", line)
+        if m:
+            try:
+                cantidad = int(m.group(1))
+                producto = m.group(2).strip().upper()
+                precio_unitario = float(m.group(3).replace(",", "."))
+                precio_total    = float(m.group(4).replace(",", "."))
+                products.append((fecha, cantidad, producto, precio_unitario, precio_total))
+                i += 1
+                continue
+            except ValueError:
+                pass
+
+        # Format: "1CALABAZA TROZOS 2,50"  →  quantity / name / total
+        m = re.match(r"^(\d+)(.+?)\s+(\d+,\d{2})$", line)
+        if m:
+            try:
+                cantidad = int(m.group(1))
+                producto = m.group(2).strip().upper()
+                precio_total = float(m.group(3).replace(",", "."))
+                products.append((fecha, cantidad, producto,
+                                 round(precio_total / cantidad, 2), precio_total))
+                i += 1
+                continue
+            except (ValueError, ZeroDivisionError):  # quantity 0 -> skip the line
+                pass
+
+        i += 1
 
     return products