fix: parser PDF formato Mercadona sin espacio qty-nombre + productos por peso

2026-04-25 17:42:24 +02:00 · 2026-04-25 17:42:24 +02:00 · 02b055a96b
parent 2e1b35553a
commit 02b055a96b
2 changed files with 90 additions and 30 deletions
--- a/app.py
+++ b/app.py
@@ -71,14 +71,37 @@ _OCR_EXCLUDE = [
 def parsear_texto_ticket(texto):
    """Extrae productos de texto OCR con el mismo formato que los tickets Mercadona."""
    productos = []
-    for line in texto.splitlines():
-        linea = line.strip()
-        if not linea:
+    lines = texto.splitlines()
+    i = 0
+    while i < len(lines):
+        linea = lines[i].strip()
+        if not linea or any(kw in linea.upper() for kw in _OCR_EXCLUDE):
+            i += 1
            continue
-        if any(kw in linea.upper() for kw in _OCR_EXCLUDE):
-            continue
-        # "2 ROLLO HOGAR DOBLE 2,35 4,70"
-        m = re.match(r"^(\d+)\s+(.+?)\s+(\d+[,\.]\d{2})\s+(\d+[,\.]\d{2})$", linea)
+
+        # Producto por peso (2 líneas): "1BROCOLI" + "1,048 kg 2,60 €/kg 2,72"
+        if i + 1 < len(lines):
+            next_line = lines[i + 1].strip()
+            weight_m = re.search(r"kg.*?(\d+[,.]\d{2})\s*$", next_line)
+            if weight_m and re.match(r"^\d+[A-ZÁÉÍÓÚÑa-záéíóúñ+]", linea):
+                name_m = re.match(r"^(\d+)(.+)$", linea)
+                if name_m:
+                    try:
+                        cantidad = int(name_m.group(1))
+                        precio_total = round(float(weight_m.group(1).replace(",", ".")), 2)
+                        productos.append({
+                            "cantidad":        cantidad,
+                            "producto":        name_m.group(2).strip().upper(),
+                            "precio_unitario": round(precio_total / cantidad, 2),
+                            "precio_total":    precio_total,
+                        })
+                        i += 2
+                        continue
+                    except ValueError:
+                        pass
+
+        # "2ARROZ SOS 1,88 3,76"
+        m = re.match(r"^(\d+)(.+?)\s+(\d+[,\.]\d{2})\s+(\d+[,\.]\d{2})$", linea)
        if m:
            try:
                productos.append({
@@ -87,11 +110,13 @@ def parsear_texto_ticket(texto):
                    "precio_unitario": round(float(m.group(3).replace(',', '.')), 2),
                    "precio_total":    round(float(m.group(4).replace(',', '.')), 2),
                })
+                i += 1
                continue
            except ValueError:
                pass
-        # "1 CROISSANT RELL CACAO 1,90"
-        m = re.match(r"^(\d+)\s+(.+?)\s+(\d+[,\.]\d{2})$", linea)
+
+        # "1CROISSANT RELL CACAO 1,90"
+        m = re.match(r"^(\d+)(.+?)\s+(\d+[,\.]\d{2})$", linea)
        if m:
            try:
                cantidad = int(m.group(1))
@@ -102,8 +127,12 @@ def parsear_texto_ticket(texto):
                    "precio_unitario": round(precio / cantidad, 2),
                    "precio_total":    precio,
                })
+                i += 1
+                continue
            except ValueError:
                pass
+
+        i += 1
    return productos

 # -----------------------------------------------------------------------
--- a/autocompra7.py
+++ b/autocompra7.py
@@ -36,30 +36,61 @@ def extract_data_from_pdf(file_path):
    fecha = datetime.strptime(date_match.group(1), "%d/%m/%Y") if date_match else None

    products = []
+    lines = text.splitlines()
+    i = 0
+    while i < len(lines):
+        line = lines[i].strip()

-    for line in text.splitlines():
-        if any(keyword in line for keyword in exclude_keywords):
-            continue
-        
-        # Coincide con líneas tipo: "2 ROLLO HOGAR DOBLE 2,35 4,70"
-        match = re.match(r"(\d+)\s+(.*?)\s+(\d+,\d{2})\s+(\d+,\d{2})$", line)
-        if match:
-            cantidad = int(match.group(1))
-            producto = match.group(2).strip().upper()
-            precio_unitario = float(match.group(3).replace(",", "."))
-            precio_total = float(match.group(4).replace(",", "."))
-            products.append((fecha, cantidad, producto, precio_unitario, precio_total))
+        if not line or any(kw in line for kw in exclude_keywords):
+            i += 1
            continue

-        # Coincide con líneas tipo: "1 CROISSANT RELL CACAO 1,90"
-        match_simple = re.match(r"(\d+)\s+(.*?)\s+(\d+,\d{2})$", line)
-        if match_simple:
-            cantidad = int(match_simple.group(1))
-            producto = match_simple.group(2).strip().upper()
-            precio_total = float(match_simple.group(3).replace(",", "."))
-            precio_unitario = precio_total / cantidad
-            products.append((fecha, cantidad, producto, round(precio_unitario, 2), precio_total))
-            continue
+        # Producto por peso (2 líneas):
+        #   "1BROCOLI"
+        #   "1,048 kg 2,60 €/kg 2,72"
+        if i + 1 < len(lines):
+            next_line = lines[i + 1].strip()
+            weight_m = re.search(r"kg.*?(\d+[,.]\d{2})\s*$", next_line)
+            if weight_m and re.match(r"^\d+[A-ZÁÉÍÓÚÑa-záéíóúñ+]", line):
+                name_m = re.match(r"^(\d+)(.+)$", line)
+                if name_m:
+                    cantidad = int(name_m.group(1))
+                    producto = name_m.group(2).strip().upper()
+                    precio_total = float(weight_m.group(1).replace(",", "."))
+                    products.append((fecha, cantidad, producto,
+                                     round(precio_total / cantidad, 2), precio_total))
+                    i += 2
+                    continue
+
+        # Formato: "2ARROZ SOS 1,88 3,76"  →  cantidad / nombre / p.unit / total
+        m = re.match(r"^(\d+)(.+?)\s+(\d+,\d{2})\s+(\d+,\d{2})$", line)
+        if m:
+            try:
+                cantidad = int(m.group(1))
+                producto = m.group(2).strip().upper()
+                precio_unitario = float(m.group(3).replace(",", "."))
+                precio_total    = float(m.group(4).replace(",", "."))
+                products.append((fecha, cantidad, producto, precio_unitario, precio_total))
+                i += 1
+                continue
+            except ValueError:
+                pass
+
+        # Formato: "1CALABAZA TROZOS 2,50"  →  cantidad / nombre / total
+        m = re.match(r"^(\d+)(.+?)\s+(\d+,\d{2})$", line)
+        if m:
+            try:
+                cantidad = int(m.group(1))
+                producto = m.group(2).strip().upper()
+                precio_total = float(m.group(3).replace(",", "."))
+                products.append((fecha, cantidad, producto,
+                                 round(precio_total / cantidad, 2), precio_total))
+                i += 1
+                continue
+            except ValueError:
+                pass
+
+        i += 1

    return products