From 02b055a96b27c1d3dd601631ca279cd9d8b656c1 Mon Sep 17 00:00:00 2001 From: Tatiana Villa Ema Date: Sat, 25 Apr 2026 17:42:24 +0200 Subject: [PATCH] fix: parser PDF formato Mercadona sin espacio qty-nombre + productos por peso --- app.py | 47 +++++++++++++++++++++++++------- autocompra7.py | 73 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 90 insertions(+), 30 deletions(-) diff --git a/app.py b/app.py index 6c1d4b7..43d5e1a 100644 --- a/app.py +++ b/app.py @@ -71,14 +71,37 @@ _OCR_EXCLUDE = [ def parsear_texto_ticket(texto): """Extrae productos de texto OCR con el mismo formato que los tickets Mercadona.""" productos = [] - for line in texto.splitlines(): - linea = line.strip() - if not linea: + lines = texto.splitlines() + i = 0 + while i < len(lines): + linea = lines[i].strip() + if not linea or any(kw in linea.upper() for kw in _OCR_EXCLUDE): + i += 1 continue - if any(kw in linea.upper() for kw in _OCR_EXCLUDE): - continue - # "2 ROLLO HOGAR DOBLE 2,35 4,70" - m = re.match(r"^(\d+)\s+(.+?)\s+(\d+[,\.]\d{2})\s+(\d+[,\.]\d{2})$", linea) + + # Producto por peso (2 líneas): "1BROCOLI" + "1,048 kg 2,60 €/kg 2,72" + if i + 1 < len(lines): + next_line = lines[i + 1].strip() + weight_m = re.search(r"kg.*?(\d+[,.]\d{2})\s*$", next_line) + if weight_m and re.match(r"^\d+[A-ZÁÉÍÓÚÑa-záéíóúñ+]", linea): + name_m = re.match(r"^(\d+)(.+)$", linea) + if name_m: + try: + cantidad = int(name_m.group(1)) + precio_total = round(float(weight_m.group(1).replace(",", ".")), 2) + productos.append({ + "cantidad": cantidad, + "producto": name_m.group(2).strip().upper(), + "precio_unitario": round(precio_total / cantidad, 2), + "precio_total": precio_total, + }) + i += 2 + continue + except ValueError: + pass + + # "2ARROZ SOS 1,88 3,76" + m = re.match(r"^(\d+)(.+?)\s+(\d+[,\.]\d{2})\s+(\d+[,\.]\d{2})$", linea) if m: try: productos.append({ @@ -87,11 +110,13 @@ def parsear_texto_ticket(texto): "precio_unitario": round(float(m.group(3).replace(',', '.')), 2), "precio_total": round(float(m.group(4).replace(',', '.')), 2), }) + i += 1 continue except ValueError: pass - # "1 CROISSANT RELL CACAO 1,90" - m = re.match(r"^(\d+)\s+(.+?)\s+(\d+[,\.]\d{2})$", linea) + + # "1CROISSANT RELL CACAO 1,90" + m = re.match(r"^(\d+)(.+?)\s+(\d+[,\.]\d{2})$", linea) if m: try: cantidad = int(m.group(1)) @@ -102,8 +127,12 @@ def parsear_texto_ticket(texto): "precio_unitario": round(precio / cantidad, 2), "precio_total": precio, }) + i += 1 + continue except ValueError: pass + + i += 1 return productos # ----------------------------------------------------------------------- diff --git a/autocompra7.py b/autocompra7.py index cbe74c3..8fc17da 100644 --- a/autocompra7.py +++ b/autocompra7.py @@ -36,30 +36,61 @@ def extract_data_from_pdf(file_path): fecha = datetime.strptime(date_match.group(1), "%d/%m/%Y") if date_match else None products = [] + lines = text.splitlines() + i = 0 + while i < len(lines): + line = lines[i].strip() - for line in text.splitlines(): - if any(keyword in line for keyword in exclude_keywords): - continue - - # Coincide con líneas tipo: "2 ROLLO HOGAR DOBLE 2,35 4,70" - match = re.match(r"(\d+)\s+(.*?)\s+(\d+,\d{2})\s+(\d+,\d{2})$", line) - if match: - cantidad = int(match.group(1)) - producto = match.group(2).strip().upper() - precio_unitario = float(match.group(3).replace(",", ".")) - precio_total = float(match.group(4).replace(",", ".")) - products.append((fecha, cantidad, producto, precio_unitario, precio_total)) + if not line or any(kw in line for kw in exclude_keywords): + i += 1 continue - # Coincide con líneas tipo: "1 CROISSANT RELL CACAO 1,90" - match_simple = re.match(r"(\d+)\s+(.*?)\s+(\d+,\d{2})$", line) - if match_simple: - cantidad = int(match_simple.group(1)) - producto = match_simple.group(2).strip().upper() - precio_total = float(match_simple.group(3).replace(",", ".")) - precio_unitario = precio_total / cantidad - products.append((fecha, cantidad, producto, round(precio_unitario, 2), precio_total)) - continue + # Producto por peso (2 líneas): + # "1BROCOLI" + # "1,048 kg 2,60 €/kg 2,72" + if i + 1 < len(lines): + next_line = lines[i + 1].strip() + weight_m = re.search(r"kg.*?(\d+[,.]\d{2})\s*$", next_line) + if weight_m and re.match(r"^\d+[A-ZÁÉÍÓÚÑa-záéíóúñ+]", line): + name_m = re.match(r"^(\d+)(.+)$", line) + if name_m: + cantidad = int(name_m.group(1)) + producto = name_m.group(2).strip().upper() + precio_total = float(weight_m.group(1).replace(",", ".")) + products.append((fecha, cantidad, producto, + round(precio_total / cantidad, 2), precio_total)) + i += 2 + continue + + # Formato: "2ARROZ SOS 1,88 3,76" → cantidad / nombre / p.unit / total + m = re.match(r"^(\d+)(.+?)\s+(\d+,\d{2})\s+(\d+,\d{2})$", line) + if m: + try: + cantidad = int(m.group(1)) + producto = m.group(2).strip().upper() + precio_unitario = float(m.group(3).replace(",", ".")) + precio_total = float(m.group(4).replace(",", ".")) + products.append((fecha, cantidad, producto, precio_unitario, precio_total)) + i += 1 + continue + except ValueError: + pass + + # Formato: "1CALABAZA TROZOS 2,50" → cantidad / nombre / total + m = re.match(r"^(\d+)(.+?)\s+(\d+,\d{2})$", line) + if m: + try: + cantidad = int(m.group(1)) + producto = m.group(2).strip().upper() + precio_total = float(m.group(3).replace(",", ".")) + products.append((fecha, cantidad, producto, + round(precio_total / cantidad, 2), precio_total)) + i += 1 + continue + except ValueError: + pass + + i += 1 return products