fix: parser PDF formato Mercadona sin espacio qty-nombre + productos por peso

This commit is contained in:
Tatiana Villa Ema 2026-04-25 17:42:24 +02:00
parent 2e1b35553a
commit 02b055a96b
2 changed files with 90 additions and 30 deletions

47
app.py
View File

@ -71,14 +71,37 @@ _OCR_EXCLUDE = [
def parsear_texto_ticket(texto):
"""Extrae productos de texto OCR con el mismo formato que los tickets Mercadona."""
productos = []
for line in texto.splitlines():
linea = line.strip()
if not linea:
lines = texto.splitlines()
i = 0
while i < len(lines):
linea = lines[i].strip()
if not linea or any(kw in linea.upper() for kw in _OCR_EXCLUDE):
i += 1
continue
if any(kw in linea.upper() for kw in _OCR_EXCLUDE):
continue
# "2 ROLLO HOGAR DOBLE 2,35 4,70"
m = re.match(r"^(\d+)\s+(.+?)\s+(\d+[,\.]\d{2})\s+(\d+[,\.]\d{2})$", linea)
# Producto por peso (2 líneas): "1BROCOLI" + "1,048 kg 2,60 €/kg 2,72"
if i + 1 < len(lines):
next_line = lines[i + 1].strip()
weight_m = re.search(r"kg.*?(\d+[,.]\d{2})\s*$", next_line)
if weight_m and re.match(r"^\d+[A-ZÁÉÍÓÚÑa-záéíóúñ+]", linea):
name_m = re.match(r"^(\d+)(.+)$", linea)
if name_m:
try:
cantidad = int(name_m.group(1))
precio_total = round(float(weight_m.group(1).replace(",", ".")), 2)
productos.append({
"cantidad": cantidad,
"producto": name_m.group(2).strip().upper(),
"precio_unitario": round(precio_total / cantidad, 2),
"precio_total": precio_total,
})
i += 2
continue
except ValueError:
pass
# "2ARROZ SOS 1,88 3,76"
m = re.match(r"^(\d+)(.+?)\s+(\d+[,\.]\d{2})\s+(\d+[,\.]\d{2})$", linea)
if m:
try:
productos.append({
@ -87,11 +110,13 @@ def parsear_texto_ticket(texto):
"precio_unitario": round(float(m.group(3).replace(',', '.')), 2),
"precio_total": round(float(m.group(4).replace(',', '.')), 2),
})
i += 1
continue
except ValueError:
pass
# "1 CROISSANT RELL CACAO 1,90"
m = re.match(r"^(\d+)\s+(.+?)\s+(\d+[,\.]\d{2})$", linea)
# "1CROISSANT RELL CACAO 1,90"
m = re.match(r"^(\d+)(.+?)\s+(\d+[,\.]\d{2})$", linea)
if m:
try:
cantidad = int(m.group(1))
@ -102,8 +127,12 @@ def parsear_texto_ticket(texto):
"precio_unitario": round(precio / cantidad, 2),
"precio_total": precio,
})
i += 1
continue
except ValueError:
pass
i += 1
return productos
# -----------------------------------------------------------------------

View File

@ -36,30 +36,61 @@ def extract_data_from_pdf(file_path):
fecha = datetime.strptime(date_match.group(1), "%d/%m/%Y") if date_match else None
products = []
lines = text.splitlines()
i = 0
while i < len(lines):
line = lines[i].strip()
for line in text.splitlines():
if any(keyword in line for keyword in exclude_keywords):
continue
# Coincide con líneas tipo: "2 ROLLO HOGAR DOBLE 2,35 4,70"
match = re.match(r"(\d+)\s+(.*?)\s+(\d+,\d{2})\s+(\d+,\d{2})$", line)
if match:
cantidad = int(match.group(1))
producto = match.group(2).strip().upper()
precio_unitario = float(match.group(3).replace(",", "."))
precio_total = float(match.group(4).replace(",", "."))
products.append((fecha, cantidad, producto, precio_unitario, precio_total))
if not line or any(kw in line for kw in exclude_keywords):
i += 1
continue
# Coincide con líneas tipo: "1 CROISSANT RELL CACAO 1,90"
match_simple = re.match(r"(\d+)\s+(.*?)\s+(\d+,\d{2})$", line)
if match_simple:
cantidad = int(match_simple.group(1))
producto = match_simple.group(2).strip().upper()
precio_total = float(match_simple.group(3).replace(",", "."))
precio_unitario = precio_total / cantidad
products.append((fecha, cantidad, producto, round(precio_unitario, 2), precio_total))
continue
# Producto por peso (2 líneas):
# "1BROCOLI"
# "1,048 kg 2,60 €/kg 2,72"
if i + 1 < len(lines):
next_line = lines[i + 1].strip()
weight_m = re.search(r"kg.*?(\d+[,.]\d{2})\s*$", next_line)
if weight_m and re.match(r"^\d+[A-ZÁÉÍÓÚÑa-záéíóúñ+]", line):
name_m = re.match(r"^(\d+)(.+)$", line)
if name_m:
cantidad = int(name_m.group(1))
producto = name_m.group(2).strip().upper()
precio_total = float(weight_m.group(1).replace(",", "."))
products.append((fecha, cantidad, producto,
round(precio_total / cantidad, 2), precio_total))
i += 2
continue
# Formato: "2ARROZ SOS 1,88 3,76" → cantidad / nombre / p.unit / total
m = re.match(r"^(\d+)(.+?)\s+(\d+,\d{2})\s+(\d+,\d{2})$", line)
if m:
try:
cantidad = int(m.group(1))
producto = m.group(2).strip().upper()
precio_unitario = float(m.group(3).replace(",", "."))
precio_total = float(m.group(4).replace(",", "."))
products.append((fecha, cantidad, producto, precio_unitario, precio_total))
i += 1
continue
except ValueError:
pass
# Formato: "1CALABAZA TROZOS 2,50" → cantidad / nombre / total
m = re.match(r"^(\d+)(.+?)\s+(\d+,\d{2})$", line)
if m:
try:
cantidad = int(m.group(1))
producto = m.group(2).strip().upper()
precio_total = float(m.group(3).replace(",", "."))
products.append((fecha, cantidad, producto,
round(precio_total / cantidad, 2), precio_total))
i += 1
continue
except ValueError:
pass
i += 1
return products