# autocompra/autocompra4.py

import html
import os
import re
from collections import defaultdict
from datetime import datetime, timedelta

import pandas as pd
from PyPDF2 import PdfReader

# Folder that holds the PDF tickets to analyze (a relative path, so it is
# resolved against the current working directory).
ticket_folder = "tickets"

# Ticket patterns, compiled once instead of being looked up for every line.
_TICKET_DATE_RE = re.compile(r"(\d{2}/\d{2}/\d{4})")
_UNIT_AND_TOTAL_RE = re.compile(r"\d*\s?(.*?)\s+(\d+,\d{2})\s+(\d+,\d{2})")
_TOTAL_ONLY_RE = re.compile(r"(.*?)\s+(\d+,\d{2})$")


def _to_float(amount):
    """Convert an amount written with a decimal comma (``"1,25"``) to float."""
    return float(amount.replace(",", "."))


def _find_ticket_date(text):
    """Return the first valid dd/mm/yyyy date found in *text*, or None."""
    for candidate in _TICKET_DATE_RE.finditer(text):
        try:
            return datetime.strptime(candidate.group(1), "%d/%m/%Y")
        except ValueError:
            # Shaped like a date but not a real calendar day (e.g. 45/13/2024):
            # keep looking instead of aborting the whole run.
            continue
    return None


def _parse_product_lines(text, fecha):
    """Return one ``(fecha, name, unit_price, total_price)`` tuple per matching line."""
    products = []
    for line in text.splitlines():
        with_unit = _UNIT_AND_TOTAL_RE.match(line)
        if with_unit:
            # Two amounts on the line: the first is stored as the unit price,
            # the second as the line total.
            products.append((
                fecha,
                with_unit.group(1).strip().upper(),
                _to_float(with_unit.group(2)),
                _to_float(with_unit.group(3)),
            ))
            continue

        # NOTE(review): any line that ends in an amount matches here, which may
        # include non-product lines of the ticket - confirm against real tickets.
        total_only = _TOTAL_ONLY_RE.match(line)
        if total_only:
            products.append((
                fecha,
                total_only.group(1).strip().upper(),
                None,
                _to_float(total_only.group(2)),
            ))
    return products


def extract_data_from_pdf(file_path):
    """Extract the purchase date and the product lines from one PDF ticket.

    The text of every page is concatenated, the first valid ``dd/mm/yyyy``
    date in it is taken as the ticket date, and each text line is then matched
    against two patterns: a name followed by two decimal-comma amounts, or a
    name followed by a single amount at the end of the line.

    Args:
        file_path: Path of the PDF file, passed to ``PdfReader``.

    Returns:
        A list of ``(fecha, nombre, unit_price, total_price)`` tuples.
        ``fecha`` is a ``datetime`` or ``None`` when no valid date was found;
        ``nombre`` is stripped and upper-cased; ``unit_price`` is ``None`` for
        lines that carry a single amount.
    """
    reader = PdfReader(file_path)
    # Build the text in one pass; ``or ""`` guards against a page for which
    # the extractor yields no text, so one blank page cannot abort the ticket.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    fecha = _find_ticket_date(text)
    return _parse_product_lines(text, fecha)

# Gather the product rows parsed from every PDF ticket in the folder
datos = []
pdf_names = [entry for entry in os.listdir(ticket_folder) if entry.endswith(".pdf")]
for pdf_name in pdf_names:
    datos.extend(extract_data_from_pdf(os.path.join(ticket_folder, pdf_name)))

# Build the DataFrame: one row per product line parsed from the tickets.
# Rows without a ticket date cannot be placed in time, so they are discarded.
columnas = ["fecha", "producto", "precio_unitario", "precio_total"]
df = pd.DataFrame(datos, columns=columnas)
df.dropna(subset=["fecha"], inplace=True)

# Normalize product names so the same product always lands in the same group
df["producto"] = df["producto"].str.upper().str.strip()

# The rows arrive in whatever order the tickets were read, which is not
# chronological. diff() compares each row with the previous row of its group,
# so the frame must be sorted by date first; otherwise the gaps are measured
# between arbitrary tickets and can even be negative. mergesort is stable, so
# rows sharing a date keep their relative order.
df.sort_values("fecha", kind="mergesort", inplace=True)

# Days elapsed since the previous purchase of the same product
df["diferencia_dias"] = df.groupby("producto")["fecha"].diff().dt.days

# Purchase frequency: average number of days between purchases per product.
# A product bought only once has no interval (its mean is NaN) and therefore
# no meaningful estimate, so it is left out of the result.
frecuencia_compra = (
    df.groupby("producto")["diferencia_dias"].mean().dropna().reset_index()
)

# Estimated next purchase: latest ticket date plus the average interval.
# NOTE(review): the anchor is the most recent date across ALL products, not
# each product's own last purchase date - confirm this is the intended model.
frecuencia_compra["proxima_compra_estimado"] = df["fecha"].max() + pd.to_timedelta(frecuencia_compra["diferencia_dias"], unit="D")

# Most frequently bought products (shortest average interval) first
productos_estimados = frecuencia_compra.sort_values("diferencia_dias", ascending=True)

# Presentation columns: title-cased product name and dd/mm/yyyy date string
productos_estimados["producto"] = productos_estimados["producto"].str.title()
productos_estimados["fecha_estimada_proxima_compra"] = productos_estimados["proxima_compra_estimado"].dt.strftime("%d/%m/%Y")

# Build the shopping list as a standalone HTML page: fixed header, one table
# row per product, fixed footer.
html_content = """
<!DOCTYPE html>
<html lang="es">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Lista de la Compra Estimada</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            background-color: #f4f4f9;
            margin: 0;
            padding: 0;
        }
        h1 {
            text-align: center;
            color: #333;
            padding-top: 20px;
        }
        table {
            width: 80%;
            margin: 30px auto;
            border-collapse: collapse;
        }
        th, td {
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid #ddd;
        }
        th {
            background-color: #4CAF50;
            color: white;
        }
        tr:hover {
            background-color: #f1f1f1;
        }
        td {
            color: #333;
        }
    </style>
</head>
<body>

    <h1>Lista de la Compra Estimada</h1>

    <table>
        <thead>
            <tr>
                <th>Producto</th>
                <th>Días Promedio entre Compras</th>
                <th>Fecha Estimada de la Próxima Compra</th>
            </tr>
        </thead>
        <tbody>
"""

# One table row per product. Rows are collected in a list and joined once
# instead of growing the page with repeated string concatenation.
filas_html = []
for _, row in productos_estimados.iterrows():
    # A product without an average interval has no estimate to show; skip it
    # rather than printing "nan" cells.
    if pd.isna(row["diferencia_dias"]):
        continue
    # Product names come from text extracted out of the PDFs, so they are
    # escaped: characters such as "&" or "<" would otherwise corrupt the markup.
    filas_html.append(f"""
        <tr>
            <td>{html.escape(str(row['producto']))}</td>
            <td>{row['diferencia_dias']:.0f}</td>
            <td>{row['fecha_estimada_proxima_compra']}</td>
        </tr>
    """)
html_content += "".join(filas_html)

html_content += """
        </tbody>
    </table>

</body>
</html>
"""

# Write the finished page to disk as UTF-8 and report where it was saved
output_name = "lista_compra_estimado.html"
with open(output_name, mode="w", encoding="utf-8") as html_file:
    html_file.write(html_content)

print("\n✅ Página HTML generada: lista_compra_estimado.html")