leyendo el boe

2026-05-29 13:20:03 +02:00 · 2026-05-29 13:20:03 +02:00 · 0c3acffeb9
parent 993c427cfc
commit 0c3acffeb9
4 changed files with 1875 additions and 0 deletions
--- a/boe.xml
+++ b/boe.xml
@ -0,0 +1,8 @@
 <?xml version="1.0" encoding="utf-8"?>
 <response>
  <status>
    <code>400</code>
    <text>No soportado ningún mime type de la cabecera Accept.</text>
  </status>
  <data/>
 </response>
--- a/src/main/resources/convocatorias/busquedas-boe/20260529.xml
+++ b/src/main/resources/convocatorias/busquedas-boe/20260529.xml
--- a/src/main/resources/convocatorias/busquedas-boe/leer-boe.py
+++ b/src/main/resources/convocatorias/busquedas-boe/leer-boe.py
@ -0,0 +1,11 @@
 import requests
 import xml.etree.ElementTree as ET
 import datetime
 # Build the URL of today's BOE summary; the date goes in the path as YYYYMMDD.
 today = datetime.date.today()
 url = f"https://www.boe.es/datosabiertos/api/boe/sumario/{today.strftime('%Y%m%d')}"
 # The BOE open-data API rejects the request with HTTP 400 when the Accept
 # header names no MIME type it supports, so ask for XML explicitly. The
 # timeout keeps the script from hanging forever on a stalled connection.
 response = requests.get(url, headers={"Accept": "application/xml"}, timeout=30)
 print(response.status_code)
 print(response.text[:1000])  # first characters of the response body
--- a/src/main/resources/convocatorias/busquedas-boe/xml_to_md.py
+++ b/src/main/resources/convocatorias/busquedas-boe/xml_to_md.py
@ -0,0 +1,196 @@
 """Convierte oep2026.xml a un markdown limpio y estructurado."""
 import xml.etree.ElementTree as ET
 import re
 import os
 # Input XML and output Markdown both live in the same directory as this script.
 _BASE_DIR = os.path.dirname(__file__)
 XML_PATH = os.path.join(_BASE_DIR, "oep2026.xml")
 MD_PATH = os.path.join(_BASE_DIR, "oep2026.md")
 # ── helpers ──────────────────────────────────────────────────────────────────
 def inner_text(elem):
    """Return the plain text of an element, flattening nested tags like <strong>.

    Every text node under ``elem`` is concatenated in document order and the
    result is stripped of leading and trailing whitespace.
    """
    fragments = list(elem.itertext())
    return "".join(fragments).strip()
 def cell_text(td):
    """Return the stripped text of a table cell, or "" when the cell is blank."""
    content = inner_text(td)
    if not content or not content.strip():
        return ""
    return content
 # ── table renderer ────────────────────────────────────────────────────────────
 # Fixed six-column header and alignment row emitted by table_to_md().
 HEADER_6 = [
    "Cód.",
    "Cuerpo / Escala",
    "Cupo general",
    "Discp. general",
    "Discp. intelectual",
    "Total plazas",
 ]
 SEP_6 = "|:---|:---|---:|---:|---:|---:|"
 def table_to_md(table):
    """Render an HTML <table> element as a six-column Markdown table.

    The fixed HEADER_6 / SEP_6 header is emitted once at the top; any <thead>
    in the source table is not read. Each <tr> inside <tbody> becomes one
    Markdown row:

    * A row whose first cell spans four or more columns is a section/group
      heading. It is rendered bold when its text is all upper-case, italic
      when it starts with "Sub"/"sub", and plain otherwise. Headings with no
      text are skipped.
    * Any other row is padded with empty cells up to six columns and is
      rendered in bold when its first non-empty cell contains "total" in any
      letter case.

    Args:
        table: the <table> element to render.

    Returns:
        The Markdown table as one newline-joined string. Only the header is
        returned when the table has no <tbody>.
    """
    # Single header at the top of the table.
    lines = ["| " + " | ".join(HEADER_6) + " |", SEP_6]

    tbody = table.find("tbody")
    if tbody is None:
        return "\n".join(lines)

    for tr in tbody.findall("tr"):
        cells = tr.findall("td")
        if not cells:
            continue

        colspan = int(cells[0].get("colspan", "1"))
        if colspan >= 4:
            # Section/group row: rendered as a visually distinct line.
            heading = inner_text(cells[0])
            if not heading:
                continue
            if heading.isupper():
                lines.append(f"| **{heading}** | | | | | |")
            elif heading.startswith(("Sub", "sub")):
                lines.append(f"| *{heading}* | | | | | |")
            else:
                lines.append(f"| {heading} | | | | | |")
            continue

        row_cells = [cell_text(c) for c in cells]
        # Pad short rows so every data row has six columns.
        row_cells.extend([""] * (6 - len(row_cells)))

        # Detect "Total" rows from the first non-empty cell. The comparison is
        # case-insensitive so all-caps "TOTAL" rows are highlighted as well as
        # "Total" ones.
        first_nonempty = next((v for v in row_cells if v), "")
        if "total" in first_nonempty.lower():
            row_cells = [f"**{v}**" if v else "" for v in row_cells]
        lines.append("| " + " | ".join(row_cells) + " |")

    return "\n".join(lines)
 def table_to_md_short(table):
    """Render a narrow HTML <table> (the four-column ANEXO VII layout) as Markdown.

    Header rows come from <thead>: every non-empty header row is emitted, and
    a single alignment row (first column left-aligned, the rest right-aligned)
    follows the first of them. Each <tr> in <tbody> then becomes either an
    italic caption line (when its first cell spans three or more columns and
    has text) or a regular table row.

    Args:
        table: the <table> element to render.

    Returns:
        The Markdown as one newline-joined string. Only the header lines are
        returned when the table has no <tbody>.
    """
    lines = []

    thead = table.find("thead")
    if thead is not None:
        delimiter_emitted = False
        for tr in thead.findall("tr"):
            cols = [cell_text(c) for c in tr]
            if not any(cols):
                continue
            lines.append("| " + " | ".join(cols) + " |")
            # A Markdown table takes exactly one delimiter row, directly under
            # its first header row; repeating it after every <thead> row
            # would render the extra delimiters as literal cell text.
            if not delimiter_emitted:
                aligns = [":---"] + ["---:"] * (len(cols) - 1)
                lines.append("|" + "|".join(aligns) + "|")
                delimiter_emitted = True

    tbody = table.find("tbody")
    if tbody is None:
        return "\n".join(lines)

    for tr in tbody.findall("tr"):
        cells = tr.findall("td")
        if not cells:
            continue
        colspan = int(cells[0].get("colspan", "1"))
        if colspan >= 3:
            # Wide first cell: treat the row as a caption set off by blank lines.
            caption = inner_text(cells[0])
            if caption:
                lines.append(f"\n*{caption}*\n")
        else:
            row = [cell_text(c) for c in cells]
            lines.append("| " + " | ".join(row) + " |")

    return "\n".join(lines)
 # ── main conversion ───────────────────────────────────────────────────────────
 def _format_boe_date(raw):
    """Reformat a compact YYYYMMDD date string as DD/MM/YYYY."""
    return f"{raw[6:8]}/{raw[4:6]}/{raw[:4]}"


 def convert():
    """Convert the XML document at XML_PATH into a Markdown file at MD_PATH.

    The output starts with the title and a block-quoted metadata summary read
    from <metadatos>, followed by the children of <texto> in document order:

    * <p class="articulo"> becomes a level-3 heading and ends the preamble;
    * <p class="parrafo"> / "parrafo_2" become paragraphs (buffered under a
      "PREÁMBULO" heading while still before the first article);
    * "centro_redonda", "firma_rey"/"firma_ministro", "anexo_num",
      "anexo_tit" and "anexo" paragraphs get their own emphasis or heading;
    * <table> elements are rendered with table_to_md_short() when their
      <colgroup> declares four columns or fewer, otherwise with table_to_md().

    Paragraphs with any other class are dropped. Every metadata element read
    here is required: a missing one surfaces as AttributeError.

    Returns:
        None. The Markdown is written to MD_PATH and a short summary is
        printed to stdout.
    """
    tree = ET.parse(XML_PATH)
    root = tree.getroot()
    meta = root.find("metadatos")
    texto = root.find("texto")

    titulo = inner_text(meta.find("titulo"))
    referencia = inner_text(meta.find("identificador"))
    fecha_pub_fmt = _format_boe_date(inner_text(meta.find("fecha_publicacion")))
    fecha_disp_fmt = _format_boe_date(inner_text(meta.find("fecha_disposicion")))
    departamento = inner_text(meta.find("departamento"))
    numero_oficial = inner_text(meta.find("numero_oficial"))
    url_eli = inner_text(meta.find("url_eli"))

    # ── Title and metadata ────────────────────────────────────────────────────
    # The two trailing spaces on each quoted line are Markdown hard line breaks.
    md = [
        f"# {titulo}\n",
        f"> **Referencia:** {referencia}  ",
        f"> **Fecha disposición:** {fecha_disp_fmt}  ",
        f"> **Fecha publicación:** {fecha_pub_fmt}  ",
        f"> **Departamento:** {departamento}  ",
        f"> **Núm. oficial:** RD {numero_oficial}  ",
        f"> **ELI:** <{url_eli}>",
        "",
    ]

    # ── Body text ─────────────────────────────────────────────────────────────
    in_preamble = True
    preamble_lines = []
    for child in texto:
        tag = child.tag
        if tag == "p":
            cls = child.get("class", "")
            text = inner_text(child)
            if cls == "articulo":
                if in_preamble:
                    if preamble_lines:
                        md.append("## PREÁMBULO\n")
                        md.extend(preamble_lines)
                        preamble_lines = []
                    # The first article always closes the preamble, even when
                    # no preamble paragraph was collected; otherwise the
                    # article's own paragraphs would be buffered as preamble
                    # and emitted in the wrong place.
                    in_preamble = False
                # Articles and additional/final provisions share one heading level.
                md.append(f"\n### {text}\n")
            elif cls in ("parrafo", "parrafo_2"):
                if in_preamble:
                    preamble_lines.append(text + "\n")
                else:
                    md.append(text + "\n")
            elif cls == "centro_redonda":
                md.append(f"\n**{text}**\n")
            elif cls in ("firma_rey", "firma_ministro"):
                md.append(f"*{text}*  ")
            elif cls == "anexo_num":
                md.append(f"\n---\n\n## {text}")
            elif cls == "anexo_tit":
                md.append(f"\n### {text}\n")
            elif cls == "anexo":
                # ANEXO VII has no separate anexo_num / anexo_tit paragraphs.
                md.append(f"\n---\n\n## {text}\n")
        elif tag == "table":
            # Pick the renderer from the declared column count (six when the
            # table has no <colgroup>).
            cols = child.find("colgroup")
            ncols = len(cols.findall("col")) if cols is not None else 6
            if ncols <= 4:
                md.append(table_to_md_short(child))
            else:
                md.append(table_to_md(child))
            md.append("")

    # No article at all (should not happen): flush whatever was buffered.
    if in_preamble and preamble_lines:
        md.append("## PREÁMBULO\n")
        md.extend(preamble_lines)

    output = "\n".join(md)
    # Collapse runs of four or more newlines down to three.
    output = re.sub(r"\n{4,}", "\n\n\n", output)

    with open(MD_PATH, "w", encoding="utf-8") as f:
        f.write(output)

    print(f"✓ Generado: {MD_PATH}")
    lines = output.count("\n")
    print(f"  {lines} líneas, {len(output)} caracteres")
 # Entry point: run the conversion only when this file is executed as a script.
 if __name__ == "__main__":
    convert()