leyendo el boe

2026-05-29 13:20:03 +02:00 · 2026-05-29 13:20:03 +02:00 · 0c3acffeb9
parent 993c427cfc
commit 0c3acffeb9
4 changed files with 1875 additions and 0 deletions
--- a/boe.xml
+++ b/boe.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<response>
+  <status>
+    <code>400</code>
+    <text>No soportado ningún mime type de la cabecera Accept.</text>
+  </status>
+  <data/>
+</response>
--- a/src/main/resources/convocatorias/busquedas-boe/20260529.xml
+++ b/src/main/resources/convocatorias/busquedas-boe/20260529.xml
--- a/src/main/resources/convocatorias/busquedas-boe/leer-boe.py
+++ b/src/main/resources/convocatorias/busquedas-boe/leer-boe.py
@ -0,0 +1,11 @@
+import requests
+import xml.etree.ElementTree as ET
+import datetime
+
+# Build the URL of today's summary ("sumario"), dated as YYYYMMDD.
+today = datetime.date.today()
+url = f"https://www.boe.es/datosabiertos/api/boe/sumario/{today.strftime('%Y%m%d')}"
+
+# The request must state a format the API can serve: the boe.xml response
+# captured in this commit is a 400 "No soportado ningún mime type de la
+# cabecera Accept", which is what a request without a suitable Accept header
+# gets back. Ask for XML explicitly.
+# The timeout keeps the script from hanging forever if the server stalls.
+response = requests.get(url, headers={"Accept": "application/xml"}, timeout=30)
+
+print(response.status_code)
+print(response.text[:1000])  # first characters only
--- a/src/main/resources/convocatorias/busquedas-boe/xml_to_md.py
+++ b/src/main/resources/convocatorias/busquedas-boe/xml_to_md.py
@ -0,0 +1,196 @@
+"""Convierte oep2026.xml a un markdown limpio y estructurado."""
+import xml.etree.ElementTree as ET
+import re
+import os
+
+# Input XML and generated Markdown both live next to this script.
+_SCRIPT_DIR = os.path.dirname(__file__)
+XML_PATH = os.path.join(_SCRIPT_DIR, "oep2026.xml")
+MD_PATH = os.path.join(_SCRIPT_DIR, "oep2026.md")
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+def inner_text(elem):
+    """Return the plain text of an element, flattening nested tags.
+
+    Every text fragment found under ``elem`` (including text inside child
+    tags such as <strong>) is concatenated in document order, and the result
+    is stripped of leading and trailing whitespace.
+    """
+    fragments = elem.itertext()
+    flattened = "".join(fragments)
+    return flattened.strip()
+
+def cell_text(td):
+    """Return a table cell's text, made safe for use inside a Markdown row.
+
+    Nested markup is flattened, every run of whitespace (including line
+    breaks inside the cell) is collapsed to a single space, and literal ``|``
+    characters are backslash-escaped. Without this, a newline or a pipe in a
+    cell would split the row or shift its columns. Empty or whitespace-only
+    cells yield ``""``.
+    """
+    flattened = "".join(td.itertext())
+    # split()/join both trims the ends and collapses internal whitespace.
+    single_line = " ".join(flattened.split())
+    return single_line.replace("|", "\\|")
+
+# ── table renderer ────────────────────────────────────────────────────────────
+
+# Fixed header row used for every six-column table.
+HEADER_6 = [
+    "Cód.",
+    "Cuerpo / Escala",
+    "Cupo general",
+    "Discp. general",
+    "Discp. intelectual",
+    "Total plazas",
+]
+# Delimiter row: two left-aligned columns followed by four right-aligned ones.
+SEP_6 = "|" + "|".join([":---"] * 2 + ["---:"] * 4) + "|"
+
+def table_to_md(table):
+    """Render an HTML <table> as a six-column Markdown table.
+
+    The output always starts with the fixed ``HEADER_6`` row and the
+    ``SEP_6`` delimiter row. Only <tr> rows directly under <tbody> are
+    rendered; a table without <tbody> yields just the header.
+
+    Row handling:
+      * rows without <td> cells are skipped;
+      * a row whose first cell has ``colspan`` >= 4 is a section/group label
+        and is written into the first column only (bold when the label is
+        all upper case, italic when it starts with "Sub"/"sub", plain
+        otherwise); empty labels are skipped;
+      * any other row is a data row, padded with empty cells up to six
+        columns; if its first non-empty cell contains "otal" (i.e. "Total"
+        or "total") every non-empty cell is set in bold.
+    """
+    out = ["| " + " | ".join(HEADER_6) + " |", SEP_6]
+
+    body = table.find("tbody")
+    rows = body.findall("tr") if body is not None else []
+
+    for row in rows:
+        tds = row.findall("td")
+        if not tds:
+            continue
+
+        lead = tds[0]
+        if int(lead.get("colspan", "1")) >= 4:
+            # Section/group row: only the label is kept, in the first column.
+            label = inner_text(lead)
+            if not label:
+                continue
+            if label.isupper():
+                styled = f"**{label}**"
+            elif label.startswith(("Sub", "sub")):
+                styled = f"*{label}*"
+            else:
+                styled = label
+            out.append(f"| {styled} | | | | | |")
+            continue
+
+        values = [cell_text(td) for td in tds]
+        values.extend([""] * (6 - len(values)))
+        # "otal" matches both "Total" and "total" in the first filled cell.
+        first_filled = next(filter(None, values), "")
+        if "otal" in first_filled:
+            values = [f"**{value}**" if value else "" for value in values]
+        out.append("| " + " | ".join(values) + " |")
+
+    return "\n".join(out)
+
+
+def table_to_md_short(table):
+    """Render a narrow <table> (the four-column layout of ANEXO VII) as Markdown.
+
+    The header comes from the table's own <thead>: the first non-empty header
+    row becomes the Markdown header and is followed by a delimiter row (first
+    column left-aligned, the rest right-aligned). Any further non-empty
+    header rows are written as ordinary rows, because a Markdown table takes
+    a single delimiter row; repeating it would show up as a literal row of
+    ``:---`` cells.
+
+    In <tbody>, rows without <td> cells are skipped, a row whose first cell
+    has ``colspan`` >= 3 is emitted as an italic caption line surrounded by
+    blank lines (or skipped when empty), and every other row is emitted as a
+    table row. A table without <tbody> yields only the header lines.
+    """
+    lines = []
+    thead = table.find("thead")
+    if thead is not None:
+        delimiter_written = False
+        for tr in thead.findall("tr"):
+            cols = [cell_text(c) for c in tr]
+            if not any(cols):
+                continue
+            lines.append("| " + " | ".join(cols) + " |")
+            if not delimiter_written:
+                aligns = [":---"] + ["---:"] * (len(cols) - 1)
+                lines.append("|" + "|".join(aligns) + "|")
+                delimiter_written = True
+    tbody = table.find("tbody")
+    if tbody is None:
+        return "\n".join(lines)
+    for tr in tbody.findall("tr"):
+        cells = tr.findall("td")
+        if not cells:
+            continue
+        colspan = cells[0].get("colspan", "1")
+        if int(colspan) >= 3:
+            # Spanning row: treated as a caption rather than as table data.
+            txt = inner_text(cells[0])
+            if txt:
+                lines.append(f"\n*{txt}*\n")
+        else:
+            row = [cell_text(c) for c in cells]
+            lines.append("| " + " | ".join(row) + " |")
+    return "\n".join(lines)
+
+# ── main conversion ───────────────────────────────────────────────────────────
+
+def _format_boe_date(raw):
+    """Turn a compact ``YYYYMMDD`` date into ``DD/MM/YYYY``.
+
+    Values that are not exactly eight digits are returned unchanged, so a
+    missing or unexpected date is shown as-is instead of as ``//``.
+    """
+    if len(raw) == 8 and raw.isdigit():
+        return f"{raw[6:8]}/{raw[4:6]}/{raw[:4]}"
+    return raw
+
+
+def _render_body(texto):
+    """Render the children of <texto> as Markdown lines, in document order.
+
+    Paragraphs (class "parrafo"/"parrafo_2") that appear before the first
+    "articulo" paragraph are introduced by a single "## PREÁMBULO" heading.
+    Everything is emitted where the source puts it: nothing is buffered, so
+    an element that sits between the preamble paragraphs and the first
+    article stays between them, and paragraphs following an article are
+    never mistaken for preamble text.
+    """
+    out = []
+    in_preamble = True        # no "articulo" paragraph seen yet
+    preamble_headed = False   # "## PREÁMBULO" already written
+
+    for child in texto:
+        tag = child.tag
+        cls = child.get("class", "")
+
+        if tag == "p":
+            text = inner_text(child)
+
+            if cls == "articulo":
+                # Articles and additional/final provisions share this class
+                # and are rendered the same way. The first one always ends
+                # the preamble, whether or not it had any paragraphs.
+                in_preamble = False
+                out.append(f"\n### {text}\n")
+
+            elif cls in ("parrafo", "parrafo_2"):
+                if in_preamble and not preamble_headed:
+                    out.append("## PREÁMBULO\n")
+                    preamble_headed = True
+                out.append(text + "\n")
+
+            elif cls == "centro_redonda":
+                out.append(f"\n**{text}**\n")
+
+            elif cls in ("firma_rey", "firma_ministro"):
+                # Two trailing spaces force a Markdown line break.
+                out.append(f"*{text}*  ")
+
+            elif cls == "anexo_num":
+                out.append(f"\n---\n\n## {text}")
+
+            elif cls == "anexo_tit":
+                out.append(f"\n### {text}\n")
+
+            elif cls == "anexo":
+                # Annex heading that carries number and title in one paragraph
+                # (the original note cites ANEXO VII).
+                out.append(f"\n---\n\n## {text}\n")
+
+        elif tag == "table":
+            # Narrow tables (<= 4 <col> entries) use their own <thead>;
+            # anything else, or a table without <colgroup>, is treated as
+            # the six-column layout.
+            colgroup = child.find("colgroup")
+            ncols = len(colgroup.findall("col")) if colgroup is not None else 6
+            if ncols <= 4:
+                out.append(table_to_md_short(child))
+            else:
+                out.append(table_to_md(child))
+            out.append("")
+
+    return out
+
+
+def convert():
+    """Convert the XML document at ``XML_PATH`` into Markdown at ``MD_PATH``.
+
+    The output starts with the title and a quoted block of metadata
+    (identifier, disposition and publication dates, department, official
+    number, ELI URL) read from <metadatos>, followed by the rendered
+    children of <texto>. Runs of four or more newlines are reduced to three
+    before the file is written as UTF-8, and a short summary is printed.
+
+    Metadata fields missing from <metadatos> are rendered as empty strings.
+
+    Raises:
+        ValueError: if the root element has no <metadatos> or no <texto>
+            child (for instance when the file holds an API error response
+            rather than a document).
+    """
+    root = ET.parse(XML_PATH).getroot()
+
+    meta = root.find("metadatos")
+    texto = root.find("texto")
+    if meta is None or texto is None:
+        raise ValueError(
+            f"{XML_PATH}: faltan los elementos <metadatos> y/o <texto>"
+        )
+
+    def field(tag):
+        # Text of a metadata child, or "" when the child is absent.
+        node = meta.find(tag)
+        return inner_text(node) if node is not None else ""
+
+    titulo = field("titulo")
+    referencia = field("identificador")
+    fecha_pub_fmt = _format_boe_date(field("fecha_publicacion"))
+    fecha_disp_fmt = _format_boe_date(field("fecha_disposicion"))
+    departamento = field("departamento")
+    numero_oficial = field("numero_oficial")
+    url_eli = field("url_eli")
+
+    # Title and metadata; the two trailing spaces force Markdown line breaks.
+    md = [
+        f"# {titulo}\n",
+        f"> **Referencia:** {referencia}  ",
+        f"> **Fecha disposición:** {fecha_disp_fmt}  ",
+        f"> **Fecha publicación:** {fecha_pub_fmt}  ",
+        f"> **Departamento:** {departamento}  ",
+        f"> **Núm. oficial:** RD {numero_oficial}  ",
+        f"> **ELI:** <{url_eli}>",
+        "",
+    ]
+    md.extend(_render_body(texto))
+
+    output = "\n".join(md)
+    # Collapse excessive blank lines.
+    output = re.sub(r"\n{4,}", "\n\n\n", output)
+
+    with open(MD_PATH, "w", encoding="utf-8") as f:
+        f.write(output)
+
+    print(f"✓ Generado: {MD_PATH}")
+    lines = output.count("\n")
+    print(f"  {lines} líneas, {len(output)} caracteres")
+
+# Run the conversion only when executed as a script, not when imported.
+if __name__ == "__main__":
+    convert()