# taiage/scripts/md_a_audio_con_openai.py

import os
import re
from pathlib import Path
from openai import OpenAI

# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret lives in source control. The key that used to be hard-coded on this
# line must be treated as compromised and revoked in the OpenAI dashboard.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def limpiar_markdown(texto):
    """Strip Markdown syntax so the speech engine does not read symbols aloud.

    The substitutions run in this order: fenced code blocks are replaced by
    the placeholder ``[bloque de código]``; table rows are removed; runs of
    three or more ``-``/``:`` are removed; images are removed; links are
    reduced to their label; the remaining ``# * _ ~ ` >`` characters are
    removed.

    Args:
        texto: Raw Markdown text.

    Returns:
        The cleaned text with every run of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    texto = re.sub(r'```.*?```', ' [bloque de código] ', texto, flags=re.DOTALL)
    # Remove whole table rows (lines that start and end with a pipe) first.
    # Matching only ``|...|`` pairs consumes the pipes two at a time, so in a
    # row such as ``| a | b | c |`` the cell ``b`` would survive.
    texto = re.sub(r'^[ \t]*\|.*\|[ \t\r]*$', '', texto, flags=re.MULTILINE)
    texto = re.sub(r'\|.*?\|', '', texto)  # Pipe-delimited spans left inline
    texto = re.sub(r'[-:]{3,}', '', texto)  # Rules and separator dashes
    # Images must be handled before links: the link pattern would otherwise
    # match the ``[alt](src)`` tail of an image and keep its alt text.
    texto = re.sub(r'!\[.*?\]\(.*?\)', '', texto)  # Images
    texto = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', texto)  # Links -> label only
    texto = re.sub(r'[#*_~`>]', '', texto)
    return ' '.join(texto.split())

def dividir_texto(texto, max_chars=4000):
    """Split text into fragments of at most ``max_chars`` characters.

    Fragments are cut at whitespace whenever the window contains any, so
    words are not split in half when each fragment is synthesized on its own.
    A single word longer than ``max_chars`` is still hard-cut at the limit.

    Args:
        texto: Text to split.
        max_chars: Maximum length of each fragment; must be at least 1.

    Returns:
        A list of fragments in their original order. Whitespace around each
        fragment is stripped and whitespace-only fragments are omitted, so an
        empty or blank ``texto`` yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    fragmentos = []
    total = len(texto)
    inicio = 0
    while inicio < total:
        fin = min(inicio + max_chars, total)
        if fin < total and not texto[fin].isspace():
            # The cut would land inside a word: back up to just after the
            # last whitespace in the window, if there is one.
            corte = fin
            while corte > inicio and not texto[corte - 1].isspace():
                corte -= 1
            if corte > inicio:
                fin = corte
        trozo = texto[inicio:fin].strip()
        if trozo:
            fragmentos.append(trozo)
        inicio = fin
    return fragmentos

def convertir_md_a_audio(path_md):
    """Convert one Markdown file into an MP3 file saved next to it.

    The file is read as UTF-8, cleaned with ``limpiar_markdown`` and split
    with ``dividir_texto``. Each fragment is sent to the ``tts-1`` speech
    model in its own request, and the audio of all fragments is written, in
    order, to ``path_md`` with its suffix replaced by ``.mp3``.

    Args:
        path_md: ``pathlib.Path`` of the Markdown file to convert.

    Returns:
        None. Progress and errors are reported with ``print``. Any exception
        is caught and printed so that one failing file does not abort a batch
        run over several files.
    """
    audio_output = path_md.with_suffix('.mp3')

    print(f"\n--- Procesando: {path_md.name} ---")

    try:
        texto = path_md.read_text(encoding="utf-8")
        texto_limpio = limpiar_markdown(texto)

        if len(texto_limpio.strip()) < 5:
            print("⚠️ Texto insuficiente.")
            return

        # The speech endpoint limits the length of ``input``, so long
        # documents are synthesized one fragment at a time.
        fragmentos = dividir_texto(texto_limpio)

        # Recommended voices: 'alloy' (neutral), 'onyx' (deep),
        # 'nova' (energetic female).
        partes_audio = []
        for fragmento in fragmentos:
            response = client.audio.speech.create(
                model="tts-1",
                voice="onyx",
                input=fragmento,
            )
            partes_audio.append(response.read())

        # The file is written only after every fragment succeeded, so a
        # failure midway never leaves a truncated MP3 behind.
        # NOTE(review): the per-fragment MP3 streams are joined by plain byte
        # concatenation; confirm the target players handle this.
        audio_output.write_bytes(b"".join(partes_audio))
        print(f"✅ Audio generado con éxito: {audio_output}")

    except Exception as e:
        print(f"❌ Error en {path_md.name}: {e}")

def procesar_carpeta():
    """Convert every ``tema*.md`` file in the current directory to audio.

    Matching files are handled in sorted path order. When nothing matches,
    a notice is printed and no conversion is attempted.
    """
    pendientes = sorted(Path('.').glob('tema*.md'))
    if pendientes:
        for ruta_md in pendientes:
            convertir_md_a_audio(ruta_md)
    else:
        print("No se encontraron archivos .md")

# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    procesar_carpeta()