# Code/app/fiches/generer.py

"""Module de génération des fiches pour l'application.

Fonctions principales :
1. `remplacer_latex_par_mathml`
2. `markdown_to_html_rgaa`
3. `rendu_html`
4. `generer_fiche`

Toutes ces fonctions gèrent la conversion et le rendu de contenu Markdown
vers du HTML structuré avec des mathématiques, respectant les règles RGAA.
"""

import os
import re

import markdown
import pypandoc
import streamlit as st
import yaml
from bs4 import BeautifulSoup
from latex2mathml.converter import convert as latex_to_mathml

from app.fiches.utils import (
    build_dynamic_sections,
    build_ihh_sections,
    build_isg_sections,
    build_ivc_sections,
    build_minerai_sections,
    build_production_sections,
    render_fiche_markdown,
)


# === Fonctions de transformation ===
def remplacer_latex_par_mathml(markdown_text: str) -> str:
    """Replace the LaTeX formulas of a Markdown text with MathML markup.

    Display formulas (``$$...$$``) are substituted first and wrapped in a
    ``<div class="math-block">``; the remaining inline formulas (``$...$``)
    are then wrapped in a ``<span class="math-inline">``. When converting a
    formula raises, the formula is replaced by an error placeholder instead
    of aborting the whole substitution.

    Args:
        markdown_text: Markdown text that may contain LaTeX formulas.

    Returns:
        The text with every matched formula replaced by MathML markup (or by
        an error placeholder).
    """
    motif_display = re.compile(r"\$\$(.*?)\$\$", re.DOTALL)
    motif_inline = re.compile(r"(?<!\$)\$(.+?)\$(?!\$)", re.DOTALL)

    def convertir_display(trouve):
        source = trouve.group(1).strip()
        try:
            rendu = latex_to_mathml(source, display='block')
        except Exception as erreur:
            return f"<pre>Erreur LaTeX block: {erreur}</pre>"
        return f'<div class="math-block">{rendu}</div>'

    def convertir_inline(trouve):
        source = trouve.group(1).strip()
        try:
            rendu = latex_to_mathml(source, display='inline')
        except Exception as erreur:
            return f"<code>Erreur LaTeX inline: {erreur}</code>"
        return f'<span class="math-inline">{rendu}</span>'

    # Display formulas must go first: otherwise the inline pattern could
    # start matching inside a ``$$...$$`` pair.
    sans_display = motif_display.sub(convertir_display, markdown_text)
    return motif_inline.sub(convertir_inline, sans_display)

def markdown_to_html_rgaa(markdown_text: str, caption_text: str | None = None) -> str:
    """Convert Markdown to HTML and annotate its tables for accessibility.

    The text is rendered with the ``tables`` Markdown extension, then every
    ``<table>`` receives ``role="table"`` and every ``<th>`` receives
    ``scope="col"``. When a caption text is given, each table also gets a
    ``summary`` attribute and a ``<caption>`` element carrying that text.

    Args:
        markdown_text: Markdown text to convert.
        caption_text: Text used as caption/summary of every table found.
            When ``None`` or empty, no caption and no ``summary`` attribute
            are added.

    Returns:
        The resulting HTML as a string.
    """
    html = markdown.markdown(markdown_text, extensions=['tables'])
    soup = BeautifulSoup(html, "html.parser")
    for table in soup.find_all("table"):
        table["role"] = "table"
        if caption_text:
            # Only set ``summary`` when there is a text: assigning None would
            # serialize as a bare, valueless attribute.
            table["summary"] = caption_text
            caption = soup.new_tag("caption")
            caption.string = caption_text
            # HTML requires <caption> to be the first child of its <table>.
            table.insert(0, caption)
        for th in table.find_all("th"):
            th["scope"] = "col"
    return str(soup)

def rendu_html(contenu_md: str) -> list[str]:
    """Render a Markdown document as a list of HTML fragments.

    The text is split on level-1 (``#``) and level-2 (``##``) headings:

    - the title of the first level-1 section (or ``"fiche"`` when it has
      none) becomes the ``<h1>`` of a wrapping ``<section>``;
    - every other titled level-1 section is announced by an ``<h2>``;
    - lines preceding the first ``##`` of a section form its introduction;
    - each level-2 subsection is rendered inside a ``<details>`` element
      whose ``<summary>`` is the subsection title.

    Introductions and subsections go through ``remplacer_latex_par_mathml``
    and ``markdown_to_html_rgaa``.

    Args:
        contenu_md: Markdown text to render.

    Returns:
        The HTML fragments in document order, from the opening ``<section>``
        tag to the closing one.
    """

    def section_non_vide(section):
        return bool(section["titre"] or section["intro"] or section["sections_n2"])

    sections_n1 = []
    # Subsections are kept as an ordered list of (title, lines) pairs rather
    # than a dict keyed by title, so that two subsections sharing the same
    # title inside one section are both kept instead of overwriting each other.
    section_n1_actuelle = {"titre": None, "intro": [], "sections_n2": []}
    section_n2_actuelle = None
    for ligne in contenu_md.split('\n'):
        if re.match(r'^#[^#]', ligne):
            if section_non_vide(section_n1_actuelle):
                sections_n1.append(section_n1_actuelle)
            section_n1_actuelle = {"titre": ligne.strip('# ').strip(), "intro": [], "sections_n2": []}
            section_n2_actuelle = None
        elif re.match(r'^##[^#]', ligne):
            section_n2_actuelle = ligne.strip('# ').strip()
            # The heading line itself is kept as the first content line.
            section_n1_actuelle["sections_n2"].append(
                (section_n2_actuelle, [f"## {section_n2_actuelle}"])
            )
        elif section_n2_actuelle:
            # The current subsection is always the last one appended.
            section_n1_actuelle["sections_n2"][-1][1].append(ligne)
        else:
            section_n1_actuelle["intro"].append(ligne)

    if section_non_vide(section_n1_actuelle):
        sections_n1.append(section_n1_actuelle)

    bloc_titre = sections_n1[0]["titre"] if sections_n1 and sections_n1[0]["titre"] else "fiche"
    # Slug used both as the <h1> id and as the aria-labelledby target.
    titre_id = re.sub(r'\W+', '-', bloc_titre.lower()).strip('-')

    html_output = [f'<section role="region" aria-labelledby="{titre_id}">', f'<h1 id="{titre_id}">{bloc_titre}</h1>']
    for bloc in sections_n1:
        # The main title is already rendered as <h1>; do not repeat it.
        if bloc["titre"] and bloc["titre"] != bloc_titre:
            html_output.append(f"<h2>{bloc['titre']}</h2>")
        if bloc["intro"]:
            intro_md = remplacer_latex_par_mathml("\n".join(bloc["intro"]))
            html_intro = markdown_to_html_rgaa(intro_md, None)
            html_output.append(html_intro)
        for sous_titre, contenu in bloc["sections_n2"]:
            section_md = remplacer_latex_par_mathml("\n".join(contenu))
            contenu_html = markdown_to_html_rgaa(section_md, caption_text=sous_titre)
            html_output.append(f"<details><summary>{sous_titre}</summary>{contenu_html}</details>")

    html_output.append("</section>")

    return html_output

def generer_fiche(md_source: str, dossier: str, nom_fichier: str, seuils: dict) -> str:
    """Generate the Markdown, PDF and HTML files of a fiche.

    Steps:

    1. Read the YAML front matter (if any) to find the fiche type and let the
       matching ``build_*_sections`` helper transform the Markdown source.
    2. Render the final Markdown with ``render_fiche_markdown`` and write it
       to ``Fiches/<dossier>/<nom_fichier>``.
    3. Convert that file to ``static/Fiches/<dossier>/<stem>.pdf`` through
       pandoc with the XeLaTeX engine. A failure is reported with
       ``st.error`` and does not prevent the HTML generation.
    4. Render the Markdown to HTML with ``rendu_html`` and write it to
       ``HTML/<dossier>/<stem>.html``.

    Args:
        md_source: Markdown source of the fiche, optionally starting with a
            YAML front matter delimited by ``---`` lines.
        dossier: Destination sub-directory (rubric) of the fiche.
        nom_fichier: File name of the Markdown file, extension included; the
            PDF and HTML names are derived by swapping the extension.
        seuils: Threshold values forwarded to ``render_fiche_markdown``.

    Returns:
        Path of the generated HTML file, relative to the current working
        directory.
    """
    front_match = re.match(r"(?s)^---\n(.*?)\n---\n", md_source)
    context = yaml.safe_load(front_match.group(1)) if front_match else None
    # safe_load yields None for an empty front matter and may yield a scalar
    # or a list; only a mapping can be queried with .get() below.
    if not isinstance(context, dict):
        context = {}

    type_fiche = context.get("type_fiche")
    if type_fiche == "indice":
        indice = context.get("indice_court")
        if indice == "ICS":
            md_source = build_dynamic_sections(md_source)
        elif indice == "IVC":
            md_source = build_ivc_sections(md_source)
        elif indice == "IHH":
            md_source = build_ihh_sections(md_source)
        elif indice == "ISG":
            md_source = build_isg_sections(md_source)
    elif type_fiche in ["assemblage", "fabrication"]:
        md_source = build_production_sections(md_source)
    elif type_fiche == "minerai":
        md_source = build_minerai_sections(md_source)

    contenu_md = render_fiche_markdown(md_source, seuils, license_path="assets/licence.md")

    md_path = os.path.join("Fiches", dossier, nom_fichier)
    os.makedirs(os.path.dirname(md_path), exist_ok=True)
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(contenu_md)

    # The PDF is generated automatically next to the other static assets.
    pdf_dir = os.path.join("static", "Fiches", dossier)
    os.makedirs(pdf_dir, exist_ok=True)

    # PDF and HTML share the stem of the Markdown file name.
    nom_base = os.path.splitext(nom_fichier)[0]
    pdf_path = os.path.join(pdf_dir, nom_base + ".pdf")

    try:
        pypandoc.convert_file(
            md_path,
            to="pdf",
            outputfile=pdf_path,
            extra_args=["--pdf-engine=xelatex", "-V", "geometry:margin=2cm"]
        )
    except Exception as e:
        # Best effort: a missing pandoc/XeLaTeX toolchain must not block the
        # HTML output, so the error is only surfaced in the Streamlit UI.
        st.error(f"[ERREUR] Génération PDF échouée pour {md_path}: {e}")

    html_output = rendu_html(contenu_md)

    html_dir = os.path.join("HTML", dossier)
    os.makedirs(html_dir, exist_ok=True)
    html_path = os.path.join(html_dir, nom_base + ".html")
    with open(html_path, "w", encoding="utf-8") as f:
        f.write("\n".join(html_output))

    return html_path