# Code/IA/00 - fiches_corpus/generate_corpus.py

import os
import re
import shutil
from pathlib import Path

# Directory names skipped while walking the source tree: any path that
# contains one of these names as a component is ignored.
EXCLUDE_DIRS = {"Local"}
# Not used here, because splitting is driven by the presence of "###" headings.
MAX_SECTION_LENGTH = 1200

def slugify(text):
    """Turn *text* into a lowercase, hyphen-separated slug.

    Surrounding whitespace is stripped, every run of non-word characters
    is collapsed into a single hyphen, hyphens left at either end are
    removed, and the result is lowercased. Underscores and Unicode
    letters/digits count as word characters and are kept.
    """
    trimmed = text.strip()
    hyphenated = re.sub(r'\W+', '-', trimmed)
    return hyphenated.strip('-').lower()

def split_markdown_sections_refined(content):
    """Split markdown text into ``##`` sections and their ``###`` subsections.

    Lines that come before the first ``## `` heading are discarded, as are
    ``### `` headings that appear before any section has been opened.
    Heading detection is purely prefix based, so heading-like lines inside
    fenced code blocks are treated as headings too.

    Args:
        content: Full markdown text.

    Returns:
        A list of ``(title, intro_lines, subsections)`` tuples in document
        order. ``intro_lines`` holds the lines between the ``##`` heading
        and its first ``###`` heading; ``subsections`` is a list of
        ``(sub_title, sub_lines)`` tuples.
    """
    sections = []
    header_level_2 = None  # None means "no section opened yet"
    section_lines = []
    subsections = []
    current_subsection = None

    for line in content.splitlines():
        if line.startswith("## "):
            # Compare against None rather than relying on truthiness: a
            # heading with an empty title ("## ") is still a section, and
            # treating it as "no section" would leak its lines into the
            # next section.
            if header_level_2 is not None:
                if current_subsection is not None:
                    subsections.append(current_subsection)
                sections.append((header_level_2, section_lines, subsections))
                section_lines, subsections = [], []
                current_subsection = None
            header_level_2 = line[3:].strip()
        elif header_level_2 is None:
            # Preamble before the first "## " heading is ignored.
            continue
        elif line.startswith("### "):
            if current_subsection is not None:
                subsections.append(current_subsection)
            current_subsection = (line[4:].strip(), [])
        elif current_subsection is not None:
            current_subsection[1].append(line)
        else:
            section_lines.append(line)

    # Flush the section that was still open at end of input.
    if header_level_2 is not None:
        if current_subsection is not None:
            subsections.append(current_subsection)
        sections.append((header_level_2, section_lines, subsections))
    return sections

def process_markdown_file(md_path, rel_output_dir):
    """Split one markdown file into per-section files under *rel_output_dir*.

    The file is read as UTF-8 and cut up by
    ``split_markdown_sections_refined``. Each ``##`` section is numbered by
    its position and named after its slugified title:

    * a section without ``###`` subsections becomes ``NN-slug.md``;
    * a section with subsections becomes a ``NN-slug/`` directory holding
      ``_intro.md`` (the text before the first subsection) plus one
      ``MM-slug.md`` file per subsection.

    Every written file starts with its heading line, followed by the body
    with leading/trailing whitespace stripped.
    """

    def write_chunk(target, heading, body_lines):
        # Heading line first, then the stripped body (no trailing newline).
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(f"{heading}\n")
            handle.write("\n".join(body_lines).strip())

    with open(md_path, encoding="utf-8") as source:
        parsed = split_markdown_sections_refined(source.read())

    for position, (title, intro_lines, children) in enumerate(parsed):
        stem = f"{position:02d}-{slugify(title)}"

        # Flat case: the whole section goes into a single file.
        if not children:
            write_chunk(rel_output_dir / f"{stem}.md", f"## {title}", intro_lines)
            continue

        # Nested case: one directory per section, one file per subsection.
        section_dir = rel_output_dir / stem
        section_dir.mkdir(parents=True, exist_ok=True)
        write_chunk(section_dir / "_intro.md", f"## {title}", intro_lines)
        for child_pos, (child_title, child_lines) in enumerate(children):
            child_name = f"{child_pos:02d}-{slugify(child_title)}.md"
            write_chunk(section_dir / child_name, f"### {child_title}", child_lines)

def build_corpus_structure():
    """Rebuild the ``Corpus`` tree from the markdown files under ``Fiches``.

    Both directories live in the third ancestor directory of this script.
    Any existing ``Corpus`` directory is deleted and recreated. Every
    ``*.md`` file found under ``Fiches`` (skipping directories listed in
    ``EXCLUDE_DIRS`` and file names containing ``.md.``) is split by
    ``process_markdown_file`` into ``Corpus/<relative dir>/<file stem>/``.

    Raises:
        FileNotFoundError: If the ``Fiches`` source directory is missing.
    """
    base_dir = Path(__file__).resolve().parent.parent.parent
    print(base_dir)
    source_dir = base_dir / "Fiches"
    dest_dir = base_dir / "Corpus"

    # Validate the source BEFORE wiping the destination; otherwise a missing
    # source would silently destroy the previous corpus and leave it empty.
    if not source_dir.is_dir():
        raise FileNotFoundError(f"Source directory not found: {source_dir}")

    if dest_dir.exists():
        shutil.rmtree(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)

    for root, dirs, files in os.walk(source_dir):
        # Prune excluded directories in place so os.walk never descends
        # into them (top-down walks honour in-place edits of ``dirs``).
        dirs[:] = [name for name in dirs if name not in EXCLUDE_DIRS]
        rel_path = Path(root).relative_to(source_dir)
        for filename in files:
            # Keep plain markdown files only; names containing ".md." are skipped.
            if not filename.endswith(".md") or ".md." in filename:
                continue
            output_dir = dest_dir / rel_path / Path(filename).stem
            output_dir.mkdir(parents=True, exist_ok=True)
            process_markdown_file(Path(root) / filename, output_dir)

# Script entry point: rebuild the corpus, then print a confirmation message.
if __name__ == "__main__":
    build_corpus_structure()
    print("✅ Corpus généré avec succès dans le dossier 'Corpus/'")