"""Split the Markdown files under ``Fiches/`` into a per-section ``Corpus/`` tree."""

import os
import re
import shutil
from pathlib import Path

EXCLUDE_DIRS = {"Local"}
MAX_SECTION_LENGTH = 1200  # Not used here: splitting is driven by "###" headings.

# Marker of a fenced code block: at most three spaces of indentation followed
# by a run of three or more backticks or tildes.
_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")


def slugify(text):
    """Return a lowercase, dash-separated slug suitable for a file name.

    Every run of non-word characters becomes a single ``-``; leading and
    trailing dashes are removed.  The result may be empty when *text*
    contains no word characters.
    """
    return re.sub(r"\W+", "-", text.strip()).strip("-").lower()


def _match_fence(line):
    """Return ``(marker, rest)`` if *line* starts with a code fence, else None.

    ``marker`` is the run of backticks/tildes and ``rest`` is whatever follows
    it on the line.  A backtick run followed by more backticks on the same
    line is inline code, not a fence, so it is rejected.
    """
    match = _FENCE_RE.match(line)
    if match is None:
        return None
    marker = match.group(1)
    rest = line[match.end():]
    if marker[0] == "`" and "`" in rest:
        return None
    return marker, rest


def split_markdown_sections_refined(content):
    """Split Markdown text into ``##`` sections and their ``###`` subsections.

    Args:
        content: Full text of a Markdown document.

    Returns:
        A list of ``(title, section_lines, subsections)`` tuples in document
        order.  ``section_lines`` holds the lines between the ``##`` heading
        and its first ``###`` heading; ``subsections`` is a list of
        ``(sub_title, sub_lines)`` tuples.  Lines located before the first
        ``##`` heading are discarded.

    Heading-like lines inside fenced code blocks are kept as ordinary content
    instead of starting a new section.
    """
    sections = []
    current_title = None
    section_lines = []
    subsections = []
    current_subsection = None
    open_fence = None  # marker of the fence currently open, if any

    for line in content.splitlines():
        fence = _match_fence(line)
        if fence is not None:
            marker, rest = fence
            if open_fence is None:
                open_fence = marker
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and not rest.strip()
            ):
                # A closing fence uses the same character, is at least as
                # long as the opening one and carries no info string.
                open_fence = None
        elif open_fence is None:
            if line.startswith("## "):
                # Compare against None (not truthiness) so that a section
                # with an empty title is still flushed and does not leak its
                # lines into the following section.
                if current_title is not None:
                    if current_subsection is not None:
                        subsections.append(current_subsection)
                    sections.append((current_title, section_lines, subsections))
                    section_lines, subsections = [], []
                    current_subsection = None
                current_title = line[3:].strip()
                continue
            if line.startswith("### ") and current_title is not None:
                if current_subsection is not None:
                    subsections.append(current_subsection)
                current_subsection = (line[4:].strip(), [])
                continue

        if current_title is None:
            continue  # preamble before the first "##" heading is dropped
        if current_subsection is not None:
            current_subsection[1].append(line)
        else:
            section_lines.append(line)

    if current_title is not None:
        if current_subsection is not None:
            subsections.append(current_subsection)
        sections.append((current_title, section_lines, subsections))
    return sections


def _write_markdown(path, heading, body_lines):
    """Write *heading* on its own line followed by the stripped body text."""
    with open(path, "w", encoding="utf-8") as f_out:
        f_out.write(f"{heading}\n" + "\n".join(body_lines).strip())


def process_markdown_file(md_path, rel_output_dir):
    """Split one Markdown file into section files under *rel_output_dir*.

    A section without subsections becomes ``NN-slug.md``.  A section with
    subsections becomes a directory ``NN-slug/`` holding ``_intro.md`` (the
    text before the first ``###``) and one ``MM-slug.md`` per subsection.

    Args:
        md_path: Path of the UTF-8 Markdown file to read.
        rel_output_dir: Existing directory that receives the generated files.
    """
    rel_output_dir = Path(rel_output_dir)
    content = Path(md_path).read_text(encoding="utf-8")

    sections = split_markdown_sections_refined(content)
    for idx, (sec_title, sec_lines, subsections) in enumerate(sections):
        base_name = f"{idx:02d}-{slugify(sec_title)}"
        if not subsections:
            _write_markdown(
                rel_output_dir / f"{base_name}.md", f"## {sec_title}", sec_lines
            )
            continue

        sec_dir = rel_output_dir / base_name
        sec_dir.mkdir(parents=True, exist_ok=True)
        _write_markdown(sec_dir / "_intro.md", f"## {sec_title}", sec_lines)
        for sub_idx, (sub_title, sub_lines) in enumerate(subsections):
            sub_name = f"{sub_idx:02d}-{slugify(sub_title)}.md"
            _write_markdown(sec_dir / sub_name, f"### {sub_title}", sub_lines)


def build_corpus_structure(source_dir=None, dest_dir=None):
    """Rebuild the corpus tree from every Markdown file of the source tree.

    Args:
        source_dir: Directory to scan; defaults to ``Fiches`` next to this
            file.
        dest_dir: Directory to (re)create; defaults to ``Corpus`` next to
            this file.  Any existing content is deleted first.

    Raises:
        FileNotFoundError: If the source directory does not exist.  The check
            runs before the destination is removed, so an existing corpus is
            never wiped when there is nothing to rebuild it from.
    """
    if source_dir is None or dest_dir is None:
        base_dir = Path(__file__).resolve().parent
    source_dir = base_dir / "Fiches" if source_dir is None else Path(source_dir)
    dest_dir = base_dir / "Corpus" if dest_dir is None else Path(dest_dir)

    if not source_dir.is_dir():
        raise FileNotFoundError(f"Source directory not found: {source_dir}")

    if dest_dir.exists():
        shutil.rmtree(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)

    for root, dirs, files in os.walk(source_dir):
        # Prune excluded directories in place so os.walk never descends into
        # them; sorting makes the traversal order deterministic.
        dirs[:] = sorted(d for d in dirs if d not in EXCLUDE_DIRS)
        rel_path = Path(root).relative_to(source_dir)

        for file in sorted(files):
            if not file.endswith(".md") or ".md." in file:
                continue
            output_dir = dest_dir / rel_path / Path(file).stem
            output_dir.mkdir(parents=True, exist_ok=True)
            process_markdown_file(Path(root) / file, output_dir)


if __name__ == "__main__":
    build_corpus_structure()
    print("✅ Corpus généré avec succès dans le dossier 'Corpus/'")