Code/IA/00 - fiches_corpus/generate_corpus.py
2025-06-04 08:35:53 +02:00

97 lines
3.6 KiB
Python

import os
import re
import shutil
from pathlib import Path
# Directory names that are skipped, at any depth, while walking the source tree.
EXCLUDE_DIRS = {"Local"}
# Not used here: sections are split based on the presence of "###" headings instead.
MAX_SECTION_LENGTH = 1200
def slugify(text):
    """Turn a heading into a lowercase, hyphen-separated slug.

    Surrounding whitespace is trimmed, every run of non-word characters is
    collapsed into a single hyphen, hyphens left at either end are removed,
    and the result is lowercased. Underscores and Unicode letters/digits
    count as word characters and are therefore kept.
    """
    trimmed = text.strip()
    hyphenated = re.sub(r'\W+', '-', trimmed)
    without_edge_hyphens = hyphenated.strip('-')
    return without_edge_hyphens.lower()
def _next_fence_state(fence, line):
    """Return the fenced-code-block state after consuming *line*.

    *fence* is ``None`` when outside a fenced code block, otherwise the
    ``(char, length)`` pair describing the fence that opened the block.
    Follows the CommonMark rules: a fence is a run of at least three
    backticks or tildes indented by at most three spaces; it is closed by a
    run of the same character that is at least as long and is followed only
    by whitespace.
    """
    stripped = line.lstrip(" ")
    if len(line) - len(stripped) > 3:
        # Four or more leading spaces: never a fence delimiter.
        return fence
    marker = stripped[:1]
    if marker not in ("`", "~"):
        return fence
    run = len(stripped) - len(stripped.lstrip(marker))
    if run < 3:
        return fence
    rest = stripped[run:]
    if fence is None:
        # The info string of a backtick fence may not contain backticks;
        # otherwise the line is inline code, not an opening fence.
        if marker == "`" and "`" in rest:
            return None
        return (marker, run)
    open_char, open_len = fence
    if marker == open_char and run >= open_len and not rest.strip():
        return None
    return fence


def split_markdown_sections_refined(content):
    """Split Markdown text into level-2 sections and their level-3 subsections.

    Args:
        content: Full Markdown document as a single string.

    Returns:
        A list of ``(title, section_lines, subsections)`` tuples, one per
        ``## `` heading in document order. ``section_lines`` holds the lines
        between the heading and its first ``### `` heading (or the next
        section). ``subsections`` is a list of ``(sub_title, sub_lines)``
        tuples. Anything before the first ``## `` heading is discarded.

    Lines inside fenced code blocks are never interpreted as headings, and a
    ``## `` heading with an empty title still produces its own section
    (with title ``""``) instead of leaking its body into the next one.
    """
    sections = []
    title = None          # None until the first "## " heading is seen
    body = []
    subsections = []
    current = None        # (sub_title, sub_lines) of the open "### " block
    fence = None          # open code fence, see _next_fence_state

    for line in content.splitlines():
        in_code = fence is not None
        fence = _next_fence_state(fence, line)

        if not in_code and line.startswith("## "):
            # Compare against None: an empty title is still a real section.
            if title is not None:
                if current is not None:
                    subsections.append(current)
                sections.append((title, body, subsections))
                body, subsections, current = [], [], None
            title = line[3:].strip()
        elif title is None:
            # Preamble before the first section is ignored.
            continue
        elif not in_code and line.startswith("### "):
            if current is not None:
                subsections.append(current)
            current = (line[4:].strip(), [])
        elif current is not None:
            current[1].append(line)
        else:
            body.append(line)

    # Flush the section that was still open at end of input.
    if title is not None:
        if current is not None:
            subsections.append(current)
        sections.append((title, body, subsections))
    return sections
def process_markdown_file(md_path, rel_output_dir):
    """Split one Markdown file into per-section files under *rel_output_dir*.

    The file is read as UTF-8 and parsed with
    ``split_markdown_sections_refined``. Each section gets a zero-padded,
    slugified base name (``NN-title``):

    * a section without subsections becomes ``NN-title.md``;
    * a section with subsections becomes a directory ``NN-title/`` holding
      ``_intro.md`` (the text before the first subsection) and one
      ``MM-subtitle.md`` file per subsection.

    Every output file starts with its heading line, followed by the body
    with leading and trailing whitespace stripped.

    Args:
        md_path: Path of the Markdown file to read.
        rel_output_dir: ``Path`` of the directory that receives the output;
            it is expected to exist already.
    """
    def _write(target, heading_line, body_lines):
        # Heading first, then the trimmed body, exactly as two writes.
        with open(target, "w", encoding="utf-8") as out:
            out.write(heading_line)
            out.write("\n".join(body_lines).strip())

    with open(md_path, encoding="utf-8") as src:
        text = src.read()

    parsed = split_markdown_sections_refined(text)
    for position, (title, intro_lines, children) in enumerate(parsed):
        stem = f"{position:02d}-{slugify(title)}"

        if not children:
            # Flat section: a single file next to its siblings.
            _write(rel_output_dir / f"{stem}.md", f"## {title}\n", intro_lines)
            continue

        folder = rel_output_dir / stem
        folder.mkdir(parents=True, exist_ok=True)
        _write(folder / "_intro.md", f"## {title}\n", intro_lines)

        for child_pos, (child_title, child_lines) in enumerate(children):
            child_name = f"{child_pos:02d}-{slugify(child_title)}.md"
            _write(folder / child_name, f"### {child_title}\n", child_lines)
def build_corpus_structure():
    """Rebuild the ``Corpus`` tree from the Markdown files under ``Fiches``.

    Both directories are resolved relative to the directory three levels
    above this script. Any existing ``Corpus`` directory is deleted and
    recreated. For every ``*.md`` file found under ``Fiches`` (skipping
    directories named in ``EXCLUDE_DIRS`` and file names containing
    ``.md.``), a directory mirroring its relative path and named after the
    file stem is created under ``Corpus`` and filled by
    ``process_markdown_file``.

    Raises:
        FileNotFoundError: If the ``Fiches`` source directory does not exist.
            The check happens before the existing corpus is removed, so a
            misconfigured run cannot wipe previously generated output.
    """
    base_dir = Path(__file__).resolve().parent.parent.parent
    print(base_dir)
    source_dir = base_dir / "Fiches"
    dest_dir = base_dir / "Corpus"

    # Validate the input before destroying anything.
    if not source_dir.is_dir():
        raise FileNotFoundError(f"Source directory not found: {source_dir}")

    if dest_dir.exists():
        shutil.rmtree(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)

    for root, dirs, files in os.walk(source_dir):
        # Prune excluded directories in place so os.walk never descends
        # into them (top-down walking honours edits to ``dirs``).
        dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
        rel_path = Path(root).relative_to(source_dir)

        for file in files:
            if not file.endswith(".md") or ".md." in file:
                continue
            input_file = Path(root) / file
            output_dir = dest_dir / rel_path / Path(file).stem
            output_dir.mkdir(parents=True, exist_ok=True)
            process_markdown_file(input_file, output_dir)
# Script entry point: rebuild the corpus, then print a completion message
# once build_corpus_structure() has returned.
if __name__ == "__main__":
    build_corpus_structure()
    print("✅ Corpus généré avec succès dans le dossier 'Corpus/'")