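"""Split the Markdown notes under ``Fiches/`` into a ``Corpus/`` tree.

Each ``## `` section of a note becomes its own file; a section that contains
``### `` subsections becomes a directory holding ``_intro.md`` plus one file
per subsection.
"""
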
import os
import re
import shutil
from pathlib import Path

# Directory names whose content is skipped when they appear anywhere in a
# path below the source directory.
EXCLUDE_DIRS = {"Local"}
# Not used here: the splitting is driven by the presence of "###" headings.
MAX_SECTION_LENGTH = 1200

def slugify(text: str) -> str:
    """Return a lowercase, hyphen-separated slug for *text*.

    Surrounding whitespace is stripped, every run of non-word characters is
    collapsed into a single ``-``, leading/trailing hyphens are removed and
    the result is lowercased. Word characters (letters, digits, underscore;
    Unicode-aware for ``str`` patterns) are kept, so accented letters
    survive. The result may be empty when *text* has no word characters.
    """
    hyphenated = re.sub(r'\W+', '-', text.strip())
    return hyphenated.strip('-').lower()

# Opening/closing marker of a fenced code block: up to three spaces of indent,
# then at least three backticks (info string without backticks) or tildes.
_FENCE_RE = re.compile(r"^ {0,3}(?:(`{3,})[^`]*|(~{3,}).*)$")


def split_markdown_sections_refined(content: str) -> list:
    """Split Markdown text into ``## `` sections and ``### `` subsections.

    Args:
        content: Full text of a Markdown document.

    Returns:
        A list of ``(title, intro_lines, subsections)`` tuples, one per
        ``## `` heading in document order. ``intro_lines`` holds the lines
        between the heading and its first ``### `` heading; ``subsections``
        is a list of ``(sub_title, sub_lines)`` tuples. Heading lines
        themselves are not included in the line lists.

    Notes:
        * Everything before the first ``## `` heading is ignored.
        * Lines inside fenced code blocks are never treated as headings, so
          a ``## comment`` in a code sample stays part of the content.
        * A heading with an empty title still opens its own section.
    """
    sections = []
    title = None          # title of the open "## " section; None before the first one
    intro_lines = []      # lines of the open section preceding any "### "
    subsections = []      # completed subsections of the open section
    current_sub = None    # (sub_title, sub_lines) being filled, if any
    open_fence = None     # marker string of the currently open code fence

    for line in content.splitlines():
        # Decide on heading detection using the fence state *before* this
        # line; a fence line itself can never be a heading anyway.
        in_code = open_fence is not None
        fence = _FENCE_RE.match(line)
        if fence:
            marker = fence.group(1) or fence.group(2)
            if open_fence is None:
                open_fence = marker
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and line.strip() == marker
            ):
                # A closing fence uses the same character, is at least as
                # long as the opener and carries no info string.
                open_fence = None

        if not in_code and line.startswith("## "):
            # Compare against None (not truthiness) so that a section whose
            # title is empty is still flushed instead of leaking its lines
            # into the next section.
            if title is not None:
                if current_sub is not None:
                    subsections.append(current_sub)
                sections.append((title, intro_lines, subsections))
            intro_lines, subsections, current_sub = [], [], None
            title = line[3:].strip()
        elif title is None:
            # Preamble before the first "## " heading is dropped.
            continue
        elif not in_code and line.startswith("### "):
            if current_sub is not None:
                subsections.append(current_sub)
            current_sub = (line[4:].strip(), [])
        elif current_sub is not None:
            current_sub[1].append(line)
        else:
            intro_lines.append(line)

    # Flush the section that was still open at end of input.
    if title is not None:
        if current_sub is not None:
            subsections.append(current_sub)
        sections.append((title, intro_lines, subsections))
    return sections

def _write_markdown(path, heading, body_lines):
    """Write *heading* on its own line, then the whitespace-stripped body."""
    with open(path, "w", encoding="utf-8") as f_out:
        f_out.write(f"{heading}\n")
        f_out.write("\n".join(body_lines).strip())


def process_markdown_file(md_path, rel_output_dir) -> None:
    """Split one Markdown file into per-section files under *rel_output_dir*.

    Args:
        md_path: Path of the Markdown file to read (UTF-8).
        rel_output_dir: Directory receiving the generated files; a ``str``
            or ``Path``. It is created if needed before anything is written.

    For each ``## `` section (numbered from ``00`` in document order):

    * without ``### `` subsections, one file ``NN-<slug>.md`` is written;
    * with subsections, a directory ``NN-<slug>/`` is created containing
      ``_intro.md`` (section heading and intro text) and one
      ``MM-<sub-slug>.md`` per subsection.

    A file without any ``## `` heading produces no output.
    """
    out_dir = Path(rel_output_dir)
    with open(md_path, encoding="utf-8") as f:
        content = f.read()

    sections = split_markdown_sections_refined(content)
    for idx, (sec_title, sec_lines, subsections) in enumerate(sections):
        # The numeric prefix keeps document order and guarantees a unique,
        # non-empty name even when two titles slugify identically.
        base_name = f"{idx:02d}-{slugify(sec_title)}"

        if not subsections:
            out_dir.mkdir(parents=True, exist_ok=True)
            _write_markdown(out_dir / f"{base_name}.md", f"## {sec_title}", sec_lines)
            continue

        sec_dir = out_dir / base_name
        sec_dir.mkdir(parents=True, exist_ok=True)
        _write_markdown(sec_dir / "_intro.md", f"## {sec_title}", sec_lines)
        for sub_idx, (sub_title, sub_lines) in enumerate(subsections):
            sub_name = f"{sub_idx:02d}-{slugify(sub_title)}.md"
            _write_markdown(sec_dir / sub_name, f"### {sub_title}", sub_lines)

def build_corpus_structure() -> None:
    """Regenerate the ``Corpus`` tree from the Markdown files in ``Fiches``.

    Both directories are resolved relative to the directory three levels
    above this file. Any existing ``Corpus`` directory is deleted and
    rebuilt. Every ``*.md`` file below ``Fiches`` (except those inside a
    directory listed in ``EXCLUDE_DIRS`` or whose name contains ``.md.``) is
    split into ``Corpus/<relative dir>/<file stem>/``.

    Raises:
        FileNotFoundError: If the ``Fiches`` directory does not exist. This
            is checked before the existing ``Corpus`` is removed, so a
            misplaced script cannot wipe a previously generated corpus.
    """
    base_dir = Path(__file__).resolve().parent.parent.parent
    print(base_dir)
    source_dir = base_dir / "Fiches"
    dest_dir = base_dir / "Corpus"

    if not source_dir.is_dir():
        raise FileNotFoundError(f"Source directory not found: {source_dir}")

    if dest_dir.exists():
        shutil.rmtree(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)

    for root, dirs, files in os.walk(source_dir):
        # Prune excluded directories in place so os.walk never descends into
        # them (top-down walking honours modifications of ``dirs``).
        dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
        rel_path = Path(root).relative_to(source_dir)

        for file in files:
            # Keep real Markdown files only; names such as "x.md.bak.md"
            # that embed ".md." are skipped.
            if not file.endswith(".md") or ".md." in file:
                continue
            input_file = Path(root) / file
            output_dir = dest_dir / rel_path / Path(file).stem
            output_dir.mkdir(parents=True, exist_ok=True)
            process_markdown_file(input_file, output_dir)

# Script entry point: rebuild the corpus, then report success.
if __name__ == "__main__":
    build_corpus_structure()
    print("✅ Corpus généré avec succès dans le dossier 'Corpus/'")