# Code/index.py

#!/usr/bin/env python3
"""
index.py — indexation hybride des mini‑fiches
===========================================

• 1 fichier = 1 passage **si** le fichier ≤ WORD_LIMIT mots (par défaut : 600).
• Au‑delà (rare : fiche ICS, ISG, etc.), on découpe en blocs ~CHUNK mots
  avec chevauchement OVERLAP pour isoler les tableaux et valeurs numériques.
• Incrémental : encode uniquement les fichiers nouveaux ou modifiés.
• Embeddings : BGE‑M3 (FlagEmbedding) en CPU, normalisés L2.

Usage :
    python index.py --root Corpus               # première construction
    python index.py                              # relance rapide (0 s si rien)

Arguments :
    --root   dossier des fiches (déf. Corpus)
    --index  nom du fichier FAISS (déf. corpus.idx)
    --meta   fichier méta JSON (déf. corpus.meta.json)
    --word   WORD_LIMIT (déf. 600)
    --chunk  CHUNK (déf. 350)
"""

import argparse, json, re, sys
from pathlib import Path
import faiss, numpy as np
from FlagEmbedding import BGEM3FlagModel
from rich import print

# --------------------- CLI --------------------------------------------------
# Command-line options; every flag has a default, so the script also runs
# with no arguments at all.
parser = argparse.ArgumentParser(
    description="Indexation hybride : 1 passage par fiche courte, découpe douce pour les longues.",
)
parser.add_argument(
    "--root",
    default="Corpus",
    help="Répertoire racine des fiches",
)
parser.add_argument(
    "--index",
    default="corpus.idx",
    help="Nom du fichier FAISS",
)
parser.add_argument(
    "--meta",
    default="corpus.meta.json",
    help="Nom du méta JSON",
)
parser.add_argument(
    "--word",
    type=int,
    default=600,
    help="WORD_LIMIT : au‑delà on découpe (mots)",
)
parser.add_argument(
    "--chunk",
    type=int,
    default=350,
    help="Taille des chunks quand on découpe (mots)",
)
args = parser.parse_args()

# Settings shared by the rest of the script.
ROOT, INDEX_F, META_F = (Path(p) for p in (args.root, args.index, args.meta))
WORD_LIMIT, CHUNK = args.word, args.chunk
OVERLAP = 50  # overlap carried between chunks by split_long; not exposed on the CLI
EXTS = {".md", ".markdown", ".txt"}  # file suffixes picked up when scanning ROOT

print(f"[dim]Racine : {ROOT}  |  Index : {INDEX_F}[/]")

# ---------------- split helper --------------------------------------------
def split_long(text: str, chunk=None, overlap=None) -> list:
    """Split *text* into passages of roughly ``chunk`` words.

    The text is cut into sentences (whitespace following ``.``, ``!`` or
    ``?``).  Sentences are accumulated until the running word count reaches
    ``chunk``; the passage is then emitted and the trailing sentences that
    fit within ``overlap`` words are carried over as context for the next
    passage.  A sentence containing ``|`` (table row) or consisting only of
    a ``---`` rule is emitted as its own passage and resets the buffer, so
    tables are never merged with surrounding prose.

    Args:
        text: Full document text.
        chunk: Target passage size in words; defaults to the module-level
            ``CHUNK``.
        overlap: Maximum number of words repeated at the start of the next
            passage; defaults to the module-level ``OVERLAP``.  It is capped
            at ``chunk - 1`` so the carried-over tail can never fill a
            passage on its own.

    Returns:
        The list of passages, in document order.  Empty sentences are
        skipped, and a buffer holding only carried-over text is never
        emitted again.
    """
    chunk = CHUNK if chunk is None else chunk
    overlap = OVERLAP if overlap is None else overlap
    overlap = max(0, min(overlap, chunk - 1))

    chunks = []
    buf = []        # (sentence, word_count) pairs of the passage being built
    buf_words = 0   # running total, avoids re-joining the buffer each step
    fresh = False   # True once buf holds a sentence not emitted yet

    for sentence in re.split(r"(?<=[.!?])\s+", text):
        if not sentence.strip():
            continue

        if "|" in sentence or re.fullmatch(r"\s*-{3,}\s*", sentence):
            # Table row / rule: close the pending passage, emit it alone.
            if fresh:
                chunks.append(" ".join(s for s, _ in buf))
            buf, buf_words, fresh = [], 0, False
            chunks.append(sentence)
            continue

        n_words = len(sentence.split())
        buf.append((sentence, n_words))
        buf_words += n_words
        fresh = True

        if buf_words >= chunk:
            chunks.append(" ".join(s for s, _ in buf))
            # Keep the longest run of trailing sentences within `overlap` words.
            tail, tail_words = [], 0
            for s, k in reversed(buf):
                if tail_words + k > overlap:
                    break
                tail.append((s, k))
                tail_words += k
            tail.reverse()
            buf, buf_words, fresh = tail, tail_words, False

    if fresh:
        chunks.append(" ".join(s for s, _ in buf))
    return chunks

# ------------------------ load previous state -----------------------------
# Entry i of the meta list describes vector i of the FAISS index, so the
# previous meta is kept as an ordered list (plus a path -> positions map)
# instead of being collapsed to one entry per path.
old_entries = []   # previous meta entries, in index order
old_by_path = {}   # relative path -> positions of its passages in the old index
old_index = None   # previous FAISS index, set only when it matches the meta
if INDEX_F.exists() and META_F.exists():
    try:
        with META_F.open(encoding="utf-8") as fh:
            loaded = json.load(fh)
        candidate = faiss.read_index(str(INDEX_F))
        # A length mismatch means index and meta are out of step; reusing
        # them would attach passages to the wrong vectors.
        if not isinstance(loaded, list) or candidate.ntotal != len(loaded):
            raise ValueError("méta et index désynchronisés")
        by_path = {}
        for pos, m in enumerate(loaded):
            by_path.setdefault(m["path"], []).append(pos)
    except (OSError, ValueError, KeyError, TypeError, RuntimeError) as e:
        print(f"[yellow]Avertissement : méta illisible ({e}), reconstruction complète.[/]")
    else:
        old_entries, old_by_path, old_index = loaded, by_path, candidate

# ------------------------ scan files --------------------------------------
# is_file() skips directories that happen to carry one of the suffixes.
files = sorted(
    fp for fp in ROOT.rglob("*") if fp.is_file() and fp.suffix.lower() in EXTS
)

new_docs, new_meta = [], []    # passages to encode and their meta entries
kept_pos, kept_meta = [], []   # old index positions reused as-is, with their meta

for fp in files:
    rel = str(fp.relative_to(ROOT))
    mtime = int(fp.stat().st_mtime)
    positions = old_by_path.get(rel)
    # Unchanged file: reuse every passage it had (all parts, not just one).
    if positions and all(old_entries[i].get("mtime") == mtime for i in positions):
        kept_pos.extend(positions)
        kept_meta.extend(old_entries[i] for i in positions)
        continue

    txt = fp.read_text(encoding="utf-8")
    if len(txt.split()) <= WORD_LIMIT:
        passages = [txt]
    else:
        passages = split_long(txt)
    for part, passage in enumerate(passages):
        new_docs.append(passage)
        new_meta.append({"path": rel, "part": part, "mtime": mtime})

# Old passages whose file was modified or deleted must leave the index.
stale = len(old_entries) - len(kept_pos)

print(f"Nouveaux/Modifiés : {len(new_meta)} | Conservés : {len(kept_meta)}")
if old_index is not None and not new_docs and stale == 0:
    print("Index déjà à jour ✔︎")
    sys.exit(0)
if old_index is None and not new_docs:
    # Nothing to encode and no usable index to take the dimension from.
    print(f"[red]Aucune fiche à indexer sous {ROOT}.[/]")
    sys.exit(1)

# ------------------------ collect vectors ---------------------------------
# Rows are assembled in the same order as the meta: kept passages first,
# then the newly encoded ones.
parts = []
if kept_pos:
    previous = old_index.reconstruct_n(0, old_index.ntotal)
    parts.append(previous[kept_pos])

if new_docs:
    model = BGEM3FlagModel("BAAI/bge-m3", device="cpu")
    emb = model.encode(new_docs)
    if isinstance(emb, dict):
        emb = next(v for v in emb.values() if isinstance(v, np.ndarray))
    emb = np.asarray(emb, dtype="float32")
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave all-zero rows untouched instead of producing NaN
    parts.append(emb / norms)

# ------------------------ FAISS rebuild -----------------------------------
# The index is rebuilt rather than appended to, which drops stale vectors
# and keeps vector ids aligned with meta positions.
if parts:
    vectors = np.ascontiguousarray(np.vstack(parts), dtype="float32")
    idx = faiss.IndexFlatIP(vectors.shape[1])
    idx.add(vectors)
else:
    # Every previously indexed file is gone: write an empty index.
    idx = faiss.IndexFlatIP(old_index.d)

faiss.write_index(idx, str(INDEX_F))

# ------------------------ save meta ---------------------------------------
all_meta = kept_meta + new_meta
with META_F.open("w", encoding="utf-8") as fh:
    json.dump(all_meta, fh, ensure_ascii=False, indent=2)
print(f"Index mis à jour ✔︎ | Total passages : {idx.ntotal}")