# Code/index.py

#!/usr/bin/env python3
"""
index.py — Indexation « mini‑fiches » SANS découpage
====================================================

Objectif : chaque fichier (chapitre) devient **un seul** passage, afin de
préserver l’intégrité des tableaux, listes, etc. — conformément à votre
organisation manuelle.

Caractéristiques :
• Incrémental : seuls les fichiers nouveaux ou modifiés sont ré‑encodés.
• Paramètres CLI :
    --root   racine des fiches   (défaut : Corpus)
    --index  nom du fichier idx  (défaut : corpus.idx)
    --meta   nom du fichier méta (défaut : corpus.meta.json)
• Extensions prises : .md .markdown .txt
• Embeddings : BGE‑M3 (FlagEmbedding) en CPU, normalisés L2.

Usage :
    python index.py                   # première indexation (tous les fichiers)
    python index.py                   # relance instantanée (rien à faire)
    touch Corpus/…/nouveau.md
    python index.py                   # encode seulement 1 fichier
"""

import argparse, json, os, time
from pathlib import Path

import faiss, numpy as np
from FlagEmbedding import BGEM3FlagModel
from rich import print

# --------------------- CLI --------------------------------------------------
# Three optional arguments select the corpus root, the FAISS index file and
# the JSON metadata file. The rest of the script reads the resulting
# module-level constants ROOT, INDEX_F, META_F and EXTS.
parser = argparse.ArgumentParser(description="Indexation incrémentale des mini‑fiches (1 fichier = 1 passage).")
parser.add_argument("--root",  default="Corpus", help="Répertoire racine des fiches")
parser.add_argument("--index", default="corpus.idx", help="Nom du fichier FAISS")
parser.add_argument("--meta",  default="corpus.meta.json", help="Nom du méta JSON")
args = parser.parse_args()

# Expand "~" on every user-supplied path, not only on the root: the shell
# does not expand it in the `--index=~/x.idx` form, which would otherwise
# create a directory literally named "~".
ROOT    = Path(args.root).expanduser()
INDEX_F = Path(args.index).expanduser()
META_F  = Path(args.meta).expanduser()
# File suffixes (compared lower-cased) that are considered part of the corpus.
EXTS    = {".md", ".markdown", ".txt"}

print(f"[dim]Racine : {ROOT}  |  Index : {INDEX_F}[/]")

# ------------------------ read existing metadata ---------------------------
# old_meta  : records ({"path", "mtime"}) saved by the previous run, in the
#             order they were written
# old_mtime : path -> mtime lookup derived from old_meta
# Both stay empty when either the index or the metadata file is missing, or
# when the metadata cannot be read.
old_meta = []
old_mtime = {}
if INDEX_F.exists() and META_F.exists():
    try:
        # Context manager so the handle is closed even if parsing fails.
        # NOTE(review): the file is opened with the platform default encoding,
        # matching how it is written; change both sides together if at all.
        with META_F.open() as fh:
            old_meta = json.load(fh)
        old_mtime = {m["path"]: m["mtime"] for m in old_meta}
    except Exception as e:
        # Deliberately broad: any unreadable or malformed metadata file just
        # means the previous state cannot be trusted.
        print(f"[yellow]Avertissement : impossible de lire l'ancien méta : {e}. On repart de zéro.[/]")
        old_meta = []
        old_mtime = {}

# ------------------------ load the previous index --------------------------
# Invariant maintained below: row i of the FAISS index is the embedding of
# all_meta[i]. The index is therefore rebuilt on every update from
#   (vectors of unchanged files, copied from the old index) + (fresh vectors)
# in exactly the order the metadata is written, so modified files are
# replaced instead of duplicated and deleted files disappear.
if not ROOT.is_dir():
    # A missing root would look like "every file was deleted" and would
    # wipe the index; refuse to continue instead.
    print(f"[red]Erreur : répertoire racine introuvable : {ROOT}[/]")
    raise SystemExit(1)

old_idx = faiss.read_index(str(INDEX_F)) if INDEX_F.exists() else None
# Old vectors can only be reused when the index holds exactly one row per
# metadata record. (Equal counts do not prove alignment for files produced by
# other tools; delete the index file to force a clean rebuild.)
reusable = old_idx is not None and old_idx.ntotal == len(old_meta)
if old_idx is not None and not reusable:
    print("[yellow]Avertissement : index et méta désynchronisés. Ré-indexation complète.[/]")
# path -> row number in the old index (empty when nothing can be reused)
old_rows = {m["path"]: row for row, m in enumerate(old_meta)} if reusable else {}

# ------------------------ scan the files -----------------------------------
# is_file() filters out directories whose name happens to end in ".md" etc.
files = sorted(fp for fp in ROOT.rglob("*") if fp.is_file() and fp.suffix.lower() in EXTS)

new_docs, new_meta = [], []    # texts to encode and their metadata records
kept_meta, kept_rows = [], []  # unchanged records and their rows in old_idx

for fp in files:
    path_str = str(fp.relative_to(ROOT))
    mtime = int(fp.stat().st_mtime)  # whole seconds, as stored in the metadata
    # pop() marks the path as "still on disk"; O(1) instead of scanning old_meta
    row = old_rows.pop(path_str, None)
    if row is not None and old_meta[row]["mtime"] == mtime:
        # already indexed and unchanged: reuse its stored vector
        kept_meta.append(old_meta[row])
        kept_rows.append(row)
        continue
    # new or modified file
    new_docs.append(fp.read_text(encoding="utf-8"))
    new_meta.append({"path": path_str, "mtime": mtime})

# Whatever is left in old_rows was indexed before but is no longer on disk.
n_removed = len(old_rows)

print(f"Nouveaux/Modifiés : {len(new_docs)}  |  Conservés : {len(kept_meta)}")
if n_removed:
    print(f"Supprimés : {n_removed}")
if reusable and not new_docs and not n_removed:
    print("Index déjà à jour ✔︎")
    raise SystemExit(0)
if not new_docs and old_idx is None:
    # Empty corpus and no previous index: nothing to encode and no known
    # vector dimension, so no index can be created.
    print("[yellow]Aucun fichier à indexer.[/]")
    raise SystemExit(1)

# ------------------------ BGE-M3 embeddings --------------------------------
# The model is loaded only when there is something to encode.
emb = None
if new_docs:
    model = BGEM3FlagModel("BAAI/bge-m3", device="cpu")
    emb = model.encode(new_docs)
    if isinstance(emb, dict):
        # encode() may hand back a dict of outputs; use the first array-valued one
        emb = next(v for v in emb.values() if isinstance(v, np.ndarray))
    emb = np.asarray(emb, dtype="float32")
    # L2-normalise each row; a zero vector is left as is rather than becoming NaN
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    emb = emb / norms

# ------------------------ rebuild the FAISS index --------------------------
parts = []
if kept_rows:
    # Copy the stored vectors of unchanged files, in kept_meta order.
    # NOTE(review): relies on the stored index supporting reconstruction,
    # which holds for the IndexFlatIP files this script writes.
    parts.append(old_idx.reconstruct_n(0, old_idx.ntotal)[kept_rows])
if emb is not None:
    parts.append(emb)

# emb is None only when old_idx exists (the empty-corpus case exited above).
dim = emb.shape[1] if emb is not None else old_idx.d
idx = faiss.IndexFlatIP(dim)
if parts:
    idx.add(np.ascontiguousarray(np.vstack(parts), dtype="float32"))
faiss.write_index(idx, str(INDEX_F))

# ------------------------ save the new metadata ----------------------------
# Same order as the rows just added to the index: kept first, then new.
all_meta = kept_meta + new_meta
# NOTE(review): platform default encoding, matching how the file is read back.
with META_F.open("w") as fh:
    json.dump(all_meta, fh, ensure_ascii=False, indent=2)

print(f"Index mis à jour ✔︎  |  Total passages : {idx.ntotal}")