Update index.py

2025-05-19 07:40:57 +02:00 · 2025-05-19 07:40:57 +02:00 · e26fc3e20d
commit e26fc3e20d
parent 2c4931bdfe
1 changed files with 73 additions and 49 deletions
--- a/index.py
+++ b/index.py
@ -1,86 +1,111 @@
 #!/usr/bin/env python3
 """
-index.py — Indexation « mini‑fiches » SANS découpage
-====================================================
+index.py — indexation hybride des mini‑fiches
+===========================================

-Objectif : chaque fichier (chapitre) devient **un seul** passage, afin de
-préserver l’intégrité des tableaux, listes, etc. — conformément à votre
-organisation manuelle.
-
-Caractéristiques :
-• Incrémental : seuls les fichiers nouveaux ou modifiés sont ré‑encodés.
-• Paramètres CLI :
-    --root   racine des fiches   (défaut : Corpus)
-    --index  nom du fichier idx  (défaut : corpus.idx)
-    --meta   nom du fichier méta (défaut : corpus.meta.json)
-• Extensions prises : .md .markdown .txt
+• 1 fichier = 1 passage **si** le fichier ≤ WORD_LIMIT mots (par défaut : 600).
+• Au‑delà (rare : fiche ICS, ISG, etc.), on découpe en blocs ~CHUNK mots
+  avec chevauchement OVERLAP pour isoler les tableaux et valeurs numériques.
+• Incrémental : encode uniquement les fichiers nouveaux ou modifiés.
 • Embeddings : BGE‑M3 (FlagEmbedding) en CPU, normalisés L2.

 Usage :
-    python index.py                   # première indexation (tous les fichiers)
-    python index.py                   # relance instantanée (rien à faire)
-    touch Corpus/…/nouveau.md
-    python index.py                   # encode seulement 1 fichier
+    python index.py --root Corpus               # première construction
+    python index.py                              # relance rapide (0 s si rien)
+
+Arguments :
+    --root   dossier des fiches (déf. Corpus)
+    --index  nom du fichier FAISS (déf. corpus.idx)
+    --meta   fichier méta JSON (déf. corpus.meta.json)
+    --word   WORD_LIMIT (déf. 600)
+    --chunk  CHUNK (déf. 350)
 """

-import argparse, json, os, time
+import argparse, json, re, sys
 from pathlib import Path
-
 import faiss, numpy as np
 from FlagEmbedding import BGEM3FlagModel
 from rich import print

 # --------------------- CLI --------------------------------------------------
-parser = argparse.ArgumentParser(description="Indexation incrémentale des mini‑fiches (1 fichier = 1 passage).")
+parser = argparse.ArgumentParser(description="Indexation hybride : 1 passage par fiche courte, découpe douce pour les longues.")
 parser.add_argument("--root",  default="Corpus", help="Répertoire racine des fiches")
 parser.add_argument("--index", default="corpus.idx", help="Nom du fichier FAISS")
 parser.add_argument("--meta",  default="corpus.meta.json", help="Nom du méta JSON")
+parser.add_argument("--word",  type=int, default=600, help="WORD_LIMIT : au‑delà on découpe (mots)")
+parser.add_argument("--chunk", type=int, default=350, help="Taille des chunks quand on découpe (mots)")
 args = parser.parse_args()

-ROOT      = Path(args.root).expanduser()
+ROOT      = Path(args.root)
 INDEX_F   = Path(args.index)
 META_F    = Path(args.meta)
+WORD_LIMIT= args.word
+CHUNK     = args.chunk
+OVERLAP   = 50
 EXTS      = {".md", ".markdown", ".txt"}

 print(f"[dim]Racine : {ROOT}  |  Index : {INDEX_F}[/]")

-# ------------------------ lire méta existant -------------------------------
-old_meta = []
-old_mtime = {}
+# ---------------- split helper --------------------------------------------
+def split_long(text: str) -> list:
+    """Split *text* into passages of roughly CHUNK words.
+
+    The text is cut into sentences after ``.``, ``!`` or ``?``.  A sentence
+    that contains ``|`` or consists only of a ``---`` rule is treated as
+    table material: the passage being built is closed and that sentence is
+    emitted as a passage of its own.  Other sentences are accumulated until
+    the buffer reaches CHUNK words; the next passage then restarts with the
+    trailing sentences of the previous one, limited to OVERLAP *words*.
+
+    Returns the list of passages in document order.
+    """
+    sentences = re.split(r"(?<=[.!?])\s+", text)
+    # The overlap is capped below CHUNK so that a freshly restarted buffer
+    # can never already be "full" (which would emit one passage per sentence).
+    max_tail = min(OVERLAP, CHUNK - 1)
+    chunks = []
+    buf = []          # sentences of the passage being built
+    buf_words = 0     # running word count of buf (avoids re-joining each time)
+    fresh = False     # True once buf holds a sentence not emitted yet
+
+    for s in sentences:
+        if not s.strip():
+            # re.split leaves an empty fragment after trailing punctuation.
+            continue
+        if "|" in s or re.fullmatch(r"\s*-{3,}\s*", s):
+            # Skip the flush when buf only holds overlap that was already
+            # emitted with the previous passage.
+            if fresh:
+                chunks.append(" ".join(buf))
+            buf, buf_words, fresh = [], 0, False
+            chunks.append(s)
+            continue
+        buf.append(s)
+        buf_words += len(s.split())
+        fresh = True
+        if buf_words >= CHUNK:
+            chunks.append(" ".join(buf))
+            # Carry over whole trailing sentences while they fit in the
+            # word budget; slicing buf[-OVERLAP:] would count sentences and
+            # keep the entire buffer.
+            tail, tail_words = [], 0
+            for prev in reversed(buf):
+                n_words = len(prev.split())
+                if tail_words + n_words > max_tail:
+                    break
+                tail.insert(0, prev)
+                tail_words += n_words
+            buf, buf_words, fresh = tail, tail_words, False
+    if fresh:
+        chunks.append(" ".join(buf))
+    return chunks
+
+# ------------------------ lire méta existant ------------------------------
+old_meta = {}
 if INDEX_F.exists() and META_F.exists():
    try:
-        old_meta = json.load(META_F.open())
-        old_mtime = {m["path"]: m["mtime"] for m in old_meta}
+        for m in json.load(META_F.open()):
+            old_meta[m["path"]] = m
    except Exception as e:
-        print(f"[yellow]Avertissement : impossible de lire l'ancien méta : {e}. On repart de zéro.[/]")
-        old_meta = []
-        old_mtime = {}
+        print(f"[yellow]Avertissement : méta illisible ({e}), reconstruction complète.[/]")
+        old_meta = {}

-# ------------------------ scanner les fichiers -----------------------------
+# ------------------------ scanner les fichiers ----------------------------
 files = [fp for fp in ROOT.rglob("*") if fp.suffix.lower() in EXTS]
 files.sort()

-new_docs, new_meta = [], []
-kept_meta = []  # meta non modifiés
+new_docs, new_meta, kept_meta = [], [], []

 for fp in files:
-    path_str = str(fp.relative_to(ROOT))
-    mtime    = int(fp.stat().st_mtime)
-    if path_str in old_mtime and old_mtime[path_str] == mtime:
-        # déjà indexé, rien à faire
-        kept_meta.append(next(m for m in old_meta if m["path"] == path_str))
+    rel = str(fp.relative_to(ROOT))
+    mtime = int(fp.stat().st_mtime)
+    prev = old_meta.get(rel)
+    if prev and prev["mtime"] == mtime:
+        kept_meta.append(prev)
        continue
-    # fichier nouveau ou modifié
-    txt = fp.read_text(encoding="utf-8")
-    new_docs.append(txt)
-    new_meta.append({"path": path_str, "mtime": mtime})

-print(f"Nouveaux/Modifiés : {len(new_docs)}  |  Conservés : {len(kept_meta)}")
-if not new_docs and INDEX_F.exists():
+    txt   = fp.read_text(encoding="utf-8")
+    words = len(txt.split())
+    if words <= WORD_LIMIT:
+        new_docs.append(txt)
+        new_meta.append({"path": rel, "part": 0, "mtime": mtime})
+    else:
+        for i, chunk in enumerate(split_long(txt)):
+            new_docs.append(chunk)
+            new_meta.append({"path": rel, "part": i, "mtime": mtime})
+
+print(f"Nouveaux/Modifiés : {len(new_meta)} | Conservés : {len(kept_meta)}")
+if not new_meta and INDEX_F.exists():
    print("Index déjà à jour ✔︎")
-    exit(0)
+    sys.exit(0)

-# ------------------------ embeddings BGE‑M3 ---------------------------------
+# ------------------------ embeddings --------------------------------------
 model = BGEM3FlagModel("BAAI/bge-m3", device="cpu")
 emb = model.encode(new_docs)
 if isinstance(emb, dict):
@ -88,7 +113,7 @@ if isinstance(emb, dict):
 emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
 emb = emb.astype("float32")

-# ------------------------ mise à jour FAISS --------------------------------
+# ------------------------ FAISS update ------------------------------------
 if INDEX_F.exists():
    idx = faiss.read_index(str(INDEX_F))
 else:
@ -97,8 +122,7 @@ else:
 idx.add(emb)
 faiss.write_index(idx, str(INDEX_F))

-# ------------------------ enregistrer le nouveau méta ----------------------
+# ------------------------ save meta ---------------------------------------
 all_meta = kept_meta + new_meta
 json.dump(all_meta, META_F.open("w"), ensure_ascii=False, indent=2)
-
-print(f"Index mis à jour ✔︎  |  Total passages : {idx.ntotal}")
+print(f"Index mis à jour ✔︎ | Total passages : {idx.ntotal}")