#!/usr/bin/env python3
"""
Index the Fiches directory with BGE-M3 (FlagEmbedding) + FAISS.

Splits the markdown documents into chunks of roughly CHUNK characters,
generates dense embeddings, and writes corpus.idx and corpus.meta.json
at the working directory root.
"""
from pathlib import Path
import json
import re

# --- Parameters -------------------------------------------------------------
ROOT = Path("/app/Fiches")      # mounted folder containing the .md fiches
MODEL_NAME = "BAAI/bge-m3"      # multilingual embedding model, MIT license
CHUNK = 800                     # target chunk size in CHARACTERS (the code
                                # measures len(" ".join(buf)), not tokens)
OVERLAP = 100                   # overlap in WORDS carried into the next chunk
INDEX_FILE = "corpus.idx"
META_FILE = "corpus.meta.json"

# --- Helpers ----------------------------------------------------------------


def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP) -> list[str]:
    """Split *text* into chunks of ~chunk_size characters on sentence boundaries.

    The last *overlap* words of each emitted chunk are carried over into the
    next one for context continuity.

    Note: the previous version sliced the sentence buffer (``buf[-overlap:]``)
    which, with overlap=100 *sentences*, kept virtually everything — every new
    chunk re-contained all prior text. The overlap is now word-based.
    """
    if not text.strip():
        return []
    sentences = re.split(r"(?<=[\.\!\?])\s+", text)
    chunks: list[str] = []
    buf: list[str] = []
    has_new = False  # True when buf holds content beyond the carried-over tail
    for sentence in sentences:
        buf.append(sentence)
        has_new = True
        joined = " ".join(buf)
        if len(joined) > chunk_size:
            chunks.append(joined)
            # Seed the next chunk with the last `overlap` words only.
            tail = joined.split()[-overlap:]
            buf = [" ".join(tail)] if tail else []
            has_new = False
    # Flush the remainder, but don't emit a chunk that is pure overlap.
    if buf and has_new:
        chunks.append(" ".join(buf))
    return chunks


# --- Main pipeline -----------------------------------------------------------


def main() -> None:
    """Chunk every .md under ROOT, embed the chunks, write FAISS index + metadata."""
    # Heavy third-party deps imported here so the module stays importable
    # (e.g. for testing split()) without faiss/FlagEmbedding installed.
    import faiss
    from FlagEmbedding import BGEM3FlagModel

    docs: list[str] = []
    meta: list[dict] = []
    # sorted() makes chunk ordering (and thus vector ids) deterministic.
    for fp in sorted(ROOT.rglob("*.md")):
        text = fp.read_text(encoding="utf-8", errors="ignore")
        for i, chunk in enumerate(split(text)):
            docs.append(chunk)
            meta.append({"file": fp.relative_to(ROOT).as_posix(), "part": i})

    if not docs:
        # Guard: emb.shape below would fail on an empty corpus.
        print("Aucun passage trouvé — rien à indexer.")
        return

    print(f"Découpé {len(docs)} passages, génération des embeddings…")

    # BGEM3FlagModel normalizes dense vectors by default (constructor arg
    # normalize_embeddings=True); encode() does NOT accept that keyword and
    # returns a dict whose "dense_vecs" entry holds the embedding matrix —
    # the previous code passed the dict straight to FAISS, which would fail
    # on emb.shape. TODO(review): confirm against the installed FlagEmbedding
    # version's API.
    model = BGEM3FlagModel(MODEL_NAME, device="cpu")
    output = model.encode(docs, batch_size=64)
    emb = output["dense_vecs"]

    # Inner product on normalized vectors == cosine similarity.
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb.astype("float32"))  # FAISS requires float32
    faiss.write_index(index, INDEX_FILE)

    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    print(f"Index écrit dans {INDEX_FILE} avec {len(docs)} vecteurs.")


if __name__ == "__main__":
    main()