From 4f61b37db14ecdbd9e4923b05f9befd65ce556b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phan?= <stephan-pro@peccini.fr>
Date: Mon, 19 May 2025 07:55:12 +0200
Subject: [PATCH] rag.py: rebuild per-chunk passages from Fiches/ and rank 'Seuil' hits first

---
 rag.py | 43 +++++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 12 deletions(-)
diff --git a/rag.py b/rag.py
index ed0cb66..7ab2bc0 100644
--- a/rag.py
+++ b/rag.py
@@ -16,27 +16,44 @@ import faiss, numpy as np, requests
 from FlagEmbedding import BGEM3FlagModel
 from rich import print
 
-ROOT = Path("Corpus")                # dossier racine des fiches (comme dans index.py)
+ROOT = Path("Fiches")                # dossier racine des fiches
 K    = 30                               # nombre de passages remis au LLM
 
+# ------------------ chunking helpers identical to the index -------------
+CHUNK, OVERLAP = 800, 100              # keep consistent with index.py
+
+def split(text: str):  # sentence-based chunker; returns a list of chunk strings
+    sents = re.split(r"(?<=[.!?]) +", text)  # cut after ., ! or ? followed by one or more spaces
+    buf, out = [], []  # buf: sentences of the chunk being built; out: finished chunks
+    for s in sents:
+        buf.append(s)
+        if len(" ".join(buf).split()) > CHUNK:  # CHUNK is compared against a whitespace-separated word count
+            out.append(" ".join(buf))
+            buf = buf[-OVERLAP:]  # NOTE(review): keeps the last OVERLAP *sentences* (buf is a sentence list), a no-op while buf has <= OVERLAP items — confirm index.py does the same
+    if buf:  # flush whatever is left as the final chunk
+        out.append(" ".join(buf))
+    return out
+
 # ------------------- charger meta et reconstruire passages ------------------
 meta_path = Path("corpus.meta.json")
 if not meta_path.exists():
     raise SystemExit("corpus.meta.json introuvable – lancez d'abord index.py")
 meta = json.load(meta_path.open())
 
+# lookup table: (file name, chunk index) -> chunk text, rebuilt from disk
+cache: dict[tuple[str, int], str] = {}
+for fp in sorted(ROOT.rglob("*")):
+    # NOTE(review): keyed by bare file name, so same-named files in different subfolders overwrite each other
+    if fp.suffix.lower() in {".md", ".markdown", ".txt"}:
+        chunks = split(fp.read_text(encoding="utf-8"))
+        # chunk position within the file is the "part" number looked up from meta
+        cache.update(((fp.name, i), ch) for i, ch in enumerate(chunks))
+
 # reconstruire docs dans le même ordre que l'index ---------------------------
 docs = []
 for m in meta:
-    filepath = ROOT / m["path"]
-    try:
-        if filepath.exists() and filepath.suffix.lower() in {".md", ".markdown", ".txt"}:
-            docs.append(filepath.read_text(encoding="utf-8"))
-        else:
-            docs.append(f"[passage manquant: {m['path']}]")
-    except Exception as e:
-        print(f"[dim]Erreur lecture {m['path']}: {e}[/]")
-        docs.append(f"[erreur lecture: {m['path']}]")
+    key = (m["file"], m["part"])  # same (file name, chunk index) shape as the cache keys
+    docs.append(cache.get(key, "[passage manquant]"))  # placeholder keeps one docs entry per meta entry when a chunk is not found
 
 print(f"[dim]Passages rechargés : {len(docs)} (ordre conforme à l'index).[/]")
 
@@ -63,7 +80,9 @@ try:
 
         D, I = idx.search(q_emb.astype("float32").reshape(1, -1), K)
         hits = I[0]
-        # contexte des passages trouvés
+        # drop FAISS's -1 padding (returned when fewer than K neighbours exist), then stable-sort passages containing “Seuil” first
+        hits = sorted((i for i in hits if i >= 0), key=lambda i: "Seuil" not in docs[int(i)])
+
         context = "\n\n".join(docs[int(i)] for i in hits[:K])
         prompt = (
             "<system>Réponds en français, de façon précise, et uniquement à partir du contexte fourni. Si l'information n'est pas dans le contexte, réponds : 'Je ne sais pas'.</system>\n"
@@ -85,6 +104,6 @@ try:
         print("\n[dim]--- contexte utilisé ---[/]")
         for rank, idx_id in enumerate(hits, 1):
             m = meta[int(idx_id)]
-            print(f"[{rank}] {m['path']} → {docs[int(idx_id)][:120]}…")
+            print(f"[{rank}] {m['file']} · part {m['part']} → {docs[int(idx_id)][:120]}…")  # NOTE(review): print is rich.print, so bracketed passage text (e.g. "[passage manquant]") is presumably parsed as markup — consider rich.markup.escape
 except Exception as e:
     print("[red]Erreur :", e)