From a3608353a224cf256bdc0e3bbfae1117533524cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phan?= <stephan-pro@peccini.fr>
Date: Mon, 19 May 2025 06:45:46 +0200
Subject: [PATCH] Improve text chunking to preserve Markdown tables

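Enhance the split function to detect and preserve Markdown tables when
chunking text. Any segment that contains '|' or consists only of dashes
is emitted as its own chunk, forcing a split before and after the table
content so that it is not merged with the surrounding text.

The chunk-size check now counts words instead of characters, so
chunk_size is a word budget, as the docstring states.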

Also increase K value from 10 to 30 in rag.py to provide more
passages to the LLM.
---
 index.py | 24 +++++++++++++++++++++---
 rag.py   |  2 +-
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/index.py b/index.py
index d21e053..e7e7839 100644
--- a/index.py
+++ b/index.py
@@ -35,16 +35,34 @@ BATCH = 256                                   # plus grand batch : encode plus
 # --- Fonctions utilitaires --------------------------------------------------
 
 def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
-    """Découpe un texte en morceaux de chunk_size mots avec overlap mots."""
+    """Découpe *text* en chunks (~chunk_size mots) tout en
+    préservant entièrement les tableaux Markdown.
+
+    • Si une ligne contient ‘|’ ou n’est constituée que de tirets (---),
+      on force la coupure avant / après pour ne pas casser le tableau.
+    • Le reste est découpé sur la ponctuation (. ! ?) avec overlap.
+    """
     sentences = re.split(r"(?<=[\.!?])\s+", text)
     chunks, buf = [], []
+
     for s in sentences:
+        # ---- table Markdown ------------------------------------------------
+        if "|" in s or re.fullmatch(r"\s*-{3,}\s*", s):
+            if buf:                       # vider le buffer courant
+                chunks.append(" ".join(buf))
+                buf = []
+            chunks.append(s)              # garder le tableau entier
+            continue
+
+        # ---- traitement normal --------------------------------------------
         buf.append(s)
-        if len(" ".join(buf)) > chunk_size:
+        if len(" ".join(buf).split()) >= chunk_size:
             chunks.append(" ".join(buf))
-            buf = buf[-overlap:]
+            buf = buf[-overlap:]          # chevauchement
+
     if buf:
         chunks.append(" ".join(buf))
+
     return chunks
 
 
diff --git a/rag.py b/rag.py
index 21a57ba..7ab2bc0 100644
--- a/rag.py
+++ b/rag.py
@@ -17,7 +17,7 @@ from FlagEmbedding import BGEM3FlagModel
 from rich import print
 
 ROOT = Path("Fiches")                # dossier racine des fiches
-K    = 10                               # nombre de passages remis au LLM
+K    = 30                               # nombre de passages remis au LLM
 
 # ------------------ utilitaires de découpe identiques à l'index -------------
 CHUNK, OVERLAP = 800, 100              # garder cohérent avec index.py