Improve text chunking to preserve Markdown tables
Enhance split function to detect and preserve Markdown tables when chunking text. Tables are now kept intact by forcing splits before and after table content. Also increase K value from 10 to 30 in rag.py to provide more passages to the LLM.
This commit is contained in:
parent
f8c630cdfe
commit
a3608353a2
26
index.py
26
index.py
@@ -35,16 +35,34 @@ BATCH = 256 # plus grand batch : encode plus
|
||||
# --- Fonctions utilitaires --------------------------------------------------
|
||||
|
||||
def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
|
||||
"""Découpe un texte en morceaux de chunk_size mots avec overlap mots."""
|
||||
"""Découpe *text* en chunks (~chunk_size mots) tout en
|
||||
préservant entièrement les tableaux Markdown.
|
||||
|
||||
• Si une ligne contient ‘|’ ou n’est constituée que de tirets (---),
|
||||
on force la coupure avant / après pour ne pas casser le tableau.
|
||||
• Le reste est découpé sur la ponctuation (. ! ?) avec overlap.
|
||||
"""
|
||||
sentences = re.split(r"(?<=[\.!?])\s+", text)
|
||||
chunks, buf = [], []
|
||||
|
||||
for s in sentences:
|
||||
buf.append(s)
|
||||
if len(" ".join(buf)) > chunk_size:
|
||||
# ---- table Markdown ------------------------------------------------
|
||||
if "|" in s or re.fullmatch(r"\s*-{3,}\s*", s):
|
||||
if buf: # vider le buffer courant
|
||||
chunks.append(" ".join(buf))
|
||||
buf = buf[-overlap:]
|
||||
buf = []
|
||||
chunks.append(s) # garder le tableau entier
|
||||
continue
|
||||
|
||||
# ---- traitement normal --------------------------------------------
|
||||
buf.append(s)
|
||||
if len(" ".join(buf).split()) >= chunk_size:
|
||||
chunks.append(" ".join(buf))
|
||||
buf = buf[-overlap:] # chevauchement
|
||||
|
||||
if buf:
|
||||
chunks.append(" ".join(buf))
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
|
||||
2
rag.py
2
rag.py
@@ -17,7 +17,7 @@ from FlagEmbedding import BGEM3FlagModel
|
||||
from rich import print
|
||||
|
||||
ROOT = Path("Fiches") # dossier racine des fiches
|
||||
K = 10 # nombre de passages remis au LLM
|
||||
K = 30 # nombre de passages remis au LLM
|
||||
|
||||
# ------------------ utilitaires de découpe identiques à l'index -------------
|
||||
CHUNK, OVERLAP = 800, 100 # garder cohérent avec index.py
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user