Update index.py

This commit is contained in:
Stéphan Peccini 2025-05-18 18:14:41 +02:00
parent cc54af71e6
commit 58f7cbf669

View File

@@ -1,38 +1,63 @@
#!/usr/bin/env python3
"""Index the Fiches directory with BGE-M3 (FlagEmbedding) + FAISS.

Splits the markdown documents into chunks of ~800 tokens, generates the
embeddings, and writes corpus.idx and corpus.meta.json at the root.
Run inside a venv or a python:3.11-slim container.
"""
from pathlib import Path
import json
import re

import faiss
from FlagEmbedding import BGEM3FlagModel
MODEL_NAME = "WhereIsAI/bge-base-fr" # --- Paramètres -------------------------------------------------------------
CHUNK = 800 # tokens environ ~600 mots ROOT = Path("/app/Fiches") # dossier monté contenant les fiches .md
OVERLAP = 100 # lissage MODEL_NAME = "BAAI/bge-m3" # embedding multilingue, licence MIT
CHUNK = 800 # taille cible (≈600 mots)
OVERLAP = 100 # chevauchement pour la cohésion
INDEX_FILE = "corpus.idx"
META_FILE = "corpus.meta.json"
def split(text): # --- Fonctions utilitaires --------------------------------------------------
# coupe proprement sur phrase/ponctuation
sentences = re.split(r'(?<=[\.\!\?]) +', text) def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
"""Découpe un texte en morceaux de chunk_size mots avec overlap mots de recouvrement."""
sentences = re.split(r"(?<=[\.\!\?])\s+", text)
chunks, buf = [], [] chunks, buf = [], []
for s in sentences: for s in sentences:
buf.append(s) buf.append(s)
if len(" ".join(buf)) > CHUNK: if len(" ".join(buf)) > chunk_size:
chunks.append(" ".join(buf))
buf = buf[-overlap:]
if buf:
chunks.append(" ".join(buf)) chunks.append(" ".join(buf))
buf = buf[-OVERLAP:]
if buf: chunks.append(" ".join(buf))
return chunks return chunks
# --- Main pipeline --------------------------------------------------------------


def main() -> None:
    """Walk ROOT, chunk every markdown file, embed and write the FAISS index.

    Writes INDEX_FILE (FAISS inner-product index over normalized embeddings)
    and META_FILE (JSON list of {"file", "part"} aligned with the vectors).
    """
    docs, meta = [], []
    for fp in ROOT.rglob("*.md"):
        text = fp.read_text(encoding="utf-8", errors="ignore")
        for i, chunk in enumerate(split(text)):
            docs.append(chunk)
            # Store the path relative to ROOT so the index is relocatable.
            meta.append({"file": fp.relative_to(ROOT).as_posix(), "part": i})

    print(f"Découpé {len(docs)} passages, génération des embeddings…")
    model = BGEM3FlagModel(MODEL_NAME, device="cpu")
    out = model.encode(docs, batch_size=64, normalize_embeddings=True)
    # BGEM3FlagModel.encode returns a dict ({"dense_vecs": ndarray, ...}),
    # not a bare array; using the result directly would fail on .shape.
    # NOTE(review): confirm against the installed FlagEmbedding version.
    emb = out["dense_vecs"] if isinstance(out, dict) else out

    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb.astype("float32"))  # FAISS requires float32
    faiss.write_index(index, INDEX_FILE)

    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    print(f"Index écrit dans {INDEX_FILE} avec {len(docs)} vecteurs.")


if __name__ == "__main__":
    main()