#!/usr/bin/env python3
"""
Index the Fiches directory with BGE-M3 (FlagEmbedding) + FAISS.

Splits the markdown documents into chunks of roughly CHUNK characters,
generates dense embeddings, and writes corpus.idx and corpus.meta.json
at the working directory root.
"""
from pathlib import Path
import json
import re

# --- Parameters -------------------------------------------------------------
ROOT = Path("/app/Fiches")      # mounted folder containing the .md fiches
MODEL_NAME = "BAAI/bge-m3"      # multilingual embedding model, MIT license
CHUNK = 800                     # target chunk size in CHARACTERS (the code
                                # measures len(" ".join(buf)), not tokens)
OVERLAP = 100                   # overlap in WORDS carried into the next chunk
INDEX_FILE = "corpus.idx"
META_FILE = "corpus.meta.json"

# --- Helpers ----------------------------------------------------------------


def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP) -> list[str]:
    """Split *text* into chunks of ~chunk_size characters on sentence boundaries.

    The last *overlap* words of each emitted chunk are carried over into the
    next one for context continuity.

    Note: the previous version sliced the sentence buffer (``buf[-overlap:]``)
    which, with overlap=100 *sentences*, kept virtually everything — every new
    chunk re-contained all prior text. The overlap is now word-based.
    """
    if not text.strip():
        return []
    sentences = re.split(r"(?<=[\.\!\?])\s+", text)
    chunks: list[str] = []
    buf: list[str] = []
    has_new = False  # True when buf holds content beyond the carried-over tail
    for sentence in sentences:
        buf.append(sentence)
        has_new = True
        joined = " ".join(buf)
        if len(joined) > chunk_size:
            chunks.append(joined)
            # Seed the next chunk with the last `overlap` words only.
            tail = joined.split()[-overlap:]
            buf = [" ".join(tail)] if tail else []
            has_new = False
    # Flush the remainder, but don't emit a chunk that is pure overlap.
    if buf and has_new:
        chunks.append(" ".join(buf))
    return chunks


# --- Main pipeline -----------------------------------------------------------


def main() -> None:
    """Chunk every .md under ROOT, embed the chunks, write FAISS index + metadata."""
    # Heavy third-party deps imported here so the module stays importable
    # (e.g. for testing split()) without faiss/FlagEmbedding installed.
    import faiss
    from FlagEmbedding import BGEM3FlagModel

    docs: list[str] = []
    meta: list[dict] = []
    # sorted() makes chunk ordering (and thus vector ids) deterministic.
    for fp in sorted(ROOT.rglob("*.md")):
        text = fp.read_text(encoding="utf-8", errors="ignore")
        for i, chunk in enumerate(split(text)):
            docs.append(chunk)
            meta.append({"file": fp.relative_to(ROOT).as_posix(), "part": i})

    if not docs:
        # Guard: emb.shape below would fail on an empty corpus.
        print("Aucun passage trouvé — rien à indexer.")
        return

    print(f"Découpé {len(docs)} passages, génération des embeddings…")

    # BGEM3FlagModel normalizes dense vectors by default (constructor arg
    # normalize_embeddings=True); encode() does NOT accept that keyword and
    # returns a dict whose "dense_vecs" entry holds the embedding matrix —
    # the previous code passed the dict straight to FAISS, which would fail
    # on emb.shape. TODO(review): confirm against the installed FlagEmbedding
    # version's API.
    model = BGEM3FlagModel(MODEL_NAME, device="cpu")
    output = model.encode(docs, batch_size=64)
    emb = output["dense_vecs"]

    # Inner product on normalized vectors == cosine similarity.
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb.astype("float32"))  # FAISS requires float32
    faiss.write_index(index, INDEX_FILE)

    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    print(f"Index écrit dans {INDEX_FILE} avec {len(docs)} vecteurs.")


if __name__ == "__main__":
    main()