Code/index.py
2025-05-18 18:14:41 +02:00

64 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Index the Fiches directory with BGE-M3 (FlagEmbedding) + FAISS.
Splits the Markdown documents into overlapping chunks targeting about 800
characters, computes their embeddings, and writes corpus.idx and
corpus.meta.json to the current working directory.
"""
from pathlib import Path
import json
import re
import faiss
from FlagEmbedding import BGEM3FlagModel
# --- Parameters -------------------------------------------------------------
ROOT = Path("/app/Fiches")  # mounted folder containing the .md notes
MODEL_NAME = "BAAI/bge-m3"  # multilingual embedding model, MIT license
CHUNK = 800  # target chunk size in characters (soft limit, see split())
OVERLAP = 100  # max characters of trailing context repeated in the next chunk
INDEX_FILE = "corpus.idx"
META_FILE = "corpus.meta.json"

# --- Utility functions ------------------------------------------------------
# A sentence boundary is the whitespace that follows '.', '!' or '?'.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")


def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
    """Split *text* into sentence-aligned chunks with a bounded overlap.

    Sentences are accumulated until the joined buffer is longer than
    ``chunk_size`` characters; the buffer is then emitted as one chunk.  The
    next chunk starts with the trailing sentences of the previous one, as
    many as fit in ``overlap`` characters.

    Both sizes are measured in characters.  ``chunk_size`` is a soft limit:
    sentences are never cut, so a chunk can exceed it by up to one sentence,
    and a single sentence longer than ``chunk_size`` becomes its own chunk.

    Args:
        text: Raw document text.
        chunk_size: Character length above which the buffer is flushed.
        overlap: Maximum number of characters carried into the next chunk.
            Values <= 0 disable the overlap.

    Returns:
        A list of chunk strings; empty when *text* contains no non-blank
        sentence.
    """
    chunks = []
    buf = []
    # True while ``buf`` holds at least one sentence not yet emitted; avoids
    # emitting a final chunk made only of already-emitted overlap.
    has_new = False

    for sentence in _SENTENCE_BOUNDARY.split(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        buf.append(sentence)
        has_new = True

        joined = " ".join(buf)
        if len(joined) <= chunk_size:
            continue
        chunks.append(joined)

        # Build the overlap from the end of the buffer.  The first sentence
        # is always dropped so every flush makes progress, even when
        # ``overlap`` is larger than ``chunk_size``.
        tail = []
        tail_len = 0
        for prev in reversed(buf[1:]):
            grown = tail_len + len(prev) + (1 if tail else 0)
            if grown > overlap:
                break
            tail.append(prev)
            tail_len = grown
        tail.reverse()
        buf = tail
        has_new = False

    if has_new:
        chunks.append(" ".join(buf))
    return chunks
# --- Main pipeline ----------------------------------------------------------
def main():
    """Build the FAISS index and its metadata file from the notes in ROOT.

    Steps:
      1. Read every ``*.md`` file under ROOT (recursively) and split it into
         chunks with ``split``.
      2. Embed all chunks with the BGE-M3 model.
      3. Store the vectors in an inner-product FAISS index written to
         INDEX_FILE, and write the matching list of
         ``{"file": <path relative to ROOT>, "part": <chunk number>}``
         entries to META_FILE.  Entry *k* of the metadata describes vector
         *k* of the index.

    Raises:
        SystemExit: if no chunk could be produced (nothing to index).
    """
    docs, meta = [], []
    # Sorted so that vector ids are reproducible from one run to the next.
    for fp in sorted(ROOT.rglob("*.md")):
        text = fp.read_text(encoding="utf-8", errors="ignore")
        rel_path = fp.relative_to(ROOT).as_posix()
        for i, chunk in enumerate(split(text)):
            docs.append(chunk)
            meta.append({"file": rel_path, "part": i})

    if not docs:
        # Nothing to embed: stop before loading the model or writing files.
        raise SystemExit(f"Aucun passage trouvé dans {ROOT} : rien à indexer.")

    print(f"Découpé {len(docs)} passages, génération des embeddings…")

    # Normalisation is a model-level option of BGEM3FlagModel, not an
    # argument of encode().  Unit-length vectors make the inner product used
    # by IndexFlatIP equal to cosine similarity.
    # NOTE(review): recent FlagEmbedding releases appear to name the device
    # option `devices` — confirm against the installed version.
    model = BGEM3FlagModel(MODEL_NAME, device="cpu", normalize_embeddings=True)

    # encode() returns a dict of outputs; the dense vectors are under
    # "dense_vecs".
    emb = model.encode(docs, batch_size=64)["dense_vecs"]

    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb.astype("float32"))  # FAISS only accepts float32
    faiss.write_index(index, INDEX_FILE)

    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    print(f"Index écrit dans {INDEX_FILE} avec {len(docs)} vecteurs.")


if __name__ == "__main__":
    main()