Code/index.py
2025-05-18 18:21:00 +02:00

79 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Indexation du répertoire Fiches avec BGEM3 (FlagEmbedding) + FAISS.
Parcourt récursivement tous les fichiers markdown / texte (extensions .md, .MD, .markdown, .txt),
découpe en blocs de ~800 tokens, génère les embeddings, et écrit corpus.idx + corpus.meta.json.
"""
from pathlib import Path
import json
import re
import faiss
import numpy as np
from FlagEmbedding import BGEM3FlagModel
# --- Parameters --------------------------------------------------------------
ROOT = Path("/app/Fiches")  # mounted directory containing the notes to index
MODEL_NAME = "BAAI/bge-m3"  # multilingual embedding model, MIT licence
CHUNK = 800  # default chunk size for split(), in whitespace-separated words
OVERLAP = 100  # default number of words repeated between consecutive chunks
INDEX_FILE = "corpus.idx"  # FAISS index written by main()
META_FILE = "corpus.meta.json"  # per-vector metadata written by main()
EXTENSIONS = ["*.md", "*.MD", "*.markdown", "*.txt"]

# --- Utility functions -------------------------------------------------------
# A sentence ends at ".", "!" or "?" followed by whitespace.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")


def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP) -> list[str]:
    """Split *text* into overlapping chunks of at most *chunk_size* words.

    Chunks are closed on sentence boundaries whenever possible; a sentence
    that cannot fit in a chunk is cut at a word boundary instead. Each chunk
    after the first starts with the last *overlap* words of the previous one.
    Runs of whitespace (including newlines) are collapsed to single spaces.

    Args:
        text: The raw document text.
        chunk_size: Maximum number of whitespace-separated words per chunk.
        overlap: Number of trailing words repeated at the start of the next
            chunk. Clamped to the range ``0 .. chunk_size - 1`` so that every
            chunk contributes at least one new word.

    Returns:
        The list of chunks, in document order. Empty or whitespace-only
        input yields an empty list.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    overlap = max(0, min(overlap, chunk_size - 1))

    chunks = []
    window = []  # words of the chunk being built (carry-over + new words)
    fresh = 0  # words in `window` that no emitted chunk contains yet

    def flush():
        """Emit the current window and keep only its overlapping tail."""
        nonlocal window, fresh
        chunks.append(" ".join(window))
        # window[-0:] would keep everything, hence the explicit zero case.
        window = window[-overlap:] if overlap else []
        fresh = 0

    for sentence in _SENTENCE_BOUNDARY.split(text):
        words = sentence.split()
        # Prefer to break between sentences: close the chunk first when the
        # whole sentence would not fit into what is left of it.
        if fresh and len(window) + len(words) > chunk_size:
            flush()
        for word in words:
            # Only reached by sentences too long for a single chunk.
            if len(window) >= chunk_size:
                flush()
            window.append(word)
            fresh += 1

    # Emit the tail only if it holds words not already covered by a chunk.
    if fresh:
        flush()
    return chunks
# --- Pipeline principal -----------------------------------------------------
def gather_files(root: Path, patterns=None):
    """Yield each regular file under *root* that matches a glob pattern.

    The tree is searched recursively, one pattern after the other. Within a
    pattern the matches are yielded in sorted path order, so repeated runs
    over the same tree produce the same sequence.

    Args:
        root: Directory to search.
        patterns: Iterable of glob patterns such as ``"*.md"``. Defaults to
            the module-level ``EXTENSIONS`` list.

    Yields:
        ``Path`` objects of matching regular files, each one at most once.
    """
    if patterns is None:
        patterns = EXTENSIONS
    seen = set()
    for pattern in patterns:
        for path in sorted(root.rglob(pattern)):
            # rglob also returns directories whose name matches (e.g. a
            # folder called "notes.md"), and overlapping patterns can match
            # the same file more than once; neither must reach the caller.
            if path in seen or not path.is_file():
                continue
            seen.add(path)
            yield path
def main():
    """Build the FAISS index and its metadata file from the files under ROOT.

    Reads every file returned by ``gather_files(ROOT)``, cuts it into
    passages with ``split``, embeds all passages with the BGE-M3 model,
    L2-normalises the vectors and stores them in an inner-product index
    (``INDEX_FILE``). A JSON list with one ``{"file", "part"}`` entry per
    vector, in the same order as the index, is written to ``META_FILE``.

    Raises:
        SystemExit: If no passage was produced.
    """
    docs, meta = [], []
    for fp in gather_files(ROOT):
        # Undecodable bytes are dropped rather than aborting the whole run.
        text = fp.read_text(encoding="utf-8", errors="ignore")
        rel_path = fp.relative_to(ROOT).as_posix()
        for i, chunk in enumerate(split(text)):
            docs.append(chunk)
            meta.append({"file": rel_path, "part": i})
    if not docs:
        raise SystemExit("Aucun fichier trouvé dans /app/Fiches. Vérifiez le montage ou les extensions.")
    print(f"Découpé {len(docs)} passages, génération des embeddings…")

    model = BGEM3FlagModel(MODEL_NAME, device="cpu")
    encoded = model.encode(docs, batch_size=64)
    # BGEM3FlagModel.encode returns a dict of outputs with the dense vectors
    # under "dense_vecs"; a bare array is accepted as well so that a
    # different embedder returning vectors directly keeps working.
    if isinstance(encoded, dict):
        encoded = encoded["dense_vecs"]
    emb = np.asarray(encoded, dtype="float32")  # FAISS expects float32

    # Unit-length rows make the inner product equal to cosine similarity.
    # The floor on the norm avoids a division by zero for all-zero vectors.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    emb = emb / np.maximum(norms, 1e-12)

    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, INDEX_FILE)
    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    print(f"Index écrit dans {INDEX_FILE} avec {len(docs)} vecteurs.")
# Run the indexing pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()