#!/usr/bin/env python3
"""
Index the Fiches directory with BGE-M3 (FlagEmbedding) + FAISS.

Recursively walks every markdown / text file (extensions .md, .MD, .markdown, .txt),
splits each file into overlapping chunks sized by the CHUNK and OVERLAP settings,
generates the embeddings, and writes corpus.idx + corpus.meta.json.
"""

import json
import re
from pathlib import Path
from typing import Iterator

import faiss
import numpy as np
from FlagEmbedding import BGEM3FlagModel

# --- Parameters --------------------------------------------------------------
ROOT = Path("/app/Fiches")  # mounted directory that holds the notes to index
MODEL_NAME = "BAAI/bge-m3"  # multilingual embedding model, MIT licence
CHUNK = 800  # target chunk size, counted in whitespace-separated words
OVERLAP = 100  # words repeated at the start of the next chunk for continuity
INDEX_FILE = "corpus.idx"
META_FILE = "corpus.meta.json"
EXTENSIONS = ["*.md", "*.MD", "*.markdown", "*.txt"]

# --- Helper functions --------------------------------------------------------

# A sentence ends at '.', '!' or '?' followed by whitespace.
_SENTENCE_BREAK = re.compile(r"(?<=[.!?])\s+")


def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
    """Split *text* into chunks of at most *chunk_size* words.

    Consecutive chunks share up to *overlap* words. Chunks are closed on a
    sentence boundary whenever the next sentence fits in a fresh chunk; a
    sentence too long for that is cut on word boundaries instead.

    Args:
        text: Raw text to split. Runs of whitespace (including newlines) are
            collapsed to single spaces in the returned chunks.
        chunk_size: Maximum number of words per chunk; must be >= 1.
        overlap: Number of trailing words of a chunk repeated at the start of
            the next one. Clamped to the range [0, chunk_size - 1] so that
            every chunk always contributes at least one new word.

    Returns:
        A list of chunk strings, empty when *text* contains no words.

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    overlap = max(0, min(overlap, chunk_size - 1))
    room_after_flush = chunk_size - overlap

    chunks = []
    window = []  # words of the chunk currently being assembled
    carried = 0  # leading words of `window` copied from the previous chunk

    def flush():
        """Emit the current window and restart it with the overlap words."""
        nonlocal window, carried
        chunks.append(" ".join(window))
        # `window[-0:]` would keep everything, hence the explicit zero case.
        window = window[-overlap:] if overlap else []
        carried = len(window)

    for sentence in _SENTENCE_BREAK.split(text):
        words = sentence.split()
        has_new_words = len(window) > carried
        overflows = len(window) + len(words) > chunk_size
        # Break before the sentence only when that keeps it in one piece;
        # otherwise keep filling the window and let the word loop cut it.
        if has_new_words and overflows and len(words) <= room_after_flush:
            flush()
        for word in words:
            window.append(word)
            if len(window) >= chunk_size:
                flush()

    # A window holding only carried-over words was already emitted in full.
    if len(window) > carried:
        chunks.append(" ".join(window))
    return chunks


# --- Main pipeline -----------------------------------------------------------


def gather_files(root: Path) -> Iterator[Path]:
    """Yield each regular file under *root* that matches one of EXTENSIONS.

    Patterns are tried in the order they appear in EXTENSIONS, and matches for
    one pattern are yielded in sorted order so repeated runs index files in the
    same sequence.

    Args:
        root: Directory to search recursively.

    Yields:
        Paths of matching files, each one at most once.
    """
    seen = set()
    for pattern in EXTENSIONS:
        for path in sorted(root.rglob(pattern)):
            # Skip directories whose name happens to match a pattern, and
            # files already produced by an earlier pattern (on a
            # case-insensitive filesystem "*.md" and "*.MD" match the same file).
            if path in seen or not path.is_file():
                continue
            seen.add(path)
            yield path


def main():
    """Build the FAISS index and its metadata file from the files under ROOT.

    Reads every file returned by gather_files(ROOT), splits it into passages,
    embeds the passages with the BGE-M3 model, L2-normalises the vectors, and
    writes an inner-product FAISS index to INDEX_FILE. A JSON list with one
    ``{"file", "part"}`` entry per vector, in the same order as the vectors,
    is written to META_FILE.

    Raises:
        SystemExit: If no passage could be collected.
    """
    docs, meta = [], []

    for fp in gather_files(ROOT):
        # Undecodable bytes are dropped on purpose so one bad file cannot
        # abort the whole run.
        text = fp.read_text(encoding="utf-8", errors="ignore")
        rel_path = fp.relative_to(ROOT).as_posix()
        # Blank passages carry no content worth a vector.
        passages = [chunk for chunk in split(text) if chunk.strip()]
        for i, chunk in enumerate(passages):
            docs.append(chunk)
            meta.append({"file": rel_path, "part": i})

    if not docs:
        raise SystemExit("Aucun fichier trouvé dans /app/Fiches. Vérifiez le montage ou les extensions.")

    print(f"Découpé {len(docs)} passages, génération des embeddings…")

    model = BGEM3FlagModel(MODEL_NAME, device="cpu")
    encoded = model.encode(docs, batch_size=64)
    # BGEM3FlagModel.encode returns a dict of output types; the dense
    # embeddings live under "dense_vecs". A bare array is accepted as well.
    dense = encoded["dense_vecs"] if isinstance(encoded, dict) else encoded

    # FAISS expects a C-contiguous float32 matrix.
    emb = np.ascontiguousarray(dense, dtype="float32")

    # Normalise explicitly so the inner product below is a cosine similarity,
    # whatever normalisation the model applied itself. The floor avoids a
    # division by zero on an all-zero vector.
    norms = np.linalg.norm(emb, axis=1, keepdims=True)
    emb = emb / np.maximum(norms, 1e-12)

    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, INDEX_FILE)

    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    print(f"Index écrit dans {INDEX_FILE} avec {len(docs)} vecteurs.")


if __name__ == "__main__":
    main()