# index.py (run in a venv or a python:3.11-slim container)
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss, json, re

MODEL_NAME = "WhereIsAI/bge-base-fr"
CHUNK = 800    # target chunk size in characters (len() below counts characters, not tokens)
OVERLAP = 100  # characters of trailing context carried into the next chunk

def split(text):
    # Split cleanly on sentence-ending punctuation, then pack sentences into chunks.
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, buf = [], []
    for s in sentences:
        buf.append(s)
        if len(" ".join(buf)) > CHUNK:
            chunks.append(" ".join(buf))
            # Keep only the trailing sentences, up to OVERLAP characters,
            # so consecutive chunks share a little context.
            tail, size = [], 0
            for prev in reversed(buf):
                size += len(prev) + 1
                if size > OVERLAP:
                    break
                tail.append(prev)
            buf = tail[::-1]
    if buf:
        chunks.append(" ".join(buf))
    return chunks

docs, meta = [], []
for fp in Path("/app/Fiches").rglob("*.md"):
    txt = fp.read_text(encoding="utf-8")
    for i, chunk in enumerate(split(txt)):
        docs.append(chunk)
        meta.append({"file": fp.name, "part": i})

model = SentenceTransformer(MODEL_NAME, device="cpu")
emb = model.encode(docs, batch_size=64, show_progress_bar=True,
                   normalize_embeddings=True)  # unit vectors: inner product == cosine

index = faiss.IndexFlatIP(emb.shape[1])  # exact (brute-force) inner-product index
index.add(emb)
faiss.write_index(index, "corpus.idx")
with open("corpus.meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False)
print(f"Indexed {len(docs)} passages.")
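
# --- Usage sketch (an assumption, not part of the original script): a
# --- minimal example of querying the artifacts written above, e.g. from
# --- a separate query.py. The example query string and k=5 are hypothetical;
# --- the model must be the same one used at indexing time.
import faiss, json
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("WhereIsAI/bge-base-fr", device="cpu")
index = faiss.read_index("corpus.idx")
with open("corpus.meta.json", encoding="utf-8") as f:
    meta = json.load(f)

def search(query, k=5):
    # Normalized query against IndexFlatIP: returned scores are cosine similarities.
    q = model.encode([query], normalize_embeddings=True)
    scores, ids = index.search(q, k)
    return [(meta[i]["file"], meta[i]["part"], float(s))
            for s, i in zip(scores[0], ids[0])]

print(search("Comment configurer l'environnement ?"))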