From 8f8e041c6bae819e33c1a347aaf4ab1a8e940cd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phan?=
Date: Sun, 18 May 2025 18:18:52 +0200
Subject: [PATCH] Update index.py

---
 index.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/index.py b/index.py
index ceab073..0faab64 100644
--- a/index.py
+++ b/index.py
@@ -9,6 +9,7 @@ from pathlib import Path
 import json
 import re
 import faiss
+import numpy as np
 from FlagEmbedding import BGEM3FlagModel
 
 # --- Paramètres -------------------------------------------------------------
@@ -23,7 +24,7 @@ META_FILE = "corpus.meta.json"
 
 def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
     """Découpe un texte en morceaux de chunk_size mots avec overlap mots de recouvrement."""
-    sentences = re.split(r"(?<=[\.\!\?])\s+", text)
+    sentences = re.split(r"(?<=[\.!?])\s+", text)
     chunks, buf = [], []
     for s in sentences:
         buf.append(s)
@@ -47,10 +48,15 @@ def main():
 
     print(f"Découpé {len(docs)} passages, génération des embeddings…")
     model = BGEM3FlagModel(MODEL_NAME, device="cpu")
-    emb = model.encode(docs, batch_size=64, normalize=True)
+    emb = model.encode(docs, batch_size=64)  # pas de normalisation interne
+
+    # Normalisation manuelle (cosine)
+    emb = emb.astype("float32")
+    norms = np.linalg.norm(emb, axis=1, keepdims=True)
+    emb = emb / np.maximum(norms, 1e-12)
 
     index = faiss.IndexFlatIP(emb.shape[1])
-    index.add(emb.astype("float32"))
+    index.add(emb)
     faiss.write_index(index, INDEX_FILE)
 
     with open(META_FILE, "w", encoding="utf-8") as f: