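Update index.py: simplify the sentence-split regex and L2-normalize embeddings manually with NumPy before adding them to the FAISS inner-product index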

This commit is contained in:
Stéphan Peccini 2025-05-18 18:18:52 +02:00
parent 96a083fd72
commit 8f8e041c6b

View File

@ -9,6 +9,7 @@ from pathlib import Path
import json
import re
import faiss
import numpy as np
from FlagEmbedding import BGEM3FlagModel
# --- Paramètres -------------------------------------------------------------
@ -23,7 +24,7 @@ META_FILE = "corpus.meta.json"
def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
"""Découpe un texte en morceaux de chunk_size mots avec overlap mots de recouvrement."""
sentences = re.split(r"(?<=[\.\!\?])\s+", text)
sentences = re.split(r"(?<=[\.!?])\s+", text)
chunks, buf = [], []
for s in sentences:
buf.append(s)
@ -47,10 +48,15 @@ def main():
print(f"Découpé {len(docs)} passages, génération des embeddings…")
model = BGEM3FlagModel(MODEL_NAME, device="cpu")
emb = model.encode(docs, batch_size=64, normalize=True)
emb = model.encode(docs, batch_size=64) # pas de normalisation interne
# Normalisation manuelle (cosine)
emb = emb.astype("float32")
norms = np.linalg.norm(emb, axis=1, keepdims=True)
emb = emb / np.maximum(norms, 1e-12)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb.astype("float32"))
index.add(emb)
faiss.write_index(index, INDEX_FILE)
with open(META_FILE, "w", encoding="utf-8") as f: