Update index.py
This commit is contained in:
parent
96a083fd72
commit
8f8e041c6b
12
index.py
12
index.py
@ -9,6 +9,7 @@ from pathlib import Path
|
||||
import json
|
||||
import re
|
||||
import faiss
|
||||
import numpy as np
|
||||
from FlagEmbedding import BGEM3FlagModel
|
||||
|
||||
# --- Paramètres -------------------------------------------------------------
|
||||
@ -23,7 +24,7 @@ META_FILE = "corpus.meta.json"
|
||||
|
||||
def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
|
||||
"""Découpe un texte en morceaux de chunk_size mots avec overlap mots de recouvrement."""
|
||||
sentences = re.split(r"(?<=[\.\!\?])\s+", text)
|
||||
sentences = re.split(r"(?<=[\.!?])\s+", text)
|
||||
chunks, buf = [], []
|
||||
for s in sentences:
|
||||
buf.append(s)
|
||||
@ -47,10 +48,15 @@ def main():
|
||||
print(f"Découpé {len(docs)} passages, génération des embeddings…")
|
||||
|
||||
model = BGEM3FlagModel(MODEL_NAME, device="cpu")
|
||||
emb = model.encode(docs, batch_size=64, normalize=True)
|
||||
emb = model.encode(docs, batch_size=64) # pas de normalisation interne
|
||||
|
||||
# Normalisation manuelle (cosine)
|
||||
emb = emb.astype("float32")
|
||||
norms = np.linalg.norm(emb, axis=1, keepdims=True)
|
||||
emb = emb / np.maximum(norms, 1e-12)
|
||||
|
||||
index = faiss.IndexFlatIP(emb.shape[1])
|
||||
index.add(emb.astype("float32"))
|
||||
index.add(emb)
|
||||
faiss.write_index(index, INDEX_FILE)
|
||||
|
||||
with open(META_FILE, "w", encoding="utf-8") as f:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user