Update index.py
This commit is contained in:
parent
96a083fd72
commit
8f8e041c6b
12
index.py
12
index.py
@ -9,6 +9,7 @@ from pathlib import Path
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import faiss
|
import faiss
|
||||||
|
import numpy as np
|
||||||
from FlagEmbedding import BGEM3FlagModel
|
from FlagEmbedding import BGEM3FlagModel
|
||||||
|
|
||||||
# --- Paramètres -------------------------------------------------------------
|
# --- Paramètres -------------------------------------------------------------
|
||||||
@ -23,7 +24,7 @@ META_FILE = "corpus.meta.json"
|
|||||||
|
|
||||||
def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
|
def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
|
||||||
"""Découpe un texte en morceaux de chunk_size mots avec overlap mots de recouvrement."""
|
"""Découpe un texte en morceaux de chunk_size mots avec overlap mots de recouvrement."""
|
||||||
sentences = re.split(r"(?<=[\.\!\?])\s+", text)
|
sentences = re.split(r"(?<=[\.!?])\s+", text)
|
||||||
chunks, buf = [], []
|
chunks, buf = [], []
|
||||||
for s in sentences:
|
for s in sentences:
|
||||||
buf.append(s)
|
buf.append(s)
|
||||||
@ -47,10 +48,15 @@ def main():
|
|||||||
print(f"Découpé {len(docs)} passages, génération des embeddings…")
|
print(f"Découpé {len(docs)} passages, génération des embeddings…")
|
||||||
|
|
||||||
model = BGEM3FlagModel(MODEL_NAME, device="cpu")
|
model = BGEM3FlagModel(MODEL_NAME, device="cpu")
|
||||||
emb = model.encode(docs, batch_size=64, normalize=True)
|
emb = model.encode(docs, batch_size=64) # pas de normalisation interne
|
||||||
|
|
||||||
|
# Normalisation manuelle (cosine)
|
||||||
|
emb = emb.astype("float32")
|
||||||
|
norms = np.linalg.norm(emb, axis=1, keepdims=True)
|
||||||
|
emb = emb / np.maximum(norms, 1e-12)
|
||||||
|
|
||||||
index = faiss.IndexFlatIP(emb.shape[1])
|
index = faiss.IndexFlatIP(emb.shape[1])
|
||||||
index.add(emb.astype("float32"))
|
index.add(emb)
|
||||||
faiss.write_index(index, INDEX_FILE)
|
faiss.write_index(index, INDEX_FILE)
|
||||||
|
|
||||||
with open(META_FILE, "w", encoding="utf-8") as f:
|
with open(META_FILE, "w", encoding="utf-8") as f:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user