# index.py (run inside a venv or a python:3.11-slim container)
import json
import re
from pathlib import Path

import faiss
from sentence_transformers import SentenceTransformer
MODEL_NAME = "WhereIsAI/bge-base-fr"
|
|
CHUNK = 800 # tokens environ ~600 mots
|
|
OVERLAP = 100 # lissage
|
|
|
|
def split(text):
|
|
# coupe proprement sur phrase/ponctuation
|
|
sentences = re.split(r'(?<=[\.\!\?]) +', text)
|
|
chunks, buf = [], []
|
|
for s in sentences:
|
|
buf.append(s)
|
|
if len(" ".join(buf)) > CHUNK:
|
|
chunks.append(" ".join(buf))
|
|
buf = buf[-OVERLAP:]
|
|
if buf: chunks.append(" ".join(buf))
|
|
return chunks
|
|
|
|
docs, meta = [], []
|
|
for fp in Path("/app/Fiches").rglob("*.md"):
|
|
txt = fp.read_text(encoding="utf-8")
|
|
for i, chunk in enumerate(split(txt)):
|
|
docs.append(chunk)
|
|
meta.append({"file": fp.name, "part": i})
|
|
|
|
model = SentenceTransformer(MODEL_NAME, device="cpu")
|
|
emb = model.encode(docs, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
|
|
|
|
index = faiss.IndexFlatIP(emb.shape[1])
|
|
index.add(emb)
|
|
|
|
faiss.write_index(index, "corpus.idx")
|
|
json.dump(meta, open("corpus.meta.json", "w"))
|
|
print(f"Indexé {len(docs)} passages.")
|