# index.py (run in a venv or a python:3.11-slim container)
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss, json, re

MODEL_NAME = "WhereIsAI/bge-base-fr"
CHUNK = 800    # target chunk size in characters (len() below counts characters, not tokens)
OVERLAP = 100  # characters of trailing context carried into the next chunk

def split(text):
    # Split cleanly on sentence-ending punctuation, then pack sentences into chunks.
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, buf = [], []
    for s in sentences:
        buf.append(s)
        if len(" ".join(buf)) > CHUNK:
            chunks.append(" ".join(buf))
            # Keep only the trailing sentences, up to OVERLAP characters,
            # so consecutive chunks share a little context.
            tail, size = [], 0
            for prev in reversed(buf):
                size += len(prev) + 1
                if size > OVERLAP:
                    break
                tail.append(prev)
            buf = tail[::-1]
    if buf:
        chunks.append(" ".join(buf))
    return chunks

docs, meta = [], []
for fp in Path("/app/Fiches").rglob("*.md"):
    txt = fp.read_text(encoding="utf-8")
    for i, chunk in enumerate(split(txt)):
        docs.append(chunk)
        meta.append({"file": fp.name, "part": i})

model = SentenceTransformer(MODEL_NAME, device="cpu")
emb = model.encode(docs, batch_size=64, show_progress_bar=True,
                   normalize_embeddings=True)  # unit vectors: inner product == cosine

index = faiss.IndexFlatIP(emb.shape[1])  # exact (brute-force) inner-product index
index.add(emb)
faiss.write_index(index, "corpus.idx")
with open("corpus.meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False)
print(f"Indexed {len(docs)} passages.")
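
# --- Usage sketch (an assumption, not part of the original script): a
# --- minimal example of querying the artifacts written above, e.g. from
# --- a separate query.py. The example query string and k=5 are hypothetical;
# --- the model must be the same one used at indexing time.
import faiss, json
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("WhereIsAI/bge-base-fr", device="cpu")
index = faiss.read_index("corpus.idx")
with open("corpus.meta.json", encoding="utf-8") as f:
    meta = json.load(f)

def search(query, k=5):
    # Normalized query against IndexFlatIP: returned scores are cosine similarities.
    q = model.encode([query], normalize_embeddings=True)
    scores, ids = index.search(q, k)
    return [(meta[i]["file"], meta[i]["part"], float(s))
            for s, i in zip(scores[0], ids[0])]

print(search("Comment configurer l'environnement ?"))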