Update index.py
parent cc54af71e6
commit 58f7cbf669
index.py
@@ -1,38 +1,63 @@
-# index.py (run in a venv or a python:3.11-slim container)
+#!/usr/bin/env python3
+"""
+Index the Fiches directory with BGE-M3 (FlagEmbedding) + FAISS.
+Splits the markdown documents into blocks of ~800 tokens, generates the embeddings,
+and writes corpus.idx and corpus.meta.json at the root.
+"""
+
 from pathlib import Path
-from sentence_transformers import SentenceTransformer
-import faiss, json, re
+import json
+import re
+
+import faiss
+from FlagEmbedding import BGEM3FlagModel
 
-MODEL_NAME = "WhereIsAI/bge-base-fr"
-CHUNK = 800    # tokens, roughly ~600 words
-OVERLAP = 100  # smoothing
+# --- Parameters --------------------------------------------------------------
+ROOT = Path("/app/Fiches")   # mounted directory containing the .md fiches
+MODEL_NAME = "BAAI/bge-m3"   # multilingual embedding model, MIT licence
+CHUNK = 800                  # target chunk size (≈600 words)
+OVERLAP = 100                # overlap for cohesion
+INDEX_FILE = "corpus.idx"
+META_FILE = "corpus.meta.json"
 
-def split(text):
-    # split cleanly on sentence/punctuation boundaries
-    sentences = re.split(r'(?<=[\.\!\?]) +', text)
+# --- Helper functions ---------------------------------------------------------
+
+def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
+    """Split a text into chunks of chunk_size words with overlap words of overlap."""
+    sentences = re.split(r"(?<=[\.\!\?])\s+", text)
     chunks, buf = [], []
     for s in sentences:
         buf.append(s)
-        if len(" ".join(buf)) > CHUNK:
+        if len(" ".join(buf)) > chunk_size:
             chunks.append(" ".join(buf))
-            buf = buf[-OVERLAP:]
-    if buf: chunks.append(" ".join(buf))
+            buf = buf[-overlap:]
+    if buf:
+        chunks.append(" ".join(buf))
     return chunks
 
-docs, meta = [], []
-for fp in Path("/app/Fiches").rglob("*.md"):
-    txt = fp.read_text(encoding="utf-8")
-    for i, chunk in enumerate(split(txt)):
-        docs.append(chunk)
-        meta.append({"file": fp.name, "part": i})
-
-model = SentenceTransformer(MODEL_NAME, device="cpu")
-emb = model.encode(docs, batch_size=64, show_progress_bar=True, normalize_embeddings=True)
-index = faiss.IndexFlatIP(emb.shape[1])
-index.add(emb)
-faiss.write_index(index, "corpus.idx")
-json.dump(meta, open("corpus.meta.json", "w"))
-print(f"Indexed {len(docs)} passages.")
+# --- Main pipeline ------------------------------------------------------------
+
+def main():
+    docs, meta = [], []
+    for fp in ROOT.rglob("*.md"):
+        text = fp.read_text(encoding="utf-8", errors="ignore")
+        for i, chunk in enumerate(split(text)):
+            docs.append(chunk)
+            meta.append({"file": fp.relative_to(ROOT).as_posix(), "part": i})
+
+    print(f"Split into {len(docs)} passages, generating embeddings…")
+
+    model = BGEM3FlagModel(MODEL_NAME, device="cpu")
+    emb = model.encode(docs, batch_size=64, normalize_embeddings=True)
+
+    index = faiss.IndexFlatIP(emb.shape[1])
+    index.add(emb.astype("float32"))
+    faiss.write_index(index, INDEX_FILE)
+
+    with open(META_FILE, "w", encoding="utf-8") as f:
+        json.dump(meta, f, ensure_ascii=False, indent=2)
+
+    print(f"Index written to {INDEX_FILE} with {len(docs)} vectors.")
+
+
+if __name__ == "__main__":
+    main()
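For quick verification of the artifacts this commit writes, a minimal query-side sketch is shown below. It is not part of the commit: the file name query.py, the search() helper, and the top-k value are illustrative, and it assumes that recent FlagEmbedding releases return a dict from encode() whose "dense_vecs" entry holds the dense vectors. It loads corpus.idx and corpus.meta.json as produced by index.py and runs a similarity search against the IndexFlatIP.

# query.py - hypothetical companion script, not part of this commit.
# Assumes corpus.idx and corpus.meta.json were written by index.py above.
import json

import faiss
import numpy as np
from FlagEmbedding import BGEM3FlagModel

INDEX_FILE = "corpus.idx"
META_FILE = "corpus.meta.json"

index = faiss.read_index(INDEX_FILE)
with open(META_FILE, encoding="utf-8") as f:
    meta = json.load(f)
# use_fp16=False keeps the model usable on CPU-only hosts.
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=False)

def search(question, k=5):
    # Recent FlagEmbedding releases return a dict from encode(); the dense
    # vectors sit under "dense_vecs" and are normalised for bge-m3, so inner
    # product on the IndexFlatIP behaves like cosine similarity.
    vecs = model.encode([question])["dense_vecs"]
    q = np.asarray(vecs, dtype="float32")
    scores, ids = index.search(q, k)
    return [(meta[i]["file"], meta[i]["part"], float(s))
            for s, i in zip(scores[0], ids[0]) if i != -1]

if __name__ == "__main__":
    for hit in search("example question about the fiches"):
        print(hit)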