Update index.py
This commit is contained in:
parent
8f8e041c6b
commit
03cc42c22d
17
index.py
17
index.py
@ -1,8 +1,8 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Indexation du répertoire Fiches avec BGE‑M3 (FlagEmbedding) + FAISS.
|
Indexation du répertoire Fiches avec BGE‑M3 (FlagEmbedding) + FAISS.
|
||||||
Découpe les documents markdown en blocs de ~800 tokens, génère les embeddings,
|
Parcourt récursivement tous les fichiers markdown / texte (extensions .md, .MD, .markdown, .txt),
|
||||||
et écrit corpus.idx et corpus.meta.json à la racine.
|
découpe en blocs de ~800 tokens, génère les embeddings, et écrit corpus.idx + corpus.meta.json.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -13,12 +13,13 @@ import numpy as np
|
|||||||
from FlagEmbedding import BGEM3FlagModel
|
from FlagEmbedding import BGEM3FlagModel
|
||||||
|
|
||||||
# --- Paramètres -------------------------------------------------------------
|
# --- Paramètres -------------------------------------------------------------
|
||||||
ROOT = Path("/app/Fiches") # dossier monté contenant les fiches .md
|
ROOT = Path("/app/Fiches") # dossier monté contenant les fiches
|
||||||
MODEL_NAME = "BAAI/bge-m3" # embedding multilingue, licence MIT
|
MODEL_NAME = "BAAI/bge-m3" # embedding multilingue, licence MIT
|
||||||
CHUNK = 800 # taille cible (≈600 mots)
|
CHUNK = 800 # taille cible (≈600 mots)
|
||||||
OVERLAP = 100 # chevauchement pour la cohésion
|
OVERLAP = 100 # chevauchement pour la cohésion
|
||||||
INDEX_FILE = "corpus.idx"
|
INDEX_FILE = "corpus.idx"
|
||||||
META_FILE = "corpus.meta.json"
|
META_FILE = "corpus.meta.json"
|
||||||
|
EXTENSIONS = ["*.md", "*.MD", "*.markdown", "*.txt"]
|
||||||
|
|
||||||
# --- Fonctions utilitaires --------------------------------------------------
|
# --- Fonctions utilitaires --------------------------------------------------
|
||||||
|
|
||||||
@ -37,14 +38,22 @@ def split(text: str, chunk_size: int = CHUNK, overlap: int = OVERLAP):
|
|||||||
|
|
||||||
# --- Pipeline principal -----------------------------------------------------
|
# --- Pipeline principal -----------------------------------------------------
|
||||||
|
|
||||||
|
def gather_files(root: Path, patterns=None):
    """Yield each regular file under *root* that matches a glob pattern.

    Args:
        root: Directory that is searched recursively.
        patterns: Iterable of glob patterns (e.g. ``"*.md"``). When omitted,
            the module-level ``EXTENSIONS`` list is used.

    Yields:
        ``Path`` objects, each at most once. Patterns are processed in the
        order given; within one pattern the paths are sorted so that the
        resulting order does not depend on the filesystem's listing order.
    """
    if patterns is None:
        patterns = EXTENSIONS

    seen = set()
    for pattern in patterns:
        for path in sorted(root.rglob(pattern)):
            # Overlapping patterns (or case-insensitive matching) may return
            # the same path more than once; emit it only the first time.
            if path in seen:
                continue
            # rglob also returns directories whose name matches the pattern;
            # callers read the result as text, so only regular files qualify.
            if not path.is_file():
                continue
            seen.add(path)
            yield path
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
docs, meta = [], []
|
docs, meta = [], []
|
||||||
for fp in ROOT.rglob("*.md"):
|
|
||||||
|
for fp in gather_files(ROOT):
|
||||||
text = fp.read_text(encoding="utf-8", errors="ignore")
|
text = fp.read_text(encoding="utf-8", errors="ignore")
|
||||||
for i, chunk in enumerate(split(text)):
|
for i, chunk in enumerate(split(text)):
|
||||||
docs.append(chunk)
|
docs.append(chunk)
|
||||||
meta.append({"file": fp.relative_to(ROOT).as_posix(), "part": i})
|
meta.append({"file": fp.relative_to(ROOT).as_posix(), "part": i})
|
||||||
|
|
||||||
|
if not docs:
|
||||||
|
raise SystemExit("Aucun fichier trouvé dans /app/Fiches. Vérifiez le montage ou les extensions.")
|
||||||
|
|
||||||
print(f"Découpé {len(docs)} passages, génération des embeddings…")
|
print(f"Découpé {len(docs)} passages, génération des embeddings…")
|
||||||
|
|
||||||
model = BGEM3FlagModel(MODEL_NAME, device="cpu")
|
model = BGEM3FlagModel(MODEL_NAME, device="cpu")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user