import os
import re

import faiss
import numpy as np
import requests
from sentence_transformers import SentenceTransformer


def collect_markdown_files(root_dir):
    """Recursively walk the directory and load every .md file."""
    texts, sources, raw_contents = [], [], []
    for root, _dirs, files in os.walk(root_dir):
        for f in files:
            if f.endswith(".md"):
                full_path = os.path.join(root, f)
                rel_path = os.path.relpath(full_path, root_dir)
                try:
                    with open(full_path, "r", encoding="utf-8") as file:
                        content = file.read().strip()
                    if content:
                        # Prepend the relative path so file names are embedded too.
                        enriched = f"[File: {rel_path}]\n\n{content}"
                        texts.append(enriched)
                        sources.append(full_path)
                        raw_contents.append(content)
                except Exception as e:
                    print(f"Error reading {full_path}: {e}")
    return texts, sources, raw_contents


def build_faiss_index(texts, model):
    """Build a FAISS index over the sentence embeddings."""
    print("📦 Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)
    embeddings = np.asarray(embeddings, dtype="float32")  # FAISS expects float32
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings


def search_hybrid(query, index, texts, paths, raw_contents, model, root_dir, k=5):
    """Hybrid search: vector similarity plus keyword matching."""
    print("🔗 Vector search...")
    query_vector = np.asarray(model.encode([query]), dtype="float32")
    _, faiss_indices = index.search(query_vector, k)
    vector_results = [(texts[i], paths[i]) for i in faiss_indices[0]]

    print("🔍 Enhanced keyword search...")
    keywords = set(re.findall(r"\w+", query.lower()))
    keyword_hits = []
    for i, (path, content) in enumerate(zip(paths, raw_contents)):
        haystack = f"{path} {content}".lower()
        match_count = sum(1 for kw in keywords if kw in haystack)
        # "isg" is a corpus-specific term that always qualifies a document.
        if match_count >= 2 or "isg" in haystack:
            keyword_hits.append((texts[i], paths[i], match_count))
    keyword_hits.sort(key=lambda x: -x[2])
    keyword_results = [(doc, path) for doc, path, _ in keyword_hits[:5]]

    # Merge both result lists, keeping only the first occurrence of each file.
    seen = set()
    unique_results = []
    for doc, path in vector_results + keyword_results:
        if path not in seen:
            unique_results.append((doc, path))
            seen.add(path)

    top_contexts = [doc for doc, _ in unique_results[:3]]
    top_sources = [os.path.relpath(p, root_dir) for _, p in unique_results[:3]]
    return top_contexts, top_sources


def ask_ollama(prompt, model_name="llama3-8b-fast:latest"):
    """Send the prompt to a local Ollama server and return its answer."""
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model_name, "prompt": prompt, "stream": False},
        timeout=300,
    )
    response.raise_for_status()
    return response.json()["response"]


# === Main ===
ROOT_DIR = "Corpus"
MODEL_NAME = "all-MiniLM-L6-v2"

print("🔍 Loading markdown files...")
texts, paths, raw_contents = collect_markdown_files(ROOT_DIR)
print(f"📄 {len(texts)} files loaded.")

print("🧠 Loading the embedding model...")
model = SentenceTransformer(MODEL_NAME)
index, embeddings = build_faiss_index(texts, model)

# Interactive loop
while True:
    query = input("\n🔎 Ask your question (or press Enter to quit): ").strip()
    if not query:
        print("👋 Exiting.")
        break

    top_contexts, top_sources = search_hybrid(
        query, index, texts, paths, raw_contents, model, ROOT_DIR, k=10
    )

    context = "\n\n".join(top_contexts)
    fichiers_utilisés = "\n".join(f"- {src}" for src in top_sources)

    prompt = (
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        f"Answer clearly, citing thresholds or figures when available."
    )

    print("\n🧠 Calling the Ollama model...\n")
    reponse = ask_ollama(prompt)

    print("📘 Files used:\n", fichiers_utilisés)
    print("\n🧠 Answer:\n", reponse)