Delete rag_md.py

Stéphan Peccini 2025-05-19 14:27:38 +02:00
parent 54c6a309e6
commit b2c47048c7

rag_md.py (116 lines deleted)

@@ -1,116 +0,0 @@
import os
import re

import faiss
import numpy as np
import requests
from sentence_transformers import SentenceTransformer
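
# Minimal local RAG pipeline over a folder of Markdown notes: embed the
# files with sentence-transformers, index them in FAISS, then answer
# questions via a local Ollama model using a hybrid (vector + keyword)
# retrieval step.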
def collect_markdown_files(root_dir):
    """Recursively walk the directory and load the .md files."""
    texts, sources, raw_contents = [], [], []
    for root, dirs, files in os.walk(root_dir):
        for f in files:
            if f.endswith(".md"):
                full_path = os.path.join(root, f)
                rel_path = os.path.relpath(full_path, root_dir)
                try:
                    with open(full_path, "r", encoding="utf-8") as file:
                        content = file.read().strip()
                        if content:
                            # Prefix each document with its relative path so the
                            # path itself is searchable by both retrieval passes.
                            enriched = f"[File: {rel_path}]\n\n{content}"
                            texts.append(enriched)
                            sources.append(full_path)
                            raw_contents.append(content)
                except Exception as e:
                    print(f"Error reading {full_path}: {e}")
    return texts, sources, raw_contents

def build_faiss_index(texts, model):
    """Build the FAISS index from the document embeddings."""
    print("📦 Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)
    dim = embeddings.shape[1]
    # Exact (brute-force) L2 index; adequate for a corpus of this size.
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(embeddings))
    return index, embeddings
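
# For reference, a nearest-neighbour lookup against this index looks like
# (illustrative query string; distances are squared L2):
#   distances, ids = index.search(np.array(model.encode(["some question"])), 5)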
def search_hybrid(query, index, texts, paths, raw_contents, model, root_dir, k=5):
    """Run a hybrid search: vector similarity plus keyword matching."""
    print("🔗 Vector search...")
    query_vector = model.encode([query])
    # The index is passed in explicitly; the original version read it from
    # a global, which only worked by accident of definition order.
    _, faiss_indices = index.search(np.array(query_vector), k)
    vector_results = [(texts[i], paths[i]) for i in faiss_indices[0]]

    print("🔍 Enhanced keyword search...")
    query_lower = query.lower()
    keywords = set(re.findall(r'\w+', query_lower))
    keyword_hits = []
    for i, (path, content) in enumerate(zip(paths, raw_contents)):
        haystack = f"{path} {content}".lower()
        match_count = sum(1 for kw in keywords if kw in haystack)
        # 'isg' is a hard-coded, corpus-specific boost keyword.
        if match_count >= 2 or 'isg' in haystack:
            keyword_hits.append((texts[i], paths[i], match_count))
    keyword_hits.sort(key=lambda x: -x[2])
    keyword_results = [(doc, path) for doc, path, _ in keyword_hits[:5]]

    # Merge both result lists, de-duplicating by path while preserving order
    # (vector hits first, then keyword hits).
    combined = vector_results + keyword_results
    seen = set()
    unique_results = []
    for doc, path in combined:
        if path not in seen:
            unique_results.append((doc, path))
            seen.add(path)
    top_contexts = [doc for doc, _ in unique_results[:3]]
    top_sources = [os.path.relpath(p, root_dir) for _, p in unique_results[:3]]
    return top_contexts, top_sources

def ask_ollama(prompt, model_name="llama3-8b-fast:latest"):
    """Call the local Ollama model and return the generated text."""
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model_name, "prompt": prompt, "stream": False},
        timeout=300,  # local generation can be slow; don't hang forever
    )
    response.raise_for_status()
    return response.json()["response"]
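
# With "stream": False, Ollama's /api/generate returns a single JSON object
# whose "response" field holds the full completion, rather than a stream of
# partial chunks.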
# === Main ===
ROOT_DIR = "Corpus"
MODEL_NAME = "all-MiniLM-L6-v2"

print("🔍 Loading markdown files...")
texts, paths, raw_contents = collect_markdown_files(ROOT_DIR)
print(f"📄 {len(texts)} files loaded.")

print("🧠 Loading the embedding model...")
model = SentenceTransformer(MODEL_NAME)
index, embeddings = build_faiss_index(texts, model)

# Interactive loop
while True:
    query = input("\n🔎 Ask a question (or press Enter to quit): ").strip()
    if not query:
        print("👋 Done.")
        break
    top_contexts, top_sources = search_hybrid(
        query, index, texts, paths, raw_contents, model, ROOT_DIR, k=10
    )
    context = "\n\n".join(top_contexts)
    fichiers_utilisés = "\n".join(f"- {src}" for src in top_sources)
    prompt = (
        f"Context:\n{context}\n\n"
        f"Question: {query}\n"
        f"Answer clearly and cite thresholds or figures when available."
    )
    print("\n🧠 Calling the Ollama model...\n")
    reponse = ask_ollama(prompt)
    print("📘 Files used:\n", fichiers_utilisés)
    print("\n🧠 Response:\n", reponse)