import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

class VectorDB:
    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        self.index = None
        self.documents = []  # original document dicts, in indexed order
        self.texts = []  # resolved body text, parallel to self.documents
    def load_documents(self, path="documents.json"):
        with open(path, "r", encoding="utf-8") as f:
            raw_docs = json.load(f)

        self.documents = []
        self.texts = []
        for i, doc in enumerate(raw_docs):
            # Accept any of the known field names for the document body.
            content = doc.get("content") or doc.get("text") or doc.get("data")
            if not content:
                print(f"⚠️ Skipping document {i}: no content/text/data field")
                continue
            self.documents.append(doc)
            self.texts.append(content)

        if not self.texts:
            raise ValueError("No valid documents found to index")

        # FAISS requires float32 vectors; encode() returns float32 by default,
        # but cast defensively in case the model config differs.
        embeddings = self.model.encode(self.texts, convert_to_numpy=True).astype(np.float32)
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings)

    def search(self, query, top_k=3):
        if self.index is None:
            return []
        query_embedding = self.model.encode([query], convert_to_numpy=True).astype(np.float32)
        # Never ask for more neighbours than there are indexed documents.
        distances, indices = self.index.search(query_embedding, min(top_k, len(self.texts)))
        results = []
        for idx in indices[0]:
            if idx == -1:  # FAISS pads with -1 when it has too few results
                continue
            # Return the resolved text: the raw doc may store it under
            # "text" or "data" rather than "content".
            results.append(self.texts[idx])
        return results
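
# Minimal usage sketch. Assumes a documents.json next to this file shaped
# like [{"content": "..."}, {"text": "..."}]; the query string below is
# purely illustrative.
if __name__ == "__main__":
    db = VectorDB()
    db.load_documents("documents.json")
    for hit in db.search("example query", top_k=2):
        print(hit)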