"""
Build HyperGraph and index to Supabase.

Converts flat facts to HyperNodes with embeddings and stores them in a
vector DB.
"""

import sys
from pathlib import Path

# Make the project root importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))
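
# Pipeline: read the flattened facts JSON, turn each fact field into a
# HyperNode, embed keys and values, then write the nodes to Supabase.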

import json

from tqdm import tqdm

from config import get_settings
from services.embedding_service import VietnameseEmbeddingService
from services.vector_db_service import SupabaseVectorDB


def build_and_index_hypergraph(
    facts_file: str = "plant_facts.json",
    batch_size: int = 100
):
    """
    Build HyperGraph from facts and index to Supabase.

    Args:
        facts_file: Path to flattened facts JSON.
        batch_size: Batch size for embedding and insertion.
    """
    print(f"\n{'='*60}")
    print("Building HyperGraph and Indexing to Supabase")
    print(f"{'='*60}\n")
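
    # The facts file is expected to be a JSON list of flat dicts whose keys
    # include "Tên" (name), "Mục" (section), underscore-prefixed chunking
    # metadata, and free-form fact fields.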

    print(f"Loading facts from {facts_file}...")
    with open(facts_file, 'r', encoding='utf-8') as f:
        facts = json.load(f)
    print(f"Loaded {len(facts)} facts")

    print("\nInitializing services...")
    settings = get_settings()
    embed_service = VietnameseEmbeddingService()
    vector_db = SupabaseVectorDB(
        url=settings.supabase_url,
        key=settings.supabase_anon_key
    )
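    # Note: this assumes the anon key is permitted to insert rows (e.g. via
    # permissive RLS policies); otherwise a service-role key may be needed.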

    existing_count = vector_db.count_nodes()
    if existing_count > 0:
        print(f"\n⚠️ Warning: Database already has {existing_count} nodes")
        response = input("Clear existing nodes? (yes/no): ")
        if response.lower() == 'yes':
            print("Clearing database...")
            vector_db.clear_all_nodes()
            print("✅ Database cleared")
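    # Any other answer keeps the existing nodes; duplicate inserts will be
    # caught and skipped during batch indexing below.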

    print("\nBuilding HyperNodes from facts...")
    hypernodes = []

    for fact in tqdm(facts, desc="Processing facts"):
        # "Tên" (name) and "Mục" (section) are Vietnamese field labels
        # carried over from the source data.
        plant_name = fact.get("Tên", "")
        section = fact.get("Mục", "")
        chunk_id = fact.get("_chunk_id", 0)
        is_chunked = fact.get("_is_chunked", False)
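
        # Each remaining key/value pair becomes its own HyperNode; metadata
        # (underscore-prefixed) and the identity fields above are skipped.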
        for key, value in fact.items():
            if key.startswith("_") or key in ["Tên", "Mục"]:
                continue

            hypernode = {
                "key": key,
                "value": str(value),
                "plant_name": plant_name,
                "section": section if section else None,
                "chunk_id": chunk_id,
                "is_chunked": is_chunked
            }
            hypernodes.append(hypernode)

    print(f"Generated {len(hypernodes)} HyperNodes")
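
    # Each node gets two embeddings (key and value) so search can match a
    # query against either a fact's label or its content.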

    print(f"\nEmbedding and indexing (batch size: {batch_size})...")

    for i in tqdm(range(0, len(hypernodes), batch_size), desc="Indexing batches"):
        batch = hypernodes[i:i+batch_size]

        keys = [node["key"] for node in batch]
        values = [node["value"] for node in batch]

        key_embeddings = embed_service.embed_batch(keys, batch_size=len(keys))
        value_embeddings = embed_service.embed_batch(values, batch_size=len(values))

        nodes_with_embeddings = []
        for j, node in enumerate(batch):
            node["key_embedding"] = key_embeddings[j]
            node["value_embedding"] = value_embeddings[j]
            nodes_with_embeddings.append(node)
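
        # Insert the batch: duplicate-key failures are skipped, any other
        # failure falls back to a mini-batch retry.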
        try:
            vector_db.insert_hypernodes_batch(nodes_with_embeddings)
        except Exception as e:
            error_msg = str(e)

            if "duplicate" in error_msg.lower():
                print(f"\n⚠️ Skipping batch {i//batch_size} (duplicates)")
            else:
                print(f"\n❌ Error in batch {i//batch_size}: {error_msg}")
                print("Retrying with smaller batches...")

                # Retry in mini-batches of 10 so a single bad row doesn't
                # sink the whole batch.
                for j in range(0, len(nodes_with_embeddings), 10):
                    mini_batch = nodes_with_embeddings[j:j+10]
                    try:
                        vector_db.insert_hypernodes_batch(mini_batch)
                    except Exception as e2:
                        print(f" Failed mini-batch at {j}: {str(e2)[:100]}")

    final_count = vector_db.count_nodes()

    print(f"\n{'='*60}")
    print("INDEXING COMPLETE")
    print(f"{'='*60}")
    print(f"Total HyperNodes indexed: {final_count}")
    print(f"Expected: {len(hypernodes)}")
    if hypernodes:
        print(f"Success rate: {final_count/len(hypernodes)*100:.1f}%")
    print(f"{'='*60}\n")
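
    # Quick smoke test: "chữa ho" is Vietnamese for "treat a cough".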
    print("Testing vector search...")
    test_query = "chữa ho"
    query_emb = embed_service.embed_text(test_query)
    results = vector_db.search_by_value(query_emb, top_k=5, threshold=0.3)

    print(f"\nQuery: '{test_query}'")
    print(f"Top {len(results)} results:")
    for i, result in enumerate(results, 1):
        print(f"{i}. {result['plant_name']} - {result['key']}: "
              f"{result['value'][:60]}... (sim: {result['similarity']:.3f})")

    print("\n✅ HyperGraph successfully built and indexed!")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Build and index HyperGraph")
    parser.add_argument("--facts", default="plant_facts.json", help="Path to facts JSON")
    parser.add_argument("--batch-size", type=int, default=100, help="Batch size")

    args = parser.parse_args()

    build_and_index_hypergraph(args.facts, args.batch_size)
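
    # Usage sketch (script name illustrative):
    #   python build_hypergraph.py --facts plant_facts.json --batch-size 100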