"""
Import Pre-generated Embeddings to Supabase
Use this after running generate_embeddings_kaggle.ipynb on Kaggle
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import json
import numpy as np
from tqdm import tqdm
from config import get_settings
from services.vector_db_service import SupabaseVectorDB
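
# Expected shape of one imported record (an assumption inferred from the NPZ
# path below; the authoritative schema is whatever generate_embeddings_kaggle.ipynb
# emits):
#
#   {
#       ...node metadata fields...,
#       "key_embedding": [...],    # list[float]
#       "value_embedding": [...],  # list[float]
#   }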


def import_embeddings_from_json(
    embeddings_file: str = "plant_hypernodes_with_embeddings.json",
    batch_size: int = 200
):
    """
    Import pre-generated embeddings from a JSON file.

    Args:
        embeddings_file: Path to JSON file with HyperNodes + embeddings
        batch_size: Batch size for insertion
    """
print(f"\n{'='*60}")
print(f"Importing Pre-generated Embeddings to Supabase")
print(f"{'='*60}\n")
# Load hypernodes with embeddings
print(f"Loading {embeddings_file}...")
with open(embeddings_file, 'r', encoding='utf-8') as f:
hypernodes = json.load(f)
print(f"Loaded {len(hypernodes)} HyperNodes with embeddings")

    # Hand off to the shared batch-insertion helper (also used by the NPZ path)
    _insert_hypernodes(hypernodes, batch_size)


def _insert_hypernodes(hypernodes: list, batch_size: int = 200):
    """
    Insert HyperNodes (with embeddings) into Supabase in batches.

    Shared by the JSON and NPZ import paths.
    """
    # Initialize Supabase
    print("\nConnecting to Supabase...")
    settings = get_settings()
    vector_db = SupabaseVectorDB(
        url=settings.supabase_url,
        key=settings.supabase_anon_key
    )

    # Check existing
    existing_count = vector_db.count_nodes()
    if existing_count > 0:
        print(f"\n⚠️ Warning: Database already has {existing_count} nodes")
        response = input("Clear existing nodes? (yes/no): ")
        if response.lower() == 'yes':
            print("Clearing database...")
            vector_db.clear_all_nodes()
            print("✅ Database cleared")

    # Insert in batches
    print(f"\nInserting {len(hypernodes)} nodes (batch size: {batch_size})...")
    for i in tqdm(range(0, len(hypernodes), batch_size), desc="Inserting batches"):
        batch = hypernodes[i:i + batch_size]
        try:
            vector_db.insert_hypernodes_batch(batch)
        except Exception as e:
            error_msg = str(e)
            if "duplicate" in error_msg.lower():
                print(f"\n⚠️ Skipping batch {i // batch_size} (duplicates)")
            else:
                print(f"\n❌ Error in batch {i // batch_size}: {error_msg}")
                print("Retrying with smaller batches...")
                # Retry in smaller chunks so a single bad row does not
                # fail the whole batch
                for j in range(0, len(batch), 10):
                    mini_batch = batch[j:j + 10]
                    try:
                        vector_db.insert_hypernodes_batch(mini_batch)
                    except Exception as e2:
                        print(f"  Failed mini-batch at {j}: {str(e2)[:100]}")

    # Final statistics
    final_count = vector_db.count_nodes()
    print(f"\n{'=' * 60}")
    print("IMPORT COMPLETE")
    print(f"{'=' * 60}")
    print(f"Total HyperNodes in database: {final_count}")
    print(f"Expected: {len(hypernodes)}")
    print(f"Success rate: {final_count / len(hypernodes) * 100:.1f}%")
    print(f"{'=' * 60}\n")
    print("✅ Embeddings successfully imported to Supabase!")


def import_embeddings_from_npz(
    embeddings_file: str = "plant_embeddings.npz",
    metadata_file: str = "plant_metadata.json",
    batch_size: int = 200
):
    """
    Import from compressed NumPy (.npz) format.

    Args:
        embeddings_file: Path to .npz file with embeddings
        metadata_file: Path to JSON file with node metadata
        batch_size: Batch size for insertion
    """
print(f"\n{'='*60}")
print(f"Importing from NPZ format")
print(f"{'='*60}\n")
# Load embeddings
print(f"Loading {embeddings_file}...")
data = np.load(embeddings_file)
key_embeddings = data['key_embeddings']
value_embeddings = data['value_embeddings']
print(f"Loaded embeddings:")
print(f" Keys: {key_embeddings.shape}")
print(f" Values: {value_embeddings.shape}")
# Load metadata
print(f"\nLoading {metadata_file}...")
with open(metadata_file, 'r', encoding='utf-8') as f:
metadata = json.load(f)
print(f"Loaded {len(metadata)} metadata entries")

    # Pair each metadata record with its embeddings (row i of each array)
    hypernodes = []
    for i, meta in enumerate(metadata):
        node = meta.copy()
        node['key_embedding'] = key_embeddings[i].tolist()
        node['value_embedding'] = value_embeddings[i].tolist()
        hypernodes.append(node)
    print(f"Combined into {len(hypernodes)} HyperNodes\n")

    # Reuse the shared insertion helper rather than calling the JSON importer,
    # which expects a file path
    _insert_hypernodes(hypernodes, batch_size)
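

# For reference, a sketch of how the .npz/.json inputs could be produced on
# Kaggle (an assumption; the actual notebook may differ, but the array names
# 'key_embeddings' and 'value_embeddings' must match the loader above):
#
#   np.savez_compressed(
#       "plant_embeddings.npz",
#       key_embeddings=np.asarray(key_vectors, dtype=np.float32),
#       value_embeddings=np.asarray(value_vectors, dtype=np.float32),
#   )
#   with open("plant_metadata.json", "w", encoding="utf-8") as f:
#       json.dump(metadata_records, f, ensure_ascii=False)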


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Import pre-generated embeddings")
    parser.add_argument("--format", choices=['json', 'npz'], default='json',
                        help="Input format")
    parser.add_argument("--embeddings", default="plant_hypernodes_with_embeddings.json",
                        help="Path to embeddings file")
    parser.add_argument("--metadata", default="plant_metadata.json",
                        help="Path to metadata file (for NPZ format)")
    parser.add_argument("--batch-size", type=int, default=200,
                        help="Batch size for insertion")
    args = parser.parse_args()

    if args.format == 'json':
        import_embeddings_from_json(args.embeddings, args.batch_size)
    else:
        import_embeddings_from_npz(args.embeddings, args.metadata, args.batch_size)
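
# Example invocations (script name is assumed; adjust to the actual filename):
#   python import_embeddings.py --format json \
#       --embeddings plant_hypernodes_with_embeddings.json
#   python import_embeddings.py --format npz --embeddings plant_embeddings.npz \
#       --metadata plant_metadata.json --batch-size 100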