"""
Import Pre-generated Embeddings to Supabase
Use this after running generate_embeddings_kaggle.ipynb on Kaggle
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

import json
import numpy as np
from tqdm import tqdm

from config import get_settings
from services.vector_db_service import SupabaseVectorDB


def import_embeddings_from_json(
    embeddings_file: str = "plant_hypernodes_with_embeddings.json",
    batch_size: int = 200
):
    """
    Import pre-generated embeddings from JSON file
    
    Args:
        embeddings_file: Path to JSON file with HyperNodes + embeddings
        batch_size: Batch size for insertion
    """
    print(f"\n{'='*60}")
    print(f"Importing Pre-generated Embeddings to Supabase")
    print(f"{'='*60}\n")
    
    # Load hypernodes with embeddings
    print(f"Loading {embeddings_file}...")
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        hypernodes = json.load(f)
    
    print(f"Loaded {len(hypernodes)} HyperNodes with embeddings")
    
    # Initialize Supabase
    print("\nConnecting to Supabase...")
    settings = get_settings()
    vector_db = SupabaseVectorDB(
        url=settings.supabase_url,
        key=settings.supabase_anon_key
    )
    
    # Check existing
    existing_count = vector_db.count_nodes()
    if existing_count > 0:
        print(f"\n⚠️  Warning: Database already has {existing_count} nodes")
        response = input("Clear existing nodes? (yes/no): ")
        if response.lower() == 'yes':
            print("Clearing database...")
            vector_db.clear_all_nodes()
            print("✅ Database cleared")
    
    # Insert in batches
    print(f"\nInserting {len(hypernodes)} nodes (batch size: {batch_size})...")
    
    for i in tqdm(range(0, len(hypernodes), batch_size), desc="Inserting batches"):
        batch = hypernodes[i:i+batch_size]
        
        try:
            vector_db.insert_hypernodes_batch(batch)
        except Exception as e:
            error_msg = str(e)
            if "duplicate" in error_msg.lower():
                print(f"\n⚠️ Skipping batch {i//batch_size} (duplicates)")
            else:
                print(f"\n❌ Error in batch {i//batch_size}: {error_msg}")
                print("Retrying with smaller batches...")
                # Retry in smaller chunks
                for j in range(0, len(batch), 10):
                    mini_batch = batch[j:j+10]
                    try:
                        vector_db.insert_hypernodes_batch(mini_batch)
                    except Exception as e2:
                        print(f"  Failed mini-batch at {j}: {str(e2)[:100]}")
    
    # Final statistics
    final_count = vector_db.count_nodes()
    
    print(f"\n{'='*60}")
    print(f"IMPORT COMPLETE")
    print(f"{'='*60}")
    print(f"Total HyperNodes in database: {final_count}")
    print(f"Expected: {len(hypernodes)}")
    print(f"Success rate: {final_count/len(hypernodes)*100:.1f}%")
    print(f"{'='*60}\n")
    
    print("✅ Embeddings successfully imported to Supabase!")


def import_embeddings_from_npz(
    embeddings_file: str = "plant_embeddings.npz",
    metadata_file: str = "plant_metadata.json",
    batch_size: int = 200
):
    """
    Import from compressed NumPy format
    
    Args:
        embeddings_file: Path to .npz file with embeddings
        metadata_file: Path to JSON file with node metadata
        batch_size: Batch size for insertion
    """
    print(f"\n{'='*60}")
    print(f"Importing from NPZ format")
    print(f"{'='*60}\n")
    
    # Load embeddings
    print(f"Loading {embeddings_file}...")
    data = np.load(embeddings_file)
    key_embeddings = data['key_embeddings']
    value_embeddings = data['value_embeddings']
    
    print(f"Loaded embeddings:")
    print(f"  Keys: {key_embeddings.shape}")
    print(f"  Values: {value_embeddings.shape}")
    
    # Load metadata
    print(f"\nLoading {metadata_file}...")
    with open(metadata_file, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    
    print(f"Loaded {len(metadata)} metadata entries")
    
    # Combine
    hypernodes = []
    for i, meta in enumerate(metadata):
        node = meta.copy()
        node['key_embedding'] = key_embeddings[i].tolist()
        node['value_embedding'] = value_embeddings[i].tolist()
        hypernodes.append(node)
    
    print(f"Combined into {len(hypernodes)} HyperNodes\n")
    
    # Use JSON import function
    import_embeddings_from_json.__wrapped__(hypernodes, batch_size)


if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description="Import pre-generated embeddings")
    parser.add_argument("--format", choices=['json', 'npz'], default='json',
                       help="Input format")
    parser.add_argument("--embeddings", default="plant_hypernodes_with_embeddings.json",
                       help="Path to embeddings file")
    parser.add_argument("--metadata", default="plant_metadata.json",
                       help="Path to metadata file (for NPZ format)")
    parser.add_argument("--batch-size", type=int, default=200,
                       help="Batch size for insertion")
    
    args = parser.parse_args()
    
    if args.format == 'json':
        import_embeddings_from_json(args.embeddings, args.batch_size)
    else:
        import_embeddings_from_npz(args.embeddings, args.metadata, args.batch_size)