"""
Ontology Flattener
Converts hierarchical JSON-LD plant data to flat fact lists for OG-RAG HyperGraph
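
Sketch of the transformation (illustrative; the normalized field key in the
second fact is an assumption, since it is produced by
utils.key_normalizer.normalize_key):

    {"ten": "Atiso", "ten_khoa_hoc": "Cynara scolymus", "Mô tả": {...}}
    -> [{"Tên": "Atiso", "Tên khoa học": "Cynara scolymus", "_is_chunked": False},
        {"Tên": "Atiso", "Mục": "Mô tả", "<field>": "<value>", "_is_chunked": False},
        ...]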
"""
import sys
from pathlib import Path

# Add the parent directory to sys.path so the sibling `utils` package
# resolves when this module is run directly as a script
sys.path.insert(0, str(Path(__file__).parent.parent))

from typing import List, Dict, Any
import json
from utils.key_normalizer import normalize_key
from utils.chunker import chunk_long_value, estimate_tokens
from utils.data_loader import PlantDataLoader


def flatten_plant_ontology(
    plant_data: Dict[str, Any],
    chunk_threshold: int = 250
) -> List[Dict[str, Any]]:
    """
    Convert nested JSON-LD to flat fact list with intelligent chunking
    
    Args:
        plant_data: Nested plant ontology data
        chunk_threshold: Maximum tokens before chunking (default: 250)
        
    Returns:
        List of flat facts suitable for HyperGraph
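
    Example (minimal record with only basic-info keys, so neither chunking
    nor key normalization is exercised):
        >>> flatten_plant_ontology({"ten": "Atiso", "ho": "Asteraceae"})
        [{'Tên': 'Atiso', 'Họ': 'Asteraceae', '_is_chunked': False}]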
    """
    facts = []
    plant_name = plant_data.get("ten", "")
    
    if not plant_name:
        return facts
    
    # 1. Basic info, always kept together and never chunked.
    #    Keys: Tên = name, Tên khoa học = scientific name, Họ = family.
    basic_fact = {
        "Tên": plant_name,
        "Tên khoa học": plant_data.get("ten_khoa_hoc", ""),
        "Họ": plant_data.get("ho", "")
    }
    # Remove empty values
    basic_fact = {k: v for k, v in basic_fact.items() if v}
    if basic_fact:
        basic_fact["_is_chunked"] = False
        facts.append(basic_fact)
    
    # 2. Process each top-level section. Keys are Vietnamese:
    #    Mô tả = description, Phân bố = distribution, Công dụng = uses,
    #    Cách dùng = directions for use, Bộ phận dùng = parts used,
    #    Thông tin khác = other information. Each emitted fact records
    #    its section under the key "Mục" (section).
    sections = [
        "Mô tả", "Phân bố", 
        "Công dụng", "Cách dùng", "Bộ phận dùng",
        "Thông tin khác"
    ]
    
    for section in sections:
        if section not in plant_data:
            continue
        
        section_data = plant_data[section]
        
        if not isinstance(section_data, dict):
            continue
        
        # Process each field in section
        for field_key, field_value in section_data.items():
            if not field_value:  # skip None, empty strings, empty containers
                continue
            
            # Normalize key to Vietnamese
            normalized_key = normalize_key(field_key)
            
            # Convert to string
            value_str = str(field_value)
            
            # Split values that exceed the token threshold into chunks
            if estimate_tokens(value_str) > chunk_threshold:
                chunks = chunk_long_value(
                    normalized_key,
                    value_str,
                    max_tokens=chunk_threshold
                )
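                # chunk_long_value is assumed to yield (key, value, chunk_id)
                # triples, matching the unpacking in the loop below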
                
                for chunk_key, chunk_value, chunk_id in chunks:
                    fact = {
                        "Tên": plant_name,
                        "Mục": normalize_key(section),
                        chunk_key: chunk_value,
                        "_chunk_id": chunk_id,
                        "_is_chunked": True
                    }
                    facts.append(fact)
            else:
                # No chunking needed
                fact = {
                    "Tên": plant_name,
                    "Mục": normalize_key(section),
                    normalized_key: value_str,
                    "_is_chunked": False
                }
                facts.append(fact)
    
    return facts


def build_all_plant_facts(
    data_dir: str = "data",
    output_file: str = "plant_facts.json",
    chunk_threshold: int = 250
) -> List[Dict]:
    """
    Process all plants and generate flat facts
    
    Args:
        data_dir: Directory containing JSON-LD files
        output_file: Output file for facts (optional)
        chunk_threshold: Token threshold for chunking
        
    Returns:
        List of all facts from all plants
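
    Example (pass an empty output_file to skip writing to disk; the save
    step is guarded by ``if output_file:``):
        >>> facts = build_all_plant_facts("data", output_file="")  # doctest: +SKIP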
    """
    from tqdm import tqdm  # local import: only needed for batch processing
    
    loader = PlantDataLoader(data_dir)
    all_facts = []
    
    jsonld_files = sorted(Path(data_dir).glob("ontology_node_*.jsonld"))
    
    print(f"\nProcessing {len(jsonld_files)} plant files...")
    
    for jsonld_file in tqdm(jsonld_files, desc="Flattening plants"):
        # Load one plant record (relies on PlantDataLoader's private helper)
        plant_data = loader._load_jsonld_file(jsonld_file)
        
        if not plant_data:
            continue
        
        # Flatten + chunk
        plant_facts = flatten_plant_ontology(plant_data, chunk_threshold)
        all_facts.extend(plant_facts)
    
    # Save if output file specified
    if output_file:
        print(f"\nSaving {len(all_facts)} facts to {output_file}...")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(all_facts, f, ensure_ascii=False, indent=2)
    
    # Print statistics (guard the divisions so empty inputs do not crash)
    print(f"\n{'='*60}")
    print("STATISTICS")
    print(f"{'='*60}")
    print(f"Total plants processed: {len(jsonld_files)}")
    print(f"Total facts generated: {len(all_facts)}")
    if jsonld_files:
        print(f"Avg facts per plant: {len(all_facts) / len(jsonld_files):.1f}")
    
    chunked = [f for f in all_facts if f.get("_is_chunked", False)]
    if all_facts:
        print(f"Chunked facts: {len(chunked)} ({len(chunked) / len(all_facts) * 100:.1f}%)")
    print(f"Unchunked facts: {len(all_facts) - len(chunked)}")
    
    # Section coverage
    sections = [f.get("Mục") for f in all_facts if "Mục" in f]
    section_counts = {}
    for section in sections:
        section_counts[section] = section_counts.get(section, 0) + 1
    
    print(f"\nSection coverage:")
    for section, count in sorted(section_counts.items(), key=lambda x: -x[1]):
        print(f"  {section}: {count}")
    
    print(f"{'='*60}\n")
    
    return all_facts


if __name__ == "__main__":
    # sys is already imported at module level; read optional CLI arguments
    data_dir = sys.argv[1] if len(sys.argv) > 1 else "data"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "plant_facts.json"
    
    facts = build_all_plant_facts(data_dir, output_file)
    
    print(f"✅ Done! Generated {len(facts)} facts")
    print(f"📄 Saved to {output_file}")