"""
Ontology Flattener
Converts hierarchical JSON-LD plant data to flat fact lists for OG-RAG HyperGraph
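
Sketch of the transformation (illustrative; the normalized field key in the
second fact is an assumption, since it is produced by
utils.key_normalizer.normalize_key):

    {"ten": "Atiso", "ten_khoa_hoc": "Cynara scolymus", "Mô tả": {...}}
    -> [{"Tên": "Atiso", "Tên khoa học": "Cynara scolymus", "_is_chunked": False},
        {"Tên": "Atiso", "Mục": "Mô tả", "<field>": "<value>", "_is_chunked": False},
        ...]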
"""
import sys
from pathlib import Path

# Add the parent directory to sys.path so the sibling `utils` package
# resolves when this module is run directly as a script
sys.path.insert(0, str(Path(__file__).parent.parent))

from typing import List, Dict, Any
import json
from utils.key_normalizer import normalize_key
from utils.chunker import chunk_long_value, estimate_tokens
from utils.data_loader import PlantDataLoader


def flatten_plant_ontology(
    plant_data: Dict[str, Any],
    chunk_threshold: int = 250
) -> List[Dict[str, Any]]:
    """
    Convert nested JSON-LD to flat fact list with intelligent chunking
    
    Args:
        plant_data: Nested plant ontology data
        chunk_threshold: Maximum tokens before chunking (default: 250)
        
    Returns:
        List of flat facts suitable for HyperGraph
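
    Example (minimal record with only basic-info keys, so neither chunking
    nor key normalization is exercised):
        >>> flatten_plant_ontology({"ten": "Atiso", "ho": "Asteraceae"})
        [{'Tên': 'Atiso', 'Họ': 'Asteraceae', '_is_chunked': False}]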
    """
    facts = []
    plant_name = plant_data.get("ten", "")
    
    if not plant_name:
        return facts
    
    # 1. Basic info, always kept together and never chunked.
    #    Keys: Tên = name, Tên khoa học = scientific name, Họ = family.
    basic_fact = {
        "Tên": plant_name,
        "Tên khoa học": plant_data.get("ten_khoa_hoc", ""),
        "Họ": plant_data.get("ho", "")
    }
    # Remove empty values
    basic_fact = {k: v for k, v in basic_fact.items() if v}
    if basic_fact:
        basic_fact["_is_chunked"] = False
        facts.append(basic_fact)
    
    # 2. Process each top-level section. Keys are Vietnamese:
    #    Mô tả = description, Phân bố = distribution, Công dụng = uses,
    #    Cách dùng = directions for use, Bộ phận dùng = parts used,
    #    Thông tin khác = other information. Each emitted fact records
    #    its section under the key "Mục" (section).
    sections = [
        "Mô tả", "Phân bố", 
        "Công dụng", "Cách dùng", "Bộ phận dùng",
        "Thông tin khác"
    ]
    
    for section in sections:
        if section not in plant_data:
            continue
        
        section_data = plant_data[section]
        
        if not isinstance(section_data, dict):
            continue
        
        # Process each field in section
        for field_key, field_value in section_data.items():
            if not field_value:  # skip None, empty strings, empty containers
                continue
            
            # Normalize key to Vietnamese
            normalized_key = normalize_key(field_key)
            
            # Convert to string
            value_str = str(field_value)
            
            # Split values that exceed the token threshold into chunks
            if estimate_tokens(value_str) > chunk_threshold:
                chunks = chunk_long_value(
                    normalized_key,
                    value_str,
                    max_tokens=chunk_threshold
                )
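                # chunk_long_value is assumed to yield (key, value, chunk_id)
                # triples, matching the unpacking in the loop below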
                
                for chunk_key, chunk_value, chunk_id in chunks:
                    fact = {
                        "Tên": plant_name,
                        "Mục": normalize_key(section),
                        chunk_key: chunk_value,
                        "_chunk_id": chunk_id,
                        "_is_chunked": True
                    }
                    facts.append(fact)
            else:
                # No chunking needed
                fact = {
                    "Tên": plant_name,
                    "Mục": normalize_key(section),
                    normalized_key: value_str,
                    "_is_chunked": False
                }
                facts.append(fact)
    
    return facts


def build_all_plant_facts(
    data_dir: str = "data",
    output_file: str = "plant_facts.json",
    chunk_threshold: int = 250
) -> List[Dict]:
    """
    Process all plants and generate flat facts
    
    Args:
        data_dir: Directory containing JSON-LD files
        output_file: Output file for facts (optional)
        chunk_threshold: Token threshold for chunking
        
    Returns:
        List of all facts from all plants
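
    Example (pass an empty output_file to skip writing to disk; the save
    step is guarded by ``if output_file:``):
        >>> facts = build_all_plant_facts("data", output_file="")  # doctest: +SKIP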
    """
    from tqdm import tqdm  # local import: only needed for batch processing
    
    loader = PlantDataLoader(data_dir)
    all_facts = []
    
    jsonld_files = sorted(Path(data_dir).glob("ontology_node_*.jsonld"))
    
    print(f"\nProcessing {len(jsonld_files)} plant files...")
    
    for jsonld_file in tqdm(jsonld_files, desc="Flattening plants"):
        # Load one plant record (relies on PlantDataLoader's private helper)
        plant_data = loader._load_jsonld_file(jsonld_file)
        
        if not plant_data:
            continue
        
        # Flatten + chunk
        plant_facts = flatten_plant_ontology(plant_data, chunk_threshold)
        all_facts.extend(plant_facts)
    
    # Save if output file specified
    if output_file:
        print(f"\nSaving {len(all_facts)} facts to {output_file}...")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(all_facts, f, ensure_ascii=False, indent=2)
    
    # Print statistics (guard the divisions so empty inputs do not crash)
    print(f"\n{'='*60}")
    print("STATISTICS")
    print(f"{'='*60}")
    print(f"Total plants processed: {len(jsonld_files)}")
    print(f"Total facts generated: {len(all_facts)}")
    if jsonld_files:
        print(f"Avg facts per plant: {len(all_facts) / len(jsonld_files):.1f}")
    
    chunked = [f for f in all_facts if f.get("_is_chunked", False)]
    if all_facts:
        print(f"Chunked facts: {len(chunked)} ({len(chunked) / len(all_facts) * 100:.1f}%)")
    print(f"Unchunked facts: {len(all_facts) - len(chunked)}")
    
    # Section coverage
    sections = [f.get("Mục") for f in all_facts if "Mục" in f]
    section_counts = {}
    for section in sections:
        section_counts[section] = section_counts.get(section, 0) + 1
    
    print(f"\nSection coverage:")
    for section, count in sorted(section_counts.items(), key=lambda x: -x[1]):
        print(f"  {section}: {count}")
    
    print(f"{'='*60}\n")
    
    return all_facts


if __name__ == "__main__":
    # sys is already imported at module level; read optional CLI arguments
    data_dir = sys.argv[1] if len(sys.argv) > 1 else "data"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "plant_facts.json"
    
    facts = build_all_plant_facts(data_dir, output_file)
    
    print(f"✅ Done! Generated {len(facts)} facts")
    print(f"📄 Saved to {output_file}")