"""
Ontology Flattener
Converts hierarchical JSON-LD plant data into flat fact lists for the OG-RAG HyperGraph
"""
import sys
from pathlib import Path

# Add the parent directory to the import path so the sibling `utils` package resolves
sys.path.insert(0, str(Path(__file__).parent.parent))

from typing import List, Dict, Any
import json

from utils.key_normalizer import normalize_key
from utils.chunker import chunk_long_value, estimate_tokens  # chunker yields (key, value, chunk_id) tuples
from utils.data_loader import PlantDataLoader


def flatten_plant_ontology(
    plant_data: Dict[str, Any],
    chunk_threshold: int = 250
) -> List[Dict[str, Any]]:
    """
    Convert nested JSON-LD to a flat fact list with intelligent chunking.

    Args:
        plant_data: Nested plant ontology data
        chunk_threshold: Maximum tokens before chunking (default: 250)

    Returns:
        List of flat facts suitable for the HyperGraph
    """
    facts = []
    plant_name = plant_data.get("ten", "")
    if not plant_name:
        return facts

    # 1. Basic info (always kept together, never chunked)
    basic_fact = {
        "Tên": plant_name,                                   # name
        "Tên khoa học": plant_data.get("ten_khoa_hoc", ""),  # scientific name
        "Họ": plant_data.get("ho", "")                       # family
    }
    # Remove empty values
    basic_fact = {k: v for k, v in basic_fact.items() if v}
    if basic_fact:
        basic_fact["_is_chunked"] = False
        facts.append(basic_fact)

    # 2. Process each section: description, distribution, uses,
    #    preparation, parts used, other information
    sections = [
        "Mô tả", "Phân bố",
        "Công dụng", "Cách dùng", "Bộ phận dùng",
        "Thông tin khác"
    ]
    for section in sections:
        if section not in plant_data:
            continue
        section_data = plant_data[section]
        if not isinstance(section_data, dict):
            continue

        # Process each field in the section
        for field_key, field_value in section_data.items():
            if not field_value:  # skips None and "" alike
                continue

            # Normalize the key to Vietnamese
            normalized_key = normalize_key(field_key)
            # Convert to string
            value_str = str(field_value)

            # Chunk if the value exceeds the token threshold
            if estimate_tokens(value_str) > chunk_threshold:
                chunks = chunk_long_value(
                    normalized_key,
                    value_str,
                    max_tokens=chunk_threshold
                )
                for chunk_key, chunk_value, chunk_id in chunks:
                    fact = {
                        "Tên": plant_name,
                        "Mục": normalize_key(section),  # section label
                        chunk_key: chunk_value,
                        "_chunk_id": chunk_id,
                        "_is_chunked": True
                    }
                    facts.append(fact)
            else:
                # No chunking needed
                fact = {
                    "Tên": plant_name,
                    "Mục": normalize_key(section),
                    normalized_key: value_str,
                    "_is_chunked": False
                }
                facts.append(fact)

    return facts
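
# A minimal, hedged sketch of the fact shapes this function emits. The sample
# record below is invented for illustration; real JSON-LD files in `data/` may
# carry different fields. With a short value, no chunking fires and we get one
# basic-info fact plus one section fact:
#
#   sample = {
#       "ten": "Đinh lăng",
#       "ten_khoa_hoc": "Polyscias fruticosa",
#       "ho": "Araliaceae",
#       "Công dụng": {"cong_dung_chinh": "bồi bổ sức khỏe"},  # = health tonic
#   }
#   flatten_plant_ontology(sample)
#   # -> [{"Tên": "Đinh lăng", "Tên khoa học": "Polyscias fruticosa",
#   #      "Họ": "Araliaceae", "_is_chunked": False},
#   #     {"Tên": "Đinh lăng", "Mục": normalize_key("Công dụng"),
#   #      normalize_key("cong_dung_chinh"): "bồi bổ sức khỏe",
#   #      "_is_chunked": False}]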


def build_all_plant_facts(
    data_dir: str = "data",
    output_file: str = "plant_facts.json",
    chunk_threshold: int = 250
) -> List[Dict]:
    """
    Process all plants and generate flat facts.

    Args:
        data_dir: Directory containing JSON-LD files
        output_file: Output file for facts (optional)
        chunk_threshold: Token threshold for chunking

    Returns:
        List of all facts from all plants
    """
    from tqdm import tqdm  # lazy import: only needed for the progress bar

    loader = PlantDataLoader(data_dir)
    all_facts = []
    jsonld_files = sorted(Path(data_dir).glob("ontology_node_*.jsonld"))
    if not jsonld_files:
        print(f"No ontology_node_*.jsonld files found in {data_dir}")
        return all_facts

    print(f"\nProcessing {len(jsonld_files)} plant files...")
    for jsonld_file in tqdm(jsonld_files, desc="Flattening plants"):
        # Load plant data (reuses the loader's internal JSON-LD reader)
        plant_data = loader._load_jsonld_file(jsonld_file)
        if not plant_data:
            continue
        # Flatten + chunk
        plant_facts = flatten_plant_ontology(plant_data, chunk_threshold)
        all_facts.extend(plant_facts)

    # Save if an output file was specified
    if output_file:
        print(f"\nSaving {len(all_facts)} facts to {output_file}...")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(all_facts, f, ensure_ascii=False, indent=2)

    # Print statistics
    print(f"\n{'=' * 60}")
    print("STATISTICS")
    print(f"{'=' * 60}")
    print(f"Total plants processed: {len(jsonld_files)}")
    print(f"Total facts generated: {len(all_facts)}")
    print(f"Avg facts per plant: {len(all_facts) / len(jsonld_files):.1f}")

    chunked = [f for f in all_facts if f.get("_is_chunked", False)]
    if all_facts:
        print(f"Chunked facts: {len(chunked)} ({len(chunked) / len(all_facts) * 100:.1f}%)")
    print(f"Unchunked facts: {len(all_facts) - len(chunked)}")

    # Section coverage
    sections = [f.get("Mục") for f in all_facts if "Mục" in f]
    section_counts = {}
    for section in sections:
        section_counts[section] = section_counts.get(section, 0) + 1

    print("\nSection coverage:")
    for section, count in sorted(section_counts.items(), key=lambda x: -x[1]):
        print(f"  {section}: {count}")
    print(f"{'=' * 60}\n")

    return all_facts
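

# How these facts feed the downstream retriever is outside this module, but as
# a hedged sketch of the intended consumption (the helper below is an
# illustrative assumption, not a confirmed API of this repo): each fact can
# become one hyperedge whose nodes are its key-value pairs, with the
# "_"-prefixed bookkeeping fields excluded from retrieval.
def _facts_to_hyperedges(facts: List[Dict[str, Any]]) -> List[List[str]]:
    """Illustrative only: one hyperedge per fact, one node per key-value pair."""
    hyperedges = []
    for fact in facts:
        nodes = [
            f"{key}: {value}"
            for key, value in fact.items()
            if not key.startswith("_")  # drop _chunk_id / _is_chunked metadata
        ]
        if nodes:
            hyperedges.append(nodes)
    return hyperedges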
if __name__ == "__main__":
import sys
# Allow optional arguments
data_dir = sys.argv[1] if len(sys.argv) > 1 else "data"
output_file = sys.argv[2] if len(sys.argv) > 2 else "plant_facts.json"
facts = build_all_plant_facts(data_dir, output_file)
print(f"✅ Done! Generated {len(facts)} facts")
print(f"📄 Saved to {output_file}")