|
|
""" |
|
|
JSON-LD Data Loader |
|
|
Loads and processes plant ontology data from JSON-LD files |
|
|
""" |
|
|
import json |
|
|
from pathlib import Path |
|
|
from typing import Dict, List, Optional, Any |
|
|
from functools import lru_cache |
|
|
|
|
|
|
|
|
class PlantDataLoader:
    """Loader for plant ontology JSON-LD data.

    On construction the loader scans ``data_dir`` for files matching
    ``ontology_node_*.jsonld`` and builds two in-memory indexes:

    - lookup key -> file name, where the keys are the class name derived
      from the scientific name, the Vietnamese name, and the lowercase form
      of both;
    - class name -> Vietnamese name.

    Only file names are cached; the plant data itself is re-read from disk
    on every lookup.
    """

    def __init__(self, data_dir: str = "data"):
        """
        Initialize data loader

        Args:
            data_dir: Directory containing JSON-LD files
        """
        self.data_dir = Path(data_dir)
        # lookup key (class name / Vietnamese name / lowercase alias) -> file name
        self._class_to_file_cache: Dict[str, str] = {}
        # class name (or the plant name itself when no class name exists) -> Vietnamese name
        self._class_to_name_cache: Dict[str, str] = {}

        self._cv_class_mapping = self._load_cv_class_mapping()

        self._build_index()

    def _load_cv_class_mapping(self) -> Dict[str, str]:
        """Load CV model class to Vietnamese name mapping from JSON

        The file ``cv_class_to_vietnamese.json`` is resolved relative to the
        current working directory. The mapping is optional: a missing,
        unreadable or malformed file yields an empty mapping.

        Returns:
            Mapping of CV class name to Vietnamese name, or ``{}``
        """
        mapping_file = Path("cv_class_to_vietnamese.json")
        try:
            if not mapping_file.exists():
                return {}
            with open(mapping_file, 'r', encoding='utf-8') as f:
                mapping = json.load(f)
        except Exception as e:
            # Deliberately best-effort: the loader still works without the mapping.
            print(f"Warning: Could not load CV mapping: {e}")
            return {}

        if not isinstance(mapping, dict):
            # A JSON list/scalar cannot be used for key lookups later on.
            print(
                "Warning: Could not load CV mapping: "
                f"expected a JSON object, got {type(mapping).__name__}"
            )
            return {}

        print(f"Loaded CV class mapping: {len(mapping)} classes")
        return mapping

    def _build_index(self):
        """Build index of all plants for fast lookup by multiple keys

        Files that fail to load or index are reported and skipped.
        """
        print(f"Building plant data index from {self.data_dir}...")

        # Sorted so that, when two files produce the same key, the file that
        # wins does not depend on filesystem enumeration order.
        jsonld_files = sorted(self.data_dir.glob("ontology_node_*.jsonld"))

        for jsonld_file in jsonld_files:
            try:
                plant_data = self._load_jsonld_file(jsonld_file)
                if not plant_data:
                    continue

                plant_name = plant_data.get("ten", "")
                scientific_name = plant_data.get("ten_khoa_hoc", "")
                # Non-string values (null, numbers, lists) cannot serve as
                # lookup keys; treat them as missing instead of aborting
                # the indexing of this file halfway through.
                if not isinstance(plant_name, str):
                    plant_name = ""
                if not isinstance(scientific_name, str):
                    scientific_name = ""

                class_name = self._scientific_to_class(scientific_name)
                file_name = jsonld_file.name

                if class_name:
                    self._class_to_file_cache[class_name] = file_name
                    self._class_to_name_cache[class_name] = plant_name

                if plant_name:
                    self._class_to_file_cache[plant_name] = file_name
                    # When a class name exists, the plant name was registered
                    # as a value just above, so the scan over the values is
                    # only needed for plants without a class name.
                    if (not class_name
                            and plant_name not in self._class_to_name_cache.values()):
                        self._class_to_name_cache[plant_name] = plant_name

                # Lowercase aliases for case-insensitive lookup
                if class_name:
                    self._class_to_file_cache[class_name.lower()] = file_name
                if plant_name:
                    self._class_to_file_cache[plant_name.lower()] = file_name

            except Exception as e:
                print(f"Warning: Failed to index {jsonld_file.name}: {e}")

        print(f"Indexed {len(set(self._class_to_file_cache.values()))} plants")

    @staticmethod
    def _scientific_to_class(scientific_name: str) -> str:
        """
        Convert scientific name to class name

        Handles corrupt data with extra spaces
        e.g., "Centella asiati ca (L.) Urb." -> "Centella_asiatica"

        Parenthesised groups are removed, the first token is taken as the
        genus, and every following token up to (not including) the first one
        starting with an uppercase letter is concatenated into the species.

        Args:
            scientific_name: Scientific name string

        Returns:
            Class name in format "Genus_species"; just the genus when no
            species token is found; "" for empty input
        """
        if not scientific_name:
            return ""

        import re

        # Drop parenthesised parts such as "(L.)"
        without_parens = re.sub(r'\([^)]*\)', '', scientific_name)

        # split() already collapses runs of whitespace
        parts = without_parens.split()
        if not parts:
            return ""

        genus = parts[0]

        species_parts = []
        for part in parts[1:]:
            # A capitalised token ends the species epithet
            if part[0].isupper():
                break
            species_parts.append(part)

        if species_parts:
            return f"{genus}_{''.join(species_parts)}"

        return genus

    def _load_jsonld_file(self, file_path: Path) -> Optional[Dict]:
        """
        Load and extract plant data from JSON-LD file

        Merges ALL nodes from @graph into a single plant dictionary:
        - the node typed "Plant" contributes its keys at the top level
        - every other typed node (description, distribution, uses, etc.)
          is stored under its "@type" value, minus the "@type" key itself
        - untyped nodes, non-dict entries and sections whose values are all
          None are ignored

        A document without "@graph" is returned as-is when its own "@type"
        is "Plant".

        Args:
            file_path: Path to JSON-LD file

        Returns:
            Complete plant data dictionary with all sections merged, or None
            when the file cannot be read/parsed or holds no plant data
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if "@graph" in data:
                plant_data = {}

                for node in data["@graph"]:
                    if not isinstance(node, dict):
                        continue

                    node_type = node.get("@type")

                    if node_type == "Plant":
                        plant_data.update(node)
                    elif node_type:
                        section_data = {k: v for k, v in node.items() if k != "@type"}
                        # any() is False for an empty dict, so this also
                        # rejects sections with no keys at all
                        if any(v is not None for v in section_data.values()):
                            plant_data[node_type] = section_data

                return plant_data if plant_data else None

            if data.get("@type") == "Plant":
                return data

            return None
        except Exception as e:
            # Best-effort: any read, parse or shape problem is reported and
            # turned into "no data" for this file.
            print(f"Error loading {file_path}: {e}")
            return None

    def get_plant_by_class(self, class_name: str) -> Optional[Dict]:
        """
        Get full plant data by class name

        Supports:
        - CV model class names (e.g., "Centella_asiatica") via JSON mapping
        - Vietnamese names (e.g., "Rau má") via cache
        - Class names derived from scientific names via cache

        The CV mapping is followed first (cycles are detected, so a mapping
        that points back at itself cannot recurse forever). The resolved name
        is looked up in the index, then the original name, each also in
        lowercase form to use the lowercase aliases stored in the index.

        Args:
            class_name: e.g., "Centella_asiatica" or "Curculigo_orchioides" or "Sâm cau"

        Returns:
            Full plant data dictionary, or None when nothing matches
        """
        resolved = class_name
        seen = set()
        while (isinstance(resolved, str)
               and resolved not in seen
               and resolved in self._cv_class_mapping):
            seen.add(resolved)
            resolved = self._cv_class_mapping[resolved]

        candidates = []
        for key in (resolved, class_name):
            if isinstance(key, str):
                candidates.append(key)
                candidates.append(key.lower())

        for key in candidates:
            file_name = self._class_to_file_cache.get(key)
            if file_name is not None:
                return self._load_jsonld_file(self.data_dir / file_name)

        return None

    def get_plant_by_name(self, vietnamese_name: str) -> Optional[Dict]:
        """
        Get plant data by Vietnamese name

        Delegates to ``get_plant_by_class``, since Vietnamese names are
        stored in the same index as class names.

        Args:
            vietnamese_name: e.g., "Sâm cau"

        Returns:
            Full plant data dictionary, or None when nothing matches
        """
        return self.get_plant_by_class(vietnamese_name)

    def get_all_plant_names(self) -> List[str]:
        """Get list of all Vietnamese plant names (unique, empty names omitted)"""
        return list({name for name in self._class_to_name_cache.values() if name})

    def get_all_class_names(self) -> List[str]:
        """Get list of all class names

        Returns every index key that is not a registered Vietnamese name.
        Note that the lowercase aliases stored in the index are included.
        """
        # Build the exclusion set once instead of scanning the values per key
        plant_names = set(self._class_to_name_cache.values())
        return [key for key in self._class_to_file_cache if key not in plant_names]

    def count_plants(self) -> int:
        """Get total number of plants (distinct indexed files)"""
        return len(set(self._class_to_file_cache.values()))
|
|
|
|
|
|
|
|
@lru_cache(maxsize=128)
def get_plant_data_loader(data_dir: str = "data") -> PlantDataLoader:
    """Return a memoised ``PlantDataLoader`` for ``data_dir``.

    The result is cached by ``lru_cache``, so repeated calls with the same
    argument reuse the loader (and its index) instead of building a new one.

    Args:
        data_dir: Directory containing JSON-LD files

    Returns:
        The cached loader instance for that directory
    """
    shared_loader = PlantDataLoader(data_dir)
    return shared_loader
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: index the default data directory, print a short
    # summary, then load one plant by its class name.
    demo_loader = PlantDataLoader()

    plant_total = demo_loader.count_plants()
    print(f"\nTotal plants: {plant_total}")

    first_names = demo_loader.get_all_plant_names()[:5]
    print(f"Sample plant names: {first_names}")

    sample = demo_loader.get_plant_by_class("Curculigo_orchioides")
    if sample:
        vietnamese = sample.get('ten')
        scientific = sample.get('ten_khoa_hoc')
        print(f"\nLoaded: {vietnamese} ({scientific})")

        section_keys = list(sample.keys())
        print(f"Sections: {section_keys}")
|
|
|