# Source: Hugging Face Space evgueni-p/fbmc-chronos2
# Space status at capture: Sleeping
# File size: 13,583 bytes
# Revision: f4be780

#!/usr/bin/env python3
"""
Feature Availability Module
Categorizes 2,514 features by their availability windows for forecasting.

Purpose: Prevent data leakage by clearly defining what features are available
         at run time for different forecast horizons.

Categories:
1. Full-horizon D+14 (always known): temporal, weather, CNEC outages, LTA
2. Partial D+1 only (masked D+2-D+14): load forecasts
3. Historical only (not available): prices, generation, demand, lags, etc.
"""

from typing import Dict, List, Tuple, Set
import pandas as pd
import numpy as np
from datetime import datetime, timedelta


class FeatureAvailability:
    """
    Defines availability windows for all features in the dataset.

    Availability Horizons:
    - D+14: Available for full 14-day forecast (temporal, weather, outages, LTA)
    - D+1: Available for day-ahead only (load forecasts)
    - D+0: Current value only, forward-filled (LTA)
    - Historical: Not available for future (prices, generation, demand, lags)
    """

    # Feature categories with their availability windows
    AVAILABILITY_WINDOWS = {
        # FULL HORIZON - D+14 (336 hours)
        'temporal': {
            'horizon_hours': float('inf'),  # Always computable
            'description': 'Time-based features (hour, day, month, weekday, etc.)',
            'patterns': ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend'],
            'suffixes': ['_sin', '_cos'],
            'expected_count': 12,
        },
        'weather': {
            'horizon_hours': 336,  # D+14 weather forecasts
            'description': 'Weather forecasts (temp, wind, solar, cloud, pressure)',
            'prefixes': ['temp_', 'wind_', 'wind10m_', 'wind100m_', 'winddir_', 'solar_', 'cloud_', 'pressure_'],
            'expected_count': 375,  # Approximate (52 grid points × ~7 variables)
        },
        'cnec_outages': {
            'horizon_hours': 336,  # D+14+ planned transmission outages
            'description': 'Planned CNEC transmission outages (published weeks ahead)',
            'prefixes': ['outage_cnec_'],
            'expected_count': 176,
        },
        'lta': {
            'horizon_hours': 0,  # D+0 only (current value)
            'description': 'Long-term allocations (forward-filled from D+0)',
            'prefixes': ['lta_'],
            'expected_count': 40,
            'forward_fill': True,  # Special handling: forward-fill current value
        },

        # PARTIAL HORIZON - D+1 only (24 hours)
        'load_forecast': {
            'horizon_hours': 24,  # D+1 only, masked D+2-D+14
            'description': 'Day-ahead load forecasts (published D-1)',
            'prefixes': ['load_forecast_'],
            'expected_count': 12,
            'requires_masking': True,  # Mask hours 25-336
        },

        # HISTORICAL ONLY - Not available for forecasting
        'prices': {
            'horizon_hours': -1,  # Historical only
            'description': 'Day-ahead electricity prices (determined D-1)',
            'prefixes': ['price_'],
            'expected_count': 24,
        },
        'generation': {
            'horizon_hours': -1,
            'description': 'Actual generation by fuel type',
            'prefixes': ['gen_'],
            'expected_count': 183,  # 12 zones × ~15 fuel types
        },
        'demand': {
            'horizon_hours': -1,
            'description': 'Actual electricity demand',
            'prefixes': ['demand_'],
            'expected_count': 24,  # 12 zones + aggregates
        },
        'border_lags': {
            'horizon_hours': -1,
            'description': 'Lagged cross-border flows',
            'patterns': ['_lag_', '_L', 'border_'],
            'expected_count': 264,  # 38 borders × 7 lags (1h, 3h, 6h, 12h, 24h, 168h, 720h)
        },
        'cnec_flows': {
            'horizon_hours': -1,
            'description': 'Historical CNEC flows and constraints',
            'prefixes': ['cnec_'],
            'patterns': ['_flow', '_binding', '_margin', '_ram'],
            'expected_count': 1000,  # Tier-1 CNECs with multiple metrics
        },
        'netpos': {
            'horizon_hours': -1,
            'description': 'Historical net positions',
            'prefixes': ['netpos_'],
            'expected_count': 48,  # 12 zones × 4 metrics
        },
        'system_agg': {
            'horizon_hours': -1,
            'description': 'System-level aggregates',
            'prefixes': ['total_', 'avg_', 'max', 'min', 'std_', 'mean_', 'sum_'],
            'expected_count': 353,  # Various aggregations
        },
        'pumped_storage': {
            'horizon_hours': -1,
            'description': 'Pumped hydro storage generation',
            'prefixes': ['pumped_'],
            'expected_count': 7,  # Countries with pumped storage
        },
        'hydro_storage': {
            'horizon_hours': -1,
            'description': 'Hydro reservoir levels (weekly data)',
            'prefixes': ['hydro_storage_'],
            'expected_count': 7,
        },
    }

    @classmethod
    def categorize_features(cls, columns: List[str]) -> Dict[str, List[str]]:
        """
        Categorize all features by their availability windows.

        Args:
            columns: All column names from dataset

        Returns:
            Dictionary with categories:
            - full_horizon_d14: Available for full 14-day forecast
            - partial_d1: Available D+1 only (requires masking)
            - historical: Not available for forecasting
            - uncategorized: Features that don't match any pattern
        """
        full_horizon_d14 = []
        partial_d1 = []
        historical = []
        uncategorized = []

        for col in columns:
            # Skip metadata columns
            if col == 'timestamp' or col.startswith('target_border_'):
                continue

            categorized = False

            # Check each category
            for category, config in cls.AVAILABILITY_WINDOWS.items():
                if cls._matches_category(col, config):
                    # Assign to appropriate list based on horizon
                    if config['horizon_hours'] >= 336 or config['horizon_hours'] == float('inf'):
                        full_horizon_d14.append(col)
                    elif config['horizon_hours'] == 24:
                        partial_d1.append(col)
                    elif config['horizon_hours'] < 0:
                        historical.append(col)
                    elif config['horizon_hours'] == 0:
                        # LTA: forward-filled, treat as full horizon
                        full_horizon_d14.append(col)

                    categorized = True
                    break

            if not categorized:
                uncategorized.append(col)

        return {
            'full_horizon_d14': full_horizon_d14,
            'partial_d1': partial_d1,
            'historical': historical,
            'uncategorized': uncategorized,
        }

    @classmethod
    def _matches_category(cls, col: str, config: Dict) -> bool:
        """Check if column matches category patterns."""
        # Check exact matches
        if 'patterns' in config:
            if col in config['patterns']:
                return True
            # Check for pattern substring matches
            if any(pattern in col for pattern in config['patterns']):
                return True

        # Check prefixes
        if 'prefixes' in config:
            if any(col.startswith(prefix) for prefix in config['prefixes']):
                return True

        # Check suffixes
        if 'suffixes' in config:
            if any(col.endswith(suffix) for suffix in config['suffixes']):
                return True

        return False

    @classmethod
    def create_availability_mask(
        cls,
        feature_name: str,
        forecast_horizon_hours: int = 336
    ) -> np.ndarray:
        """
        Create binary availability mask for a feature across forecast horizon.

        Args:
            feature_name: Name of the feature
            forecast_horizon_hours: Length of forecast (default 336 = 14 days)

        Returns:
            Binary mask: 1 = available, 0 = masked/unavailable
        """
        # Determine category
        for category, config in cls.AVAILABILITY_WINDOWS.items():
            if cls._matches_category(feature_name, config):
                horizon = config['horizon_hours']

                # Full horizon or infinite (temporal)
                if horizon >= forecast_horizon_hours or horizon == float('inf'):
                    return np.ones(forecast_horizon_hours, dtype=np.float32)

                # Partial horizon (e.g., D+1 = 24 hours)
                elif horizon > 0:
                    mask = np.zeros(forecast_horizon_hours, dtype=np.float32)
                    mask[:int(horizon)] = 1.0
                    return mask

                # Forward-fill (LTA: D+0)
                elif horizon == 0:
                    return np.ones(forecast_horizon_hours, dtype=np.float32)

                # Historical only
                else:
                    return np.zeros(forecast_horizon_hours, dtype=np.float32)

        # Unknown feature: assume historical (conservative)
        return np.zeros(forecast_horizon_hours, dtype=np.float32)

    @classmethod
    def validate_categorization(
        cls,
        categories: Dict[str, List[str]],
        verbose: bool = True
    ) -> Tuple[bool, List[str]]:
        """
        Validate feature categorization against expected counts.

        Args:
            categories: Output from categorize_features()
            verbose: Print validation details

        Returns:
            (is_valid, warnings)
        """
        warnings = []

        # Total feature count (excl. timestamp + 38 targets)
        total_features = sum(len(v) for v in categories.values())
        expected_total = 2514  # 2,553 columns - 1 timestamp - 38 targets

        if total_features != expected_total:
            warnings.append(
                f"Feature count mismatch: {total_features} vs expected {expected_total}"
            )

        # Check full-horizon D+14 features
        full_d14 = len(categories['full_horizon_d14'])
        # Expected: temporal (12) + weather (~375) + outages (176) + LTA (40) = ~603
        if full_d14 < 200 or full_d14 > 700:
            warnings.append(
                f"Full-horizon D+14 count unusual: {full_d14} (expected ~240-640)"
            )

        # Check partial D+1 features
        partial_d1 = len(categories['partial_d1'])
        if partial_d1 != 12:
            warnings.append(
                f"Partial D+1 count: {partial_d1} (expected 12 load forecasts)"
            )

        # Check uncategorized
        if categories['uncategorized']:
            warnings.append(
                f"Uncategorized features: {len(categories['uncategorized'])} "
                f"(first 5: {categories['uncategorized'][:5]})"
            )

        if verbose:
            print("="*60)
            print("FEATURE CATEGORIZATION VALIDATION")
            print("="*60)
            print(f"Full-horizon D+14:  {len(categories['full_horizon_d14']):4d} features")
            print(f"Partial D+1:        {len(categories['partial_d1']):4d} features")
            print(f"Historical only:    {len(categories['historical']):4d} features")
            print(f"Uncategorized:      {len(categories['uncategorized']):4d} features")
            print(f"Total:              {total_features:4d} features")

            if warnings:
                print("\n[!] WARNINGS:")
                for w in warnings:
                    print(f"    - {w}")
            else:
                print("\n[OK] Validation passed!")
            print("="*60)

        return len(warnings) == 0, warnings

    @classmethod
    def get_category_summary(cls, categories: Dict[str, List[str]]) -> pd.DataFrame:
        """
        Generate summary table of feature categorization.

        Returns:
            DataFrame with category, count, availability, and sample features
        """
        summary = []

        # Full-horizon D+14
        summary.append({
            'Category': 'Full-horizon D+14',
            'Count': len(categories['full_horizon_d14']),
            'Availability': 'D+1 to D+14 (336 hours)',
            'Masking': 'None',
            'Sample Features': ', '.join(categories['full_horizon_d14'][:3]),
        })

        # Partial D+1
        summary.append({
            'Category': 'Partial D+1',
            'Count': len(categories['partial_d1']),
            'Availability': 'D+1 only (24 hours)',
            'Masking': 'Mask D+2 to D+14',
            'Sample Features': ', '.join(categories['partial_d1'][:3]),
        })

        # Historical
        summary.append({
            'Category': 'Historical only',
            'Count': len(categories['historical']),
            'Availability': 'Not available for forecasting',
            'Masking': 'All zeros',
            'Sample Features': ', '.join(categories['historical'][:3]),
        })

        # Uncategorized
        if categories['uncategorized']:
            summary.append({
                'Category': 'Uncategorized',
                'Count': len(categories['uncategorized']),
                'Availability': 'Unknown (conservative: historical)',
                'Masking': 'All zeros (conservative)',
                'Sample Features': ', '.join(categories['uncategorized'][:3]),
            })

        return pd.DataFrame(summary)