Spaces:

hugging-science
/

abmelt-benchmark

Build error

File size: 11,583 Bytes

8ef403e

#!/usr/bin/env python3
"""

Denormalize Tagg, Tm, and Tmon values from normalized holdout sets.



This script:

1. Loads reference file (tm_holdout_4.csv) to get merck_id and name mapping

2. Loads normalized values from tagg_holdout_normalized.csv, tm_holdout_normalized.csv, and tmon_holdout_normalized.csv

3. Filters to only include antibodies present in reference file

4. Denormalizes the normalized values using utils.py

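5. Saves denormalized values to separate CSV files with a _denormalized suffix

6. Compares actual vs. denormalized Tm values and saves the comparison to tm_comparison_results.csv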



Note: The tmon file uses column 'tmonset' which represents T_mon_onset (tmon).

"""

import sys
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import pearsonr

# Add src to path for imports
sys.path.append(str(Path(__file__).parent / "src"))

from utils import renormalize, DEFAULT_STATS

def denormalize_temperature_type(normalized_df, reference_df, temp_type, column_name, output_file):
    """
    Denormalize one temperature type and write the result to a CSV file.

    Rows of ``normalized_df`` are matched to ``reference_df`` by comparing
    ``normalized_df['name']`` with ``reference_df['merck_id']``; rows without
    a match are dropped. The matched values are passed to ``renormalize`` and
    saved together with the reference ``merck_id`` and ``name``.

    Args:
        normalized_df: DataFrame with a 'name' column (holding Merck IDs) and
            the normalized values in ``column_name``.
        reference_df: DataFrame with 'merck_id' and 'name' columns.
        temp_type: Temperature type ('tagg', 'tm', or 'tmon'). Passed to
            ``renormalize``, used as the key into ``DEFAULT_STATS`` and as the
            name of the value column in the output.
        column_name: Name of the value column in ``normalized_df``
            (e.g., 'tagg', 'tm', 'tmonset').
        output_file: Path of the CSV file to write.

    Returns:
        DataFrame with columns 'merck_id', 'name' and ``temp_type``, or None
        when a required column is missing or no antibody matches.
    """
    print(f"\n{'='*80}")
    print(f"Processing {temp_type.upper()}")
    print(f"{'='*80}")

    # Both the ID column and the value column are required. Report either one
    # being absent the same way instead of failing later with a KeyError.
    for required in ('name', column_name):
        if required not in normalized_df.columns:
            print(f"ERROR: '{required}' column not found!")
            print(f"Available columns: {list(normalized_df.columns)}")
            return None

    # The 'name' column in normalized_df contains Merck IDs.
    print("\nFiltering to antibodies present in reference file...")
    filtered_df = normalized_df[normalized_df['name'].isin(reference_df['merck_id'])].copy()
    print(f"  Found {len(filtered_df)} matching antibodies")

    if filtered_df.empty:
        print("ERROR: No matching antibodies found!")
        print(f"Reference antibodies (merck_id): {reference_df['merck_id'].tolist()}")
        print(f"Normalized antibodies (name): {normalized_df['name'].tolist()}")
        return None

    # Both frames carry a 'name' column, so pandas suffixes them:
    # 'name_x' is the Merck ID from normalized_df, 'name_y' the antibody name
    # from reference_df.
    merged_df = pd.merge(
        filtered_df[['name', column_name]],
        reference_df[['merck_id', 'name']],
        left_on='name',
        right_on='merck_id',
        how='inner'
    )

    print(f"\nDenormalizing normalized {temp_type.upper()} values...")
    normalized_values = merged_df[column_name].values
    # Coerce to an ndarray so the summary below can rely on .mean()/.std()
    # whatever array-like renormalize returns.
    denormalized_values = np.asarray(renormalize(normalized_values, temp_type=temp_type))

    # The output column is named after temp_type ('tmon' rather than 'tmonset').
    output_column = temp_type
    output_df = pd.DataFrame({
        'merck_id': merged_df['merck_id'],
        'name': merged_df['name_y'],
        output_column: denormalized_values
    })

    print("\nStatistics used:")
    print(f"  Mean: {DEFAULT_STATS[temp_type]['mean']:.2f}°C")
    print(f"  Std:  {DEFAULT_STATS[temp_type]['std']:.2f}°C")

    print(f"\n{'Merck ID':<15} {'Name':<20} {'Normalized':<15} {'Denormalized':<15}")
    print("-" * 65)

    # Pair values positionally: looking each row up again by merck_id is
    # quadratic and shows the first match's value for duplicated IDs.
    rows = zip(merged_df['merck_id'], merged_df['name_y'], normalized_values, denormalized_values)
    for merck_id, antibody_name, normalized_val, denormalized_val in rows:
        print(f"{merck_id:<15} {antibody_name:<20} {normalized_val:<15.4f} {denormalized_val:<15.2f}")

    print("\nSUMMARY STATISTICS")
    print(f"Mean Denormalized {temp_type.upper()}:     {denormalized_values.mean():.2f}°C")
    print(f"Std Denormalized {temp_type.upper()}:       {denormalized_values.std():.2f}°C")
    print(f"Min Denormalized {temp_type.upper()}:       {denormalized_values.min():.2f}°C")
    print(f"Max Denormalized {temp_type.upper()}:       {denormalized_values.max():.2f}°C")

    output_df.to_csv(output_file, index=False)
    print(f"\nDenormalized values saved to: {output_file}")

    return output_df

def compare_tm_values(actual_df, denormalized_df, normalized_df, data_dir):
    """
    Compare actual TM values with denormalized TM values.

    The three inputs are joined on the Merck ID, per-antibody errors are
    computed, summary metrics are printed, and the joined table is written to
    ``data_dir / "tm_comparison_results.csv"``.

    Args:
        actual_df: DataFrame with 'merck_id', 'name' and 'tm' columns holding
            the actual TM values (from tm_holdout_4.csv).
        denormalized_df: DataFrame with 'merck_id' and 'tm' columns holding
            the denormalized TM values.
        normalized_df: DataFrame with 'name' (holding Merck IDs) and 'tm'
            columns holding the normalized TM values.
        data_dir: Path to the directory the comparison CSV is written to.

    Returns:
        DataFrame with columns 'merck_id', 'antibody_name', 'actual_tm',
        'normalized_tm', 'denormalized_tm', 'error', 'abs_error' and
        'abs_error_percent', or None when no antibody is present in all
        three inputs.
    """
    print(f"\n{'='*80}")
    print("COMPARING ACTUAL vs DENORMALIZED TM VALUES")
    print(f"{'='*80}")

    # First join actual with normalized values; the Merck ID lives in
    # 'merck_id' on the left and in 'name' on the right, and the overlapping
    # 'name'/'tm' columns are told apart by the suffixes.
    temp_df = pd.merge(
        actual_df[['merck_id', 'name', 'tm']],
        normalized_df[['name', 'tm']],
        left_on='merck_id',
        right_on='name',
        how='inner',
        suffixes=('_actual', '_normalized')
    )
    temp_df = temp_df.rename(columns={
        'tm_actual': 'actual_tm',
        'tm_normalized': 'normalized_tm',
        'name_actual': 'antibody_name'
    })
    # The right-hand key column only repeats merck_id.
    temp_df = temp_df.drop(columns=['name_normalized'], errors='ignore')

    # Then attach the denormalized values.
    merged_df = pd.merge(
        temp_df,
        denormalized_df[['merck_id', 'tm']],
        on='merck_id',
        how='inner'
    )

    if merged_df.empty:
        print("ERROR: No matching antibodies found for comparison!")
        return None

    merged_df = merged_df.rename(columns={'tm': 'denormalized_tm'})

    merged_df['error'] = merged_df['denormalized_tm'] - merged_df['actual_tm']
    merged_df['abs_error'] = merged_df['error'].abs()
    # Divide by |actual| so the percentage stays non-negative for any sign.
    merged_df['abs_error_percent'] = (merged_df['abs_error'] / merged_df['actual_tm'].abs()) * 100

    print("\nStatistics used for denormalization:")
    print(f"  Mean: {DEFAULT_STATS['tm']['mean']:.2f}°C")
    print(f"  Std:  {DEFAULT_STATS['tm']['std']:.2f}°C")

    print(f"\n{'Antibody':<20} {'Merck ID':<12} {'Actual TM':<12} {'Normalized':<12} {'Denormalized':<15} {'Error':<12} {'Abs Error':<12} {'Error %':<10}")
    print("-" * 110)

    for _, row in merged_df.iterrows():
        antibody_name = row['antibody_name']
        print(f"{antibody_name:<20} "
              f"{row['merck_id']:<12} "
              f"{row['actual_tm']:<12.2f} "
              f"{row['normalized_tm']:<12.4f} "
              f"{row['denormalized_tm']:<15.2f} "
              f"{row['error']:<12.2f} "
              f"{row['abs_error']:<12.2f} "
              f"{row['abs_error_percent']:<10.2f}")

    print(f"\n{'='*80}")
    print("SUMMARY STATISTICS")
    print(f"{'='*80}")
    print(f"\nMean Absolute Error (MAE):     {merged_df['abs_error'].mean():.2f}°C")
    print(f"Root Mean Squared Error (RMSE): {np.sqrt((merged_df['error']**2).mean()):.2f}°C")
    print(f"Mean Absolute Percent Error:    {merged_df['abs_error_percent'].mean():.2f}%")
    print(f"Max Absolute Error:             {merged_df['abs_error'].max():.2f}°C")
    print(f"Min Absolute Error:             {merged_df['abs_error'].min():.2f}°C")

    actual = merged_df['actual_tm']
    denormalized = merged_df['denormalized_tm']

    # pearsonr raises ValueError for fewer than two samples, so only report
    # correlations when they are defined.
    if len(merged_df) >= 2:
        pearson_corr, pearson_pvalue = pearsonr(actual, denormalized)
        print(f"\nPearson Correlation (r):         {pearson_corr:.4f}")
        print(f"Pearson Correlation p-value:      {pearson_pvalue:.4f}")

        # Also show correlation using np.corrcoef for consistency
        correlation = np.corrcoef(actual, denormalized)[0, 1]
        print(f"Correlation (np.corrcoef):        {correlation:.4f}")
    else:
        print("\nPearson Correlation (r):         n/a (fewer than 2 antibodies)")

    # R-squared is undefined when the actual values have no variance.
    ss_res = np.sum((actual - denormalized)**2)
    ss_tot = np.sum((actual - actual.mean())**2)
    if ss_tot > 0:
        r_squared = 1 - (ss_res / ss_tot)
        print(f"R-squared (R²):                  {r_squared:.4f}")
    else:
        print("R-squared (R²):                  n/a (actual TM values have zero variance)")

    output_file = data_dir / "tm_comparison_results.csv"
    merged_df.to_csv(output_file, index=False)
    print(f"\nComparison results saved to: {output_file}")

    return merged_df

def main():
    """
    Denormalize every holdout temperature file and compare TM to the reference.

    Reads the reference file and the three normalized CSV files from
    ``data/abmelt`` next to this script, writes one ``*_denormalized.csv`` per
    temperature type that could be processed, runs the actual-vs-denormalized
    TM comparison when TM succeeded, and prints a summary.
    """
    data_dir = Path(__file__).parent / "data" / "abmelt"
    reference_file = data_dir / "tm_holdout_4.csv"

    normalized_files = {
        'tagg': data_dir / "tagg_holdout_normalized.csv",
        'tm': data_dir / "tm_holdout_normalized.csv",
        'tmon': data_dir / "tmon_holdout_normalized.csv",
    }

    output_files = {
        'tagg': data_dir / "tagg_holdout_denormalized.csv",
        'tm': data_dir / "tm_holdout_denormalized.csv",
        'tmon': data_dir / "tmon_holdout_denormalized.csv",
    }

    column_names = {
        'tagg': 'tagg',
        'tm': 'tm',
        'tmon': 'tmonset',  # Note: column is named 'tmonset' not 'tmon'
    }

    # Load reference file to get merck_id and name mapping
    print("Loading reference file (tm_holdout_4.csv)...")
    reference_df = pd.read_csv(reference_file)
    print(f"  Found {len(reference_df)} antibodies in reference file")
    # str() keeps the join working when the IDs are parsed as numbers.
    print(f"  Antibodies: {', '.join(map(str, reference_df['merck_id']))}")

    results = {}
    normalized_frames = {}  # kept so the TM comparison need not re-read the CSV
    for temp_type in ['tagg', 'tm', 'tmon']:
        normalized_file = normalized_files[temp_type]

        print(f"\n{'='*80}")
        print(f"Loading normalized {temp_type.upper()} values from {normalized_file.name}...")
        normalized_df = pd.read_csv(normalized_file)
        normalized_frames[temp_type] = normalized_df
        print(f"  Found {len(normalized_df)} antibodies with normalized {temp_type.upper()} values")

        result_df = denormalize_temperature_type(
            normalized_df,
            reference_df,
            temp_type,
            column_names[temp_type],
            output_files[temp_type],
        )

        # None signals that this temperature type could not be processed.
        if result_df is not None:
            results[temp_type] = result_df

    # Compare actual vs denormalized TM values if TM was processed
    if 'tm' in results:
        compare_tm_values(reference_df, results['tm'], normalized_frames['tm'], data_dir)

    print(f"\n{'='*80}")
    print("SUMMARY")
    print(f"{'='*80}")
    print(f"\nSuccessfully denormalized {len(results)} temperature types:")
    for temp_type in results:
        print(f"  - {temp_type.upper()}: {output_files[temp_type].name}")

    print(f"\n{'='*80}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()