Spaces:

hugging-science
/

abmelt-benchmark

Build error

File size: 18,890 Bytes

8ef403e

#!/usr/bin/env python3

"""

Test script for AbMelt structure generation functionality.

Tests both sequence-based structure generation and PDB-based processing.

"""

import argparse
import logging
import os
import shutil
import sys
import tempfile
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Add src to path for imports
# The "src" directory next to this script is appended (not prepended), so any
# module of the same name found earlier on sys.path takes precedence.
sys.path.append(str(Path(__file__).parent / "src"))

# The project module and Biopython are imported together so that a missing
# dependency produces one readable message and exit status 1 instead of a
# traceback.
try:
    from structure_prep import (
        prepare_structure, 
        generate_structure_from_sequences,
        prepare_pdb_for_analysis,
        validate_structure,
        get_chain_sequences
    )
    from Bio.PDB import PDBParser
    from Bio.SeqUtils import seq1
except ImportError as e:
    print(f"Failed to import required modules: {e}")
    print("Please ensure you're in the correct environment with required dependencies installed.")
    sys.exit(1)

# Logging setup: INFO and above go to stdout with a timestamped format.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=[_stdout_handler])

# Module-level logger used by everything below.
logger = logging.getLogger(__name__)


class StructureGenerationTester:
    """Test class for structure generation functionality.

    Every ``test_*`` method returns a ``{result_name: passed}`` dictionary.
    A failed check is logged and stored as ``False`` rather than raised, so
    one broken case does not prevent the remaining cases from running.
    ``run_all_tests`` merges the dictionaries and ``print_summary`` reports
    on the merged result.
    """

    def __init__(self, test_dir: Optional[str] = None):
        """Initialize tester with optional test directory.

        Args:
            test_dir: Directory that generated files are written to.  When
                omitted (or empty) a new temporary directory with the prefix
                ``abmelt_test_`` is created instead.
        """
        self.test_dir = Path(test_dir) if test_dir else Path(tempfile.mkdtemp(prefix="abmelt_test_"))
        self.test_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Test directory: {self.test_dir}")

        # Test antibody sequences (example sequences)
        self.test_sequences = {
            "alemtuzumab": {
                "heavy": "QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYWMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATITADESTSTTAYMELSSLRSEDTAVYYCARGGYSSGYYFDYWGQGTLVTVSS",
                "light": "DIQMTQSPSSLSASVGDRVTITCRASQDISNYLNWFQQKPGKAPKLLIYYATSLADGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQGNTFPWTFGQGTKVEIKR",
            },
            "nivolumab": {
                "heavy": "QVQLVQSGAEVKKPGSSVKVSCKASGYTFTSYWINWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATITADESTSTTAYMELSSLRSEDTAVYYCARGGYSSGYYFDYWGQGTLVTVSS",
                "light": "DIQMTQSPSSLSASVGDRVTITCRASQDISNYLNWFQQKPGKAPKLLIYYATSLADGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQGNTFPWTFGQGTKVEIKR",
            },
        }

        # Configuration dictionary handed to prepare_structure()
        self.config = {
            "paths": {
                "temp_dir": str(self.test_dir),
                "output_dir": str(self.test_dir / "output"),
                "log_dir": str(self.test_dir / "logs"),
            },
            "structure": {
                "validate_structure": True,
                "extract_sequences": True,
                "create_work_dir": True,
            },
        }

        # Create output directories
        for path in self.config["paths"].values():
            Path(path).mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _banner(title: str) -> None:
        """Log ``title`` framed by two 60-character rules."""
        rule = "=" * 60
        logger.info(rule)
        logger.info(title)
        logger.info(rule)

    @staticmethod
    def _record(results: Dict[str, bool], key: str, passed, ok_msg: str,
                fail_msg: str, fail_log=None) -> None:
        """Log the outcome of one check and store it in ``results[key]``.

        Args:
            results: Dictionary that receives the outcome.
            key: Name the outcome is stored under.
            passed: Truthy when the check succeeded.
            ok_msg: Message logged at INFO level on success.
            fail_msg: Message logged on failure.
            fail_log: Logging callable used on failure; ``logger.error`` when
                omitted.  Resolved at call time, not in the signature, so the
                class can be defined before ``logger`` exists.
        """
        if passed:
            logger.info(ok_msg)
        else:
            (fail_log or logger.error)(fail_msg)
        results[key] = bool(passed)

    def _expect_prepare_error(self, antibody: dict, expected_exc, missing_label: str, label: str) -> bool:
        """Return True only if ``prepare_structure`` raises ``expected_exc``.

        Args:
            antibody: Antibody description passed to ``prepare_structure``.
            expected_exc: Exception class that counts as the correct failure.
            missing_label: Wording used when no exception is raised at all.
            label: Wording used when an exception (expected or not) is raised.
        """
        try:
            prepare_structure(antibody, self.config)
        except expected_exc as e:
            logger.info(f"✓ Correctly raised error for {label}: {e}")
            return True
        except Exception as e:
            logger.error(f"✗ Unexpected error for {label}: {e}")
            return False
        logger.error(f"✗ Should have raised error for {missing_label}")
        return False

    def test_sequence_based_generation(self) -> Dict[str, bool]:
        """Test structure generation from sequences.

        For every fixture antibody this calls
        ``generate_structure_from_sequences`` directly and then goes through
        ``prepare_structure`` with ``type == "sequences"``.

        Returns:
            Outcomes keyed ``<name>_direct`` and ``<name>_prepare``; an
            exception anywhere in one antibody's run is stored as
            ``<name>_error = False``.
        """
        self._banner("TESTING SEQUENCE-BASED STRUCTURE GENERATION")

        results: Dict[str, bool] = {}

        for antibody_name, sequences in self.test_sequences.items():
            logger.info(f"\nTesting {antibody_name}...")

            try:
                # Test direct sequence generation
                output_file = self.test_dir / f"{antibody_name}_direct.pdb"
                logger.info("Testing direct sequence generation...")

                generated_file = generate_structure_from_sequences(
                    heavy_chain=sequences["heavy"],
                    light_chain=sequences["light"],
                    output_file=str(output_file),
                )

                # Verify file was created
                self._record(
                    results,
                    f"{antibody_name}_direct",
                    Path(generated_file).exists(),
                    f"✓ Direct generation successful: {generated_file}",
                    f"✗ Direct generation failed: {generated_file}",
                )

                # Test through prepare_structure function
                logger.info("Testing through prepare_structure...")
                antibody = {
                    "name": antibody_name,
                    "heavy_chain": sequences["heavy"],
                    "light_chain": sequences["light"],
                    "type": "sequences",
                }

                structure_files = prepare_structure(antibody, self.config)

                # Verify structure files
                self._record(
                    results,
                    f"{antibody_name}_prepare",
                    self._verify_structure_files(structure_files, antibody_name),
                    f"✓ prepare_structure successful for {antibody_name}",
                    f"✗ prepare_structure failed for {antibody_name}",
                )

            except Exception as e:
                logger.error(f"✗ Error testing {antibody_name}: {e}")
                results[f"{antibody_name}_error"] = False

        return results

    def test_pdb_based_processing(self) -> Dict[str, bool]:
        """Test PDB-based structure processing.

        A PDB file is first generated for every fixture antibody and then fed
        to ``prepare_pdb_for_analysis`` and to ``prepare_structure`` with
        ``type == "pdb"``.

        Returns:
            Outcomes keyed ``<name>_pdb_analysis`` and ``<name>_pdb_prepare``.
            A fixture PDB that cannot be generated is stored as
            ``<name>_pdb_generation = False``; an exception while processing
            is stored as ``<name>_pdb_error = False``.
        """
        self._banner("TESTING PDB-BASED STRUCTURE PROCESSING")

        results: Dict[str, bool] = {}

        # First generate some test PDBs
        test_pdbs: Dict[str, str] = {}
        for antibody_name, sequences in self.test_sequences.items():
            try:
                pdb_file = self.test_dir / f"{antibody_name}_test.pdb"
                generate_structure_from_sequences(
                    heavy_chain=sequences["heavy"],
                    light_chain=sequences["light"],
                    output_file=str(pdb_file),
                )
                test_pdbs[antibody_name] = str(pdb_file)
                logger.info(f"Generated test PDB: {pdb_file}")
            except Exception as e:
                logger.error(f"Failed to generate test PDB for {antibody_name}: {e}")
                # Record the failure: otherwise this antibody silently drops
                # out of the run and the summary/exit code still look green.
                results[f"{antibody_name}_pdb_generation"] = False

        # Test PDB processing
        for antibody_name, pdb_file in test_pdbs.items():
            logger.info(f"\nTesting PDB processing for {antibody_name}...")

            try:
                # Test prepare_pdb_for_analysis
                logger.info("Testing prepare_pdb_for_analysis...")
                structure_files = prepare_pdb_for_analysis(
                    pdb_file=pdb_file,
                    output_dir=str(self.test_dir / "pdb_analysis"),
                )

                self._record(
                    results,
                    f"{antibody_name}_pdb_analysis",
                    self._verify_structure_files(structure_files, antibody_name),
                    f"✓ prepare_pdb_for_analysis successful for {antibody_name}",
                    f"✗ prepare_pdb_for_analysis failed for {antibody_name}",
                )

                # Test through prepare_structure with PDB type
                logger.info("Testing prepare_structure with PDB type...")
                antibody = {
                    "name": f"{antibody_name}_pdb",
                    "pdb_file": pdb_file,
                    "type": "pdb",
                }

                structure_files = prepare_structure(antibody, self.config)

                self._record(
                    results,
                    f"{antibody_name}_pdb_prepare",
                    self._verify_structure_files(structure_files, f"{antibody_name}_pdb"),
                    f"✓ prepare_structure (PDB) successful for {antibody_name}",
                    f"✗ prepare_structure (PDB) failed for {antibody_name}",
                )

            except Exception as e:
                logger.error(f"✗ Error processing PDB for {antibody_name}: {e}")
                results[f"{antibody_name}_pdb_error"] = False

        return results

    def test_structure_validation(self) -> Dict[str, bool]:
        """Test structure validation functionality.

        Runs ``validate_structure`` and ``get_chain_sequences`` on a freshly
        generated PDB for every fixture antibody, then checks that a
        hand-written two-atom, single-chain file is rejected.

        Returns:
            Outcomes keyed ``<name>_validation``, ``<name>_sequences`` and
            ``invalid_structure``; exceptions are stored as
            ``<name>_validation_error`` / ``invalid_structure_error``.
        """
        self._banner("TESTING STRUCTURE VALIDATION")

        results: Dict[str, bool] = {}

        # Test with valid structures
        for antibody_name, sequences in self.test_sequences.items():
            try:
                pdb_file = self.test_dir / f"{antibody_name}_validation.pdb"
                generate_structure_from_sequences(
                    heavy_chain=sequences["heavy"],
                    light_chain=sequences["light"],
                    output_file=str(pdb_file),
                )

                # Test validation
                self._record(
                    results,
                    f"{antibody_name}_validation",
                    validate_structure(str(pdb_file)),
                    f"✓ Structure validation passed for {antibody_name}",
                    f"⚠ Structure validation failed for {antibody_name}",
                    fail_log=logger.warning,
                )

                # Test sequence extraction.  Kept as an explicit branch so
                # that ``chains.keys()`` is only evaluated for a truthy result.
                chains = get_chain_sequences(str(pdb_file))
                if chains:
                    logger.info(f"✓ Chain sequences extracted for {antibody_name}: {list(chains.keys())}")
                    results[f"{antibody_name}_sequences"] = True
                else:
                    logger.error(f"✗ Failed to extract chain sequences for {antibody_name}")
                    results[f"{antibody_name}_sequences"] = False

            except Exception as e:
                logger.error(f"✗ Error in validation test for {antibody_name}: {e}")
                results[f"{antibody_name}_validation_error"] = False

        # Test with invalid file
        try:
            invalid_file = self.test_dir / "invalid.pdb"
            # Incomplete structure - only one chain
            invalid_file.write_text(
                "ATOM      1  N   ALA A   1      20.154  16.967  23.862  1.00 11.18           N\n"
                "ATOM      2  CA  ALA A   1      19.030  16.067  23.862  1.00 11.18           C\n",
                encoding="utf-8",
            )

            is_valid = validate_structure(str(invalid_file))
            self._record(
                results,
                "invalid_structure",
                not is_valid,
                "✓ Correctly identified invalid structure (single chain)",
                "⚠ Failed to identify invalid structure",
                fail_log=logger.warning,
            )

        except Exception as e:
            logger.error(f"✗ Error testing invalid structure: {e}")
            results["invalid_structure_error"] = False

        return results

    def test_error_handling(self) -> Dict[str, bool]:
        """Test error handling for various edge cases.

        Each case passes only when ``prepare_structure`` raises the listed
        exception type; raising nothing or raising another type is a failure.

        Returns:
            Outcomes keyed ``invalid_type`` (expects ``ValueError``),
            ``missing_light`` (expects ``KeyError``) and ``non_existent_pdb``
            (expects ``FileNotFoundError``).
        """
        self._banner("TESTING ERROR HANDLING")

        # (result key, antibody, expected exception,
        #  wording when nothing is raised, wording when something is raised)
        cases = [
            (
                "invalid_type",
                {"name": "test", "type": "invalid_type"},
                ValueError,
                "invalid antibody type",
                "invalid type",
            ),
            (
                "missing_light",
                {
                    "name": "test",
                    "heavy_chain": "QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYWMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATITADESTSTTAYMELSSLRSEDTAVYYCARGGYSSGYYFDYWGQGTLVTVSS",
                    "type": "sequences",
                    # Missing light_chain
                },
                KeyError,
                "missing light chain",
                "missing light chain",
            ),
            (
                "non_existent_pdb",
                {"name": "test", "pdb_file": "/non/existent/file.pdb", "type": "pdb"},
                FileNotFoundError,
                "non-existent PDB",
                "non-existent PDB",
            ),
        ]

        results: Dict[str, bool] = {}
        for key, antibody, expected_exc, missing_label, label in cases:
            results[key] = self._expect_prepare_error(antibody, expected_exc, missing_label, label)

        return results

    def _verify_structure_files(self, structure_files: Dict[str, str], antibody_name: str) -> bool:
        """Verify that structure files were created correctly.

        Args:
            structure_files: Mapping that must contain ``"pdb_file"`` and
                ``"work_dir"`` entries pointing at existing paths.
            antibody_name: Name used in the success log line only.

        Returns:
            True when both keys are present, both paths exist and the PDB
            file parses with at least two chains; False otherwise (the reason
            is logged).
        """
        # Check required keys
        for key in ("pdb_file", "work_dir"):
            if key not in structure_files:
                logger.error(f"Missing required key: {key}")
                return False

        # Check if files exist
        pdb_file = Path(structure_files["pdb_file"])
        work_dir = Path(structure_files["work_dir"])

        if not pdb_file.exists():
            logger.error(f"PDB file does not exist: {pdb_file}")
            return False

        if not work_dir.exists():
            logger.error(f"Work directory does not exist: {work_dir}")
            return False

        # Check if PDB file is valid
        try:
            parser = PDBParser(QUIET=True)
            structure = parser.get_structure("test", str(pdb_file))
            chains = list(structure.get_chains())
            if len(chains) < 2:
                logger.error(f"PDB file has insufficient chains: {len(chains)}")
                return False
        except Exception as e:
            logger.error(f"PDB file is not valid: {e}")
            return False

        logger.info(f"✓ Structure files verified for {antibody_name}")
        return True

    def run_all_tests(self) -> Dict[str, bool]:
        """Run all tests and return combined results.

        The categories run in a fixed order (sequence generation, PDB
        processing, validation, error handling) and their dictionaries are
        merged into one.
        """
        logger.info("Starting comprehensive structure generation tests...")

        all_results: Dict[str, bool] = {}

        # Run all test categories
        for run_category in (
            self.test_sequence_based_generation,
            self.test_pdb_based_processing,
            self.test_structure_validation,
            self.test_error_handling,
        ):
            all_results.update(run_category())

        return all_results

    def print_summary(self, results: Dict[str, bool]):
        """Print test summary.

        Args:
            results: Merged ``{result_name: passed}`` dictionary.  May be
                empty, in which case no success rate is computed.
        """
        self._banner("TEST SUMMARY")

        total_tests = len(results)
        passed_tests = sum(1 for result in results.values() if result)
        failed_tests = total_tests - passed_tests

        logger.info(f"Total tests: {total_tests}")
        logger.info(f"Passed: {passed_tests}")
        logger.info(f"Failed: {failed_tests}")
        if total_tests:
            logger.info(f"Success rate: {passed_tests/total_tests*100:.1f}%")
        else:
            # Nothing ran: avoid dividing by zero.
            logger.info("Success rate: n/a (no tests were run)")

        if failed_tests > 0:
            logger.info("\nFailed tests:")
            for test_name, result in results.items():
                if not result:
                    logger.info(f"  ✗ {test_name}")

        logger.info(f"\nTest directory: {self.test_dir}")
        logger.info("You can inspect the generated files in the test directory.")

    def cleanup(self):
        """Clean up test directory.

        Recursively deletes ``self.test_dir`` if it exists.  NOTE: this also
        applies to a directory passed in through ``test_dir`` that existed
        before the run, so do not point the tester at a directory whose other
        contents must survive.
        """
        if self.test_dir.exists():
            shutil.rmtree(self.test_dir)
            logger.info(f"Cleaned up test directory: {self.test_dir}")


def main():
    """Main function to run structure generation tests.

    Parses the command line, runs every test category, prints the summary and
    terminates the process via ``sys.exit``: status 0 when every recorded
    result passed, status 1 when any result failed, the run was interrupted,
    or an unexpected exception escaped the test run.

    The test directory is removed afterwards unless ``--keep-files`` was
    given; the flag is honoured on the interrupt and error paths as well, so
    the generated files remain available for debugging a failed run.
    """
    parser = argparse.ArgumentParser(description='Test AbMelt structure generation')
    parser.add_argument('--test-dir', type=str, help='Test directory (default: temporary)')
    parser.add_argument('--keep-files', action='store_true', help='Keep test files after completion')
    parser.add_argument('--verbose', '-v', action='store_true', help='Verbose logging')

    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Create tester
    tester = StructureGenerationTester(test_dir=args.test_dir)

    # Pessimistic default: only a completed run with no failures exits with 0.
    exit_code = 1
    try:
        # Run all tests
        results = tester.run_all_tests()

        # Print summary
        tester.print_summary(results)

        exit_code = 0 if all(results.values()) else 1
    except KeyboardInterrupt:
        logger.info("\nTest interrupted by user")
    except Exception as e:
        logger.error(f"Test failed with error: {e}")
    finally:
        # Single cleanup point for the success, interrupt and error paths.
        if args.keep_files:
            logger.info(f"Test files kept in: {tester.test_dir}")
        else:
            tester.cleanup()

    # Exit with appropriate code
    sys.exit(exit_code)


if __name__ == "__main__":
    main()