"""Helper functions for the backend operations."""

import pickle
import subprocess
from typing import Any, Dict, List

import numpy as np
import torch


def build_full_prompt(prompt: str, prompt_prefix: str, prompt_suffix: str) -> str:
    """
    Build the full prompt by wrapping the content with instructions.

    Args:
        prompt: Core prompt content (without instructions)
        prompt_prefix: Text placed immediately before ``prompt``
        prompt_suffix: Text placed immediately after ``prompt``

    Returns:
        The concatenation ``prompt_prefix + prompt + prompt_suffix``; no
        separator is inserted between the parts.
    """
    return f"{prompt_prefix}{prompt}{prompt_suffix}"


def check_gpu_utilization() -> bool:
    """
    Print detailed GPU utilization information.

    Returns:
        False when CUDA is not available, True otherwise. Failing to run
        ``nvidia-smi`` is reported on stdout but does not change the result.
    """
    if not torch.cuda.is_available():
        print("❌ CUDA is not available. Running on CPU.")
        return False

    # Print GPU device information
    device_count = torch.cuda.device_count()
    print(f"✅ Found {device_count} CUDA device(s):")
    for i in range(device_count):
        device_props = torch.cuda.get_device_properties(i)
        print(f" Device {i}: {device_props.name}")
        print(f" Memory: {device_props.total_memory / 1024**3:.2f} GB")

    # Print current GPU usage
    current_device = torch.cuda.current_device()
    print(f"\nCurrent device: {current_device} ({torch.cuda.get_device_name(current_device)})")
    print(f" Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f" Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

    # Try using nvidia-smi command for more detailed information.
    # This is best-effort: a missing binary (OSError) or a non-zero exit
    # status (CalledProcessError, a SubprocessError) is reported, not raised.
    # Deliberately not a bare ``except`` so Ctrl-C / SystemExit still propagate.
    try:
        print("\nnvidia-smi output:")
        subprocess.run(['nvidia-smi'], check=True)
    except (OSError, subprocess.SubprocessError):
        print("Failed to run nvidia-smi command")

    return True


def jensen_shannon_distance(p: Dict[str, float], q: Dict[str, float]) -> float:
    """
    Calculate Jensen-Shannon distance between two probability distributions.

    Keys missing from one distribution are treated as probability 0. Each
    distribution is rescaled to sum to 1 when its total is positive.

    Args:
        p: First probability distribution as dictionary
        q: Second probability distribution as dictionary

    Returns:
        Jensen-Shannon distance, computed with the natural logarithm:
        0 means identical; the maximum (disjoint supports) is
        sqrt(ln 2) ~= 0.8326, not 1.
    """
    # Ensure all keys are in both distributions
    all_keys = set(p) | set(q)
    p_vec = np.array([p.get(k, 0.0) for k in all_keys], dtype=float)
    q_vec = np.array([q.get(k, 0.0) for k in all_keys], dtype=float)

    # Normalize distributions (an all-zero vector is left untouched)
    p_total = np.sum(p_vec)
    if p_total > 0:
        p_vec = p_vec / p_total
    q_total = np.sum(q_vec)
    if q_total > 0:
        q_vec = q_vec / q_total

    # Calculate midpoint distribution
    m_vec = 0.5 * (p_vec + q_vec)

    # Calculate KL divergences and add a small epsilon to avoid log(0)
    eps = 1e-10
    p_vec = np.maximum(p_vec, eps)
    q_vec = np.maximum(q_vec, eps)
    m_vec = np.maximum(m_vec, eps)

    kl_p_m = np.sum(p_vec * np.log(p_vec / m_vec))
    kl_q_m = np.sum(q_vec * np.log(q_vec / m_vec))

    # Jensen-Shannon divergence. Rounding (and the epsilon clamp) can push a
    # mathematically non-negative value slightly below zero for near-identical
    # inputs, which would make the square root NaN; clamp it at 0.
    js_divergence = np.maximum(0.5 * (kl_p_m + kl_q_m), 0.0)

    # Convert to distance
    return np.sqrt(js_divergence)


def load_dataset(file_path: str) -> List[Any]:
    """
    Load a pickled mutant dataset.

    Args:
        file_path: Path to a pickle file whose payload is indexable with
            element 0 = metadata and element 1 = mutants

    Returns:
        A new two-element list ``[metadata, mutants]``
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only call this on files produced by a trusted source.
    with open(file_path, 'rb') as f:
        content = pickle.load(f)

    print("Loaded mutant data of type:", type(content))

    # Expecting a two-element list: [metadata, mutants]
    metadata = content[0]  # e.g., a dictionary including the header info
    mutants = content[1]  # list of rows (each row is a list)
    return [metadata, mutants]


def store_mutant_results(results_data, output_file):
    """
    Store results to Excel file.

    Args:
        results_data: Mapping with a ``'header'`` entry (column names) and a
            ``'results'`` entry (the rows to write)
        output_file: Destination passed to ``DataFrame.to_excel``
    """
    header = results_data['header']
    results = results_data['results']

    # Create and save DataFrame. pandas is imported lazily so the rest of the
    # module stays importable without it.
    import pandas as pd

    df = pd.DataFrame(results, columns=header)
    df.to_excel(output_file, index=False)
    print('Results stored in', output_file)