File size: 7,641 Bytes
9b4ef96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env python3
"""
Script untuk training dengan monitoring GPU dan logging yang lengkap
"""

import os
import sys
import time
import json
import psutil
import GPUtil
from pathlib import Path
from datetime import datetime
import logging
from finetune_lora import main as finetune_main

def setup_logging():
    """Configure root logging to a timestamped file and to stdout.

    Ensures a ``logs/`` directory exists, then installs a UTF-8 file
    handler (``logs/training_<timestamp>.log``) and a stdout stream
    handler sharing one timestamped message format.

    Returns:
        logging.Logger: module-level logger ready for use.
    """
    logs = Path("logs")
    logs.mkdir(exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    handlers = [
        logging.FileHandler(logs / f"training_{stamp}.log", encoding='utf-8'),
        logging.StreamHandler(sys.stdout),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=handlers,
    )
    return logging.getLogger(__name__)

def get_system_info():
    """Collect a point-in-time snapshot of CPU, RAM, and disk usage.

    Returns:
        dict: with keys ``timestamp`` (ISO string), ``cpu_count``,
        ``memory_total_gb``, ``memory_available_gb`` (GiB, 2 decimals),
        and ``disk_usage`` mapping each readable mountpoint to its
        total/used/free (GiB) and percent-used stats.
    """
    # Sample virtual memory ONCE so total and available come from the same
    # snapshot (the previous version queried psutil twice, allowing the two
    # numbers to disagree if memory shifted between calls).
    mem = psutil.virtual_memory()
    info = {
        "timestamp": datetime.now().isoformat(),
        "cpu_count": psutil.cpu_count(),
        "memory_total_gb": round(mem.total / (1024**3), 2),
        "memory_available_gb": round(mem.available / (1024**3), 2),
        "disk_usage": {}
    }

    # Per-partition disk usage; skip anything we cannot stat.
    for partition in psutil.disk_partitions():
        try:
            usage = psutil.disk_usage(partition.mountpoint)
        except (PermissionError, OSError):
            # OSError covers e.g. empty removable drives on Windows, which
            # raise more than just PermissionError.
            continue
        info["disk_usage"][partition.mountpoint] = {
            "total_gb": round(usage.total / (1024**3), 2),
            "used_gb": round(usage.used / (1024**3), 2),
            "free_gb": round(usage.free / (1024**3), 2),
            "percent": usage.percent
        }

    return info

def get_gpu_info():
    """Query GPUtil for all visible GPUs.

    Returns:
        list[dict]: one dict per GPU with id, name, memory totals (MB),
        memory/GPU utilization (percent) and temperature (Celsius);
        an empty list if the query fails for any reason.
    """
    try:
        return [
            {
                "id": gpu.id,
                "name": gpu.name,
                "memory_total_mb": gpu.memoryTotal,
                "memory_used_mb": gpu.memoryUsed,
                "memory_free_mb": gpu.memoryFree,
                "memory_utilization_percent": gpu.memoryUtil * 100,
                "gpu_utilization_percent": gpu.load * 100,
                "temperature_celsius": gpu.temperature
            }
            for gpu in GPUtil.getGPUs()
        ]
    except Exception as e:
        # Broad catch is deliberate: any failure to talk to the driver
        # degrades gracefully to "no GPUs".
        logging.warning(f"Could not get GPU info: {e}")
        return []

def monitor_resources(logger, interval=30):
    """Poll system and GPU usage in a loop, logging and checkpointing it.

    Every *interval* seconds this samples CPU, RAM and GPU stats, logs a
    one-line summary, and every 10 samples checkpoints the accumulated data
    to a single JSON file under ``logs/``.  Runs until KeyboardInterrupt
    (in this script it is started as a daemon thread, so it normally dies
    with the process).

    Args:
        logger: logger to emit summary lines to.
        interval: seconds to sleep between samples (default 30).

    Returns:
        list[dict]: all collected samples (only reachable via Ctrl-C).
    """
    logger.info("๐Ÿ” Starting resource monitoring...")

    start_time = time.time()
    monitoring_data = []
    # Fix: build the checkpoint filename ONCE.  The previous version created
    # a NEW timestamped file on every save, leaking a fresh JSON file every
    # 10 samples instead of maintaining one growing checkpoint.
    monitoring_file = Path("logs") / f"monitoring_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    try:
        while True:
            elapsed_time = time.time() - start_time

            # System snapshot (CPU count, RAM totals, disk usage).
            system_info = get_system_info()
            system_info["elapsed_time_seconds"] = elapsed_time

            # GPU snapshot ([] if unavailable).
            gpu_info = get_gpu_info()

            # Current memory pressure.
            memory = psutil.virtual_memory()
            system_info["memory_used_gb"] = round(memory.used / (1024**3), 2)
            system_info["memory_percent"] = memory.percent

            # NOTE: interval=1 blocks for one second to measure CPU load,
            # so each cycle actually takes interval + ~1s.
            system_info["cpu_percent"] = psutil.cpu_percent(interval=1)

            monitoring_entry = {
                "timestamp": datetime.now().isoformat(),
                "elapsed_time_seconds": elapsed_time,
                "system": system_info,
                "gpu": gpu_info
            }
            monitoring_data.append(monitoring_entry)

            # One-line human-readable summary.
            logger.info(f"โฑ๏ธ  Elapsed: {elapsed_time/60:.1f}min | "
                       f"CPU: {system_info['cpu_percent']:.1f}% | "
                       f"RAM: {system_info['memory_percent']:.1f}%")

            if gpu_info:
                for gpu in gpu_info:
                    logger.info(f"๐ŸŽฎ GPU {gpu['id']}: "
                               f"Util: {gpu['gpu_utilization_percent']:.1f}% | "
                               f"Memory: {gpu['memory_utilization_percent']:.1f}% | "
                               f"Temp: {gpu['temperature_celsius']:.1f}ยฐC")

            # Checkpoint the full history to the single session file.
            if len(monitoring_data) % 10 == 0:  # Every 10 entries
                with open(monitoring_file, 'w') as f:
                    json.dump(monitoring_data, f, indent=2)
                logger.info(f"๐Ÿ’พ Monitoring data saved: {monitoring_file}")

            time.sleep(interval)

    except KeyboardInterrupt:
        logger.info("โน๏ธ  Resource monitoring stopped by user")

    return monitoring_data

def main():
    """Entry point: verify prerequisites, start monitoring, run LoRA training.

    Flow: set up logging, log system/GPU state, check that the base model,
    dataset and config exist (abort with an error log if not), launch
    resource monitoring in a daemon thread, then run the fine-tuning job.
    A final monitoring snapshot is written in all cases (success or failure).
    """
    print("๐Ÿš€ Training dengan Monitoring - Llama 3.1 8B LoRA")
    print("=" * 60)

    logger = setup_logging()

    # Log system information (disk details omitted to keep the log compact).
    logger.info("๐Ÿ–ฅ๏ธ  System Information:")
    system_info = get_system_info()
    for key, value in system_info.items():
        if key != "disk_usage":
            logger.info(f"  {key}: {value}")

    # Log GPU information, or warn if training would fall back to CPU.
    gpu_info = get_gpu_info()
    if gpu_info:
        logger.info("๐ŸŽฎ GPU Information:")
        for gpu in gpu_info:
            logger.info(f"  GPU {gpu['id']}: {gpu['name']}")
            logger.info(f"    Memory: {gpu['memory_total_mb']}MB total")
            logger.info(f"    Temperature: {gpu['temperature_celsius']}ยฐC")
    else:
        logger.warning("โš ๏ธ  No GPU detected. Training will be very slow on CPU!")

    # Prerequisite checks: each missing artifact aborts with guidance.
    logger.info("๐Ÿ” Checking prerequisites...")

    model_path = Path("models/llama-3.1-8b-instruct")
    if not model_path.exists():
        logger.error("โŒ Base model not found. Please run download_model.py first!")
        return

    data_path = Path("data/training_data.jsonl")
    if not data_path.exists():
        logger.error("โŒ Training dataset not found. Please run create_sample_dataset.py first!")
        return

    config_path = Path("configs/llama_config.yaml")
    if not config_path.exists():
        logger.error("โŒ Model configuration not found. Please run download_model.py first!")
        return

    logger.info("โœ… All prerequisites met!")

    # Resource monitoring runs as a daemon thread so it dies with the process.
    import threading
    monitoring_thread = threading.Thread(
        target=monitor_resources,
        args=(logger, 30),  # Monitor every 30 seconds
        daemon=True
    )
    monitoring_thread.start()

    logger.info("๐Ÿš€ Starting LoRA fine-tuning...")
    try:
        finetune_main()
        logger.info("โœ… Training completed successfully!")
    except Exception as e:
        logger.error(f"โŒ Training failed: {e}")
        raise
    finally:
        logger.info("๐Ÿ“Š Training session ended")

        # Fix: actually write a final snapshot.  The previous version logged
        # "Final monitoring data saved" without writing anything.  The daemon
        # thread's per-sample history cannot be retrieved from here, so this
        # captures a final end-of-session system/GPU snapshot instead.
        monitoring_file = Path("logs") / f"final_monitoring_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        final_snapshot = {
            "ended_at": datetime.now().isoformat(),
            "system": get_system_info(),
            "gpu": get_gpu_info(),
        }
        with open(monitoring_file, 'w') as f:
            json.dump(final_snapshot, f, indent=2)
        logger.info(f"๐Ÿ’พ Final monitoring data saved: {monitoring_file}")

if __name__ == "__main__":
    main()