alextorelli committed
Commit 244cb91 · verified · 1 Parent(s): 4c39bbc

Upload folder using huggingface_hub

Files changed (2):
  1. app.py +53 -35
  2. requirements.txt +1 -0
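
The commit message indicates the files were pushed with huggingface_hub's upload_folder. A minimal sketch of that call; the repo id, repo type, and local folder below are assumptions for illustration, not values taken from this commit:

import os

from huggingface_hub import HfApi

# Hypothetical target repo and local folder; the commit page does not show them.
api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    repo_id="alextorelli/codechef",  # assumption: the Space this commit landed in
    repo_type="space",
    folder_path=".",                 # local folder holding app.py and requirements.txt
    commit_message="Upload folder using huggingface_hub",
)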
app.py CHANGED
@@ -8,6 +8,7 @@ import asyncio
 import base64
 import json
 import os
+import yaml
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, Optional
@@ -16,8 +17,6 @@ import gradio as gr
 from fastapi import FastAPI, HTTPException
 from huggingface_hub import HfApi
 from pydantic import BaseModel
-from autotrain.trainers.clm.params import LLMTrainingParams
-from autotrain.trainers.clm import utils as clm_utils
 
 # Environment setup
 HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
@@ -91,7 +90,7 @@ async def run_training_job(
     num_epochs: Optional[int] = None,
     batch_size: Optional[int] = None,
 ):
-    """Execute AutoTrain job asynchronously using Python API"""
+    """Execute AutoTrain job asynchronously using config file"""
     try:
         # Update job status
         metadata = load_job_metadata(job_id)
@@ -106,39 +105,58 @@ async def run_training_job(
         output_dir = JOBS_DIR / f"{job_id}_output"
         output_dir.mkdir(exist_ok=True)
 
-        # Configure AutoTrain parameters using Python API
-        params = LLMTrainingParams(
-            model=base_model,
-            data_path=str(dataset_path),
-            text_column=text_column,
-            rejected_text_column=response_column,
-            project_name=f"codechef-{project_name}",
-            push_to_hub=True,
-            repo_id=repo_id,
-            token=HF_TOKEN,
-            # Training configuration
-            num_train_epochs=num_epochs if num_epochs else (1 if is_demo else 3),
-            learning_rate=learning_rate if learning_rate else 2e-4,
-            per_device_train_batch_size=batch_size if batch_size else (1 if is_demo else 2),
-            gradient_accumulation_steps=4,
-            warmup_ratio=0.1,
-            # Memory optimization
-            use_peft=True,
-            quantization="int4",
-            mixed_precision="bf16",
-            # Demo mode optimizations
-            max_seq_length=512 if is_demo else 2048,
-            logging_steps=10,
-            save_total_limit=1,
+        # Create AutoTrain config file (more reliable than CLI args)
+        config = {
+            "task": "llm:sft",
+            "base_model": base_model,
+            "project_name": f"codechef-{project_name}",
+            "data_path": str(dataset_path),
+            "train_split": "train",
+            "valid_split": None,
+            "text_column": text_column,
+            "rejected_text_column": response_column,
+            "add_eos_token": True,
+            "block_size": 512 if is_demo else 2048,
+            "model_max_length": 2048,
+            "epochs": num_epochs if num_epochs else (1 if is_demo else 3),
+            "batch_size": batch_size if batch_size else (1 if is_demo else 2),
+            "lr": learning_rate if learning_rate else 2e-4,
+            "peft": True,
+            "quantization": "int4",
+            "target_modules": "all-linear",
+            "lora_r": 16,
+            "lora_alpha": 32,
+            "lora_dropout": 0.05,
+            "weight_decay": 0.01,
+            "gradient_accumulation": 4,
+            "mixed_precision": "bf16",
+            "push_to_hub": True,
+            "repo_id": repo_id,
+            "token": HF_TOKEN,
+            "logging_steps": 10,
+            "save_total_limit": 1,
+        }
+
+        config_file = output_dir / "config.yml"
+        with open(config_file, "w") as f:
+            yaml.dump(config, f)
+
+        # Run AutoTrain with config file
+        process = await asyncio.create_subprocess_exec(
+            "autotrain",
+            "--config",
+            str(config_file),
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+            cwd=str(output_dir),
         )
-
-        # Run training in thread pool to avoid blocking
-        def _run_training():
-            """Synchronous training execution"""
-            return clm_utils.train(params)
-
-        loop = asyncio.get_event_loop()
-        await loop.run_in_executor(None, _run_training)
+
+        stdout, stderr = await process.communicate()
+
+        if process.returncode != 0:
+            raise RuntimeError(
+                f"Training failed with code {process.returncode}: {stderr.decode()[-500:]}"
+            )
 
         # Update final status
         metadata = load_job_metadata(job_id)
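
The replacement path writes the training parameters to a YAML file and invokes the autotrain CLI through asyncio, instead of calling the trainer in-process. A self-contained sketch of that write-config-then-subprocess pattern; it substitutes a trivial python -c command for autotrain so it runs anywhere, and the config keys shown are an illustrative subset:

import asyncio
import sys
from pathlib import Path

import yaml


async def main() -> None:
    # Dump the parameters to YAML, as app.py does with the full AutoTrain config.
    config = {"task": "llm:sft", "epochs": 1}  # illustrative subset
    config_file = Path("config.yml")
    config_file.write_text(yaml.dump(config))

    # Launch the trainer as a child process without blocking the event loop.
    # app.py runs: autotrain --config <config_file>
    process = await asyncio.create_subprocess_exec(
        sys.executable, "-c", "print('training...')",  # stand-in for autotrain
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await process.communicate()
    if process.returncode != 0:
        raise RuntimeError(f"Training failed: {stderr.decode()[-500:]}")
    print(stdout.decode().strip())


asyncio.run(main())

Compared with the removed run_in_executor approach, a subprocess isolates the trainer's memory use and crashes from the Gradio app, at the cost of passing parameters through a file on disk.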
requirements.txt CHANGED
@@ -6,3 +6,4 @@ huggingface_hub>=0.20.0
 datasets>=2.14.0
 pydantic>=2.0.0
 python-multipart>=0.0.6
+pyyaml>=6.0
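
The new pyyaml pin backs the yaml.dump call added in app.py. A quick round-trip check, assuming nothing beyond PyYAML itself:

import yaml

config = {"task": "llm:sft", "quantization": "int4", "valid_split": None}
text = yaml.dump(config)                 # what app.py writes to config.yml
assert yaml.safe_load(text) == config    # None survives as YAML null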