diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..4f1d51575a04862a685fc3f545959f310f8cbfc5 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +turing/reporting.py +turing/plots.py +turing/features.py +turing/evaluate_model.py +turing/data_validation.py + +turing/CLI_runner +turing/modeling/train.py +turing/tests \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..ca207ff05c0348a3c30a1a278dfd6b8cdc0618a1 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +services: + api: + build: . + container_name: turing_app + image: turing_api + ports: + - "7860:7860" + + environment: + - MLFLOW_TRACKING_USERNAME=${MLFLOW_USER} + - MLFLOW_TRACKING_PASSWORD=${MLFLOW_PWD} + - DAGSHUB_USER_TOKEN=${DAGSHUB_TOKEN} + + command: uvicorn turing.api.app:app --host 0.0.0.0 --port 7860 --reload \ No newline at end of file diff --git a/dockerfile b/dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..42dd62de776aa03b1c0310c0423270758894307e --- /dev/null +++ b/dockerfile @@ -0,0 +1,31 @@ +FROM python:3.12 + +# Create a non-root user to run the application and set permissions +RUN useradd -m -u 1000 turinguser +RUN mkdir -p /app/models && chown -R turinguser:turinguser /app /app/models +USER turinguser + +# Set environment variables +# PATH to include local user binaries and project root +ENV PATH="/home/turinguser/.local/bin:$PATH" +ENV PROJ_ROOT=/app + +# Set the working directory in the container +WORKDIR /app + +# Copy essential files to install dependencies +COPY --chown=turinguser requirements.txt . + +# Install Python dependencies +RUN pip install --default-timeout=1000 --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +RUN pip3 install -v -r requirements.txt --upgrade --default-timeout=1000 --no-cache-dir --break-system-packages + +# Copy remaining project files +COPY --chown=turinguser turing ./turing +COPY --chown=turinguser reports ./reports + +# Expose port 7860 for the FastAPI application +EXPOSE 7860 + +# Default command to run the FastAPI application on port 7860 +CMD ["uvicorn", "turing.api.app:app", "--host", "0.0.0.0", "--port", "7860"] \ No newline at end of file diff --git a/reports/.gitkeep b/reports/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/reports/feedback/feedback_data.csv b/reports/feedback/feedback_data.csv new file mode 100644 index 0000000000000000000000000000000000000000..c77afd3d1f0fb53b55bd2cb285f2ce199583eddd --- /dev/null +++ b/reports/feedback/feedback_data.csv @@ -0,0 +1,3 @@ +Timestamp,Input_Text,Language,Model_Prediction,User_Correction +2025-12-11 22:41:05,# Create output directory,python,Usage,DevelopmentNotes +2025-12-11 23:05:24,# Entry point for running the API directly with python,python,Usage,DevelopmentNotes diff --git a/reports/figures/.gitkeep b/reports/figures/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/reports/figures/logo_header.svg b/reports/figures/logo_header.svg new file mode 100644 index 0000000000000000000000000000000000000000..fde0102644902834fbc91f670843f2180619562b --- /dev/null +++ b/reports/figures/logo_header.svg @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + + + C + + + + + C + + + + + C + + diff --git 
a/reports/unit_and_behavioral_tests/report.md b/reports/unit_and_behavioral_tests/report.md new file mode 100644 index 0000000000000000000000000000000000000000..fdb1a1f776bc054e2cfb61b53b1e172e358fefcf --- /dev/null +++ b/reports/unit_and_behavioral_tests/report.md @@ -0,0 +1,108 @@ + +# Test Execution Report + + +### Environment + + +```text + Parameter Value + Timestamp 2025-11-27 15:44:47 + Context turing +Python Version 3.12.12 + Platform Windows-11-10.0.26100-SP0 +``` + + +### Executive Summary + + +```text + Total Passed Failed Success Rate + 66 35 31 53.0% +``` + + +Detailed Breakdown: + + +### BEHAVIORAL Tests + + +```text + Module Test Case Result Time Message + test_directional.py test_java_directional_add_deprecation [ FAILED ] 0.30s turing\tests\behavioral\test_directional.py:16: Assertion... + test_directional.py test_python_directional_remove_todo [ FAILED ] 0.15s turing\tests\behavioral\test_directional.py:31: Assertion... + test_directional.py test_pharo_directional_add_responsibility [ FAILED ] 0.13s turing\tests\behavioral\test_directional.py:49: Assertion... + test_directional.py test_java_directional_contrast_rational [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:70: Assertion... + test_directional.py test_python_directional_contrast_todo [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:87: Assertion... + test_directional.py test_pharo_directional_contrast_collaborators [ FAILED ] 0.13s turing\tests\behavioral\test_directional.py:112: Assertio... + test_directional.py test_java_directional_shift_summary_to_expand [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:132: Assertio... + test_directional.py test_python_directional_shift_summary_to_devnotes [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:152: Assertio... + test_directional.py test_pharo_directional_shift_to_example [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:173: Assertio... + test_invariance.py test_python_invariance_parameters[:param user_i... [ FAILED ] 0.22s turing\tests\behavioral\test_invariance.py:15: AssertionE... + test_invariance.py test_python_invariance_parameters[:PARAM USER_I... [ FAILED ] 0.07s turing\tests\behavioral\test_invariance.py:15: AssertionE... + test_invariance.py test_python_invariance_parameters[ :param user... [ FAILED ] 0.06s turing\tests\behavioral\test_invariance.py:15: AssertionE... + test_invariance.py test_python_invariance_parameters[:param user_i... [ FAILED ] 0.06s turing\tests\behavioral\test_invariance.py:15: AssertionE... + test_invariance.py test_java_invariance_deprecation [ FAILED ] 0.13s turing\tests\behavioral\test_invariance.py:26: AssertionE... + test_invariance.py test_python_invariance_summary [ FAILED ] 0.13s turing\tests\behavioral\test_invariance.py:45: AssertionE... + test_invariance.py test_pharo_invariance_intent [ FAILED ] 0.13s turing\tests\behavioral\test_invariance.py:64: AssertionE... + test_invariance.py test_python_invariance_typos_parameters [ FAILED ] 0.07s turing\tests\behavioral\test_invariance.py:85: AssertionE... + test_invariance.py test_java_invariance_semantic_summary [ PASS ] 0.32s +test_minimum_functionality.py test_java_mft[test getfilestatus and related li... [ PASS ] 0.06s +test_minimum_functionality.py test_java_mft[/* @deprecated Use something else... [ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:17:... +test_minimum_functionality.py test_java_mft[code source of this file http gre... 
[ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:17:... +test_minimum_functionality.py test_java_mft[this is balanced if each pool is ... [ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:17:... +test_minimum_functionality.py test_java_mft[// For internal use only.-expecte... [ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:17:... +test_minimum_functionality.py test_java_mft[this impl delegates to the old fi... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:17:... +test_minimum_functionality.py test_java_mft[/** Usage: new MyClass(arg1). */-... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:17:... +test_minimum_functionality.py test_python_mft[a service specific account of t... [ PASS ] 0.06s +test_minimum_functionality.py test_python_mft[:param user_id: The ID of the u... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:29:... +test_minimum_functionality.py test_python_mft[# TODO: Refactor this entire bl... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:29:... +test_minimum_functionality.py test_python_mft[use this class if you want acce... [ PASS ] 0.06s +test_minimum_functionality.py test_python_mft[# create a new list by filterin... [ FAILED ] 0.08s turing\tests\behavioral\test_minimum_functionality.py:29:... +test_minimum_functionality.py test_pharo_mft[i am a simple arrow like arrowhe... [ PASS ] 0.07s +test_minimum_functionality.py test_pharo_mft[the example below shows how to c... [ PASS ] 0.07s +test_minimum_functionality.py test_pharo_mft[i provide a data structure indep... [ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:43:... +test_minimum_functionality.py test_pharo_mft[the cache is cleared after each ... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:43:... +test_minimum_functionality.py test_pharo_mft[it is possible hovewer to custom... [ PASS ] 0.07s +test_minimum_functionality.py test_pharo_mft[collaborators: BlElement, BlSpac... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:43:... +``` + + +### UNIT Tests + + +```text + Module Test Case Result Time Message + test_config.py test_proj_root_is_correctly_identified [ PASS ] 0.00s + test_config.py test_directory_paths_are_correctly_structured [ PASS ] 0.00s + test_config.py test_dataset_constants_are_valid [ PASS ] 0.00s + test_config.py test_labels_map_and_total_categories_are_correct [ PASS ] 0.00s + test_config.py test_numeric_parameters_are_positive [ PASS ] 0.00s + test_config.py test_load_dotenv_is_called_on_module_load [ PASS ] 0.00s + test_dataset.py test_initialization_paths_are_correct [ FAILED ] 0.00s turing\tests\unit\test_dataset.py:24: AssertionError + test_dataset.py test_format_labels_for_csv[input_labels0-[1, 0,... 
[ PASS ] 0.00s + test_dataset.py test_format_labels_for_csv[[1, 0, 1]-[1, 0, 1]] [ PASS ] 0.00s + test_dataset.py test_format_labels_for_csv[input_labels2-[]] [ PASS ] 0.00s + test_dataset.py test_format_labels_for_csv[None-None] [ PASS ] 0.00s + test_dataset.py test_get_dataset_raises_file_not_found [ PASS ] 0.00s + test_dataset.py test_get_dataset_success_and_label_parsing [ PASS ] 0.48s +test_features.py test_config_id_generation [ PASS ] 0.00s +test_features.py test_config_attributes [ PASS ] 0.00s +test_features.py test_clean_text_basic [ PASS ] 0.00s +test_features.py test_clean_text_stopwords [ PASS ] 2.39s +test_features.py test_clean_text_lemmatization [ PASS ] 0.00s +test_features.py test_clean_text_handles_none [ PASS ] 0.00s +test_features.py test_extract_numeric_features [ PASS ] 0.00s + test_model.py test_model_initialization[randomForestTfIdf] [ PASS ] 0.00s + test_model.py test_model_initialization[codeBerta] [ PASS ] 0.00s + test_model.py test_model_setup[randomForestTfIdf] [ PASS ] 0.00s + test_model.py test_model_setup[codeBerta] [ PASS ] 1.39s + test_model.py test_model_train[randomForestTfIdf] [ PASS ] 3.06s + test_model.py test_model_train[codeBerta] [ PASS ] 4.90s + test_model.py test_model_evaluate[randomForestTfIdf] [ PASS ] 1.39s + test_model.py test_model_evaluate[codeBerta] [ FAILED ] 6.36s turing\tests\unit\test_model.py:101: AssertionError + test_model.py test_model_predict[randomForestTfIdf] [ PASS ] 1.36s + test_model.py test_model_predict[codeBerta] [ PASS ] 5.26s +``` diff --git a/reports/unit_tests/report.md b/reports/unit_tests/report.md new file mode 100644 index 0000000000000000000000000000000000000000..9ebe350e94f19f83541c3a75f87c163a6baf5b3d --- /dev/null +++ b/reports/unit_tests/report.md @@ -0,0 +1,122 @@ + +# Turing Test Execution Report + + + +--- + + + +## Environment Information + + +| Parameter | Value | +|:---------------|:---------------------------| +| Timestamp | 2025-12-04 18:14:18 | +| Context | TURING | +| Python Version | 3.12.12 | +| Platform | macOS-15.6-arm64-arm-64bit | +| Architecture | arm64 | + + +--- + + +## Executive Summary + + +**Overall Status:** MOSTLY PASSED + + +**Success Rate:** 91.2% + + +| Metric | Count | +|:-------------|--------:| +| Total Tests | 34 | +| Passed | 31 | +| Failed | 3 | +| Success Rate | 91.2% | + + +**Visual Progress:** + + +``` +Progress: [█████████████████████████████████████████████░░░░░] 91.2% +Passed: 31/34 tests +``` + + +--- + + +## UNIT Tests + + +### Statistics + + +| Status | Count | +|:---------|-----------:| +| Total | 34 | +| Passed | 31 (91.2%) | +| Failed | 3 (8.8%) | + + +### Test Results + + +| Module | Test Case | Result | Time | Message | +|:----------------|:---------------------------------------------------|:---------|:-------|:-----------------------------------------------------| +| test_api.py | test_health_check_returns_ok | PASS | 0.01s | | +| test_api.py | test_predict_success_java | PASS | 0.02s | | +| test_api.py | test_predict_success_python | PASS | 0.00s | | +| test_api.py | test_predict_success_pharo | PASS | 0.00s | | +| test_api.py | test_predict_missing_texts | PASS | 0.00s | | +| test_api.py | test_predict_missing_language | PASS | 0.00s | | +| test_api.py | test_predict_empty_texts | PASS | 0.00s | | +| test_api.py | test_predict_error_handling | PASS | 0.00s | | +| test_api.py | test_predict_invalid_language | PASS | 0.00s | | +| test_api.py | test_prediction_request_valid | PASS | 0.00s | | +| test_api.py | test_prediction_response_valid | PASS | 0.00s | | +| 
test_config.py | test_proj_root_is_correctly_identified | PASS | 0.00s | | +| test_config.py | test_directory_paths_are_correctly_structured | PASS | 0.00s | | +| test_config.py | test_dataset_constants_are_valid | PASS | 0.00s | | +| test_config.py | test_labels_map_and_total_categories_are_correct | PASS | 0.00s | | +| test_config.py | test_numeric_parameters_are_positive | PASS | 0.00s | | +| test_config.py | test_load_dotenv_is_called_on_module_load | PASS | 0.00s | | +| test_dataset.py | test_initialization_paths_are_correct | FAIL | 0.00s | turing/tests/unit/test_dataset.py:25: AssertionError | +| test_dataset.py | test_format_labels_for_csv[input_labels0-[1, 0,... | PASS | 0.00s | | +| test_dataset.py | test_format_labels_for_csv[[1, 0, 1]-[1, 0, 1]] | PASS | 0.00s | | +| test_dataset.py | test_format_labels_for_csv[input_labels2-[]] | PASS | 0.00s | | +| test_dataset.py | test_format_labels_for_csv[None-None] | PASS | 0.00s | | +| test_dataset.py | test_get_dataset_raises_file_not_found | PASS | 0.00s | | +| test_dataset.py | test_get_dataset_success_and_label_parsing | FAIL | 0.00s | turing/dataset.py:128: FileNotFoundError | +| test_model.py | test_model_initialization[randomForestTfIdf] | PASS | 0.00s | | +| test_model.py | test_model_initialization[codeBerta] | PASS | 0.00s | | +| test_model.py | test_model_setup[randomForestTfIdf] | PASS | 0.00s | | +| test_model.py | test_model_setup[codeBerta] | PASS | 0.93s | | +| test_model.py | test_model_train[randomForestTfIdf] | PASS | 2.66s | | +| test_model.py | test_model_train[codeBerta] | PASS | 7.22s | | +| test_model.py | test_model_evaluate[randomForestTfIdf] | PASS | 1.31s | | +| test_model.py | test_model_evaluate[codeBerta] | FAIL | 8.83s | turing/tests/unit/test_model.py:101: AssertionError | +| test_model.py | test_model_predict[randomForestTfIdf] | PASS | 1.21s | | +| test_model.py | test_model_predict[codeBerta] | PASS | 5.98s | | + + +--- + + +> **ERROR**: 3 test(s) failed. Please review the error messages above. + + + +--- + + + +*Report generated on 2025-12-04 at 18:14:18* + + +*Powered by Turing Test Suite* diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f024c882d66251a4a53553b2fedcf33a587a77c4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +fastapi +uvicorn[standard] +loguru +pydantic +python-dotenv +mlflow +numpy +transformers +dagshub +datasets +accelerate +scikit-learn +gradio \ No newline at end of file diff --git a/turing/CLI_runner/run_dataset.py b/turing/CLI_runner/run_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b02c26e3acbdfa3f035ab2c1883108feac2c70cf --- /dev/null +++ b/turing/CLI_runner/run_dataset.py @@ -0,0 +1,105 @@ +import os +from pathlib import Path +import sys + +from loguru import logger +import typer +from typing_extensions import Annotated + +try: + from turing.config import INTERIM_DATA_DIR, RAW_DATA_DIR + from turing.dataset import DatasetManager +except ImportError: + logger.error("Error: Could not import DatasetManager. Check sys.path configuration.") + logger.error(f"Current sys.path: {sys.path}") + sys.exit(1) + + +script_dir = os.path.dirname(os.path.abspath(__file__)) +proj_root = os.path.dirname(os.path.dirname(script_dir)) +sys.path.append(proj_root) + +app = typer.Typer(help="CLI for dataset management (Download, Conversion, and Search).") + + +@app.command() +def download(): + """ + Loads the dataset from Hugging Face and saves it into the "raw" folder. 
+ """ + logger.info("Starting dataset download...") + manager = DatasetManager() + manager.download_dataset() + logger.success("Download complete.") + + +@app.command(name="parquet-to-csv") +def parquet_to_csv(): + """ + Converts all parquet files in the raw data directory + to CSV format in the interim data directory. + """ + logger.info("Starting Parquet -> CSV conversion...") + manager = DatasetManager() + manager.parquet_to_csv() + logger.success("Conversion complete.") + + +@app.command() +def search( + filename: Annotated[ + str, typer.Argument(help="The exact filename to search for (e.g., 'java_train.parquet')") + ], + directory: Annotated[ + str, + typer.Option( + "--directory", + "-d", + help="Directory to search in. Keywords 'raw' or 'interim' can be used.", + ), + ] = "raw", +): + """ + Searches for a file by name in the data directories. + """ + logger.info(f"Initializing search for '{filename}'...") + manager = DatasetManager() + + search_path = None + if directory.lower() == "raw": + search_path = RAW_DATA_DIR + logger.info("Searching in 'raw' data directory.") + elif directory.lower() == "interim": + search_path = INTERIM_DATA_DIR + logger.info("Searching in 'interim' data directory.") + else: + search_path = Path(directory) + logger.info(f"Searching in custom path: {search_path}") + + results = manager.search_file(filename, search_directory=search_path) + + if results: + logger.success(f"Found {len(results)} file(s):") + for res in results: + print(f"-> {res}") + else: + logger.warning(f"File '{filename}' not found in {search_path}.") + + +@app.command(name="show-raw-hf") +def show_raw_hf(): + """ + Loads and displays info about the raw dataset from Hugging Face. + """ + logger.info("Loading raw dataset info from Hugging Face...") + manager = DatasetManager() + dataset = manager.get_raw_dataset_from_hf() + if dataset: + logger.info("Dataset info:") + print(dataset) + else: + logger.error("Could not retrieve dataset.") + + +if __name__ == "__main__": + app() diff --git a/turing/CLI_runner/run_prediction.py b/turing/CLI_runner/run_prediction.py new file mode 100644 index 0000000000000000000000000000000000000000..f6f104daa4c0aae5a493fd4d5fc3f6a412b94cc3 --- /dev/null +++ b/turing/CLI_runner/run_prediction.py @@ -0,0 +1,57 @@ +from pathlib import Path +import sys + +from loguru import logger +import typer + +from turing.modeling.models.randomForestTfIdf import RandomForestTfIdf +from turing.modeling.predict import ModelInference + +# Add project root to sys.path +current_dir = Path(__file__).resolve().parent +project_root = current_dir.parent +if str(project_root) not in sys.path: + sys.path.append(str(project_root)) + +app = typer.Typer() + + +@app.command() +def main( + mlflow_run_id: str = typer.Option( + "af1fa5959dc14fa9a29a0a19c11f1b08", help="The MLflow Run ID" + ), + artifact_name: str = typer.Option( + "RandomForestTfIdf_java", help="The name of the model artifact" + ), + language: str = typer.Option("java", help="The target programming language"), +): + """ + Run inference using the dataset stored on disk (Standard CML/DVC workflow). 
+ """ + logger.info("Starting CLI inference process...") + + try: + # Initialize inference engine + inference_engine = ModelInference() + + # Run prediction on the test dataset + results = inference_engine.predict_from_mlflow( + mlflow_run_id=mlflow_run_id, + artifact_name=artifact_name, + language=language, + model_class=RandomForestTfIdf, + ) + + # Output results + print("\n--- Prediction Results ---") + print(results) + print("--------------------------") + + except Exception as e: + logger.error(f"CLI Prediction failed: {e}") + raise typer.Exit(code=1) + + +if __name__ == "__main__": + app() diff --git a/turing/__init__.py b/turing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..18ae806ff73753d6560266c7fb68c2bd51971a7b --- /dev/null +++ b/turing/__init__.py @@ -0,0 +1 @@ +from turing import config # noqa: F401 diff --git a/turing/__pycache__/__init__.cpython-312.pyc b/turing/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b29672cc77222a9cc6febdd7fbd5f3fa8bed3541 Binary files /dev/null and b/turing/__pycache__/__init__.cpython-312.pyc differ diff --git a/turing/__pycache__/config.cpython-312.pyc b/turing/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54a78bb57637c5c4af860f7cba23a1187954768d Binary files /dev/null and b/turing/__pycache__/config.cpython-312.pyc differ diff --git a/turing/__pycache__/dataset.cpython-312.pyc b/turing/__pycache__/dataset.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67213b96474be9aadf50425b2a70e088497b677e Binary files /dev/null and b/turing/__pycache__/dataset.cpython-312.pyc differ diff --git a/turing/__pycache__/evaluate_model.cpython-312.pyc b/turing/__pycache__/evaluate_model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..683e5fae32d19b99f5035f0716ffa3f44b40e6c4 Binary files /dev/null and b/turing/__pycache__/evaluate_model.cpython-312.pyc differ diff --git a/turing/api/__init__.py b/turing/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/turing/api/app.py b/turing/api/app.py new file mode 100644 index 0000000000000000000000000000000000000000..fb0b3dbd841736ee37c70e299763510aada87342 --- /dev/null +++ b/turing/api/app.py @@ -0,0 +1,115 @@ +import base64 +import os + +from fastapi import FastAPI, HTTPException +from fastapi.responses import JSONResponse +import gradio as gr +from loguru import logger + +from turing.api.demo import create_demo +from turing.api.schemas import PredictionRequest, PredictionResponse +from turing.modeling.predict import ModelInference + + +def get_logo_b64_src(filename="logo_header.svg"): + """read SVG and convert it into a string Base64 for HTML.""" + try: + base_path = os.path.dirname(os.path.abspath(__file__)) + target_path = os.path.join(base_path, "..", "..", "reports", "figures", filename) + target_path = os.path.normpath(target_path) + + with open(target_path, "rb") as f: + encoded = base64.b64encode(f.read()).decode("utf-8") + return f"data:image/svg+xml;base64,{encoded}" + except Exception as e: + print(f"Unable to load logo for API: {e}") + return "" + + +# load logo +logo_src = get_logo_b64_src() + +# html +logo_html_big = f""" + + + +""" + +# description +description_md = f""" +API for classifying code comments. + +You can interact with the model directly using the visual interface. 
+Click the logo below to open it: + +{logo_html_big} + +""" + +app = FastAPI( + title="Turing Team Code Classification API", + description=description_md, + version="1.0.0" +) + +@app.get("/manifest.json") +def get_manifest(): + return JSONResponse(content={ + "name": "Turing App", + "short_name": "Turing", + "start_url": "/gradio", + "display": "standalone", + "background_color": "#ffffff", + "theme_color": "#000000", + "icons": [] + }) + +# Global inference engine instance +inference_engine = ModelInference() + +demo = create_demo(inference_engine) +app = gr.mount_gradio_app(app, demo, path="/gradio") + +@app.get("/") +def health_check(): + """ + Root endpoint to verify API status. + """ + return {"status": "ok", "message": "Turing Code Classification API is ready.", "ui_url": "/gradio"} + + +@app.post("/predict", response_model=PredictionResponse) +def predict(request: PredictionRequest): + """ + Endpoint to classify a list of code comments. + Dynamically loads the model from MLflow based on the request parameters. + """ + try: + logger.info(f"Received prediction request for language: {request.language}") + + # Perform prediction using the inference engine + raw, predictions, run_id, artifact = inference_engine.predict_payload( + texts=request.texts, language=request.language + ) + + # Ensure predictions are serializable (convert numpy arrays to lists) + if hasattr(predictions, "tolist"): + predictions = predictions.tolist() + + return PredictionResponse( + predictions=raw.tolist(), + labels=predictions, + model_info={"artifact": artifact, "language": request.language}, + ) + + except Exception as e: + logger.error(f"Prediction failed: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + +# Entry point for running the API directly with python +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="127.0.0.1", port=7860) diff --git a/turing/api/demo.py b/turing/api/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..6b9e0a57c23895dd4bbf649df7fb9ec44fdfb3bf --- /dev/null +++ b/turing/api/demo.py @@ -0,0 +1,302 @@ +import csv +from datetime import datetime +import os + +import gradio as gr + +# ---IMPORTS --- +try: + from turing.modeling.models.codeBerta import CodeBERTa + from turing.modeling.predict import ModelInference +except ImportError as e: + print(f"WARNING: Error importing real modules: {e}") + class CodeBERTa: + pass + class ModelInference: + pass + +# --- CONFIGURATION --- +FEEDBACK_FILE = "reports/feedback/feedback_data.csv" + +LABELS_MAP = { + "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"], + "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"], + "pharo": ["Keyimplementationpoints", "Example", "Responsibilities", "Intent", "Keymessages", "Collaborators"], +} + +# --- CSS --- +CSS = """ +:root { + --bg-primary: #fafaf9; --bg-secondary: #ffffff; --border-color: #e5e7eb; + --text-primary: #1f2937; --text-secondary: #6b7280; --accent-bg: #f3f4f6; + --primary-btn: #ea580c; --primary-btn-hover: #c2410c; +} +.dark, body.dark, .gradio-container.dark { + --bg-primary: #0f172a; --bg-secondary: #1e293b; --border-color: #374151; + --text-primary: #f3f4f6; --text-secondary: #9ca3af; --accent-bg: #334155; +} +body, .gradio-container { + background-color: var(--bg-primary) !important; color: var(--text-primary) !important; + font-family: 'Segoe UI', system-ui, sans-serif; transition: background 0.3s, color 0.3s; +} +.compact-header { + display: flex; align-items: 
center; justify-content: space-between; padding: 1.5rem 2rem; + border-bottom: 1px solid var(--border-color); margin-bottom: 2rem; + background-color: var(--bg-secondary); flex-wrap: wrap; gap: 1rem; border-radius: 0 0 12px 12px; +} +.input-card, .output-card { + background-color: var(--bg-secondary); border: 1px solid var(--border-color); + border-radius: 12px; padding: 1.5rem; margin-bottom: 1rem; box-shadow: 0 4px 6px -1px rgba(0,0,0,0.1); +} +.header-left { display: flex; align-items: center; gap: 1.5rem; } +.logo-icon { + height: 55px; width: auto; padding: 0; background-color: transparent; + border: none; box-shadow: none; display: flex; align-items: center; justify-content: center; flex-shrink: 0; +} +.logo-icon svg { height: 100%; width: auto; fill: var(--primary-btn); } +.title-group { display: flex; flex-direction: column; } +.main-title { font-size: 1.6rem; font-weight: 800; margin: 0; line-height: 1.1; color: var(--text-primary); letter-spacing: -0.5px; } +.subtitle { font-size: 0.95rem; color: var(--text-secondary); margin: 0; font-weight: 400; } +.section-title { font-weight: 600; color: var(--text-primary); margin-bottom: 1rem; } +.header-right { flex: 1; display: flex; justify-content: flex-end; align-items: center; min-width: 250px; } +.dev-note-container { + background-color: var(--accent-bg); border: 1px solid var(--border-color); border-radius: 16px; + width: 520px; height: 64px; display: flex; align-items: center; justify-content: flex-start; padding: 0 24px; gap: 1rem; +} +.dev-note-container:hover { border-color: var(--primary-btn); } +.dev-icon { font-size: 1.4rem; background: transparent !important; border: none !important; display: flex; align-items: center; flex-shrink: 0; } +.dev-text { + font-family: 'Courier New', monospace; font-size: 0.95rem; color: var(--text-secondary); + transition: opacity 1.5s ease; white-space: normal; line-height: 1.2; text-align: left; + display: -webkit-box; -webkit-line-clamp: 2; -webkit-box-orient: vertical; overflow: hidden; +} +.dev-text.hidden { opacity: 0; } +.feedback-section { margin-top: 2rem; padding-top: 1.5rem; border-top: 1px dashed var(--border-color); } +.feedback-title { font-size: 0.8rem; font-weight: 700; color: var(--text-secondary); text-transform: uppercase; margin-bottom: 0.8rem; } +.gr-button-primary { background: var(--primary-btn) !important; border: none !important; color: white !important; } +.gr-button-primary:hover { background: var(--primary-btn-hover) !important; } +.gr-button-secondary { background: var(--bg-primary) !important; border: 1px solid var(--border-color) !important; color: var(--text-primary) !important; } +.gr-box, .gr-input, .gr-dropdown { background: var(--bg-primary) !important; border-color: var(--border-color) !important; } +#result-box textarea { + font-size: 1.25rem; font-weight: 700; text-align: center; color: var(--primary-btn); + background-color: transparent; border: none; overflow: hidden !important; resize: none; white-space: normal; line-height: 1.4; +} +""" + +# --- JAVASCRIPT --- +JS_LOADER = """ +() => { + const notes = [ + "Yes, even Pharo. Don’t ask why.", + "Is ‘deprecated’ significant? Asking for a friend.", + "Technical debt is just future-me's problem.", + "Comment first, code later. Obviously.", + "If it works, don't touch it.", + "Fixing bugs created by previous-me.", + "Legacy code: don't breathe on it.", + "Documentation is a love letter to your future self.", + "It works on my machine!", + "404: Motivation not found.", + "Compiling... please hold." 
+ ]; + let idx = 0; + function rotateNotes() { + const textEl = document.getElementById('dev-note-text'); + if (!textEl) { setTimeout(rotateNotes, 500); return; } + textEl.classList.add('hidden'); + setTimeout(() => { + idx = (idx + 1) % notes.length; + textEl.innerText = notes[idx]; + textEl.classList.remove('hidden'); + }, 1500); + } + setInterval(rotateNotes, 10000); +} +""" + +# --- UTILITIES --- +def load_svg_content(filename="logo_header.svg"): + base_path = os.path.dirname(os.path.abspath(__file__)) + target_path = os.path.join(base_path, "..", "..", "reports", "figures", filename) + target_path = os.path.normpath(target_path) + + if os.path.exists(target_path): + with open(target_path, "r", encoding="utf-8") as f: + return f.read() + else: + print(f"[WARNING] Logo not found in: {target_path}") + return "CCC" + +def save_feedback_to_csv(text, language, predicted, suggested): + if not text: + return "No data." + try: + os.makedirs(os.path.dirname(FEEDBACK_FILE), exist_ok=True) + file_exists = os.path.isfile(FEEDBACK_FILE) + with open(FEEDBACK_FILE, mode='a', newline='', encoding='utf-8') as f: + writer = csv.writer(f) + if not file_exists: + writer.writerow(["Timestamp", "Input_Text", "Language", "Model_Prediction", "User_Correction"]) + + pred_label = predicted + if isinstance(predicted, dict): + pred_label = max(predicted, key=predicted.get) if predicted else "Unknown" + + writer.writerow([ + datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + text.strip(), + language, + pred_label, + suggested + ]) + return "Feedback saved successfully!" + except Exception as e: + return f"Error saving feedback: {str(e)}" + +# --- SYNTAX VALIDATION LOGIC --- +def is_valid_syntax(text: str, language: str) -> bool: + """ + Validates if the text follows the basic comment syntax for the given language. + """ + text = text.strip() + if not text: + return False + + if language == "java": + # Supports: // comment OR /* comment */ + return text.startswith("//") or (text.startswith("/*") and text.endswith("*/")) + + elif language == "python": + # Supports: # comment OR """ docstring """ OR ''' docstring ''' + return text.startswith("#") or \ + (text.startswith('"""') and text.endswith('"""')) or \ + (text.startswith("'''") and text.endswith("'''")) + + elif language == "pharo": + # Supports: " comment " + return text.startswith('"') and text.endswith('"') + + return True + +# --- MAIN DEMO --- +def create_demo(inference_engine: ModelInference): + + def classify_comment(text: str, language: str): + """ + Calls the inference engine only if syntax is valid. + """ + if not text: + return None + + # SYNTAX CHECK + if not is_valid_syntax(text, language): + error_msg = "Error: Invalid Syntax." + if language == "java": + error_msg += " Java comments must start with '//' or be enclosed in '/* ... */'." + elif language == "python": + error_msg += " Python comments must start with '#' or use docstrings ('\"\"\"' / \"'''\")." + elif language == "pharo": + error_msg += " Pharo comments must be enclosed in double quotes (e.g., \"comment\")." + return error_msg + + # INFERENCE + try: + _, labels, _, _ = inference_engine.predict_payload( + texts=[text], + language=language + ) + + if labels and len(labels) > 0: + first_prediction = labels[0][0] + if isinstance(first_prediction, (list, tuple)): + return first_prediction[0] + else: + return str(first_prediction) + + return "Unknown: Low confidence." + + except Exception as e: + print(f"Prediction Error: {e}") + return f"System Error: Failed to process request for '{language}'." 
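For context on how these predictions are consumed outside the Gradio demo, a minimal client sketch for the `/predict` route defined earlier in `turing/api/app.py` is shown below. It is only an illustration: it assumes the API container is reachable on localhost:7860 (the port mapped in docker-compose.yml), uses only the standard library, and the sample comment string is taken from the behavioral-test inputs; response fields follow `PredictionResponse` in `turing/api/schemas.py`.

```python
# Minimal client sketch for the /predict endpoint (stdlib only; assumes the
# API is reachable on localhost:7860 as mapped in docker-compose.yml).
import json
from urllib import request

payload = {
    "texts": ["// For internal use only."],  # comments to classify
    "language": "java",                      # java, python, or pharo
}

req = request.Request(
    "http://localhost:7860/predict",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

with request.urlopen(req) as resp:
    body = json.load(resp)

# Fields follow PredictionResponse in turing/api/schemas.py
print(body["labels"], body["model_info"])
```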
+ + def update_dropdown(language): + choices = LABELS_MAP.get(language, []) + return gr.Dropdown(choices=choices, value=None, interactive=True) + + def clear_all(): + return (None, "java", "", gr.Dropdown(choices=LABELS_MAP["java"], value=None, interactive=True), "") + + logo_svg = load_svg_content("logo_header.svg") + + with gr.Blocks(title="Code Comment Classifier") as demo: + gr.HTML(f"") + + # --- HEADER --- + gr.HTML(f""" +
+            <div class="compact-header">
+                <div class="header-left">
+                    <div class="logo-icon">{logo_svg}</div>
+                    <div class="title-group">
+                        <h1 class="main-title">Code Comment Classifier</h1>
+                        <p class="subtitle">for Java, Python & Pharo</p>
+                    </div>
+                </div>
+                <div class="header-right">
+                    <div class="dev-note-container">
+                        <span class="dev-icon">💭</span>
+                        <span class="dev-text" id="dev-note-text">Initializing...</span>
+                    </div>
+                </div>
+            </div>
+ """) + + with gr.Row(): + with gr.Column(): + gr.HTML('
<div class="section-title">📝 Input Source</div>
') + input_text = gr.Textbox(label="Code Comment", lines=8, show_label=False, placeholder="Enter code comment here...") + with gr.Row(): + input_lang = gr.Dropdown(["java", "python", "pharo"], label="Language", value="java", scale=2) + submit_btn = gr.Button("⚡ Classify", variant="primary", scale=1) + clear_btn = gr.Button("🗑️ Clear All", variant="secondary", size="sm") + + with gr.Column(): + gr.HTML('
<div class="section-title">📊 Classification Result</div>
') + output_tags = gr.Textbox( + label="Predicted Category", + show_label=False, + elem_id="result-box", + interactive=False, + lines=2 + ) + + gr.HTML('
<div class="feedback-title">🛠️ Help Improve the Model</div>
') + with gr.Row(): + correction_dropdown = gr.Dropdown( + choices=LABELS_MAP["java"], + label="Correct Label", + show_label=False, + container=False, + scale=3, + interactive=True + ) + feedback_btn = gr.Button("📤 Save Feedback", variant="secondary", scale=1) + feedback_msg = gr.Markdown("", show_label=False) + + gr.Examples( + examples=[ + ["/** Validates the user session token. */", "java"], + ["# Retry logic for DB connection.", "python"], + ['"Manages the network connection lifecycle."', "pharo"] + ], + inputs=[input_text, input_lang], + label="Quick Examples" + ) + + input_lang.change(fn=update_dropdown, inputs=input_lang, outputs=correction_dropdown) + submit_btn.click(fn=classify_comment, inputs=[input_text, input_lang], outputs=[output_tags]) + feedback_btn.click(fn=save_feedback_to_csv, inputs=[input_text, input_lang, output_tags, correction_dropdown], outputs=[feedback_msg]) + clear_btn.click(fn=clear_all, inputs=None, outputs=[input_text, input_lang, output_tags, correction_dropdown, feedback_msg]) + + demo.load(None, js=JS_LOADER) + + return demo \ No newline at end of file diff --git a/turing/api/schemas.py b/turing/api/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..eff7e9def4c6b4233624f8e81cd5a29a3e71898e --- /dev/null +++ b/turing/api/schemas.py @@ -0,0 +1,22 @@ +from typing import Any, List + +from pydantic import BaseModel, Field + + +# Input Schema +class PredictionRequest(BaseModel): + texts: List[str] = Field( + ..., + description="List of code comments to classify", + example=["public void main", "def init self"], + ) + language: str = Field( + ..., description="Programming language (java, python, pharo)", example="java" + ) + + +# Output Schema +class PredictionResponse(BaseModel): + predictions: List[Any] = Field(..., description="List of predicted labels") + labels: List[Any] = Field(..., description="List of human-readable labels") + model_info: dict = Field(..., description="Metadata about the model used") diff --git a/turing/config.py b/turing/config.py new file mode 100644 index 0000000000000000000000000000000000000000..b4dd4e2177fa9e11404e349039535bdd9fd11a7a --- /dev/null +++ b/turing/config.py @@ -0,0 +1,95 @@ +from pathlib import Path + +from dotenv import load_dotenv +from loguru import logger + +# Load environment variables from .env file if it exists +load_dotenv() + +# Paths +PROJ_ROOT = Path(__file__).resolve().parents[1] +logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}") + +DATA_DIR = PROJ_ROOT / "data" +RAW_DATA_DIR = DATA_DIR / "raw" +INTERIM_DATA_DIR = DATA_DIR / "interim" +PROCESSED_DATA_DIR = DATA_DIR / "processed" +EXTERNAL_DATA_DIR = DATA_DIR / "external" + +MODELS_DIR = PROJ_ROOT / "models" + +REPORTS_DIR = PROJ_ROOT / "reports" +FIGURES_DIR = REPORTS_DIR / "figures" + +# Dataset +DATASET_HF_ID = "NLBSE/nlbse26-code-comment-classification" +LANGS = ["java", "python", "pharo"] +INPUT_COLUMN = "combo" +LABEL_COLUMN = "labels" + +LABELS_MAP = { + "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"], + "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"], + "pharo": [ + "Keyimplementationpoints", + "Example", + "Responsibilities", + "Intent", + "Keymessages", + "Collaborators", + ], +} + +TOTAL_CATEGORIES = sum(len(v) for v in LABELS_MAP.values()) + +# Score parameters +MAX_AVG_RUNTIME = 5.0 # seconds +MAX_AVG_FLOPS = 5000.0 # GFLOPS + +# Training parameters +DEFAULT_BATCH_SIZE = 32 + +# Model configuration mapping +MODEL_CONFIG = { + "codeberta": { + 
"model_name": "fine-tuned-CodeBERTa", + "exp_name": "fine-tuned-CodeBERTa", + "model_class_module": "turing.modeling.models.codeBerta", + "model_class_name": "CodeBERTa", + }, + "graphcodebert": { + "model_name": "GraphCodeBERT", + "exp_name": "fine-tuned-GraphCodeBERT", + "model_class_module": "turing.modeling.models.graphCodeBert", + "model_class_name": "GraphCodeBERTClassifier", + }, + "tinybert": { + "model_name": "TinyBERT", + "exp_name": "fine-tuned-TinyBERT", + "model_class_module": "turing.modeling.models.tinyBert", + "model_class_name": "TinyBERTClassifier", + }, + "randomforest": { + "model_name": "RandomForest-TfIdf", + "exp_name": "RandomForest-TfIdf", + "model_class_module": "turing.modeling.models.randomForestTfIdf", + "model_class_name": "RandomForestTfIdf", + }, +} +DEFAULT_NUM_ITERATIONS = 20 + +# Existing model modules +EXISTING_MODELS = [ + "randomForestTfIdf", + "codeBerta", +] + +# If tqdm is installed, configure loguru with tqdm.write +# https://github.com/Delgan/loguru/issues/135 +try: + from tqdm import tqdm + + logger.remove(0) + logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True) +except (ModuleNotFoundError, ValueError): + pass diff --git a/turing/data_validation.py b/turing/data_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..538296889a77a0e771149f8993b738ae90c05ae0 --- /dev/null +++ b/turing/data_validation.py @@ -0,0 +1,271 @@ +from pathlib import Path +import traceback +from typing import List + +from deepchecks.tabular import Dataset, Suite +from deepchecks.tabular.checks import ( + ConflictingLabels, + DataDuplicates, + LabelDrift, + OutlierSampleDetection, + TrainTestSamplesMix, +) +import numpy as np +import pandas as pd + +from turing.config import LABEL_COLUMN, LABELS_MAP + +try: + from deepchecks.nlp import TextData + from deepchecks.nlp.checks import ( + PropertyDrift, + TextEmbeddingsDrift, + ) + + NLP_AVAILABLE = True +except ImportError: + NLP_AVAILABLE = False + + +def _encode_labels_for_validation( + series: pd.Series, class_names: List[str] +) -> pd.Series: + def encode(lbl): + active_labels = [] + for idx, is_active in enumerate(lbl): + if is_active: + if idx < len(class_names): + active_labels.append(class_names[idx]) + else: + active_labels.append(f"Class_{idx}") + if not active_labels: + return "No_Label" + return " & ".join(active_labels) + + return series.apply(encode) + + +def _calculate_code_specific_properties(text_series: List[str]) -> pd.DataFrame: + props = [] + for text in text_series: + s = str(text) + length = len(s) + non_alnum = sum(1 for c in s if not c.isalnum() and not c.isspace()) + props.append( + { + "Text_Length": length, + "Symbol_Ratio": non_alnum / length if length > 0 else 0.0, + } + ) + return pd.DataFrame(props) + + +def _nuke_rogue_files(): + """ + delete .npy files + """ + rogue_filenames = [ + "embeddings.npy" + + ] + for fname in rogue_filenames: + p = Path(fname) + if p.exists(): + try: + p.unlink() + except Exception: + pass + + +def run_custom_deepchecks( + df_train: pd.DataFrame, + df_test: pd.DataFrame, + output_dir: Path, + stage: str, + language: str, +): + print(f" [Deepchecks] Running Integrity Suite ({stage})...") + output_dir.mkdir(parents=True, exist_ok=True) + + class_names = LABELS_MAP.get(language, []) + cols = ["f_length", "f_word_count", "f_starts_verb", "text_hash"] + + for c in cols: + if c not in df_train.columns: + df_train[c] = 0 + if c not in df_test.columns: + df_test[c] = 0 + + train_ds_df = df_train[cols].copy() + train_ds_df["target"] = 
_encode_labels_for_validation( + df_train[LABEL_COLUMN], class_names + ) + test_ds_df = df_test[cols].copy() + test_ds_df["target"] = _encode_labels_for_validation( + df_test[LABEL_COLUMN], class_names + ) + + cat_features = ["text_hash", "f_starts_verb"] + train_ds = Dataset(train_ds_df, label="target", cat_features=cat_features) + test_ds = Dataset(test_ds_df, label="target", cat_features=cat_features) + + check_conflicts = ConflictingLabels(columns=["text_hash"]) + if hasattr(check_conflicts, "add_condition_ratio_of_conflicting_labels_not_greater_than"): + check_conflicts.add_condition_ratio_of_conflicting_labels_not_greater_than(0) + else: + check_conflicts.add_condition_ratio_of_conflicting_labels_less_or_equal(0) + + check_duplicates = DataDuplicates() + if hasattr(check_duplicates, "add_condition_ratio_not_greater_than"): + check_duplicates.add_condition_ratio_not_greater_than(0.05) + else: + check_duplicates.add_condition_ratio_less_or_equal(0.05) + + check_leakage = TrainTestSamplesMix(columns=["text_hash"]) + try: + if hasattr(check_leakage, "add_condition_ratio_not_greater_than"): + check_leakage.add_condition_ratio_not_greater_than(0) + except Exception: + pass + + check_outliers = OutlierSampleDetection() + try: + if hasattr(check_outliers, "add_condition_outlier_ratio_less_or_equal"): + check_outliers.add_condition_outlier_ratio_less_or_equal(0.05) + except Exception: + pass + + custom_suite = Suite( + "Code Quality & Integrity", + check_conflicts, + check_duplicates, + check_leakage, + LabelDrift(), + check_outliers, + ) + + try: + result = custom_suite.run(train_dataset=train_ds, test_dataset=test_ds) + report_path = output_dir / f"1_Integrity_{stage}.html" + result.save_as_html(str(report_path), as_widget=False) + print(f" [Deepchecks] Report Saved: {report_path}") + except Exception as e: + print(f" [Deepchecks] Error: {e}") + traceback.print_exc() + + +def run_targeted_nlp_checks( + df_train: pd.DataFrame, + df_test: pd.DataFrame, + output_dir: Path, + stage: str, + language: str = "english", +): + if not NLP_AVAILABLE: + print(" [Skip] NLP Suite skipped (libs not installed).") + return + + from deepchecks.nlp import Suite as NLPSuite + + print(f" [NLP Check] Running Semantic Analysis ({stage})...") + output_dir.mkdir(parents=True, exist_ok=True) + + # Clean up any existing garbage before starting + _nuke_rogue_files() + + DRIFT_THRESHOLD = 0.20 + PROP_THRESHOLD = 0.35 + SAMPLE_SIZE = 2000 + df_tr = ( + df_train.sample(n=SAMPLE_SIZE, random_state=42) + if len(df_train) > SAMPLE_SIZE + else df_train + ) + df_te = ( + df_test.sample(n=SAMPLE_SIZE, random_state=42) + if len(df_test) > SAMPLE_SIZE + else df_test + ) + + try: # START MAIN TRY BLOCK + y_tr = np.vstack(df_tr[LABEL_COLUMN].tolist()) + y_te = np.vstack(df_te[LABEL_COLUMN].tolist()) + + train_ds = TextData( + df_tr["comment_sentence"].tolist(), + label=y_tr, + task_type="text_classification", + ) + test_ds = TextData( + df_te["comment_sentence"].tolist(), + label=y_te, + task_type="text_classification", + ) + + print(" [NLP Check] Calculating custom code properties...") + train_props = _calculate_code_specific_properties( + df_tr["comment_sentence"].tolist() + ) + test_props = _calculate_code_specific_properties( + df_te["comment_sentence"].tolist() + ) + + train_ds.set_properties(train_props) + test_ds.set_properties(test_props) + + # In-memory calculation only. 
+ train_ds.calculate_builtin_embeddings() + test_ds.calculate_builtin_embeddings() + + check_embeddings = TextEmbeddingsDrift() + if hasattr(check_embeddings, "add_condition_drift_score_not_greater_than"): + check_embeddings.add_condition_drift_score_not_greater_than(DRIFT_THRESHOLD) + elif hasattr(check_embeddings, "add_condition_drift_score_less_than"): + check_embeddings.add_condition_drift_score_less_than(DRIFT_THRESHOLD) + + check_len = PropertyDrift(custom_property_name="Text_Length") + if hasattr(check_len, "add_condition_drift_score_not_greater_than"): + check_len.add_condition_drift_score_not_greater_than(PROP_THRESHOLD) + elif hasattr(check_len, "add_condition_drift_score_less_than"): + check_len.add_condition_drift_score_less_than(PROP_THRESHOLD) + + check_sym = PropertyDrift(custom_property_name="Symbol_Ratio") + if hasattr(check_sym, "add_condition_drift_score_not_greater_than"): + check_sym.add_condition_drift_score_not_greater_than(PROP_THRESHOLD) + elif hasattr(check_sym, "add_condition_drift_score_less_than"): + check_sym.add_condition_drift_score_less_than(PROP_THRESHOLD) + + suite = NLPSuite( + "Code Comment Semantic Analysis", + check_embeddings, + check_len, + check_sym + ) + + res = suite.run(train_ds, test_ds) + + report_path = output_dir / f"2_Semantic_{stage}.html" + res.save_as_html(str(report_path), as_widget=False) + print(f" [NLP Check] Report saved: {report_path}") + + try: + passed = res.get_passed_checks() + n_passed = len(passed) + n_total = len(res.results) + print(f" [NLP Result] {n_passed}/{n_total} checks passed.") + + if n_passed < n_total: + print(" [NLP Warning] Failed Checks details:") + for result in res.results: + if not result.passed_conditions(): + print(f" - {result.check.name}: {result.conditions_results[0].details}") + except Exception: + pass + + except Exception as e: + print(f" [NLP Check] Failed: {e}") + import traceback + traceback.print_exc() + + finally: + _nuke_rogue_files() \ No newline at end of file diff --git a/turing/dataset.py b/turing/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..97cd6efd816d5880790d43d918e193a4a4eb12ab --- /dev/null +++ b/turing/dataset.py @@ -0,0 +1,210 @@ +import ast +import os +from pathlib import Path + +from datasets import DatasetDict, load_dataset +from loguru import logger + +import turing.config as config + + +class DatasetManager: + """ + Manages the loading, transformation, and access of project datasets. + """ + + def __init__(self, dataset_path: Path = None): + self.hf_id = config.DATASET_HF_ID + self.raw_data_dir = config.RAW_DATA_DIR + self.interim_data_dir = config.INTERIM_DATA_DIR + self.base_interim_path = self.interim_data_dir / "base" + + if dataset_path: + self.dataset_path = dataset_path + else: + self.dataset_path = self.base_interim_path + + def _format_labels_for_csv(self, example: dict) -> dict: + """ + Formats the labels list as a string for CSV storage. + (Private class method) + + Args: + example (dict): A single example from the dataset. + + Returns: + dict: The example with labels converted to string. + """ + labels = example.get("labels") + if isinstance(labels, list): + example["labels"] = str(labels) + return example + + def download_dataset(self): + """ + Loads the dataset from Hugging Face and saves it into the "raw" folder. 
+ """ + logger.info(f"Loading dataset: {self.hf_id}") + try: + ds = load_dataset(self.hf_id) + logger.success("Dataset loaded successfully.") + logger.info(f"Dataset splits: {ds}") + + self.raw_data_dir.mkdir(parents=True, exist_ok=True) + + for split_name, dataset_split in ds.items(): + output_path = os.path.join( + self.raw_data_dir, f"{split_name.replace('-', '_')}.parquet" + ) + dataset_split.to_parquet(output_path) + + logger.success(f"Dataset saved to {self.raw_data_dir}.") + except Exception as e: + logger.warning(f"Error during loading: {e}.") + + def parquet_to_csv(self): + """ + Converts all parquet files in the raw data directory + to CSV format in the interim data directory. + """ + logger.info("Starting Parquet to CSV conversion...") + self.base_interim_path.mkdir(parents=True, exist_ok=True) + + for file_name in os.listdir(self.raw_data_dir): + if file_name.endswith(".parquet"): + part_name = file_name.replace(".parquet", "").replace("-", "_") + + # Load the parquet file + dataset = load_dataset( + "parquet", data_files={part_name: str(self.raw_data_dir / file_name)} + ) + + # Map and format labels + dataset[part_name] = dataset[part_name].map(self._format_labels_for_csv) + + # Save to CSV + csv_output_path = os.path.join(self.base_interim_path, f"{part_name}.csv") + dataset[part_name].to_csv(csv_output_path) + + logger.info(f"Converted {file_name} to {csv_output_path}") + + logger.success("Parquet -> CSV conversion complete.") + + def get_dataset_name(self) -> str: + """ + Returns the name of the current dataset being used. + + Returns: + str: The name of the dataset (e.g., 'clean-aug-soft-k5000'). + """ + return self.dataset_path.name + + def get_dataset(self) -> DatasetDict: + """ + Returns the processed dataset from the interim data directory + as a DatasetDict (loaded from CSVs). + + Returns: + DatasetDict: The complete dataset with train and test splits for each language. + """ + + dataset_path = self.dataset_path + + # Define the base filenames + data_files = { + "java_train": str(dataset_path / "java_train.csv"), + "java_test": str(dataset_path / "java_test.csv"), + "python_train": str(dataset_path / "python_train.csv"), + "python_test": str(dataset_path / "python_test.csv"), + "pharo_train": str(dataset_path / "pharo_train.csv"), + "pharo_test": str(dataset_path / "pharo_test.csv"), + } + + # Verify file existence before loading + logger.info("Loading CSV dataset from splits...") + existing_data_files = {} + for key, path in data_files.items(): + if not os.path.exists(path): + found = False + if os.path.exists(dataset_path): + for f in os.listdir(dataset_path): + if f.startswith(key) and f.endswith(".csv"): + existing_data_files[key] = str(dataset_path / f) + found = True + break + if not found: + logger.warning(f"File not found for split '{key}': {path}") + else: + existing_data_files[key] = path + + if not existing_data_files: + logger.error("No dataset CSV files found. 
Run 'parquet-to-csv' first.") + raise FileNotFoundError("Dataset CSV files not found.") + + logger.info(f"Found files: {list(existing_data_files.keys())}") + + full_dataset = load_dataset("csv", data_files=existing_data_files) + + logger.info("Formatting labels (from string back to list)...") + for split in full_dataset: + full_dataset[split] = full_dataset[split].map( + lambda x: { + "labels": ast.literal_eval(x["labels"]) + if isinstance(x["labels"], str) + else x["labels"] + } + ) + + logger.success("Dataset is ready for use.") + return full_dataset + + def get_raw_dataset_from_hf(self) -> DatasetDict: + """ + Loads the raw dataset directly from Hugging Face without saving. + + Returns: + DatasetDict: The raw dataset from Hugging Face. + """ + logger.info(f"Loading raw dataset '{self.hf_id}' from Hugging Face...") + try: + ds = load_dataset(self.hf_id) + logger.success(f"Successfully loaded '{self.hf_id}'.") + return ds + except Exception as e: + logger.error(f"Failed to load dataset from Hugging Face: {e}") + return None + + def search_file(self, file_name: str, search_directory: Path = None) -> list: + """ + Recursively searches for a file by name within a specified data directory. + + Args: + file_name (str): The name of the file to search for (e.g., "java_train.csv"). + search_directory (Path, optional): The directory to search in. + Defaults to self.raw_data_dir. + + Returns: + list: A list of Path objects for all found files. + """ + if search_directory is None: + search_directory = self.raw_data_dir + logger.info(f"Defaulting search to raw data directory: {search_directory}") + + if not search_directory.is_dir(): + logger.error(f"Search directory not found: {search_directory}") + return [] + + logger.info(f"Searching for '{file_name}' in '{search_directory}'...") + + found_files = [] + for root, dirs, files in os.walk(search_directory): + for file in files: + if file == file_name: + found_files.append(Path(root) / file) + + if not found_files: + logger.warning(f"No files named '{file_name}' found in '{search_directory}'.") + else: + logger.success(f"Found {len(found_files)} matching file(s).") + + return found_files diff --git a/turing/evaluate_model.py b/turing/evaluate_model.py new file mode 100644 index 0000000000000000000000000000000000000000..6e41e59169ed44b57b7c06b29233d82d65722ce5 --- /dev/null +++ b/turing/evaluate_model.py @@ -0,0 +1,121 @@ +import time + +from datasets import DatasetDict +from loguru import logger +import numpy as np +import pandas as pd +import torch + +import turing.config as config + + +def calculate_submission_score(avg_f1: float, avg_runtime: float, avg_flops: float) -> float: + """ + Calculates the final competition score. + The score is a weighted sum of F1 score, runtime, and GFLOPS. + Weights: + - F1 Score: 60% + - Runtime: 20% + - GFLOPS: 20% + + Args: + avg_f1 (float): Average F1 score across all categories. + avg_runtime (float): Average runtime in seconds. + avg_flops (float): Average GFLOPS. + + Returns: + float: Final submission score. 
+ """ + + score_f1 = 0.6 * avg_f1 + + runtime_ratio = (config.MAX_AVG_RUNTIME - avg_runtime) / config.MAX_AVG_RUNTIME + score_runtime = 0.2 * max(runtime_ratio, 0) + + flops_ratio = (config.MAX_AVG_FLOPS - avg_flops) / config.MAX_AVG_FLOPS + score_flops = 0.2 * max(flops_ratio, 0) + + total_score = score_f1 + score_runtime + score_flops + + logger.info(f" F1 Score (60%): {score_f1:.4f} (avg_f1: {avg_f1:.4f})") + logger.info( + f" Runtime Score (20%): {score_runtime:.4f} (avg_runtime: {avg_runtime:.4f}s / {config.MAX_AVG_RUNTIME}s)" + ) + logger.info( + f" GFLOPS Score (20%): {score_flops:.4f} (avg_flops: {avg_flops:.4f} / {config.MAX_AVG_FLOPS})" + ) + logger.info(" ====================") + logger.info(f" Final Score: {total_score:.4f}") + + return total_score + + +def evaluate_models(models: dict, dataset: DatasetDict): + """ + Evaluates the provided models on the test datasets for each language. + Computes precision, recall, and F1 score for each category and language. + Also measures average runtime and GFLOPS for model inference. + + Args: + models (dict): A dictionary mapping language codes to their respective models. + dataset (DatasetDict): A DatasetDict containing test datasets for each language. + + Returns: + pd.DataFrame: DataFrame containing precision, recall, and F1 scores for each category and language. + float: Final submission score calculated based on average F1, runtime, and GF + """ + + total_flops = 0 + total_time = 0 + scores = [] + + for lan in config.LANGS: + logger.info(f"\n--- Evaluating Language: {lan.upper()} ---") + model = models[lan] + + with torch.profiler.profile(with_flops=True) as p: + test_data = dataset[f"{lan}_test"] + x = test_data[config.INPUT_COLUMN] + x = list(x) if hasattr(x, 'tolist') else x # Convert pandas Series to list + y_true = np.array(test_data[config.LABEL_COLUMN]).T + + begin = time.time() + for i in range(10): + y_pred = model.predict(x) + y_pred = np.asarray(y_pred).T + total = time.time() - begin + total_time = total_time + total + + total_flops = total_flops + (sum(k.flops for k in p.key_averages()) / 1e9) + + for i in range(len(y_pred)): + assert len(y_pred[i]) == len(y_true[i]) + tp = sum([true == pred == 1 for (true, pred) in zip(y_true[i], y_pred[i])]) + #tn = sum([true == pred == 0 for (true, pred) in zip(y_true[i], y_pred[i])]) + fp = sum([true == 0 and pred == 1 for (true, pred) in zip(y_true[i], y_pred[i])]) + fn = sum([true == 1 and pred == 0 for (true, pred) in zip(y_true[i], y_pred[i])]) + precision = tp / (tp + fp) + recall = tp / (tp + fn) + f1 = (2 * tp) / (2 * tp + fp + fn) + scores.append({ + "lan": lan, + "cat": config.LABELS_MAP[lan][i], + "precision": precision, + "recall": recall, + "f1": f1, + }) + + logger.info(f"Compute in GFLOPs: {total_flops / 10}") + logger.info(f"Avg runtime in seconds: {total_time / 10}") + scores = pd.DataFrame(scores) + print(scores) + + avg_f1 = scores["f1"].mean() + avg_runtime = total_time / 10 + avg_flops = total_flops / 10 + + final_score = calculate_submission_score(avg_f1, avg_runtime, avg_flops) + + logger.info(f"Final Score for {lan.upper()}: {final_score:.4f}") + + return scores, final_score diff --git a/turing/features.py b/turing/features.py new file mode 100644 index 0000000000000000000000000000000000000000..a1b350f3e1156dc7394b0725189f05094616488b --- /dev/null +++ b/turing/features.py @@ -0,0 +1,678 @@ +import ast +import hashlib +from pathlib import Path +import random +import re +from typing import List, Tuple + +import nltk +from nltk.corpus import stopwords, wordnet 
+from nltk.stem import PorterStemmer, WordNetLemmatizer +import numpy as np +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.feature_selection import SelectKBest, chi2 +import typer + +from turing.config import ( + INTERIM_DATA_DIR, + LABEL_COLUMN, + LANGS, +) +from turing.data_validation import run_custom_deepchecks, run_targeted_nlp_checks +from turing.dataset import DatasetManager + +# --- NLTK Resource Check --- +REQUIRED_NLTK_PACKAGES = [ + "stopwords", + "wordnet", + "omw-1.4", + "averaged_perceptron_tagger", + "punkt", +] +for package in REQUIRED_NLTK_PACKAGES: + try: + nltk.data.find(f"corpora/{package}") + except LookupError: + try: + nltk.download(package, quiet=True) + except Exception: + pass + +app = typer.Typer() + + +# --- CONFIGURATION CLASS --- +class FeaturePipelineConfig: + """ + Configuration holder for the pipeline. Generates a unique ID based on parameters + to version the output directories. + """ + + def __init__( + self, + use_stopwords: bool, + use_lemmatization: bool, + use_combo_feature: bool, + max_features: int, + min_comment_length: int, + max_comment_length: int, + enable_augmentation: bool, + custom_tags: str = "base", + ): + self.use_stopwords = use_stopwords + self.use_lemmatization = use_lemmatization + self.use_combo_feature = use_combo_feature + self.max_features = max_features + self.min_comment_length = min_comment_length + self.max_comment_length = max_comment_length + self.enable_augmentation = enable_augmentation + self.custom_tags = custom_tags + self.hash_id = self._generate_readable_id() + + def _generate_readable_id(self) -> str: + tags = ["clean"] + if self.enable_augmentation: + tags.append("aug-soft") + tags.append(f"k{self.max_features}") + if self.custom_tags != "base": + tags.append(self.custom_tags) + return "-".join(tags) + + +# --- TEXT UTILITIES --- +class TextCanonicalizer: + """ + Reduces text to a 'canonical' form (stemmed, lowercase) + to detect semantic duplicates. + preserves javadoc tags to distinguish usage (@return) from summary (Returns). + """ + + def __init__(self): + self.stemmer = PorterStemmer() + self.stop_words = set(stopwords.words("english")) + # Code keywords are preserved as they carry semantic weight + self.code_keywords = { + "return", + "true", + "false", + "null", + "if", + "else", + "void", + "int", + "boolean", + "param", + "throws", + "exception", + } + + def to_canonical(self, text: str) -> str: + if pd.isna(text): + return "" + text = str(text).lower() + text = re.sub(r"[^a-z0-9\s@]", " ", text) + + words = text.split() + canonical_words = [] + + for w in words: + # If the word starts with @ (e.g., @return), keep it as is + if w.startswith("@"): + canonical_words.append(w) + continue + + if w in self.stop_words and w not in self.code_keywords: + continue + + stemmed = self.stemmer.stem(w) + canonical_words.append(stemmed) + + return " ".join(canonical_words).strip() + + +class TextProcessor: + """ + Standard text cleaning logic for final feature extraction (TF-IDF). 
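+
+    For example (illustrative, with stopword removal and lemmatization both
+    enabled in the pipeline config), "# TODO: fix the parser" reduces to
+    roughly "todo fix parser".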
+ """ + + def __init__(self, config: FeaturePipelineConfig, language: str = "english"): + self.config = config + self.stop_words = set(stopwords.words(language)) + self.lemmatizer = WordNetLemmatizer() + + def clean_text(self, text: str) -> str: + if pd.isna(text): + return "" + text = str(text).lower() + # Remove heavy code markers but keep text structure + text = re.sub(r"(^\s*//+|^\s*/\*+|\*/$)", "", text) + # Keep only alpha characters for NLP model (plus pipe for combo) + text = re.sub(r"[^a-z\s|]", " ", text) + tokens = text.split() + if self.config.use_stopwords: + tokens = [w for w in tokens if w not in self.stop_words] + if self.config.use_lemmatization: + tokens = [self.lemmatizer.lemmatize(w) for w in tokens] + return " ".join(tokens) + + +# --- AUGMENTATION --- +class SafeAugmenter: + """ + protects reserved keywords from synonym replacement. + """ + + def __init__(self, aug_prob=0.3): + self.aug_prob = aug_prob + self.protected_words = { + "return", + "public", + "private", + "void", + "class", + "static", + "final", + "if", + "else", + "for", + "while", + "try", + "catch", + "import", + "package", + "null", + "true", + "false", + "self", + "def", + "todo", + "fixme", + "param", + "throw", + } + + def get_synonyms(self, word): + synonyms = set() + for syn in wordnet.synsets(word): + for lemma in syn.lemmas(): + name = lemma.name().replace("_", " ") + if name.isalpha() and name.lower() != word.lower(): + synonyms.add(name) + return list(synonyms) + + def augment(self, text: str) -> str: + if pd.isna(text) or not text: + return "" + words = text.split() + if len(words) < 2: + return text + new_words = [] + for word in words: + word_lower = word.lower() + + if word_lower in self.protected_words: + new_words.append(word) + continue + + # Random Case Injection (Noise) + if random.random() < 0.1: + if word[0].isupper(): + new_words.append(word.lower()) + else: + new_words.append(word.capitalize()) + continue + + # Synonym Replacement + if random.random() < self.aug_prob and len(word) > 3: + syns = self.get_synonyms(word_lower) + if syns: + replacement = random.choice(syns) + if word[0].isupper(): + replacement = replacement.capitalize() + new_words.append(replacement) + else: + new_words.append(word) + else: + new_words.append(word) + return " ".join(new_words) + + def apply_balancing( + self, df: pd.DataFrame, min_samples: int = 100 + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Generates synthetic data for minority classes. 
+ Returns: (Balanced DataFrame, Report DataFrame) + """ + df["temp_label_str"] = df[LABEL_COLUMN].astype(str) + counts = df["temp_label_str"].value_counts() + print( + f"\n [Balance Check - PRE] Min class size: {counts.min()} | Max: {counts.max()}" + ) + + existing_sentences = set(df["comment_sentence"].str.strip()) + new_rows = [] + report_rows = [] + + for label_str, count in counts.items(): + if count < min_samples: + needed = min_samples - count + class_subset = df[df["temp_label_str"] == label_str] + if class_subset.empty: + continue + + samples = class_subset["comment_sentence"].tolist() + orig_label = class_subset[LABEL_COLUMN].iloc[0] + + # Propagate 'combo' if present + orig_combo = None + if "combo" in class_subset.columns: + orig_combo = class_subset["combo"].iloc[0] + + generated = 0 + attempts = 0 + # Cap attempts to avoid infinite loops if vocabulary is too small + while generated < needed and attempts < needed * 5: + attempts += 1 + src = random.choice(samples) + aug_txt = self.augment(src).strip() + + # Ensure Global Uniqueness + if aug_txt and aug_txt not in existing_sentences: + row = { + "comment_sentence": aug_txt, + LABEL_COLUMN: orig_label, + "partition": "train_aug", + "index": -1, # Placeholder + } + if orig_combo: + row["combo"] = orig_combo + + new_rows.append(row) + report_rows.append( + { + "original_text": src, + "augmented_text": aug_txt, + "label": label_str, + "reason": f"Class has {count} samples (Target {min_samples})", + } + ) + existing_sentences.add(aug_txt) + generated += 1 + + df = df.drop(columns=["temp_label_str"]) + df_report = pd.DataFrame(report_rows) + + if new_rows: + augmented_df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True) + augmented_df["index"] = range(len(augmented_df)) + + temp_counts = augmented_df[LABEL_COLUMN].astype(str).value_counts() + print( + f" [Balance Check - POST] Min class size: {temp_counts.min()} | Max: {temp_counts.max()}" + ) + return augmented_df, df_report + + return df, df_report + + +# --- CLEANING LOGIC --- +def clean_training_data_smart( + df: pd.DataFrame, min_len: int, max_len: int, language: str = "english" +) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Performs 'Smart Cleaning' on the Training Set with language-specific heuristics. + """ + canon = TextCanonicalizer() + dropped_rows = [] + + print(f" [Clean] Computing heuristics (Language: {language})...") + df["canon_key"] = df["comment_sentence"].apply(canon.to_canonical) + + # 1. Token Length Filter + def count_code_tokens(text): + return len([t for t in re.split(r"[^a-zA-Z0-9]+", str(text)) if t]) + + df["temp_token_len"] = df["comment_sentence"].apply(count_code_tokens) + + + MIN_ALPHA_CHARS = 6 + MAX_SYMBOL_RATIO = 0.50 + + # 2. 
Heuristic Filters (Tiny/Huge/Code) + def get_heuristics(text): + s = str(text).strip() + char_len = len(s) + if char_len == 0: + return False, False, 1.0 + + alpha_len = sum(1 for c in s if c.isalpha()) + + non_alnum_chars = sum(1 for c in s if not c.isalnum() and not c.isspace()) + symbol_ratio = non_alnum_chars / char_len if char_len > 0 else 0 + + is_tiny = alpha_len < MIN_ALPHA_CHARS + is_huge = char_len > 800 + is_code = symbol_ratio > MAX_SYMBOL_RATIO + + return is_tiny, is_huge, is_code + + heuristics = df["comment_sentence"].apply(get_heuristics) + df["is_tiny"] = [x[0] for x in heuristics] + df["is_huge"] = [x[1] for x in heuristics] + df["symbol_ratio"] = [x[2] for x in heuristics] + + + df["is_code"] = df["symbol_ratio"] > 0.50 + + mask_keep = ( + (df["temp_token_len"] >= min_len) + & (df["temp_token_len"] <= max_len) + & (~df["is_tiny"]) + & (~df["is_huge"]) + & (~df["is_code"]) + ) + + df_dropped_qual = df[~mask_keep].copy() + if not df_dropped_qual.empty: + def reason(row): + if row["is_tiny"]: + return f"Too Tiny (<{MIN_ALPHA_CHARS} alpha)" + if row["is_huge"]: + return "Too Huge (>800 chars)" + if row["is_code"]: + return f"Pure Code (>{int(MAX_SYMBOL_RATIO*100)}% symbols)" + return f"Token Count ({row['temp_token_len']})" + + df_dropped_qual["drop_reason"] = df_dropped_qual.apply(reason, axis=1) + dropped_rows.append(df_dropped_qual) + + df = df[mask_keep].copy() + + # 3. Semantic Conflicts (Ambiguity) + df["label_s"] = df[LABEL_COLUMN].astype(str) + conflict_counts = df.groupby("canon_key")["label_s"].nunique() + conflicting_keys = conflict_counts[conflict_counts > 1].index + + mask_conflicts = df["canon_key"].isin(conflicting_keys) + df_dropped_conflicts = df[mask_conflicts].copy() + if not df_dropped_conflicts.empty: + df_dropped_conflicts["drop_reason"] = "Semantic Conflict" + dropped_rows.append(df_dropped_conflicts) + + df = df[~mask_conflicts].copy() + + # 4. Exact Duplicates + mask_dupes = df.duplicated(subset=["comment_sentence"], keep="first") + df_dropped_dupes = df[mask_dupes].copy() + if not df_dropped_dupes.empty: + df_dropped_dupes["drop_reason"] = "Exact Duplicate" + dropped_rows.append(df_dropped_dupes) + + df = df[~mask_dupes].copy() + + # Cleanup columns + cols_to_drop = [ + "canon_key", + "label_s", + "temp_token_len", + "is_tiny", + "is_huge", + "is_code", + "symbol_ratio" + ] + df = df.drop(columns=cols_to_drop, errors="ignore") + + if dropped_rows: + df_report = pd.concat(dropped_rows, ignore_index=True) + cols_rep = ["index", "comment_sentence", LABEL_COLUMN, "drop_reason"] + final_cols = [c for c in cols_rep if c in df_report.columns] + df_report = df_report[final_cols] + else: + df_report = pd.DataFrame(columns=["index", "comment_sentence", "drop_reason"]) + + print(f" [Clean] Removed {len(df_report)} rows. 
Final: {len(df)}.") + return df, df_report + +# --- FEATURE ENGINEERING --- +class FeatureEngineer: + def __init__(self, config: FeaturePipelineConfig): + self.config = config + self.processor = TextProcessor(config=config) + self.tfidf_vectorizer = TfidfVectorizer(max_features=config.max_features) + + def extract_features_for_check(self, df: pd.DataFrame) -> pd.DataFrame: + """Extracts metadata features for analysis.""" + + def analyze(text): + s = str(text) + words = s.split() + n_words = len(words) + if n_words == 0: + return 0, 0, 0 + first_word = words[0].lower() + starts_verb = ( + 1 + if first_word.endswith("s") + or first_word.startswith("get") + or first_word.startswith("set") + else 0 + ) + return (len(s), n_words, starts_verb) + + metrics = df["comment_sentence"].apply(analyze) + df["f_length"] = [x[0] for x in metrics] + df["f_word_count"] = [x[1] for x in metrics] + df["f_starts_verb"] = [x[2] for x in metrics] + # Calculate MD5 hash for efficient exact duplicate detection in Deepchecks + df["text_hash"] = df["comment_sentence"].apply( + lambda x: hashlib.md5(str(x).encode()).hexdigest() + ) + return df + + def vectorize_and_select(self, df_train, df_test): + def clean_fn(x): + return re.sub(r"[^a-zA-Z\s]", "", str(x).lower()) + + X_train = self.tfidf_vectorizer.fit_transform( + df_train["comment_sentence"].apply(clean_fn) + ) + y_train = np.stack(df_train[LABEL_COLUMN].values) + + # Handling multi-label for Chi2 (using sum or max) + y_train_sum = ( + y_train.sum(axis=1) if len(y_train.shape) > 1 else y_train + ) + selector = SelectKBest( + chi2, k=min(self.config.max_features, X_train.shape[1]) + ) + X_train = selector.fit_transform(X_train, y_train_sum) + + X_test = self.tfidf_vectorizer.transform( + df_test["comment_sentence"].apply(clean_fn) + ) + X_test = selector.transform(X_test) + + vocab = [ + self.tfidf_vectorizer.get_feature_names_out()[i] + for i in selector.get_support(indices=True) + ] + return X_train, X_test, vocab + + +# --- MAIN EXECUTION --- +def main( + feature_dir: Path = typer.Option( + INTERIM_DATA_DIR / "features", help="Output dir." + ), + reports_root: Path = typer.Option( + Path("reports/data"), help="Reports root." + ), + max_features: int = typer.Option(5000), + min_comment_length: int = typer.Option( + 2, help="Remove comments shorter than chars." + ), + max_comment_length: int = typer.Option(300), + augment: bool = typer.Option(False, "--augment", help="Enable augmentation."), + balance_threshold: int = typer.Option(100, help="Min samples per class."), + run_vectorization: bool = typer.Option(False, "--run-vectorization"), + run_nlp_check: bool = typer.Option( + True, "--run-nlp", help="Run Deepchecks NLP suite." 
+ ), + custom_tags: str = typer.Option("base", help="Custom tags."), + save_full_csv: bool = typer.Option(False, "--save-full-csv"), + languages: List[str] = typer.Option(LANGS, show_default=False), +): + + config = FeaturePipelineConfig( + True, + True, + True, + max_features, + min_comment_length, + max_comment_length, + augment, + custom_tags, + ) + print(f"=== Pipeline ID: {config.hash_id} ===") + + dm = DatasetManager() + full_dataset = dm.get_dataset() + fe = FeatureEngineer(config) + augmenter = SafeAugmenter() + + feat_output_dir = feature_dir / config.hash_id + feat_output_dir.mkdir(parents=True, exist_ok=True) + report_output_dir = reports_root / config.hash_id + + for lang in languages: + print(f"\n{'='*30}\nPROCESSING LANGUAGE: {lang.upper()}\n{'='*30}") + df_train = full_dataset[f"{lang}_train"].to_pandas() + df_test = full_dataset[f"{lang}_test"].to_pandas() + + # Standardize Label Format + for df in [df_train, df_test]: + if isinstance(df[LABEL_COLUMN].iloc[0], str): + df[LABEL_COLUMN] = ( + df[LABEL_COLUMN] + .str.replace(r"\s+", ", ", regex=True) + .apply(ast.literal_eval) + ) + + lang_report_dir = report_output_dir / lang + + # 1. RAW AUDIT + print(" >>> Phase 1: Auditing RAW Data") + df_train_raw = fe.extract_features_for_check(df_train.copy()) + df_test_raw = fe.extract_features_for_check(df_test.copy()) + run_custom_deepchecks( + df_train_raw, df_test_raw, lang_report_dir, "raw", lang + ) + if run_nlp_check: + run_targeted_nlp_checks( + df_train_raw, df_test_raw, lang_report_dir, "raw" + ) + + # 2. CLEANING & AUGMENTATION + print("\n >>> Phase 2: Smart Cleaning & Augmentation") + df_train, df_dropped = clean_training_data_smart( + df_train, min_comment_length, max_comment_length, language=lang + ) + + if not df_dropped.empty: + dropped_path = lang_report_dir / "dropped_rows.csv" + df_dropped.to_csv(dropped_path, index=False) + print(f" [Report] Dropped rows details saved to: {dropped_path}") + + if augment: + print(" [Augment] Applying Soft Balancing...") + df_train, df_aug_report = augmenter.apply_balancing( + df_train, min_samples=balance_threshold + ) + + if not df_aug_report.empty: + aug_path = lang_report_dir / "augmentation_report.csv" + df_aug_report.to_csv(aug_path, index=False) + print( + f" [Report] Augmentation details saved to: {aug_path}" + ) + + # 3. PROCESSED AUDIT + print("\n >>> Phase 3: Auditing PROCESSED Data") + df_train = fe.extract_features_for_check(df_train) + df_test = fe.extract_features_for_check(df_test) + run_custom_deepchecks( + df_train, df_test, lang_report_dir, "processed", lang + ) + if run_nlp_check: + run_targeted_nlp_checks( + df_train, df_test, lang_report_dir, "processed" + ) + + # 4. 
FINAL PROCESSING & SAVING + print("\n >>> Phase 4: Final Processing & Save") + df_train["comment_clean"] = df_train["comment_sentence"].apply( + fe.processor.clean_text + ) + df_test["comment_clean"] = df_test["comment_sentence"].apply( + fe.processor.clean_text + ) + + if config.use_combo_feature: + if "combo" in df_train.columns: + df_train["combo_clean"] = df_train["combo"].apply( + fe.processor.clean_text + ) + if "combo" in df_test.columns: + df_test["combo_clean"] = df_test["combo"].apply( + fe.processor.clean_text + ) + + X_train, X_test, vocab = None, None, [] + if run_vectorization: + print(" [Vectorization] TF-IDF & Chi2...") + X_train, X_test, vocab = fe.vectorize_and_select(df_train, df_test) + def format_label_robust(lbl): + if hasattr(lbl, "tolist"): # Check if numpy array + lbl = lbl.tolist() + return str(lbl) + + df_train[LABEL_COLUMN] = df_train[LABEL_COLUMN].apply(format_label_robust) + df_test[LABEL_COLUMN] = df_test[LABEL_COLUMN].apply(format_label_robust) + + cols_to_save = [ + "index", + LABEL_COLUMN, + "comment_sentence", + "comment_clean", + ] + if "combo" in df_train.columns: + cols_to_save.append("combo") + if "combo_clean" in df_train.columns: + cols_to_save.append("combo_clean") + meta_cols = [c for c in df_train.columns if c.startswith("f_")] + cols_to_save.extend(meta_cols) + + print(f" [Save] Columns: {cols_to_save}") + df_train[cols_to_save].to_csv( + feat_output_dir / f"{lang}_train.csv", index=False + ) + df_test[cols_to_save].to_csv( + feat_output_dir / f"{lang}_test.csv", index=False + ) + + if run_vectorization and X_train is not None: + from scipy.sparse import save_npz + + save_npz(feat_output_dir / f"{lang}_train_tfidf.npz", X_train) + save_npz(feat_output_dir / f"{lang}_test_tfidf.npz", X_test) + with open( + feat_output_dir / f"{lang}_vocab.txt", "w", encoding="utf-8" + ) as f: + f.write("\n".join(vocab)) + + print(f"\nAll Done. Reports in: {report_output_dir}") + + +if __name__ == "__main__": + typer.run(main) \ No newline at end of file diff --git a/turing/modeling/__init__.py b/turing/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/turing/modeling/__pycache__/__init__.cpython-312.pyc b/turing/modeling/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c639786e44075a325e64356d683d24db8ed0a443 Binary files /dev/null and b/turing/modeling/__pycache__/__init__.cpython-312.pyc differ diff --git a/turing/modeling/__pycache__/baseModel.cpython-312.pyc b/turing/modeling/__pycache__/baseModel.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80883da933239f0960209bb179084d4a46408082 Binary files /dev/null and b/turing/modeling/__pycache__/baseModel.cpython-312.pyc differ diff --git a/turing/modeling/baseModel.py b/turing/modeling/baseModel.py new file mode 100644 index 0000000000000000000000000000000000000000..b9f0fde2c413d1052b0902c980a10a8274cd75c6 --- /dev/null +++ b/turing/modeling/baseModel.py @@ -0,0 +1,111 @@ +from abc import ABC, abstractmethod +import os +import shutil + +from loguru import logger +import mlflow +from numpy import ndarray + + +class BaseModel(ABC): + """ + Abstract base class for training models. + Subclasses should define the model and implement specific logic + for training, evaluation, and model persistence. + """ + + def __init__(self, language, path=None): + """ + Initialize the trainer. + + Args: + language (str): Language for the model. 
+ path (str, optional): Path to load a pre-trained model. Defaults to None. + If None, a new model is initialized. + """ + + self.language = language + self.model = None + if path: + self.load(path) + else: + self.setup_model() + + @abstractmethod + def setup_model(self): + """ + Initialize or build the model. + Called in __init__ of subclass. + """ + pass + + @abstractmethod + def train(self, X_train, y_train) -> dict[str,any]: + """ + Main training logic for the model. + + Args: + X_train: Input training data. + y_train: True labels for training data. + """ + pass + + @abstractmethod + def evaluate(self, X_test, y_test) -> dict[str,any]: + """ + Evaluation logic for the model. + + Args: + X_test: Input test data. + y_test: True labels for test data. + """ + pass + + @abstractmethod + def predict(self, X) -> ndarray: + """ + Make predictions using the trained model. + + Args: + X: Input data for prediction. + + Returns: + Predictions made by the model. + """ + pass + + def save(self, path, model_name): + """ + Save model and log to MLflow. + + Args: + path (str): Path to save the model. + model_name (str): Name to use when saving the model (without extension). + """ + + if self.model is None: + raise ValueError("Model is not trained. Cannot save uninitialized model.") + + complete_path = os.path.join(path, f"{model_name}_{self.language}") + if os.path.exists(complete_path) and os.path.isdir(complete_path): + shutil.rmtree(complete_path) + mlflow.sklearn.save_model(self.model, complete_path) + + try: + mlflow.log_artifact(complete_path) + except Exception as e: + logger.error(f"Failed to log model to MLflow: {e}") + + logger.info(f"Model saved to: {complete_path}") + + def load(self, model_path): + """ + Load model from specified local path or mlflow model URI. + + Args: + model_path (str): Path to load the model from (local or mlflow URI). + """ + + self.model = mlflow.sklearn.load_model(model_path) + logger.info(f"Model loaded from: {model_path}") + diff --git a/turing/modeling/model_selector.py b/turing/modeling/model_selector.py new file mode 100644 index 0000000000000000000000000000000000000000..0f83c97a5477c72c299631f1c29930672e693289 --- /dev/null +++ b/turing/modeling/model_selector.py @@ -0,0 +1,145 @@ +from typing import Optional + +from loguru import logger +from mlflow.tracking import MlflowClient + + +def get_best_model_by_tag( + language: str, + tag_key: str = "best_model", + metric: str = "f1_score" +) -> Optional[dict]: + """ + Retrieve the best model for a specific language using MLflow tags. 
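+    The tagging convention assumed here (taken from the filter string below) is
+    that candidate runs carry tags `best_model='true'`, `Language`, `model_name`
+    and `model_id`; the caller can then build an MLflow URI such as
+    `runs:/<run_id>/<artifact>` from the returned dictionary.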
+ + Args: + language: Programming language (java, python, pharo) + tag_key: Tag key to search for (default: "best_model") + metric: Metric to use for ordering (default: "f1_score") + + Returns: + Dict with run_id and artifact_name of the best model or None if not found + """ + + client = MlflowClient() + experiments = client.search_experiments() + if not experiments: + logger.error("No experiments found in MLflow") + return None + + try: + runs = client.search_runs( + experiment_ids=[exp.experiment_id for exp in experiments], + filter_string=f"tags.{tag_key} = 'true' and tags.Language = '{language}'", + order_by=[f"metrics.{metric} DESC"], + max_results=1 + ) + + if not runs: + logger.warning(f"No runs found with tag '{tag_key}' for language '{language}'") + return None + + best_run = runs[0] + run_id = best_run.info.run_id + exp_name = client.get_experiment(best_run.info.experiment_id).name + run_name = best_run.info.run_name + artifact_name = best_run.data.tags.get("model_name") + model_id = best_run.data.tags.get("model_id") + logger.info(f"Found best model for {language}: {exp_name}/{run_name} ({run_id}), artifact={artifact_name}") + + return { + "run_id": run_id, + "artifact": artifact_name, + "model_id": model_id + } + + except Exception as e: + logger.error(f"Error searching for best model: {e}") + return None + + +def get_best_model_info( + language: str, + fallback_registry: dict = None +) -> dict: + """ + Retrieve the best model information for a language. + First searches by tag, then falls back to hardcoded registry. + + Args: + language: Programming language + fallback_registry: Fallback registry with run_id and artifact + + Returns: + Dict with run_id and artifact of the model + """ + + model_info = get_best_model_by_tag(language, "best_model") + + if model_info: + logger.info(f"Using tagged best model for {language}") + return model_info + + if fallback_registry and language in fallback_registry: + logger.warning(f"No tagged model found for {language}, using fallback registry") + return fallback_registry[language] + + model_info = get_best_model_by_metric(language) + + if model_info: + logger.warning(f"Using best model by metric for {language}") + return model_info + + raise ValueError(f"No model found for language {language}") + + +def get_best_model_by_metric( + language: str, + metric: str = "f1_score" +) -> Optional[dict]: + """ + Find the model with the best metric for a language. 
+ + Args: + language: Programming language + metric: Metric to use for ordering + + Returns: + Dict with run_id and artifact of the model or None + """ + + client = MlflowClient() + experiments = client.search_experiments() + if not experiments: + logger.error("No experiments found in MLflow") + return None + + try: + runs = client.search_runs( + experiment_ids=[exp.experiment_id for exp in experiments], + filter_string=f"tags.Language = '{language}'", + order_by=[f"metrics.{metric} DESC"], + max_results=1 + ) + + if not runs: + logger.warning(f"No runs found for language '{language}'") + return None + + best_run = runs[0] + run_id = best_run.info.run_id + exp_name = client.get_experiment(best_run.info.experiment_id).name + run_name = best_run.info.run_name + artifact_name = best_run.data.tags.get("model_name") + model_id = best_run.data.tags.get("model_id") + logger.info(f"Found best model for {language}: {exp_name}/{run_name} ({run_id}), artifact={artifact_name}") + + return { + "run_id": run_id, + "artifact": artifact_name, + "model_id": model_id + } + + except Exception as e: + logger.error(f"Error finding best model by metric: {e}") + return None diff --git a/turing/modeling/models/__init__.py b/turing/modeling/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5fc7efe62befe30f25787a6adbb0ee796e167fe5 --- /dev/null +++ b/turing/modeling/models/__init__.py @@ -0,0 +1,15 @@ +""" +Model classes for code comment classification. +""" + +from turing.modeling.models.codeBerta import CodeBERTa +from turing.modeling.models.graphCodeBert import GraphCodeBERTClassifier +from turing.modeling.models.randomForestTfIdf import RandomForestTfIdf +from turing.modeling.models.tinyBert import TinyBERTClassifier + +__all__ = [ + "CodeBERTa", + "RandomForestTfIdf", + "TinyBERTClassifier", + "GraphCodeBERTClassifier", +] diff --git a/turing/modeling/models/__pycache__/miniLM.cpython-312.pyc b/turing/modeling/models/__pycache__/miniLM.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f0ee2b529f8208b9a8597159087ccf2452ee16d Binary files /dev/null and b/turing/modeling/models/__pycache__/miniLM.cpython-312.pyc differ diff --git a/turing/modeling/models/__pycache__/miniLmWithClassificationHead.cpython-312.pyc b/turing/modeling/models/__pycache__/miniLmWithClassificationHead.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d95768e056e5a2372b5ff2dc24236212578cbc8 Binary files /dev/null and b/turing/modeling/models/__pycache__/miniLmWithClassificationHead.cpython-312.pyc differ diff --git a/turing/modeling/models/__pycache__/randomForestTfIdf.cpython-312.pyc b/turing/modeling/models/__pycache__/randomForestTfIdf.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0f0602f2610e640ed25a906433f59c392845613 Binary files /dev/null and b/turing/modeling/models/__pycache__/randomForestTfIdf.cpython-312.pyc differ diff --git a/turing/modeling/models/codeBerta.py b/turing/modeling/models/codeBerta.py new file mode 100644 index 0000000000000000000000000000000000000000..d593503288765f11635627da12c01ae8b35e9461 --- /dev/null +++ b/turing/modeling/models/codeBerta.py @@ -0,0 +1,463 @@ +import os +import shutil +import warnings + +from loguru import logger +import mlflow +import numpy as np +from numpy import ndarray +from sklearn.metrics import ( + accuracy_score, + classification_report, + f1_score, + precision_score, + recall_score, +) +import torch +from torch.utils.data import 
Dataset +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) + +from turing.config import MODELS_DIR + +from ..baseModel import BaseModel + +warnings.filterwarnings("ignore") + + +def compute_metrics(eval_pred): + predictions, labels = eval_pred + + # Sigmoid function to convert logits to probabilities + probs = 1 / (1 + np.exp(-predictions)) + + # Apply threshold of 0.5 (becomes 1 if > 0.5, otherwise 0) + preds = (probs > 0.5).astype(int) + + # Calculate F1 score (macro average for multi-label) + f1 = f1_score(labels, preds, average='macro') + precision = precision_score(labels, preds, average='macro', zero_division=0) + recall = recall_score(labels, preds, average='macro', zero_division=0) + + return { + 'f1': f1, + 'precision': precision, + 'recall': recall, + } + + + +class CodeBERTaDataset(Dataset): + """ + Internal Dataset class for CodeBERTa. + """ + + def __init__(self, encodings, labels=None, num_labels=None): + """ + Initialize the InternalDataset. + Args: + encodings (dict): Tokenized encodings. + labels (list or np.ndarray, optional): Corresponding labels. + num_labels (int, optional): Total number of classes. Required for auto-converting indices to one-hot. + """ + + self.encodings = {key: torch.tensor(val) for key, val in encodings.items()} + + if labels is not None: + if not isinstance(labels, (np.ndarray, torch.Tensor)): + labels = np.array(labels) + + # Case A: labels are indices (integers) + if num_labels is not None and (len(labels.shape) == 1 or (len(labels.shape) == 2 and labels.shape[1] == 1)): + labels_flat = labels.flatten() + + # Create one-hot encoded matrix + one_hot = np.zeros((len(labels_flat), num_labels), dtype=np.float32) + + # Set the corresponding index to 1 + valid_indices = labels_flat < num_labels + one_hot[valid_indices, labels_flat[valid_indices]] = 1.0 + + self.labels = torch.tensor(one_hot, dtype=torch.float) + + # Case B: labels are already vectors (e.g., One-Hot or Multi-Hot) + else: + self.labels = torch.tensor(labels, dtype=torch.float) + else: + self.labels = None + + + def __getitem__(self, idx): + """ + Retrieve item at index idx. + + Args: + idx (int): Index of the item to retrieve. + + Returns: + dict: Dictionary containing input_ids, attention_mask, and labels (if available). + """ + + item = {key: val[idx] for key, val in self.encodings.items()} + if self.labels is not None: + item['labels'] = self.labels[idx] + return item + + + def __len__(self): + """ + Return the length of the dataset. + + Returns: + int: Length of the dataset. + """ + + return len(self.encodings['input_ids']) + + + +class CodeBERTa(BaseModel): + """ + HuggingFace implementation of BaseModel for Code Comment Classification. + Uses CodeBERTa-small-v1 for efficient inference. + """ + + def __init__(self, language, path=None): + """ + Initialize the CodeBERTa model with configuration parameters. + + Args: + language (str): Language for the model. + path (str, optional): Path to load a pre-trained model. Defaults to None. 
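+
+        Example (sketch; assumes training texts and a multi-hot label matrix
+        are already in memory):
+            clf = CodeBERTa(language="python")
+            clf.train(X_train, y_train)
+            preds = clf.predict(["Returns the user id for the given session."])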
+ """ + + self.params = { + "model_name_hf": "huggingface/CodeBERTa-small-v1", + "num_labels": 7 if language == "java" else 5 if language == "python" else 6, + "max_length": 128, + "epochs": 15, + "batch_size_train": 16, + "batch_size_eval": 64, + "learning_rate": 1e-5, + "weight_decay": 0.02, + "train_size": 0.8, + "early_stopping_patience": 3, + "early_stopping_threshold": 0.005 + } + + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.tokenizer = None + + super().__init__(language, path) + + + def setup_model(self): + """ + Initialize the CodeBERTa tokenizer and model. + """ + + logger.info(f"Initializing {self.params['model_name_hf']} on {self.device}...") + + self.tokenizer = AutoTokenizer.from_pretrained(self.params["model_name_hf"]) + self.model = AutoModelForSequenceClassification.from_pretrained( + self.params["model_name_hf"], + num_labels=self.params["num_labels"], + problem_type="multi_label_classification" + ).to(self.device) + logger.info("CodeBERTa model initialized.") + + + def _tokenize(self, texts): + """ + Helper to tokenize list of texts efficiently. + + Args: + texts (list): List of text strings to tokenize. + + Returns: + dict: Tokenized encodings. + """ + + safe_texts = [] + for t in texts: + if t is None: + safe_texts.append("") + elif isinstance(t, (int, float)): + if t != t: # NaN check + safe_texts.append("") + else: + safe_texts.append(str(t)) + else: + safe_texts.append(str(t)) + + return self.tokenizer( + safe_texts, + truncation=True, + padding=True, + max_length=self.params["max_length"] + ) + + + def train(self, X_train, y_train) -> dict[str,any]: + """ + Train the model using HF Trainer and log to MLflow. + + Args: + X_train (list): Training input texts. + y_train (list or np.ndarray): Training labels. + + Returns: + dict[str, any]: Dictionary of parameters used for training. + """ + + if self.model is None: + raise ValueError("Model is not initialized. 
Call setup_model() before training.") + + # log parameters to MLflow without model_name_hf + params_to_log = {k: v for k, v in self.params.items() if k != "model_name_hf" and k != "num_labels"} + + logger.info(f"Starting training for: {self.language.upper()}") + + # Prepare dataset (train/val split) + train_encodings = self._tokenize(X_train) + full_dataset = CodeBERTaDataset(train_encodings, y_train, num_labels=self.params["num_labels"]) + train_size = int(self.params["train_size"] * len(full_dataset)) + val_size = len(full_dataset) - train_size + train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size]) + + temp_ckpt_dir = os.path.join(MODELS_DIR, "temp_checkpoints") + + use_fp16 = torch.cuda.is_available() + if not use_fp16: + logger.info("Mixed Precision (fp16) disabled because CUDA is not available.") + + training_args = TrainingArguments( + output_dir=temp_ckpt_dir, + num_train_epochs=self.params["epochs"], + per_device_train_batch_size=self.params["batch_size_train"], + per_device_eval_batch_size=self.params["batch_size_eval"], + learning_rate=self.params["learning_rate"], + weight_decay=self.params["weight_decay"], + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + save_total_limit=2, + logging_dir='./logs', + logging_steps=50, + fp16=use_fp16, + optim="adamw_torch", + report_to="none", + no_cuda=not torch.cuda.is_available() + ) + + trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, + callbacks=[EarlyStoppingCallback(early_stopping_patience=self.params["early_stopping_patience"], early_stopping_threshold=self.params["early_stopping_threshold"])] + ) + trainer.train() + logger.info(f"Training for {self.language.upper()} completed.") + + if os.path.exists(temp_ckpt_dir): + shutil.rmtree(temp_ckpt_dir) + + return params_to_log + + + def evaluate(self, X_test, y_test) -> dict[str,any]: + """ + Evaluate model on test data, return metrics and log to MLflow. + Handles automatic conversion of y_test to match multi-label prediction shape. + + Args: + X_test (list): Input test data. + y_test (list or np.ndarray): True labels for test data. + + Returns: + dict[str, any]: Dictionary of evaluation metrics. 
+ """ + + # Obtain predictions + y_pred = self.predict(X_test) + + # Convert y_test to numpy array if needed + if not isinstance(y_test, (np.ndarray, torch.Tensor)): + y_test_np = np.array(y_test) + elif isinstance(y_test, torch.Tensor): + y_test_np = y_test.cpu().numpy() + else: + y_test_np = y_test + + num_labels = self.params["num_labels"] + is_multilabel_pred = (y_pred.ndim == 2 and y_pred.shape[1] > 1) + is_flat_truth = (y_test_np.ndim == 1) or (y_test_np.ndim == 2 and y_test_np.shape[1] == 1) + + if is_multilabel_pred and is_flat_truth: + # Create a zero matrix + y_test_expanded = np.zeros((y_test_np.shape[0], num_labels), dtype=int) + + # Flatten y_test for iteration + indices = y_test_np.flatten() + + # Use indices to set the correct column to 1 + for i, label_idx in enumerate(indices): + idx = int(label_idx) + if 0 <= idx < num_labels: + y_test_expanded[i, idx] = 1 + + y_test_np = y_test_expanded + + # Generate classification report + report = classification_report(y_test_np, y_pred, zero_division=0) + print("\n" + "=" * 50) + print("CLASSIFICATION REPORT") + print(report) + print("=" * 50 + "\n") + + metrics = { + "accuracy": accuracy_score(y_test_np, y_pred), + "precision": precision_score(y_test_np, y_pred, average="macro", zero_division=0), + "recall": recall_score(y_test_np, y_pred, average="macro", zero_division=0), + "f1_score": f1_score(y_test_np, y_pred, average="macro"), + } + + mlflow.log_metrics(metrics) + + logger.info( + f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}" + ) + return metrics + + + def predict(self, X) -> ndarray: + """ + Make predictions for Multi-Label classification. + Returns Binary Matrix (Multi-Hot) where multiple classes can be 1. + + Args: + X (list): Input texts for prediction. + + Returns: + np.ndarray: Multi-Hot Encoded predictions (e.g., [[0, 1, 1, 0], ...]) + """ + + if self.model is None: + raise ValueError("Model is not trained. Call train() or load() before prediction.") + + # Set model to evaluation mode + self.model.eval() + + encodings = self._tokenize(X) + # Pass None as labels because we are in inference + dataset = CodeBERTaDataset(encodings, labels=None) + + use_fp16 = torch.cuda.is_available() + + training_args = TrainingArguments( + output_dir="./pred_temp", + per_device_eval_batch_size=self.params["batch_size_eval"], + fp16=use_fp16, + report_to="none", + no_cuda=not torch.cuda.is_available() + ) + + trainer = Trainer(model=self.model, args=training_args) + output = trainer.predict(dataset) + + # Clean up temporary prediction directory + if os.path.exists("./pred_temp"): + shutil.rmtree("./pred_temp") + + # Convert logits to probabilities + logits = output.predictions + probs = 1 / (1 + np.exp(-logits)) + + # Apply a threshold of 0.5 (if prob > 0.5, predict 1 else 0) + preds_binary = (probs > 0.5).astype(int) + + return preds_binary + + + def save(self, path, model_name): + """ + Save model locally and log to MLflow as artifact. + + Args: + path (str): Directory path to save the model. + model_name (str): Name for the saved model. + """ + + if self.model is None: + raise ValueError("Model is not trained. 
Cannot save uninitialized model.") + + # Local Saving + complete_path = os.path.join(path, f"{model_name}_{self.language}") + + # Remove existing directory if it exists + if os.path.exists(complete_path) and os.path.isdir(complete_path): + shutil.rmtree(complete_path) + + # Save model and tokenizer + logger.info(f"Saving model to: {complete_path}") + self.model.save_pretrained(complete_path) + self.tokenizer.save_pretrained(complete_path) + logger.info("Model saved locally.") + + try: + # Log to MLflow + logger.info("Logging artifacts to MLflow...") + mlflow.log_artifacts(local_dir=complete_path, artifact_path=f"{model_name}_{self.language}") + except Exception as e: + logger.error(f"Failed to log model artifacts to MLflow: {e}") + + + def load(self, model_path): + """ + Load model from a local path OR an MLflow URI. + + Args: + model_path (str): Local path or MLflow URI to load the model from. + """ + + logger.info(f"Loading model from: {model_path}") + local_model_path = model_path + + # Downloading model from MLflow and saving to local path + if model_path.startswith("models:/") or model_path.startswith("runs:/"): + try: + logger.info("Detected MLflow model URI. Attempting to load from MLflow...") + local_model_path = os.path.join(MODELS_DIR, "mlflow_temp_models") + local_model_path = mlflow.artifacts.download_artifacts(artifact_uri=model_path, dst_path=local_model_path) + logger.info(f"Model downloaded from MLflow to: {local_model_path}") + except Exception as e: + logger.error(f"Failed to load from MLflow: {e}") + raise e + + # Loading from local path + try: + if not os.path.exists(local_model_path): + raise FileNotFoundError(f"Model path not found: {local_model_path}") + + # Load tokenizer and model from local path + self.tokenizer = AutoTokenizer.from_pretrained(local_model_path) + self.model = AutoModelForSequenceClassification.from_pretrained( + local_model_path + ).to(self.device) + logger.info("Model loaded from local path successfully.") + + except Exception as e: + logger.error(f"Failed to load model from local path: {e}") + raise e + + # Set model to evaluation mode + self.model.eval() \ No newline at end of file diff --git a/turing/modeling/models/graphCodeBert.py b/turing/modeling/models/graphCodeBert.py new file mode 100644 index 0000000000000000000000000000000000000000..83ec9d1042c2c2dd9e7835e0576f3c09d051e61b --- /dev/null +++ b/turing/modeling/models/graphCodeBert.py @@ -0,0 +1,469 @@ +import os +import shutil +import warnings + +from loguru import logger +import mlflow +import numpy as np +from numpy import ndarray +from sklearn.metrics import ( + accuracy_score, + classification_report, + f1_score, + precision_score, + recall_score, +) +import torch +from torch.utils.data import Dataset +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) + +from turing.config import MODELS_DIR + +from ..baseModel import BaseModel + +warnings.filterwarnings("ignore") + + +def compute_metrics(eval_pred): + predictions, labels = eval_pred + + # Sigmoid function to convert logits to probabilities + probs = 1 / (1 + np.exp(-predictions)) + + # Apply threshold of 0.5 (becomes 1 if > 0.5, otherwise 0) + preds = (probs > 0.5).astype(int) + + # Calculate F1 score (macro average for multi-label) + f1 = f1_score(labels, preds, average="macro") + precision = precision_score(labels, preds, average="macro", zero_division=0) + recall = recall_score(labels, preds, average="macro", zero_division=0) + + return { 
+ "f1": f1, + "precision": precision, + "recall": recall, + } + + +class GraphCodeBERTDataset(Dataset): + """ + Internal Dataset class for GraphCodeBERT. + """ + + def __init__(self, encodings, labels=None, num_labels=None): + """ + Initialize the InternalDataset. + Args: + encodings (dict): Tokenized encodings. + labels (list or np.ndarray, optional): Corresponding labels. + num_labels (int, optional): Total number of classes. Required for auto-converting indices to one-hot. + """ + + self.encodings = {key: torch.tensor(val) for key, val in encodings.items()} + + if labels is not None: + if not isinstance(labels, (np.ndarray, torch.Tensor)): + labels = np.array(labels) + + # Case A: labels are indices (integers) + if num_labels is not None and ( + len(labels.shape) == 1 or (len(labels.shape) == 2 and labels.shape[1] == 1) + ): + labels_flat = labels.flatten() + + # Create one-hot encoded matrix + one_hot = np.zeros((len(labels_flat), num_labels), dtype=np.float32) + + # Set the corresponding index to 1 + valid_indices = labels_flat < num_labels + one_hot[valid_indices, labels_flat[valid_indices]] = 1.0 + + self.labels = torch.tensor(one_hot, dtype=torch.float) + + # Case B: labels are already vectors (e.g., One-Hot or Multi-Hot) + else: + self.labels = torch.tensor(labels, dtype=torch.float) + else: + self.labels = None + + def __getitem__(self, idx): + """ + Retrieve item at index idx. + + Args: + idx (int): Index of the item to retrieve. + + Returns: + dict: Dictionary containing input_ids, attention_mask, and labels (if available). + """ + + item = {key: val[idx] for key, val in self.encodings.items()} + if self.labels is not None: + item["labels"] = self.labels[idx] + return item + + def __len__(self): + """ + Return the length of the dataset. + + Returns: + int: Length of the dataset. + """ + + return len(self.encodings["input_ids"]) + + +class GraphCodeBERTClassifier(BaseModel): + """ + HuggingFace implementation of BaseModel for Code Comment Classification. + Uses GraphCodeBERT (microsoft/graphcodebert-base) for code understanding via data flow graphs. + """ + + def __init__(self, language, path=None): + """ + Initialize the GraphCodeBERT model with configuration parameters. + + Args: + language (str): Language for the model. + path (str, optional): Path to load a pre-trained model. Defaults to None. + """ + + self.params = { + "model_name_hf": "microsoft/graphcodebert-base", + "num_labels": 7 if language == "java" else 5 if language == "python" else 6, + "max_length": 256, + "epochs": 15, + "batch_size_train": 16, + "batch_size_eval": 64, + "learning_rate": 2e-5, + "weight_decay": 0.01, + "train_size": 0.8, + "early_stopping_patience": 3, + "early_stopping_threshold": 0.0, + "warmup_steps": 500, + "seed": 42, + } + + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.tokenizer = None + + super().__init__(language, path) + + def setup_model(self): + """ + Initialize the GraphCodeBERT tokenizer and model. 
+ """ + + logger.info(f"Initializing {self.params['model_name_hf']} on {self.device}...") + + self.tokenizer = AutoTokenizer.from_pretrained(self.params["model_name_hf"]) + self.model = AutoModelForSequenceClassification.from_pretrained( + self.params["model_name_hf"], + num_labels=self.params["num_labels"], + problem_type="multi_label_classification", + use_safetensors=True, # Force use of safetensors for security + ).to(self.device) + logger.info("GraphCodeBERT model initialized.") + + def _tokenize(self, texts): + """ + Helper to tokenize list of texts efficiently. + + Args: + texts (list): List of text strings to tokenize. + + Returns: + dict: Tokenized encodings. + """ + + safe_texts = [] + for t in texts: + if t is None: + safe_texts.append("") + elif isinstance(t, (int, float)): + if t != t: # NaN check + safe_texts.append("") + else: + safe_texts.append(str(t)) + else: + safe_texts.append(str(t)) + + return self.tokenizer( + safe_texts, truncation=True, padding=True, max_length=self.params["max_length"] + ) + + def train(self, X_train, y_train) -> dict[str, any]: + """ + Train the model using HF Trainer and log to MLflow. + + Args: + X_train (list): Training input texts. + y_train (list or np.ndarray): Training labels. + + Returns: + dict[str, any]: Dictionary of parameters used for training. + """ + + if self.model is None: + raise ValueError("Model is not initialized. Call setup_model() before training.") + + # log parameters to MLflow without model_name_hf + params_to_log = { + k: v for k, v in self.params.items() if k != "model_name_hf" and k != "num_labels" + } + + logger.info(f"Starting training for: {self.language.upper()}") + + # Prepare dataset (train/val split) + train_encodings = self._tokenize(X_train) + full_dataset = GraphCodeBERTDataset( + train_encodings, y_train, num_labels=self.params["num_labels"] + ) + train_size = int(self.params["train_size"] * len(full_dataset)) + val_size = len(full_dataset) - train_size + train_dataset, val_dataset = torch.utils.data.random_split( + full_dataset, [train_size, val_size] + ) + + temp_ckpt_dir = os.path.join(MODELS_DIR, "temp_checkpoints") + + use_fp16 = torch.cuda.is_available() + if not use_fp16: + logger.info("Mixed Precision (fp16) disabled because CUDA is not available.") + + training_args = TrainingArguments( + output_dir=temp_ckpt_dir, + num_train_epochs=self.params["epochs"], + per_device_train_batch_size=self.params["batch_size_train"], + per_device_eval_batch_size=self.params["batch_size_eval"], + learning_rate=self.params["learning_rate"], + weight_decay=self.params["weight_decay"], + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="f1", + greater_is_better=True, + save_total_limit=2, + logging_dir="./logs", + logging_steps=50, + fp16=use_fp16, + optim="adamw_torch", + report_to="none", + no_cuda=not torch.cuda.is_available(), + ) + + trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics, + callbacks=[ + EarlyStoppingCallback( + early_stopping_patience=self.params["early_stopping_patience"], + early_stopping_threshold=self.params["early_stopping_threshold"], + ) + ], + ) + trainer.train() + logger.info(f"Training for {self.language.upper()} completed.") + + if os.path.exists(temp_ckpt_dir): + shutil.rmtree(temp_ckpt_dir) + + return params_to_log + + def evaluate(self, X_test, y_test) -> dict[str, any]: + """ + Evaluate model on test data, return metrics and log to 
MLflow. + Handles automatic conversion of y_test to match multi-label prediction shape. + + Args: + X_test (list): Input test data. + y_test (list or np.ndarray): True labels for test data. + + Returns: + dict[str, any]: Dictionary of evaluation metrics. + """ + + # Obtain predictions + y_pred = self.predict(X_test) + + # Convert y_test to numpy array if needed + if not isinstance(y_test, (np.ndarray, torch.Tensor)): + y_test_np = np.array(y_test) + elif isinstance(y_test, torch.Tensor): + y_test_np = y_test.cpu().numpy() + else: + y_test_np = y_test + + num_labels = self.params["num_labels"] + is_multilabel_pred = y_pred.ndim == 2 and y_pred.shape[1] > 1 + is_flat_truth = (y_test_np.ndim == 1) or (y_test_np.ndim == 2 and y_test_np.shape[1] == 1) + + if is_multilabel_pred and is_flat_truth: + # Create a zero matrix + y_test_expanded = np.zeros((y_test_np.shape[0], num_labels), dtype=int) + + # Flatten y_test for iteration + indices = y_test_np.flatten() + + # Use indices to set the correct column to 1 + for i, label_idx in enumerate(indices): + idx = int(label_idx) + if 0 <= idx < num_labels: + y_test_expanded[i, idx] = 1 + + y_test_np = y_test_expanded + + # Generate classification report + report = classification_report(y_test_np, y_pred, zero_division=0) + print("\n" + "=" * 50) + print("CLASSIFICATION REPORT") + print(report) + print("=" * 50 + "\n") + + metrics = { + "accuracy": accuracy_score(y_test_np, y_pred), + "precision": precision_score(y_test_np, y_pred, average="macro", zero_division=0), + "recall": recall_score(y_test_np, y_pred, average="macro", zero_division=0), + "f1_score": f1_score(y_test_np, y_pred, average="macro", zero_division=0), + } + + mlflow.log_metrics(metrics) + + logger.info( + f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}" + ) + return metrics + + def predict(self, X) -> ndarray: + """ + Make predictions for Multi-Label classification. + Returns Binary Matrix (Multi-Hot) where multiple classes can be 1. + + Args: + X (list): Input texts for prediction. + + Returns: + np.ndarray: Multi-Hot Encoded predictions (e.g., [[0, 1, 1, 0], ...]) + """ + + if self.model is None: + raise ValueError("Model is not trained. Call train() or load() before prediction.") + + # Set model to evaluation mode + self.model.eval() + + encodings = self._tokenize(X) + # Pass None as labels because we are in inference + dataset = GraphCodeBERTDataset(encodings, labels=None) + + use_fp16 = torch.cuda.is_available() + + training_args = TrainingArguments( + output_dir="./pred_temp", + per_device_eval_batch_size=self.params["batch_size_eval"], + fp16=use_fp16, + report_to="none", + no_cuda=not torch.cuda.is_available(), + ) + + trainer = Trainer(model=self.model, args=training_args) + output = trainer.predict(dataset) + + # Clean up temporary prediction directory + if os.path.exists("./pred_temp"): + shutil.rmtree("./pred_temp") + + # Convert logits to probabilities + logits = output.predictions + probs = 1 / (1 + np.exp(-logits)) + + # Apply a threshold of 0.5 (if prob > 0.5, predict 1 else 0) + preds_binary = (probs > 0.5).astype(int) + + return preds_binary + + def save(self, path, model_name): + """ + Save model locally and log to MLflow as artifact. + + Args: + path (str): Directory path to save the model. + model_name (str): Name for the saved model. + """ + + if self.model is None: + raise ValueError("Model is not trained. 
Cannot save uninitialized model.") + + # Local Saving + complete_path = os.path.join(path, f"{model_name}_{self.language}") + + # Remove existing directory if it exists + if os.path.exists(complete_path) and os.path.isdir(complete_path): + shutil.rmtree(complete_path) + + # Save model and tokenizer + logger.info(f"Saving model to: {complete_path}") + self.model.save_pretrained(complete_path) + self.tokenizer.save_pretrained(complete_path) + logger.info("Model saved locally.") + + try: + # Log to MLflow + logger.info("Logging artifacts to MLflow...") + mlflow.log_artifacts( + local_dir=complete_path, artifact_path=f"{model_name}_{self.language}" + ) + except Exception as e: + logger.error(f"Failed to log model artifacts to MLflow: {e}") + + def load(self, model_path): + """ + Load model from a local path OR an MLflow URI. + + Args: + model_path (str): Local path or MLflow URI to load the model from. + """ + + logger.info(f"Loading model from: {model_path}") + local_model_path = model_path + + # Downloading model from MLflow and saving to local path + if model_path.startswith("models:/") or model_path.startswith("runs:/"): + try: + logger.info("Detected MLflow model URI. Attempting to load from MLflow...") + local_model_path = os.path.join(MODELS_DIR, "mlflow_temp_models") + local_model_path = mlflow.artifacts.download_artifacts( + artifact_uri=model_path, dst_path=local_model_path + ) + logger.info(f"Model downloaded from MLflow to: {local_model_path}") + except Exception as e: + logger.error(f"Failed to load from MLflow: {e}") + raise e + + # Loading from local path + try: + if not os.path.exists(local_model_path): + raise FileNotFoundError(f"Model path not found: {local_model_path}") + + # Load tokenizer and model from local path + self.tokenizer = AutoTokenizer.from_pretrained(local_model_path) + self.model = AutoModelForSequenceClassification.from_pretrained(local_model_path).to( + self.device + ) + logger.info("Model loaded from local path successfully.") + + except Exception as e: + logger.error(f"Failed to load model from local path: {e}") + raise e + + # Set model to evaluation mode + self.model.eval() diff --git a/turing/modeling/models/randomForestTfIdf.py b/turing/modeling/models/randomForestTfIdf.py new file mode 100644 index 0000000000000000000000000000000000000000..3e785f7e0c24f4bf7e01e292d06d0a47c59d92f6 --- /dev/null +++ b/turing/modeling/models/randomForestTfIdf.py @@ -0,0 +1,153 @@ +import warnings + +from loguru import logger +from numpy import ndarray +from sklearn.ensemble import RandomForestClassifier +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics import ( + accuracy_score, + classification_report, + f1_score, + precision_score, + recall_score, +) +from sklearn.model_selection import GridSearchCV +from sklearn.multioutput import MultiOutputClassifier +from sklearn.pipeline import Pipeline + +from ..baseModel import BaseModel + +warnings.filterwarnings("ignore") + + +class RandomForestTfIdf(BaseModel): + """ + Sklearn implementation of BaseModel with integrated Grid Search. + Builds a TF-IDF + RandomForest pipeline for multi-output text classification. + """ + + def __init__(self, language, path=None): + """ + Initialize the RandomForestTfIdf model with configuration parameters. + + Args: + language (str): Language for the model. + path (str, optional): Path to load a pre-trained model. Defaults to None. + If None, a new model is initialized. 
+ """ + + self.params = {"stop_words": "english", "random_state": 42, "cv_folds": 5} + + self.grid_params = { + "clf__estimator__n_estimators": [50, 100, 200], + "clf__estimator__max_depth": [None, 10, 20], + "tfidf__max_features": [3000, 5000, 8000], + } + + super().__init__(language, path) + + def setup_model(self): + """ + Initialize the scikit-learn pipeline with TF-IDF vectorizer and RandomForest classifier. + """ + + base_estimator = RandomForestClassifier( + random_state=self.params["random_state"], n_jobs=-1 + ) + + self.pipeline = Pipeline( + [ + ( + "tfidf", + TfidfVectorizer(ngram_range=(1, 2), stop_words=self.params["stop_words"]), + ), + ("clf", MultiOutputClassifier(base_estimator, n_jobs=-1)), + ] + ) + + self.model = self.pipeline + logger.info("Scikit-learn pipeline initialized.") + + def train(self, X_train, y_train) -> dict[str, any]: + """ + Train the model using Grid Search to find the best hyperparameters. + + Args: + X_train: Input training data. + y_train: True labels for training data. + """ + + if self.model is None: + raise ValueError( + "Model pipeline is not initialized. Call setup_model() before training." + ) + + logger.info(f"Starting training for: {self.language.upper()}") + logger.info("Performing Grid Search for best hyperparameters...") + grid_search = GridSearchCV( + self.pipeline, + param_grid=self.grid_params, + cv=self.params["cv_folds"], + scoring="f1_weighted", + n_jobs=-1, + verbose=1, + ) + grid_search.fit(X_train, y_train) + + logger.success(f"Best params found: {grid_search.best_params_}") + + parameters_to_log = { + "max_features": grid_search.best_params_["tfidf__max_features"], + "n_estimators": grid_search.best_params_["clf__estimator__n_estimators"], + "max_depth": grid_search.best_params_["clf__estimator__max_depth"], + } + + self.model = grid_search.best_estimator_ + logger.success(f"Training for {self.language.upper()} completed.") + + return parameters_to_log + + def evaluate(self, X_test, y_test) -> dict[str, any]: + """ + Evaluate model on test data and return metrics. + + Args: + X_test: Input test data. + y_test: True labels for test data. + """ + + y_pred = self.predict(X_test) + + report = classification_report(y_test, y_pred, zero_division=0) + print("\n" + "=" * 50) + print("CLASSIFICATION REPORT") + print(report) + print("=" * 50 + "\n") + + metrics = { + "accuracy": accuracy_score(y_test, y_pred), + "precision": precision_score(y_test, y_pred, average="macro", zero_division=0), + "recall": recall_score(y_test, y_pred, average="macro", zero_division=0), + "f1_score": f1_score(y_test, y_pred, average="weighted"), + } + + logger.info( + f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}" + ) + return metrics + + def predict(self, X) -> ndarray: + """ + Make predictions using the trained model. + + Args: + X: Input data for prediction. + + Returns: + Predictions made by the model. + """ + + if self.model is None: + raise ValueError("Model is not trained. Call train() or load() before prediction.") + + return self.model.predict(X) diff --git a/turing/modeling/models/tinyBert.py b/turing/modeling/models/tinyBert.py new file mode 100644 index 0000000000000000000000000000000000000000..8d76eb4dddf2e767bae4a5cced7c97328580b6da --- /dev/null +++ b/turing/modeling/models/tinyBert.py @@ -0,0 +1,441 @@ +""" +Ultra-lightweight multi-label text classification model for code comment analysis. 
+ +This module implements a specialized neural architecture combining TinyBERT +(15MB, 96 layers compressed) with a custom multi-label classification head. +Designed for efficient inference on resource-constrained environments while +maintaining competitive performance on code comment classification tasks. + +Architecture: + - Encoder: TinyBERT (prajjwal1/bert-tiny) + - Hidden dimension: 312 + - Classification layers: 312 -> 128 (ReLU) -> num_labels (Sigmoid) + - Regularization: Dropout(0.2) for preventing overfitting + - Loss function: Binary Cross-Entropy for multi-label classification + +Performance characteristics: + - Model size: ~15MB + - Inference latency: ~50ms per sample + - Memory footprint: ~200MB during training + - Supports multi-label outputs via sigmoid activation +""" + +from typing import List + +from loguru import logger +import numpy as np +from sklearn.preprocessing import MultiLabelBinarizer +import torch +from torch import nn +from torch.optim import Adam + +import turing.config as config +from turing.modeling.baseModel import BaseModel + +try: + from transformers import AutoModel, AutoTokenizer +except ImportError: + logger.error("transformers library required. Install with: pip install transformers torch") + + +class TinyBERTClassifier(BaseModel): + """ + Ultra-lightweight multi-label classifier for code comment analysis. + + Combines TinyBERT encoder with a custom classification head optimized for + multi-label code comment classification across Java, Python, and Pharo. + + Attributes: + device (torch.device): Computation device (CPU/GPU). + model (nn.ModuleDict): Container for encoder and classifier components. + tokenizer (AutoTokenizer): Hugging Face tokenizer for text preprocessing. + classifier (nn.Sequential): Custom multi-label classification head. + num_labels (int): Number of output classes per language. + labels_map (list): Mapping of label indices to semantic categories. + + References: + TinyBERT: https://huggingface.co/prajjwal1/bert-tiny + """ + + def __init__(self, language: str, path: str = None): + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + logger.info(f"TinyBERT using device: {self.device}") + self.model = None + self.tokenizer = None + self.classifier = None + self.mlb = MultiLabelBinarizer() + self.labels_map = config.LABELS_MAP.get(language, []) + self.num_labels = len(self.labels_map) + self.params = { + "model": "TinyBERT", + "model_size": "15MB", + "epochs": 15, + "batch_size": 8, + "learning_rate": 1e-3, + } + super().__init__(language=language, path=path) + + def setup_model(self): + """ + Initialize TinyBERT encoder and custom classification head. + + Loads the pre-trained TinyBERT model from Hugging Face model hub and + constructs a custom multi-label classification head with: + - Input: 312-dimensional encoder embeddings [CLS] token + - Hidden layer: 128 units with ReLU activation + - Dropout: 0.2 for regularization + - Output: num_labels units with Sigmoid activation + + Raises: + Exception: If model initialization fails due to network or missing dependencies. + """ + self._initialize_model() + + def _initialize_model(self): + """ + Initialize TinyBERT encoder and custom classification head. 
+ + Loads the pre-trained TinyBERT model from Hugging Face model hub and + constructs a custom multi-label classification head with: + - Input: 312-dimensional encoder embeddings [CLS] token + - Hidden layer: 128 units with ReLU activation + - Dropout: 0.2 for regularization + - Output: num_labels units with Sigmoid activation + + Raises: + Exception: If model initialization fails due to network or missing dependencies. + """ + try: + model_name = "prajjwal1/bert-tiny" + + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + encoder = AutoModel.from_pretrained(model_name) + encoder.to(self.device) + + hidden_dim = encoder.config.hidden_size + + self.classifier = nn.Sequential( + nn.Linear(hidden_dim, 128), + nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(128, self.num_labels), + nn.Sigmoid(), + ).to(self.device) + + self.model = nn.ModuleDict({"encoder": encoder, "classifier": self.classifier}) + + logger.success(f"Initialized TinyBERTClassifier for {self.language}") + logger.info(f"Model size: ~15MB | Labels: {self.num_labels}") + + except Exception as e: + logger.error(f"Error initializing model: {e}") + raise + + def train( + self, + X_train: List[str], + y_train: np.ndarray, + path: str = None, + model_name: str = "tinybert_classifier", + epochs: int = 15, + batch_size: int = 8, + learning_rate: float = 1e-3, + ) -> dict: + """ + Train the classifier using binary cross-entropy loss. + + Implements gradient descent optimization with adaptive learning rate scheduling. + Supports checkpoint saving for model persistence and recovery. + + Args: + X_train (List[str]): Training text samples (code comments). + y_train (np.ndarray): Binary label matrix of shape (n_samples, n_labels). + path (str, optional): Directory path for model checkpoint saving. + model_name (str): Identifier for saved model artifacts. + epochs (int): Number of complete training iterations. Default: 3. + batch_size (int): Number of samples per gradient update. Default: 16. + learning_rate (float): Adam optimizer learning rate. Default: 2e-5. + + Returns: + dict: Training configuration including hyperparameters and model metadata. + + Raises: + Exception: If training fails due to data inconsistency or resource exhaustion. 
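+
+        Example (illustrative sketch; `texts` and `labels` are assumed to be a
+        list of comment strings and a matching multi-hot label matrix):
+
+            >>> clf = TinyBERTClassifier(language="python")
+            >>> run_cfg = clf.train(texts, labels, epochs=3, batch_size=16)
+            >>> preds = clf.predict(texts[:2])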
+ """ + try: + if self.model is None: + self._initialize_model() + + optimizer = Adam(self.classifier.parameters(), lr=learning_rate) + criterion = nn.BCELoss() + + num_samples = len(X_train) + num_batches = (num_samples + batch_size - 1) // batch_size + + logger.info(f"Starting training: {epochs} epochs, {num_batches} batches per epoch") + + for epoch in range(epochs): + total_loss = 0.0 + + for batch_idx in range(num_batches): + start_idx = batch_idx * batch_size + end_idx = min(start_idx + batch_size, num_samples) + + batch_texts = X_train[start_idx:end_idx] + batch_labels = y_train[start_idx:end_idx] + + optimizer.zero_grad() + + tokens = self.tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=128, + return_tensors="pt", + ).to(self.device) + + with torch.no_grad(): + encoder_output = self.model["encoder"](**tokens) + cls_token = encoder_output.last_hidden_state[:, 0, :] + + logits = self.classifier(cls_token) + + labels_tensor = torch.tensor(batch_labels, dtype=torch.float32).to(self.device) + loss = criterion(logits, labels_tensor) + + loss.backward() + optimizer.step() + + total_loss += loss.item() + + avg_loss = total_loss / num_batches + logger.info(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f}") + + logger.success(f"Training completed for {self.language}") + + if path: + self.save(path, model_name) + + return { + "epochs": epochs, + "batch_size": batch_size, + "learning_rate": learning_rate, + "model_size_mb": 15, + } + + except Exception as e: + logger.error(f"Error training model: {e}") + raise + + def predict(self, texts: List[str], threshold: float = 0.3) -> np.ndarray: + """ + Generate multi-label predictions for code comments. + + Performs inference in evaluation mode without gradient computation. + Applies probability threshold to convert sigmoid outputs to binary labels. + + Args: + texts (List[str]): Code comment samples for classification. + threshold (float): Decision boundary for label assignment. Default: 0.5. + Values below threshold are mapped to 0, above to 1. + + Returns: + np.ndarray: Binary predictions matrix of shape (n_samples, n_labels). + + Raises: + ValueError: If model is not initialized. + Exception: If inference fails due to incompatible input dimensions. + """ + if self.model is None: + raise ValueError("Model not initialized. Train or load a model first.") + + self.model.eval() + predictions = [] + + # Convert various types to list: pandas Series, Dataset Column, etc. + if hasattr(texts, "tolist"): + texts = texts.tolist() + elif hasattr(texts, "__iter__") and not isinstance(texts, list): + texts = list(texts) + + try: + with torch.no_grad(): + tokens = self.tokenizer( + texts, padding=True, truncation=True, max_length=128, return_tensors="pt" + ).to(self.device) + + encoder_output = self.model["encoder"](**tokens) + cls_token = encoder_output.last_hidden_state[:, 0, :] + + logits = self.classifier(cls_token) + probabilities = logits.cpu().numpy() + + predictions = (probabilities > threshold).astype(int) + + return predictions + + except Exception as e: + logger.error(f"Error during prediction: {e}") + raise + + def evaluate(self, X_test: List[str], y_test: np.ndarray) -> dict: + """ + Evaluate classification performance on test set. 
+ + Computes per-label and macro-averaged metrics: + - Precision: TP / (TP + FP) - correctness of positive predictions + - Recall: TP / (TP + FN) - coverage of actual positive instances + - F1-Score: 2 * (P * R) / (P + R) - harmonic mean of precision and recall + - Accuracy: Per-sample exact match rate + + Args: + X_test (List[str]): Test text samples for evaluation. + y_test (np.ndarray): Ground truth binary label matrix or indices. + + Returns: + dict: Evaluation metrics including f1_score, precision, recall, accuracy. + + Raises: + Exception: If evaluation fails due to prediction errors. + """ + try: + predictions = self.predict(X_test) + + # Convert y_test to numpy array if needed + if not isinstance(y_test, (np.ndarray, torch.Tensor)): + y_test_np = np.array(y_test) + elif isinstance(y_test, torch.Tensor): + y_test_np = y_test.cpu().numpy() + else: + y_test_np = y_test + + # Handle conversion from flat indices to multi-hot encoding if needed + is_multilabel_pred = predictions.ndim == 2 and predictions.shape[1] > 1 + is_flat_truth = (y_test_np.ndim == 1) or ( + y_test_np.ndim == 2 and y_test_np.shape[1] == 1 + ) + + if is_multilabel_pred and is_flat_truth: + # Create zero matrix for multi-hot encoding + y_test_expanded = np.zeros((y_test_np.shape[0], self.num_labels), dtype=int) + indices = y_test_np.flatten() + + # Set columns to 1 based on indices + for i, label_idx in enumerate(indices): + idx = int(label_idx) + if 0 <= idx < self.num_labels: + y_test_expanded[i, idx] = 1 + + y_test_np = y_test_expanded + + tp = np.sum((predictions == 1) & (y_test_np == 1), axis=0) + fp = np.sum((predictions == 1) & (y_test_np == 0), axis=0) + fn = np.sum((predictions == 0) & (y_test_np == 1), axis=0) + + precision_per_label = tp / (tp + fp + 1e-10) + recall_per_label = tp / (tp + fn + 1e-10) + f1_per_label = ( + 2 + * (precision_per_label * recall_per_label) + / (precision_per_label + recall_per_label + 1e-10) + ) + + metrics = { + "f1_score": float(np.mean(f1_per_label)), + "precision": float(np.mean(precision_per_label)), + "recall": float(np.mean(recall_per_label)), + "accuracy": float(np.mean(predictions == y_test_np)), + } + + logger.info(f"Evaluation metrics: {metrics}") + return metrics + + except Exception as e: + logger.error(f"Error evaluating model: {e}") + raise + + def save(self, path: str, model_name: str = "tinybert_classifier"): + """ + Persist model artifacts including weights, tokenizer, and configuration. + + Saves the following components: + - classifier.pt: PyTorch state dictionary of classification head + - tokenizer configuration: Hugging Face tokenizer files + - config.json: Model metadata and label mappings + + Args: + path (str): Parent directory for model checkpoint storage. + model_name (str): Model identifier used as subdirectory name. + + Raises: + Exception: If file I/O or serialization fails. 
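+
+        Example (illustrative; `clf` is a trained TinyBERTClassifier and the
+        output directory is hypothetical):
+
+            >>> clf.save("models/tinybert_exp", model_name="tinybert_classifier")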
+ """ + try: + import os + + model_path = os.path.join(path, model_name) + os.makedirs(model_path, exist_ok=True) + + if self.classifier: + torch.save(self.classifier.state_dict(), os.path.join(model_path, "classifier.pt")) + + if self.tokenizer: + self.tokenizer.save_pretrained(model_path) + + config_data = { + "language": self.language, + "num_labels": self.num_labels, + "labels_map": self.labels_map, + "model_type": "tinybert_classifier", + "model_name": model_name, + } + + import json + + with open(os.path.join(model_path, "config.json"), "w") as f: + json.dump(config_data, f, indent=2) + + logger.success(f"Model saved to {model_path}") + + except Exception as e: + logger.error(f"Error saving model: {e}") + raise + + def load(self, path: str): + """ + Restore model state from checkpoint directory. + + Loads classifier weights from serialized PyTorch tensors and reinitializes + the tokenizer from saved configuration. Restores language-specific label + mappings from JSON metadata. + + Args: + path (str): Directory containing model checkpoint files. + + Raises: + Exception: If file not found or deserialization fails. + """ + try: + import json + import os + + self._initialize_model() + + classifier_path = os.path.join(path, "classifier.pt") + if os.path.exists(classifier_path): + self.classifier.load_state_dict( + torch.load(classifier_path, map_location=self.device) + ) + + config_path = os.path.join(path, "config.json") + if os.path.exists(config_path): + with open(config_path, "r") as f: + config_data = json.load(f) + self.language = config_data.get("language", self.language) + self.labels_map = config_data.get("labels_map", self.labels_map) + + logger.success(f"Model loaded from {path}") + + except Exception as e: + logger.error(f"Error loading model: {e}") + raise diff --git a/turing/modeling/predict.py b/turing/modeling/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..4304a04830035c8e3e50dc97cc67a955ecce1c77 --- /dev/null +++ b/turing/modeling/predict.py @@ -0,0 +1,195 @@ +import importlib +import warnings + +import dagshub +from loguru import logger +import mlflow +import numpy as np +import pandas as pd + +from turing.config import INPUT_COLUMN, LABELS_MAP, LANGS, MODEL_CONFIG, MODELS_DIR +from turing.dataset import DatasetManager +from turing.modeling.model_selector import get_best_model_info +from turing.modeling.models.codeBerta import CodeBERTa + + +class ModelInference: + # Model Configuration (Fallback Registry) + FALLBACK_MODEL_REGISTRY = { + "java": { + "run_id": "446f4459780347da8c796e619129be37", + "artifact": "fine-tuned-CodeBERTa_java", + "model_id": "codeberta", + }, + "python": { + "run_id": "ef5fd8ebf33a412087dcf02afd9e3147", + "artifact": "fine-tuned-CodeBERTa_python", + "model_id": "codeberta", + }, + "pharo": { + "run_id": "97822c6d84fc40c5b2363c9201a39997", + "artifact": "fine-tuned-CodeBERTa_pharo", + "model_id": "codeberta", + }, + } + + + def __init__(self, repo_owner="se4ai2526-uniba", repo_name="Turing", use_best_model_tags=True): + dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True) + warnings.filterwarnings("ignore") + self.dataset_manager = DatasetManager() + self.use_best_model_tags = use_best_model_tags + + # Initialize model registry based on configuration + if use_best_model_tags: + logger.info("Using MLflow tags to find best models") + + self.model_registry = {} + for lang in LANGS: + try: + model_info = get_best_model_info( + lang, fallback_registry=self.FALLBACK_MODEL_REGISTRY + ) + 
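+                    # Cache the resolved metadata (run_id, artifact, model_id) for this language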
self.model_registry[lang] = model_info + logger.info(f"Loaded model info for {lang}: {model_info}") + + # raise error if any required info is missing + if not all(k in model_info for k in ("run_id", "artifact", "model_id")): + raise ValueError(f"Incomplete model info for {lang}: {model_info}") + + except Exception as e: + logger.warning(f"Could not load model info for {lang}: {e}") + if lang in self.FALLBACK_MODEL_REGISTRY: + self.model_registry[lang] = self.FALLBACK_MODEL_REGISTRY[lang] + + # Pre-cache models locally + run_id = self.model_registry[lang]["run_id"] + artifact = self.model_registry[lang]["artifact"] + self._get_cached_model_path(run_id, artifact, lang) + else: + logger.info("Using hardcoded model registry") + self.model_registry = self.FALLBACK_MODEL_REGISTRY + + def _decode_predictions(self, raw_predictions, language: str): + """ + Converts the binary matrix from the model into human-readable labels. + + Args: + raw_predictions: Numpy array or similar with binary predictions + language: Programming language for label mapping + """ + + labels_map = LABELS_MAP.get(language, []) + decoded_results = [] + + # Ensure input is a numpy array for processing + if isinstance(raw_predictions, list): + raw_array = np.array(raw_predictions) + elif isinstance(raw_predictions, pd.DataFrame): + raw_array = raw_predictions.values + else: + raw_array = raw_predictions + + # Iterate over rows + for row in raw_array: + indices = np.where(row == 1)[0] + # Map indices to labels safely + row_labels = [labels_map[i] for i in indices if i < len(labels_map)] + decoded_results.append(row_labels) + + return decoded_results + + def _get_cached_model_path(self, run_id: str, artifact_name: str, language: str) -> str: + """Checks if model exists locally; if not, downloads it from MLflow.""" + # Define local path: models/mlflow_temp_models/language/artifact_name + local_path = MODELS_DIR / "mlflow_temp_models" / language / artifact_name + + if local_path.exists(): + logger.info(f"Loading {language} model from local cache: {local_path}") + return str(local_path) + + logger.info( + f"Model not found locally. Downloading {language} model from MLflow (Run ID: {run_id})..." + ) + + # Ensure parent directory exists + local_path.parent.mkdir(parents=True, exist_ok=True) + + # Download artifacts to the parent directory (artifact_name folder will be created inside) + mlflow.artifacts.download_artifacts( + run_id=run_id, artifact_path=artifact_name, dst_path=str(local_path.parent) + ) + logger.success(f"Model downloaded and cached at: {local_path}") + + return str(local_path) + + def predict_payload(self, texts: list[str], language: str): + """ + API Prediction: Automatically fetches the correct model from the registry based on language. + + Args: + texts: List of code comments to classify + language: Programming language + """ + + # 1. Validate Language and Fetch Config + if language not in self.model_registry: + raise ValueError( + f"Language '{language}' is not supported or the model is not configured." + ) + + model_config = self.model_registry[language] + run_id = model_config["run_id"] + artifact_name = model_config["artifact"] + model_id = model_config["model_id"] + + # Dynamically import model class + config_entry = MODEL_CONFIG[model_id] + module_name = config_entry["model_class_module"] + class_name = config_entry["model_class_name"] + module = importlib.import_module(module_name) + model_class = getattr(module, class_name) + + # 2. 
Get Model Path (Local Cache or Download) + model_path = self._get_cached_model_path(run_id, artifact_name, language) + + # Load Model + model = model_class(language=language, path=model_path) + + # 3. Predict + raw_predictions = model.predict(texts) + + # 4. Decode Labels + decoded_labels = self._decode_predictions(raw_predictions, language) + + return raw_predictions, decoded_labels, run_id, artifact_name + + def predict_from_mlflow( + self, mlflow_run_id: str, artifact_name: str, language: str, model_class=CodeBERTa + ): + """ + Legacy method for CML/CLI: Predicts on the test dataset stored on disk. + """ + # Load Dataset + try: + full_dataset = self.dataset_manager.get_dataset() + dataset_key = f"{language}_test" + if dataset_key not in full_dataset: + raise ValueError(f"Dataset key '{dataset_key}' not found.") + test_ds = full_dataset[dataset_key] + X_test = test_ds[INPUT_COLUMN] + except Exception as e: + logger.error(f"Error loading dataset: {e}") + raise e + + # Load Model (Local Cache or Download) + model_path = self._get_cached_model_path(mlflow_run_id, artifact_name, language) + model = model_class(language=language, path=model_path) + + raw_predictions = model.predict(X_test) + + # Decode output + readable_predictions = self._decode_predictions(raw_predictions, language) + + logger.info("Dataset prediction completed.") + return readable_predictions diff --git a/turing/modeling/train.py b/turing/modeling/train.py new file mode 100644 index 0000000000000000000000000000000000000000..cad1b0567f663bcf20c40af5ecdb17c28fa49fe7 --- /dev/null +++ b/turing/modeling/train.py @@ -0,0 +1,212 @@ +from importlib import import_module +import os +import warnings + +import dagshub +from loguru import logger +import mlflow +from mlflow.tracking import MlflowClient +import numpy as np +import typer + +import turing.config as config +from turing.dataset import DatasetManager +from turing.evaluate_model import evaluate_models + +dagshub.init(repo_owner="se4ai2526-uniba", repo_name="Turing", mlflow=True) + +warnings.filterwarnings("ignore") + +DEFAULT_MODEL = "codeberta" +_default_cfg = config.MODEL_CONFIG[DEFAULT_MODEL] + +MODEL_CLASS_MODULE = _default_cfg["model_class_module"] +MODEL_CLASS_NAME = _default_cfg["model_class_name"] +MODEL_CLASS = __import__(MODEL_CLASS_MODULE, fromlist=[MODEL_CLASS_NAME]) +MODEL_CLASS = getattr(MODEL_CLASS, MODEL_CLASS_NAME) +EXP_NAME = _default_cfg["exp_name"] +MODEL_NAME = _default_cfg["model_name"] + + + +app = typer.Typer() + + +def tag_best_models( + metric: str = "f1_score" +): + """ + Tag the best existing models in MLflow based on the specified metric. + Remove previous best_model tags before tagging the new best models. 
+ + Args: + metric: Metric to use for determining the best model + """ + + dagshub.init(repo_owner="se4ai2526-uniba", repo_name="Turing", mlflow=True) + client = MlflowClient() + + # Get all experiments from Mlflow + experiments = client.search_experiments() + if not experiments: + logger.error("No experiments found in MLflow") + return + + # Find the best run for each language + experiments_ids = [exp.experiment_id for exp in experiments] + for lang in config.LANGS: + # Get all runs for the language + runs = client.search_runs( + experiment_ids=experiments_ids, + filter_string=f"tags.Language = '{lang}'", + order_by=[f"metrics.{metric} DESC"] + ) + + if not runs: + logger.warning(f"No runs found for language {lang}") + continue + logger.info(f"Found {len(runs)} runs for {lang}") + + # Get the best run for the language + best_run = runs[0] + run_id = best_run.info.run_id + + # Remove previous best_model tags for this language + for run in runs[1:]: + try: + client.delete_tag(run.info.run_id, "best_model") + except Exception: + pass + + # Tag the best model + client.set_tag(run_id, "best_model", "true") + + +def show_tagged_models(): + """ + Show all models tagged as best_model. + """ + + dagshub.init(repo_owner="se4ai2526-uniba", repo_name="Turing", mlflow=True) + client = MlflowClient() + + # Get all experiments from Mlflow + experiments = client.search_experiments() + if not experiments: + logger.error("No experiments found in MLflow") + return + + # Find all runs tagged as best_model + runs = client.search_runs( + experiment_ids=[exp.experiment_id for exp in experiments], + filter_string="tags.best_model = 'true'", + order_by=["tags.Language ASC"] + ) + logger.info(f"\nFound {len(runs)} best models in experiments:\n") + + # Display details of each tagged best model + for run in runs: + language = run.data.tags.get("Language", "unknown") + exp_name = client.get_experiment(run.info.experiment_id).name + run_id = run.info.run_id + run_name = run.data.tags.get("mlflow.runName", "N/A") + dataset_name = run.data.tags.get("dataset_name", "unknown") + + logger.info(f"Language: {language}") + logger.info(f" Run: {exp_name}/{run_name} ({run_id})") + logger.info(f" Dataset: {dataset_name}") + + if run.data.metrics: + for metric in run.data.metrics: + logger.info(f" {metric}: {run.data.metrics[metric]:.4f}") + + logger.info("") + + +@app.command() +def main(model: str = typer.Option("codeberta", help="Model to train: codeberta, graphcodebert, tinybert, or randomforest"), dataset: str = typer.Option(None, help="Dataset to use for training")): + # Get model configuration from config + model_key = model.lower() + if model_key not in config.MODEL_CONFIG: + logger.error(f"Unknown model: {model_key}. 
Available models: {list(config.MODEL_CONFIG.keys())}") + return + + model_cfg = config.MODEL_CONFIG[model_key] + model_name = model_cfg["model_name"] + exp_name = model_cfg["exp_name"] + + # Dynamically import model class + module = import_module(model_cfg["model_class_module"]) + model_class = getattr(module, model_cfg["model_class_name"]) + + logger.info(f"Training model: {model_name}") + + # Load dataset + dataset_path = config.INTERIM_DATA_DIR / "features" / dataset + dataset_manager = DatasetManager(dataset_path=dataset_path) + try: + full_dataset = dataset_manager.get_dataset() + dataset_name = dataset_manager.get_dataset_name() + except Exception as e: + logger.error(f"Error loading dataset: {e}") + return + logger.info(f"Dataset loaded successfully: {dataset_name}") + + # Train and evaluate models for each language + mlflow.set_experiment(exp_name) + models = {} + for lang in config.LANGS: + # Prepare training and testing data + train_ds = full_dataset[f"{lang}_train"] + test_ds = full_dataset[f"{lang}_test"] + X_train = train_ds[config.INPUT_COLUMN] + y_train = train_ds[config.LABEL_COLUMN] + X_test = test_ds[config.INPUT_COLUMN] + y_test = test_ds[config.LABEL_COLUMN] + X_train = list(X_train) + X_test = list(X_test) + y_train = np.array(y_train) + + # Initialize model + model = model_class(language=lang) + + # Train and evaluate model within an MLflow run + try: + with mlflow.start_run(run_name=f"{model_name}_{lang}"): + mlflow.set_tag("Language", lang) + mlflow.set_tag("dataset_name", dataset_name) + mlflow.set_tag("model_id", model_key) + mlflow.log_params(model.params) + parameters_to_log = model.train( + X_train, + y_train + ) + mlflow.log_params(parameters_to_log) + model.save(os.path.join(config.MODELS_DIR, exp_name),model_name=model_name) + metrics = model.evaluate(X_test, y_test) + mlflow.log_metrics(metrics) + + # Log model name for later retrieval + mlflow.set_tag("model_name", f"{model_name}_{lang}") + + except Exception as e: + logger.error(f"Error training/evaluating model for {lang}: {e}") + return + + # Store trained model + models[lang] = model + logger.success(f"All {model_name} models trained and evaluated.") + + # Competition-style evaluation of trained models + logger.info("Starting competition-style evaluation of trained models...") + evaluate_models(models, full_dataset) + logger.success("Evaluation completed.") + + logger.info("Tagging best models in MLflow...") + tag_best_models() + logger.info("Best models:") + show_tagged_models() + + +if __name__ == "__main__": + app() diff --git a/turing/plots.py b/turing/plots.py new file mode 100644 index 0000000000000000000000000000000000000000..10f8e958ed9634b8c2aceaff6fb3bd6a8841a998 --- /dev/null +++ b/turing/plots.py @@ -0,0 +1,29 @@ +from pathlib import Path + +from loguru import logger +from tqdm import tqdm +import typer + +from turing.config import FIGURES_DIR, PROCESSED_DATA_DIR + +app = typer.Typer() + + +@app.command() +def main( + # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ---- + input_path: Path = PROCESSED_DATA_DIR / "dataset.csv", + output_path: Path = FIGURES_DIR / "plot.png", + # ----------------------------------------- +): + # ---- REPLACE THIS WITH YOUR OWN CODE ---- + logger.info("Generating plot from data...") + for i in tqdm(range(10), total=10): + if i == 5: + logger.info("Something happened for iteration 5.") + logger.success("Plot generation complete.") + # ----------------------------------------- + + +if __name__ == "__main__": + app() diff --git a/turing/reporting.py 
b/turing/reporting.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4fb88e672d38867d2b31f68a95c14c43a04f0f --- /dev/null +++ b/turing/reporting.py @@ -0,0 +1,173 @@ +from datetime import datetime +import platform +import sys +from typing import Optional + +from loguru import logger +import pandas as pd + +from turing.config import REPORTS_DIR + + +class TestReportGenerator: + """ + Handles the generation of structured Markdown reports specifically for test execution results. + """ + + def __init__(self, context_name: str, report_category: str): + self.context_name = context_name + self.report_category = report_category + self.timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + self.content = [] + self.output_dir = REPORTS_DIR / self.report_category + + def add_header(self, text: str, level: int = 1): + self.content.append(f"\n{'#' * level} {text}\n") + + def add_divider(self, style: str = "thin"): + """Add a visual divider line.""" + dividers = { + "thin": "---", + "thick": "___", + "section": "\n---\n", + } + self.content.append(f"\n{dividers.get(style, dividers['thin'])}\n") + + def add_code_block(self, content: str, language: str = ""): + """Add a code block.""" + self.content.append(f"\n```{language}\n{content}\n```\n") + + def add_alert_box(self, message: str, box_type: str = "info"): + """Add a styled alert box using blockquotes.""" + box_headers = { + "info": "INFO", + "success": "SUCCESS", + "warning": "WARNING", + "error": "ERROR", + } + header = box_headers.get(box_type, "INFO") + self.content.append(f"\n> **{header}**: {message}\n") + + def add_progress_bar(self, passed: int, total: int, width: int = 50): + """Add an ASCII progress bar.""" + if total == 0: + percentage = 0 + filled = 0 + else: + percentage = (passed / total * 100) + filled = int(width * passed / total) + + empty = width - filled + bar = "█" * filled + "░" * empty + self.add_code_block(f"Progress: [{bar}] {percentage:.1f}%\nPassed: {passed}/{total} tests", "") + + def add_summary_box(self, total: int, passed: int, failed: int, skipped: int = 0): + """Add a visually enhanced summary box.""" + success_rate = (passed / total * 100) if total > 0 else 0 + + # Determine status + if success_rate == 100: + status = "ALL TESTS PASSED" + elif success_rate >= 80: + status = "MOSTLY PASSED" + elif success_rate >= 50: + status = "PARTIAL SUCCESS" + else: + status = "NEEDS ATTENTION" + + self.add_header("Executive Summary", level=2) + self.add_text(f"**Overall Status:** {status}") + self.add_text(f"**Success Rate:** {success_rate:.1f}%") + + # Summary table + summary_data = [ + ["Total Tests", str(total)], + ["Passed", str(passed)], + ["Failed", str(failed)], + ] + + if skipped > 0: + summary_data.append(["Skipped", str(skipped)]) + + summary_data.append(["Success Rate", f"{success_rate:.1f}%"]) + + df = pd.DataFrame(summary_data, columns=["Metric", "Count"]) + self.add_dataframe(df, title=None, align=("left", "right")) + + # Progress bar + self.add_text("**Visual Progress:**") + self.add_progress_bar(passed, total) + + def add_environment_metadata(self): + """Add enhanced environment metadata.""" + self.add_header("Environment Information", level=2) + + metadata = [ + ["Timestamp", datetime.now().strftime("%Y-%m-%d %H:%M:%S")], + ["Context", self.context_name.upper()], + ["Python Version", sys.version.split()[0]], + ["Platform", platform.platform()], + ["Architecture", platform.machine()], + ] + df = pd.DataFrame(metadata, columns=["Parameter", "Value"]) + self.add_dataframe(df, title=None, 
align=("left", "left")) + + def add_text(self, text: str): + self.content.append(f"\n{text}\n") + + def add_category_stats(self, df: pd.DataFrame, category: str): + """Add statistics for a test category.""" + total = len(df) + passed = len(df[df['Result'] == "PASS"]) + failed = len(df[df['Result'] == "FAIL"]) + skipped = len(df[df['Result'] == "SKIP"]) + + stats = [ + ["Total", str(total)], + ["Passed", f"{passed} ({passed/total*100:.1f}%)" if total > 0 else "0"], + ["Failed", f"{failed} ({failed/total*100:.1f}%)" if total > 0 else "0"], + ] + + if skipped > 0: + stats.append(["Skipped", f"{skipped} ({skipped/total*100:.1f}%)"]) + + stats_df = pd.DataFrame(stats, columns=["Status", "Count"]) + self.add_dataframe(stats_df, title="Statistics", align=("left", "right")) + + def add_dataframe(self, df: pd.DataFrame, title: Optional[str] = None, align: tuple = None): + """Add a formatted dataframe table.""" + if title: + self.add_header(title, level=3) + + if df.empty: + self.content.append("\n_No data available._\n") + return + + try: + if not align: + align = tuple(["left"] * len(df.columns)) + + table_md = df.to_markdown(index=False, tablefmt="pipe", colalign=align) + self.content.append(f"\n{table_md}\n") + except Exception as e: + logger.warning(f"Tabulate error: {e}. Using simple text.") + self.content.append(f"\n```text\n{df.to_string(index=False)}\n```\n") + + def save(self, filename: str = "test_report.md") -> str: + """Save the report to a file.""" + try: + self.output_dir.mkdir(parents=True, exist_ok=True) + file_path = self.output_dir / filename + + # Add footer + self.add_divider("section") + self.add_text(f"*Report generated on {datetime.now().strftime('%Y-%m-%d at %H:%M:%S')}*") + self.add_text("*Powered by Turing Test Suite*") + + with open(file_path, "w", encoding="utf-8") as f: + f.write("\n".join(self.content)) + logger.info(f"Test report saved: {file_path}") + return str(file_path) + except Exception as e: + logger.error(f"Save failed: {e}") + raise diff --git a/turing/tests/behavioral/test_directional.py b/turing/tests/behavioral/test_directional.py new file mode 100644 index 0000000000000000000000000000000000000000..d82d16743916763a64e603224e78f0e693660fc2 --- /dev/null +++ b/turing/tests/behavioral/test_directional.py @@ -0,0 +1,183 @@ +# These tests check that adding or removing keywords logically changes the prediction + + +def test_java_directional_add_deprecation(java_model, get_predicted_labels): + """Tests that adding '@deprecated' ADDs the 'deprecation' label""" + # Base comment should be a 'Pointer' due to the link + base_comment = "/** Use {@link #newUserMethod()} instead. */" + # Perturbed comment adds a keyword + pert_comment = "/** @deprecated Use {@link #newUserMethod()} instead. */" + + preds_base = get_predicted_labels(java_model, base_comment, "java") + preds_pert = get_predicted_labels(java_model, pert_comment, "java") + + # The base comment should not have 'deprecation' + assert "deprecation" not in preds_base + # The perturbed comment must have 'deprecation' + assert "deprecation" in preds_pert + # The original 'Pointer' label should still be there + assert "Pointer" in preds_base + assert "Pointer" in preds_pert + + +def test_python_directional_remove_todo(python_model, get_predicted_labels): + """Tests that removing 'TODO' REMOVES the 'DevelopmentNotes' labe.""" + base_comment = "# TODO: Refactor this entire block." + pert_comment = "# Refactor this entire block." 
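+
+    # Only the 'TODO:' keyword is removed between the two variants; the perturbed
+    # prediction is expected to drop the DevelopmentNotes label as a result.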
+ + preds_base = get_predicted_labels(python_model, base_comment, "python") + preds_pert = get_predicted_labels(python_model, pert_comment, "python") + + # The base comment must have 'DevelopmentNotes' + assert "DevelopmentNotes" in preds_base + # The perturbed comment must not have 'DevelopmentNotes' + assert "DevelopmentNotes" not in preds_pert + + +def test_pharo_directional_add_responsibility(pharo_model, get_predicted_labels): + """Tests that adding 'i am responsible for' adds the 'Responsibilities' label""" + base_comment = '"i am a simple arrow"' + pert_comment = '"i am a simple arrow. i am responsible for drawing."' + + preds_base = get_predicted_labels(pharo_model, base_comment, "pharo") + preds_pert = get_predicted_labels(pharo_model, pert_comment, "pharo") + + # base comment should have 'Intent' + assert "Intent" in preds_base + # base comment should not have 'Responsibilities' + assert "Responsibilities" not in preds_base + # perturbed comment must have 'Responsibilities' + assert "Responsibilities" in preds_pert + # original 'Intent' label should still be there + assert "Intent" in preds_pert + + +def test_java_directional_contrast_rational(java_model, get_predicted_labels): + """ + Tests that adding a design rationale adds the 'rational' label + """ + # Base comment is a simple summary + base_comment = "/** Returns the user ID. */" + # Perturbed comment adds a design rationale + pert_comment = "/** Returns the user ID. This is cached for performance. */" + + preds_base = get_predicted_labels(java_model, base_comment, "java") + preds_pert = get_predicted_labels(java_model, pert_comment, "java") + + # Base comment should be a 'summary' + assert "summary" in preds_base + # Base comment should not have 'rational' + assert "rational" not in preds_base + # Perturbed comment must now have 'rational' + assert "rational" in preds_pert + # Perturbed comment should ideally still be a 'summary' + assert "summary" in preds_pert + + +def test_python_directional_contrast_todo(python_model, get_predicted_labels): + """ + Tests that adding a "TODO" clause adds the 'DevelopmentNotes' label + """ + # Base comment is a simple summary + base_comment = "Fetches the user profile." + # Perturbed comment adds a development note + pert_comment = "Fetches the user profile. TODO: This is deprecated." 
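+
+    # The perturbed comment keeps the original summary sentence and appends a
+    # TODO clause, so both Summary and DevelopmentNotes are expected for it.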
+ + preds_base = get_predicted_labels(python_model, base_comment, "python") + preds_pert = get_predicted_labels(python_model, pert_comment, "python") + + # Base comment should be a 'Summary' + assert "Summary" in preds_base + # Base comment should not have 'DevelopmentNotes' + assert "DevelopmentNotes" not in preds_base + # Perturbed comment must now have 'DevelopmentNotes' + assert "DevelopmentNotes" in preds_pert + # Perturbed comment should ideally still be a 'Summary' + assert "Summary" in preds_pert + + +def test_pharo_directional_contrast_collaborators(pharo_model, get_predicted_labels): + """ + Tests that adding a 'but i work with' clause adds the 'Collaborators' label + """ + # Base comment is a simple intent + base_comment = '"i am a simple arrow like arrowhead."' + pert_comment = '"i am a simple arrow, but i work with BlSpace to position."' + + preds_base = get_predicted_labels(pharo_model, base_comment, "pharo") + preds_pert = get_predicted_labels(pharo_model, pert_comment, "pharo") + + # Base comment should be 'Intent' + assert "Intent" in preds_base + # Base comment should not have 'Collaborators' + assert "Collaborators" not in preds_base + # Perturbed comment must now have 'Collaborators' + assert "Collaborators" in preds_pert + # Perturbed comment should ideally still have 'Intent' + assert "Intent" in preds_pert + + +def test_java_directional_shift_summary_to_expand(java_model, get_predicted_labels): + """ + Tests that replacing a simple 'summary' with an 'Expand' implementation note + shifts the primary classification from 'summary' to 'Expand' + """ + # Base comment is a simple summary + base_comment = "/** Returns the user ID. */" + # Perturbed comment shifts the focus entirely to implementation details + pert_comment = "/** Implementation Note: This delegates to the old system. */" + + preds_base = get_predicted_labels(java_model, base_comment, "java") + preds_pert = get_predicted_labels(java_model, pert_comment, "java") + + # Base comment must have 'summary' + assert "summary" in preds_base + # Perturbed comment must not have 'summary' + assert "summary" not in preds_pert + # Perturbed comment must now have 'Expand' + assert "Expand" in preds_pert + + +def test_python_directional_shift_summary_to_devnotes(python_model, get_predicted_labels): + """ + Tests that replacing a 'Summary' with a critical development note (deprecated) + shifts the classification from 'Summary' to 'DevelopmentNotes' + """ + print(f"\n[DEBUG] Oggetto modello Python: {python_model}, Lingua: {python_model.language}") + # Base comment is a clear Summary + base_comment = "Fetches the user profile." + # Perturbed comment shifts the focus entirely to a note about future work + pert_comment = "DEPRECATED: This function is scheduled for removal in v2.0." 
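+
+    # Here the whole comment is replaced rather than extended, so the primary
+    # classification is expected to shift from Summary to DevelopmentNotes.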
+ + preds_base = get_predicted_labels(python_model, base_comment, "python") + preds_pert = get_predicted_labels(python_model, pert_comment, "python") + + # Base comment must have 'Summary' + assert "Summary" in preds_base + # Perturbed comment must not have 'Summary' + assert "Summary" not in preds_pert + # Perturbed comment must now have 'DevelopmentNotes' + assert "DevelopmentNotes" in preds_pert + + +def test_pharo_directional_shift_to_example(pharo_model, get_predicted_labels): + """ + Tests that changing a comment from a 'Responsibility' statement to an + explicit 'Example' statement shifts the primary classification + """ + # Base comment is a clear 'Responsibilities' + base_comment = '"i provide a data structure independent api"' + # Perturbed comment replaces the responsibility claim with an explicit example pattern + pert_comment = '"[Example] run the data structure independent api."' + + preds_base = get_predicted_labels(pharo_model, base_comment, "pharo") + preds_pert = get_predicted_labels(pharo_model, pert_comment, "pharo") + + # Base comment msut have Responsibilities + assert "Responsibilities" in preds_base + # Base comment should not have Example + assert "Example" not in preds_base + # Perturbed comment must now have Example + assert "Example" in preds_pert + # Perturbed comment should not have Responsibilities + assert "Responsibilities" not in preds_pert diff --git a/turing/tests/behavioral/test_invariance.py b/turing/tests/behavioral/test_invariance.py new file mode 100644 index 0000000000000000000000000000000000000000..fe85fdb484ca2c6d1d1db8284b020826c2e23a88 --- /dev/null +++ b/turing/tests/behavioral/test_invariance.py @@ -0,0 +1,117 @@ +import pytest + +# These tests check that "noise" (like capitalization or punctuation) does not change the prediction + + +@pytest.mark.parametrize( + "comment", + [ + ":param user_id: The ID of the user.", # Base + ":PARAM USER_ID: THE ID OF THE USER.", # Uppercase + " :param user_id: The ID of the user . ", # Whitespace + ":param user_id: The ID of the user!!!", # Punctuation + ], +) +def test_python_invariance_parameters(python_model, comment, get_predicted_labels): + """Tests that noise doesn't break ':param' detection.""" + expected = {"Parameters"} + preds = get_predicted_labels(python_model, comment, "python") + assert preds == expected + + +def test_java_invariance_deprecation(java_model, get_predicted_labels): + """Tests that noise doesn't break '@deprecated' detection""" + base_comment = "/** @deprecated Use newUserMethod() */" + pert_comment = "/** @DEPRECATED... Use newUserMethod()!!! */" + + preds_base = get_predicted_labels(java_model, base_comment, "java") + preds_pert = get_predicted_labels(java_model, pert_comment, "java") + + assert {"deprecation"} <= preds_base + assert preds_base == preds_pert + + +def test_python_invariance_summary(python_model, get_predicted_labels): + """Tests that noise doesn't break a simple 'Summary' detection""" + + base_comment = "a service specific account of type bar." + expected = {"Summary"} + + # Perturbations + variants = [ + base_comment, + "A SERVICE SPECIFIC ACCOUNT OF TYPE BAR.", + " a service specific account of type bar. 
", + "a service specific account of type bar!!!", + ] + + for comment in variants: + preds = get_predicted_labels(python_model, comment, "python") + assert preds == expected + + +def test_pharo_invariance_intent(pharo_model, get_predicted_labels): + """Tests that noise doesn't break Pharo's 'Intent' detection""" + + base_comment = '"i am a simple arrow like arrowhead."' + expected = {"Intent"} + + # Perturbations + variants = [ + base_comment, + '"I AM A SIMPLE ARROW LIKE ARROWHEAD."', + ' "i am a simple arrow like arrowhead." ', + '"i am a simple arrow like arrowhead !!"', # + ] + + for comment in variants: + preds = get_predicted_labels(pharo_model, comment, "pharo") + assert preds == expected + + +def test_python_invariance_typos_parameters(python_model, get_predicted_labels): + """ + Tests typo tolerance + + """ + + # Define the single expected outcome + expected_labels = {"Parameters"} + + # Define the base case and all its variants (with typos) + variants = [ + ":param user_id: The ID of the user.", + ":paramater user_id: The ID of the user.", + ":pram user_id: The ID of teh user.", + ] + + # Loop through all variants and assert they all produce the *exact* expected outcome + for comment in variants: + preds = get_predicted_labels(python_model, comment, "python") + assert preds == expected_labels + + +def test_java_invariance_semantic_summary(java_model, get_predicted_labels): + """ + Tests semantic invariance + + """ + + # Get the prediction for the base comment + base_comment = "/** Returns the user ID. */" + base_preds = get_predicted_labels(java_model, base_comment, "java") + + # Define semantic paraphrases of the base comment + variants = [ + base_comment, + "/** Gets the user ID. */", + "/** Fetches the ID for the user. */", + "/** A method to return the user's ID. */", + ] + + # Check that the base prediction is valid (summary) + assert "summary" in base_preds + + for comment in variants: + preds = get_predicted_labels(java_model, comment, "java") + assert preds == base_preds diff --git a/turing/tests/behavioral/test_minimum_functionality.py b/turing/tests/behavioral/test_minimum_functionality.py new file mode 100644 index 0000000000000000000000000000000000000000..f088e7656bb98d67aaebbdfc1bc8da37a1dc5e74 --- /dev/null +++ b/turing/tests/behavioral/test_minimum_functionality.py @@ -0,0 +1,52 @@ +import pytest + +# These tests check for basic, obvious classifications + + +@pytest.mark.parametrize( + "comment, expected_labels", + [ + ("test getfilestatus and related listing operations.", {"summary"}), + ("/* @deprecated Use something else. */", {"deprecation"}), + ("code source of this file http grepcode.com", {"Pointer"}), + ("this is balanced if each pool is balanced.", {"rational"}), + ("// For internal use only.", {"Ownership"}), + ("this impl delegates to the old filesystem", {"Expand"}), + ("/** Usage: new MyClass(arg1). 
*/", {"usage"}), + ], +) +def test_java_mft(java_model, comment, expected_labels, get_predicted_labels): + preds = get_predicted_labels(java_model, comment, "java") + assert preds == expected_labels + + +@pytest.mark.parametrize( + "comment, expected_labels", + [ + ("a service specific account of type bar.", {"Summary"}), + (":param user_id: The ID of the user.", {"Parameters"}), + ("# TODO: Refactor this entire block.", {"DevelopmentNotes"}), + ("use this class if you want access to all of the mechanisms", {"Usage"}), + ("# create a new list by filtering duplicates from the input", {"Expand"}), + ], +) +def test_python_mft(python_model, comment, expected_labels, get_predicted_labels): + preds = get_predicted_labels(python_model, comment, "python") + assert preds == expected_labels + + +@pytest.mark.parametrize( + "comment, expected_labels", + [ + ("i am a simple arrow like arrowhead.", {"Intent"}), + ("the example below shows how to create a simple element", {"Example"}), + ("i provide a data structure independent api", {"Responsibilities"}), + ("the cache is cleared after each test to ensure isolation.", {"Keyimplementationpoints"}), + ("it is possible hovewer to customize a length fraction", {"Keymessages"}), + ("collaborators: BlElement, BlSpace", {"Collaborators"}), + ], +) +def test_pharo_mft(pharo_model, comment, expected_labels, get_predicted_labels): + """Tests basic keyword-to-label mapping for Pharo (e.g., 'I am...').""" + preds = get_predicted_labels(pharo_model, comment, "pharo") + assert preds == expected_labels diff --git a/turing/tests/conftest.py b/turing/tests/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..16c5c6e5a26b65bf0e7651248082882a7ffea364 --- /dev/null +++ b/turing/tests/conftest.py @@ -0,0 +1,305 @@ +import os +from pathlib import Path +import sys + +import numpy as np +import pandas as pd +import pytest + +import turing.config as config +from turing.dataset import DatasetManager +from turing.reporting import TestReportGenerator + +# --- Path Setup --- +script_dir = os.path.dirname(os.path.abspath(__file__)) +proj_root = os.path.dirname(os.path.dirname(script_dir)) +sys.path.append(proj_root) + +train_dir = os.path.join(proj_root, "turing", "modeling") +sys.path.insert(1, train_dir) + + +try: + # Import train.py + import turing.modeling.train as train +except ImportError as e: + pytest.skip( + f"Could not import 'train.py'. Check sys.path. Error: {e}", allow_module_level=True + ) + +# --- Reporting Setup --- +execution_results = [] +active_categories = set() + + +def clean_test_name(nodeid): + """Pulisce il nome del test rimuovendo parametri lunghi.""" + parts = nodeid.split("::") + test_name = parts[-1] + if len(test_name) > 50: + test_name = test_name[:47] + "..." + return test_name + + +def format_error_message(long_repr): + """Estrae solo l'errore principale.""" + if not long_repr: + return "" + lines = str(long_repr).split("\n") + last_line = lines[-1] + clean_msg = last_line.replace("|", "-").strip() + if len(clean_msg) > 60: + clean_msg = clean_msg[:57] + "..." 
+ return clean_msg + + +@pytest.hookimpl(tryfirst=True, hookwrapper=True) +def pytest_runtest_makereport(item, call): + outcome = yield + report = outcome.get_result() + + if report.when == "call": + path_str = str(item.fspath) + category = "GENERAL" + + if "unit" in path_str: + category = "UNIT" + elif "behavioral" in path_str: + category = "BEHAVIORAL" + elif "modeling" in path_str: + category = "MODELING" + + active_categories.add(category) + + # Simplified status mapping + status_map = {"passed": "PASS", "failed": "FAIL", "skipped": "SKIP"} + status_str = status_map.get(report.outcome, report.outcome.upper()) + + execution_results.append( + { + "Category": category, + "Module": item.fspath.basename, + "Test Case": clean_test_name(item.nodeid), + "Result": status_str, + "Time": f"{report.duration:.2f}s", + "Message": format_error_message(report.longrepr) if report.failed else "", + } + ) + + +def pytest_sessionfinish(session, exitstatus): + """Generate enhanced test report at session end.""" + if not execution_results: + return + + report_type = ( + f"{list(active_categories)[0].lower()}_tests" + if len(active_categories) == 1 + else "unit_and_behavioral_tests" + ) + + try: + manager = TestReportGenerator(context_name="turing", report_category=report_type) + + # Main title + manager.add_header("Turing Test Execution Report") + manager.add_divider("section") + + # Environment info + manager.add_environment_metadata() + manager.add_divider("thin") + + df = pd.DataFrame(execution_results) + + # Sommario + total = len(df) + passed = len(df[df["Result"] == "[ PASS ]"]) + failed = len(df[df["Result"] == "[ FAILED ]"]) + summary = pd.DataFrame( + [ + { + "Total": total, + "Passed": passed, + "Failed": failed, + "Success Rate": f"{(passed / total) * 100:.1f}%", + } + ] + ) + manager.add_dataframe(summary, title="Executive Summary") + + # Detailed breakdown by category + cols = ["Module", "Test Case", "Result", "Time", "Message"] + + if len(active_categories) > 1: + manager.add_header("Detailed Test Results by Category", level=2) + manager.add_divider("thin") + + for cat in sorted(active_categories): + subset = df[df["Category"] == cat][cols] + manager.add_dataframe(subset, title=f"{cat} Tests") + else: + manager.add_alert_box( + "All tests passed successfully!", + box_type="success" + ) + + manager.save("report.md") + except Exception as e: + print(f"\nError generating report: {e}") + + +# --- Fixtures --- + + +@pytest.fixture(scope="function") +def manager() -> DatasetManager: + """ + Provides a instance of DatasetManager for each test. + """ + return DatasetManager() + + +@pytest.fixture(scope="function") +def fake_csv_data_dir(tmp_path: Path) -> Path: + """ + Creates a temporary directory structure mocking 'data/interim/features/clean-aug-soft-k5000' + and populates it with minimal, valid CSV files for testing. + + Returns: + Path: The path to the *parent* of 'features' (e.g., the mocked INTERIM_DATA_DIR). 
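+
+    Example (illustrative; a hypothetical test that points DatasetManager at the
+    mocked features directory):
+
+        >>> def test_load(fake_csv_data_dir):
+        ...     dm = DatasetManager(dataset_path=fake_csv_data_dir / "features" / "clean-aug-soft-k5000")
+        ...     data = dm.get_dataset()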
+ """ + interim_dir = tmp_path / "interim_test" + features_dir = interim_dir / "features" / "clean-aug-soft-k5000" + features_dir.mkdir(parents=True, exist_ok=True) + + # Define minimal valid CSV content + csv_content = ( + "combo,labels\n" + '"java code text","[1, 0, 0, 0, 0, 0, 0]"\n' + '"other java code","[0, 1, 0, 0, 0, 0, 0]"\n' + ) + + # Write mock files + (features_dir / "java_train.csv").write_text(csv_content) + (features_dir / "java_test.csv").write_text(csv_content) + + # Return the root of the mocked interim directory + return interim_dir + + +@pytest.fixture(scope="session") +def mock_data(): + """ + Provides a minimal, consistent, session-scoped dataset for model testing. + This simulates the (X, y) data structure used for training and evaluation. + """ + X = [ + "this is java code for summary", + "python is great for parameters", + "a java example for usage", + "running python script for development notes", + "pharo is a language for intent", + "another java rational example", + ] + + # Mock labels for a 'java' model (7 categories) + # Shape (6 samples, 7 features) + y = np.array( + [ + [1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 0, 0], + [0, 0, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 1, 0, 0], + [1, 0, 0, 0, 0, 0, 1], + ] + ) + return {"X": X, "y": y} + + +@pytest.fixture(scope="module") +def trained_rf_model(mock_data, tmp_path_factory): + """ + Provides a fully-trained RandomForestTfIdf model instance. + """ + # Import locally to ensure proj_root is set + from modeling.models.randomForestTfIdf import RandomForestTfIdf + + # Arrange + model = RandomForestTfIdf(language="java") + + # Monkeypatch grid search parameters for maximum speed + model.grid_params = { + "tfidf__max_features": [10, 20], # Use minimal features + "clf__estimator__n_estimators": [2, 5], # Use minimal trees + } + model.params["cv_folds"] = 2 # Use minimal CV folds + + # Create a persistent temp dir for this module's run + model_path = tmp_path_factory.mktemp("trained_rf_model") + + # Act: Train the model + model.train(mock_data["X"], mock_data["y"], path=str(model_path), model_name="test_model") + + # Yield the trained model and its save path + yield model, model_path + + +MODEL_CLASS_TO_TEST = train.MODEL_CLASS +MODEL_EXPERIMENT_NAME = train.EXP_NAME +MODEL_NAME_BASE = train.MODEL_NAME + + +@pytest.fixture(scope="session") +def get_predicted_labels(): + def _helper(model, comment_sentence: str, lang: str) -> set: + if config.INPUT_COLUMN == "combo": + combo_input = f"DummyClass.{lang} | {comment_sentence}" + input_data = [combo_input] + else: + input_data = [comment_sentence] + + prediction_array = model.predict(input_data)[0] + labels_map = config.LABELS_MAP[lang] + predicted_labels = {labels_map[i] for i, val in enumerate(prediction_array) if val == 1} + return predicted_labels + + return _helper + + +@pytest.fixture(scope="module") +def java_model(): + """Loads the Java model from the config path""" + model_path = os.path.join(config.MODELS_DIR, MODEL_EXPERIMENT_NAME, f"{MODEL_NAME_BASE}_java") + if not os.path.exists(model_path): + pytest.skip( + "Production model not found. 
Skipping behavioral tests for Java.", + allow_module_level=True, + ) + return MODEL_CLASS_TO_TEST(language="java", path=model_path) + + +@pytest.fixture(scope="module") +def python_model(): + """Loads the Python model from the config path""" + model_path = os.path.join( + config.MODELS_DIR, MODEL_EXPERIMENT_NAME, f"{MODEL_NAME_BASE}_python" + ) + if not os.path.exists(model_path): + pytest.skip( + "Production model not found. Skipping behavioral tests for Python.", + allow_module_level=True, + ) + return MODEL_CLASS_TO_TEST(language="python", path=model_path) + + +@pytest.fixture(scope="module") +def pharo_model(): + """Loads the Pharo model from the config path""" + model_path = os.path.join(config.MODELS_DIR, MODEL_EXPERIMENT_NAME, f"{MODEL_NAME_BASE}_pharo") + if not os.path.exists(model_path): + pytest.skip( + "Production model not found. Skipping behavioral tests for Pharo.", + allow_module_level=True, + ) + return MODEL_CLASS_TO_TEST(language="pharo", path=model_path) diff --git a/turing/tests/unit/test_api.py b/turing/tests/unit/test_api.py new file mode 100644 index 0000000000000000000000000000000000000000..27d7dccf645c0c12bfd920cb86ea02ded61e3f7c --- /dev/null +++ b/turing/tests/unit/test_api.py @@ -0,0 +1,201 @@ +from unittest.mock import patch + +from fastapi.testclient import TestClient +import numpy as np +import pytest + +from turing.api.app import app +from turing.api.schemas import PredictionRequest, PredictionResponse + + +@pytest.fixture +def client(): + """Fixture that provides a test client for the FastAPI app.""" + return TestClient(app) + + +@pytest.fixture +def mock_inference_engine(): + """Fixture that provides a mocked inference engine.""" + with patch('turing.api.app.inference_engine') as mock: + yield mock + + +class TestHealthCheck: + """Test suite for the health check endpoint.""" + + def test_health_check_returns_ok(self, client): + """Test that the health check endpoint returns status ok.""" + response = client.get("/") + assert response.status_code == 200 + assert response.json() == { + "status": "ok", + "message": "Turing Code Classification API is ready." 
+ } + + +class TestPredictEndpoint: + """Test suite for the predict endpoint.""" + + def test_predict_success_java(self, client, mock_inference_engine): + """Test successful prediction for Java code.""" + # Setup mock + mock_inference_engine.predict_payload.return_value = ( + np.array([0, 1]), # raw predictions as numpy array + ["class", "method"], # labels + "run_id_123", # run_id + "models:/CodeBERTa_java/Production" # artifact + ) + + # Make request + request_data = { + "texts": ["public class Main", "public void test()"], + "language": "java" + } + response = client.post("/predict", json=request_data) + + # Assertions + assert response.status_code == 200 + data = response.json() + assert "predictions" in data + assert "labels" in data + assert "model_info" in data + assert data["labels"] == ["class", "method"] + assert data["model_info"]["language"] == "java" + + def test_predict_success_python(self, client, mock_inference_engine): + """Test successful prediction for Python code.""" + # Setup mock + mock_inference_engine.predict_payload.return_value = ( + np.array([1, 0]), # raw predictions as numpy array + ["function", "class"], # labels + "run_id_456", # run_id + "models:/CodeBERTa_python/Production" # artifact + ) + + # Make request + request_data = { + "texts": ["def main():", "class MyClass:"], + "language": "python" + } + response = client.post("/predict", json=request_data) + + # Assertions + assert response.status_code == 200 + data = response.json() + assert data["labels"] == ["function", "class"] + assert data["model_info"]["language"] == "python" + + def test_predict_success_pharo(self, client, mock_inference_engine): + """Test successful prediction for Pharo code.""" + # Setup mock + mock_inference_engine.predict_payload.return_value = ( + np.array([0]), # raw predictions as numpy array + ["method"], # labels + "run_id_789", # run_id + "models:/CodeBERTa_pharo/Production" # artifact + ) + + # Make request + request_data = { + "texts": ["initialize"], + "language": "pharo" + } + response = client.post("/predict", json=request_data) + + # Assertions + assert response.status_code == 200 + data = response.json() + assert data["labels"] == ["method"] + assert data["model_info"]["language"] == "pharo" + + def test_predict_missing_texts(self, client): + """Test that prediction fails when texts are missing.""" + request_data = { + "language": "java" + } + response = client.post("/predict", json=request_data) + assert response.status_code == 422 # Validation error + + def test_predict_missing_language(self, client): + """Test that prediction fails when language is missing.""" + request_data = { + "texts": ["public class Main"] + } + response = client.post("/predict", json=request_data) + assert response.status_code == 422 # Validation error + + def test_predict_empty_texts(self, client, mock_inference_engine): + """Test prediction with empty texts list.""" + mock_inference_engine.predict_payload.return_value = ( + np.array([]), # raw predictions as empty numpy array + [], # labels + "run_id_000", # run_id + "models:/CodeBERTa_java/Production" # artifact + ) + + request_data = { + "texts": [], + "language": "java" + } + response = client.post("/predict", json=request_data) + + # Should succeed with empty results + assert response.status_code == 200 + data = response.json() + assert data["predictions"] == [] + assert data["labels"] == [] + + def test_predict_error_handling(self, client, mock_inference_engine): + """Test that prediction endpoint handles errors gracefully.""" + # Setup 
mock to raise an exception + mock_inference_engine.predict_payload.side_effect = Exception("Model loading failed") + + request_data = { + "texts": ["public class Main"], + "language": "java" + } + response = client.post("/predict", json=request_data) + + # Should return 500 error + assert response.status_code == 500 + assert "Model loading failed" in response.json()["detail"] + + def test_predict_invalid_language(self, client, mock_inference_engine): + """Test prediction with invalid language parameter.""" + # The model might raise an error for unsupported language + mock_inference_engine.predict_payload.side_effect = ValueError("Unsupported language: cobol") + + request_data = { + "texts": ["IDENTIFICATION DIVISION."], + "language": "cobol" + } + response = client.post("/predict", json=request_data) + + # Should return 500 error + assert response.status_code == 500 + assert "Unsupported language" in response.json()["detail"] + + +class TestAPISchemas: + """Test suite for API schemas validation.""" + + def test_prediction_request_valid(self): + """Test that PredictionRequest validates correct data.""" + request = PredictionRequest( + texts=["public void main"], + language="java" + ) + assert request.texts == ["public void main"] + assert request.language == "java" + + def test_prediction_response_valid(self): + """Test that PredictionResponse validates correct data.""" + response = PredictionResponse( + predictions=[0, 1], + labels=["class", "method"], + model_info={"artifact": "models:/CodeBERTa_java/Production", "language": "java"} + ) + assert response.predictions == [0, 1] + assert response.labels == ["class", "method"] + assert response.model_info["language"] == "java" diff --git a/turing/tests/unit/test_config.py b/turing/tests/unit/test_config.py new file mode 100644 index 0000000000000000000000000000000000000000..fac2d83ad3d5ce0c0f15e658850e8b7f46b842e2 --- /dev/null +++ b/turing/tests/unit/test_config.py @@ -0,0 +1,133 @@ +import importlib +from pathlib import Path +from unittest.mock import patch + +import pytest + +# Import the module to be tested +import turing.config as config + + +@pytest.mark.config +class TestConfig: + """ + Test suite for validating the project's configuration module (config.py). + + These tests verify that paths are structured correctly, critical constants + are of the expected type and value, and module-level logic + (like calculations and .env loading) executes as intended. + """ + + def test_proj_root_is_correctly_identified(self): + """ + Validates that PROJ_ROOT is a Path object and points to the + actual project root directory (which should contain 'pyproject.toml'). + """ + assert isinstance(config.PROJ_ROOT, Path) + assert config.PROJ_ROOT.is_dir() + + # A common "sanity check" is to look for a known file at the root + expected_file = config.PROJ_ROOT / "pyproject.toml" + assert expected_file.is_file(), ( + f"PROJ_ROOT ({config.PROJ_ROOT}) does not seem to be the project root. " + f"Could not find {expected_file}" + ) + + def test_directory_paths_are_correctly_structured(self): + """ + Ensures all key directory variables are Path objects + and are correctly parented under PROJ_ROOT. 
+ """ + # List of all directory variables defined in config.py + path_vars = [ + config.DATA_DIR, + config.RAW_DATA_DIR, + config.INTERIM_DATA_DIR, + config.PROCESSED_DATA_DIR, + config.EXTERNAL_DATA_DIR, + config.MODELS_DIR, + config.REPORTS_DIR, + config.FIGURES_DIR, + ] + + for path_var in path_vars: + assert isinstance(path_var, Path) + # Check that PROJ_ROOT is an ancestor of this path + assert config.PROJ_ROOT in path_var.parents + + # Spot-check a few for correct relative paths + assert config.DATA_DIR == config.PROJ_ROOT / "data" + assert config.RAW_DATA_DIR == config.PROJ_ROOT / "data" / "raw" + assert config.FIGURES_DIR == config.PROJ_ROOT / "reports" / "figures" + + def test_dataset_constants_are_valid(self): + """ + Validates that critical dataset constants are non-empty and of + the correct type. + """ + assert isinstance(config.DATASET_HF_ID, str) + assert config.DATASET_HF_ID == "NLBSE/nlbse26-code-comment-classification" + + assert isinstance(config.LANGS, list) + assert len(config.LANGS) == 3 + assert "java" in config.LANGS + + assert isinstance(config.INPUT_COLUMN, str) and config.INPUT_COLUMN + assert isinstance(config.LABEL_COLUMN, str) and config.LABEL_COLUMN + + def test_labels_map_and_total_categories_are_correct(self): + """ + Validates the LABELS_MAP structure and ensures TOTAL_CATEGORIES + is correctly calculated from it. + """ + assert isinstance(config.LABELS_MAP, dict) + + # Ensure all languages in LANGS are keys in LABELS_MAP + for lang in config.LANGS: + assert lang in config.LABELS_MAP + assert isinstance(config.LABELS_MAP[lang], list) + assert len(config.LABELS_MAP[lang]) > 0 + + # Validate the derived calculation + expected_total = ( + len(config.LABELS_MAP["java"]) + + len(config.LABELS_MAP["python"]) + + len(config.LABELS_MAP["pharo"]) + ) + assert config.TOTAL_CATEGORIES == expected_total + assert config.TOTAL_CATEGORIES == 18 # 7 + 5 + 6 + + def test_numeric_parameters_are_positive(self): + """ + Ensures that numeric scoring and training parameters are positive + and of the correct type. + """ + numeric_params = { + "MAX_AVG_RUNTIME": config.MAX_AVG_RUNTIME, + "MAX_AVG_FLOPS": config.MAX_AVG_FLOPS, + "DEFAULT_BATCH_SIZE": config.DEFAULT_BATCH_SIZE, + "DEFAULT_NUM_ITERATIONS": config.DEFAULT_NUM_ITERATIONS, + } + + for name, value in numeric_params.items(): + assert isinstance(value, (int, float)), f"{name} is not numeric" + assert value > 0, f"{name} must be positive" + + @patch("dotenv.load_dotenv") + def test_load_dotenv_is_called_on_module_load(self, mock_load_dotenv): + """ + Tests that the load_dotenv() function is executed when the + config.py module is loaded. + + This requires reloading the module, as it's likely already been + imported by pytest or conftest. + """ + # Arrange (Patch is active) + + # Act + # Reload the config module to trigger its top-level statements + importlib.reload(config) + + # Assert + # Check that the patched load_dotenv was called + mock_load_dotenv.assert_called_once() diff --git a/turing/tests/unit/test_dataset.py b/turing/tests/unit/test_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..271b19932e6fcf88faf65fc3b15cc846c8003961 --- /dev/null +++ b/turing/tests/unit/test_dataset.py @@ -0,0 +1,95 @@ +from pathlib import Path + +import pytest + +# Project modules are importable thanks to conftest.py +import turing.config as config +from turing.dataset import DatasetManager + + +@pytest.mark.data_loader +class TestDatasetManager: + """ + Unit tests for the DatasetManager class. 
+ This test suite validates initialization, data transformation logic, + and data loading mechanisms, including error handling. + """ + + def test_initialization_paths_are_correct(self, manager: DatasetManager): + """ + Verifies that the DatasetManager initializes with the correct + Hugging Face ID and constructs its paths as expected. + """ + assert manager.hf_id == "NLBSE/nlbse26-code-comment-classification" + assert "data/raw" in str(manager.raw_data_dir) + # base_interim_path should contain either 'base' or 'features' + path_str = str(manager.base_interim_path) + assert "data/interim" in path_str and ("base" in path_str or "features" in path_str) + + @pytest.mark.parametrize( + "input_labels, expected_output", + [ + ([1, 0, 1], "[1, 0, 1]"), # Case: Standard list + ("[1, 0, 1]", "[1, 0, 1]"), # Case: Already a string + ([], "[]"), # Case: Empty list + (None, None), # Case: None value + ], + ) + def test_format_labels_for_csv(self, manager: DatasetManager, input_labels, expected_output): + """ + Tests the internal _format_labels_for_csv method to ensure + it correctly serializes label lists (or handles other inputs) to strings. + """ + # Arrange + example = {"labels": input_labels} + + # Act + formatted_example = manager._format_labels_for_csv(example) + + # Assert + assert formatted_example["labels"] == expected_output + + def test_get_dataset_raises_file_not_found(self, monkeypatch): + """ + Ensures that get_dataset() raises a FileNotFoundError when + the target interim CSV files do not exist. + """ + # Arrange + # Patch the config to point to a non-existent directory + fake_dir = Path("/path/that/is/totally/fake") + monkeypatch.setattr(config, "INTERIM_DATA_DIR", fake_dir) + + # Manager must be initialized *after* patching config + manager_with_fake_path = DatasetManager() + + # Act & Assert + with pytest.raises(FileNotFoundError, match="Dataset CSV files not found."): + manager_with_fake_path.get_dataset() + + def test_get_dataset_success_and_label_parsing(self, fake_csv_data_dir: Path, monkeypatch): + """ + Verifies that get_dataset() successfully loads data from mock CSVs + and correctly parses the string-formatted labels back into lists. 
+ """ + # Arrange + # Point the config at our temporary fixture directory + monkeypatch.setattr(config, "INTERIM_DATA_DIR", fake_csv_data_dir) + manager = DatasetManager() + + # Act + dataset = manager.get_dataset() + + # Assert + # Check that the correct splits were loaded + assert "java_train" in dataset + assert "java_test" in dataset + assert "python_train" not in dataset # Confirms only found files are loaded + + # Check content integrity + assert len(dataset["java_train"]) == 2 + assert dataset["java_train"][0]["combo"] == "java code text" + + # Ccheck that the string '[1, 0, ...]' was parsed back to a list + expected_labels = [1, 0, 0, 0, 0, 0, 0] + assert dataset["java_train"][0]["labels"] == expected_labels + assert isinstance(dataset["java_train"][0]["labels"], list) diff --git a/turing/tests/unit/test_features.py b/turing/tests/unit/test_features.py new file mode 100644 index 0000000000000000000000000000000000000000..6593a6425d4e8fb345c69db2e227308d5e90fb5d --- /dev/null +++ b/turing/tests/unit/test_features.py @@ -0,0 +1,121 @@ +import pandas as pd +import pytest + +from turing.features import ( + FeatureEngineer, + FeaturePipelineConfig, + TextProcessor, +) + +# --- Fixtures --- + + +@pytest.fixture(scope="module") +def full_config(): + """Returns a config with stopwords and lemmatization enabled.""" + return FeaturePipelineConfig( + use_stopwords=True, + use_lemmatization=True, + use_combo_feature=False, + max_features=5000, + min_comment_length=10, + max_comment_length=500, + enable_augmentation=False, + custom_tags="test", + ) + + +@pytest.fixture(scope="module") +def basic_config(): + """Returns a config with all extra steps disabled.""" + return FeaturePipelineConfig( + use_stopwords=False, + use_lemmatization=False, + use_combo_feature=False, + max_features=100, + min_comment_length=5, + max_comment_length=200, + enable_augmentation=False, + ) + + +@pytest.fixture(scope="module") +def full_processor(full_config): + """A TextProcessor with all steps enabled.""" + return TextProcessor(config=full_config, language="english") + + +@pytest.fixture(scope="module") +def basic_processor(basic_config): + """A TextProcessor with only basic cleaning (lowercase, punctuation).""" + return TextProcessor(config=basic_config, language="english") + + +# --- Tests --- + + +class TestFeaturePipelineConfig: + def test_config_id_generation(self, full_config, basic_config): + """Tests that the readable ID is generated correctly.""" + assert full_config.hash_id == "clean-k5000-test" + assert basic_config.hash_id == "clean-k100" + + def test_config_attributes(self, full_config): + """Tests that attributes are set correctly.""" + assert full_config.use_stopwords is True + assert full_config.use_lemmatization is True + assert full_config.max_features == 5000 + + +class TestTextProcessor: + def test_clean_text_basic(self, basic_processor): + """Tests lowercase and punctuation removal.""" + text = "This is a TEST... with punctuation!!" 
+ expected = "this is a test with punctuation" + assert basic_processor.clean_text(text) == expected + + def test_clean_text_stopwords(self, full_processor, basic_processor): + """Tests stopword removal logic.""" + text = "this is a test with a stopword" + + # With stopwords enabled + expected_full = "test stopword" + assert full_processor.clean_text(text) == expected_full + + # With stopwords disabled + expected_basic = "this is a test with a stopword" + assert basic_processor.clean_text(text) == expected_basic + + def test_clean_text_lemmatization(self, full_processor, basic_processor): + """Tests lemmatization logic.""" + text = "running tests while dogs are barking" + + # With lemmatization enabled + expected_full = "running test dog barking" # 'are' and 'while' are stopwords + assert full_processor.clean_text(text) == expected_full + + # With lemmatization disabled + expected_basic = "running tests while dogs are barking" + assert basic_processor.clean_text(text) == expected_basic + + def test_clean_text_handles_none(self, basic_processor): + """Tests that it doesn't crash on None or pd.NA.""" + assert basic_processor.clean_text(None) == "" + assert basic_processor.clean_text(pd.NA) == "" + + +class TestFeatureEngineer: + def test_extract_numeric_features(self, basic_config): + """Tests that extract_features_for_check adds metadata features.""" + fe = FeatureEngineer(config=basic_config) + data = {"comment_sentence": ["This is short.", "This one is a bit longer.", ""]} + df = pd.DataFrame(data) + df_out = fe.extract_features_for_check(df) + + assert "f_length" in df_out.columns + assert "f_word_count" in df_out.columns + assert "f_starts_verb" in df_out.columns + assert "text_hash" in df_out.columns + + assert df_out["f_length"].tolist() == [14, 25, 0] + assert df_out["f_word_count"].tolist() == [3, 6, 0] diff --git a/turing/tests/unit/test_model.py b/turing/tests/unit/test_model.py new file mode 100644 index 0000000000000000000000000000000000000000..8ee8173e09935280b17ae0eae20ec1b1dbcb764b --- /dev/null +++ b/turing/tests/unit/test_model.py @@ -0,0 +1,135 @@ +import inspect + +import numpy as np +import pytest + +from turing.config import EXISTING_MODELS +import turing.modeling.models as my_models + + +@pytest.fixture +def get_model(request: str): + """Fixture that returns a list of existing model names.""" + model_name = request.param + + module = getattr(my_models, model_name, None) + + classes = [ + cls + for _, cls in inspect.getmembers(module, inspect.isclass) + if cls.__module__ == module.__name__ + ] + + cls = classes[0] + + from turing.config import LANGS + + lang = LANGS[0] + return cls(language=lang) + + +@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True) +def test_model_initialization(get_model): + """ + Test that each model class can be initialized without errors. + """ + model = get_model + assert model is not None + from turing.modeling.baseModel import BaseModel + + assert isinstance(model, BaseModel) + + +@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True) +def test_model_setup(get_model): + """ + Test that each model class sets up its internal model correctly. + """ + model = get_model + model.setup_model() + assert model.model is not None + + +@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True) +def test_model_train(tmp_path, get_model): + """ + Test that each model class can run the train method without errors. 
+ """ + model = get_model + model.setup_model() + + # Using mock data for training + X_train = ["sample text data"] * 10 + + y_train = [0, 1] * 5 + + y_train = np.array(y_train).reshape(-1, 1) + + # fake directory and model name + fake_path = tmp_path / "out" + fake_path.mkdir() + + parameters = model.train(X_train, y_train) + + assert isinstance(parameters, dict) + assert model.model is not None + + +@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True) +def test_model_evaluate(tmp_path, get_model): + """ + Test that each model class can run the evaluate method without errors. + """ + model = get_model + model.setup_model() + + # Using mock data for training + X_train = ["sample text data"] * 10 + + y_train = [0, 1] * 5 + + y_train = np.array(y_train).reshape(-1, 1) + + # fake directory and model name + fake_path = tmp_path / "out" + fake_path.mkdir() + + _ = model.train(X_train, y_train) + + # Using mock data for evaluation + X_test = ["sample text data"] * 10 + y_test = [0, 1] * 5 + metrics = model.evaluate(X_test, y_test) + + assert isinstance(metrics, dict) + assert metrics and "accuracy" in metrics + assert "f1_score" in metrics or "f1_score_micro" in metrics + + +@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True) +def test_model_predict(tmp_path, get_model): + """ + Test that each model class can run the predict method without errors. + """ + model = get_model + model.setup_model() + + # Using mock data for training + X_train = ["sample text data"] * 10 + + y_train = [0, 1] * 5 + + y_train = np.array(y_train).reshape(-1, 1) + + # fake directory and model name + fake_path = tmp_path / "out" + fake_path.mkdir() + + _ = model.train(X_train, y_train) + + # Using mock data for prediction + X_input = ["sample text data"] * 3 + predictions = model.predict(X_input) + + assert predictions is not None + assert len(predictions) == len(X_input)