diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..4f1d51575a04862a685fc3f545959f310f8cbfc5
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,9 @@
+turing/reporting.py
+turing/plots.py
+turing/features.py
+turing/evaluate_model.py
+turing/data_validation.py
+
+turing/CLI_runner
+turing/modeling/train.py
+turing/tests
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ca207ff05c0348a3c30a1a278dfd6b8cdc0618a1
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,14 @@
+services:
+ api:
+ build: .
+ container_name: turing_app
+ image: turing_api
+ ports:
+ - "7860:7860"
+
+ environment:
+ - MLFLOW_TRACKING_USERNAME=${MLFLOW_USER}
+ - MLFLOW_TRACKING_PASSWORD=${MLFLOW_PWD}
+ - DAGSHUB_USER_TOKEN=${DAGSHUB_TOKEN}
+
+ command: uvicorn turing.api.app:app --host 0.0.0.0 --port 7860 --reload
\ No newline at end of file
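For context, a minimal sketch (not part of this diff) of a start-up check for the credentials that `docker-compose.yml` forwards into the container; the variable names come from the `environment` block above, everything else is illustrative:

```python
# Illustrative pre-flight check: fail fast if the credentials injected by
# docker-compose.yml are missing from the container environment.
import os

REQUIRED_VARS = [
    "MLFLOW_TRACKING_USERNAME",
    "MLFLOW_TRACKING_PASSWORD",
    "DAGSHUB_USER_TOKEN",
]

missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
```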
diff --git a/dockerfile b/dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..42dd62de776aa03b1c0310c0423270758894307e
--- /dev/null
+++ b/dockerfile
@@ -0,0 +1,31 @@
+FROM python:3.12
+
+# Create a non-root user to run the application and set permissions
+RUN useradd -m -u 1000 turinguser
+RUN mkdir -p /app/models && chown -R turinguser:turinguser /app /app/models
+USER turinguser
+
+# Set environment variables
+# PATH to include local user binaries and project root
+ENV PATH="/home/turinguser/.local/bin:$PATH"
+ENV PROJ_ROOT=/app
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy essential files to install dependencies
+COPY --chown=turinguser requirements.txt .
+
+# Install Python dependencies
+RUN pip install --default-timeout=1000 --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --upgrade --default-timeout=1000 --no-cache-dir -r requirements.txt
+
+# Copy remaining project files
+COPY --chown=turinguser turing ./turing
+COPY --chown=turinguser reports ./reports
+
+# Expose port 7860 for the FastAPI application
+EXPOSE 7860
+
+# Default command to run the FastAPI application on port 7860
+CMD ["uvicorn", "turing.api.app:app", "--host", "0.0.0.0", "--port", "7860"]
\ No newline at end of file
diff --git a/reports/.gitkeep b/reports/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/reports/feedback/feedback_data.csv b/reports/feedback/feedback_data.csv
new file mode 100644
index 0000000000000000000000000000000000000000..c77afd3d1f0fb53b55bd2cb285f2ce199583eddd
--- /dev/null
+++ b/reports/feedback/feedback_data.csv
@@ -0,0 +1,3 @@
+Timestamp,Input_Text,Language,Model_Prediction,User_Correction
+2025-12-11 22:41:05,# Create output directory,python,Usage,DevelopmentNotes
+2025-12-11 23:05:24,# Entry point for running the API directly with python,python,Usage,DevelopmentNotes
diff --git a/reports/figures/.gitkeep b/reports/figures/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/reports/figures/logo_header.svg b/reports/figures/logo_header.svg
new file mode 100644
index 0000000000000000000000000000000000000000..fde0102644902834fbc91f670843f2180619562b
--- /dev/null
+++ b/reports/figures/logo_header.svg
@@ -0,0 +1,38 @@
+
diff --git a/reports/unit_and_behavioral_tests/report.md b/reports/unit_and_behavioral_tests/report.md
new file mode 100644
index 0000000000000000000000000000000000000000..fdb1a1f776bc054e2cfb61b53b1e172e358fefcf
--- /dev/null
+++ b/reports/unit_and_behavioral_tests/report.md
@@ -0,0 +1,108 @@
+
+# Test Execution Report
+
+
+### Environment
+
+
+```text
+ Parameter Value
+ Timestamp 2025-11-27 15:44:47
+ Context turing
+Python Version 3.12.12
+ Platform Windows-11-10.0.26100-SP0
+```
+
+
+### Executive Summary
+
+
+```text
+ Total Passed Failed Success Rate
+ 66 35 31 53.0%
+```
+
+
+Detailed Breakdown:
+
+
+### BEHAVIORAL Tests
+
+
+```text
+ Module Test Case Result Time Message
+ test_directional.py test_java_directional_add_deprecation [ FAILED ] 0.30s turing\tests\behavioral\test_directional.py:16: Assertion...
+ test_directional.py test_python_directional_remove_todo [ FAILED ] 0.15s turing\tests\behavioral\test_directional.py:31: Assertion...
+ test_directional.py test_pharo_directional_add_responsibility [ FAILED ] 0.13s turing\tests\behavioral\test_directional.py:49: Assertion...
+ test_directional.py test_java_directional_contrast_rational [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:70: Assertion...
+ test_directional.py test_python_directional_contrast_todo [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:87: Assertion...
+ test_directional.py test_pharo_directional_contrast_collaborators [ FAILED ] 0.13s turing\tests\behavioral\test_directional.py:112: Assertio...
+ test_directional.py test_java_directional_shift_summary_to_expand [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:132: Assertio...
+ test_directional.py test_python_directional_shift_summary_to_devnotes [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:152: Assertio...
+ test_directional.py test_pharo_directional_shift_to_example [ FAILED ] 0.12s turing\tests\behavioral\test_directional.py:173: Assertio...
+ test_invariance.py test_python_invariance_parameters[:param user_i... [ FAILED ] 0.22s turing\tests\behavioral\test_invariance.py:15: AssertionE...
+ test_invariance.py test_python_invariance_parameters[:PARAM USER_I... [ FAILED ] 0.07s turing\tests\behavioral\test_invariance.py:15: AssertionE...
+ test_invariance.py test_python_invariance_parameters[ :param user... [ FAILED ] 0.06s turing\tests\behavioral\test_invariance.py:15: AssertionE...
+ test_invariance.py test_python_invariance_parameters[:param user_i... [ FAILED ] 0.06s turing\tests\behavioral\test_invariance.py:15: AssertionE...
+ test_invariance.py test_java_invariance_deprecation [ FAILED ] 0.13s turing\tests\behavioral\test_invariance.py:26: AssertionE...
+ test_invariance.py test_python_invariance_summary [ FAILED ] 0.13s turing\tests\behavioral\test_invariance.py:45: AssertionE...
+ test_invariance.py test_pharo_invariance_intent [ FAILED ] 0.13s turing\tests\behavioral\test_invariance.py:64: AssertionE...
+ test_invariance.py test_python_invariance_typos_parameters [ FAILED ] 0.07s turing\tests\behavioral\test_invariance.py:85: AssertionE...
+ test_invariance.py test_java_invariance_semantic_summary [ PASS ] 0.32s
+test_minimum_functionality.py test_java_mft[test getfilestatus and related li... [ PASS ] 0.06s
+test_minimum_functionality.py test_java_mft[/* @deprecated Use something else... [ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:17:...
+test_minimum_functionality.py test_java_mft[code source of this file http gre... [ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:17:...
+test_minimum_functionality.py test_java_mft[this is balanced if each pool is ... [ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:17:...
+test_minimum_functionality.py test_java_mft[// For internal use only.-expecte... [ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:17:...
+test_minimum_functionality.py test_java_mft[this impl delegates to the old fi... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:17:...
+test_minimum_functionality.py test_java_mft[/** Usage: new MyClass(arg1). */-... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:17:...
+test_minimum_functionality.py test_python_mft[a service specific account of t... [ PASS ] 0.06s
+test_minimum_functionality.py test_python_mft[:param user_id: The ID of the u... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:29:...
+test_minimum_functionality.py test_python_mft[# TODO: Refactor this entire bl... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:29:...
+test_minimum_functionality.py test_python_mft[use this class if you want acce... [ PASS ] 0.06s
+test_minimum_functionality.py test_python_mft[# create a new list by filterin... [ FAILED ] 0.08s turing\tests\behavioral\test_minimum_functionality.py:29:...
+test_minimum_functionality.py test_pharo_mft[i am a simple arrow like arrowhe... [ PASS ] 0.07s
+test_minimum_functionality.py test_pharo_mft[the example below shows how to c... [ PASS ] 0.07s
+test_minimum_functionality.py test_pharo_mft[i provide a data structure indep... [ FAILED ] 0.06s turing\tests\behavioral\test_minimum_functionality.py:43:...
+test_minimum_functionality.py test_pharo_mft[the cache is cleared after each ... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:43:...
+test_minimum_functionality.py test_pharo_mft[it is possible hovewer to custom... [ PASS ] 0.07s
+test_minimum_functionality.py test_pharo_mft[collaborators: BlElement, BlSpac... [ FAILED ] 0.07s turing\tests\behavioral\test_minimum_functionality.py:43:...
+```
+
+
+### UNIT Tests
+
+
+```text
+ Module Test Case Result Time Message
+ test_config.py test_proj_root_is_correctly_identified [ PASS ] 0.00s
+ test_config.py test_directory_paths_are_correctly_structured [ PASS ] 0.00s
+ test_config.py test_dataset_constants_are_valid [ PASS ] 0.00s
+ test_config.py test_labels_map_and_total_categories_are_correct [ PASS ] 0.00s
+ test_config.py test_numeric_parameters_are_positive [ PASS ] 0.00s
+ test_config.py test_load_dotenv_is_called_on_module_load [ PASS ] 0.00s
+ test_dataset.py test_initialization_paths_are_correct [ FAILED ] 0.00s turing\tests\unit\test_dataset.py:24: AssertionError
+ test_dataset.py test_format_labels_for_csv[input_labels0-[1, 0,... [ PASS ] 0.00s
+ test_dataset.py test_format_labels_for_csv[[1, 0, 1]-[1, 0, 1]] [ PASS ] 0.00s
+ test_dataset.py test_format_labels_for_csv[input_labels2-[]] [ PASS ] 0.00s
+ test_dataset.py test_format_labels_for_csv[None-None] [ PASS ] 0.00s
+ test_dataset.py test_get_dataset_raises_file_not_found [ PASS ] 0.00s
+ test_dataset.py test_get_dataset_success_and_label_parsing [ PASS ] 0.48s
+test_features.py test_config_id_generation [ PASS ] 0.00s
+test_features.py test_config_attributes [ PASS ] 0.00s
+test_features.py test_clean_text_basic [ PASS ] 0.00s
+test_features.py test_clean_text_stopwords [ PASS ] 2.39s
+test_features.py test_clean_text_lemmatization [ PASS ] 0.00s
+test_features.py test_clean_text_handles_none [ PASS ] 0.00s
+test_features.py test_extract_numeric_features [ PASS ] 0.00s
+ test_model.py test_model_initialization[randomForestTfIdf] [ PASS ] 0.00s
+ test_model.py test_model_initialization[codeBerta] [ PASS ] 0.00s
+ test_model.py test_model_setup[randomForestTfIdf] [ PASS ] 0.00s
+ test_model.py test_model_setup[codeBerta] [ PASS ] 1.39s
+ test_model.py test_model_train[randomForestTfIdf] [ PASS ] 3.06s
+ test_model.py test_model_train[codeBerta] [ PASS ] 4.90s
+ test_model.py test_model_evaluate[randomForestTfIdf] [ PASS ] 1.39s
+ test_model.py test_model_evaluate[codeBerta] [ FAILED ] 6.36s turing\tests\unit\test_model.py:101: AssertionError
+ test_model.py test_model_predict[randomForestTfIdf] [ PASS ] 1.36s
+ test_model.py test_model_predict[codeBerta] [ PASS ] 5.26s
+```
diff --git a/reports/unit_tests/report.md b/reports/unit_tests/report.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ebe350e94f19f83541c3a75f87c163a6baf5b3d
--- /dev/null
+++ b/reports/unit_tests/report.md
@@ -0,0 +1,122 @@
+
+# Turing Test Execution Report
+
+
+
+---
+
+
+
+## Environment Information
+
+
+| Parameter | Value |
+|:---------------|:---------------------------|
+| Timestamp | 2025-12-04 18:14:18 |
+| Context | TURING |
+| Python Version | 3.12.12 |
+| Platform | macOS-15.6-arm64-arm-64bit |
+| Architecture | arm64 |
+
+
+---
+
+
+## Executive Summary
+
+
+**Overall Status:** MOSTLY PASSED
+
+
+**Success Rate:** 91.2%
+
+
+| Metric | Count |
+|:-------------|--------:|
+| Total Tests | 34 |
+| Passed | 31 |
+| Failed | 3 |
+| Success Rate | 91.2% |
+
+
+**Visual Progress:**
+
+
+```
+Progress: [█████████████████████████████████████████████░░░░░] 91.2%
+Passed: 31/34 tests
+```
+
+
+---
+
+
+## UNIT Tests
+
+
+### Statistics
+
+
+| Status | Count |
+|:---------|-----------:|
+| Total | 34 |
+| Passed | 31 (91.2%) |
+| Failed | 3 (8.8%) |
+
+
+### Test Results
+
+
+| Module | Test Case | Result | Time | Message |
+|:----------------|:---------------------------------------------------|:---------|:-------|:-----------------------------------------------------|
+| test_api.py | test_health_check_returns_ok | PASS | 0.01s | |
+| test_api.py | test_predict_success_java | PASS | 0.02s | |
+| test_api.py | test_predict_success_python | PASS | 0.00s | |
+| test_api.py | test_predict_success_pharo | PASS | 0.00s | |
+| test_api.py | test_predict_missing_texts | PASS | 0.00s | |
+| test_api.py | test_predict_missing_language | PASS | 0.00s | |
+| test_api.py | test_predict_empty_texts | PASS | 0.00s | |
+| test_api.py | test_predict_error_handling | PASS | 0.00s | |
+| test_api.py | test_predict_invalid_language | PASS | 0.00s | |
+| test_api.py | test_prediction_request_valid | PASS | 0.00s | |
+| test_api.py | test_prediction_response_valid | PASS | 0.00s | |
+| test_config.py | test_proj_root_is_correctly_identified | PASS | 0.00s | |
+| test_config.py | test_directory_paths_are_correctly_structured | PASS | 0.00s | |
+| test_config.py | test_dataset_constants_are_valid | PASS | 0.00s | |
+| test_config.py | test_labels_map_and_total_categories_are_correct | PASS | 0.00s | |
+| test_config.py | test_numeric_parameters_are_positive | PASS | 0.00s | |
+| test_config.py | test_load_dotenv_is_called_on_module_load | PASS | 0.00s | |
+| test_dataset.py | test_initialization_paths_are_correct | FAIL | 0.00s | turing/tests/unit/test_dataset.py:25: AssertionError |
+| test_dataset.py | test_format_labels_for_csv[input_labels0-[1, 0,... | PASS | 0.00s | |
+| test_dataset.py | test_format_labels_for_csv[[1, 0, 1]-[1, 0, 1]] | PASS | 0.00s | |
+| test_dataset.py | test_format_labels_for_csv[input_labels2-[]] | PASS | 0.00s | |
+| test_dataset.py | test_format_labels_for_csv[None-None] | PASS | 0.00s | |
+| test_dataset.py | test_get_dataset_raises_file_not_found | PASS | 0.00s | |
+| test_dataset.py | test_get_dataset_success_and_label_parsing | FAIL | 0.00s | turing/dataset.py:128: FileNotFoundError |
+| test_model.py | test_model_initialization[randomForestTfIdf] | PASS | 0.00s | |
+| test_model.py | test_model_initialization[codeBerta] | PASS | 0.00s | |
+| test_model.py | test_model_setup[randomForestTfIdf] | PASS | 0.00s | |
+| test_model.py | test_model_setup[codeBerta] | PASS | 0.93s | |
+| test_model.py | test_model_train[randomForestTfIdf] | PASS | 2.66s | |
+| test_model.py | test_model_train[codeBerta] | PASS | 7.22s | |
+| test_model.py | test_model_evaluate[randomForestTfIdf] | PASS | 1.31s | |
+| test_model.py | test_model_evaluate[codeBerta] | FAIL | 8.83s | turing/tests/unit/test_model.py:101: AssertionError |
+| test_model.py | test_model_predict[randomForestTfIdf] | PASS | 1.21s | |
+| test_model.py | test_model_predict[codeBerta] | PASS | 5.98s | |
+
+
+---
+
+
+> **ERROR**: 3 test(s) failed. Please review the error messages above.
+
+
+
+---
+
+
+
+*Report generated on 2025-12-04 at 18:14:18*
+
+
+*Powered by Turing Test Suite*
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f024c882d66251a4a53553b2fedcf33a587a77c4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+fastapi
+uvicorn[standard]
+loguru
+pydantic
+python-dotenv
+mlflow
+numpy
+transformers
+dagshub
+datasets
+accelerate
+scikit-learn
+gradio
\ No newline at end of file
diff --git a/turing/CLI_runner/run_dataset.py b/turing/CLI_runner/run_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b02c26e3acbdfa3f035ab2c1883108feac2c70cf
--- /dev/null
+++ b/turing/CLI_runner/run_dataset.py
@@ -0,0 +1,105 @@
+import os
+from pathlib import Path
+import sys
+
+from loguru import logger
+import typer
+from typing_extensions import Annotated
+
+# Ensure the project root is on sys.path before importing project modules
+script_dir = os.path.dirname(os.path.abspath(__file__))
+proj_root = os.path.dirname(os.path.dirname(script_dir))
+sys.path.append(proj_root)
+
+try:
+    from turing.config import INTERIM_DATA_DIR, RAW_DATA_DIR
+    from turing.dataset import DatasetManager
+except ImportError:
+    logger.error("Error: Could not import DatasetManager. Check sys.path configuration.")
+    logger.error(f"Current sys.path: {sys.path}")
+    sys.exit(1)
+
+app = typer.Typer(help="CLI for dataset management (Download, Conversion, and Search).")
+
+
+@app.command()
+def download():
+ """
+ Loads the dataset from Hugging Face and saves it into the "raw" folder.
+ """
+ logger.info("Starting dataset download...")
+ manager = DatasetManager()
+ manager.download_dataset()
+ logger.success("Download complete.")
+
+
+@app.command(name="parquet-to-csv")
+def parquet_to_csv():
+ """
+ Converts all parquet files in the raw data directory
+ to CSV format in the interim data directory.
+ """
+ logger.info("Starting Parquet -> CSV conversion...")
+ manager = DatasetManager()
+ manager.parquet_to_csv()
+ logger.success("Conversion complete.")
+
+
+@app.command()
+def search(
+ filename: Annotated[
+ str, typer.Argument(help="The exact filename to search for (e.g., 'java_train.parquet')")
+ ],
+ directory: Annotated[
+ str,
+ typer.Option(
+ "--directory",
+ "-d",
+ help="Directory to search in. Keywords 'raw' or 'interim' can be used.",
+ ),
+ ] = "raw",
+):
+ """
+ Searches for a file by name in the data directories.
+ """
+ logger.info(f"Initializing search for '{filename}'...")
+ manager = DatasetManager()
+
+ search_path = None
+ if directory.lower() == "raw":
+ search_path = RAW_DATA_DIR
+ logger.info("Searching in 'raw' data directory.")
+ elif directory.lower() == "interim":
+ search_path = INTERIM_DATA_DIR
+ logger.info("Searching in 'interim' data directory.")
+ else:
+ search_path = Path(directory)
+ logger.info(f"Searching in custom path: {search_path}")
+
+ results = manager.search_file(filename, search_directory=search_path)
+
+ if results:
+ logger.success(f"Found {len(results)} file(s):")
+ for res in results:
+ print(f"-> {res}")
+ else:
+ logger.warning(f"File '{filename}' not found in {search_path}.")
+
+
+@app.command(name="show-raw-hf")
+def show_raw_hf():
+ """
+ Loads and displays info about the raw dataset from Hugging Face.
+ """
+ logger.info("Loading raw dataset info from Hugging Face...")
+ manager = DatasetManager()
+ dataset = manager.get_raw_dataset_from_hf()
+ if dataset:
+ logger.info("Dataset info:")
+ print(dataset)
+ else:
+ logger.error("Could not retrieve dataset.")
+
+
+if __name__ == "__main__":
+ app()
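As a quick illustration of how this Typer app can be exercised without a shell (assuming `turing.CLI_runner.run_dataset` is importable; the filename below is just an example):

```python
# Invoke the `search` command programmatically via Typer's test runner.
from typer.testing import CliRunner

from turing.CLI_runner.run_dataset import app

runner = CliRunner()
result = runner.invoke(app, ["search", "java_train.parquet", "--directory", "raw"])
print(result.exit_code, result.output)
```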
diff --git a/turing/CLI_runner/run_prediction.py b/turing/CLI_runner/run_prediction.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6f104daa4c0aae5a493fd4d5fc3f6a412b94cc3
--- /dev/null
+++ b/turing/CLI_runner/run_prediction.py
@@ -0,0 +1,57 @@
+from pathlib import Path
+import sys
+
+from loguru import logger
+import typer
+
+# Add the project root to sys.path before importing project modules
+current_dir = Path(__file__).resolve().parent
+project_root = current_dir.parents[1]
+if str(project_root) not in sys.path:
+    sys.path.append(str(project_root))
+
+from turing.modeling.models.randomForestTfIdf import RandomForestTfIdf
+from turing.modeling.predict import ModelInference
+
+app = typer.Typer()
+
+
+@app.command()
+def main(
+ mlflow_run_id: str = typer.Option(
+ "af1fa5959dc14fa9a29a0a19c11f1b08", help="The MLflow Run ID"
+ ),
+ artifact_name: str = typer.Option(
+ "RandomForestTfIdf_java", help="The name of the model artifact"
+ ),
+ language: str = typer.Option("java", help="The target programming language"),
+):
+ """
+ Run inference using the dataset stored on disk (Standard CML/DVC workflow).
+ """
+ logger.info("Starting CLI inference process...")
+
+ try:
+ # Initialize inference engine
+ inference_engine = ModelInference()
+
+ # Run prediction on the test dataset
+ results = inference_engine.predict_from_mlflow(
+ mlflow_run_id=mlflow_run_id,
+ artifact_name=artifact_name,
+ language=language,
+ model_class=RandomForestTfIdf,
+ )
+
+ # Output results
+ print("\n--- Prediction Results ---")
+ print(results)
+ print("--------------------------")
+
+ except Exception as e:
+ logger.error(f"CLI Prediction failed: {e}")
+ raise typer.Exit(code=1)
+
+
+if __name__ == "__main__":
+ app()
diff --git a/turing/__init__.py b/turing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..18ae806ff73753d6560266c7fb68c2bd51971a7b
--- /dev/null
+++ b/turing/__init__.py
@@ -0,0 +1 @@
+from turing import config # noqa: F401
diff --git a/turing/__pycache__/__init__.cpython-312.pyc b/turing/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b29672cc77222a9cc6febdd7fbd5f3fa8bed3541
Binary files /dev/null and b/turing/__pycache__/__init__.cpython-312.pyc differ
diff --git a/turing/__pycache__/config.cpython-312.pyc b/turing/__pycache__/config.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54a78bb57637c5c4af860f7cba23a1187954768d
Binary files /dev/null and b/turing/__pycache__/config.cpython-312.pyc differ
diff --git a/turing/__pycache__/dataset.cpython-312.pyc b/turing/__pycache__/dataset.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67213b96474be9aadf50425b2a70e088497b677e
Binary files /dev/null and b/turing/__pycache__/dataset.cpython-312.pyc differ
diff --git a/turing/__pycache__/evaluate_model.cpython-312.pyc b/turing/__pycache__/evaluate_model.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..683e5fae32d19b99f5035f0716ffa3f44b40e6c4
Binary files /dev/null and b/turing/__pycache__/evaluate_model.cpython-312.pyc differ
diff --git a/turing/api/__init__.py b/turing/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/turing/api/app.py b/turing/api/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb0b3dbd841736ee37c70e299763510aada87342
--- /dev/null
+++ b/turing/api/app.py
@@ -0,0 +1,115 @@
+import base64
+import os
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse
+import gradio as gr
+from loguru import logger
+
+from turing.api.demo import create_demo
+from turing.api.schemas import PredictionRequest, PredictionResponse
+from turing.modeling.predict import ModelInference
+
+
+def get_logo_b64_src(filename="logo_header.svg"):
+ """read SVG and convert it into a string Base64 for HTML."""
+ try:
+ base_path = os.path.dirname(os.path.abspath(__file__))
+ target_path = os.path.join(base_path, "..", "..", "reports", "figures", filename)
+ target_path = os.path.normpath(target_path)
+
+ with open(target_path, "rb") as f:
+ encoded = base64.b64encode(f.read()).decode("utf-8")
+ return f"data:image/svg+xml;base64,{encoded}"
+ except Exception as e:
+ print(f"Unable to load logo for API: {e}")
+ return ""
+
+
+# Load the logo as a Base64 data URI
+logo_src = get_logo_b64_src()
+
+# Clickable logo that links to the Gradio UI mounted at /gradio
+logo_html_big = f"""
+<a href="/gradio" target="_blank">
+    <img src="{logo_src}" alt="Open the Turing classification UI" style="height: 120px;">
+</a>
+"""
+
+# API description shown in the OpenAPI docs
+description_md = f"""
+API for classifying code comments.
+
+You can interact with the model directly through the visual interface.
+Click the logo below to open it:
+
+{logo_html_big}
+
+"""
+
+app = FastAPI(
+ title="Turing Team Code Classification API",
+ description=description_md,
+ version="1.0.0"
+)
+
+@app.get("/manifest.json")
+def get_manifest():
+ return JSONResponse(content={
+ "name": "Turing App",
+ "short_name": "Turing",
+ "start_url": "/gradio",
+ "display": "standalone",
+ "background_color": "#ffffff",
+ "theme_color": "#000000",
+ "icons": []
+ })
+
+# Global inference engine instance
+inference_engine = ModelInference()
+
+demo = create_demo(inference_engine)
+app = gr.mount_gradio_app(app, demo, path="/gradio")
+
+@app.get("/")
+def health_check():
+ """
+ Root endpoint to verify API status.
+ """
+ return {"status": "ok", "message": "Turing Code Classification API is ready.", "ui_url": "/gradio"}
+
+
+@app.post("/predict", response_model=PredictionResponse)
+def predict(request: PredictionRequest):
+ """
+ Endpoint to classify a list of code comments.
+ Dynamically loads the model from MLflow based on the request parameters.
+ """
+ try:
+ logger.info(f"Received prediction request for language: {request.language}")
+
+ # Perform prediction using the inference engine
+ raw, predictions, run_id, artifact = inference_engine.predict_payload(
+ texts=request.texts, language=request.language
+ )
+
+        # Ensure outputs are JSON-serializable (convert numpy arrays to lists)
+        if hasattr(raw, "tolist"):
+            raw = raw.tolist()
+        if hasattr(predictions, "tolist"):
+            predictions = predictions.tolist()
+
+        return PredictionResponse(
+            predictions=raw,
+            labels=predictions,
+            model_info={"artifact": artifact, "language": request.language},
+        )
+
+ except Exception as e:
+ logger.error(f"Prediction failed: {str(e)}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+
+# Entry point for running the API directly with python
+if __name__ == "__main__":
+ import uvicorn
+
+ uvicorn.run(app, host="127.0.0.1", port=7860)
diff --git a/turing/api/demo.py b/turing/api/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b9e0a57c23895dd4bbf649df7fb9ec44fdfb3bf
--- /dev/null
+++ b/turing/api/demo.py
@@ -0,0 +1,302 @@
+import csv
+from datetime import datetime
+import os
+
+import gradio as gr
+
+# ---IMPORTS ---
+try:
+ from turing.modeling.models.codeBerta import CodeBERTa
+ from turing.modeling.predict import ModelInference
+except ImportError as e:
+ print(f"WARNING: Error importing real modules: {e}")
+ class CodeBERTa:
+ pass
+ class ModelInference:
+ pass
+
+# --- CONFIGURATION ---
+FEEDBACK_FILE = "reports/feedback/feedback_data.csv"
+
+LABELS_MAP = {
+ "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
+ "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
+ "pharo": ["Keyimplementationpoints", "Example", "Responsibilities", "Intent", "Keymessages", "Collaborators"],
+}
+
+# --- CSS ---
+CSS = """
+:root {
+ --bg-primary: #fafaf9; --bg-secondary: #ffffff; --border-color: #e5e7eb;
+ --text-primary: #1f2937; --text-secondary: #6b7280; --accent-bg: #f3f4f6;
+ --primary-btn: #ea580c; --primary-btn-hover: #c2410c;
+}
+.dark, body.dark, .gradio-container.dark {
+ --bg-primary: #0f172a; --bg-secondary: #1e293b; --border-color: #374151;
+ --text-primary: #f3f4f6; --text-secondary: #9ca3af; --accent-bg: #334155;
+}
+body, .gradio-container {
+ background-color: var(--bg-primary) !important; color: var(--text-primary) !important;
+ font-family: 'Segoe UI', system-ui, sans-serif; transition: background 0.3s, color 0.3s;
+}
+.compact-header {
+ display: flex; align-items: center; justify-content: space-between; padding: 1.5rem 2rem;
+ border-bottom: 1px solid var(--border-color); margin-bottom: 2rem;
+ background-color: var(--bg-secondary); flex-wrap: wrap; gap: 1rem; border-radius: 0 0 12px 12px;
+}
+.input-card, .output-card {
+ background-color: var(--bg-secondary); border: 1px solid var(--border-color);
+ border-radius: 12px; padding: 1.5rem; margin-bottom: 1rem; box-shadow: 0 4px 6px -1px rgba(0,0,0,0.1);
+}
+.header-left { display: flex; align-items: center; gap: 1.5rem; }
+.logo-icon {
+ height: 55px; width: auto; padding: 0; background-color: transparent;
+ border: none; box-shadow: none; display: flex; align-items: center; justify-content: center; flex-shrink: 0;
+}
+.logo-icon svg { height: 100%; width: auto; fill: var(--primary-btn); }
+.title-group { display: flex; flex-direction: column; }
+.main-title { font-size: 1.6rem; font-weight: 800; margin: 0; line-height: 1.1; color: var(--text-primary); letter-spacing: -0.5px; }
+.subtitle { font-size: 0.95rem; color: var(--text-secondary); margin: 0; font-weight: 400; }
+.section-title { font-weight: 600; color: var(--text-primary); margin-bottom: 1rem; }
+.header-right { flex: 1; display: flex; justify-content: flex-end; align-items: center; min-width: 250px; }
+.dev-note-container {
+ background-color: var(--accent-bg); border: 1px solid var(--border-color); border-radius: 16px;
+ width: 520px; height: 64px; display: flex; align-items: center; justify-content: flex-start; padding: 0 24px; gap: 1rem;
+}
+.dev-note-container:hover { border-color: var(--primary-btn); }
+.dev-icon { font-size: 1.4rem; background: transparent !important; border: none !important; display: flex; align-items: center; flex-shrink: 0; }
+.dev-text {
+ font-family: 'Courier New', monospace; font-size: 0.95rem; color: var(--text-secondary);
+ transition: opacity 1.5s ease; white-space: normal; line-height: 1.2; text-align: left;
+ display: -webkit-box; -webkit-line-clamp: 2; -webkit-box-orient: vertical; overflow: hidden;
+}
+.dev-text.hidden { opacity: 0; }
+.feedback-section { margin-top: 2rem; padding-top: 1.5rem; border-top: 1px dashed var(--border-color); }
+.feedback-title { font-size: 0.8rem; font-weight: 700; color: var(--text-secondary); text-transform: uppercase; margin-bottom: 0.8rem; }
+.gr-button-primary { background: var(--primary-btn) !important; border: none !important; color: white !important; }
+.gr-button-primary:hover { background: var(--primary-btn-hover) !important; }
+.gr-button-secondary { background: var(--bg-primary) !important; border: 1px solid var(--border-color) !important; color: var(--text-primary) !important; }
+.gr-box, .gr-input, .gr-dropdown { background: var(--bg-primary) !important; border-color: var(--border-color) !important; }
+#result-box textarea {
+ font-size: 1.25rem; font-weight: 700; text-align: center; color: var(--primary-btn);
+ background-color: transparent; border: none; overflow: hidden !important; resize: none; white-space: normal; line-height: 1.4;
+}
+"""
+
+# --- JAVASCRIPT ---
+JS_LOADER = """
+() => {
+ const notes = [
+ "Yes, even Pharo. Don’t ask why.",
+ "Is ‘deprecated’ significant? Asking for a friend.",
+ "Technical debt is just future-me's problem.",
+ "Comment first, code later. Obviously.",
+ "If it works, don't touch it.",
+ "Fixing bugs created by previous-me.",
+ "Legacy code: don't breathe on it.",
+ "Documentation is a love letter to your future self.",
+ "It works on my machine!",
+ "404: Motivation not found.",
+ "Compiling... please hold."
+ ];
+ let idx = 0;
+ function rotateNotes() {
+ const textEl = document.getElementById('dev-note-text');
+ if (!textEl) { setTimeout(rotateNotes, 500); return; }
+ textEl.classList.add('hidden');
+ setTimeout(() => {
+ idx = (idx + 1) % notes.length;
+ textEl.innerText = notes[idx];
+ textEl.classList.remove('hidden');
+ }, 1500);
+ }
+ setInterval(rotateNotes, 10000);
+}
+"""
+
+# --- UTILITIES ---
+def load_svg_content(filename="logo_header.svg"):
+ base_path = os.path.dirname(os.path.abspath(__file__))
+ target_path = os.path.join(base_path, "..", "..", "reports", "figures", filename)
+ target_path = os.path.normpath(target_path)
+
+ if os.path.exists(target_path):
+ with open(target_path, "r", encoding="utf-8") as f:
+ return f.read()
+ else:
+ print(f"[WARNING] Logo not found in: {target_path}")
+ return "CCC"
+
+def save_feedback_to_csv(text, language, predicted, suggested):
+ if not text:
+ return "No data."
+ try:
+ os.makedirs(os.path.dirname(FEEDBACK_FILE), exist_ok=True)
+ file_exists = os.path.isfile(FEEDBACK_FILE)
+ with open(FEEDBACK_FILE, mode='a', newline='', encoding='utf-8') as f:
+ writer = csv.writer(f)
+ if not file_exists:
+ writer.writerow(["Timestamp", "Input_Text", "Language", "Model_Prediction", "User_Correction"])
+
+ pred_label = predicted
+ if isinstance(predicted, dict):
+ pred_label = max(predicted, key=predicted.get) if predicted else "Unknown"
+
+ writer.writerow([
+ datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ text.strip(),
+ language,
+ pred_label,
+ suggested
+ ])
+ return "Feedback saved successfully!"
+ except Exception as e:
+ return f"Error saving feedback: {str(e)}"
+
+# --- SYNTAX VALIDATION LOGIC ---
+def is_valid_syntax(text: str, language: str) -> bool:
+ """
+ Validates if the text follows the basic comment syntax for the given language.
+ """
+ text = text.strip()
+ if not text:
+ return False
+
+ if language == "java":
+ # Supports: // comment OR /* comment */
+ return text.startswith("//") or (text.startswith("/*") and text.endswith("*/"))
+
+ elif language == "python":
+ # Supports: # comment OR """ docstring """ OR ''' docstring '''
+ return text.startswith("#") or \
+ (text.startswith('"""') and text.endswith('"""')) or \
+ (text.startswith("'''") and text.endswith("'''"))
+
+ elif language == "pharo":
+ # Supports: " comment "
+ return text.startswith('"') and text.endswith('"')
+
+ return True
+
+# --- MAIN DEMO ---
+def create_demo(inference_engine: ModelInference):
+
+ def classify_comment(text: str, language: str):
+ """
+ Calls the inference engine only if syntax is valid.
+ """
+ if not text:
+ return None
+
+ # SYNTAX CHECK
+ if not is_valid_syntax(text, language):
+ error_msg = "Error: Invalid Syntax."
+ if language == "java":
+ error_msg += " Java comments must start with '//' or be enclosed in '/* ... */'."
+ elif language == "python":
+ error_msg += " Python comments must start with '#' or use docstrings ('\"\"\"' / \"'''\")."
+ elif language == "pharo":
+ error_msg += " Pharo comments must be enclosed in double quotes (e.g., \"comment\")."
+ return error_msg
+
+ # INFERENCE
+ try:
+ _, labels, _, _ = inference_engine.predict_payload(
+ texts=[text],
+ language=language
+ )
+
+ if labels and len(labels) > 0:
+ first_prediction = labels[0][0]
+ if isinstance(first_prediction, (list, tuple)):
+ return first_prediction[0]
+ else:
+ return str(first_prediction)
+
+ return "Unknown: Low confidence."
+
+ except Exception as e:
+ print(f"Prediction Error: {e}")
+ return f"System Error: Failed to process request for '{language}'."
+
+ def update_dropdown(language):
+ choices = LABELS_MAP.get(language, [])
+ return gr.Dropdown(choices=choices, value=None, interactive=True)
+
+ def clear_all():
+ return (None, "java", "", gr.Dropdown(choices=LABELS_MAP["java"], value=None, interactive=True), "")
+
+ logo_svg = load_svg_content("logo_header.svg")
+
+ with gr.Blocks(title="Code Comment Classifier") as demo:
+ gr.HTML(f"")
+
+ # --- HEADER ---
+ gr.HTML(f"""
+
+ """)
+
+ with gr.Row():
+ with gr.Column():
+ gr.HTML('')
+ input_text = gr.Textbox(label="Code Comment", lines=8, show_label=False, placeholder="Enter code comment here...")
+ with gr.Row():
+ input_lang = gr.Dropdown(["java", "python", "pharo"], label="Language", value="java", scale=2)
+ submit_btn = gr.Button("⚡ Classify", variant="primary", scale=1)
+ clear_btn = gr.Button("🗑️ Clear All", variant="secondary", size="sm")
+
+ with gr.Column():
+ gr.HTML('')
+ output_tags = gr.Textbox(
+ label="Predicted Category",
+ show_label=False,
+ elem_id="result-box",
+ interactive=False,
+ lines=2
+ )
+
+            gr.HTML('<div class="feedback-title">🛠️ Help Improve the Model</div>')
+ with gr.Row():
+ correction_dropdown = gr.Dropdown(
+ choices=LABELS_MAP["java"],
+ label="Correct Label",
+ show_label=False,
+ container=False,
+ scale=3,
+ interactive=True
+ )
+ feedback_btn = gr.Button("📤 Save Feedback", variant="secondary", scale=1)
+ feedback_msg = gr.Markdown("", show_label=False)
+
+ gr.Examples(
+ examples=[
+ ["/** Validates the user session token. */", "java"],
+ ["# Retry logic for DB connection.", "python"],
+ ['"Manages the network connection lifecycle."', "pharo"]
+ ],
+ inputs=[input_text, input_lang],
+ label="Quick Examples"
+ )
+
+ input_lang.change(fn=update_dropdown, inputs=input_lang, outputs=correction_dropdown)
+ submit_btn.click(fn=classify_comment, inputs=[input_text, input_lang], outputs=[output_tags])
+ feedback_btn.click(fn=save_feedback_to_csv, inputs=[input_text, input_lang, output_tags, correction_dropdown], outputs=[feedback_msg])
+ clear_btn.click(fn=clear_all, inputs=None, outputs=[input_text, input_lang, output_tags, correction_dropdown, feedback_msg])
+
+ demo.load(None, js=JS_LOADER)
+
+ return demo
\ No newline at end of file
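A few sanity checks for the `is_valid_syntax` helper above (the comment strings are made-up examples; this assumes the package and its dependencies are importable):

```python
from turing.api.demo import is_valid_syntax

assert is_valid_syntax("// returns the user id", "java")
assert is_valid_syntax("/* legacy formatter, do not touch */", "java")
assert is_valid_syntax('"""Fetch rows from the cache."""', "python")
assert is_valid_syntax('"I represent a layout constraint."', "pharo")
assert not is_valid_syntax("plain text without comment markers", "java")
```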
diff --git a/turing/api/schemas.py b/turing/api/schemas.py
new file mode 100644
index 0000000000000000000000000000000000000000..eff7e9def4c6b4233624f8e81cd5a29a3e71898e
--- /dev/null
+++ b/turing/api/schemas.py
@@ -0,0 +1,22 @@
+from typing import Any, List
+
+from pydantic import BaseModel, Field
+
+
+# Input Schema
+class PredictionRequest(BaseModel):
+ texts: List[str] = Field(
+ ...,
+ description="List of code comments to classify",
+ example=["public void main", "def init self"],
+ )
+ language: str = Field(
+ ..., description="Programming language (java, python, pharo)", example="java"
+ )
+
+
+# Output Schema
+class PredictionResponse(BaseModel):
+ predictions: List[Any] = Field(..., description="List of predicted labels")
+ labels: List[Any] = Field(..., description="List of human-readable labels")
+ model_info: dict = Field(..., description="Metadata about the model used")
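To make the request/response contract concrete, here is an illustrative client call against the `/predict` endpoint mounted in `turing/api/app.py`; the URL, the sample texts, and the `requests` dependency are assumptions, not part of this diff:

```python
import requests

payload = {
    "texts": ["// For internal use only.", "/** Validates the user session token. */"],
    "language": "java",
}

# POST to the FastAPI service exposed on port 7860 by the dockerfile/docker-compose setup.
response = requests.post("http://localhost:7860/predict", json=payload, timeout=60)
response.raise_for_status()

body = response.json()
print(body["predictions"])   # raw model outputs
print(body["labels"])        # human-readable labels
print(body["model_info"])    # {"artifact": ..., "language": "java"}
```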
diff --git a/turing/config.py b/turing/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4dd4e2177fa9e11404e349039535bdd9fd11a7a
--- /dev/null
+++ b/turing/config.py
@@ -0,0 +1,95 @@
+from pathlib import Path
+
+from dotenv import load_dotenv
+from loguru import logger
+
+# Load environment variables from .env file if it exists
+load_dotenv()
+
+# Paths
+PROJ_ROOT = Path(__file__).resolve().parents[1]
+logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")
+
+DATA_DIR = PROJ_ROOT / "data"
+RAW_DATA_DIR = DATA_DIR / "raw"
+INTERIM_DATA_DIR = DATA_DIR / "interim"
+PROCESSED_DATA_DIR = DATA_DIR / "processed"
+EXTERNAL_DATA_DIR = DATA_DIR / "external"
+
+MODELS_DIR = PROJ_ROOT / "models"
+
+REPORTS_DIR = PROJ_ROOT / "reports"
+FIGURES_DIR = REPORTS_DIR / "figures"
+
+# Dataset
+DATASET_HF_ID = "NLBSE/nlbse26-code-comment-classification"
+LANGS = ["java", "python", "pharo"]
+INPUT_COLUMN = "combo"
+LABEL_COLUMN = "labels"
+
+LABELS_MAP = {
+ "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
+ "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
+ "pharo": [
+ "Keyimplementationpoints",
+ "Example",
+ "Responsibilities",
+ "Intent",
+ "Keymessages",
+ "Collaborators",
+ ],
+}
+
+TOTAL_CATEGORIES = sum(len(v) for v in LABELS_MAP.values())
+
+# Score parameters
+MAX_AVG_RUNTIME = 5.0 # seconds
+MAX_AVG_FLOPS = 5000.0 # GFLOPS
+
+# Training parameters
+DEFAULT_BATCH_SIZE = 32
+
+# Model configuration mapping
+MODEL_CONFIG = {
+ "codeberta": {
+ "model_name": "fine-tuned-CodeBERTa",
+ "exp_name": "fine-tuned-CodeBERTa",
+ "model_class_module": "turing.modeling.models.codeBerta",
+ "model_class_name": "CodeBERTa",
+ },
+ "graphcodebert": {
+ "model_name": "GraphCodeBERT",
+ "exp_name": "fine-tuned-GraphCodeBERT",
+ "model_class_module": "turing.modeling.models.graphCodeBert",
+ "model_class_name": "GraphCodeBERTClassifier",
+ },
+ "tinybert": {
+ "model_name": "TinyBERT",
+ "exp_name": "fine-tuned-TinyBERT",
+ "model_class_module": "turing.modeling.models.tinyBert",
+ "model_class_name": "TinyBERTClassifier",
+ },
+ "randomforest": {
+ "model_name": "RandomForest-TfIdf",
+ "exp_name": "RandomForest-TfIdf",
+ "model_class_module": "turing.modeling.models.randomForestTfIdf",
+ "model_class_name": "RandomForestTfIdf",
+ },
+}
+DEFAULT_NUM_ITERATIONS = 20
+
+# Existing model modules
+EXISTING_MODELS = [
+ "randomForestTfIdf",
+ "codeBerta",
+]
+
+# If tqdm is installed, configure loguru with tqdm.write
+# https://github.com/Delgan/loguru/issues/135
+try:
+ from tqdm import tqdm
+
+ logger.remove(0)
+ logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
+except (ModuleNotFoundError, ValueError):
+ pass
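A sketch of how the `MODEL_CONFIG` mapping can be consumed, resolving a registered model class by key via dynamic import. The helper name is hypothetical; only the dictionary fields come from `turing/config.py`:

```python
import importlib

from turing.config import MODEL_CONFIG


def resolve_model_class(key: str):
    """Return the class referenced by a MODEL_CONFIG entry (hypothetical helper)."""
    entry = MODEL_CONFIG[key.lower()]
    module = importlib.import_module(entry["model_class_module"])
    return getattr(module, entry["model_class_name"])


# e.g. look up the TF-IDF baseline registered under "randomforest"
model_cls = resolve_model_class("randomforest")
print(model_cls.__name__)  # RandomForestTfIdf
```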
diff --git a/turing/data_validation.py b/turing/data_validation.py
new file mode 100644
index 0000000000000000000000000000000000000000..538296889a77a0e771149f8993b738ae90c05ae0
--- /dev/null
+++ b/turing/data_validation.py
@@ -0,0 +1,271 @@
+from pathlib import Path
+import traceback
+from typing import List
+
+from deepchecks.tabular import Dataset, Suite
+from deepchecks.tabular.checks import (
+ ConflictingLabels,
+ DataDuplicates,
+ LabelDrift,
+ OutlierSampleDetection,
+ TrainTestSamplesMix,
+)
+import numpy as np
+import pandas as pd
+
+from turing.config import LABEL_COLUMN, LABELS_MAP
+
+try:
+ from deepchecks.nlp import TextData
+ from deepchecks.nlp.checks import (
+ PropertyDrift,
+ TextEmbeddingsDrift,
+ )
+
+ NLP_AVAILABLE = True
+except ImportError:
+ NLP_AVAILABLE = False
+
+
+def _encode_labels_for_validation(
+ series: pd.Series, class_names: List[str]
+) -> pd.Series:
+ def encode(lbl):
+ active_labels = []
+ for idx, is_active in enumerate(lbl):
+ if is_active:
+ if idx < len(class_names):
+ active_labels.append(class_names[idx])
+ else:
+ active_labels.append(f"Class_{idx}")
+ if not active_labels:
+ return "No_Label"
+ return " & ".join(active_labels)
+
+ return series.apply(encode)
+
+
+def _calculate_code_specific_properties(text_series: List[str]) -> pd.DataFrame:
+ props = []
+ for text in text_series:
+ s = str(text)
+ length = len(s)
+ non_alnum = sum(1 for c in s if not c.isalnum() and not c.isspace())
+ props.append(
+ {
+ "Text_Length": length,
+ "Symbol_Ratio": non_alnum / length if length > 0 else 0.0,
+ }
+ )
+ return pd.DataFrame(props)
+
+
+def _nuke_rogue_files():
+    """
+    Delete stray .npy files (e.g., cached embeddings) that the built-in
+    embedding calculation may leave in the working directory.
+    """
+    rogue_filenames = [
+        "embeddings.npy",
+    ]
+    for fname in rogue_filenames:
+        p = Path(fname)
+        if p.exists():
+            try:
+                p.unlink()
+            except Exception:
+                pass
+
+
+def run_custom_deepchecks(
+ df_train: pd.DataFrame,
+ df_test: pd.DataFrame,
+ output_dir: Path,
+ stage: str,
+ language: str,
+):
+ print(f" [Deepchecks] Running Integrity Suite ({stage})...")
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ class_names = LABELS_MAP.get(language, [])
+ cols = ["f_length", "f_word_count", "f_starts_verb", "text_hash"]
+
+ for c in cols:
+ if c not in df_train.columns:
+ df_train[c] = 0
+ if c not in df_test.columns:
+ df_test[c] = 0
+
+ train_ds_df = df_train[cols].copy()
+ train_ds_df["target"] = _encode_labels_for_validation(
+ df_train[LABEL_COLUMN], class_names
+ )
+ test_ds_df = df_test[cols].copy()
+ test_ds_df["target"] = _encode_labels_for_validation(
+ df_test[LABEL_COLUMN], class_names
+ )
+
+ cat_features = ["text_hash", "f_starts_verb"]
+ train_ds = Dataset(train_ds_df, label="target", cat_features=cat_features)
+ test_ds = Dataset(test_ds_df, label="target", cat_features=cat_features)
+
+ check_conflicts = ConflictingLabels(columns=["text_hash"])
+ if hasattr(check_conflicts, "add_condition_ratio_of_conflicting_labels_not_greater_than"):
+ check_conflicts.add_condition_ratio_of_conflicting_labels_not_greater_than(0)
+ else:
+ check_conflicts.add_condition_ratio_of_conflicting_labels_less_or_equal(0)
+
+ check_duplicates = DataDuplicates()
+ if hasattr(check_duplicates, "add_condition_ratio_not_greater_than"):
+ check_duplicates.add_condition_ratio_not_greater_than(0.05)
+ else:
+ check_duplicates.add_condition_ratio_less_or_equal(0.05)
+
+ check_leakage = TrainTestSamplesMix(columns=["text_hash"])
+ try:
+ if hasattr(check_leakage, "add_condition_ratio_not_greater_than"):
+ check_leakage.add_condition_ratio_not_greater_than(0)
+ except Exception:
+ pass
+
+ check_outliers = OutlierSampleDetection()
+ try:
+ if hasattr(check_outliers, "add_condition_outlier_ratio_less_or_equal"):
+ check_outliers.add_condition_outlier_ratio_less_or_equal(0.05)
+ except Exception:
+ pass
+
+ custom_suite = Suite(
+ "Code Quality & Integrity",
+ check_conflicts,
+ check_duplicates,
+ check_leakage,
+ LabelDrift(),
+ check_outliers,
+ )
+
+ try:
+ result = custom_suite.run(train_dataset=train_ds, test_dataset=test_ds)
+ report_path = output_dir / f"1_Integrity_{stage}.html"
+ result.save_as_html(str(report_path), as_widget=False)
+ print(f" [Deepchecks] Report Saved: {report_path}")
+ except Exception as e:
+ print(f" [Deepchecks] Error: {e}")
+ traceback.print_exc()
+
+
+def run_targeted_nlp_checks(
+ df_train: pd.DataFrame,
+ df_test: pd.DataFrame,
+ output_dir: Path,
+ stage: str,
+ language: str = "english",
+):
+ if not NLP_AVAILABLE:
+ print(" [Skip] NLP Suite skipped (libs not installed).")
+ return
+
+ from deepchecks.nlp import Suite as NLPSuite
+
+ print(f" [NLP Check] Running Semantic Analysis ({stage})...")
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Clean up any existing garbage before starting
+ _nuke_rogue_files()
+
+ DRIFT_THRESHOLD = 0.20
+ PROP_THRESHOLD = 0.35
+ SAMPLE_SIZE = 2000
+ df_tr = (
+ df_train.sample(n=SAMPLE_SIZE, random_state=42)
+ if len(df_train) > SAMPLE_SIZE
+ else df_train
+ )
+ df_te = (
+ df_test.sample(n=SAMPLE_SIZE, random_state=42)
+ if len(df_test) > SAMPLE_SIZE
+ else df_test
+ )
+
+ try: # START MAIN TRY BLOCK
+ y_tr = np.vstack(df_tr[LABEL_COLUMN].tolist())
+ y_te = np.vstack(df_te[LABEL_COLUMN].tolist())
+
+ train_ds = TextData(
+ df_tr["comment_sentence"].tolist(),
+ label=y_tr,
+ task_type="text_classification",
+ )
+ test_ds = TextData(
+ df_te["comment_sentence"].tolist(),
+ label=y_te,
+ task_type="text_classification",
+ )
+
+ print(" [NLP Check] Calculating custom code properties...")
+ train_props = _calculate_code_specific_properties(
+ df_tr["comment_sentence"].tolist()
+ )
+ test_props = _calculate_code_specific_properties(
+ df_te["comment_sentence"].tolist()
+ )
+
+ train_ds.set_properties(train_props)
+ test_ds.set_properties(test_props)
+
+ # In-memory calculation only.
+ train_ds.calculate_builtin_embeddings()
+ test_ds.calculate_builtin_embeddings()
+
+ check_embeddings = TextEmbeddingsDrift()
+ if hasattr(check_embeddings, "add_condition_drift_score_not_greater_than"):
+ check_embeddings.add_condition_drift_score_not_greater_than(DRIFT_THRESHOLD)
+ elif hasattr(check_embeddings, "add_condition_drift_score_less_than"):
+ check_embeddings.add_condition_drift_score_less_than(DRIFT_THRESHOLD)
+
+ check_len = PropertyDrift(custom_property_name="Text_Length")
+ if hasattr(check_len, "add_condition_drift_score_not_greater_than"):
+ check_len.add_condition_drift_score_not_greater_than(PROP_THRESHOLD)
+ elif hasattr(check_len, "add_condition_drift_score_less_than"):
+ check_len.add_condition_drift_score_less_than(PROP_THRESHOLD)
+
+ check_sym = PropertyDrift(custom_property_name="Symbol_Ratio")
+ if hasattr(check_sym, "add_condition_drift_score_not_greater_than"):
+ check_sym.add_condition_drift_score_not_greater_than(PROP_THRESHOLD)
+ elif hasattr(check_sym, "add_condition_drift_score_less_than"):
+ check_sym.add_condition_drift_score_less_than(PROP_THRESHOLD)
+
+ suite = NLPSuite(
+ "Code Comment Semantic Analysis",
+ check_embeddings,
+ check_len,
+ check_sym
+ )
+
+ res = suite.run(train_ds, test_ds)
+
+ report_path = output_dir / f"2_Semantic_{stage}.html"
+ res.save_as_html(str(report_path), as_widget=False)
+ print(f" [NLP Check] Report saved: {report_path}")
+
+ try:
+ passed = res.get_passed_checks()
+ n_passed = len(passed)
+ n_total = len(res.results)
+ print(f" [NLP Result] {n_passed}/{n_total} checks passed.")
+
+ if n_passed < n_total:
+ print(" [NLP Warning] Failed Checks details:")
+ for result in res.results:
+ if not result.passed_conditions():
+ print(f" - {result.check.name}: {result.conditions_results[0].details}")
+ except Exception:
+ pass
+
+    except Exception as e:
+        print(f"  [NLP Check] Failed: {e}")
+        traceback.print_exc()
+
+ finally:
+ _nuke_rogue_files()
\ No newline at end of file
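A minimal, illustrative invocation of the integrity suite above. The toy rows and output directory are made up, but the column names (`f_length`, `f_word_count`, `f_starts_verb`, `text_hash`, `labels`) follow what `run_custom_deepchecks` expects:

```python
from pathlib import Path

import pandas as pd

from turing.data_validation import run_custom_deepchecks

# Toy data only: three fake rows with one-hot label vectors matching LABELS_MAP["python"].
toy = pd.DataFrame(
    {
        "f_length": [42, 17, 63],
        "f_word_count": [7, 3, 11],
        "f_starts_verb": [1, 0, 1],
        "text_hash": ["a1b2", "c3d4", "e5f6"],
        "labels": [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0]],
    }
)

run_custom_deepchecks(
    df_train=toy,
    df_test=toy.copy(),
    output_dir=Path("reports/deepchecks_demo"),
    stage="demo",
    language="python",
)
```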
diff --git a/turing/dataset.py b/turing/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..97cd6efd816d5880790d43d918e193a4a4eb12ab
--- /dev/null
+++ b/turing/dataset.py
@@ -0,0 +1,210 @@
+import ast
+import os
+from pathlib import Path
+
+from datasets import DatasetDict, load_dataset
+from loguru import logger
+
+import turing.config as config
+
+
+class DatasetManager:
+ """
+ Manages the loading, transformation, and access of project datasets.
+ """
+
+ def __init__(self, dataset_path: Path = None):
+ self.hf_id = config.DATASET_HF_ID
+ self.raw_data_dir = config.RAW_DATA_DIR
+ self.interim_data_dir = config.INTERIM_DATA_DIR
+ self.base_interim_path = self.interim_data_dir / "base"
+
+ if dataset_path:
+ self.dataset_path = dataset_path
+ else:
+ self.dataset_path = self.base_interim_path
+
+ def _format_labels_for_csv(self, example: dict) -> dict:
+ """
+ Formats the labels list as a string for CSV storage.
+        (Private helper method.)
+
+ Args:
+ example (dict): A single example from the dataset.
+
+ Returns:
+ dict: The example with labels converted to string.
+ """
+ labels = example.get("labels")
+ if isinstance(labels, list):
+ example["labels"] = str(labels)
+ return example
+
+ def download_dataset(self):
+ """
+ Loads the dataset from Hugging Face and saves it into the "raw" folder.
+ """
+ logger.info(f"Loading dataset: {self.hf_id}")
+ try:
+ ds = load_dataset(self.hf_id)
+ logger.success("Dataset loaded successfully.")
+ logger.info(f"Dataset splits: {ds}")
+
+ self.raw_data_dir.mkdir(parents=True, exist_ok=True)
+
+ for split_name, dataset_split in ds.items():
+ output_path = os.path.join(
+ self.raw_data_dir, f"{split_name.replace('-', '_')}.parquet"
+ )
+ dataset_split.to_parquet(output_path)
+
+ logger.success(f"Dataset saved to {self.raw_data_dir}.")
+ except Exception as e:
+ logger.warning(f"Error during loading: {e}.")
+
+ def parquet_to_csv(self):
+ """
+ Converts all parquet files in the raw data directory
+ to CSV format in the interim data directory.
+ """
+ logger.info("Starting Parquet to CSV conversion...")
+ self.base_interim_path.mkdir(parents=True, exist_ok=True)
+
+ for file_name in os.listdir(self.raw_data_dir):
+ if file_name.endswith(".parquet"):
+ part_name = file_name.replace(".parquet", "").replace("-", "_")
+
+ # Load the parquet file
+ dataset = load_dataset(
+ "parquet", data_files={part_name: str(self.raw_data_dir / file_name)}
+ )
+
+ # Map and format labels
+ dataset[part_name] = dataset[part_name].map(self._format_labels_for_csv)
+
+ # Save to CSV
+ csv_output_path = os.path.join(self.base_interim_path, f"{part_name}.csv")
+ dataset[part_name].to_csv(csv_output_path)
+
+ logger.info(f"Converted {file_name} to {csv_output_path}")
+
+ logger.success("Parquet -> CSV conversion complete.")
+
+ def get_dataset_name(self) -> str:
+ """
+ Returns the name of the current dataset being used.
+
+ Returns:
+ str: The name of the dataset (e.g., 'clean-aug-soft-k5000').
+ """
+ return self.dataset_path.name
+
+ def get_dataset(self) -> DatasetDict:
+ """
+ Returns the processed dataset from the interim data directory
+ as a DatasetDict (loaded from CSVs).
+
+ Returns:
+ DatasetDict: The complete dataset with train and test splits for each language.
+ """
+
+ dataset_path = self.dataset_path
+
+ # Define the base filenames
+ data_files = {
+ "java_train": str(dataset_path / "java_train.csv"),
+ "java_test": str(dataset_path / "java_test.csv"),
+ "python_train": str(dataset_path / "python_train.csv"),
+ "python_test": str(dataset_path / "python_test.csv"),
+ "pharo_train": str(dataset_path / "pharo_train.csv"),
+ "pharo_test": str(dataset_path / "pharo_test.csv"),
+ }
+
+ # Verify file existence before loading
+ logger.info("Loading CSV dataset from splits...")
+ existing_data_files = {}
+ for key, path in data_files.items():
+ if not os.path.exists(path):
+ found = False
+ if os.path.exists(dataset_path):
+ for f in os.listdir(dataset_path):
+ if f.startswith(key) and f.endswith(".csv"):
+ existing_data_files[key] = str(dataset_path / f)
+ found = True
+ break
+ if not found:
+ logger.warning(f"File not found for split '{key}': {path}")
+ else:
+ existing_data_files[key] = path
+
+ if not existing_data_files:
+ logger.error("No dataset CSV files found. Run 'parquet-to-csv' first.")
+ raise FileNotFoundError("Dataset CSV files not found.")
+
+ logger.info(f"Found files: {list(existing_data_files.keys())}")
+
+ full_dataset = load_dataset("csv", data_files=existing_data_files)
+
+ logger.info("Formatting labels (from string back to list)...")
+ for split in full_dataset:
+ full_dataset[split] = full_dataset[split].map(
+ lambda x: {
+ "labels": ast.literal_eval(x["labels"])
+ if isinstance(x["labels"], str)
+ else x["labels"]
+ }
+ )
+
+ logger.success("Dataset is ready for use.")
+ return full_dataset
+
+ def get_raw_dataset_from_hf(self) -> DatasetDict:
+ """
+ Loads the raw dataset directly from Hugging Face without saving.
+
+ Returns:
+ DatasetDict: The raw dataset from Hugging Face.
+ """
+ logger.info(f"Loading raw dataset '{self.hf_id}' from Hugging Face...")
+ try:
+ ds = load_dataset(self.hf_id)
+ logger.success(f"Successfully loaded '{self.hf_id}'.")
+ return ds
+ except Exception as e:
+ logger.error(f"Failed to load dataset from Hugging Face: {e}")
+ return None
+
+ def search_file(self, file_name: str, search_directory: Path = None) -> list:
+ """
+ Recursively searches for a file by name within a specified data directory.
+
+ Args:
+ file_name (str): The name of the file to search for (e.g., "java_train.csv").
+ search_directory (Path, optional): The directory to search in.
+ Defaults to self.raw_data_dir.
+
+ Returns:
+ list: A list of Path objects for all found files.
+ """
+ if search_directory is None:
+ search_directory = self.raw_data_dir
+ logger.info(f"Defaulting search to raw data directory: {search_directory}")
+
+        if not search_directory.is_dir():
+            logger.error(f"Search directory not found: {search_directory}")
+            return []
+
+        logger.info(f"Searching for '{file_name}' in '{search_directory}'...")
+
+        found_files = []
+        for root, dirs, files in os.walk(search_directory):
+            for file in files:
+                if file == file_name:
+                    found_files.append(Path(root) / file)
+
+        if not found_files:
+            logger.warning(f"No files named '{file_name}' found in '{search_directory}'.")
+        else:
+            logger.success(f"Found {len(found_files)} matching file(s).")
+
+        return found_files
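The typical end-to-end flow of `DatasetManager`, mirroring the CLI commands in `turing/CLI_runner/run_dataset.py` (shown as a plain-Python sketch; the printed record is illustrative):

```python
from turing.dataset import DatasetManager

manager = DatasetManager()

manager.download_dataset()    # Hugging Face -> data/raw/*.parquet
manager.parquet_to_csv()      # data/raw/*.parquet -> data/interim/base/*.csv

dataset = manager.get_dataset()    # DatasetDict keyed by "<lang>_train" / "<lang>_test"
print(dataset["java_train"][0])    # labels are parsed back into Python lists
```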
diff --git a/turing/evaluate_model.py b/turing/evaluate_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e41e59169ed44b57b7c06b29233d82d65722ce5
--- /dev/null
+++ b/turing/evaluate_model.py
@@ -0,0 +1,121 @@
+import time
+
+from datasets import DatasetDict
+from loguru import logger
+import numpy as np
+import pandas as pd
+import torch
+
+import turing.config as config
+
+
+def calculate_submission_score(avg_f1: float, avg_runtime: float, avg_flops: float) -> float:
+ """
+ Calculates the final competition score.
+ The score is a weighted sum of F1 score, runtime, and GFLOPS.
+ Weights:
+ - F1 Score: 60%
+ - Runtime: 20%
+ - GFLOPS: 20%
+
+ Args:
+ avg_f1 (float): Average F1 score across all categories.
+ avg_runtime (float): Average runtime in seconds.
+ avg_flops (float): Average GFLOPS.
+
+ Returns:
+ float: Final submission score.
+ """
+
+ score_f1 = 0.6 * avg_f1
+
+ runtime_ratio = (config.MAX_AVG_RUNTIME - avg_runtime) / config.MAX_AVG_RUNTIME
+ score_runtime = 0.2 * max(runtime_ratio, 0)
+
+ flops_ratio = (config.MAX_AVG_FLOPS - avg_flops) / config.MAX_AVG_FLOPS
+ score_flops = 0.2 * max(flops_ratio, 0)
+
+ total_score = score_f1 + score_runtime + score_flops
+
+ logger.info(f" F1 Score (60%): {score_f1:.4f} (avg_f1: {avg_f1:.4f})")
+ logger.info(
+ f" Runtime Score (20%): {score_runtime:.4f} (avg_runtime: {avg_runtime:.4f}s / {config.MAX_AVG_RUNTIME}s)"
+ )
+ logger.info(
+ f" GFLOPS Score (20%): {score_flops:.4f} (avg_flops: {avg_flops:.4f} / {config.MAX_AVG_FLOPS})"
+ )
+ logger.info(" ====================")
+ logger.info(f" Final Score: {total_score:.4f}")
+
+ return total_score
+
+
+def evaluate_models(models: dict, dataset: DatasetDict):
+ """
+ Evaluates the provided models on the test datasets for each language.
+ Computes precision, recall, and F1 score for each category and language.
+ Also measures average runtime and GFLOPS for model inference.
+
+ Args:
+ models (dict): A dictionary mapping language codes to their respective models.
+ dataset (DatasetDict): A DatasetDict containing test datasets for each language.
+
+ Returns:
+ pd.DataFrame: DataFrame containing precision, recall, and F1 scores for each category and language.
+        float: Final submission score calculated based on average F1, runtime, and GFLOPS.
+ """
+
+ total_flops = 0
+ total_time = 0
+ scores = []
+
+ for lan in config.LANGS:
+ logger.info(f"\n--- Evaluating Language: {lan.upper()} ---")
+ model = models[lan]
+
+ with torch.profiler.profile(with_flops=True) as p:
+ test_data = dataset[f"{lan}_test"]
+ x = test_data[config.INPUT_COLUMN]
+ x = list(x) if hasattr(x, 'tolist') else x # Convert pandas Series to list
+ y_true = np.array(test_data[config.LABEL_COLUMN]).T
+
+ begin = time.time()
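+            # Repeat inference 10 times so runtime and FLOPs can be averaged over runs (the /10 below)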
+ for i in range(10):
+ y_pred = model.predict(x)
+ y_pred = np.asarray(y_pred).T
+ total = time.time() - begin
+ total_time = total_time + total
+
+ total_flops = total_flops + (sum(k.flops for k in p.key_averages()) / 1e9)
+
+ for i in range(len(y_pred)):
+ assert len(y_pred[i]) == len(y_true[i])
+ tp = sum([true == pred == 1 for (true, pred) in zip(y_true[i], y_pred[i])])
+ #tn = sum([true == pred == 0 for (true, pred) in zip(y_true[i], y_pred[i])])
+ fp = sum([true == 0 and pred == 1 for (true, pred) in zip(y_true[i], y_pred[i])])
+ fn = sum([true == 1 and pred == 0 for (true, pred) in zip(y_true[i], y_pred[i])])
+            # Guard against zero division when a category has no positive predictions or labels
+            precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+            f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0.0
+ scores.append({
+ "lan": lan,
+ "cat": config.LABELS_MAP[lan][i],
+ "precision": precision,
+ "recall": recall,
+ "f1": f1,
+ })
+
+ logger.info(f"Compute in GFLOPs: {total_flops / 10}")
+ logger.info(f"Avg runtime in seconds: {total_time / 10}")
+ scores = pd.DataFrame(scores)
+ print(scores)
+
+ avg_f1 = scores["f1"].mean()
+ avg_runtime = total_time / 10
+ avg_flops = total_flops / 10
+
+ final_score = calculate_submission_score(avg_f1, avg_runtime, avg_flops)
+
+    logger.info(f"Final Score across all languages: {final_score:.4f}")
+
+ return scores, final_score
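
A quick sanity check for the weighting in `calculate_submission_score`: the sketch below reproduces the formula with hypothetical budget values standing in for `config.MAX_AVG_RUNTIME` and `config.MAX_AVG_FLOPS` (the real values are defined in `turing/config.py` and may differ).

```python
# Minimal sketch of the submission score, assuming hypothetical budgets.
MAX_AVG_RUNTIME = 5.0   # assumed runtime budget (seconds)
MAX_AVG_FLOPS = 10.0    # assumed compute budget (GFLOPs)


def submission_score(avg_f1: float, avg_runtime: float, avg_flops: float) -> float:
    score_f1 = 0.6 * avg_f1
    score_runtime = 0.2 * max((MAX_AVG_RUNTIME - avg_runtime) / MAX_AVG_RUNTIME, 0)
    score_flops = 0.2 * max((MAX_AVG_FLOPS - avg_flops) / MAX_AVG_FLOPS, 0)
    return score_f1 + score_runtime + score_flops


# avg_f1 = 0.80, avg_runtime = 2.5 s, avg_flops = 4 GFLOPs
# -> 0.6*0.80 + 0.2*0.50 + 0.2*0.60 = 0.48 + 0.10 + 0.12 = 0.70
print(submission_score(0.80, 2.5, 4.0))
```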
diff --git a/turing/features.py b/turing/features.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1b350f3e1156dc7394b0725189f05094616488b
--- /dev/null
+++ b/turing/features.py
@@ -0,0 +1,678 @@
+import ast
+import hashlib
+from pathlib import Path
+import random
+import re
+from typing import List, Tuple
+
+import nltk
+from nltk.corpus import stopwords, wordnet
+from nltk.stem import PorterStemmer, WordNetLemmatizer
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_selection import SelectKBest, chi2
+import typer
+
+from turing.config import (
+ INTERIM_DATA_DIR,
+ LABEL_COLUMN,
+ LANGS,
+)
+from turing.data_validation import run_custom_deepchecks, run_targeted_nlp_checks
+from turing.dataset import DatasetManager
+
+# --- NLTK Resource Check ---
+# Each resource is looked up under its actual NLTK data path; missing ones are downloaded quietly.
+REQUIRED_NLTK_PACKAGES = {
+    "stopwords": "corpora/stopwords",
+    "wordnet": "corpora/wordnet",
+    "omw-1.4": "corpora/omw-1.4",
+    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
+    "punkt": "tokenizers/punkt",
+}
+for package, resource_path in REQUIRED_NLTK_PACKAGES.items():
+    try:
+        nltk.data.find(resource_path)
+    except LookupError:
+        try:
+            nltk.download(package, quiet=True)
+        except Exception:
+            pass
+
+app = typer.Typer()
+
+
+# --- CONFIGURATION CLASS ---
+class FeaturePipelineConfig:
+ """
+ Configuration holder for the pipeline. Generates a unique ID based on parameters
+ to version the output directories.
+ """
+
+ def __init__(
+ self,
+ use_stopwords: bool,
+ use_lemmatization: bool,
+ use_combo_feature: bool,
+ max_features: int,
+ min_comment_length: int,
+ max_comment_length: int,
+ enable_augmentation: bool,
+ custom_tags: str = "base",
+ ):
+ self.use_stopwords = use_stopwords
+ self.use_lemmatization = use_lemmatization
+ self.use_combo_feature = use_combo_feature
+ self.max_features = max_features
+ self.min_comment_length = min_comment_length
+ self.max_comment_length = max_comment_length
+ self.enable_augmentation = enable_augmentation
+ self.custom_tags = custom_tags
+ self.hash_id = self._generate_readable_id()
+
+ def _generate_readable_id(self) -> str:
+ tags = ["clean"]
+ if self.enable_augmentation:
+ tags.append("aug-soft")
+ tags.append(f"k{self.max_features}")
+ if self.custom_tags != "base":
+ tags.append(self.custom_tags)
+ return "-".join(tags)
+
+
+# --- TEXT UTILITIES ---
+class TextCanonicalizer:
+ """
+    Reduces text to a 'canonical' form (lowercase, stemmed, stopwords removed)
+    to detect semantic duplicates.
+    Preserves Javadoc tags to distinguish usage (@return) from summary (Returns).
+ """
+
+ def __init__(self):
+ self.stemmer = PorterStemmer()
+ self.stop_words = set(stopwords.words("english"))
+ # Code keywords are preserved as they carry semantic weight
+ self.code_keywords = {
+ "return",
+ "true",
+ "false",
+ "null",
+ "if",
+ "else",
+ "void",
+ "int",
+ "boolean",
+ "param",
+ "throws",
+ "exception",
+ }
+
+ def to_canonical(self, text: str) -> str:
+ if pd.isna(text):
+ return ""
+ text = str(text).lower()
+ text = re.sub(r"[^a-z0-9\s@]", " ", text)
+
+ words = text.split()
+ canonical_words = []
+
+ for w in words:
+ # If the word starts with @ (e.g., @return), keep it as is
+ if w.startswith("@"):
+ canonical_words.append(w)
+ continue
+
+ if w in self.stop_words and w not in self.code_keywords:
+ continue
+
+ stemmed = self.stemmer.stem(w)
+ canonical_words.append(stemmed)
+
+ return " ".join(canonical_words).strip()
+
+
+class TextProcessor:
+ """
+ Standard text cleaning logic for final feature extraction (TF-IDF).
+ """
+
+ def __init__(self, config: FeaturePipelineConfig, language: str = "english"):
+ self.config = config
+ self.stop_words = set(stopwords.words(language))
+ self.lemmatizer = WordNetLemmatizer()
+
+ def clean_text(self, text: str) -> str:
+ if pd.isna(text):
+ return ""
+ text = str(text).lower()
+ # Remove heavy code markers but keep text structure
+ text = re.sub(r"(^\s*//+|^\s*/\*+|\*/$)", "", text)
+ # Keep only alpha characters for NLP model (plus pipe for combo)
+ text = re.sub(r"[^a-z\s|]", " ", text)
+ tokens = text.split()
+ if self.config.use_stopwords:
+ tokens = [w for w in tokens if w not in self.stop_words]
+ if self.config.use_lemmatization:
+ tokens = [self.lemmatizer.lemmatize(w) for w in tokens]
+ return " ".join(tokens)
+
+
+# --- AUGMENTATION ---
+class SafeAugmenter:
+ """
+ protects reserved keywords from synonym replacement.
+ """
+
+ def __init__(self, aug_prob=0.3):
+ self.aug_prob = aug_prob
+ self.protected_words = {
+ "return",
+ "public",
+ "private",
+ "void",
+ "class",
+ "static",
+ "final",
+ "if",
+ "else",
+ "for",
+ "while",
+ "try",
+ "catch",
+ "import",
+ "package",
+ "null",
+ "true",
+ "false",
+ "self",
+ "def",
+ "todo",
+ "fixme",
+ "param",
+ "throw",
+ }
+
+ def get_synonyms(self, word):
+ synonyms = set()
+ for syn in wordnet.synsets(word):
+ for lemma in syn.lemmas():
+ name = lemma.name().replace("_", " ")
+ if name.isalpha() and name.lower() != word.lower():
+ synonyms.add(name)
+ return list(synonyms)
+
+ def augment(self, text: str) -> str:
+ if pd.isna(text) or not text:
+ return ""
+ words = text.split()
+ if len(words) < 2:
+ return text
+ new_words = []
+ for word in words:
+ word_lower = word.lower()
+
+ if word_lower in self.protected_words:
+ new_words.append(word)
+ continue
+
+ # Random Case Injection (Noise)
+ if random.random() < 0.1:
+ if word[0].isupper():
+ new_words.append(word.lower())
+ else:
+ new_words.append(word.capitalize())
+ continue
+
+ # Synonym Replacement
+ if random.random() < self.aug_prob and len(word) > 3:
+ syns = self.get_synonyms(word_lower)
+ if syns:
+ replacement = random.choice(syns)
+ if word[0].isupper():
+ replacement = replacement.capitalize()
+ new_words.append(replacement)
+ else:
+ new_words.append(word)
+ else:
+ new_words.append(word)
+ return " ".join(new_words)
+
+ def apply_balancing(
+ self, df: pd.DataFrame, min_samples: int = 100
+ ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+ """
+ Generates synthetic data for minority classes.
+ Returns: (Balanced DataFrame, Report DataFrame)
+ """
+ df["temp_label_str"] = df[LABEL_COLUMN].astype(str)
+ counts = df["temp_label_str"].value_counts()
+ print(
+ f"\n [Balance Check - PRE] Min class size: {counts.min()} | Max: {counts.max()}"
+ )
+
+ existing_sentences = set(df["comment_sentence"].str.strip())
+ new_rows = []
+ report_rows = []
+
+ for label_str, count in counts.items():
+ if count < min_samples:
+ needed = min_samples - count
+ class_subset = df[df["temp_label_str"] == label_str]
+ if class_subset.empty:
+ continue
+
+ samples = class_subset["comment_sentence"].tolist()
+ orig_label = class_subset[LABEL_COLUMN].iloc[0]
+
+ # Propagate 'combo' if present
+ orig_combo = None
+ if "combo" in class_subset.columns:
+ orig_combo = class_subset["combo"].iloc[0]
+
+ generated = 0
+ attempts = 0
+ # Cap attempts to avoid infinite loops if vocabulary is too small
+ while generated < needed and attempts < needed * 5:
+ attempts += 1
+ src = random.choice(samples)
+ aug_txt = self.augment(src).strip()
+
+ # Ensure Global Uniqueness
+ if aug_txt and aug_txt not in existing_sentences:
+ row = {
+ "comment_sentence": aug_txt,
+ LABEL_COLUMN: orig_label,
+ "partition": "train_aug",
+ "index": -1, # Placeholder
+ }
+ if orig_combo:
+ row["combo"] = orig_combo
+
+ new_rows.append(row)
+ report_rows.append(
+ {
+ "original_text": src,
+ "augmented_text": aug_txt,
+ "label": label_str,
+ "reason": f"Class has {count} samples (Target {min_samples})",
+ }
+ )
+ existing_sentences.add(aug_txt)
+ generated += 1
+
+ df = df.drop(columns=["temp_label_str"])
+ df_report = pd.DataFrame(report_rows)
+
+ if new_rows:
+ augmented_df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
+ augmented_df["index"] = range(len(augmented_df))
+
+ temp_counts = augmented_df[LABEL_COLUMN].astype(str).value_counts()
+ print(
+ f" [Balance Check - POST] Min class size: {temp_counts.min()} | Max: {temp_counts.max()}"
+ )
+ return augmented_df, df_report
+
+ return df, df_report
+
+
+# --- CLEANING LOGIC ---
+def clean_training_data_smart(
+ df: pd.DataFrame, min_len: int, max_len: int, language: str = "english"
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
+ """
+ Performs 'Smart Cleaning' on the Training Set with language-specific heuristics.
+ """
+ canon = TextCanonicalizer()
+ dropped_rows = []
+
+ print(f" [Clean] Computing heuristics (Language: {language})...")
+ df["canon_key"] = df["comment_sentence"].apply(canon.to_canonical)
+
+ # 1. Token Length Filter
+ def count_code_tokens(text):
+ return len([t for t in re.split(r"[^a-zA-Z0-9]+", str(text)) if t])
+
+ df["temp_token_len"] = df["comment_sentence"].apply(count_code_tokens)
+
+
+ MIN_ALPHA_CHARS = 6
+ MAX_SYMBOL_RATIO = 0.50
+
+    # 2. Heuristic Filters (Tiny/Huge/Code)
+    def get_heuristics(text):
+        """Return (is_tiny, is_huge, symbol_ratio) for a comment string."""
+        s = str(text).strip()
+        char_len = len(s)
+        if char_len == 0:
+            # Empty comments are flagged as tiny and fully symbolic so they get dropped
+            return True, False, 1.0
+
+        alpha_len = sum(1 for c in s if c.isalpha())
+        non_alnum_chars = sum(1 for c in s if not c.isalnum() and not c.isspace())
+        symbol_ratio = non_alnum_chars / char_len
+
+        is_tiny = alpha_len < MIN_ALPHA_CHARS
+        is_huge = char_len > 800
+
+        return is_tiny, is_huge, symbol_ratio
+
+    heuristics = df["comment_sentence"].apply(get_heuristics)
+    df["is_tiny"] = [x[0] for x in heuristics]
+    df["is_huge"] = [x[1] for x in heuristics]
+    df["symbol_ratio"] = [x[2] for x in heuristics]
+
+    df["is_code"] = df["symbol_ratio"] > MAX_SYMBOL_RATIO
+
+ mask_keep = (
+ (df["temp_token_len"] >= min_len)
+ & (df["temp_token_len"] <= max_len)
+ & (~df["is_tiny"])
+ & (~df["is_huge"])
+ & (~df["is_code"])
+ )
+
+ df_dropped_qual = df[~mask_keep].copy()
+ if not df_dropped_qual.empty:
+ def reason(row):
+ if row["is_tiny"]:
+ return f"Too Tiny (<{MIN_ALPHA_CHARS} alpha)"
+ if row["is_huge"]:
+ return "Too Huge (>800 chars)"
+ if row["is_code"]:
+ return f"Pure Code (>{int(MAX_SYMBOL_RATIO*100)}% symbols)"
+ return f"Token Count ({row['temp_token_len']})"
+
+ df_dropped_qual["drop_reason"] = df_dropped_qual.apply(reason, axis=1)
+ dropped_rows.append(df_dropped_qual)
+
+ df = df[mask_keep].copy()
+
+ # 3. Semantic Conflicts (Ambiguity)
+ df["label_s"] = df[LABEL_COLUMN].astype(str)
+ conflict_counts = df.groupby("canon_key")["label_s"].nunique()
+ conflicting_keys = conflict_counts[conflict_counts > 1].index
+
+ mask_conflicts = df["canon_key"].isin(conflicting_keys)
+ df_dropped_conflicts = df[mask_conflicts].copy()
+ if not df_dropped_conflicts.empty:
+ df_dropped_conflicts["drop_reason"] = "Semantic Conflict"
+ dropped_rows.append(df_dropped_conflicts)
+
+ df = df[~mask_conflicts].copy()
+
+ # 4. Exact Duplicates
+ mask_dupes = df.duplicated(subset=["comment_sentence"], keep="first")
+ df_dropped_dupes = df[mask_dupes].copy()
+ if not df_dropped_dupes.empty:
+ df_dropped_dupes["drop_reason"] = "Exact Duplicate"
+ dropped_rows.append(df_dropped_dupes)
+
+ df = df[~mask_dupes].copy()
+
+ # Cleanup columns
+ cols_to_drop = [
+ "canon_key",
+ "label_s",
+ "temp_token_len",
+ "is_tiny",
+ "is_huge",
+ "is_code",
+ "symbol_ratio"
+ ]
+ df = df.drop(columns=cols_to_drop, errors="ignore")
+
+ if dropped_rows:
+ df_report = pd.concat(dropped_rows, ignore_index=True)
+ cols_rep = ["index", "comment_sentence", LABEL_COLUMN, "drop_reason"]
+ final_cols = [c for c in cols_rep if c in df_report.columns]
+ df_report = df_report[final_cols]
+ else:
+ df_report = pd.DataFrame(columns=["index", "comment_sentence", "drop_reason"])
+
+ print(f" [Clean] Removed {len(df_report)} rows. Final: {len(df)}.")
+ return df, df_report
+
+# --- FEATURE ENGINEERING ---
+class FeatureEngineer:
+ def __init__(self, config: FeaturePipelineConfig):
+ self.config = config
+ self.processor = TextProcessor(config=config)
+ self.tfidf_vectorizer = TfidfVectorizer(max_features=config.max_features)
+
+ def extract_features_for_check(self, df: pd.DataFrame) -> pd.DataFrame:
+ """Extracts metadata features for analysis."""
+
+ def analyze(text):
+ s = str(text)
+ words = s.split()
+ n_words = len(words)
+ if n_words == 0:
+ return 0, 0, 0
+ first_word = words[0].lower()
+ starts_verb = (
+ 1
+ if first_word.endswith("s")
+ or first_word.startswith("get")
+ or first_word.startswith("set")
+ else 0
+ )
+ return (len(s), n_words, starts_verb)
+
+ metrics = df["comment_sentence"].apply(analyze)
+ df["f_length"] = [x[0] for x in metrics]
+ df["f_word_count"] = [x[1] for x in metrics]
+ df["f_starts_verb"] = [x[2] for x in metrics]
+ # Calculate MD5 hash for efficient exact duplicate detection in Deepchecks
+ df["text_hash"] = df["comment_sentence"].apply(
+ lambda x: hashlib.md5(str(x).encode()).hexdigest()
+ )
+ return df
+
+ def vectorize_and_select(self, df_train, df_test):
+ def clean_fn(x):
+ return re.sub(r"[^a-zA-Z\s]", "", str(x).lower())
+
+ X_train = self.tfidf_vectorizer.fit_transform(
+ df_train["comment_sentence"].apply(clean_fn)
+ )
+ y_train = np.stack(df_train[LABEL_COLUMN].values)
+
+ # Handling multi-label for Chi2 (using sum or max)
+ y_train_sum = (
+ y_train.sum(axis=1) if len(y_train.shape) > 1 else y_train
+ )
+ selector = SelectKBest(
+ chi2, k=min(self.config.max_features, X_train.shape[1])
+ )
+ X_train = selector.fit_transform(X_train, y_train_sum)
+
+ X_test = self.tfidf_vectorizer.transform(
+ df_test["comment_sentence"].apply(clean_fn)
+ )
+ X_test = selector.transform(X_test)
+
+ vocab = [
+ self.tfidf_vectorizer.get_feature_names_out()[i]
+ for i in selector.get_support(indices=True)
+ ]
+ return X_train, X_test, vocab
+
+
+# --- MAIN EXECUTION ---
+def main(
+ feature_dir: Path = typer.Option(
+ INTERIM_DATA_DIR / "features", help="Output dir."
+ ),
+ reports_root: Path = typer.Option(
+ Path("reports/data"), help="Reports root."
+ ),
+ max_features: int = typer.Option(5000),
+ min_comment_length: int = typer.Option(
+        2, help="Remove comments with fewer tokens than this."
+ ),
+ max_comment_length: int = typer.Option(300),
+ augment: bool = typer.Option(False, "--augment", help="Enable augmentation."),
+ balance_threshold: int = typer.Option(100, help="Min samples per class."),
+ run_vectorization: bool = typer.Option(False, "--run-vectorization"),
+ run_nlp_check: bool = typer.Option(
+ True, "--run-nlp", help="Run Deepchecks NLP suite."
+ ),
+ custom_tags: str = typer.Option("base", help="Custom tags."),
+ save_full_csv: bool = typer.Option(False, "--save-full-csv"),
+ languages: List[str] = typer.Option(LANGS, show_default=False),
+):
+
+    config = FeaturePipelineConfig(
+        use_stopwords=True,
+        use_lemmatization=True,
+        use_combo_feature=True,
+        max_features=max_features,
+        min_comment_length=min_comment_length,
+        max_comment_length=max_comment_length,
+        enable_augmentation=augment,
+        custom_tags=custom_tags,
+    )
+ print(f"=== Pipeline ID: {config.hash_id} ===")
+
+ dm = DatasetManager()
+ full_dataset = dm.get_dataset()
+ fe = FeatureEngineer(config)
+ augmenter = SafeAugmenter()
+
+ feat_output_dir = feature_dir / config.hash_id
+ feat_output_dir.mkdir(parents=True, exist_ok=True)
+ report_output_dir = reports_root / config.hash_id
+
+ for lang in languages:
+ print(f"\n{'='*30}\nPROCESSING LANGUAGE: {lang.upper()}\n{'='*30}")
+ df_train = full_dataset[f"{lang}_train"].to_pandas()
+ df_test = full_dataset[f"{lang}_test"].to_pandas()
+
+ # Standardize Label Format
+ for df in [df_train, df_test]:
+ if isinstance(df[LABEL_COLUMN].iloc[0], str):
+ df[LABEL_COLUMN] = (
+ df[LABEL_COLUMN]
+ .str.replace(r"\s+", ", ", regex=True)
+ .apply(ast.literal_eval)
+ )
+
+ lang_report_dir = report_output_dir / lang
+
+ # 1. RAW AUDIT
+ print(" >>> Phase 1: Auditing RAW Data")
+ df_train_raw = fe.extract_features_for_check(df_train.copy())
+ df_test_raw = fe.extract_features_for_check(df_test.copy())
+ run_custom_deepchecks(
+ df_train_raw, df_test_raw, lang_report_dir, "raw", lang
+ )
+ if run_nlp_check:
+ run_targeted_nlp_checks(
+ df_train_raw, df_test_raw, lang_report_dir, "raw"
+ )
+
+ # 2. CLEANING & AUGMENTATION
+ print("\n >>> Phase 2: Smart Cleaning & Augmentation")
+ df_train, df_dropped = clean_training_data_smart(
+ df_train, min_comment_length, max_comment_length, language=lang
+ )
+
+ if not df_dropped.empty:
+ dropped_path = lang_report_dir / "dropped_rows.csv"
+ df_dropped.to_csv(dropped_path, index=False)
+ print(f" [Report] Dropped rows details saved to: {dropped_path}")
+
+ if augment:
+ print(" [Augment] Applying Soft Balancing...")
+ df_train, df_aug_report = augmenter.apply_balancing(
+ df_train, min_samples=balance_threshold
+ )
+
+ if not df_aug_report.empty:
+ aug_path = lang_report_dir / "augmentation_report.csv"
+ df_aug_report.to_csv(aug_path, index=False)
+ print(
+ f" [Report] Augmentation details saved to: {aug_path}"
+ )
+
+ # 3. PROCESSED AUDIT
+ print("\n >>> Phase 3: Auditing PROCESSED Data")
+ df_train = fe.extract_features_for_check(df_train)
+ df_test = fe.extract_features_for_check(df_test)
+ run_custom_deepchecks(
+ df_train, df_test, lang_report_dir, "processed", lang
+ )
+ if run_nlp_check:
+ run_targeted_nlp_checks(
+ df_train, df_test, lang_report_dir, "processed"
+ )
+
+ # 4. FINAL PROCESSING & SAVING
+ print("\n >>> Phase 4: Final Processing & Save")
+ df_train["comment_clean"] = df_train["comment_sentence"].apply(
+ fe.processor.clean_text
+ )
+ df_test["comment_clean"] = df_test["comment_sentence"].apply(
+ fe.processor.clean_text
+ )
+
+ if config.use_combo_feature:
+ if "combo" in df_train.columns:
+ df_train["combo_clean"] = df_train["combo"].apply(
+ fe.processor.clean_text
+ )
+ if "combo" in df_test.columns:
+ df_test["combo_clean"] = df_test["combo"].apply(
+ fe.processor.clean_text
+ )
+
+ X_train, X_test, vocab = None, None, []
+ if run_vectorization:
+ print(" [Vectorization] TF-IDF & Chi2...")
+ X_train, X_test, vocab = fe.vectorize_and_select(df_train, df_test)
+
+        def format_label_robust(lbl):
+ if hasattr(lbl, "tolist"): # Check if numpy array
+ lbl = lbl.tolist()
+ return str(lbl)
+
+ df_train[LABEL_COLUMN] = df_train[LABEL_COLUMN].apply(format_label_robust)
+ df_test[LABEL_COLUMN] = df_test[LABEL_COLUMN].apply(format_label_robust)
+
+ cols_to_save = [
+ "index",
+ LABEL_COLUMN,
+ "comment_sentence",
+ "comment_clean",
+ ]
+ if "combo" in df_train.columns:
+ cols_to_save.append("combo")
+ if "combo_clean" in df_train.columns:
+ cols_to_save.append("combo_clean")
+ meta_cols = [c for c in df_train.columns if c.startswith("f_")]
+ cols_to_save.extend(meta_cols)
+
+ print(f" [Save] Columns: {cols_to_save}")
+ df_train[cols_to_save].to_csv(
+ feat_output_dir / f"{lang}_train.csv", index=False
+ )
+ df_test[cols_to_save].to_csv(
+ feat_output_dir / f"{lang}_test.csv", index=False
+ )
+
+ if run_vectorization and X_train is not None:
+ from scipy.sparse import save_npz
+
+ save_npz(feat_output_dir / f"{lang}_train_tfidf.npz", X_train)
+ save_npz(feat_output_dir / f"{lang}_test_tfidf.npz", X_test)
+ with open(
+ feat_output_dir / f"{lang}_vocab.txt", "w", encoding="utf-8"
+ ) as f:
+ f.write("\n".join(vocab))
+
+ print(f"\nAll Done. Reports in: {report_output_dir}")
+
+
+if __name__ == "__main__":
+ typer.run(main)
\ No newline at end of file
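
To make the versioning and deduplication behavior above concrete, here is a small standalone sketch; the inputs are made-up comments and the printed ID assumes the default "base" custom tag.

```python
# Illustrative use of FeaturePipelineConfig IDs and the semantic-duplicate key.
from turing.features import FeaturePipelineConfig, TextCanonicalizer

config = FeaturePipelineConfig(
    use_stopwords=True,
    use_lemmatization=True,
    use_combo_feature=True,
    max_features=5000,
    min_comment_length=2,
    max_comment_length=300,
    enable_augmentation=True,
)
print(config.hash_id)  # "clean-aug-soft-k5000"

canon = TextCanonicalizer()
a = canon.to_canonical("Returns the number of items in the list.")
b = canon.to_canonical("returns number of items in a list")
print(a == b)  # True -> both comments collapse to the same stemmed key
```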
diff --git a/turing/modeling/__init__.py b/turing/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/turing/modeling/__pycache__/__init__.cpython-312.pyc b/turing/modeling/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c639786e44075a325e64356d683d24db8ed0a443
Binary files /dev/null and b/turing/modeling/__pycache__/__init__.cpython-312.pyc differ
diff --git a/turing/modeling/__pycache__/baseModel.cpython-312.pyc b/turing/modeling/__pycache__/baseModel.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80883da933239f0960209bb179084d4a46408082
Binary files /dev/null and b/turing/modeling/__pycache__/baseModel.cpython-312.pyc differ
diff --git a/turing/modeling/baseModel.py b/turing/modeling/baseModel.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9f0fde2c413d1052b0902c980a10a8274cd75c6
--- /dev/null
+++ b/turing/modeling/baseModel.py
@@ -0,0 +1,111 @@
+from abc import ABC, abstractmethod
+import os
+import shutil
+from typing import Any
+
+from loguru import logger
+import mlflow
+from numpy import ndarray
+
+
+class BaseModel(ABC):
+ """
+ Abstract base class for training models.
+ Subclasses should define the model and implement specific logic
+ for training, evaluation, and model persistence.
+ """
+
+ def __init__(self, language, path=None):
+ """
+ Initialize the trainer.
+
+ Args:
+ language (str): Language for the model.
+ path (str, optional): Path to load a pre-trained model. Defaults to None.
+ If None, a new model is initialized.
+ """
+
+ self.language = language
+ self.model = None
+ if path:
+ self.load(path)
+ else:
+ self.setup_model()
+
+ @abstractmethod
+ def setup_model(self):
+ """
+ Initialize or build the model.
+ Called in __init__ of subclass.
+ """
+ pass
+
+ @abstractmethod
+    def train(self, X_train, y_train) -> dict[str, Any]:
+        """
+        Main training logic for the model.
+
+        Args:
+            X_train: Input training data.
+            y_train: True labels for training data.
+
+        Returns:
+            dict[str, Any]: Training parameters to log (e.g., to MLflow).
+        """
+        pass
+
+ @abstractmethod
+    def evaluate(self, X_test, y_test) -> dict[str, Any]:
+        """
+        Evaluation logic for the model.
+
+        Args:
+            X_test: Input test data.
+            y_test: True labels for test data.
+
+        Returns:
+            dict[str, Any]: Evaluation metrics (e.g., accuracy, precision, recall, F1).
+        """
+        pass
+
+ @abstractmethod
+ def predict(self, X) -> ndarray:
+ """
+ Make predictions using the trained model.
+
+ Args:
+ X: Input data for prediction.
+
+ Returns:
+ Predictions made by the model.
+ """
+ pass
+
+ def save(self, path, model_name):
+ """
+ Save model and log to MLflow.
+
+ Args:
+ path (str): Path to save the model.
+ model_name (str): Name to use when saving the model (without extension).
+ """
+
+ if self.model is None:
+ raise ValueError("Model is not trained. Cannot save uninitialized model.")
+
+ complete_path = os.path.join(path, f"{model_name}_{self.language}")
+ if os.path.exists(complete_path) and os.path.isdir(complete_path):
+ shutil.rmtree(complete_path)
+ mlflow.sklearn.save_model(self.model, complete_path)
+
+ try:
+ mlflow.log_artifact(complete_path)
+ except Exception as e:
+ logger.error(f"Failed to log model to MLflow: {e}")
+
+ logger.info(f"Model saved to: {complete_path}")
+
+ def load(self, model_path):
+ """
+ Load model from specified local path or mlflow model URI.
+
+ Args:
+ model_path (str): Path to load the model from (local or mlflow URI).
+ """
+
+ self.model = mlflow.sklearn.load_model(model_path)
+ logger.info(f"Model loaded from: {model_path}")
+
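
The contract above is easiest to see with a toy subclass. The following is a hypothetical sketch (not part of the project) that predicts the most frequent label vector seen during training; it only illustrates which methods a concrete model must provide.

```python
# Hypothetical minimal subclass of BaseModel: a constant "majority vector" baseline.
from typing import Any

import numpy as np
from numpy import ndarray

from turing.modeling.baseModel import BaseModel


class MajorityBaseline(BaseModel):
    def setup_model(self):
        # Nothing to build up front; train() fills in the constant prediction.
        self.model = None

    def train(self, X_train, y_train) -> dict[str, Any]:
        y = np.asarray(y_train)
        rows, counts = np.unique(y, axis=0, return_counts=True)
        self.model = rows[counts.argmax()]  # most frequent multi-hot row
        return {"strategy": "majority"}

    def evaluate(self, X_test, y_test) -> dict[str, Any]:
        y_pred = self.predict(X_test)
        exact_match = float((y_pred == np.asarray(y_test)).all(axis=1).mean())
        return {"exact_match": exact_match}

    def predict(self, X) -> ndarray:
        # Repeat the stored label vector once per input sample.
        return np.tile(self.model, (len(X), 1))
```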
diff --git a/turing/modeling/model_selector.py b/turing/modeling/model_selector.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f83c97a5477c72c299631f1c29930672e693289
--- /dev/null
+++ b/turing/modeling/model_selector.py
@@ -0,0 +1,145 @@
+from typing import Optional
+
+from loguru import logger
+from mlflow.tracking import MlflowClient
+
+
+def get_best_model_by_tag(
+ language: str,
+ tag_key: str = "best_model",
+ metric: str = "f1_score"
+) -> Optional[dict]:
+ """
+ Retrieve the best model for a specific language using MLflow tags.
+
+ Args:
+ language: Programming language (java, python, pharo)
+ tag_key: Tag key to search for (default: "best_model")
+ metric: Metric to use for ordering (default: "f1_score")
+
+ Returns:
+        Dict with run_id, artifact and model_id of the best model, or None if not found
+ """
+
+ client = MlflowClient()
+ experiments = client.search_experiments()
+ if not experiments:
+ logger.error("No experiments found in MLflow")
+ return None
+
+ try:
+ runs = client.search_runs(
+ experiment_ids=[exp.experiment_id for exp in experiments],
+ filter_string=f"tags.{tag_key} = 'true' and tags.Language = '{language}'",
+ order_by=[f"metrics.{metric} DESC"],
+ max_results=1
+ )
+
+ if not runs:
+ logger.warning(f"No runs found with tag '{tag_key}' for language '{language}'")
+ return None
+
+ best_run = runs[0]
+ run_id = best_run.info.run_id
+ exp_name = client.get_experiment(best_run.info.experiment_id).name
+ run_name = best_run.info.run_name
+ artifact_name = best_run.data.tags.get("model_name")
+ model_id = best_run.data.tags.get("model_id")
+ logger.info(f"Found best model for {language}: {exp_name}/{run_name} ({run_id}), artifact={artifact_name}")
+
+ return {
+ "run_id": run_id,
+ "artifact": artifact_name,
+ "model_id": model_id
+ }
+
+ except Exception as e:
+ logger.error(f"Error searching for best model: {e}")
+ return None
+
+
+def get_best_model_info(
+ language: str,
+    fallback_registry: Optional[dict] = None
+) -> dict:
+ """
+ Retrieve the best model information for a language.
+ First searches by tag, then falls back to hardcoded registry.
+
+ Args:
+ language: Programming language
+ fallback_registry: Fallback registry with run_id and artifact
+
+ Returns:
+ Dict with run_id and artifact of the model
+ """
+
+ model_info = get_best_model_by_tag(language, "best_model")
+
+ if model_info:
+ logger.info(f"Using tagged best model for {language}")
+ return model_info
+
+ if fallback_registry and language in fallback_registry:
+ logger.warning(f"No tagged model found for {language}, using fallback registry")
+ return fallback_registry[language]
+
+ model_info = get_best_model_by_metric(language)
+
+ if model_info:
+ logger.warning(f"Using best model by metric for {language}")
+ return model_info
+
+ raise ValueError(f"No model found for language {language}")
+
+
+def get_best_model_by_metric(
+ language: str,
+ metric: str = "f1_score"
+) -> Optional[dict]:
+ """
+ Find the model with the best metric for a language.
+
+ Args:
+ language: Programming language
+ metric: Metric to use for ordering
+
+ Returns:
+ Dict with run_id and artifact of the model or None
+ """
+
+ client = MlflowClient()
+ experiments = client.search_experiments()
+ if not experiments:
+ logger.error("No experiments found in MLflow")
+ return None
+
+ try:
+ runs = client.search_runs(
+ experiment_ids=[exp.experiment_id for exp in experiments],
+ filter_string=f"tags.Language = '{language}'",
+ order_by=[f"metrics.{metric} DESC"],
+ max_results=1
+ )
+
+ if not runs:
+ logger.warning(f"No runs found for language '{language}'")
+ return None
+
+ best_run = runs[0]
+ run_id = best_run.info.run_id
+ exp_name = client.get_experiment(best_run.info.experiment_id).name
+ run_name = best_run.info.run_name
+ artifact_name = best_run.data.tags.get("model_name")
+ model_id = best_run.data.tags.get("model_id")
+ logger.info(f"Found best model for {language}: {exp_name}/{run_name} ({run_id}), artifact={artifact_name}")
+
+ return {
+ "run_id": run_id,
+ "artifact": artifact_name,
+ "model_id": model_id
+ }
+
+ except Exception as e:
+ logger.error(f"Error finding best model by metric: {e}")
+ return None
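
Typical usage of the selector, with placeholder values; this assumes an MLflow tracking URI is already configured and that runs carry the `Language`, `best_model`, and `model_name` tags the queries above expect.

```python
# Hypothetical call site for the model selector; the run ID below is a placeholder.
from turing.modeling.model_selector import get_best_model_info

FALLBACK_REGISTRY = {
    "java": {"run_id": "<run-id>", "artifact": "codeberta_java", "model_id": None},
}

info = get_best_model_info("java", fallback_registry=FALLBACK_REGISTRY)

# The returned pieces can be combined into an MLflow artifact URI, e.g.:
model_uri = f"runs:/{info['run_id']}/{info['artifact']}"
print(model_uri)
```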
diff --git a/turing/modeling/models/__init__.py b/turing/modeling/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fc7efe62befe30f25787a6adbb0ee796e167fe5
--- /dev/null
+++ b/turing/modeling/models/__init__.py
@@ -0,0 +1,15 @@
+"""
+Model classes for code comment classification.
+"""
+
+from turing.modeling.models.codeBerta import CodeBERTa
+from turing.modeling.models.graphCodeBert import GraphCodeBERTClassifier
+from turing.modeling.models.randomForestTfIdf import RandomForestTfIdf
+from turing.modeling.models.tinyBert import TinyBERTClassifier
+
+__all__ = [
+ "CodeBERTa",
+ "RandomForestTfIdf",
+ "TinyBERTClassifier",
+ "GraphCodeBERTClassifier",
+]
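
These exports make it straightforward to pick a model class by a short name at runtime; the mapping below is only an illustration, not something defined by the package.

```python
# Illustrative name-to-class lookup over the exported model classes.
from turing.modeling import models

MODEL_CLASSES = {
    "codeberta": models.CodeBERTa,
    "graphcodebert": models.GraphCodeBERTClassifier,
    "random_forest": models.RandomForestTfIdf,
    "tinybert": models.TinyBERTClassifier,
}

model = MODEL_CLASSES["random_forest"](language="java")
```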
diff --git a/turing/modeling/models/__pycache__/miniLM.cpython-312.pyc b/turing/modeling/models/__pycache__/miniLM.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f0ee2b529f8208b9a8597159087ccf2452ee16d
Binary files /dev/null and b/turing/modeling/models/__pycache__/miniLM.cpython-312.pyc differ
diff --git a/turing/modeling/models/__pycache__/miniLmWithClassificationHead.cpython-312.pyc b/turing/modeling/models/__pycache__/miniLmWithClassificationHead.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d95768e056e5a2372b5ff2dc24236212578cbc8
Binary files /dev/null and b/turing/modeling/models/__pycache__/miniLmWithClassificationHead.cpython-312.pyc differ
diff --git a/turing/modeling/models/__pycache__/randomForestTfIdf.cpython-312.pyc b/turing/modeling/models/__pycache__/randomForestTfIdf.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0f0602f2610e640ed25a906433f59c392845613
Binary files /dev/null and b/turing/modeling/models/__pycache__/randomForestTfIdf.cpython-312.pyc differ
diff --git a/turing/modeling/models/codeBerta.py b/turing/modeling/models/codeBerta.py
new file mode 100644
index 0000000000000000000000000000000000000000..d593503288765f11635627da12c01ae8b35e9461
--- /dev/null
+++ b/turing/modeling/models/codeBerta.py
@@ -0,0 +1,463 @@
+import os
+import shutil
+import warnings
+
+from loguru import logger
+import mlflow
+import numpy as np
+from numpy import ndarray
+from sklearn.metrics import (
+ accuracy_score,
+ classification_report,
+ f1_score,
+ precision_score,
+ recall_score,
+)
+import torch
+from torch.utils.data import Dataset
+from transformers import (
+ AutoModelForSequenceClassification,
+ AutoTokenizer,
+ EarlyStoppingCallback,
+ Trainer,
+ TrainingArguments,
+)
+
+from turing.config import MODELS_DIR
+
+from ..baseModel import BaseModel
+
+warnings.filterwarnings("ignore")
+
+
+def compute_metrics(eval_pred):
+ predictions, labels = eval_pred
+
+ # Sigmoid function to convert logits to probabilities
+ probs = 1 / (1 + np.exp(-predictions))
+
+ # Apply threshold of 0.5 (becomes 1 if > 0.5, otherwise 0)
+ preds = (probs > 0.5).astype(int)
+
+ # Calculate F1 score (macro average for multi-label)
+ f1 = f1_score(labels, preds, average='macro')
+ precision = precision_score(labels, preds, average='macro', zero_division=0)
+ recall = recall_score(labels, preds, average='macro', zero_division=0)
+
+ return {
+ 'f1': f1,
+ 'precision': precision,
+ 'recall': recall,
+ }
+
+
+
+class CodeBERTaDataset(Dataset):
+ """
+ Internal Dataset class for CodeBERTa.
+ """
+
+ def __init__(self, encodings, labels=None, num_labels=None):
+ """
+        Initialize the CodeBERTaDataset.
+ Args:
+ encodings (dict): Tokenized encodings.
+ labels (list or np.ndarray, optional): Corresponding labels.
+ num_labels (int, optional): Total number of classes. Required for auto-converting indices to one-hot.
+ """
+
+ self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}
+
+ if labels is not None:
+ if not isinstance(labels, (np.ndarray, torch.Tensor)):
+ labels = np.array(labels)
+
+ # Case A: labels are indices (integers)
+ if num_labels is not None and (len(labels.shape) == 1 or (len(labels.shape) == 2 and labels.shape[1] == 1)):
+ labels_flat = labels.flatten()
+
+ # Create one-hot encoded matrix
+ one_hot = np.zeros((len(labels_flat), num_labels), dtype=np.float32)
+
+ # Set the corresponding index to 1
+ valid_indices = labels_flat < num_labels
+ one_hot[valid_indices, labels_flat[valid_indices]] = 1.0
+
+ self.labels = torch.tensor(one_hot, dtype=torch.float)
+
+ # Case B: labels are already vectors (e.g., One-Hot or Multi-Hot)
+ else:
+ self.labels = torch.tensor(labels, dtype=torch.float)
+ else:
+ self.labels = None
+
+
+ def __getitem__(self, idx):
+ """
+ Retrieve item at index idx.
+
+ Args:
+ idx (int): Index of the item to retrieve.
+
+ Returns:
+ dict: Dictionary containing input_ids, attention_mask, and labels (if available).
+ """
+
+ item = {key: val[idx] for key, val in self.encodings.items()}
+ if self.labels is not None:
+ item['labels'] = self.labels[idx]
+ return item
+
+
+ def __len__(self):
+ """
+ Return the length of the dataset.
+
+ Returns:
+ int: Length of the dataset.
+ """
+
+ return len(self.encodings['input_ids'])
+
+
+
+class CodeBERTa(BaseModel):
+ """
+ HuggingFace implementation of BaseModel for Code Comment Classification.
+ Uses CodeBERTa-small-v1 for efficient inference.
+ """
+
+ def __init__(self, language, path=None):
+ """
+ Initialize the CodeBERTa model with configuration parameters.
+
+ Args:
+ language (str): Language for the model.
+ path (str, optional): Path to load a pre-trained model. Defaults to None.
+ """
+
+ self.params = {
+ "model_name_hf": "huggingface/CodeBERTa-small-v1",
+ "num_labels": 7 if language == "java" else 5 if language == "python" else 6,
+ "max_length": 128,
+ "epochs": 15,
+ "batch_size_train": 16,
+ "batch_size_eval": 64,
+ "learning_rate": 1e-5,
+ "weight_decay": 0.02,
+ "train_size": 0.8,
+ "early_stopping_patience": 3,
+ "early_stopping_threshold": 0.005
+ }
+
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ self.tokenizer = None
+
+ super().__init__(language, path)
+
+
+ def setup_model(self):
+ """
+ Initialize the CodeBERTa tokenizer and model.
+ """
+
+ logger.info(f"Initializing {self.params['model_name_hf']} on {self.device}...")
+
+ self.tokenizer = AutoTokenizer.from_pretrained(self.params["model_name_hf"])
+ self.model = AutoModelForSequenceClassification.from_pretrained(
+ self.params["model_name_hf"],
+ num_labels=self.params["num_labels"],
+ problem_type="multi_label_classification"
+ ).to(self.device)
+ logger.info("CodeBERTa model initialized.")
+
+
+ def _tokenize(self, texts):
+ """
+ Helper to tokenize list of texts efficiently.
+
+ Args:
+ texts (list): List of text strings to tokenize.
+
+ Returns:
+ dict: Tokenized encodings.
+ """
+
+ safe_texts = []
+ for t in texts:
+ if t is None:
+ safe_texts.append("")
+ elif isinstance(t, (int, float)):
+ if t != t: # NaN check
+ safe_texts.append("")
+ else:
+ safe_texts.append(str(t))
+ else:
+ safe_texts.append(str(t))
+
+ return self.tokenizer(
+ safe_texts,
+ truncation=True,
+ padding=True,
+ max_length=self.params["max_length"]
+ )
+
+
+ def train(self, X_train, y_train) -> dict[str,any]:
+ """
+ Train the model using HF Trainer and log to MLflow.
+
+ Args:
+ X_train (list): Training input texts.
+ y_train (list or np.ndarray): Training labels.
+
+ Returns:
+ dict[str, any]: Dictionary of parameters used for training.
+ """
+
+ if self.model is None:
+ raise ValueError("Model is not initialized. Call setup_model() before training.")
+
+ # log parameters to MLflow without model_name_hf
+ params_to_log = {k: v for k, v in self.params.items() if k != "model_name_hf" and k != "num_labels"}
+
+ logger.info(f"Starting training for: {self.language.upper()}")
+
+ # Prepare dataset (train/val split)
+ train_encodings = self._tokenize(X_train)
+ full_dataset = CodeBERTaDataset(train_encodings, y_train, num_labels=self.params["num_labels"])
+ train_size = int(self.params["train_size"] * len(full_dataset))
+ val_size = len(full_dataset) - train_size
+ train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_size])
+
+ temp_ckpt_dir = os.path.join(MODELS_DIR, "temp_checkpoints")
+
+ use_fp16 = torch.cuda.is_available()
+ if not use_fp16:
+ logger.info("Mixed Precision (fp16) disabled because CUDA is not available.")
+
+ training_args = TrainingArguments(
+ output_dir=temp_ckpt_dir,
+ num_train_epochs=self.params["epochs"],
+ per_device_train_batch_size=self.params["batch_size_train"],
+ per_device_eval_batch_size=self.params["batch_size_eval"],
+ learning_rate=self.params["learning_rate"],
+ weight_decay=self.params["weight_decay"],
+ eval_strategy="epoch",
+ save_strategy="epoch",
+ load_best_model_at_end=True,
+ metric_for_best_model="f1",
+ greater_is_better=True,
+ save_total_limit=2,
+ logging_dir='./logs',
+ logging_steps=50,
+ fp16=use_fp16,
+ optim="adamw_torch",
+ report_to="none",
+ no_cuda=not torch.cuda.is_available()
+ )
+
+ trainer = Trainer(
+ model=self.model,
+ args=training_args,
+ train_dataset=train_dataset,
+ eval_dataset=val_dataset,
+ compute_metrics=compute_metrics,
+ callbacks=[EarlyStoppingCallback(early_stopping_patience=self.params["early_stopping_patience"], early_stopping_threshold=self.params["early_stopping_threshold"])]
+ )
+ trainer.train()
+ logger.info(f"Training for {self.language.upper()} completed.")
+
+ if os.path.exists(temp_ckpt_dir):
+ shutil.rmtree(temp_ckpt_dir)
+
+ return params_to_log
+
+
+ def evaluate(self, X_test, y_test) -> dict[str,any]:
+ """
+ Evaluate model on test data, return metrics and log to MLflow.
+ Handles automatic conversion of y_test to match multi-label prediction shape.
+
+ Args:
+ X_test (list): Input test data.
+ y_test (list or np.ndarray): True labels for test data.
+
+ Returns:
+ dict[str, any]: Dictionary of evaluation metrics.
+ """
+
+ # Obtain predictions
+ y_pred = self.predict(X_test)
+
+ # Convert y_test to numpy array if needed
+ if not isinstance(y_test, (np.ndarray, torch.Tensor)):
+ y_test_np = np.array(y_test)
+ elif isinstance(y_test, torch.Tensor):
+ y_test_np = y_test.cpu().numpy()
+ else:
+ y_test_np = y_test
+
+ num_labels = self.params["num_labels"]
+ is_multilabel_pred = (y_pred.ndim == 2 and y_pred.shape[1] > 1)
+ is_flat_truth = (y_test_np.ndim == 1) or (y_test_np.ndim == 2 and y_test_np.shape[1] == 1)
+
+ if is_multilabel_pred and is_flat_truth:
+ # Create a zero matrix
+ y_test_expanded = np.zeros((y_test_np.shape[0], num_labels), dtype=int)
+
+ # Flatten y_test for iteration
+ indices = y_test_np.flatten()
+
+ # Use indices to set the correct column to 1
+ for i, label_idx in enumerate(indices):
+ idx = int(label_idx)
+ if 0 <= idx < num_labels:
+ y_test_expanded[i, idx] = 1
+
+ y_test_np = y_test_expanded
+
+ # Generate classification report
+ report = classification_report(y_test_np, y_pred, zero_division=0)
+ print("\n" + "=" * 50)
+ print("CLASSIFICATION REPORT")
+ print(report)
+ print("=" * 50 + "\n")
+
+        metrics = {
+            "accuracy": accuracy_score(y_test_np, y_pred),
+            "precision": precision_score(y_test_np, y_pred, average="macro", zero_division=0),
+            "recall": recall_score(y_test_np, y_pred, average="macro", zero_division=0),
+            "f1_score": f1_score(y_test_np, y_pred, average="macro", zero_division=0),
+        }
+
+ mlflow.log_metrics(metrics)
+
+ logger.info(
+ f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}"
+ )
+ return metrics
+
+
+ def predict(self, X) -> ndarray:
+ """
+ Make predictions for Multi-Label classification.
+ Returns Binary Matrix (Multi-Hot) where multiple classes can be 1.
+
+ Args:
+ X (list): Input texts for prediction.
+
+ Returns:
+ np.ndarray: Multi-Hot Encoded predictions (e.g., [[0, 1, 1, 0], ...])
+ """
+
+ if self.model is None:
+ raise ValueError("Model is not trained. Call train() or load() before prediction.")
+
+ # Set model to evaluation mode
+ self.model.eval()
+
+ encodings = self._tokenize(X)
+ # Pass None as labels because we are in inference
+ dataset = CodeBERTaDataset(encodings, labels=None)
+
+ use_fp16 = torch.cuda.is_available()
+
+ training_args = TrainingArguments(
+ output_dir="./pred_temp",
+ per_device_eval_batch_size=self.params["batch_size_eval"],
+ fp16=use_fp16,
+ report_to="none",
+ no_cuda=not torch.cuda.is_available()
+ )
+
+ trainer = Trainer(model=self.model, args=training_args)
+ output = trainer.predict(dataset)
+
+ # Clean up temporary prediction directory
+ if os.path.exists("./pred_temp"):
+ shutil.rmtree("./pred_temp")
+
+ # Convert logits to probabilities
+ logits = output.predictions
+ probs = 1 / (1 + np.exp(-logits))
+
+ # Apply a threshold of 0.5 (if prob > 0.5, predict 1 else 0)
+ preds_binary = (probs > 0.5).astype(int)
+
+ return preds_binary
+
+
+ def save(self, path, model_name):
+ """
+ Save model locally and log to MLflow as artifact.
+
+ Args:
+ path (str): Directory path to save the model.
+ model_name (str): Name for the saved model.
+ """
+
+ if self.model is None:
+ raise ValueError("Model is not trained. Cannot save uninitialized model.")
+
+ # Local Saving
+ complete_path = os.path.join(path, f"{model_name}_{self.language}")
+
+ # Remove existing directory if it exists
+ if os.path.exists(complete_path) and os.path.isdir(complete_path):
+ shutil.rmtree(complete_path)
+
+ # Save model and tokenizer
+ logger.info(f"Saving model to: {complete_path}")
+ self.model.save_pretrained(complete_path)
+ self.tokenizer.save_pretrained(complete_path)
+ logger.info("Model saved locally.")
+
+ try:
+ # Log to MLflow
+ logger.info("Logging artifacts to MLflow...")
+ mlflow.log_artifacts(local_dir=complete_path, artifact_path=f"{model_name}_{self.language}")
+ except Exception as e:
+ logger.error(f"Failed to log model artifacts to MLflow: {e}")
+
+
+ def load(self, model_path):
+ """
+ Load model from a local path OR an MLflow URI.
+
+ Args:
+ model_path (str): Local path or MLflow URI to load the model from.
+ """
+
+ logger.info(f"Loading model from: {model_path}")
+ local_model_path = model_path
+
+ # Downloading model from MLflow and saving to local path
+ if model_path.startswith("models:/") or model_path.startswith("runs:/"):
+ try:
+ logger.info("Detected MLflow model URI. Attempting to load from MLflow...")
+ local_model_path = os.path.join(MODELS_DIR, "mlflow_temp_models")
+ local_model_path = mlflow.artifacts.download_artifacts(artifact_uri=model_path, dst_path=local_model_path)
+ logger.info(f"Model downloaded from MLflow to: {local_model_path}")
+ except Exception as e:
+ logger.error(f"Failed to load from MLflow: {e}")
+ raise e
+
+ # Loading from local path
+ try:
+ if not os.path.exists(local_model_path):
+ raise FileNotFoundError(f"Model path not found: {local_model_path}")
+
+ # Load tokenizer and model from local path
+ self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
+ self.model = AutoModelForSequenceClassification.from_pretrained(
+ local_model_path
+ ).to(self.device)
+ logger.info("Model loaded from local path successfully.")
+
+ except Exception as e:
+ logger.error(f"Failed to load model from local path: {e}")
+ raise e
+
+ # Set model to evaluation mode
+ self.model.eval()
\ No newline at end of file
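
A hedged end-to-end sketch of how the class above is meant to be driven; the toy texts/labels and the run name are placeholders, and both a Hugging Face model download and a configured MLflow tracking URI are assumed.

```python
# Hypothetical usage of CodeBERTa: train, evaluate, and persist under an MLflow run.
import mlflow

from turing.config import MODELS_DIR
from turing.modeling.models.codeBerta import CodeBERTa

# Toy multi-hot labels for Java's 7 categories (placeholders, not real data).
train_texts = ["Returns the element at the given index.", "Deprecated, use fetch() instead."]
train_labels = [[1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0]]

model = CodeBERTa(language="java")

with mlflow.start_run(run_name="codeberta_java_demo"):
    params = model.train(train_texts, train_labels)      # returns the params it wants logged
    mlflow.log_params(params)
    metrics = model.evaluate(train_texts, train_labels)  # also calls mlflow.log_metrics()
    model.save(MODELS_DIR, "codeberta")                  # local save + artifact logging
```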
diff --git a/turing/modeling/models/graphCodeBert.py b/turing/modeling/models/graphCodeBert.py
new file mode 100644
index 0000000000000000000000000000000000000000..83ec9d1042c2c2dd9e7835e0576f3c09d051e61b
--- /dev/null
+++ b/turing/modeling/models/graphCodeBert.py
@@ -0,0 +1,469 @@
+import os
+import shutil
+import warnings
+
+from loguru import logger
+import mlflow
+import numpy as np
+from numpy import ndarray
+from sklearn.metrics import (
+ accuracy_score,
+ classification_report,
+ f1_score,
+ precision_score,
+ recall_score,
+)
+import torch
+from torch.utils.data import Dataset
+from transformers import (
+ AutoModelForSequenceClassification,
+ AutoTokenizer,
+ EarlyStoppingCallback,
+ Trainer,
+ TrainingArguments,
+)
+
+from turing.config import MODELS_DIR
+
+from ..baseModel import BaseModel
+
+warnings.filterwarnings("ignore")
+
+
+def compute_metrics(eval_pred):
+ predictions, labels = eval_pred
+
+ # Sigmoid function to convert logits to probabilities
+ probs = 1 / (1 + np.exp(-predictions))
+
+ # Apply threshold of 0.5 (becomes 1 if > 0.5, otherwise 0)
+ preds = (probs > 0.5).astype(int)
+
+ # Calculate F1 score (macro average for multi-label)
+ f1 = f1_score(labels, preds, average="macro")
+ precision = precision_score(labels, preds, average="macro", zero_division=0)
+ recall = recall_score(labels, preds, average="macro", zero_division=0)
+
+ return {
+ "f1": f1,
+ "precision": precision,
+ "recall": recall,
+ }
+
+
+class GraphCodeBERTDataset(Dataset):
+ """
+ Internal Dataset class for GraphCodeBERT.
+ """
+
+ def __init__(self, encodings, labels=None, num_labels=None):
+ """
+        Initialize the GraphCodeBERTDataset.
+ Args:
+ encodings (dict): Tokenized encodings.
+ labels (list or np.ndarray, optional): Corresponding labels.
+ num_labels (int, optional): Total number of classes. Required for auto-converting indices to one-hot.
+ """
+
+ self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}
+
+ if labels is not None:
+ if not isinstance(labels, (np.ndarray, torch.Tensor)):
+ labels = np.array(labels)
+
+ # Case A: labels are indices (integers)
+ if num_labels is not None and (
+ len(labels.shape) == 1 or (len(labels.shape) == 2 and labels.shape[1] == 1)
+ ):
+ labels_flat = labels.flatten()
+
+ # Create one-hot encoded matrix
+ one_hot = np.zeros((len(labels_flat), num_labels), dtype=np.float32)
+
+ # Set the corresponding index to 1
+ valid_indices = labels_flat < num_labels
+ one_hot[valid_indices, labels_flat[valid_indices]] = 1.0
+
+ self.labels = torch.tensor(one_hot, dtype=torch.float)
+
+ # Case B: labels are already vectors (e.g., One-Hot or Multi-Hot)
+ else:
+ self.labels = torch.tensor(labels, dtype=torch.float)
+ else:
+ self.labels = None
+
+ def __getitem__(self, idx):
+ """
+ Retrieve item at index idx.
+
+ Args:
+ idx (int): Index of the item to retrieve.
+
+ Returns:
+ dict: Dictionary containing input_ids, attention_mask, and labels (if available).
+ """
+
+ item = {key: val[idx] for key, val in self.encodings.items()}
+ if self.labels is not None:
+ item["labels"] = self.labels[idx]
+ return item
+
+ def __len__(self):
+ """
+ Return the length of the dataset.
+
+ Returns:
+ int: Length of the dataset.
+ """
+
+ return len(self.encodings["input_ids"])
+
+
+class GraphCodeBERTClassifier(BaseModel):
+ """
+ HuggingFace implementation of BaseModel for Code Comment Classification.
+ Uses GraphCodeBERT (microsoft/graphcodebert-base) for code understanding via data flow graphs.
+ """
+
+ def __init__(self, language, path=None):
+ """
+ Initialize the GraphCodeBERT model with configuration parameters.
+
+ Args:
+ language (str): Language for the model.
+ path (str, optional): Path to load a pre-trained model. Defaults to None.
+ """
+
+ self.params = {
+ "model_name_hf": "microsoft/graphcodebert-base",
+ "num_labels": 7 if language == "java" else 5 if language == "python" else 6,
+ "max_length": 256,
+ "epochs": 15,
+ "batch_size_train": 16,
+ "batch_size_eval": 64,
+ "learning_rate": 2e-5,
+ "weight_decay": 0.01,
+ "train_size": 0.8,
+ "early_stopping_patience": 3,
+ "early_stopping_threshold": 0.0,
+ "warmup_steps": 500,
+ "seed": 42,
+ }
+
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ self.tokenizer = None
+
+ super().__init__(language, path)
+
+ def setup_model(self):
+ """
+ Initialize the GraphCodeBERT tokenizer and model.
+ """
+
+ logger.info(f"Initializing {self.params['model_name_hf']} on {self.device}...")
+
+ self.tokenizer = AutoTokenizer.from_pretrained(self.params["model_name_hf"])
+ self.model = AutoModelForSequenceClassification.from_pretrained(
+ self.params["model_name_hf"],
+ num_labels=self.params["num_labels"],
+ problem_type="multi_label_classification",
+ use_safetensors=True, # Force use of safetensors for security
+ ).to(self.device)
+ logger.info("GraphCodeBERT model initialized.")
+
+ def _tokenize(self, texts):
+ """
+ Helper to tokenize list of texts efficiently.
+
+ Args:
+ texts (list): List of text strings to tokenize.
+
+ Returns:
+ dict: Tokenized encodings.
+ """
+
+ safe_texts = []
+ for t in texts:
+ if t is None:
+ safe_texts.append("")
+ elif isinstance(t, (int, float)):
+ if t != t: # NaN check
+ safe_texts.append("")
+ else:
+ safe_texts.append(str(t))
+ else:
+ safe_texts.append(str(t))
+
+ return self.tokenizer(
+ safe_texts, truncation=True, padding=True, max_length=self.params["max_length"]
+ )
+
+ def train(self, X_train, y_train) -> dict[str, any]:
+ """
+ Train the model using HF Trainer and log to MLflow.
+
+ Args:
+ X_train (list): Training input texts.
+ y_train (list or np.ndarray): Training labels.
+
+ Returns:
+ dict[str, any]: Dictionary of parameters used for training.
+ """
+
+ if self.model is None:
+ raise ValueError("Model is not initialized. Call setup_model() before training.")
+
+ # log parameters to MLflow without model_name_hf
+ params_to_log = {
+ k: v for k, v in self.params.items() if k != "model_name_hf" and k != "num_labels"
+ }
+
+ logger.info(f"Starting training for: {self.language.upper()}")
+
+ # Prepare dataset (train/val split)
+ train_encodings = self._tokenize(X_train)
+ full_dataset = GraphCodeBERTDataset(
+ train_encodings, y_train, num_labels=self.params["num_labels"]
+ )
+ train_size = int(self.params["train_size"] * len(full_dataset))
+ val_size = len(full_dataset) - train_size
+ train_dataset, val_dataset = torch.utils.data.random_split(
+ full_dataset, [train_size, val_size]
+ )
+
+ temp_ckpt_dir = os.path.join(MODELS_DIR, "temp_checkpoints")
+
+ use_fp16 = torch.cuda.is_available()
+ if not use_fp16:
+ logger.info("Mixed Precision (fp16) disabled because CUDA is not available.")
+
+ training_args = TrainingArguments(
+ output_dir=temp_ckpt_dir,
+ num_train_epochs=self.params["epochs"],
+ per_device_train_batch_size=self.params["batch_size_train"],
+ per_device_eval_batch_size=self.params["batch_size_eval"],
+ learning_rate=self.params["learning_rate"],
+ weight_decay=self.params["weight_decay"],
+ eval_strategy="epoch",
+ save_strategy="epoch",
+ load_best_model_at_end=True,
+ metric_for_best_model="f1",
+ greater_is_better=True,
+ save_total_limit=2,
+ logging_dir="./logs",
+ logging_steps=50,
+ fp16=use_fp16,
+ optim="adamw_torch",
+ report_to="none",
+ no_cuda=not torch.cuda.is_available(),
+ )
+
+ trainer = Trainer(
+ model=self.model,
+ args=training_args,
+ train_dataset=train_dataset,
+ eval_dataset=val_dataset,
+ compute_metrics=compute_metrics,
+ callbacks=[
+ EarlyStoppingCallback(
+ early_stopping_patience=self.params["early_stopping_patience"],
+ early_stopping_threshold=self.params["early_stopping_threshold"],
+ )
+ ],
+ )
+ trainer.train()
+ logger.info(f"Training for {self.language.upper()} completed.")
+
+ if os.path.exists(temp_ckpt_dir):
+ shutil.rmtree(temp_ckpt_dir)
+
+ return params_to_log
+
+ def evaluate(self, X_test, y_test) -> dict[str, any]:
+ """
+ Evaluate model on test data, return metrics and log to MLflow.
+ Handles automatic conversion of y_test to match multi-label prediction shape.
+
+ Args:
+ X_test (list): Input test data.
+ y_test (list or np.ndarray): True labels for test data.
+
+ Returns:
+ dict[str, any]: Dictionary of evaluation metrics.
+ """
+
+ # Obtain predictions
+ y_pred = self.predict(X_test)
+
+ # Convert y_test to numpy array if needed
+ if not isinstance(y_test, (np.ndarray, torch.Tensor)):
+ y_test_np = np.array(y_test)
+ elif isinstance(y_test, torch.Tensor):
+ y_test_np = y_test.cpu().numpy()
+ else:
+ y_test_np = y_test
+
+ num_labels = self.params["num_labels"]
+ is_multilabel_pred = y_pred.ndim == 2 and y_pred.shape[1] > 1
+ is_flat_truth = (y_test_np.ndim == 1) or (y_test_np.ndim == 2 and y_test_np.shape[1] == 1)
+
+ if is_multilabel_pred and is_flat_truth:
+ # Create a zero matrix
+ y_test_expanded = np.zeros((y_test_np.shape[0], num_labels), dtype=int)
+
+ # Flatten y_test for iteration
+ indices = y_test_np.flatten()
+
+ # Use indices to set the correct column to 1
+ for i, label_idx in enumerate(indices):
+ idx = int(label_idx)
+ if 0 <= idx < num_labels:
+ y_test_expanded[i, idx] = 1
+
+ y_test_np = y_test_expanded
+
+ # Generate classification report
+ report = classification_report(y_test_np, y_pred, zero_division=0)
+ print("\n" + "=" * 50)
+ print("CLASSIFICATION REPORT")
+ print(report)
+ print("=" * 50 + "\n")
+
+ metrics = {
+ "accuracy": accuracy_score(y_test_np, y_pred),
+ "precision": precision_score(y_test_np, y_pred, average="macro", zero_division=0),
+ "recall": recall_score(y_test_np, y_pred, average="macro", zero_division=0),
+ "f1_score": f1_score(y_test_np, y_pred, average="macro", zero_division=0),
+ }
+
+ mlflow.log_metrics(metrics)
+
+ logger.info(
+ f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}"
+ )
+ return metrics
+
+ def predict(self, X) -> ndarray:
+ """
+ Make predictions for Multi-Label classification.
+ Returns Binary Matrix (Multi-Hot) where multiple classes can be 1.
+
+ Args:
+ X (list): Input texts for prediction.
+
+ Returns:
+ np.ndarray: Multi-Hot Encoded predictions (e.g., [[0, 1, 1, 0], ...])
+ """
+
+ if self.model is None:
+ raise ValueError("Model is not trained. Call train() or load() before prediction.")
+
+ # Set model to evaluation mode
+ self.model.eval()
+
+ encodings = self._tokenize(X)
+ # Pass None as labels because we are in inference
+ dataset = GraphCodeBERTDataset(encodings, labels=None)
+
+ use_fp16 = torch.cuda.is_available()
+
+ training_args = TrainingArguments(
+ output_dir="./pred_temp",
+ per_device_eval_batch_size=self.params["batch_size_eval"],
+ fp16=use_fp16,
+ report_to="none",
+ no_cuda=not torch.cuda.is_available(),
+ )
+
+ trainer = Trainer(model=self.model, args=training_args)
+ output = trainer.predict(dataset)
+
+ # Clean up temporary prediction directory
+ if os.path.exists("./pred_temp"):
+ shutil.rmtree("./pred_temp")
+
+ # Convert logits to probabilities
+ logits = output.predictions
+ probs = 1 / (1 + np.exp(-logits))
+
+ # Apply a threshold of 0.5 (if prob > 0.5, predict 1 else 0)
+ preds_binary = (probs > 0.5).astype(int)
+
+ return preds_binary
+
+ def save(self, path, model_name):
+ """
+ Save model locally and log to MLflow as artifact.
+
+ Args:
+ path (str): Directory path to save the model.
+ model_name (str): Name for the saved model.
+ """
+
+ if self.model is None:
+ raise ValueError("Model is not trained. Cannot save uninitialized model.")
+
+ # Local Saving
+ complete_path = os.path.join(path, f"{model_name}_{self.language}")
+
+ # Remove existing directory if it exists
+ if os.path.exists(complete_path) and os.path.isdir(complete_path):
+ shutil.rmtree(complete_path)
+
+ # Save model and tokenizer
+ logger.info(f"Saving model to: {complete_path}")
+ self.model.save_pretrained(complete_path)
+ self.tokenizer.save_pretrained(complete_path)
+ logger.info("Model saved locally.")
+
+ try:
+ # Log to MLflow
+ logger.info("Logging artifacts to MLflow...")
+ mlflow.log_artifacts(
+ local_dir=complete_path, artifact_path=f"{model_name}_{self.language}"
+ )
+ except Exception as e:
+ logger.error(f"Failed to log model artifacts to MLflow: {e}")
+
+ def load(self, model_path):
+ """
+ Load model from a local path OR an MLflow URI.
+
+ Args:
+ model_path (str): Local path or MLflow URI to load the model from.
+ """
+
+ logger.info(f"Loading model from: {model_path}")
+ local_model_path = model_path
+
+ # Downloading model from MLflow and saving to local path
+ if model_path.startswith("models:/") or model_path.startswith("runs:/"):
+ try:
+ logger.info("Detected MLflow model URI. Attempting to load from MLflow...")
+ local_model_path = os.path.join(MODELS_DIR, "mlflow_temp_models")
+ local_model_path = mlflow.artifacts.download_artifacts(
+ artifact_uri=model_path, dst_path=local_model_path
+ )
+ logger.info(f"Model downloaded from MLflow to: {local_model_path}")
+ except Exception as e:
+ logger.error(f"Failed to load from MLflow: {e}")
+ raise e
+
+ # Loading from local path
+ try:
+ if not os.path.exists(local_model_path):
+ raise FileNotFoundError(f"Model path not found: {local_model_path}")
+
+ # Load tokenizer and model from local path
+ self.tokenizer = AutoTokenizer.from_pretrained(local_model_path)
+ self.model = AutoModelForSequenceClassification.from_pretrained(local_model_path).to(
+ self.device
+ )
+ logger.info("Model loaded from local path successfully.")
+
+ except Exception as e:
+ logger.error(f"Failed to load model from local path: {e}")
+ raise e
+
+ # Set model to evaluation mode
+ self.model.eval()
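
Both Dataset classes accept either integer class indices or ready multi-hot vectors, and prediction applies a sigmoid with a 0.5 threshold. A small numpy sketch of those two steps (values are arbitrary):

```python
# Label handling and thresholding as plain numpy, mirroring the logic above.
import numpy as np

num_labels = 4

# Case A: integer class indices are expanded to one-hot rows.
indices = np.array([2, 0, 3])
one_hot = np.zeros((len(indices), num_labels), dtype=np.float32)
one_hot[np.arange(len(indices)), indices] = 1.0  # row i gets a 1 at column indices[i]

# Case B: multi-hot vectors pass through unchanged.

# Inference: logits -> sigmoid probabilities -> binary multi-hot predictions.
logits = np.array([[2.0, -1.0, 0.3, -3.0]])
probs = 1 / (1 + np.exp(-logits))
preds = (probs > 0.5).astype(int)
print(preds)  # [[1 0 1 0]]
```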
diff --git a/turing/modeling/models/randomForestTfIdf.py b/turing/modeling/models/randomForestTfIdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e785f7e0c24f4bf7e01e292d06d0a47c59d92f6
--- /dev/null
+++ b/turing/modeling/models/randomForestTfIdf.py
@@ -0,0 +1,153 @@
+import warnings
+
+from loguru import logger
+from numpy import ndarray
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics import (
+ accuracy_score,
+ classification_report,
+ f1_score,
+ precision_score,
+ recall_score,
+)
+from sklearn.model_selection import GridSearchCV
+from sklearn.multioutput import MultiOutputClassifier
+from sklearn.pipeline import Pipeline
+
+from ..baseModel import BaseModel
+
+warnings.filterwarnings("ignore")
+
+
+class RandomForestTfIdf(BaseModel):
+ """
+ Sklearn implementation of BaseModel with integrated Grid Search.
+ Builds a TF-IDF + RandomForest pipeline for multi-output text classification.
+ """
+
+ def __init__(self, language, path=None):
+ """
+ Initialize the RandomForestTfIdf model with configuration parameters.
+
+ Args:
+ language (str): Language for the model.
+ path (str, optional): Path to load a pre-trained model. Defaults to None.
+ If None, a new model is initialized.
+ """
+
+ self.params = {"stop_words": "english", "random_state": 42, "cv_folds": 5}
+
+ self.grid_params = {
+ "clf__estimator__n_estimators": [50, 100, 200],
+ "clf__estimator__max_depth": [None, 10, 20],
+ "tfidf__max_features": [3000, 5000, 8000],
+ }
+
+ super().__init__(language, path)
+
+ def setup_model(self):
+ """
+ Initialize the scikit-learn pipeline with TF-IDF vectorizer and RandomForest classifier.
+ """
+
+ base_estimator = RandomForestClassifier(
+ random_state=self.params["random_state"], n_jobs=-1
+ )
+
+ self.pipeline = Pipeline(
+ [
+ (
+ "tfidf",
+ TfidfVectorizer(ngram_range=(1, 2), stop_words=self.params["stop_words"]),
+ ),
+ ("clf", MultiOutputClassifier(base_estimator, n_jobs=-1)),
+ ]
+ )
+
+ self.model = self.pipeline
+ logger.info("Scikit-learn pipeline initialized.")
+
+ def train(self, X_train, y_train) -> dict[str, Any]:
+ """
+ Train the model using Grid Search to find the best hyperparameters.
+
+ Args:
+ X_train: Input training data.
+ y_train: True labels for training data.
+
+ Returns:
+ dict: Best hyperparameters found by the grid search.
+ """
+
+ if self.model is None:
+ raise ValueError(
+ "Model pipeline is not initialized. Call setup_model() before training."
+ )
+
+ logger.info(f"Starting training for: {self.language.upper()}")
+ logger.info("Performing Grid Search for best hyperparameters...")
+ grid_search = GridSearchCV(
+ self.pipeline,
+ param_grid=self.grid_params,
+ cv=self.params["cv_folds"],
+ scoring="f1_weighted",
+ n_jobs=-1,
+ verbose=1,
+ )
+ grid_search.fit(X_train, y_train)
+
+ logger.success(f"Best params found: {grid_search.best_params_}")
+
+ parameters_to_log = {
+ "max_features": grid_search.best_params_["tfidf__max_features"],
+ "n_estimators": grid_search.best_params_["clf__estimator__n_estimators"],
+ "max_depth": grid_search.best_params_["clf__estimator__max_depth"],
+ }
+
+ self.model = grid_search.best_estimator_
+ logger.success(f"Training for {self.language.upper()} completed.")
+
+ return parameters_to_log
+
+ def evaluate(self, X_test, y_test) -> dict[str, Any]:
+ """
+ Evaluate model on test data and return metrics.
+
+ Args:
+ X_test: Input test data.
+ y_test: True labels for test data.
+
+ Returns:
+ dict: Accuracy, precision, recall and F1 score on the test set.
+ """
+
+ y_pred = self.predict(X_test)
+
+ report = classification_report(y_test, y_pred, zero_division=0)
+ print("\n" + "=" * 50)
+ print("CLASSIFICATION REPORT")
+ print(report)
+ print("=" * 50 + "\n")
+
+ metrics = {
+ "accuracy": accuracy_score(y_test, y_pred),
+ "precision": precision_score(y_test, y_pred, average="macro", zero_division=0),
+ "recall": recall_score(y_test, y_pred, average="macro", zero_division=0),
+ "f1_score": f1_score(y_test, y_pred, average="weighted"),
+ }
+
+ logger.info(
+ f"Evaluation completed — Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}"
+ )
+ return metrics
+
+ def predict(self, X) -> ndarray:
+ """
+ Make predictions using the trained model.
+
+ Args:
+ X: Input data for prediction.
+
+ Returns:
+ Predictions made by the model.
+ """
+
+ if self.model is None:
+ raise ValueError("Model is not trained. Call train() or load() before prediction.")
+
+ return self.model.predict(X)
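+
+
+# Minimal usage sketch (illustrative only; assumes X_train is a list of comment strings
+# and y_train a multi-hot label matrix):
+#
+# model = RandomForestTfIdf(language="java")
+# best_params = model.train(X_train, y_train) # grid search over TF-IDF and forest params
+# metrics = model.evaluate(X_test, y_test)
+# preds = model.predict(["/** Returns the user ID. */"])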
diff --git a/turing/modeling/models/tinyBert.py b/turing/modeling/models/tinyBert.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d76eb4dddf2e767bae4a5cced7c97328580b6da
--- /dev/null
+++ b/turing/modeling/models/tinyBert.py
@@ -0,0 +1,441 @@
+"""
+Ultra-lightweight multi-label text classification model for code comment analysis.
+
+This module implements a specialized neural architecture combining TinyBERT
+(a heavily compressed BERT variant, ~15MB) with a custom multi-label classification head.
+Designed for efficient inference on resource-constrained environments while
+maintaining competitive performance on code comment classification tasks.
+
+Architecture:
+ - Encoder: TinyBERT (prajjwal1/bert-tiny)
+ - Hidden dimension: taken from the encoder config (hidden_size)
+ - Classification layers: hidden_size -> 128 (ReLU) -> num_labels (Sigmoid)
+ - Regularization: Dropout(0.2) for preventing overfitting
+ - Loss function: Binary Cross-Entropy for multi-label classification
+
+Performance characteristics:
+ - Model size: ~15MB
+ - Inference latency: ~50ms per sample
+ - Memory footprint: ~200MB during training
+ - Supports multi-label outputs via sigmoid activation
+"""
+
+from typing import List
+
+from loguru import logger
+import numpy as np
+from sklearn.preprocessing import MultiLabelBinarizer
+import torch
+from torch import nn
+from torch.optim import Adam
+
+import turing.config as config
+from turing.modeling.baseModel import BaseModel
+
+try:
+ from transformers import AutoModel, AutoTokenizer
+except ImportError:
+ logger.error("transformers library required. Install with: pip install transformers torch")
+
+
+class TinyBERTClassifier(BaseModel):
+ """
+ Ultra-lightweight multi-label classifier for code comment analysis.
+
+ Combines TinyBERT encoder with a custom classification head optimized for
+ multi-label code comment classification across Java, Python, and Pharo.
+
+ Attributes:
+ device (torch.device): Computation device (CPU/GPU).
+ model (nn.ModuleDict): Container for encoder and classifier components.
+ tokenizer (AutoTokenizer): Hugging Face tokenizer for text preprocessing.
+ classifier (nn.Sequential): Custom multi-label classification head.
+ num_labels (int): Number of output classes per language.
+ labels_map (list): Mapping of label indices to semantic categories.
+
+ References:
+ TinyBERT: https://huggingface.co/prajjwal1/bert-tiny
+ """
+
+ def __init__(self, language: str, path: str = None):
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ logger.info(f"TinyBERT using device: {self.device}")
+ self.model = None
+ self.tokenizer = None
+ self.classifier = None
+ self.mlb = MultiLabelBinarizer()
+ self.labels_map = config.LABELS_MAP.get(language, [])
+ self.num_labels = len(self.labels_map)
+ self.params = {
+ "model": "TinyBERT",
+ "model_size": "15MB",
+ "epochs": 15,
+ "batch_size": 8,
+ "learning_rate": 1e-3,
+ }
+ super().__init__(language=language, path=path)
+
+ def setup_model(self):
+ """
+ Initialize TinyBERT encoder and custom classification head.
+
+ Loads the pre-trained TinyBERT model from Hugging Face model hub and
+ constructs a custom multi-label classification head with:
+ - Input: 312-dimensional encoder embeddings [CLS] token
+ - Hidden layer: 128 units with ReLU activation
+ - Dropout: 0.2 for regularization
+ - Output: num_labels units with Sigmoid activation
+
+ Raises:
+ Exception: If model initialization fails due to network or missing dependencies.
+ """
+ self._initialize_model()
+
+ def _initialize_model(self):
+ """
+ Initialize TinyBERT encoder and custom classification head.
+
+ Loads the pre-trained TinyBERT model from Hugging Face model hub and
+ constructs a custom multi-label classification head with:
+ - Input: encoder [CLS] embedding (hidden_size read from the model config)
+ - Hidden layer: 128 units with ReLU activation
+ - Dropout: 0.2 for regularization
+ - Output: num_labels units with Sigmoid activation
+
+ Raises:
+ Exception: If model initialization fails due to network or missing dependencies.
+ """
+ try:
+ model_name = "prajjwal1/bert-tiny"
+
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+ encoder = AutoModel.from_pretrained(model_name)
+ encoder.to(self.device)
+
+ hidden_dim = encoder.config.hidden_size
+
+ self.classifier = nn.Sequential(
+ nn.Linear(hidden_dim, 128),
+ nn.ReLU(),
+ nn.Dropout(0.2),
+ nn.Linear(128, self.num_labels),
+ nn.Sigmoid(),
+ ).to(self.device)
+
+ self.model = nn.ModuleDict({"encoder": encoder, "classifier": self.classifier})
+
+ logger.success(f"Initialized TinyBERTClassifier for {self.language}")
+ logger.info(f"Model size: ~15MB | Labels: {self.num_labels}")
+
+ except Exception as e:
+ logger.error(f"Error initializing model: {e}")
+ raise
+
+ def train(
+ self,
+ X_train: List[str],
+ y_train: np.ndarray,
+ path: str = None,
+ model_name: str = "tinybert_classifier",
+ epochs: int = 15,
+ batch_size: int = 8,
+ learning_rate: float = 1e-3,
+ ) -> dict:
+ """
+ Train the classifier using binary cross-entropy loss.
+
+ Optimizes the classification head with the Adam optimizer while keeping the encoder
+ frozen. Optionally saves the trained model when a checkpoint path is provided.
+
+ Args:
+ X_train (List[str]): Training text samples (code comments).
+ y_train (np.ndarray): Binary label matrix of shape (n_samples, n_labels).
+ path (str, optional): Directory path for model checkpoint saving.
+ model_name (str): Identifier for saved model artifacts.
+ epochs (int): Number of complete training iterations. Default: 15.
+ batch_size (int): Number of samples per gradient update. Default: 8.
+ learning_rate (float): Adam optimizer learning rate. Default: 1e-3.
+
+ Returns:
+ dict: Training configuration including hyperparameters and model metadata.
+
+ Raises:
+ Exception: If training fails due to data inconsistency or resource exhaustion.
+ """
+ try:
+ if self.model is None:
+ self._initialize_model()
+
+ optimizer = Adam(self.classifier.parameters(), lr=learning_rate)
+ criterion = nn.BCELoss()
+
+ num_samples = len(X_train)
+ num_batches = (num_samples + batch_size - 1) // batch_size
+
+ logger.info(f"Starting training: {epochs} epochs, {num_batches} batches per epoch")
+
+ for epoch in range(epochs):
+ total_loss = 0.0
+
+ for batch_idx in range(num_batches):
+ start_idx = batch_idx * batch_size
+ end_idx = min(start_idx + batch_size, num_samples)
+
+ batch_texts = X_train[start_idx:end_idx]
+ batch_labels = y_train[start_idx:end_idx]
+
+ optimizer.zero_grad()
+
+ tokens = self.tokenizer(
+ batch_texts,
+ padding=True,
+ truncation=True,
+ max_length=128,
+ return_tensors="pt",
+ ).to(self.device)
+
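+ # Note: the encoder runs under no_grad, so only the classification head receives gradients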
+ with torch.no_grad():
+ encoder_output = self.model["encoder"](**tokens)
+ cls_token = encoder_output.last_hidden_state[:, 0, :]
+
+ logits = self.classifier(cls_token)
+
+ labels_tensor = torch.tensor(batch_labels, dtype=torch.float32).to(self.device)
+ loss = criterion(logits, labels_tensor)
+
+ loss.backward()
+ optimizer.step()
+
+ total_loss += loss.item()
+
+ avg_loss = total_loss / num_batches
+ logger.info(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f}")
+
+ logger.success(f"Training completed for {self.language}")
+
+ if path:
+ self.save(path, model_name)
+
+ return {
+ "epochs": epochs,
+ "batch_size": batch_size,
+ "learning_rate": learning_rate,
+ "model_size_mb": 15,
+ }
+
+ except Exception as e:
+ logger.error(f"Error training model: {e}")
+ raise
+
+ def predict(self, texts: List[str], threshold: float = 0.3) -> np.ndarray:
+ """
+ Generate multi-label predictions for code comments.
+
+ Performs inference in evaluation mode without gradient computation.
+ Applies probability threshold to convert sigmoid outputs to binary labels.
+
+ Args:
+ texts (List[str]): Code comment samples for classification.
+ threshold (float): Decision boundary for label assignment. Default: 0.3.
+ Values below threshold are mapped to 0, above to 1.
+
+ Returns:
+ np.ndarray: Binary predictions matrix of shape (n_samples, n_labels).
+
+ Raises:
+ ValueError: If model is not initialized.
+ Exception: If inference fails due to incompatible input dimensions.
+ """
+ if self.model is None:
+ raise ValueError("Model not initialized. Train or load a model first.")
+
+ self.model.eval()
+ predictions = []
+
+ # Convert various types to list: pandas Series, Dataset Column, etc.
+ if hasattr(texts, "tolist"):
+ texts = texts.tolist()
+ elif hasattr(texts, "__iter__") and not isinstance(texts, list):
+ texts = list(texts)
+
+ try:
+ with torch.no_grad():
+ tokens = self.tokenizer(
+ texts, padding=True, truncation=True, max_length=128, return_tensors="pt"
+ ).to(self.device)
+
+ encoder_output = self.model["encoder"](**tokens)
+ cls_token = encoder_output.last_hidden_state[:, 0, :]
+
+ logits = self.classifier(cls_token)
+ probabilities = logits.cpu().numpy()
+
+ predictions = (probabilities > threshold).astype(int)
+
+ return predictions
+
+ except Exception as e:
+ logger.error(f"Error during prediction: {e}")
+ raise
+
+ def evaluate(self, X_test: List[str], y_test: np.ndarray) -> dict:
+ """
+ Evaluate classification performance on test set.
+
+ Computes per-label and macro-averaged metrics:
+ - Precision: TP / (TP + FP) - correctness of positive predictions
+ - Recall: TP / (TP + FN) - coverage of actual positive instances
+ - F1-Score: 2 * (P * R) / (P + R) - harmonic mean of precision and recall
+ - Accuracy: mean element-wise agreement between predicted and true labels
+
+ Args:
+ X_test (List[str]): Test text samples for evaluation.
+ y_test (np.ndarray): Ground truth binary label matrix or indices.
+
+ Returns:
+ dict: Evaluation metrics including f1_score, precision, recall, accuracy.
+
+ Raises:
+ Exception: If evaluation fails due to prediction errors.
+ """
+ try:
+ predictions = self.predict(X_test)
+
+ # Convert y_test to numpy array if needed
+ if not isinstance(y_test, (np.ndarray, torch.Tensor)):
+ y_test_np = np.array(y_test)
+ elif isinstance(y_test, torch.Tensor):
+ y_test_np = y_test.cpu().numpy()
+ else:
+ y_test_np = y_test
+
+ # Handle conversion from flat indices to multi-hot encoding if needed
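+ # (e.g., with num_labels=3, class indices [2, 0] become [[0, 0, 1], [1, 0, 0]])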
+ is_multilabel_pred = predictions.ndim == 2 and predictions.shape[1] > 1
+ is_flat_truth = (y_test_np.ndim == 1) or (
+ y_test_np.ndim == 2 and y_test_np.shape[1] == 1
+ )
+
+ if is_multilabel_pred and is_flat_truth:
+ # Create zero matrix for multi-hot encoding
+ y_test_expanded = np.zeros((y_test_np.shape[0], self.num_labels), dtype=int)
+ indices = y_test_np.flatten()
+
+ # Set columns to 1 based on indices
+ for i, label_idx in enumerate(indices):
+ idx = int(label_idx)
+ if 0 <= idx < self.num_labels:
+ y_test_expanded[i, idx] = 1
+
+ y_test_np = y_test_expanded
+
+ tp = np.sum((predictions == 1) & (y_test_np == 1), axis=0)
+ fp = np.sum((predictions == 1) & (y_test_np == 0), axis=0)
+ fn = np.sum((predictions == 0) & (y_test_np == 1), axis=0)
+
+ precision_per_label = tp / (tp + fp + 1e-10)
+ recall_per_label = tp / (tp + fn + 1e-10)
+ f1_per_label = (
+ 2
+ * (precision_per_label * recall_per_label)
+ / (precision_per_label + recall_per_label + 1e-10)
+ )
+
+ metrics = {
+ "f1_score": float(np.mean(f1_per_label)),
+ "precision": float(np.mean(precision_per_label)),
+ "recall": float(np.mean(recall_per_label)),
+ "accuracy": float(np.mean(predictions == y_test_np)),
+ }
+
+ logger.info(f"Evaluation metrics: {metrics}")
+ return metrics
+
+ except Exception as e:
+ logger.error(f"Error evaluating model: {e}")
+ raise
+
+ def save(self, path: str, model_name: str = "tinybert_classifier"):
+ """
+ Persist model artifacts including weights, tokenizer, and configuration.
+
+ Saves the following components:
+ - classifier.pt: PyTorch state dictionary of classification head
+ - tokenizer configuration: Hugging Face tokenizer files
+ - config.json: Model metadata and label mappings
+
+ Args:
+ path (str): Parent directory for model checkpoint storage.
+ model_name (str): Model identifier used as subdirectory name.
+
+ Raises:
+ Exception: If file I/O or serialization fails.
+ """
+ try:
+ import os
+
+ model_path = os.path.join(path, model_name)
+ os.makedirs(model_path, exist_ok=True)
+
+ if self.classifier:
+ torch.save(self.classifier.state_dict(), os.path.join(model_path, "classifier.pt"))
+
+ if self.tokenizer:
+ self.tokenizer.save_pretrained(model_path)
+
+ config_data = {
+ "language": self.language,
+ "num_labels": self.num_labels,
+ "labels_map": self.labels_map,
+ "model_type": "tinybert_classifier",
+ "model_name": model_name,
+ }
+
+ import json
+
+ with open(os.path.join(model_path, "config.json"), "w") as f:
+ json.dump(config_data, f, indent=2)
+
+ logger.success(f"Model saved to {model_path}")
+
+ except Exception as e:
+ logger.error(f"Error saving model: {e}")
+ raise
+
+ def load(self, path: str):
+ """
+ Restore model state from checkpoint directory.
+
+ Loads classifier weights from serialized PyTorch tensors and reinitializes
+ the tokenizer from saved configuration. Restores language-specific label
+ mappings from JSON metadata.
+
+ Args:
+ path (str): Directory containing model checkpoint files.
+
+ Raises:
+ Exception: If file not found or deserialization fails.
+ """
+ try:
+ import json
+ import os
+
+ self._initialize_model()
+
+ classifier_path = os.path.join(path, "classifier.pt")
+ if os.path.exists(classifier_path):
+ self.classifier.load_state_dict(
+ torch.load(classifier_path, map_location=self.device)
+ )
+
+ config_path = os.path.join(path, "config.json")
+ if os.path.exists(config_path):
+ with open(config_path, "r") as f:
+ config_data = json.load(f)
+ self.language = config_data.get("language", self.language)
+ self.labels_map = config_data.get("labels_map", self.labels_map)
+
+ logger.success(f"Model loaded from {path}")
+
+ except Exception as e:
+ logger.error(f"Error loading model: {e}")
+ raise
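+
+
+# Minimal usage sketch (illustrative only; X_train is a list of comment strings and
+# y_train a multi-hot numpy matrix of shape (n_samples, num_labels)):
+#
+# clf = TinyBERTClassifier(language="python")
+# clf.train(X_train, y_train, epochs=1, batch_size=8)
+# preds = clf.predict(["# TODO: refactor this block"], threshold=0.3)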
diff --git a/turing/modeling/predict.py b/turing/modeling/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..4304a04830035c8e3e50dc97cc67a955ecce1c77
--- /dev/null
+++ b/turing/modeling/predict.py
@@ -0,0 +1,195 @@
+import importlib
+import warnings
+
+import dagshub
+from loguru import logger
+import mlflow
+import numpy as np
+import pandas as pd
+
+from turing.config import INPUT_COLUMN, LABELS_MAP, LANGS, MODEL_CONFIG, MODELS_DIR
+from turing.dataset import DatasetManager
+from turing.modeling.model_selector import get_best_model_info
+from turing.modeling.models.codeBerta import CodeBERTa
+
+
+class ModelInference:
+ # Model Configuration (Fallback Registry)
+ FALLBACK_MODEL_REGISTRY = {
+ "java": {
+ "run_id": "446f4459780347da8c796e619129be37",
+ "artifact": "fine-tuned-CodeBERTa_java",
+ "model_id": "codeberta",
+ },
+ "python": {
+ "run_id": "ef5fd8ebf33a412087dcf02afd9e3147",
+ "artifact": "fine-tuned-CodeBERTa_python",
+ "model_id": "codeberta",
+ },
+ "pharo": {
+ "run_id": "97822c6d84fc40c5b2363c9201a39997",
+ "artifact": "fine-tuned-CodeBERTa_pharo",
+ "model_id": "codeberta",
+ },
+ }
+
+
+ def __init__(self, repo_owner="se4ai2526-uniba", repo_name="Turing", use_best_model_tags=True):
+ dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)
+ warnings.filterwarnings("ignore")
+ self.dataset_manager = DatasetManager()
+ self.use_best_model_tags = use_best_model_tags
+
+ # Initialize model registry based on configuration
+ if use_best_model_tags:
+ logger.info("Using MLflow tags to find best models")
+
+ self.model_registry = {}
+ for lang in LANGS:
+ try:
+ model_info = get_best_model_info(
+ lang, fallback_registry=self.FALLBACK_MODEL_REGISTRY
+ )
+ self.model_registry[lang] = model_info
+ logger.info(f"Loaded model info for {lang}: {model_info}")
+
+ # raise error if any required info is missing
+ if not all(k in model_info for k in ("run_id", "artifact", "model_id")):
+ raise ValueError(f"Incomplete model info for {lang}: {model_info}")
+
+ except Exception as e:
+ logger.warning(f"Could not load model info for {lang}: {e}")
+ if lang in self.FALLBACK_MODEL_REGISTRY:
+ self.model_registry[lang] = self.FALLBACK_MODEL_REGISTRY[lang]
+
+ # Pre-cache models locally
+ run_id = self.model_registry[lang]["run_id"]
+ artifact = self.model_registry[lang]["artifact"]
+ self._get_cached_model_path(run_id, artifact, lang)
+ else:
+ logger.info("Using hardcoded model registry")
+ self.model_registry = self.FALLBACK_MODEL_REGISTRY
+
+ def _decode_predictions(self, raw_predictions, language: str):
+ """
+ Converts the binary matrix from the model into human-readable labels.
+
+ Args:
+ raw_predictions: Numpy array or similar with binary predictions
+ language: Programming language for label mapping
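+
+ Example (illustrative, assuming labels_map == ["Summary", "Usage", "Pointer"]):
+ [[1, 0, 1]] -> [["Summary", "Pointer"]]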
+ """
+
+ labels_map = LABELS_MAP.get(language, [])
+ decoded_results = []
+
+ # Ensure input is a numpy array for processing
+ if isinstance(raw_predictions, list):
+ raw_array = np.array(raw_predictions)
+ elif isinstance(raw_predictions, pd.DataFrame):
+ raw_array = raw_predictions.values
+ else:
+ raw_array = raw_predictions
+
+ # Iterate over rows
+ for row in raw_array:
+ indices = np.where(row == 1)[0]
+ # Map indices to labels safely
+ row_labels = [labels_map[i] for i in indices if i < len(labels_map)]
+ decoded_results.append(row_labels)
+
+ return decoded_results
+
+ def _get_cached_model_path(self, run_id: str, artifact_name: str, language: str) -> str:
+ """Checks if model exists locally; if not, downloads it from MLflow."""
+ # Define local path: models/mlflow_temp_models/language/artifact_name
+ local_path = MODELS_DIR / "mlflow_temp_models" / language / artifact_name
+
+ if local_path.exists():
+ logger.info(f"Loading {language} model from local cache: {local_path}")
+ return str(local_path)
+
+ logger.info(
+ f"Model not found locally. Downloading {language} model from MLflow (Run ID: {run_id})..."
+ )
+
+ # Ensure parent directory exists
+ local_path.parent.mkdir(parents=True, exist_ok=True)
+
+ # Download artifacts to the parent directory (artifact_name folder will be created inside)
+ mlflow.artifacts.download_artifacts(
+ run_id=run_id, artifact_path=artifact_name, dst_path=str(local_path.parent)
+ )
+ logger.success(f"Model downloaded and cached at: {local_path}")
+
+ return str(local_path)
+
+ def predict_payload(self, texts: list[str], language: str):
+ """
+ API Prediction: Automatically fetches the correct model from the registry based on language.
+
+ Args:
+ texts: List of code comments to classify
+ language: Programming language
+ """
+
+ # 1. Validate Language and Fetch Config
+ if language not in self.model_registry:
+ raise ValueError(
+ f"Language '{language}' is not supported or the model is not configured."
+ )
+
+ model_config = self.model_registry[language]
+ run_id = model_config["run_id"]
+ artifact_name = model_config["artifact"]
+ model_id = model_config["model_id"]
+
+ # Dynamically import model class
+ config_entry = MODEL_CONFIG[model_id]
+ module_name = config_entry["model_class_module"]
+ class_name = config_entry["model_class_name"]
+ module = importlib.import_module(module_name)
+ model_class = getattr(module, class_name)
+
+ # 2. Get Model Path (Local Cache or Download)
+ model_path = self._get_cached_model_path(run_id, artifact_name, language)
+
+ # Load Model
+ model = model_class(language=language, path=model_path)
+
+ # 3. Predict
+ raw_predictions = model.predict(texts)
+
+ # 4. Decode Labels
+ decoded_labels = self._decode_predictions(raw_predictions, language)
+
+ return raw_predictions, decoded_labels, run_id, artifact_name
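+
+ # Illustrative call (hypothetical input):
+ # raw, labels, run_id, artifact = inference.predict_payload(["# TODO: refactor"], "python")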
+
+ def predict_from_mlflow(
+ self, mlflow_run_id: str, artifact_name: str, language: str, model_class=CodeBERTa
+ ):
+ """
+ Legacy method for CML/CLI: Predicts on the test dataset stored on disk.
+ """
+ # Load Dataset
+ try:
+ full_dataset = self.dataset_manager.get_dataset()
+ dataset_key = f"{language}_test"
+ if dataset_key not in full_dataset:
+ raise ValueError(f"Dataset key '{dataset_key}' not found.")
+ test_ds = full_dataset[dataset_key]
+ X_test = test_ds[INPUT_COLUMN]
+ except Exception as e:
+ logger.error(f"Error loading dataset: {e}")
+ raise e
+
+ # Load Model (Local Cache or Download)
+ model_path = self._get_cached_model_path(mlflow_run_id, artifact_name, language)
+ model = model_class(language=language, path=model_path)
+
+ raw_predictions = model.predict(X_test)
+
+ # Decode output
+ readable_predictions = self._decode_predictions(raw_predictions, language)
+
+ logger.info("Dataset prediction completed.")
+ return readable_predictions
diff --git a/turing/modeling/train.py b/turing/modeling/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..cad1b0567f663bcf20c40af5ecdb17c28fa49fe7
--- /dev/null
+++ b/turing/modeling/train.py
@@ -0,0 +1,212 @@
+from importlib import import_module
+import os
+import warnings
+
+import dagshub
+from loguru import logger
+import mlflow
+from mlflow.tracking import MlflowClient
+import numpy as np
+import typer
+
+import turing.config as config
+from turing.dataset import DatasetManager
+from turing.evaluate_model import evaluate_models
+
+dagshub.init(repo_owner="se4ai2526-uniba", repo_name="Turing", mlflow=True)
+
+warnings.filterwarnings("ignore")
+
+DEFAULT_MODEL = "codeberta"
+_default_cfg = config.MODEL_CONFIG[DEFAULT_MODEL]
+
+MODEL_CLASS_MODULE = _default_cfg["model_class_module"]
+MODEL_CLASS_NAME = _default_cfg["model_class_name"]
+MODEL_CLASS = getattr(import_module(MODEL_CLASS_MODULE), MODEL_CLASS_NAME)
+EXP_NAME = _default_cfg["exp_name"]
+MODEL_NAME = _default_cfg["model_name"]
+
+
+
+app = typer.Typer()
+
+
+def tag_best_models(
+ metric: str = "f1_score"
+):
+ """
+ Tag the best existing models in MLflow based on the specified metric.
+ Remove previous best_model tags before tagging the new best models.
+
+ Args:
+ metric: Metric to use for determining the best model
+ """
+
+ dagshub.init(repo_owner="se4ai2526-uniba", repo_name="Turing", mlflow=True)
+ client = MlflowClient()
+
+ # Get all experiments from Mlflow
+ experiments = client.search_experiments()
+ if not experiments:
+ logger.error("No experiments found in MLflow")
+ return
+
+ # Find the best run for each language
+ experiments_ids = [exp.experiment_id for exp in experiments]
+ for lang in config.LANGS:
+ # Get all runs for the language
+ runs = client.search_runs(
+ experiment_ids=experiments_ids,
+ filter_string=f"tags.Language = '{lang}'",
+ order_by=[f"metrics.{metric} DESC"]
+ )
+
+ if not runs:
+ logger.warning(f"No runs found for language {lang}")
+ continue
+ logger.info(f"Found {len(runs)} runs for {lang}")
+
+ # Get the best run for the language
+ best_run = runs[0]
+ run_id = best_run.info.run_id
+
+ # Remove previous best_model tags for this language
+ for run in runs[1:]:
+ try:
+ client.delete_tag(run.info.run_id, "best_model")
+ except Exception:
+ pass
+
+ # Tag the best model
+ client.set_tag(run_id, "best_model", "true")
+
+
+def show_tagged_models():
+ """
+ Show all models tagged as best_model.
+ """
+
+ dagshub.init(repo_owner="se4ai2526-uniba", repo_name="Turing", mlflow=True)
+ client = MlflowClient()
+
+ # Get all experiments from Mlflow
+ experiments = client.search_experiments()
+ if not experiments:
+ logger.error("No experiments found in MLflow")
+ return
+
+ # Find all runs tagged as best_model
+ runs = client.search_runs(
+ experiment_ids=[exp.experiment_id for exp in experiments],
+ filter_string="tags.best_model = 'true'",
+ order_by=["tags.Language ASC"]
+ )
+ logger.info(f"\nFound {len(runs)} best models in experiments:\n")
+
+ # Display details of each tagged best model
+ for run in runs:
+ language = run.data.tags.get("Language", "unknown")
+ exp_name = client.get_experiment(run.info.experiment_id).name
+ run_id = run.info.run_id
+ run_name = run.data.tags.get("mlflow.runName", "N/A")
+ dataset_name = run.data.tags.get("dataset_name", "unknown")
+
+ logger.info(f"Language: {language}")
+ logger.info(f" Run: {exp_name}/{run_name} ({run_id})")
+ logger.info(f" Dataset: {dataset_name}")
+
+ if run.data.metrics:
+ for metric in run.data.metrics:
+ logger.info(f" {metric}: {run.data.metrics[metric]:.4f}")
+
+ logger.info("")
+
+
+@app.command()
+def main(
+ model: str = typer.Option(
+ "codeberta", help="Model to train: codeberta, graphcodebert, tinybert, or randomforest"
+ ),
+ dataset: str = typer.Option(None, help="Dataset to use for training"),
+):
+ # Get model configuration from config
+ model_key = model.lower()
+ if model_key not in config.MODEL_CONFIG:
+ logger.error(f"Unknown model: {model_key}. Available models: {list(config.MODEL_CONFIG.keys())}")
+ return
+
+ model_cfg = config.MODEL_CONFIG[model_key]
+ model_name = model_cfg["model_name"]
+ exp_name = model_cfg["exp_name"]
+
+ # Dynamically import model class
+ module = import_module(model_cfg["model_class_module"])
+ model_class = getattr(module, model_cfg["model_class_name"])
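+ # e.g., "codeberta" resolves to the CodeBERTa class in turing/modeling/models/codeBerta.py (per MODEL_CONFIG)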
+
+ logger.info(f"Training model: {model_name}")
+
+ # Load dataset
+ # Use the default dataset location when --dataset is not provided
+ if dataset:
+ dataset_manager = DatasetManager(dataset_path=config.INTERIM_DATA_DIR / "features" / dataset)
+ else:
+ dataset_manager = DatasetManager()
+ try:
+ full_dataset = dataset_manager.get_dataset()
+ dataset_name = dataset_manager.get_dataset_name()
+ except Exception as e:
+ logger.error(f"Error loading dataset: {e}")
+ return
+ logger.info(f"Dataset loaded successfully: {dataset_name}")
+
+ # Train and evaluate models for each language
+ mlflow.set_experiment(exp_name)
+ models = {}
+ for lang in config.LANGS:
+ # Prepare training and testing data
+ train_ds = full_dataset[f"{lang}_train"]
+ test_ds = full_dataset[f"{lang}_test"]
+ X_train = train_ds[config.INPUT_COLUMN]
+ y_train = train_ds[config.LABEL_COLUMN]
+ X_test = test_ds[config.INPUT_COLUMN]
+ y_test = test_ds[config.LABEL_COLUMN]
+ X_train = list(X_train)
+ X_test = list(X_test)
+ y_train = np.array(y_train)
+
+ # Initialize model
+ model = model_class(language=lang)
+
+ # Train and evaluate model within an MLflow run
+ try:
+ with mlflow.start_run(run_name=f"{model_name}_{lang}"):
+ mlflow.set_tag("Language", lang)
+ mlflow.set_tag("dataset_name", dataset_name)
+ mlflow.set_tag("model_id", model_key)
+ mlflow.log_params(model.params)
+ parameters_to_log = model.train(X_train, y_train)
+ mlflow.log_params(parameters_to_log)
+ model.save(os.path.join(config.MODELS_DIR, exp_name), model_name=model_name)
+ metrics = model.evaluate(X_test, y_test)
+ mlflow.log_metrics(metrics)
+
+ # Log model name for later retrieval
+ mlflow.set_tag("model_name", f"{model_name}_{lang}")
+
+ except Exception as e:
+ logger.error(f"Error training/evaluating model for {lang}: {e}")
+ return
+
+ # Store trained model
+ models[lang] = model
+ logger.success(f"All {model_name} models trained and evaluated.")
+
+ # Competition-style evaluation of trained models
+ logger.info("Starting competition-style evaluation of trained models...")
+ evaluate_models(models, full_dataset)
+ logger.success("Evaluation completed.")
+
+ logger.info("Tagging best models in MLflow...")
+ tag_best_models()
+ logger.info("Best models:")
+ show_tagged_models()
+
+
+if __name__ == "__main__":
+ app()
diff --git a/turing/plots.py b/turing/plots.py
new file mode 100644
index 0000000000000000000000000000000000000000..10f8e958ed9634b8c2aceaff6fb3bd6a8841a998
--- /dev/null
+++ b/turing/plots.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+
+from loguru import logger
+from tqdm import tqdm
+import typer
+
+from turing.config import FIGURES_DIR, PROCESSED_DATA_DIR
+
+app = typer.Typer()
+
+
+@app.command()
+def main(
+ # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
+ input_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
+ output_path: Path = FIGURES_DIR / "plot.png",
+ # -----------------------------------------
+):
+ # ---- REPLACE THIS WITH YOUR OWN CODE ----
+ logger.info("Generating plot from data...")
+ for i in tqdm(range(10), total=10):
+ if i == 5:
+ logger.info("Something happened for iteration 5.")
+ logger.success("Plot generation complete.")
+ # -----------------------------------------
+
+
+if __name__ == "__main__":
+ app()
diff --git a/turing/reporting.py b/turing/reporting.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff4fb88e672d38867d2b31f68a95c14c43a04f0f
--- /dev/null
+++ b/turing/reporting.py
@@ -0,0 +1,173 @@
+from datetime import datetime
+import platform
+import sys
+from typing import Optional
+
+from loguru import logger
+import pandas as pd
+
+from turing.config import REPORTS_DIR
+
+
+class TestReportGenerator:
+ """
+ Handles the generation of structured Markdown reports specifically for test execution results.
+ """
+
+ def __init__(self, context_name: str, report_category: str):
+ self.context_name = context_name
+ self.report_category = report_category
+ self.timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+ self.content = []
+ self.output_dir = REPORTS_DIR / self.report_category
+
+ def add_header(self, text: str, level: int = 1):
+ self.content.append(f"\n{'#' * level} {text}\n")
+
+ def add_divider(self, style: str = "thin"):
+ """Add a visual divider line."""
+ dividers = {
+ "thin": "---",
+ "thick": "___",
+ "section": "\n---\n",
+ }
+ self.content.append(f"\n{dividers.get(style, dividers['thin'])}\n")
+
+ def add_code_block(self, content: str, language: str = ""):
+ """Add a code block."""
+ self.content.append(f"\n```{language}\n{content}\n```\n")
+
+ def add_alert_box(self, message: str, box_type: str = "info"):
+ """Add a styled alert box using blockquotes."""
+ box_headers = {
+ "info": "INFO",
+ "success": "SUCCESS",
+ "warning": "WARNING",
+ "error": "ERROR",
+ }
+ header = box_headers.get(box_type, "INFO")
+ self.content.append(f"\n> **{header}**: {message}\n")
+
+ def add_progress_bar(self, passed: int, total: int, width: int = 50):
+ """Add an ASCII progress bar."""
+ if total == 0:
+ percentage = 0
+ filled = 0
+ else:
+ percentage = (passed / total * 100)
+ filled = int(width * passed / total)
+
+ empty = width - filled
+ bar = "█" * filled + "░" * empty
+ self.add_code_block(f"Progress: [{bar}] {percentage:.1f}%\nPassed: {passed}/{total} tests", "")
+
+ def add_summary_box(self, total: int, passed: int, failed: int, skipped: int = 0):
+ """Add a visually enhanced summary box."""
+ success_rate = (passed / total * 100) if total > 0 else 0
+
+ # Determine status
+ if success_rate == 100:
+ status = "ALL TESTS PASSED"
+ elif success_rate >= 80:
+ status = "MOSTLY PASSED"
+ elif success_rate >= 50:
+ status = "PARTIAL SUCCESS"
+ else:
+ status = "NEEDS ATTENTION"
+
+ self.add_header("Executive Summary", level=2)
+ self.add_text(f"**Overall Status:** {status}")
+ self.add_text(f"**Success Rate:** {success_rate:.1f}%")
+
+ # Summary table
+ summary_data = [
+ ["Total Tests", str(total)],
+ ["Passed", str(passed)],
+ ["Failed", str(failed)],
+ ]
+
+ if skipped > 0:
+ summary_data.append(["Skipped", str(skipped)])
+
+ summary_data.append(["Success Rate", f"{success_rate:.1f}%"])
+
+ df = pd.DataFrame(summary_data, columns=["Metric", "Count"])
+ self.add_dataframe(df, title=None, align=("left", "right"))
+
+ # Progress bar
+ self.add_text("**Visual Progress:**")
+ self.add_progress_bar(passed, total)
+
+ def add_environment_metadata(self):
+ """Add enhanced environment metadata."""
+ self.add_header("Environment Information", level=2)
+
+ metadata = [
+ ["Timestamp", datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
+ ["Context", self.context_name.upper()],
+ ["Python Version", sys.version.split()[0]],
+ ["Platform", platform.platform()],
+ ["Architecture", platform.machine()],
+ ]
+ df = pd.DataFrame(metadata, columns=["Parameter", "Value"])
+ self.add_dataframe(df, title=None, align=("left", "left"))
+
+ def add_text(self, text: str):
+ self.content.append(f"\n{text}\n")
+
+ def add_category_stats(self, df: pd.DataFrame, category: str):
+ """Add statistics for a test category."""
+ total = len(df)
+ passed = len(df[df['Result'] == "PASS"])
+ failed = len(df[df['Result'] == "FAIL"])
+ skipped = len(df[df['Result'] == "SKIP"])
+
+ stats = [
+ ["Total", str(total)],
+ ["Passed", f"{passed} ({passed/total*100:.1f}%)" if total > 0 else "0"],
+ ["Failed", f"{failed} ({failed/total*100:.1f}%)" if total > 0 else "0"],
+ ]
+
+ if skipped > 0:
+ stats.append(["Skipped", f"{skipped} ({skipped/total*100:.1f}%)"])
+
+ stats_df = pd.DataFrame(stats, columns=["Status", "Count"])
+ self.add_dataframe(stats_df, title="Statistics", align=("left", "right"))
+
+ def add_dataframe(self, df: pd.DataFrame, title: Optional[str] = None, align: tuple = None):
+ """Add a formatted dataframe table."""
+ if title:
+ self.add_header(title, level=3)
+
+ if df.empty:
+ self.content.append("\n_No data available._\n")
+ return
+
+ try:
+ if not align:
+ align = tuple(["left"] * len(df.columns))
+
+ table_md = df.to_markdown(index=False, tablefmt="pipe", colalign=align)
+ self.content.append(f"\n{table_md}\n")
+ except Exception as e:
+ logger.warning(f"Tabulate error: {e}. Using simple text.")
+ self.content.append(f"\n```text\n{df.to_string(index=False)}\n```\n")
+
+ def save(self, filename: str = "test_report.md") -> str:
+ """Save the report to a file."""
+ try:
+ self.output_dir.mkdir(parents=True, exist_ok=True)
+ file_path = self.output_dir / filename
+
+ # Add footer
+ self.add_divider("section")
+ self.add_text(f"*Report generated on {datetime.now().strftime('%Y-%m-%d at %H:%M:%S')}*")
+ self.add_text("*Powered by Turing Test Suite*")
+
+ with open(file_path, "w", encoding="utf-8") as f:
+ f.write("\n".join(self.content))
+ logger.info(f"Test report saved: {file_path}")
+ return str(file_path)
+ except Exception as e:
+ logger.error(f"Save failed: {e}")
+ raise
diff --git a/turing/tests/behavioral/test_directional.py b/turing/tests/behavioral/test_directional.py
new file mode 100644
index 0000000000000000000000000000000000000000..d82d16743916763a64e603224e78f0e693660fc2
--- /dev/null
+++ b/turing/tests/behavioral/test_directional.py
@@ -0,0 +1,183 @@
+# These tests check that adding or removing keywords logically changes the prediction
+
+
+def test_java_directional_add_deprecation(java_model, get_predicted_labels):
+ """Tests that adding '@deprecated' ADDs the 'deprecation' label"""
+ # Base comment should be a 'Pointer' due to the link
+ base_comment = "/** Use {@link #newUserMethod()} instead. */"
+ # Perturbed comment adds a keyword
+ pert_comment = "/** @deprecated Use {@link #newUserMethod()} instead. */"
+
+ preds_base = get_predicted_labels(java_model, base_comment, "java")
+ preds_pert = get_predicted_labels(java_model, pert_comment, "java")
+
+ # The base comment should not have 'deprecation'
+ assert "deprecation" not in preds_base
+ # The perturbed comment must have 'deprecation'
+ assert "deprecation" in preds_pert
+ # The original 'Pointer' label should still be there
+ assert "Pointer" in preds_base
+ assert "Pointer" in preds_pert
+
+
+def test_python_directional_remove_todo(python_model, get_predicted_labels):
+ """Tests that removing 'TODO' REMOVES the 'DevelopmentNotes' labe."""
+ base_comment = "# TODO: Refactor this entire block."
+ pert_comment = "# Refactor this entire block."
+
+ preds_base = get_predicted_labels(python_model, base_comment, "python")
+ preds_pert = get_predicted_labels(python_model, pert_comment, "python")
+
+ # The base comment must have 'DevelopmentNotes'
+ assert "DevelopmentNotes" in preds_base
+ # The perturbed comment must not have 'DevelopmentNotes'
+ assert "DevelopmentNotes" not in preds_pert
+
+
+def test_pharo_directional_add_responsibility(pharo_model, get_predicted_labels):
+ """Tests that adding 'i am responsible for' adds the 'Responsibilities' label"""
+ base_comment = '"i am a simple arrow"'
+ pert_comment = '"i am a simple arrow. i am responsible for drawing."'
+
+ preds_base = get_predicted_labels(pharo_model, base_comment, "pharo")
+ preds_pert = get_predicted_labels(pharo_model, pert_comment, "pharo")
+
+ # base comment should have 'Intent'
+ assert "Intent" in preds_base
+ # base comment should not have 'Responsibilities'
+ assert "Responsibilities" not in preds_base
+ # perturbed comment must have 'Responsibilities'
+ assert "Responsibilities" in preds_pert
+ # original 'Intent' label should still be there
+ assert "Intent" in preds_pert
+
+
+def test_java_directional_contrast_rational(java_model, get_predicted_labels):
+ """
+ Tests that adding a design rationale adds the 'rational' label
+ """
+ # Base comment is a simple summary
+ base_comment = "/** Returns the user ID. */"
+ # Perturbed comment adds a design rationale
+ pert_comment = "/** Returns the user ID. This is cached for performance. */"
+
+ preds_base = get_predicted_labels(java_model, base_comment, "java")
+ preds_pert = get_predicted_labels(java_model, pert_comment, "java")
+
+ # Base comment should be a 'summary'
+ assert "summary" in preds_base
+ # Base comment should not have 'rational'
+ assert "rational" not in preds_base
+ # Perturbed comment must now have 'rational'
+ assert "rational" in preds_pert
+ # Perturbed comment should ideally still be a 'summary'
+ assert "summary" in preds_pert
+
+
+def test_python_directional_contrast_todo(python_model, get_predicted_labels):
+ """
+ Tests that adding a "TODO" clause adds the 'DevelopmentNotes' label
+ """
+ # Base comment is a simple summary
+ base_comment = "Fetches the user profile."
+ # Perturbed comment adds a development note
+ pert_comment = "Fetches the user profile. TODO: This is deprecated."
+
+ preds_base = get_predicted_labels(python_model, base_comment, "python")
+ preds_pert = get_predicted_labels(python_model, pert_comment, "python")
+
+ # Base comment should be a 'Summary'
+ assert "Summary" in preds_base
+ # Base comment should not have 'DevelopmentNotes'
+ assert "DevelopmentNotes" not in preds_base
+ # Perturbed comment must now have 'DevelopmentNotes'
+ assert "DevelopmentNotes" in preds_pert
+ # Perturbed comment should ideally still be a 'Summary'
+ assert "Summary" in preds_pert
+
+
+def test_pharo_directional_contrast_collaborators(pharo_model, get_predicted_labels):
+ """
+ Tests that adding a 'but i work with' clause adds the 'Collaborators' label
+ """
+ # Base comment is a simple intent
+ base_comment = '"i am a simple arrow like arrowhead."'
+ pert_comment = '"i am a simple arrow, but i work with BlSpace to position."'
+
+ preds_base = get_predicted_labels(pharo_model, base_comment, "pharo")
+ preds_pert = get_predicted_labels(pharo_model, pert_comment, "pharo")
+
+ # Base comment should be 'Intent'
+ assert "Intent" in preds_base
+ # Base comment should not have 'Collaborators'
+ assert "Collaborators" not in preds_base
+ # Perturbed comment must now have 'Collaborators'
+ assert "Collaborators" in preds_pert
+ # Perturbed comment should ideally still have 'Intent'
+ assert "Intent" in preds_pert
+
+
+def test_java_directional_shift_summary_to_expand(java_model, get_predicted_labels):
+ """
+ Tests that replacing a simple 'summary' with an 'Expand' implementation note
+ shifts the primary classification from 'summary' to 'Expand'
+ """
+ # Base comment is a simple summary
+ base_comment = "/** Returns the user ID. */"
+ # Perturbed comment shifts the focus entirely to implementation details
+ pert_comment = "/** Implementation Note: This delegates to the old system. */"
+
+ preds_base = get_predicted_labels(java_model, base_comment, "java")
+ preds_pert = get_predicted_labels(java_model, pert_comment, "java")
+
+ # Base comment must have 'summary'
+ assert "summary" in preds_base
+ # Perturbed comment must not have 'summary'
+ assert "summary" not in preds_pert
+ # Perturbed comment must now have 'Expand'
+ assert "Expand" in preds_pert
+
+
+def test_python_directional_shift_summary_to_devnotes(python_model, get_predicted_labels):
+ """
+ Tests that replacing a 'Summary' with a critical development note (deprecated)
+ shifts the classification from 'Summary' to 'DevelopmentNotes'
+ """
+ print(f"\n[DEBUG] Oggetto modello Python: {python_model}, Lingua: {python_model.language}")
+ # Base comment is a clear Summary
+ base_comment = "Fetches the user profile."
+ # Perturbed comment shifts the focus entirely to a note about future work
+ pert_comment = "DEPRECATED: This function is scheduled for removal in v2.0."
+
+ preds_base = get_predicted_labels(python_model, base_comment, "python")
+ preds_pert = get_predicted_labels(python_model, pert_comment, "python")
+
+ # Base comment must have 'Summary'
+ assert "Summary" in preds_base
+ # Perturbed comment must not have 'Summary'
+ assert "Summary" not in preds_pert
+ # Perturbed comment must now have 'DevelopmentNotes'
+ assert "DevelopmentNotes" in preds_pert
+
+
+def test_pharo_directional_shift_to_example(pharo_model, get_predicted_labels):
+ """
+ Tests that changing a comment from a 'Responsibility' statement to an
+ explicit 'Example' statement shifts the primary classification
+ """
+ # Base comment is a clear 'Responsibilities'
+ base_comment = '"i provide a data structure independent api"'
+ # Perturbed comment replaces the responsibility claim with an explicit example pattern
+ pert_comment = '"[Example] run the data structure independent api."'
+
+ preds_base = get_predicted_labels(pharo_model, base_comment, "pharo")
+ preds_pert = get_predicted_labels(pharo_model, pert_comment, "pharo")
+
+ # Base comment must have Responsibilities
+ assert "Responsibilities" in preds_base
+ # Base comment should not have Example
+ assert "Example" not in preds_base
+ # Perturbed comment must now have Example
+ assert "Example" in preds_pert
+ # Perturbed comment should not have Responsibilities
+ assert "Responsibilities" not in preds_pert
diff --git a/turing/tests/behavioral/test_invariance.py b/turing/tests/behavioral/test_invariance.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe85fdb484ca2c6d1d1db8284b020826c2e23a88
--- /dev/null
+++ b/turing/tests/behavioral/test_invariance.py
@@ -0,0 +1,117 @@
+import pytest
+
+# These tests check that "noise" (like capitalization or punctuation) does not change the prediction
+
+
+@pytest.mark.parametrize(
+ "comment",
+ [
+ ":param user_id: The ID of the user.", # Base
+ ":PARAM USER_ID: THE ID OF THE USER.", # Uppercase
+ " :param user_id: The ID of the user . ", # Whitespace
+ ":param user_id: The ID of the user!!!", # Punctuation
+ ],
+)
+def test_python_invariance_parameters(python_model, comment, get_predicted_labels):
+ """Tests that noise doesn't break ':param' detection."""
+ expected = {"Parameters"}
+ preds = get_predicted_labels(python_model, comment, "python")
+ assert preds == expected
+
+
+def test_java_invariance_deprecation(java_model, get_predicted_labels):
+ """Tests that noise doesn't break '@deprecated' detection"""
+ base_comment = "/** @deprecated Use newUserMethod() */"
+ pert_comment = "/** @DEPRECATED... Use newUserMethod()!!! */"
+
+ preds_base = get_predicted_labels(java_model, base_comment, "java")
+ preds_pert = get_predicted_labels(java_model, pert_comment, "java")
+
+ assert {"deprecation"} <= preds_base
+ assert preds_base == preds_pert
+
+
+def test_python_invariance_summary(python_model, get_predicted_labels):
+ """Tests that noise doesn't break a simple 'Summary' detection"""
+
+ base_comment = "a service specific account of type bar."
+ expected = {"Summary"}
+
+ # Perturbations
+ variants = [
+ base_comment,
+ "A SERVICE SPECIFIC ACCOUNT OF TYPE BAR.",
+ " a service specific account of type bar. ",
+ "a service specific account of type bar!!!",
+ ]
+
+ for comment in variants:
+ preds = get_predicted_labels(python_model, comment, "python")
+ assert preds == expected
+
+
+def test_pharo_invariance_intent(pharo_model, get_predicted_labels):
+ """Tests that noise doesn't break Pharo's 'Intent' detection"""
+
+ base_comment = '"i am a simple arrow like arrowhead."'
+ expected = {"Intent"}
+
+ # Perturbations
+ variants = [
+ base_comment,
+ '"I AM A SIMPLE ARROW LIKE ARROWHEAD."',
+ ' "i am a simple arrow like arrowhead." ',
+ '"i am a simple arrow like arrowhead !!"', #
+ ]
+
+ for comment in variants:
+ preds = get_predicted_labels(pharo_model, comment, "pharo")
+ assert preds == expected
+
+
+def test_python_invariance_typos_parameters(python_model, get_predicted_labels):
+ """
+ Tests typo tolerance
+
+ """
+
+ # Define the single expected outcome
+ expected_labels = {"Parameters"}
+
+ # Define the base case and all its variants (with typos)
+ variants = [
+ ":param user_id: The ID of the user.",
+ ":paramater user_id: The ID of the user.",
+ ":pram user_id: The ID of teh user.",
+ ]
+
+ # Loop through all variants and assert they all produce the *exact* expected outcome
+ for comment in variants:
+ preds = get_predicted_labels(python_model, comment, "python")
+ assert preds == expected_labels
+
+
+def test_java_invariance_semantic_summary(java_model, get_predicted_labels):
+ """
+ Tests semantic invariance
+
+ """
+
+ # Get the prediction for the base comment
+ base_comment = "/** Returns the user ID. */"
+ base_preds = get_predicted_labels(java_model, base_comment, "java")
+
+ # Define semantic paraphrases of the base comment
+ variants = [
+ base_comment,
+ "/** Gets the user ID. */",
+ "/** Fetches the ID for the user. */",
+ "/** A method to return the user's ID. */",
+ ]
+
+ # Check that the base prediction is valid (summary)
+ assert "summary" in base_preds
+
+ for comment in variants:
+ preds = get_predicted_labels(java_model, comment, "java")
+ assert preds == base_preds
diff --git a/turing/tests/behavioral/test_minimum_functionality.py b/turing/tests/behavioral/test_minimum_functionality.py
new file mode 100644
index 0000000000000000000000000000000000000000..f088e7656bb98d67aaebbdfc1bc8da37a1dc5e74
--- /dev/null
+++ b/turing/tests/behavioral/test_minimum_functionality.py
@@ -0,0 +1,52 @@
+import pytest
+
+# These tests check for basic, obvious classifications
+
+
+@pytest.mark.parametrize(
+ "comment, expected_labels",
+ [
+ ("test getfilestatus and related listing operations.", {"summary"}),
+ ("/* @deprecated Use something else. */", {"deprecation"}),
+ ("code source of this file http grepcode.com", {"Pointer"}),
+ ("this is balanced if each pool is balanced.", {"rational"}),
+ ("// For internal use only.", {"Ownership"}),
+ ("this impl delegates to the old filesystem", {"Expand"}),
+ ("/** Usage: new MyClass(arg1). */", {"usage"}),
+ ],
+)
+def test_java_mft(java_model, comment, expected_labels, get_predicted_labels):
+ preds = get_predicted_labels(java_model, comment, "java")
+ assert preds == expected_labels
+
+
+@pytest.mark.parametrize(
+ "comment, expected_labels",
+ [
+ ("a service specific account of type bar.", {"Summary"}),
+ (":param user_id: The ID of the user.", {"Parameters"}),
+ ("# TODO: Refactor this entire block.", {"DevelopmentNotes"}),
+ ("use this class if you want access to all of the mechanisms", {"Usage"}),
+ ("# create a new list by filtering duplicates from the input", {"Expand"}),
+ ],
+)
+def test_python_mft(python_model, comment, expected_labels, get_predicted_labels):
+ preds = get_predicted_labels(python_model, comment, "python")
+ assert preds == expected_labels
+
+
+@pytest.mark.parametrize(
+ "comment, expected_labels",
+ [
+ ("i am a simple arrow like arrowhead.", {"Intent"}),
+ ("the example below shows how to create a simple element", {"Example"}),
+ ("i provide a data structure independent api", {"Responsibilities"}),
+ ("the cache is cleared after each test to ensure isolation.", {"Keyimplementationpoints"}),
+ ("it is possible hovewer to customize a length fraction", {"Keymessages"}),
+ ("collaborators: BlElement, BlSpace", {"Collaborators"}),
+ ],
+)
+def test_pharo_mft(pharo_model, comment, expected_labels, get_predicted_labels):
+ """Tests basic keyword-to-label mapping for Pharo (e.g., 'I am...')."""
+ preds = get_predicted_labels(pharo_model, comment, "pharo")
+ assert preds == expected_labels
diff --git a/turing/tests/conftest.py b/turing/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..16c5c6e5a26b65bf0e7651248082882a7ffea364
--- /dev/null
+++ b/turing/tests/conftest.py
@@ -0,0 +1,305 @@
+import os
+from pathlib import Path
+import sys
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import turing.config as config
+from turing.dataset import DatasetManager
+from turing.reporting import TestReportGenerator
+
+# --- Path Setup ---
+script_dir = os.path.dirname(os.path.abspath(__file__))
+proj_root = os.path.dirname(os.path.dirname(script_dir))
+sys.path.append(proj_root)
+
+train_dir = os.path.join(proj_root, "turing", "modeling")
+sys.path.insert(1, train_dir)
+
+
+try:
+ # Import train.py
+ import turing.modeling.train as train
+except ImportError as e:
+ pytest.skip(
+ f"Could not import 'train.py'. Check sys.path. Error: {e}", allow_module_level=True
+ )
+
+# --- Reporting Setup ---
+execution_results = []
+active_categories = set()
+
+
+def clean_test_name(nodeid):
+ """Pulisce il nome del test rimuovendo parametri lunghi."""
+ parts = nodeid.split("::")
+ test_name = parts[-1]
+ if len(test_name) > 50:
+ test_name = test_name[:47] + "..."
+ return test_name
+
+
+def format_error_message(long_repr):
+ """Estrae solo l'errore principale."""
+ if not long_repr:
+ return ""
+ lines = str(long_repr).split("\n")
+ last_line = lines[-1]
+ clean_msg = last_line.replace("|", "-").strip()
+ if len(clean_msg) > 60:
+ clean_msg = clean_msg[:57] + "..."
+ return clean_msg
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+ outcome = yield
+ report = outcome.get_result()
+
+ if report.when == "call":
+ path_str = str(item.fspath)
+ category = "GENERAL"
+
+ if "unit" in path_str:
+ category = "UNIT"
+ elif "behavioral" in path_str:
+ category = "BEHAVIORAL"
+ elif "modeling" in path_str:
+ category = "MODELING"
+
+ active_categories.add(category)
+
+ # Simplified status mapping
+ status_map = {"passed": "PASS", "failed": "FAIL", "skipped": "SKIP"}
+ status_str = status_map.get(report.outcome, report.outcome.upper())
+
+ execution_results.append(
+ {
+ "Category": category,
+ "Module": item.fspath.basename,
+ "Test Case": clean_test_name(item.nodeid),
+ "Result": status_str,
+ "Time": f"{report.duration:.2f}s",
+ "Message": format_error_message(report.longrepr) if report.failed else "",
+ }
+ )
+
+
+def pytest_sessionfinish(session, exitstatus):
+ """Generate enhanced test report at session end."""
+ if not execution_results:
+ return
+
+ report_type = (
+ f"{list(active_categories)[0].lower()}_tests"
+ if len(active_categories) == 1
+ else "unit_and_behavioral_tests"
+ )
+
+ try:
+ manager = TestReportGenerator(context_name="turing", report_category=report_type)
+
+ # Main title
+ manager.add_header("Turing Test Execution Report")
+ manager.add_divider("section")
+
+ # Environment info
+ manager.add_environment_metadata()
+ manager.add_divider("thin")
+
+ df = pd.DataFrame(execution_results)
+
+ # Summary
+ total = len(df)
+ passed = len(df[df["Result"] == "PASS"])
+ failed = len(df[df["Result"] == "FAIL"])
+ summary = pd.DataFrame(
+ [
+ {
+ "Total": total,
+ "Passed": passed,
+ "Failed": failed,
+ "Success Rate": f"{(passed / total) * 100:.1f}%",
+ }
+ ]
+ )
+ manager.add_dataframe(summary, title="Executive Summary")
+
+ # Detailed breakdown by category
+ cols = ["Module", "Test Case", "Result", "Time", "Message"]
+
+ if len(active_categories) > 1:
+ manager.add_header("Detailed Test Results by Category", level=2)
+ manager.add_divider("thin")
+
+ for cat in sorted(active_categories):
+ subset = df[df["Category"] == cat][cols]
+ manager.add_dataframe(subset, title=f"{cat} Tests")
+ else:
+ manager.add_alert_box(
+ "All tests passed successfully!",
+ box_type="success"
+ )
+
+ manager.save("report.md")
+ except Exception as e:
+ print(f"\nError generating report: {e}")
+
+
+# --- Fixtures ---
+
+
+@pytest.fixture(scope="function")
+def manager() -> DatasetManager:
+ """
+ Provides an instance of DatasetManager for each test.
+ """
+ return DatasetManager()
+
+
+@pytest.fixture(scope="function")
+def fake_csv_data_dir(tmp_path: Path) -> Path:
+ """
+ Creates a temporary directory structure mocking 'data/interim/features/clean-aug-soft-k5000'
+ and populates it with minimal, valid CSV files for testing.
+
+ Returns:
+ Path: The path to the *parent* of 'features' (e.g., the mocked INTERIM_DATA_DIR).
+ """
+ interim_dir = tmp_path / "interim_test"
+ features_dir = interim_dir / "features" / "clean-aug-soft-k5000"
+ features_dir.mkdir(parents=True, exist_ok=True)
+
+ # Define minimal valid CSV content
+ csv_content = (
+ "combo,labels\n"
+ '"java code text","[1, 0, 0, 0, 0, 0, 0]"\n'
+ '"other java code","[0, 1, 0, 0, 0, 0, 0]"\n'
+ )
+
+ # Write mock files
+ (features_dir / "java_train.csv").write_text(csv_content)
+ (features_dir / "java_test.csv").write_text(csv_content)
+
+ # Return the root of the mocked interim directory
+ return interim_dir
+
+
+@pytest.fixture(scope="session")
+def mock_data():
+ """
+ Provides a minimal, consistent, session-scoped dataset for model testing.
+ This simulates the (X, y) data structure used for training and evaluation.
+ """
+ X = [
+ "this is java code for summary",
+ "python is great for parameters",
+ "a java example for usage",
+ "running python script for development notes",
+ "pharo is a language for intent",
+ "another java rational example",
+ ]
+
+ # Mock labels for a 'java' model (7 categories)
+ # Shape (6 samples, 7 features)
+ y = np.array(
+ [
+ [1, 0, 0, 0, 0, 0, 0],
+ [0, 1, 0, 0, 0, 0, 0],
+ [1, 0, 0, 1, 0, 0, 0],
+ [0, 0, 1, 0, 0, 0, 0],
+ [0, 0, 0, 0, 1, 0, 0],
+ [1, 0, 0, 0, 0, 0, 1],
+ ]
+ )
+ return {"X": X, "y": y}
+
+
+@pytest.fixture(scope="module")
+def trained_rf_model(mock_data, tmp_path_factory):
+ """
+ Provides a fully-trained RandomForestTfIdf model instance.
+ """
+ # Import locally to ensure proj_root is set
+ from modeling.models.randomForestTfIdf import RandomForestTfIdf
+
+ # Arrange
+ model = RandomForestTfIdf(language="java")
+
+ # Monkeypatch grid search parameters for maximum speed
+ model.grid_params = {
+ "tfidf__max_features": [10, 20], # Use minimal features
+ "clf__estimator__n_estimators": [2, 5], # Use minimal trees
+ }
+ model.params["cv_folds"] = 2 # Use minimal CV folds
+
+ # Create a persistent temp dir for this module's run
+ model_path = tmp_path_factory.mktemp("trained_rf_model")
+
+ # Act: Train the model
+ model.train(mock_data["X"], mock_data["y"], path=str(model_path), model_name="test_model")
+
+ # Yield the trained model and its save path
+ yield model, model_path
+
+
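+# The behavioral-test fixtures below reuse the model class, experiment name, and
+# base model name exposed by the imported training module.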
+MODEL_CLASS_TO_TEST = train.MODEL_CLASS
+MODEL_EXPERIMENT_NAME = train.EXP_NAME
+MODEL_NAME_BASE = train.MODEL_NAME
+
+
+@pytest.fixture(scope="session")
+def get_predicted_labels():
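+    """Factory fixture: returns a helper that runs a model on a single comment
+    (prepending the combo prefix when the input column is 'combo') and converts
+    the multi-hot prediction into a set of label names."""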
+ def _helper(model, comment_sentence: str, lang: str) -> set:
+ if config.INPUT_COLUMN == "combo":
+ combo_input = f"DummyClass.{lang} | {comment_sentence}"
+ input_data = [combo_input]
+ else:
+ input_data = [comment_sentence]
+
+ prediction_array = model.predict(input_data)[0]
+ labels_map = config.LABELS_MAP[lang]
+ predicted_labels = {labels_map[i] for i, val in enumerate(prediction_array) if val == 1}
+ return predicted_labels
+
+ return _helper
+
+
+@pytest.fixture(scope="module")
+def java_model():
+ """Loads the Java model from the config path"""
+ model_path = os.path.join(config.MODELS_DIR, MODEL_EXPERIMENT_NAME, f"{MODEL_NAME_BASE}_java")
+ if not os.path.exists(model_path):
+ pytest.skip(
+ "Production model not found. Skipping behavioral tests for Java.",
+ allow_module_level=True,
+ )
+ return MODEL_CLASS_TO_TEST(language="java", path=model_path)
+
+
+@pytest.fixture(scope="module")
+def python_model():
+ """Loads the Python model from the config path"""
+ model_path = os.path.join(
+ config.MODELS_DIR, MODEL_EXPERIMENT_NAME, f"{MODEL_NAME_BASE}_python"
+ )
+ if not os.path.exists(model_path):
+ pytest.skip(
+ "Production model not found. Skipping behavioral tests for Python.",
+ allow_module_level=True,
+ )
+ return MODEL_CLASS_TO_TEST(language="python", path=model_path)
+
+
+@pytest.fixture(scope="module")
+def pharo_model():
+ """Loads the Pharo model from the config path"""
+ model_path = os.path.join(config.MODELS_DIR, MODEL_EXPERIMENT_NAME, f"{MODEL_NAME_BASE}_pharo")
+ if not os.path.exists(model_path):
+ pytest.skip(
+ "Production model not found. Skipping behavioral tests for Pharo.",
+ allow_module_level=True,
+ )
+ return MODEL_CLASS_TO_TEST(language="pharo", path=model_path)
diff --git a/turing/tests/unit/test_api.py b/turing/tests/unit/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..27d7dccf645c0c12bfd920cb86ea02ded61e3f7c
--- /dev/null
+++ b/turing/tests/unit/test_api.py
@@ -0,0 +1,201 @@
+from unittest.mock import patch
+
+from fastapi.testclient import TestClient
+import numpy as np
+import pytest
+
+from turing.api.app import app
+from turing.api.schemas import PredictionRequest, PredictionResponse
+
+
+@pytest.fixture
+def client():
+ """Fixture that provides a test client for the FastAPI app."""
+ return TestClient(app)
+
+
+@pytest.fixture
+def mock_inference_engine():
+ """Fixture that provides a mocked inference engine."""
+ with patch('turing.api.app.inference_engine') as mock:
+ yield mock
+
+
+class TestHealthCheck:
+ """Test suite for the health check endpoint."""
+
+ def test_health_check_returns_ok(self, client):
+ """Test that the health check endpoint returns status ok."""
+ response = client.get("/")
+ assert response.status_code == 200
+ assert response.json() == {
+ "status": "ok",
+ "message": "Turing Code Classification API is ready."
+ }
+
+
+class TestPredictEndpoint:
+ """Test suite for the predict endpoint."""
+
+ def test_predict_success_java(self, client, mock_inference_engine):
+ """Test successful prediction for Java code."""
+ # Setup mock
+ mock_inference_engine.predict_payload.return_value = (
+ np.array([0, 1]), # raw predictions as numpy array
+ ["class", "method"], # labels
+ "run_id_123", # run_id
+ "models:/CodeBERTa_java/Production" # artifact
+ )
+
+ # Make request
+ request_data = {
+ "texts": ["public class Main", "public void test()"],
+ "language": "java"
+ }
+ response = client.post("/predict", json=request_data)
+
+ # Assertions
+ assert response.status_code == 200
+ data = response.json()
+ assert "predictions" in data
+ assert "labels" in data
+ assert "model_info" in data
+ assert data["labels"] == ["class", "method"]
+ assert data["model_info"]["language"] == "java"
+
+ def test_predict_success_python(self, client, mock_inference_engine):
+ """Test successful prediction for Python code."""
+ # Setup mock
+ mock_inference_engine.predict_payload.return_value = (
+ np.array([1, 0]), # raw predictions as numpy array
+ ["function", "class"], # labels
+ "run_id_456", # run_id
+ "models:/CodeBERTa_python/Production" # artifact
+ )
+
+ # Make request
+ request_data = {
+ "texts": ["def main():", "class MyClass:"],
+ "language": "python"
+ }
+ response = client.post("/predict", json=request_data)
+
+ # Assertions
+ assert response.status_code == 200
+ data = response.json()
+ assert data["labels"] == ["function", "class"]
+ assert data["model_info"]["language"] == "python"
+
+ def test_predict_success_pharo(self, client, mock_inference_engine):
+ """Test successful prediction for Pharo code."""
+ # Setup mock
+ mock_inference_engine.predict_payload.return_value = (
+ np.array([0]), # raw predictions as numpy array
+ ["method"], # labels
+ "run_id_789", # run_id
+ "models:/CodeBERTa_pharo/Production" # artifact
+ )
+
+ # Make request
+ request_data = {
+ "texts": ["initialize"],
+ "language": "pharo"
+ }
+ response = client.post("/predict", json=request_data)
+
+ # Assertions
+ assert response.status_code == 200
+ data = response.json()
+ assert data["labels"] == ["method"]
+ assert data["model_info"]["language"] == "pharo"
+
+ def test_predict_missing_texts(self, client):
+ """Test that prediction fails when texts are missing."""
+ request_data = {
+ "language": "java"
+ }
+ response = client.post("/predict", json=request_data)
+ assert response.status_code == 422 # Validation error
+
+ def test_predict_missing_language(self, client):
+ """Test that prediction fails when language is missing."""
+ request_data = {
+ "texts": ["public class Main"]
+ }
+ response = client.post("/predict", json=request_data)
+ assert response.status_code == 422 # Validation error
+
+ def test_predict_empty_texts(self, client, mock_inference_engine):
+ """Test prediction with empty texts list."""
+ mock_inference_engine.predict_payload.return_value = (
+ np.array([]), # raw predictions as empty numpy array
+ [], # labels
+ "run_id_000", # run_id
+ "models:/CodeBERTa_java/Production" # artifact
+ )
+
+ request_data = {
+ "texts": [],
+ "language": "java"
+ }
+ response = client.post("/predict", json=request_data)
+
+ # Should succeed with empty results
+ assert response.status_code == 200
+ data = response.json()
+ assert data["predictions"] == []
+ assert data["labels"] == []
+
+ def test_predict_error_handling(self, client, mock_inference_engine):
+ """Test that prediction endpoint handles errors gracefully."""
+ # Setup mock to raise an exception
+ mock_inference_engine.predict_payload.side_effect = Exception("Model loading failed")
+
+ request_data = {
+ "texts": ["public class Main"],
+ "language": "java"
+ }
+ response = client.post("/predict", json=request_data)
+
+ # Should return 500 error
+ assert response.status_code == 500
+ assert "Model loading failed" in response.json()["detail"]
+
+ def test_predict_invalid_language(self, client, mock_inference_engine):
+ """Test prediction with invalid language parameter."""
+ # The model might raise an error for unsupported language
+ mock_inference_engine.predict_payload.side_effect = ValueError("Unsupported language: cobol")
+
+ request_data = {
+ "texts": ["IDENTIFICATION DIVISION."],
+ "language": "cobol"
+ }
+ response = client.post("/predict", json=request_data)
+
+ # Should return 500 error
+ assert response.status_code == 500
+ assert "Unsupported language" in response.json()["detail"]
+
+
+class TestAPISchemas:
+ """Test suite for API schemas validation."""
+
+ def test_prediction_request_valid(self):
+ """Test that PredictionRequest validates correct data."""
+ request = PredictionRequest(
+ texts=["public void main"],
+ language="java"
+ )
+ assert request.texts == ["public void main"]
+ assert request.language == "java"
+
+ def test_prediction_response_valid(self):
+ """Test that PredictionResponse validates correct data."""
+ response = PredictionResponse(
+ predictions=[0, 1],
+ labels=["class", "method"],
+ model_info={"artifact": "models:/CodeBERTa_java/Production", "language": "java"}
+ )
+ assert response.predictions == [0, 1]
+ assert response.labels == ["class", "method"]
+ assert response.model_info["language"] == "java"
diff --git a/turing/tests/unit/test_config.py b/turing/tests/unit/test_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..fac2d83ad3d5ce0c0f15e658850e8b7f46b842e2
--- /dev/null
+++ b/turing/tests/unit/test_config.py
@@ -0,0 +1,133 @@
+import importlib
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+# Import the module to be tested
+import turing.config as config
+
+
+@pytest.mark.config
+class TestConfig:
+ """
+ Test suite for validating the project's configuration module (config.py).
+
+ These tests verify that paths are structured correctly, critical constants
+ are of the expected type and value, and module-level logic
+ (like calculations and .env loading) executes as intended.
+ """
+
+ def test_proj_root_is_correctly_identified(self):
+ """
+ Validates that PROJ_ROOT is a Path object and points to the
+ actual project root directory (which should contain 'pyproject.toml').
+ """
+ assert isinstance(config.PROJ_ROOT, Path)
+ assert config.PROJ_ROOT.is_dir()
+
+ # A common "sanity check" is to look for a known file at the root
+ expected_file = config.PROJ_ROOT / "pyproject.toml"
+ assert expected_file.is_file(), (
+ f"PROJ_ROOT ({config.PROJ_ROOT}) does not seem to be the project root. "
+ f"Could not find {expected_file}"
+ )
+
+ def test_directory_paths_are_correctly_structured(self):
+ """
+ Ensures all key directory variables are Path objects
+ and are correctly parented under PROJ_ROOT.
+ """
+ # List of all directory variables defined in config.py
+ path_vars = [
+ config.DATA_DIR,
+ config.RAW_DATA_DIR,
+ config.INTERIM_DATA_DIR,
+ config.PROCESSED_DATA_DIR,
+ config.EXTERNAL_DATA_DIR,
+ config.MODELS_DIR,
+ config.REPORTS_DIR,
+ config.FIGURES_DIR,
+ ]
+
+ for path_var in path_vars:
+ assert isinstance(path_var, Path)
+ # Check that PROJ_ROOT is an ancestor of this path
+ assert config.PROJ_ROOT in path_var.parents
+
+ # Spot-check a few for correct relative paths
+ assert config.DATA_DIR == config.PROJ_ROOT / "data"
+ assert config.RAW_DATA_DIR == config.PROJ_ROOT / "data" / "raw"
+ assert config.FIGURES_DIR == config.PROJ_ROOT / "reports" / "figures"
+
+ def test_dataset_constants_are_valid(self):
+ """
+ Validates that critical dataset constants are non-empty and of
+ the correct type.
+ """
+ assert isinstance(config.DATASET_HF_ID, str)
+ assert config.DATASET_HF_ID == "NLBSE/nlbse26-code-comment-classification"
+
+ assert isinstance(config.LANGS, list)
+ assert len(config.LANGS) == 3
+ assert "java" in config.LANGS
+
+ assert isinstance(config.INPUT_COLUMN, str) and config.INPUT_COLUMN
+ assert isinstance(config.LABEL_COLUMN, str) and config.LABEL_COLUMN
+
+ def test_labels_map_and_total_categories_are_correct(self):
+ """
+ Validates the LABELS_MAP structure and ensures TOTAL_CATEGORIES
+ is correctly calculated from it.
+ """
+ assert isinstance(config.LABELS_MAP, dict)
+
+ # Ensure all languages in LANGS are keys in LABELS_MAP
+ for lang in config.LANGS:
+ assert lang in config.LABELS_MAP
+ assert isinstance(config.LABELS_MAP[lang], list)
+ assert len(config.LABELS_MAP[lang]) > 0
+
+ # Validate the derived calculation
+ expected_total = (
+ len(config.LABELS_MAP["java"])
+ + len(config.LABELS_MAP["python"])
+ + len(config.LABELS_MAP["pharo"])
+ )
+ assert config.TOTAL_CATEGORIES == expected_total
+ assert config.TOTAL_CATEGORIES == 18 # 7 + 5 + 6
+
+ def test_numeric_parameters_are_positive(self):
+ """
+ Ensures that numeric scoring and training parameters are positive
+ and of the correct type.
+ """
+ numeric_params = {
+ "MAX_AVG_RUNTIME": config.MAX_AVG_RUNTIME,
+ "MAX_AVG_FLOPS": config.MAX_AVG_FLOPS,
+ "DEFAULT_BATCH_SIZE": config.DEFAULT_BATCH_SIZE,
+ "DEFAULT_NUM_ITERATIONS": config.DEFAULT_NUM_ITERATIONS,
+ }
+
+ for name, value in numeric_params.items():
+ assert isinstance(value, (int, float)), f"{name} is not numeric"
+ assert value > 0, f"{name} must be positive"
+
+ @patch("dotenv.load_dotenv")
+ def test_load_dotenv_is_called_on_module_load(self, mock_load_dotenv):
+ """
+ Tests that the load_dotenv() function is executed when the
+ config.py module is loaded.
+
+ This requires reloading the module, as it's likely already been
+ imported by pytest or conftest.
+ """
+ # Arrange (Patch is active)
+
+ # Act
+ # Reload the config module to trigger its top-level statements
+ importlib.reload(config)
+
+ # Assert
+ # Check that the patched load_dotenv was called
+ mock_load_dotenv.assert_called_once()
diff --git a/turing/tests/unit/test_dataset.py b/turing/tests/unit/test_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..271b19932e6fcf88faf65fc3b15cc846c8003961
--- /dev/null
+++ b/turing/tests/unit/test_dataset.py
@@ -0,0 +1,95 @@
+from pathlib import Path
+
+import pytest
+
+# Project modules are importable thanks to conftest.py
+import turing.config as config
+from turing.dataset import DatasetManager
+
+
+@pytest.mark.data_loader
+class TestDatasetManager:
+ """
+ Unit tests for the DatasetManager class.
+ This test suite validates initialization, data transformation logic,
+ and data loading mechanisms, including error handling.
+ """
+
+ def test_initialization_paths_are_correct(self, manager: DatasetManager):
+ """
+ Verifies that the DatasetManager initializes with the correct
+ Hugging Face ID and constructs its paths as expected.
+ """
+ assert manager.hf_id == "NLBSE/nlbse26-code-comment-classification"
+ assert "data/raw" in str(manager.raw_data_dir)
+ # base_interim_path should contain either 'base' or 'features'
+ path_str = str(manager.base_interim_path)
+ assert "data/interim" in path_str and ("base" in path_str or "features" in path_str)
+
+ @pytest.mark.parametrize(
+ "input_labels, expected_output",
+ [
+ ([1, 0, 1], "[1, 0, 1]"), # Case: Standard list
+ ("[1, 0, 1]", "[1, 0, 1]"), # Case: Already a string
+ ([], "[]"), # Case: Empty list
+ (None, None), # Case: None value
+ ],
+ )
+ def test_format_labels_for_csv(self, manager: DatasetManager, input_labels, expected_output):
+ """
+ Tests the internal _format_labels_for_csv method to ensure
+ it correctly serializes label lists (or handles other inputs) to strings.
+ """
+ # Arrange
+ example = {"labels": input_labels}
+
+ # Act
+ formatted_example = manager._format_labels_for_csv(example)
+
+ # Assert
+ assert formatted_example["labels"] == expected_output
+
+ def test_get_dataset_raises_file_not_found(self, monkeypatch):
+ """
+ Ensures that get_dataset() raises a FileNotFoundError when
+ the target interim CSV files do not exist.
+ """
+ # Arrange
+ # Patch the config to point to a non-existent directory
+ fake_dir = Path("/path/that/is/totally/fake")
+ monkeypatch.setattr(config, "INTERIM_DATA_DIR", fake_dir)
+
+ # Manager must be initialized *after* patching config
+ manager_with_fake_path = DatasetManager()
+
+ # Act & Assert
+ with pytest.raises(FileNotFoundError, match="Dataset CSV files not found."):
+ manager_with_fake_path.get_dataset()
+
+ def test_get_dataset_success_and_label_parsing(self, fake_csv_data_dir: Path, monkeypatch):
+ """
+ Verifies that get_dataset() successfully loads data from mock CSVs
+ and correctly parses the string-formatted labels back into lists.
+ """
+ # Arrange
+ # Point the config at our temporary fixture directory
+ monkeypatch.setattr(config, "INTERIM_DATA_DIR", fake_csv_data_dir)
+ manager = DatasetManager()
+
+ # Act
+ dataset = manager.get_dataset()
+
+ # Assert
+ # Check that the correct splits were loaded
+ assert "java_train" in dataset
+ assert "java_test" in dataset
+ assert "python_train" not in dataset # Confirms only found files are loaded
+
+ # Check content integrity
+ assert len(dataset["java_train"]) == 2
+ assert dataset["java_train"][0]["combo"] == "java code text"
+
+        # Check that the string '[1, 0, ...]' was parsed back into a list
+ expected_labels = [1, 0, 0, 0, 0, 0, 0]
+ assert dataset["java_train"][0]["labels"] == expected_labels
+ assert isinstance(dataset["java_train"][0]["labels"], list)
diff --git a/turing/tests/unit/test_features.py b/turing/tests/unit/test_features.py
new file mode 100644
index 0000000000000000000000000000000000000000..6593a6425d4e8fb345c69db2e227308d5e90fb5d
--- /dev/null
+++ b/turing/tests/unit/test_features.py
@@ -0,0 +1,121 @@
+import pandas as pd
+import pytest
+
+from turing.features import (
+ FeatureEngineer,
+ FeaturePipelineConfig,
+ TextProcessor,
+)
+
+# --- Fixtures ---
+
+
+@pytest.fixture(scope="module")
+def full_config():
+ """Returns a config with stopwords and lemmatization enabled."""
+ return FeaturePipelineConfig(
+ use_stopwords=True,
+ use_lemmatization=True,
+ use_combo_feature=False,
+ max_features=5000,
+ min_comment_length=10,
+ max_comment_length=500,
+ enable_augmentation=False,
+ custom_tags="test",
+ )
+
+
+@pytest.fixture(scope="module")
+def basic_config():
+ """Returns a config with all extra steps disabled."""
+ return FeaturePipelineConfig(
+ use_stopwords=False,
+ use_lemmatization=False,
+ use_combo_feature=False,
+ max_features=100,
+ min_comment_length=5,
+ max_comment_length=200,
+ enable_augmentation=False,
+ )
+
+
+@pytest.fixture(scope="module")
+def full_processor(full_config):
+ """A TextProcessor with all steps enabled."""
+ return TextProcessor(config=full_config, language="english")
+
+
+@pytest.fixture(scope="module")
+def basic_processor(basic_config):
+ """A TextProcessor with only basic cleaning (lowercase, punctuation)."""
+ return TextProcessor(config=basic_config, language="english")
+
+
+# --- Tests ---
+
+
+class TestFeaturePipelineConfig:
+ def test_config_id_generation(self, full_config, basic_config):
+ """Tests that the readable ID is generated correctly."""
+ assert full_config.hash_id == "clean-k5000-test"
+ assert basic_config.hash_id == "clean-k100"
+
+ def test_config_attributes(self, full_config):
+ """Tests that attributes are set correctly."""
+ assert full_config.use_stopwords is True
+ assert full_config.use_lemmatization is True
+ assert full_config.max_features == 5000
+
+
+class TestTextProcessor:
+ def test_clean_text_basic(self, basic_processor):
+ """Tests lowercase and punctuation removal."""
+ text = "This is a TEST... with punctuation!!"
+ expected = "this is a test with punctuation"
+ assert basic_processor.clean_text(text) == expected
+
+ def test_clean_text_stopwords(self, full_processor, basic_processor):
+ """Tests stopword removal logic."""
+ text = "this is a test with a stopword"
+
+ # With stopwords enabled
+ expected_full = "test stopword"
+ assert full_processor.clean_text(text) == expected_full
+
+ # With stopwords disabled
+ expected_basic = "this is a test with a stopword"
+ assert basic_processor.clean_text(text) == expected_basic
+
+ def test_clean_text_lemmatization(self, full_processor, basic_processor):
+ """Tests lemmatization logic."""
+ text = "running tests while dogs are barking"
+
+ # With lemmatization enabled
+ expected_full = "running test dog barking" # 'are' and 'while' are stopwords
+ assert full_processor.clean_text(text) == expected_full
+
+ # With lemmatization disabled
+ expected_basic = "running tests while dogs are barking"
+ assert basic_processor.clean_text(text) == expected_basic
+
+ def test_clean_text_handles_none(self, basic_processor):
+ """Tests that it doesn't crash on None or pd.NA."""
+ assert basic_processor.clean_text(None) == ""
+ assert basic_processor.clean_text(pd.NA) == ""
+
+
+class TestFeatureEngineer:
+ def test_extract_numeric_features(self, basic_config):
+ """Tests that extract_features_for_check adds metadata features."""
+ fe = FeatureEngineer(config=basic_config)
+ data = {"comment_sentence": ["This is short.", "This one is a bit longer.", ""]}
+ df = pd.DataFrame(data)
+ df_out = fe.extract_features_for_check(df)
+
+ assert "f_length" in df_out.columns
+ assert "f_word_count" in df_out.columns
+ assert "f_starts_verb" in df_out.columns
+ assert "text_hash" in df_out.columns
+
+ assert df_out["f_length"].tolist() == [14, 25, 0]
+ assert df_out["f_word_count"].tolist() == [3, 6, 0]
diff --git a/turing/tests/unit/test_model.py b/turing/tests/unit/test_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ee8173e09935280b17ae0eae20ec1b1dbcb764b
--- /dev/null
+++ b/turing/tests/unit/test_model.py
@@ -0,0 +1,135 @@
+import inspect
+
+import numpy as np
+import pytest
+
+from turing.config import EXISTING_MODELS
+import turing.modeling.models as my_models
+
+
+@pytest.fixture
+def get_model(request):
+    """Indirect fixture: instantiates the model class found in the module named by the parameter, using the first configured language."""
+ model_name = request.param
+
+ module = getattr(my_models, model_name, None)
+
+ classes = [
+ cls
+ for _, cls in inspect.getmembers(module, inspect.isclass)
+ if cls.__module__ == module.__name__
+ ]
+
+ cls = classes[0]
+
+ from turing.config import LANGS
+
+ lang = LANGS[0]
+ return cls(language=lang)
+
+
+@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True)
+def test_model_initialization(get_model):
+ """
+ Test that each model class can be initialized without errors.
+ """
+ model = get_model
+ assert model is not None
+ from turing.modeling.baseModel import BaseModel
+
+ assert isinstance(model, BaseModel)
+
+
+@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True)
+def test_model_setup(get_model):
+ """
+ Test that each model class sets up its internal model correctly.
+ """
+ model = get_model
+ model.setup_model()
+ assert model.model is not None
+
+
+@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True)
+def test_model_train(tmp_path, get_model):
+ """
+ Test that each model class can run the train method without errors.
+ """
+ model = get_model
+ model.setup_model()
+
+ # Using mock data for training
+ X_train = ["sample text data"] * 10
+
+ y_train = [0, 1] * 5
+
+ y_train = np.array(y_train).reshape(-1, 1)
+
+    # Temporary output directory (not passed to train in this test)
+ fake_path = tmp_path / "out"
+ fake_path.mkdir()
+
+ parameters = model.train(X_train, y_train)
+
+ assert isinstance(parameters, dict)
+ assert model.model is not None
+
+
+@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True)
+def test_model_evaluate(tmp_path, get_model):
+ """
+ Test that each model class can run the evaluate method without errors.
+ """
+ model = get_model
+ model.setup_model()
+
+ # Using mock data for training
+ X_train = ["sample text data"] * 10
+
+ y_train = [0, 1] * 5
+
+ y_train = np.array(y_train).reshape(-1, 1)
+
+    # Temporary output directory (not passed to train in this test)
+ fake_path = tmp_path / "out"
+ fake_path.mkdir()
+
+ _ = model.train(X_train, y_train)
+
+ # Using mock data for evaluation
+ X_test = ["sample text data"] * 10
+ y_test = [0, 1] * 5
+ metrics = model.evaluate(X_test, y_test)
+
+ assert isinstance(metrics, dict)
+ assert metrics and "accuracy" in metrics
+ assert "f1_score" in metrics or "f1_score_micro" in metrics
+
+
+@pytest.mark.parametrize("get_model", EXISTING_MODELS, indirect=True)
+def test_model_predict(tmp_path, get_model):
+ """
+ Test that each model class can run the predict method without errors.
+ """
+ model = get_model
+ model.setup_model()
+
+ # Using mock data for training
+ X_train = ["sample text data"] * 10
+
+ y_train = [0, 1] * 5
+
+ y_train = np.array(y_train).reshape(-1, 1)
+
+    # Temporary output directory (not passed to train in this test)
+ fake_path = tmp_path / "out"
+ fake_path.mkdir()
+
+ _ = model.train(X_train, y_train)
+
+ # Using mock data for prediction
+ X_input = ["sample text data"] * 3
+ predictions = model.predict(X_input)
+
+ assert predictions is not None
+ assert len(predictions) == len(X_input)