diff --git "a/data/notebooks/HRHUB_v2_8.ipynb" "b/data/notebooks/HRHUB_v2_8.ipynb"
deleted file mode 100644--- "a/data/notebooks/HRHUB_v2_8.ipynb"
+++ /dev/null
@@ -1,3418 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# \ud83e\udde0 HRHUB v2.1 - Enhanced with LLM (FREE VERSION)\n",
-    "\n",
-    "## \ud83d\udcd8 Project Overview\n",
-    "\n",
-    "**Bilateral HR Matching System with LLM-Powered Intelligence**\n",
-    "\n",
-    "### What's New in v2.1:\n",
-    "- \u2705 **FREE LLM**: Using Hugging Face Inference API (no cost)\n",
-    "- \u2705 **Job Level Classification**: Zero-shot & few-shot learning\n",
-    "- \u2705 **Structured Skills Extraction**: Pydantic schemas\n",
-    "- \u2705 **Match Explainability**: LLM-generated reasoning\n",
-    "- \u2705 **Flexible Data Loading**: Upload OR Google Drive\n",
-    "\n",
-    "### Tech Stack:\n",
-    "```\n",
-    "Embeddings: sentence-transformers (local, free)\n",
-    "LLM: Hugging Face Inference API (free tier)\n",
-    "Schemas: Pydantic\n",
-    "Platform: Google Colab \u2192 VS Code\n",
-    "```\n",
-    "\n",
-    "---\n",
-    "\n",
-    "**Master's Thesis - Aalborg University**  \n",
-    "*Business Data Science Program*  \n",
-    "*December 2025*"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 1: Install Dependencies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u2705 All packages installed!\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Install required packages.\n",
-    "# NOTE: the install command below is commented out, so running this cell installs\n",
-    "# nothing; the message printed at the end is only a progress marker.\n",
-    "# Uncomment the line on a fresh environment where the packages are missing.\n",
-    "#!pip install -q sentence-transformers huggingface-hub pydantic plotly pyvis nbformat scikit-learn pandas numpy\n",
-    "\n",
-    "print(\"\u2705 All packages installed!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 2: Import Libraries"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u2705 Environment variables loaded from .env\n",
-      "\u2705 All libraries imported!\n"
-     ]
-    }
-   ],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import json\n",
-    "import os\n",
-    "from typing import List, Dict, Optional, Literal\n",
-    "import warnings\n",
-    "warnings.filterwarnings('ignore')\n",
-    "\n",
-    "# ML & NLP\n",
-    "from sentence_transformers import SentenceTransformer\n",
-    "from sklearn.metrics.pairwise import cosine_similarity\n",
-    "\n",
-    "# LLM Integration (FREE)\n",
-    "from huggingface_hub import InferenceClient\n",
-    "from pydantic import BaseModel, Field\n",
-    "\n",
-    "# Visualization\n",
-    "import plotly.graph_objects as go\n",
-    "from IPython.display import HTML, display\n",
-    "\n",
-    "# Configuration Settings\n",
-    "from dotenv import load_dotenv\n",
-    "\n",
-    "# Load variables from the .env file\n",
-    "load_dotenv()\n",
-    "print(\"\u2705 Environment variables loaded from .env\")\n",
-    "# ============== UP TO HERE \u2b06\ufe0f ==============\n",
-    "\n",
-    "print(\"\u2705 All libraries imported!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 3: Configuration"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u2705 Configuration loaded!\n",
-      "\ud83e\udde0 Embedding model: all-MiniLM-L6-v2\n",
-      "\ud83e\udd16 LLM model: meta-llama/Llama-3.2-3B-Instruct\n",
-      "\ud83d\udd11 HF Token configured: Yes \u2705\n",
-      "\ud83d\udcc2 Data path: ../csv_files/\n"
-     ]
-    }
-   ],
-   "source": [
-    "class Config:\n",
-    "    \"\"\"Centralized configuration for VS Code\"\"\"\n",
-    "    \n",
-    "    # Paths - VS Code structure (relative paths, resolved against the working directory)\n",
-    "    CSV_PATH = '../csv_files/'\n",
-    "    PROCESSED_PATH = '../processed/'\n",
-    "    RESULTS_PATH = '../results/'\n",
-    "    \n",
-    "    # Embedding Model\n",
-    "    EMBEDDING_MODEL = 'all-MiniLM-L6-v2'\n",
-    "    \n",
-    "    # LLM Settings (FREE - Hugging Face)\n",
-    "    HF_TOKEN = os.getenv('HF_TOKEN', '')  # \u2705 Read from the environment (.env); empty string when unset\n",
-    "    LLM_MODEL = 'meta-llama/Llama-3.2-3B-Instruct'\n",
-    "    \n",
-    "    LLM_MAX_TOKENS = 1000\n",
-    "    \n",
-    "    # Matching Parameters\n",
-    "    TOP_K_MATCHES = 10\n",
-    "    SIMILARITY_THRESHOLD = 0.5\n",
-    "    RANDOM_SEED = 42\n",
-    "\n",
-    "# Seed NumPy's global random generator so random draws are repeatable\n",
-    "np.random.seed(Config.RANDOM_SEED)\n",
-    "\n",
-    "print(\"\u2705 Configuration loaded!\")\n",
-    "print(f\"\ud83e\udde0 Embedding model: {Config.EMBEDDING_MODEL}\")\n",
-    "print(f\"\ud83e\udd16 LLM model: {Config.LLM_MODEL}\")\n",
-    "# Only reports whether a token is set; the token value itself is never printed\n",
-    "print(f\"\ud83d\udd11 HF Token configured: {'Yes \u2705' if Config.HF_TOKEN else 'No \u26a0\ufe0f'}\")\n",
-    "print(f\"\ud83d\udcc2 Data path: {Config.CSV_PATH}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83c\udfd7\ufe0f Step 3.5: Architecture - Text Builders\n",
-    "\n",
-    "**HIGH COHESION:** Each class has ONE responsibility\n",
-    "**LOW COUPLING:** Classes don't depend on each other"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# TEXT BUILDER CLASSES - Single Responsibility Principle\n",
-    "# ============================================================================\n",
-    "\n",
-    "from abc import ABC, abstractmethod\n",
-    "from typing import List\n",
-    "\n",
-    "class TextBuilder(ABC):\n",
-    "    \"\"\"Abstract base class that turns one DataFrame row into one text string.\n",
-    "\n",
-    "    Subclasses implement ``build``; ``build_batch`` applies it to every row.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    @staticmethod\n",
-    "    def _has_value(value) -> bool:\n",
-    "        \"\"\"Return True when ``value`` should be written into the text.\n",
-    "\n",
-    "        None, NaN/NA scalars and falsy values (such as '') count as missing.\n",
-    "        NaN needs its own check: it is truthy in Python, so a plain\n",
-    "        ``if value:`` would let it through and render the literal text 'nan'.\n",
-    "        \"\"\"\n",
-    "        if value is None:\n",
-    "            return False\n",
-    "        if pd.api.types.is_scalar(value) and pd.isna(value):\n",
-    "            return False\n",
-    "        return bool(value)\n",
-    "\n",
-    "    @classmethod\n",
-    "    def _labelled_parts(cls, row: pd.Series, labelled_fields) -> List[str]:\n",
-    "        \"\"\"Format each present field as 'Label: value', keeping the given order.\n",
-    "\n",
-    "        Args:\n",
-    "            row: The DataFrame row to read from.\n",
-    "            labelled_fields: Iterable of (column name, label) pairs.\n",
-    "        \"\"\"\n",
-    "        parts = []\n",
-    "        for field, label in labelled_fields:\n",
-    "            value = row.get(field)\n",
-    "            if cls._has_value(value):\n",
-    "                parts.append(f\"{label}: {value}\")\n",
-    "        return parts\n",
-    "\n",
-    "    @abstractmethod\n",
-    "    def build(self, row: pd.Series) -> str:\n",
-    "        \"\"\"Build text representation from DataFrame row\"\"\"\n",
-    "        pass\n",
-    "\n",
-    "    def build_batch(self, df: pd.DataFrame) -> List[str]:\n",
-    "        \"\"\"Build text representations for entire DataFrame, in row order\"\"\"\n",
-    "        return df.apply(self.build, axis=1).tolist()\n",
-    "\n",
-    "\n",
-    "class CandidateTextBuilder(TextBuilder):\n",
-    "    \"\"\"Builds text representation for candidates\"\"\"\n",
-    "\n",
-    "    # Label written in front of each known candidate column.\n",
-    "    FIELD_LABELS = {\n",
-    "        'Category': 'Job Category',\n",
-    "        'skills': 'Skills',\n",
-    "        'career_objective': 'Objective',\n",
-    "        'degree_names': 'Education',\n",
-    "        'positions': 'Experience',\n",
-    "    }\n",
-    "\n",
-    "    def __init__(self, fields: List[str] = None):\n",
-    "        \"\"\"\n",
-    "        Args:\n",
-    "            fields: Columns to include, in output order. Defaults to every\n",
-    "                column in FIELD_LABELS. A column without a known label is\n",
-    "                labelled with its own name.\n",
-    "        \"\"\"\n",
-    "        self.fields = fields or list(self.FIELD_LABELS)\n",
-    "\n",
-    "    def build(self, row: pd.Series) -> str:\n",
-    "        \"\"\"Return the space-joined 'Label: value' parts for the configured fields.\"\"\"\n",
-    "        # Honour self.fields instead of a hard-coded column list, so the\n",
-    "        # constructor argument actually controls what is embedded.\n",
-    "        labelled = [(field, self.FIELD_LABELS.get(field, field)) for field in self.fields]\n",
-    "        return ' '.join(self._labelled_parts(row, labelled))\n",
-    "\n",
-    "\n",
-    "class CompanyTextBuilder(TextBuilder):\n",
-    "    \"\"\"Builds text representation for companies\"\"\"\n",
-    "\n",
-    "    # (column, label) pairs describing the company itself.\n",
-    "    PROFILE_FIELDS = (\n",
-    "        ('name', 'Company'),\n",
-    "        ('description', 'Description'),\n",
-    "        ('industries_list', 'Industries'),\n",
-    "        ('specialties_list', 'Specialties'),\n",
-    "    )\n",
-    "\n",
-    "    # (column, label) pairs coming from aggregated job postings (THE BRIDGE!).\n",
-    "    POSTING_FIELDS = (\n",
-    "        ('required_skills', 'Required Skills'),\n",
-    "        ('posted_job_titles', 'Job Titles'),\n",
-    "        ('experience_levels', 'Experience'),\n",
-    "    )\n",
-    "\n",
-    "    def __init__(self, include_postings: bool = True):\n",
-    "        \"\"\"\n",
-    "        Args:\n",
-    "            include_postings: When True, append the job-posting columns after\n",
-    "                the company profile columns.\n",
-    "        \"\"\"\n",
-    "        self.include_postings = include_postings\n",
-    "\n",
-    "    def build(self, row: pd.Series) -> str:\n",
-    "        \"\"\"Return the space-joined 'Label: value' parts for one company row.\"\"\"\n",
-    "        labelled = list(self.PROFILE_FIELDS)\n",
-    "        if self.include_postings:\n",
-    "            labelled.extend(self.POSTING_FIELDS)\n",
-    "        return ' '.join(self._labelled_parts(row, labelled))\n",
-    "\n",
-    "\n",
-    "print(\"\u2705 Text Builder classes loaded\")\n",
-    "print(\"   \u2022 CandidateTextBuilder\")\n",
-    "print(\"   \u2022 CompanyTextBuilder\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83c\udfd7\ufe0f Step 3.6: Architecture - Embedding Manager\n",
-    "\n",
-    "**Responsibility:** Generate, save, and load embeddings"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# EMBEDDING MANAGER - Handles all embedding operations\n",
-    "# ============================================================================\n",
-    "\n",
-    "from pathlib import Path\n",
-    "from typing import Tuple, Optional\n",
-    "\n",
-    "class EmbeddingManager:\n",
-    "    \"\"\"Generates sentence embeddings and caches them on disk.\n",
-    "\n",
-    "    Every entity type is stored in ``save_dir`` as two files:\n",
-    "    ``<entity_type>_embeddings.npy`` and ``<entity_type>_metadata.pkl``.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    def __init__(self, model: SentenceTransformer, save_dir: str):\n",
-    "        \"\"\"\n",
-    "        Args:\n",
-    "            model: Encoder whose ``encode`` method produces the embeddings.\n",
-    "            save_dir: Directory for cached files; created if it does not exist.\n",
-    "        \"\"\"\n",
-    "        self.model = model\n",
-    "        self.save_dir = Path(save_dir)\n",
-    "        self.save_dir.mkdir(parents=True, exist_ok=True)\n",
-    "\n",
-    "    def _get_file_paths(self, entity_type: str) -> Tuple[Path, Path]:\n",
-    "        \"\"\"Return (embeddings file, metadata file) for ``entity_type``.\"\"\"\n",
-    "        emb_file = self.save_dir / f\"{entity_type}_embeddings.npy\"\n",
-    "        meta_file = self.save_dir / f\"{entity_type}_metadata.pkl\"\n",
-    "        return emb_file, meta_file\n",
-    "\n",
-    "    def exists(self, entity_type: str) -> bool:\n",
-    "        \"\"\"Return True when an embeddings file exists for ``entity_type``.\"\"\"\n",
-    "        emb_file, _ = self._get_file_paths(entity_type)\n",
-    "        return emb_file.exists()\n",
-    "\n",
-    "    def load(self, entity_type: str) -> Tuple[np.ndarray, Optional[pd.DataFrame]]:\n",
-    "        \"\"\"Load cached embeddings and, when present, their metadata.\n",
-    "\n",
-    "        Returns:\n",
-    "            (embeddings, metadata); metadata is None when no metadata file exists.\n",
-    "\n",
-    "        Raises:\n",
-    "            FileNotFoundError: If the embeddings file is missing.\n",
-    "        \"\"\"\n",
-    "        emb_file, meta_file = self._get_file_paths(entity_type)\n",
-    "\n",
-    "        if not emb_file.exists():\n",
-    "            raise FileNotFoundError(f\"Embeddings not found: {emb_file}\")\n",
-    "\n",
-    "        embeddings = np.load(emb_file)\n",
-    "        # NOTE(review): unpickling can execute arbitrary code - only load\n",
-    "        # metadata files that this notebook wrote itself.\n",
-    "        metadata = pd.read_pickle(meta_file) if meta_file.exists() else None\n",
-    "\n",
-    "        return embeddings, metadata\n",
-    "\n",
-    "    def generate(self,\n",
-    "                texts: List[str],\n",
-    "                batch_size: int = 32,\n",
-    "                show_progress: bool = True) -> np.ndarray:\n",
-    "        \"\"\"Encode ``texts`` with the model and return normalized NumPy embeddings.\"\"\"\n",
-    "        return self.model.encode(\n",
-    "            texts,\n",
-    "            batch_size=batch_size,\n",
-    "            show_progress_bar=show_progress,\n",
-    "            normalize_embeddings=True,\n",
-    "            convert_to_numpy=True\n",
-    "        )\n",
-    "\n",
-    "    def save(self,\n",
-    "            entity_type: str,\n",
-    "            embeddings: np.ndarray,\n",
-    "            metadata: pd.DataFrame) -> None:\n",
-    "        \"\"\"Write embeddings (.npy) and metadata (.pkl) and print both paths.\"\"\"\n",
-    "        emb_file, meta_file = self._get_file_paths(entity_type)\n",
-    "\n",
-    "        np.save(emb_file, embeddings)\n",
-    "        metadata.to_pickle(meta_file)\n",
-    "\n",
-    "        print(f\"\ud83d\udcbe Saved:\")\n",
-    "        print(f\"   {emb_file}\")\n",
-    "        print(f\"   {meta_file}\")\n",
-    "\n",
-    "    def generate_and_save(self,\n",
-    "                         entity_type: str,\n",
-    "                         texts: List[str],\n",
-    "                         metadata: pd.DataFrame,\n",
-    "                         batch_size: int = 32) -> np.ndarray:\n",
-    "        \"\"\"Generate embeddings for ``texts`` and cache them with ``metadata``.\n",
-    "\n",
-    "        Raises:\n",
-    "            ValueError: If ``texts`` and ``metadata`` differ in length; saving\n",
-    "                them together would produce a misaligned cache.\n",
-    "        \"\"\"\n",
-    "        if len(texts) != len(metadata):\n",
-    "            raise ValueError(\n",
-    "                f\"texts ({len(texts)}) and metadata ({len(metadata)}) must have the same length\"\n",
-    "            )\n",
-    "\n",
-    "        print(f\"\ud83d\udd04 Generating {entity_type} embeddings...\")\n",
-    "        print(f\"   Processing {len(texts):,} items...\")\n",
-    "\n",
-    "        embeddings = self.generate(texts, batch_size=batch_size)\n",
-    "        self.save(entity_type, embeddings, metadata)\n",
-    "\n",
-    "        return embeddings\n",
-    "\n",
-    "    def load_or_generate(self,\n",
-    "                        entity_type: str,\n",
-    "                        texts: List[str],\n",
-    "                        metadata: pd.DataFrame,\n",
-    "                        force_regenerate: bool = False) -> Tuple[np.ndarray, pd.DataFrame]:\n",
-    "        \"\"\"Return cached embeddings when usable, otherwise generate and cache them.\n",
-    "\n",
-    "        The cache is reused only when its row count equals ``len(metadata)``;\n",
-    "        the content of ``texts`` is not compared, so pass\n",
-    "        ``force_regenerate=True`` after changing how the texts are built.\n",
-    "\n",
-    "        Returns:\n",
-    "            (embeddings, metadata) where metadata is the frame passed in.\n",
-    "        \"\"\"\n",
-    "        if not force_regenerate and self.exists(entity_type):\n",
-    "            print(f\"\ud83d\udce5 Loading {entity_type} embeddings...\")\n",
-    "            # The cached metadata is not needed: the caller's frame is returned.\n",
-    "            embeddings, _ = self.load(entity_type)\n",
-    "\n",
-    "            # Verify alignment\n",
-    "            if len(embeddings) != len(metadata):\n",
-    "                print(f\"\u26a0\ufe0f  Size mismatch! Regenerating...\")\n",
-    "                embeddings = self.generate_and_save(\n",
-    "                    entity_type, texts, metadata\n",
-    "                )\n",
-    "            else:\n",
-    "                print(f\"\u2705 Loaded: {embeddings.shape}\")\n",
-    "        else:\n",
-    "            embeddings = self.generate_and_save(\n",
-    "                entity_type, texts, metadata\n",
-    "            )\n",
-    "\n",
-    "        return embeddings, metadata\n",
-    "\n",
-    "\n",
-    "print(\"\u2705 EmbeddingManager class loaded\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83c\udfd7\ufe0f Step 3.7: Architecture - Matching Engine\n",
-    "\n",
-    "**Responsibility:** Calculate similarities and find matches"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# MATCHING ENGINE - Handles similarity calculations\n",
-    "# ============================================================================\n",
-    "\n",
-    "class MatchingEngine:\n",
-    "    \"\"\"Calculates similarities and finds top matches\"\"\"\n",
-    "\n",
-    "    def __init__(self,\n",
-    "                candidate_vectors: np.ndarray,\n",
-    "                company_vectors: np.ndarray,\n",
-    "                candidate_metadata: pd.DataFrame,\n",
-    "                company_metadata: pd.DataFrame):\n",
-    "        \"\"\"Store embeddings and metadata; row i of each matrix describes metadata row i.\"\"\"\n",
-    "\n",
-    "        self.cand_vectors = candidate_vectors\n",
-    "        self.comp_vectors = company_vectors\n",
-    "        self.cand_metadata = candidate_metadata\n",
-    "        self.comp_metadata = company_metadata\n",
-    "\n",
-    "        # Verify alignment\n",
-    "        assert len(candidate_vectors) == len(candidate_metadata), \\\n",
-    "            \"Candidate embeddings and metadata size mismatch\"\n",
-    "        assert len(company_vectors) == len(company_metadata), \\\n",
-    "            \"Company embeddings and metadata size mismatch\"\n",
-    "\n",
-    "    @staticmethod\n",
-    "    def _check_index(idx: int, size: int, kind: str) -> None:\n",
-    "        \"\"\"Raise IndexError unless 0 <= idx < size.\n",
-    "\n",
-    "        Negative values are rejected on purpose: NumPy and ``iloc`` would wrap\n",
-    "        them around and silently return a different row.\n",
-    "        \"\"\"\n",
-    "        if not 0 <= idx < size:\n",
-    "            raise IndexError(f\"{kind} index {idx} out of range\")\n",
-    "\n",
-    "    def find_matches(self,\n",
-    "                    candidate_idx: int,\n",
-    "                    top_k: int = 10) -> List[Tuple[int, float]]:\n",
-    "        \"\"\"Find top K company matches for a candidate.\n",
-    "\n",
-    "        Returns:\n",
-    "            (company index, cosine similarity) tuples, best match first.\n",
-    "            Empty when ``top_k`` is not positive.\n",
-    "\n",
-    "        Raises:\n",
-    "            IndexError: If ``candidate_idx`` is negative or past the last candidate.\n",
-    "        \"\"\"\n",
-    "        self._check_index(candidate_idx, len(self.cand_vectors), \"Candidate\")\n",
-    "\n",
-    "        # A negative top_k would otherwise slice off the tail instead of limiting the result.\n",
-    "        if top_k <= 0:\n",
-    "            return []\n",
-    "\n",
-    "        # Get candidate vector\n",
-    "        cand_vec = self.cand_vectors[candidate_idx].reshape(1, -1)\n",
-    "\n",
-    "        # Calculate similarities\n",
-    "        similarities = cosine_similarity(cand_vec, self.comp_vectors)[0]\n",
-    "\n",
-    "        # Get top K\n",
-    "        top_indices = np.argsort(similarities)[::-1][:top_k]\n",
-    "\n",
-    "        # Return (index, score) tuples\n",
-    "        return [(int(idx), float(similarities[idx])) for idx in top_indices]\n",
-    "\n",
-    "    def get_match_details(self,\n",
-    "                         candidate_idx: int,\n",
-    "                         company_idx: int) -> dict:\n",
-    "        \"\"\"Get detailed match information.\n",
-    "\n",
-    "        Returns:\n",
-    "            Dict with the candidate row, the company row (both as dicts) and\n",
-    "            their cosine similarity under 'similarity_score'.\n",
-    "\n",
-    "        Raises:\n",
-    "            IndexError: If either index is negative or out of range.\n",
-    "        \"\"\"\n",
-    "        self._check_index(candidate_idx, len(self.cand_vectors), \"Candidate\")\n",
-    "        self._check_index(company_idx, len(self.comp_vectors), \"Company\")\n",
-    "\n",
-    "        candidate = self.cand_metadata.iloc[candidate_idx]\n",
-    "        company = self.comp_metadata.iloc[company_idx]\n",
-    "\n",
-    "        # Calculate similarity\n",
-    "        cand_vec = self.cand_vectors[candidate_idx].reshape(1, -1)\n",
-    "        comp_vec = self.comp_vectors[company_idx].reshape(1, -1)\n",
-    "        similarity = float(cosine_similarity(cand_vec, comp_vec)[0][0])\n",
-    "\n",
-    "        return {\n",
-    "            'candidate': candidate.to_dict(),\n",
-    "            'company': company.to_dict(),\n",
-    "            'similarity_score': similarity\n",
-    "        }\n",
-    "\n",
-    "    def batch_match(self,\n",
-    "                   candidate_indices: List[int],\n",
-    "                   top_k: int = 10) -> dict:\n",
-    "        \"\"\"Find matches for multiple candidates.\n",
-    "\n",
-    "        Returns:\n",
-    "            Dict mapping each candidate index to its ``find_matches`` result.\n",
-    "        \"\"\"\n",
-    "        return {idx: self.find_matches(idx, top_k=top_k) for idx in candidate_indices}\n",
-    "\n",
-    "\n",
-    "print(\"\u2705 MatchingEngine class loaded\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 4: Load All Datasets"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\ud83d\udcc2 Loading all datasets...\n",
-      "\n",
-      "======================================================================\n",
-      "\u2705 Candidates: 9,544 rows \u00d7 35 columns\n",
-      "\u2705 Companies (base): 24,473 rows\n",
-      "\u2705 Company industries: 24,375 rows\n",
-      "\u2705 Company specialties: 169,387 rows\n",
-      "\u2705 Employee counts: 35,787 rows\n",
-      "\u2705 Postings: 123,849 rows \u00d7 31 columns\n",
-      "\u2705 Job skills: 213,768 rows\n",
-      "\u2705 Job industries: 164,808 rows\n",
-      "\n",
-      "======================================================================\n",
-      "\u2705 All datasets loaded successfully!\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"\ud83d\udcc2 Loading all datasets...\\n\")\n",
-    "print(\"=\" * 70)\n",
-    "\n",
-    "# Load main datasets (required: a missing file stops the notebook here)\n",
-    "candidates = pd.read_csv(f'{Config.CSV_PATH}resume_data.csv')\n",
-    "print(f\"\u2705 Candidates: {len(candidates):,} rows \u00d7 {len(candidates.columns)} columns\")\n",
-    "\n",
-    "companies_base = pd.read_csv(f'{Config.CSV_PATH}companies.csv')\n",
-    "print(f\"\u2705 Companies (base): {len(companies_base):,} rows\")\n",
-    "\n",
-    "company_industries = pd.read_csv(f'{Config.CSV_PATH}company_industries.csv')\n",
-    "print(f\"\u2705 Company industries: {len(company_industries):,} rows\")\n",
-    "\n",
-    "company_specialties = pd.read_csv(f'{Config.CSV_PATH}company_specialities.csv')\n",
-    "print(f\"\u2705 Company specialties: {len(company_specialties):,} rows\")\n",
-    "\n",
-    "employee_counts = pd.read_csv(f'{Config.CSV_PATH}employee_counts.csv')\n",
-    "print(f\"\u2705 Employee counts: {len(employee_counts):,} rows\")\n",
-    "\n",
-    "# NOTE: on_bad_lines='skip' silently drops rows that fail to parse.\n",
-    "postings = pd.read_csv(f'{Config.CSV_PATH}postings.csv', on_bad_lines='skip', engine='python')\n",
-    "print(f\"\u2705 Postings: {len(postings):,} rows \u00d7 {len(postings.columns)} columns\")\n",
-    "\n",
-    "\n",
-    "def _load_optional_csv(filename: str, label: str):\n",
-    "    \"\"\"Read an optional CSV from Config.CSV_PATH.\n",
-    "\n",
-    "    Returns the DataFrame, or None (after printing a warning) when the file\n",
-    "    does not exist. Only a missing file is tolerated: any other error, such\n",
-    "    as a malformed file, still raises instead of being reported as 'not found'.\n",
-    "    \"\"\"\n",
-    "    try:\n",
-    "        frame = pd.read_csv(f'{Config.CSV_PATH}{filename}')\n",
-    "    except FileNotFoundError:\n",
-    "        print(f\"\u26a0\ufe0f  {label} not found (optional)\")\n",
-    "        return None\n",
-    "    print(f\"\u2705 {label}: {len(frame):,} rows\")\n",
-    "    return frame\n",
-    "\n",
-    "\n",
-    "# Optional datasets\n",
-    "job_skills = _load_optional_csv('job_skills.csv', 'Job skills')\n",
-    "job_industries = _load_optional_csv('job_industries.csv', 'Job industries')\n",
-    "\n",
-    "print(\"\\n\" + \"=\" * 70)\n",
-    "print(\"\u2705 All datasets loaded successfully!\\n\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 5: Merge & Enrich Company Data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\ud83d\udd17 Merging company data...\n",
-      "\n",
-      "\u2705 Aggregated industries for 24,365 companies\n",
-      "\u2705 Aggregated specialties for 17,780 companies\n",
-      "\n",
-      "\u2705 Base company merge complete: 35,787 companies\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"\ud83d\udd17 Merging company data...\\n\")\n",
-    "\n",
-    "# Aggregate industries: one comma-separated string per company\n",
-    "company_industries_agg = company_industries.groupby('company_id')['industry'].apply(\n",
-    "    lambda x: ', '.join(map(str, x.tolist()))\n",
-    ").reset_index()\n",
-    "company_industries_agg.columns = ['company_id', 'industries_list']\n",
-    "print(f\"\u2705 Aggregated industries for {len(company_industries_agg):,} companies\")\n",
-    "\n",
-    "# Aggregate specialties: one ' | '-separated string per company\n",
-    "company_specialties_agg = company_specialties.groupby('company_id')['speciality'].apply(\n",
-    "    lambda x: ' | '.join(x.astype(str).tolist())\n",
-    ").reset_index()\n",
-    "company_specialties_agg.columns = ['company_id', 'specialties_list']\n",
-    "print(f\"\u2705 Aggregated specialties for {len(company_specialties_agg):,} companies\")\n",
-    "\n",
-    "# employee_counts holds several snapshots per company (one per time_recorded).\n",
-    "# Keep only the most recent snapshot; merging all of them would repeat each\n",
-    "# company once per snapshot and later produce duplicate embeddings and matches.\n",
-    "employee_counts_latest = (\n",
-    "    employee_counts\n",
-    "    .sort_values('time_recorded')\n",
-    "    .drop_duplicates(subset='company_id', keep='last')\n",
-    ")\n",
-    "\n",
-    "# Merge all company data\n",
-    "companies_merged = (\n",
-    "    companies_base\n",
-    "    .merge(company_industries_agg, on='company_id', how='left')\n",
-    "    .merge(company_specialties_agg, on='company_id', how='left')\n",
-    "    .merge(employee_counts_latest, on='company_id', how='left')\n",
-    ")\n",
-    "\n",
-    "# Each right-hand table has one row per company_id, so the left merges must keep the row count.\n",
-    "assert len(companies_merged) == len(companies_base), \"company merge duplicated rows\"\n",
-    "\n",
-    "print(f\"\\n\u2705 Base company merge complete: {len(companies_merged):,} companies\\n\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 6: Enrich with Job Postings"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\ud83c\udf09 Enriching companies with job posting data...\n",
-      "\n",
-      "======================================================================\n",
-      "KEY INSIGHT: Postings = 'Requirements Language Bridge'\n",
-      "======================================================================\n",
-      "\n",
-      "\u2705 Enriched 35,787 companies with posting data\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"\ud83c\udf09 Enriching companies with job posting data...\\n\")\n",
-    "print(\"=\" * 70)\n",
-    "print(\"KEY INSIGHT: Postings = 'Requirements Language Bridge'\")\n",
-    "print(\"=\" * 70 + \"\\n\")\n",
-    "\n",
-    "\n",
-    "def _normalize_company_id(ids: pd.Series) -> pd.Series:\n",
-    "    \"\"\"Render company ids as strings without a float suffix ('1009.0' -> '1009').\n",
-    "\n",
-    "    An id column that contains missing values is read as float, so a plain\n",
-    "    astype(str) yields '1009.0' and never equals the '1009' produced from an\n",
-    "    integer column. Both sides of the join must go through this function.\n",
-    "    \"\"\"\n",
-    "    return ids.astype(str).str.replace(r'\\.0$', '', regex=True)\n",
-    "\n",
-    "\n",
-    "def _join_nonempty(values: pd.Series, sep: str, limit: int = None, unique: bool = False) -> str:\n",
-    "    \"\"\"Join the non-blank values of one group.\n",
-    "\n",
-    "    Args:\n",
-    "        values: The group's values; converted to str before joining.\n",
-    "        sep: Separator placed between values.\n",
-    "        limit: Keep at most this many values (None keeps all).\n",
-    "        unique: Drop repeated values, keeping first-seen order.\n",
-    "    \"\"\"\n",
-    "    items = [text for text in values.astype(str) if text.strip()]\n",
-    "    if unique:\n",
-    "        items = list(dict.fromkeys(items))\n",
-    "    return sep.join(items[:limit])\n",
-    "\n",
-    "\n",
-    "postings = postings.fillna('')\n",
-    "postings['company_id'] = _normalize_company_id(postings['company_id'])\n",
-    "\n",
-    "# Aggregate postings per company. Postings without a company id cannot be\n",
-    "# linked to any company, so they are left out of the aggregation.\n",
-    "# Blank values are filtered inside _join_nonempty: after fillna('') a\n",
-    "# dropna() would no longer remove anything.\n",
-    "postings_agg = postings[postings['company_id'] != ''].groupby('company_id').agg({\n",
-    "    'title': lambda x: _join_nonempty(x, ' | ', limit=10),\n",
-    "    'description': lambda x: _join_nonempty(x, ' ', limit=5),\n",
-    "    'skills_desc': lambda x: _join_nonempty(x, ' | '),\n",
-    "    'formatted_experience_level': lambda x: _join_nonempty(x, ' | ', unique=True),\n",
-    "}).reset_index()\n",
-    "\n",
-    "postings_agg.columns = ['company_id', 'posted_job_titles', 'posted_descriptions', 'required_skills', 'experience_levels']\n",
-    "\n",
-    "companies_merged['company_id'] = _normalize_company_id(companies_merged['company_id'])\n",
-    "companies_full = companies_merged.merge(postings_agg, on='company_id', how='left').fillna('')\n",
-    "\n",
-    "# A join that matches nothing almost always means the id formats differ.\n",
-    "if not (companies_full['posted_job_titles'] != '').any():\n",
-    "    print(\"\u26a0\ufe0f  No company matched any posting - check company_id formats\")\n",
-    "\n",
-    "print(f\"\u2705 Enriched {len(companies_full):,} companies with posting data\\n\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>company_id</th>\n",
-       "      <th>name</th>\n",
-       "      <th>description</th>\n",
-       "      <th>company_size</th>\n",
-       "      <th>state</th>\n",
-       "      <th>country</th>\n",
-       "      <th>city</th>\n",
-       "      <th>zip_code</th>\n",
-       "      <th>address</th>\n",
-       "      <th>url</th>\n",
-       "      <th>industries_list</th>\n",
-       "      <th>specialties_list</th>\n",
-       "      <th>employee_count</th>\n",
-       "      <th>follower_count</th>\n",
-       "      <th>time_recorded</th>\n",
-       "      <th>posted_job_titles</th>\n",
-       "      <th>posted_descriptions</th>\n",
-       "      <th>required_skills</th>\n",
-       "      <th>experience_levels</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1009</td>\n",
-       "      <td>IBM</td>\n",
-       "      <td>At IBM, we do more than work. We create. We cr...</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>NY</td>\n",
-       "      <td>US</td>\n",
-       "      <td>Armonk, New York</td>\n",
-       "      <td>10504</td>\n",
-       "      <td>International Business Machines Corp.</td>\n",
-       "      <td>https://www.linkedin.com/company/ibm</td>\n",
-       "      <td>IT Services and IT Consulting</td>\n",
-       "      <td>Cloud | Mobile | Cognitive | Security | Resear...</td>\n",
-       "      <td>314102</td>\n",
-       "      <td>16253625</td>\n",
-       "      <td>1712378162</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1009</td>\n",
-       "      <td>IBM</td>\n",
-       "      <td>At IBM, we do more than work. We create. We cr...</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>NY</td>\n",
-       "      <td>US</td>\n",
-       "      <td>Armonk, New York</td>\n",
-       "      <td>10504</td>\n",
-       "      <td>International Business Machines Corp.</td>\n",
-       "      <td>https://www.linkedin.com/company/ibm</td>\n",
-       "      <td>IT Services and IT Consulting</td>\n",
-       "      <td>Cloud | Mobile | Cognitive | Security | Resear...</td>\n",
-       "      <td>313142</td>\n",
-       "      <td>16309464</td>\n",
-       "      <td>1713392385</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>1009</td>\n",
-       "      <td>IBM</td>\n",
-       "      <td>At IBM, we do more than work. We create. We cr...</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>NY</td>\n",
-       "      <td>US</td>\n",
-       "      <td>Armonk, New York</td>\n",
-       "      <td>10504</td>\n",
-       "      <td>International Business Machines Corp.</td>\n",
-       "      <td>https://www.linkedin.com/company/ibm</td>\n",
-       "      <td>IT Services and IT Consulting</td>\n",
-       "      <td>Cloud | Mobile | Cognitive | Security | Resear...</td>\n",
-       "      <td>313147</td>\n",
-       "      <td>16309985</td>\n",
-       "      <td>1713402495</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>1009</td>\n",
-       "      <td>IBM</td>\n",
-       "      <td>At IBM, we do more than work. We create. We cr...</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>NY</td>\n",
-       "      <td>US</td>\n",
-       "      <td>Armonk, New York</td>\n",
-       "      <td>10504</td>\n",
-       "      <td>International Business Machines Corp.</td>\n",
-       "      <td>https://www.linkedin.com/company/ibm</td>\n",
-       "      <td>IT Services and IT Consulting</td>\n",
-       "      <td>Cloud | Mobile | Cognitive | Security | Resear...</td>\n",
-       "      <td>311223</td>\n",
-       "      <td>16314846</td>\n",
-       "      <td>1713501255</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>1016</td>\n",
-       "      <td>GE HealthCare</td>\n",
-       "      <td>Every day millions of people feel the impact o...</td>\n",
-       "      <td>7.0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>US</td>\n",
-       "      <td>Chicago</td>\n",
-       "      <td>0</td>\n",
-       "      <td>-</td>\n",
-       "      <td>https://www.linkedin.com/company/gehealthcare</td>\n",
-       "      <td>Hospitals and Health Care</td>\n",
-       "      <td>Healthcare | Biotechnology</td>\n",
-       "      <td>56873</td>\n",
-       "      <td>2185368</td>\n",
-       "      <td>1712382540</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "  company_id           name  \\\n",
-       "0       1009            IBM   \n",
-       "1       1009            IBM   \n",
-       "2       1009            IBM   \n",
-       "3       1009            IBM   \n",
-       "4       1016  GE HealthCare   \n",
-       "\n",
-       "                                         description company_size state  \\\n",
-       "0  At IBM, we do more than work. We create. We cr...          7.0    NY   \n",
-       "1  At IBM, we do more than work. We create. We cr...          7.0    NY   \n",
-       "2  At IBM, we do more than work. We create. We cr...          7.0    NY   \n",
-       "3  At IBM, we do more than work. We create. We cr...          7.0    NY   \n",
-       "4  Every day millions of people feel the impact o...          7.0     0   \n",
-       "\n",
-       "  country              city zip_code                                address  \\\n",
-       "0      US  Armonk, New York    10504  International Business Machines Corp.   \n",
-       "1      US  Armonk, New York    10504  International Business Machines Corp.   \n",
-       "2      US  Armonk, New York    10504  International Business Machines Corp.   \n",
-       "3      US  Armonk, New York    10504  International Business Machines Corp.   \n",
-       "4      US           Chicago        0                                      -   \n",
-       "\n",
-       "                                             url  \\\n",
-       "0           https://www.linkedin.com/company/ibm   \n",
-       "1           https://www.linkedin.com/company/ibm   \n",
-       "2           https://www.linkedin.com/company/ibm   \n",
-       "3           https://www.linkedin.com/company/ibm   \n",
-       "4  https://www.linkedin.com/company/gehealthcare   \n",
-       "\n",
-       "                 industries_list  \\\n",
-       "0  IT Services and IT Consulting   \n",
-       "1  IT Services and IT Consulting   \n",
-       "2  IT Services and IT Consulting   \n",
-       "3  IT Services and IT Consulting   \n",
-       "4      Hospitals and Health Care   \n",
-       "\n",
-       "                                    specialties_list  employee_count  \\\n",
-       "0  Cloud | Mobile | Cognitive | Security | Resear...          314102   \n",
-       "1  Cloud | Mobile | Cognitive | Security | Resear...          313142   \n",
-       "2  Cloud | Mobile | Cognitive | Security | Resear...          313147   \n",
-       "3  Cloud | Mobile | Cognitive | Security | Resear...          311223   \n",
-       "4                         Healthcare | Biotechnology           56873   \n",
-       "\n",
-       "   follower_count  time_recorded posted_job_titles posted_descriptions  \\\n",
-       "0        16253625     1712378162                                         \n",
-       "1        16309464     1713392385                                         \n",
-       "2        16309985     1713402495                                         \n",
-       "3        16314846     1713501255                                         \n",
-       "4         2185368     1712382540                                         \n",
-       "\n",
-       "  required_skills experience_levels  \n",
-       "0                                    \n",
-       "1                                    \n",
-       "2                                    \n",
-       "3                                    \n",
-       "4                                    "
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "companies_full.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "================================================================================\n",
-      "\ud83d\udd0d DUPLICATE DETECTION REPORT\n",
-      "================================================================================\n",
-      "\n",
-      "\u250c\u2500 \ud83d\udcca resume_data.csv (Candidates)\n",
-      "\u2502  Primary Key: Resume_ID\n",
-      "\u2502  Total rows:     9,544\n",
-      "\u2502  Unique rows:    9,544\n",
-      "\u2502  Duplicates:     0\n",
-      "\u2502  Status:         \u2705 CLEAN\n",
-      "\u2514\u2500\n",
-      "\n",
-      "\u250c\u2500 \ud83d\udcca companies.csv (Companies Base)\n",
-      "\u2502  Primary Key: company_id\n",
-      "\u2502  Total rows:     24,473\n",
-      "\u2502  Unique rows:    24,473\n",
-      "\u2502  Duplicates:     0\n",
-      "\u2502  Status:         \u2705 CLEAN\n",
-      "\u2514\u2500\n",
-      "\n",
-      "\u250c\u2500 \ud83d\udcca company_industries.csv\n",
-      "\u2502  Primary Key: company_id + industry\n",
-      "\u2502  Total rows:     24,375\n",
-      "\u2502  Unique rows:    24,375\n",
-      "\u2502  Duplicates:     0\n",
-      "\u2502  Status:         \u2705 CLEAN\n",
-      "\u2514\u2500\n",
-      "\n",
-      "\u250c\u2500 \ud83d\udcca company_specialities.csv\n",
-      "\u2502  Primary Key: company_id + speciality\n",
-      "\u2502  Total rows:     169,387\n",
-      "\u2502  Unique rows:    169,387\n",
-      "\u2502  Duplicates:     0\n",
-      "\u2502  Status:         \u2705 CLEAN\n",
-      "\u2514\u2500\n",
-      "\n",
-      "\u250c\u2500 \ud83d\udcca employee_counts.csv\n",
-      "\u2502  Primary Key: company_id\n",
-      "\u2502  Total rows:     35,787\n",
-      "\u2502  Unique rows:    24,473\n",
-      "\u2502  Duplicates:     11,314\n",
-      "\u2502  Status:         \ud83d\udd34 HAS DUPLICATES\n",
-      "\u2514\u2500\n",
-      "\n",
-      "\u250c\u2500 \ud83d\udcca postings.csv (Job Postings)\n",
-      "\u2502  Primary Key: job_id\n",
-      "\u2502  Total rows:     123,849\n",
-      "\u2502  Unique rows:    123,849\n",
-      "\u2502  Duplicates:     0\n",
-      "\u2502  Status:         \u2705 CLEAN\n",
-      "\u2514\u2500\n",
-      "\n",
-      "\u250c\u2500 \ud83d\udcca companies_full (After Enrichment)\n",
-      "\u2502  Primary Key: company_id\n",
-      "\u2502  Total rows:     35,787\n",
-      "\u2502  Unique rows:    24,473\n",
-      "\u2502  Duplicates:     11,314\n",
-      "\u2502  Status:         \ud83d\udd34 HAS DUPLICATES\n",
-      "\u2502\n",
-      "\u2502  Top duplicate company_ids:\n",
-      "\u2502    - 33242739 (Confidential): 13 times\n",
-      "\u2502    - 5235 (LHH): 13 times\n",
-      "\u2502    - 79383535 (Akkodis): 12 times\n",
-      "\u2502    - 1681 (Robert Half): 12 times\n",
-      "\u2502    - 220336 (Hyatt Hotels Corporation): 11 times\n",
-      "\u2514\u2500\n",
-      "\n",
-      "================================================================================\n",
-      "\ud83d\udcca SUMMARY\n",
-      "================================================================================\n",
-      "\n",
-      "\u2705 Clean datasets:          5/7\n",
-      "\ud83d\udd34 Datasets with duplicates: 2/7\n",
-      "\ud83d\uddd1\ufe0f  Total duplicates found:  22,628 rows\n",
-      "\n",
-      "\u26a0\ufe0f  DUPLICATES DETECTED!\n",
-      "================================================================================\n"
-     ]
-    }
-   ],
-   "source": [
-    "## \ud83d\udd0d Data Quality Check - Duplicate Detection\n",
-    "\n",
-    "\"\"\"\n",
-    "Checking for duplicates in all datasets based on primary keys.\n",
-    "This cell only REPORTS duplicates, does not modify data.\n",
-    "\"\"\"\n",
-    "\n",
-    "# One (label, total, unique, duplicates) tuple per dataset; read by the summary below.\n",
-    "duplicate_report = []\n",
-    "\n",
-    "\n",
-    "def _count_unique(frame, key_cols):\n",
-    "    \"\"\"Return how many distinct primary keys `frame` holds.\n",
-    "\n",
-    "    A single-column key is counted with `nunique()`. A composite key, or an\n",
-    "    empty `key_cols` (meaning 'compare whole rows'), is counted as the number\n",
-    "    of rows that survive `drop_duplicates`.\n",
-    "    \"\"\"\n",
-    "    if len(key_cols) == 1:\n",
-    "        return frame[key_cols[0]].nunique()\n",
-    "    return len(frame.drop_duplicates(subset=key_cols or None))\n",
-    "\n",
-    "\n",
-    "def _top_duplicate_ids(frame, limit):\n",
-    "    \"\"\"Return the `limit` most repeated company_id values mapped to their row counts.\"\"\"\n",
-    "    repeated = frame[frame.duplicated('company_id', keep=False)]\n",
-    "    return repeated['company_id'].value_counts().head(limit)\n",
-    "\n",
-    "\n",
-    "def _report_section(label, title, key_label, total, unique, details=None):\n",
-    "    \"\"\"Print one boxed report stanza and record it in `duplicate_report`.\n",
-    "\n",
-    "    Args:\n",
-    "        label: Short dataset name stored in the summary tuple.\n",
-    "        title: Heading printed after the box-drawing prefix.\n",
-    "        key_label: Human-readable description of the primary key.\n",
-    "        total: Row count of the dataset.\n",
-    "        unique: Number of distinct primary keys.\n",
-    "        details: Optional zero-argument callable printing extra lines; it is\n",
-    "            only invoked when duplicates exist.\n",
-    "\n",
-    "    Returns:\n",
-    "        Tuple (total, unique, duplicates).\n",
-    "    \"\"\"\n",
-    "    dups = total - unique\n",
-    "    print(f\"\u250c\u2500 \ud83d\udcca {title}\")\n",
-    "    print(f\"\u2502  Primary Key: {key_label}\")\n",
-    "    print(f\"\u2502  Total rows:     {total:,}\")\n",
-    "    print(f\"\u2502  Unique rows:    {unique:,}\")\n",
-    "    print(f\"\u2502  Duplicates:     {dups:,}\")\n",
-    "    print(f\"\u2502  Status:         {'\u2705 CLEAN' if dups == 0 else '\ud83d\udd34 HAS DUPLICATES'}\")\n",
-    "    if dups > 0 and details is not None:\n",
-    "        details()\n",
-    "    print(\"\u2514\u2500\\n\")\n",
-    "    duplicate_report.append((label, total, unique, dups))\n",
-    "    return total, unique, dups\n",
-    "\n",
-    "\n",
-    "def _print_base_top_duplicates():\n",
-    "    \"\"\"Extra lines for companies.csv: the three most repeated company_ids.\"\"\"\n",
-    "    print(\"\u2502  Top duplicates:\")\n",
-    "    for cid, count in _top_duplicate_ids(companies_base, 3).items():\n",
-    "        print(f\"\u2502    - company_id={cid}: {count} times\")\n",
-    "\n",
-    "\n",
-    "def _print_full_top_duplicates():\n",
-    "    \"\"\"Extra lines for companies_full: the five most repeated company_ids with names.\"\"\"\n",
-    "    print(\"\u2502\")\n",
-    "    print(\"\u2502  Top duplicate company_ids:\")\n",
-    "    for cid, count in _top_duplicate_ids(companies_full, 5).items():\n",
-    "        comp_name = companies_full.loc[companies_full['company_id'] == cid, 'name'].iloc[0]\n",
-    "        print(f\"\u2502    - {cid} ({comp_name}): {count} times\")\n",
-    "\n",
-    "\n",
-    "print(\"=\" * 80)\n",
-    "print(\"\ud83d\udd0d DUPLICATE DETECTION REPORT\")\n",
-    "print(\"=\" * 80)\n",
-    "print()\n",
-    "\n",
-    "# 1. Candidates (every row counts as unique when the Resume_ID column is absent)\n",
-    "cand_total, cand_unique, cand_dups = _report_section(\n",
-    "    'Candidates', \"resume_data.csv (Candidates)\", \"Resume_ID\",\n",
-    "    len(candidates),\n",
-    "    _count_unique(candidates, ['Resume_ID']) if 'Resume_ID' in candidates.columns else len(candidates),\n",
-    ")\n",
-    "\n",
-    "# 2. Companies Base\n",
-    "comp_total, comp_unique, comp_dups = _report_section(\n",
-    "    'Companies Base', \"companies.csv (Companies Base)\", \"company_id\",\n",
-    "    len(companies_base), _count_unique(companies_base, ['company_id']),\n",
-    "    details=_print_base_top_duplicates,\n",
-    ")\n",
-    "\n",
-    "# 3. Company Industries\n",
-    "ci_total, ci_unique, ci_dups = _report_section(\n",
-    "    'Company Industries', \"company_industries.csv\", \"company_id + industry\",\n",
-    "    len(company_industries), _count_unique(company_industries, ['company_id', 'industry']),\n",
-    ")\n",
-    "\n",
-    "# 4. Company Specialties\n",
-    "cs_total, cs_unique, cs_dups = _report_section(\n",
-    "    'Company Specialties', \"company_specialities.csv\", \"company_id + speciality\",\n",
-    "    len(company_specialties), _count_unique(company_specialties, ['company_id', 'speciality']),\n",
-    ")\n",
-    "\n",
-    "# 5. Employee Counts\n",
-    "ec_total, ec_unique, ec_dups = _report_section(\n",
-    "    'Employee Counts', \"employee_counts.csv\", \"company_id\",\n",
-    "    len(employee_counts), _count_unique(employee_counts, ['company_id']),\n",
-    ")\n",
-    "\n",
-    "# 6. Postings (falls back to whole-row comparison when there is no job_id column)\n",
-    "post_keys = ['job_id'] if 'job_id' in postings.columns else []\n",
-    "post_total, post_unique, post_dups = _report_section(\n",
-    "    'Postings', \"postings.csv (Job Postings)\", \"job_id\",\n",
-    "    len(postings), _count_unique(postings, post_keys),\n",
-    ")\n",
-    "\n",
-    "# 7. Companies Full (After Merge)\n",
-    "cf_total, cf_unique, cf_dups = _report_section(\n",
-    "    'Companies Full', \"companies_full (After Enrichment)\", \"company_id\",\n",
-    "    len(companies_full), _count_unique(companies_full, ['company_id']),\n",
-    "    details=_print_full_top_duplicates,\n",
-    ")\n",
-    "\n",
-    "# Summary\n",
-    "print(\"=\" * 80)\n",
-    "print(\"\ud83d\udcca SUMMARY\")\n",
-    "print(\"=\" * 80)\n",
-    "print()\n",
-    "\n",
-    "# NOTE: datasets are summed independently, so rows duplicated in employee_counts\n",
-    "# are counted again through companies_full.\n",
-    "total_dups = sum(entry[3] for entry in duplicate_report)\n",
-    "clean_datasets = sum(1 for entry in duplicate_report if entry[3] == 0)\n",
-    "dirty_datasets = len(duplicate_report) - clean_datasets\n",
-    "\n",
-    "print(f\"\u2705 Clean datasets:          {clean_datasets}/{len(duplicate_report)}\")\n",
-    "print(f\"\ud83d\udd34 Datasets with duplicates: {dirty_datasets}/{len(duplicate_report)}\")\n",
-    "print(f\"\ud83d\uddd1\ufe0f  Total duplicates found:  {total_dups:,} rows\")\n",
-    "print()\n",
-    "\n",
-    "if dirty_datasets > 0:\n",
-    "    print(\"\u26a0\ufe0f  DUPLICATES DETECTED!\")\n",
-    "else:\n",
-    "    print(\"\u2705 All datasets are clean! No duplicates found.\")\n",
-    "\n",
-    "print(\"=\" * 80)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\ud83e\uddf9 CLEANING DUPLICATES...\n",
-      "\n",
-      "================================================================================\n",
-      "\u2705 companies_base: Already clean\n",
-      "\n",
-      "\u2705 company_industries: Already clean\n",
-      "\n",
-      "\u2705 company_specialties: Already clean\n",
-      "\n",
-      "\u2705 employee_counts:\n",
-      "   Removed 11,314 duplicates\n",
-      "   35,787 \u2192 24,473 rows\n",
-      "\n",
-      "\u2705 postings: Already clean\n",
-      "\n",
-      "\u2705 companies_full:\n",
-      "   Removed 11,314 duplicates\n",
-      "   35,787 \u2192 24,473 rows\n",
-      "\n",
-      "================================================================================\n",
-      "\u2705 DATA CLEANING COMPLETE!\n",
-      "================================================================================\n",
-      "\n",
-      "\ud83d\udcca Total duplicates removed: 22,628 rows\n",
-      "\n",
-      "Cleaned datasets:\n",
-      "  - employee_counts: 35,787 \u2192 24,473\n",
-      "  - companies_full: 35,787 \u2192 24,473\n"
-     ]
-    }
-   ],
-   "source": [
-    "\"\"\"\n",
-    "## \ud83e\uddf9 Data Cleaning - Remove Duplicates\n",
-    "\n",
-    "Based on the report above, removing duplicates from datasets.\n",
-    "\"\"\"\n",
-    "\n",
-    "print(\"\ud83e\uddf9 CLEANING DUPLICATES...\\n\")\n",
-    "print(\"=\" * 80)\n",
-    "\n",
-    "# Row counts before / after cleaning, filled only for datasets that needed it.\n",
-    "# Recording both here avoids looking frames up again through globals().\n",
-    "original_counts = {}\n",
-    "cleaned_counts = {}\n",
-    "\n",
-    "\n",
-    "def _drop_duplicate_keys(name, frame, key_cols):\n",
-    "    \"\"\"Return `frame` without repeated primary keys and print what was done.\n",
-    "\n",
-    "    Args:\n",
-    "        name: Dataset name used in the printed report and in the count dicts.\n",
-    "        frame: DataFrame to clean; it is not modified in place.\n",
-    "        key_cols: Column names forming the primary key.\n",
-    "\n",
-    "    Returns:\n",
-    "        `frame` itself when it is already clean, otherwise a new frame keeping\n",
-    "        the first row of every key. Original index labels are preserved\n",
-    "        (the index is not reset).\n",
-    "    \"\"\"\n",
-    "    if len(key_cols) == 1:\n",
-    "        distinct = frame[key_cols[0]].nunique()\n",
-    "    else:\n",
-    "        distinct = len(frame.drop_duplicates(subset=key_cols))\n",
-    "\n",
-    "    if len(frame) == distinct:\n",
-    "        print(f\"\u2705 {name}: Already clean\\n\")\n",
-    "        return frame\n",
-    "\n",
-    "    # NOTE(review): keep='first' retains whichever row appears first for a key.\n",
-    "    # companies_full holds several time_recorded snapshots per company, so if\n",
-    "    # the newest counts are wanted the frame should be sorted first - confirm.\n",
-    "    deduped = frame.drop_duplicates(subset=key_cols, keep='first')\n",
-    "    original_counts[name] = len(frame)\n",
-    "    cleaned_counts[name] = len(deduped)\n",
-    "    removed = original_counts[name] - cleaned_counts[name]\n",
-    "    print(f\"\u2705 {name}:\")\n",
-    "    print(f\"   Removed {removed:,} duplicates\")\n",
-    "    print(f\"   {original_counts[name]:,} \u2192 {cleaned_counts[name]:,} rows\\n\")\n",
-    "    return deduped\n",
-    "\n",
-    "\n",
-    "# 1. Clean Companies Base (if needed)\n",
-    "companies_base = _drop_duplicate_keys('companies_base', companies_base, ['company_id'])\n",
-    "\n",
-    "# 2. Clean Company Industries (if needed)\n",
-    "company_industries = _drop_duplicate_keys('company_industries', company_industries, ['company_id', 'industry'])\n",
-    "\n",
-    "# 3. Clean Company Specialties (if needed)\n",
-    "company_specialties = _drop_duplicate_keys('company_specialties', company_specialties, ['company_id', 'speciality'])\n",
-    "\n",
-    "# 4. Clean Employee Counts (if needed)\n",
-    "employee_counts = _drop_duplicate_keys('employee_counts', employee_counts, ['company_id'])\n",
-    "\n",
-    "# 5. Clean Postings (if needed)\n",
-    "if 'job_id' in postings.columns:\n",
-    "    postings = _drop_duplicate_keys('postings', postings, ['job_id'])\n",
-    "else:\n",
-    "    # Previously this case printed nothing; say explicitly that postings were left untouched.\n",
-    "    print(\"\u26a0\ufe0f  postings: no job_id column - skipped\\n\")\n",
-    "\n",
-    "# 6. Clean Companies Full (if needed)\n",
-    "companies_full = _drop_duplicate_keys('companies_full', companies_full, ['company_id'])\n",
-    "\n",
-    "print(\"=\" * 80)\n",
-    "print(\"\u2705 DATA CLEANING COMPLETE!\")\n",
-    "print(\"=\" * 80)\n",
-    "print()\n",
-    "\n",
-    "# Summary\n",
-    "if original_counts:\n",
-    "    total_removed = sum(original_counts[name] - cleaned_counts[name] for name in original_counts)\n",
-    "    print(f\"\ud83d\udcca Total duplicates removed: {total_removed:,} rows\")\n",
-    "    print()\n",
-    "    print(\"Cleaned datasets:\")\n",
-    "    for dataset, original in original_counts.items():\n",
-    "        print(f\"  - {dataset}: {original:,} \u2192 {cleaned_counts[dataset]:,}\")\n",
-    "else:\n",
-    "    print(\"\u2705 No duplicates found - all datasets were already clean!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 7: Load Embedding Model & Pre-computed Vectors"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\ud83e\udde0 Loading embedding model...\n",
-      "\n",
-      "\u2705 Model loaded: all-MiniLM-L6-v2\n",
-      "\ud83d\udcd0 Embedding dimension: \u211d^384\n",
-      "\n",
-      "\ud83d\udcc2 Loading pre-computed embeddings...\n",
-      "\u2705 Loaded from ../processed/\n",
-      "\ud83d\udcca Candidate vectors: (9544, 384)\n",
-      "\ud83d\udcca Company vectors: (35787, 384)\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "# --- Sentence-embedding model -------------------------------------------------\n",
-    "print(\"\ud83e\udde0 Loading embedding model...\\n\")\n",
-    "model = SentenceTransformer(Config.EMBEDDING_MODEL)\n",
-    "embedding_dim = model.get_sentence_embedding_dimension()\n",
-    "print(f\"\u2705 Model loaded: {Config.EMBEDDING_MODEL}\")\n",
-    "print(f\"\ud83d\udcd0 Embedding dimension: \u211d^{embedding_dim}\\n\")\n",
-    "\n",
-    "# --- Cached candidate / company vectors ---------------------------------------\n",
-    "print(\"\ud83d\udcc2 Loading pre-computed embeddings...\")\n",
-    "\n",
-    "try:\n",
-    "    # Both .npy files are read from the processed folder\n",
-    "    cand_vectors = np.load(f'{Config.PROCESSED_PATH}candidate_embeddings.npy')\n",
-    "    comp_vectors = np.load(f'{Config.PROCESSED_PATH}company_embeddings.npy')\n",
-    "except FileNotFoundError:\n",
-    "    for notice in (\n",
-    "        \"\u26a0\ufe0f  Pre-computed embeddings not found!\",\n",
-    "        \"   Embeddings will need to be generated (takes ~5-10 minutes)\",\n",
-    "        \"   This is normal if running for the first time.\\n\",\n",
-    "    ):\n",
-    "        print(notice)\n",
-    "\n",
-    "    # No generation step lives in this cell; both arrays are left as None\n",
-    "    # when either file is missing.\n",
-    "    cand_vectors = None\n",
-    "    comp_vectors = None\n",
-    "else:\n",
-    "    print(f\"\u2705 Loaded from {Config.PROCESSED_PATH}\")\n",
-    "    print(f\"\ud83d\udcca Candidate vectors: {cand_vectors.shape}\")\n",
-    "    print(f\"\ud83d\udcca Company vectors: {comp_vectors.shape}\\n\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 8: Core Matching Function"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u2705 Matching function ready\n"
-     ]
-    }
-   ],
-   "source": [
-    "# ============================================================================\n",
-    "# CORE MATCHING FUNCTION (SAFE VERSION)\n",
-    "# ============================================================================\n",
-    "\n",
-    "def _company_row_positions():\n",
-    "    \"\"\"Return the embedding rows that correspond to the rows of `companies_full`.\n",
-    "\n",
-    "    When there are more company vectors than rows in `companies_full`, the\n",
-    "    frame's index labels are used as row numbers into `comp_vectors`.\n",
-    "    `drop_duplicates` keeps the original labels, so each surviving company\n",
-    "    maps back to its own vector instead of to whatever sits in the first N rows.\n",
-    "\n",
-    "    NOTE(review): this assumes the embeddings were generated row-by-row from\n",
-    "    `companies_full` before deduplication, while it still had a default\n",
-    "    0..N-1 index - confirm against the embedding-generation step.\n",
-    "\n",
-    "    Falls back to plain truncation (the previous behaviour) when the labels\n",
-    "    cannot be used as row numbers.\n",
-    "    \"\"\"\n",
-    "    n_companies = len(companies_full)\n",
-    "    n_vectors = len(comp_vectors)\n",
-    "    truncated = slice(0, n_companies)\n",
-    "    if n_companies == 0 or n_vectors <= n_companies:\n",
-    "        return truncated\n",
-    "\n",
-    "    labels = companies_full.index.to_numpy()\n",
-    "    labels_are_row_numbers = (\n",
-    "        np.issubdtype(labels.dtype, np.integer)\n",
-    "        and companies_full.index.is_unique\n",
-    "        and labels.min() >= 0\n",
-    "        and labels.max() < n_vectors\n",
-    "    )\n",
-    "    return labels if labels_are_row_numbers else truncated\n",
-    "\n",
-    "\n",
-    "def find_top_matches(candidate_idx: int, top_k: int = 10) -> list:\n",
-    "    \"\"\"\n",
-    "    Find top K company matches for a candidate.\n",
-    "\n",
-    "    SAFE VERSION: Handles index mismatches between embeddings and dataset\n",
-    "\n",
-    "    Args:\n",
-    "        candidate_idx: Row position of the candidate in `cand_vectors`\n",
-    "        top_k: Number of top matches to return\n",
-    "\n",
-    "    Returns:\n",
-    "        List of tuples: [(company_idx, similarity_score), ...] sorted by\n",
-    "        descending similarity, where company_idx is a positional row of\n",
-    "        `companies_full` (use `.iloc`). Empty when the embeddings are not\n",
-    "        loaded or the candidate index is out of range.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    # The loading cell leaves the vectors as None when the .npy files are missing\n",
-    "    if cand_vectors is None or comp_vectors is None:\n",
-    "        print(\"\u274c Embeddings are not loaded\")\n",
-    "        return []\n",
-    "\n",
-    "    # Validate candidate index (negative values are rejected as well)\n",
-    "    if not 0 <= candidate_idx < len(cand_vectors):\n",
-    "        print(f\"\u274c Candidate index {candidate_idx} out of range\")\n",
-    "        return []\n",
-    "\n",
-    "    # Get candidate vector\n",
-    "    cand_vec = cand_vectors[candidate_idx].reshape(1, -1)\n",
-    "\n",
-    "    # Calculate similarities with all company vectors\n",
-    "    similarities = cosine_similarity(cand_vec, comp_vectors)[0]\n",
-    "\n",
-    "    # Keep only the scores that belong to rows still present in companies_full,\n",
-    "    # ordered like companies_full itself\n",
-    "    valid_similarities = similarities[_company_row_positions()]\n",
-    "\n",
-    "    # Get top K positions (a negative top_k would otherwise slice from the end)\n",
-    "    top_indices = np.argsort(valid_similarities)[::-1][:max(int(top_k), 0)]\n",
-    "\n",
-    "    # Return (index, score) tuples\n",
-    "    return [(int(idx), float(valid_similarities[idx])) for idx in top_indices]\n",
-    "\n",
-    "# Test function and show diagnostics\n",
-    "print(\"\u2705 Safe matching function loaded!\")\n",
-    "if cand_vectors is None or comp_vectors is None:\n",
-    "    print(\"\\n\u26a0\ufe0f  Embeddings are not loaded - find_top_matches() will return []\")\n",
-    "else:\n",
-    "    print(f\"\\n\ud83d\udcca DIAGNOSTICS:\")\n",
-    "    print(f\"   Candidate vectors: {len(cand_vectors):,}\")\n",
-    "    print(f\"   Company vectors: {len(comp_vectors):,}\")\n",
-    "    print(f\"   Companies dataset: {len(companies_full):,}\")\n",
-    "\n",
-    "    if len(comp_vectors) > len(companies_full):\n",
-    "        print(f\"\\n\u26a0\ufe0f  INDEX MISMATCH DETECTED!\")\n",
-    "        print(f\"   Embeddings: {len(comp_vectors):,}\")\n",
-    "        print(f\"   Dataset: {len(companies_full):,}\")\n",
-    "        print(f\"   Missing rows: {len(comp_vectors) - len(companies_full):,}\")\n",
-    "        print(f\"\\n\ud83d\udca1 CAUSE: Embeddings generated BEFORE deduplication\")\n",
-    "        print(f\"\\n\ud83c\udfaf SOLUTIONS:\")\n",
-    "        print(f\"   A. Safe functions active (current) \u2705\")\n",
-    "        print(f\"   B. Regenerate embeddings after dedup\")\n",
-    "        print(f\"   C. Run collaborative filtering step\")\n",
-    "    else:\n",
-    "        print(f\"\\n\u2705 Embeddings and dataset are aligned!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 9: Initialize FREE LLM (Hugging Face)\n",
-    "\n",
-    "### Get your FREE token: https://huggingface.co/settings/tokens"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u2705 Hugging Face client initialized (FREE)\n",
-      "\ud83e\udd16 Model: meta-llama/Llama-3.2-3B-Instruct\n",
-      "\ud83d\udcb0 Cost: $0.00 (completely free!)\n",
-      "\n",
-      "\u2705 LLM helper functions ready\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Initialize Hugging Face Inference Client (FREE)\n",
-    "# Defaults cover every failure path, so both names always exist afterwards\n",
-    "# (previously hf_client stayed undefined when the constructor raised).\n",
-    "hf_client = None\n",
-    "LLM_AVAILABLE = False\n",
-    "\n",
-    "if Config.HF_TOKEN:\n",
-    "    try:\n",
-    "        hf_client = InferenceClient(token=Config.HF_TOKEN)\n",
-    "        print(\"\u2705 Hugging Face client initialized (FREE)\")\n",
-    "        print(f\"\ud83e\udd16 Model: {Config.LLM_MODEL}\")\n",
-    "        print(\"\ud83d\udcb0 Cost: $0.00 (completely free!)\\n\")\n",
-    "        LLM_AVAILABLE = True\n",
-    "    except Exception as e:\n",
-    "        # Best-effort setup: report the failure and continue with LLM features disabled\n",
-    "        print(f\"\u26a0\ufe0f  Failed to initialize HF client: {e}\")\n",
-    "        hf_client = None\n",
-    "        LLM_AVAILABLE = False\n",
-    "else:\n",
-    "    print(\"\u26a0\ufe0f  No Hugging Face token configured\")\n",
-    "    print(\"   LLM features will be disabled\")\n",
-    "    print(\"\\n\ud83d\udcdd To enable:\")\n",
-    "    print(\"   1. Go to: https://huggingface.co/settings/tokens\")\n",
-    "    print(\"   2. Create a token (free)\")\n",
-    "    print(\"   3. Set: Config.HF_TOKEN = 'your-token-here'\\n\")\n",
-    "\n",
-    "def call_llm(prompt: str, max_tokens: int = 1000, temperature: float = 0.7) -> str:\n",
-    "    \"\"\"\n",
-    "    Generic LLM call using Hugging Face Inference API (FREE).\n",
-    "\n",
-    "    Args:\n",
-    "        prompt: Text sent as the single user message of the chat.\n",
-    "        max_tokens: Upper bound on the number of generated tokens.\n",
-    "        temperature: Sampling temperature passed to the API. Lower values give\n",
-    "            more repeatable replies, which helps when the answer must be\n",
-    "            parsed as JSON.\n",
-    "\n",
-    "    Returns:\n",
-    "        The reply text of the first choice. Failures are NOT raised: a\n",
-    "        bracketed placeholder ('[LLM not available ...]' or '[Error: ...]')\n",
-    "        is returned instead, so callers must expect non-JSON text.\n",
-    "    \"\"\"\n",
-    "    if not LLM_AVAILABLE or hf_client is None:\n",
-    "        return \"[LLM not available - check .env file for HF_TOKEN]\"\n",
-    "\n",
-    "    try:\n",
-    "        response = hf_client.chat_completion(\n",
-    "            messages=[{\"role\": \"user\", \"content\": prompt}],\n",
-    "            model=Config.LLM_MODEL,\n",
-    "            max_tokens=max_tokens,\n",
-    "            temperature=temperature,\n",
-    "        )\n",
-    "        # Extract the reply text; an absent reply becomes an empty string so the\n",
-    "        # declared return type holds\n",
-    "        content = response.choices[0].message.content\n",
-    "        return content if content is not None else \"\"\n",
-    "    except Exception as e:\n",
-    "        # Deliberately best-effort: the error is handed back in-band\n",
-    "        return f\"[Error: {str(e)}]\"\n",
-    "\n",
-    "print(\"\u2705 LLM helper functions ready\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 10: Pydantic Schemas for Structured Output"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u2705 Pydantic schemas defined\n"
-     ]
-    }
-   ],
-   "source": [
-    "class JobLevelClassification(BaseModel):\n",
-    "    \"\"\"Job level classification result\"\"\"\n",
-    "    level: Literal['Entry', 'Mid', 'Senior', 'Executive']\n",
-    "    confidence: float = Field(ge=0.0, le=1.0)\n",
-    "    reasoning: str\n",
-    "\n",
-    "class SkillsTaxonomy(BaseModel):\n",
-    "    \"\"\"Structured skills extraction\"\"\"\n",
-    "    technical_skills: List[str] = Field(default_factory=list)\n",
-    "    soft_skills: List[str] = Field(default_factory=list)\n",
-    "    certifications: List[str] = Field(default_factory=list)\n",
-    "    languages: List[str] = Field(default_factory=list)\n",
-    "\n",
-    "class MatchExplanation(BaseModel):\n",
-    "    \"\"\"Match reasoning\"\"\"\n",
-    "    overall_score: float = Field(ge=0.0, le=1.0)\n",
-    "    match_strengths: List[str]\n",
-    "    skill_gaps: List[str]\n",
-    "    recommendation: str\n",
-    "    fit_summary: str = Field(max_length=200)\n",
-    "\n",
-    "print(\"\u2705 Pydantic schemas defined\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 11: Job Level Classification (Zero-Shot)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\ud83e\uddea Testing zero-shot classification...\n",
-      "\n",
-      "\ud83d\udcca Classification Result:\n",
-      "{\n",
-      "  \"level\": \"Unknown\",\n",
-      "  \"confidence\": 0.0,\n",
-      "  \"reasoning\": \"Failed to parse response\"\n",
-      "}\n"
-     ]
-    }
-   ],
-   "source": [
-    "def classify_job_level_zero_shot(job_description: str) -> Dict:\n",
-    "    \"\"\"\n",
-    "    Zero-shot job level classification.\n",
-    "\n",
-    "    Only the first 500 characters of the posting are sent to the LLM. The\n",
-    "    JSON object found in the reply is validated against\n",
-    "    JobLevelClassification before it is returned.\n",
-    "\n",
-    "    Args:\n",
-    "        job_description: Raw text of the job posting.\n",
-    "\n",
-    "    Returns:\n",
-    "        Dict with 'level' (Entry, Mid, Senior or Executive), 'confidence'\n",
-    "        (float between 0 and 1) and 'reasoning'. If the reply cannot be\n",
-    "        parsed or validated, 'level' is 'Unknown' and 'confidence' is 0.0.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    prompt = f\"\"\"Classify this job posting into ONE seniority level.\n",
-    "\n",
-    "Levels:\n",
-    "- Entry: 0-2 years experience, junior roles\n",
-    "- Mid: 3-5 years experience, independent work\n",
-    "- Senior: 6-10 years experience, technical leadership\n",
-    "- Executive: 10+ years, strategic leadership, C-level\n",
-    "\n",
-    "Job Posting:\n",
-    "{job_description[:500]}\n",
-    "\n",
-    "Return ONLY valid JSON:\n",
-    "{{\n",
-    "    \"level\": \"Entry|Mid|Senior|Executive\",\n",
-    "    \"confidence\": 0.85,\n",
-    "    \"reasoning\": \"Brief explanation\"\n",
-    "}}\n",
-    "\"\"\"\n",
-    "\n",
-    "    response = call_llm(prompt)\n",
-    "\n",
-    "    try:\n",
-    "        # Strip a Markdown code fence if the model wrapped its answer in one\n",
-    "        json_str = response.strip()\n",
-    "        if '```json' in json_str:\n",
-    "            json_str = json_str.split('```json')[1].split('```')[0].strip()\n",
-    "        elif '```' in json_str:\n",
-    "            json_str = json_str.split('```')[1].split('```')[0].strip()\n",
-    "\n",
-    "        # Keep only the outermost {...} span so surrounding text is ignored\n",
-    "        if '{' in json_str and '}' in json_str:\n",
-    "            start = json_str.index('{')\n",
-    "            end = json_str.rindex('}') + 1\n",
-    "            json_str = json_str[start:end]\n",
-    "\n",
-    "        data = json.loads(json_str)\n",
-    "        # Tolerate casing/whitespace differences such as \"senior \"\n",
-    "        data['level'] = str(data.get('level', '')).strip().capitalize()\n",
-    "        # Schema validation guarantees the three keys and a numeric\n",
-    "        # confidence, which later cells format with ':.2f'\n",
-    "        return JobLevelClassification(**data).model_dump()\n",
-    "    except Exception:\n",
-    "        # 'except Exception' (not a bare except) lets Ctrl-C stop the cell.\n",
-    "        # call_llm reports failures as '[Error: ...]'; pass that text through\n",
-    "        # instead of hiding it behind the generic parse message.\n",
-    "        llm_failed = isinstance(response, str) and response.startswith('[Error:')\n",
-    "        return {\n",
-    "            \"level\": \"Unknown\",\n",
-    "            \"confidence\": 0.0,\n",
-    "            \"reasoning\": response if llm_failed else \"Failed to parse response\"\n",
-    "        }\n",
-    "\n",
-    "# Test if LLM available and data loaded\n",
-    "if LLM_AVAILABLE and len(postings) > 0:\n",
-    "    print(\"\ud83e\uddea Testing zero-shot classification...\\n\")\n",
-    "    sample = postings.iloc[0]['description']\n",
-    "    result = classify_job_level_zero_shot(sample)\n",
-    "    \n",
-    "    print(\"\ud83d\udcca Classification Result:\")\n",
-    "    print(json.dumps(result, indent=2))\n",
-    "else:\n",
-    "    print(\"\u26a0\ufe0f  Skipped - LLM not available or no data\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 12: Few-Shot Learning"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\ud83e\uddea Comparing Zero-Shot vs Few-Shot...\n",
-      "\n",
-      "\ud83d\udcca Comparison:\n",
-      "Zero-shot: Unknown (confidence: 0.00)\n",
-      "Few-shot:  Unknown (confidence: 0.00)\n"
-     ]
-    }
-   ],
-   "source": [
-    "def classify_job_level_few_shot(job_description: str) -> Dict:\n",
-    "    \"\"\"\n",
-    "    Few-shot classification with examples.\n",
-    "\n",
-    "    The prompt shows three labelled examples (Entry, Senior, Executive)\n",
-    "    followed by the first 500 characters of the posting. The JSON object\n",
-    "    found in the reply is validated against JobLevelClassification.\n",
-    "\n",
-    "    Args:\n",
-    "        job_description: Raw text of the job posting.\n",
-    "\n",
-    "    Returns:\n",
-    "        Dict with 'level', 'confidence' (float between 0 and 1) and\n",
-    "        'reasoning'. If the reply cannot be parsed or validated, 'level' is\n",
-    "        'Unknown' and 'confidence' is 0.0.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    prompt = f\"\"\"Classify this job posting using examples.\n",
-    "\n",
-    "EXAMPLES:\n",
-    "\n",
-    "Example 1 (Entry):\n",
-    "\"Recent graduate wanted. Python basics. Mentorship provided.\"\n",
-    "\u2192 Entry level (learning focus, 0-2 years)\n",
-    "\n",
-    "Example 2 (Senior):\n",
-    "\"5+ years backend. Lead team of 3. System architecture.\"\n",
-    "\u2192 Senior level (technical leadership, 6-10 years)\n",
-    "\n",
-    "Example 3 (Executive):\n",
-    "\"CTO position. 15+ years. Define technical strategy.\"\n",
-    "\u2192 Executive level (C-level, strategic)\n",
-    "\n",
-    "NOW CLASSIFY:\n",
-    "{job_description[:500]}\n",
-    "\n",
-    "Return JSON:\n",
-    "{{\n",
-    "    \"level\": \"Entry|Mid|Senior|Executive\",\n",
-    "    \"confidence\": 0.0-1.0,\n",
-    "    \"reasoning\": \"Explain\"\n",
-    "}}\n",
-    "\"\"\"\n",
-    "\n",
-    "    response = call_llm(prompt)\n",
-    "\n",
-    "    try:\n",
-    "        # Strip a Markdown code fence (tagged or untagged) around the answer\n",
-    "        json_str = response.strip()\n",
-    "        if '```json' in json_str:\n",
-    "            json_str = json_str.split('```json')[1].split('```')[0].strip()\n",
-    "        elif '```' in json_str:\n",
-    "            json_str = json_str.split('```')[1].split('```')[0].strip()\n",
-    "\n",
-    "        # Keep only the outermost {...} span so surrounding text is ignored\n",
-    "        if '{' in json_str and '}' in json_str:\n",
-    "            start = json_str.index('{')\n",
-    "            end = json_str.rindex('}') + 1\n",
-    "            json_str = json_str[start:end]\n",
-    "\n",
-    "        data = json.loads(json_str)\n",
-    "        # Tolerate casing/whitespace differences such as \"senior \"\n",
-    "        data['level'] = str(data.get('level', '')).strip().capitalize()\n",
-    "        # Schema validation guarantees the keys and a numeric confidence,\n",
-    "        # which the comparison below formats with ':.2f'\n",
-    "        return JobLevelClassification(**data).model_dump()\n",
-    "    except Exception:\n",
-    "        # 'except Exception' (not a bare except) lets Ctrl-C stop the cell.\n",
-    "        # call_llm reports failures as '[Error: ...]'; pass that text through.\n",
-    "        llm_failed = isinstance(response, str) and response.startswith('[Error:')\n",
-    "        return {\n",
-    "            \"level\": \"Unknown\",\n",
-    "            \"confidence\": 0.0,\n",
-    "            \"reasoning\": response if llm_failed else \"Parse error\"\n",
-    "        }\n",
-    "\n",
-    "# Compare zero-shot vs few-shot\n",
-    "if LLM_AVAILABLE and len(postings) > 0:\n",
-    "    print(\"\ud83e\uddea Comparing Zero-Shot vs Few-Shot...\\n\")\n",
-    "    sample = postings.iloc[0]['description']\n",
-    "    \n",
-    "    zero = classify_job_level_zero_shot(sample)\n",
-    "    few = classify_job_level_few_shot(sample)\n",
-    "    \n",
-    "    print(\"\ud83d\udcca Comparison:\")\n",
-    "    print(f\"Zero-shot: {zero['level']} (confidence: {zero['confidence']:.2f})\")\n",
-    "    print(f\"Few-shot:  {few['level']} (confidence: {few['confidence']:.2f})\")\n",
-    "else:\n",
-    "    print(\"\u26a0\ufe0f  Skipped\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 13: Structured Skills Extraction"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\ud83d\udd0d Testing skills extraction...\n",
-      "\n",
-      "\ud83d\udcca Extracted Skills:\n",
-      "{\n",
-      "  \"technical_skills\": [\n",
-      "    \"Adobe Creative Cloud (Indesign, Illustrator, Photoshop)\",\n",
-      "    \"Microsoft Office Suite\"\n",
-      "  ],\n",
-      "  \"soft_skills\": [\n",
-      "    \"Communication\",\n",
-      "    \"Leadership\"\n",
-      "  ],\n",
-      "  \"certifications\": [],\n",
-      "  \"languages\": [\n",
-      "    \"English\",\n",
-      "    \"Danish\"\n",
-      "  ]\n",
-      "}\n"
-     ]
-    }
-   ],
-   "source": [
-    "def extract_skills_taxonomy(job_description: str) -> Dict:\n",
-    "    \"\"\"\n",
-    "    Extract structured skills using LLM + Pydantic validation.\n",
-    "\n",
-    "    Only the first 800 characters of the posting are sent to the LLM.\n",
-    "\n",
-    "    Args:\n",
-    "        job_description: Raw text of the job posting.\n",
-    "\n",
-    "    Returns:\n",
-    "        Dict with the list-valued keys 'technical_skills', 'soft_skills',\n",
-    "        'certifications' and 'languages'. All four lists are empty when\n",
-    "        the reply cannot be parsed or validated.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    prompt = f\"\"\"Extract skills from this job posting.\n",
-    "\n",
-    "Job Posting:\n",
-    "{job_description[:800]}\n",
-    "\n",
-    "Return ONLY valid JSON:\n",
-    "{{\n",
-    "    \"technical_skills\": [\"Python\", \"Docker\", \"AWS\"],\n",
-    "    \"soft_skills\": [\"Communication\", \"Leadership\"],\n",
-    "    \"certifications\": [\"AWS Certified\"],\n",
-    "    \"languages\": [\"English\", \"Danish\"]\n",
-    "}}\n",
-    "\"\"\"\n",
-    "\n",
-    "    response = call_llm(prompt, max_tokens=800)\n",
-    "\n",
-    "    try:\n",
-    "        # Strip a Markdown code fence (tagged or untagged) around the answer\n",
-    "        json_str = response.strip()\n",
-    "        if '```json' in json_str:\n",
-    "            json_str = json_str.split('```json')[1].split('```')[0].strip()\n",
-    "        elif '```' in json_str:\n",
-    "            json_str = json_str.split('```')[1].split('```')[0].strip()\n",
-    "\n",
-    "        # Keep only the outermost {...} span so surrounding text is ignored\n",
-    "        if '{' in json_str and '}' in json_str:\n",
-    "            start = json_str.index('{')\n",
-    "            end = json_str.rindex('}') + 1\n",
-    "            json_str = json_str[start:end]\n",
-    "\n",
-    "        data = json.loads(json_str)\n",
-    "        # Drop explicit nulls so the schema defaults (empty lists) apply\n",
-    "        # instead of one null field invalidating the whole reply\n",
-    "        data = {key: value for key, value in data.items() if value is not None}\n",
-    "        # Validate with Pydantic\n",
-    "        validated = SkillsTaxonomy(**data)\n",
-    "        return validated.model_dump()\n",
-    "    except Exception:\n",
-    "        # 'except Exception' (not a bare except) lets Ctrl-C stop the cell\n",
-    "        return {\n",
-    "            \"technical_skills\": [],\n",
-    "            \"soft_skills\": [],\n",
-    "            \"certifications\": [],\n",
-    "            \"languages\": []\n",
-    "        }\n",
-    "\n",
-    "# Test extraction\n",
-    "if LLM_AVAILABLE and len(postings) > 0:\n",
-    "    print(\"\ud83d\udd0d Testing skills extraction...\\n\")\n",
-    "    sample = postings.iloc[0]['description']\n",
-    "    skills = extract_skills_taxonomy(sample)\n",
-    "    \n",
-    "    print(\"\ud83d\udcca Extracted Skills:\")\n",
-    "    print(json.dumps(skills, indent=2))\n",
-    "else:\n",
-    "    print(\"\u26a0\ufe0f  Skipped\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 14: Match Explainability"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\ud83d\udca1 Testing match explainability...\n",
-      "\n",
-      "\ud83d\udcca Match Explanation:\n",
-      "{\n",
-      "  \"overall_score\": 0.7028058171272278,\n",
-      "  \"match_strengths\": [\n",
-      "    \"Big Data\",\n",
-      "    \"Machine Learning\",\n",
-      "    \"Cloud\",\n",
-      "    \"Data Science\",\n",
-      "    \"Data Structures\"\n",
-      "  ],\n",
-      "  \"skill_gaps\": [\n",
-      "    \"TeachTown-specific skills\"\n",
-      "  ],\n",
-      "  \"recommendation\": \"Encourage the candidate to learn TeachTown-specific skills\",\n",
-      "  \"fit_summary\": \"The candidate has a strong background in big data, machine learning, and cloud technologies, but may need to learn TeachTown-specific skills to fully align with the company's needs.\"\n",
-      "}\n"
-     ]
-    }
-   ],
-   "source": [
-    "def explain_match(candidate_idx: int, company_idx: int, similarity_score: float) -> Dict:\n",
-    "    \"\"\"\n",
-    "    Generate LLM explanation for why candidate matches company.\n",
-    "\n",
-    "    Args:\n",
-    "        candidate_idx: Positional index into `candidates`.\n",
-    "        company_idx: Positional index into `companies_full`.\n",
-    "        similarity_score: Similarity score of the pair; it is embedded in\n",
-    "            the prompt and used in the fallback result.\n",
-    "\n",
-    "    Returns:\n",
-    "        Dict that always contains 'overall_score', 'match_strengths',\n",
-    "        'skill_gaps', 'recommendation' and 'fit_summary'. Keys missing from\n",
-    "        the LLM reply are filled from the fallback values; the complete\n",
-    "        fallback is returned when the reply cannot be parsed.\n",
-    "    \"\"\"\n",
-    "\n",
-    "    cand = candidates.iloc[candidate_idx]\n",
-    "    comp = companies_full.iloc[company_idx]\n",
-    "\n",
-    "    # Truncate the free-text fields to keep the prompt short\n",
-    "    cand_skills = str(cand.get('skills', 'N/A'))[:300]\n",
-    "    cand_exp = str(cand.get('positions', 'N/A'))[:300]\n",
-    "    comp_req = str(comp.get('required_skills', 'N/A'))[:300]\n",
-    "    comp_name = comp.get('name', 'Unknown')\n",
-    "\n",
-    "    prompt = f\"\"\"Explain why this candidate matches this company.\n",
-    "\n",
-    "Candidate:\n",
-    "Skills: {cand_skills}\n",
-    "Experience: {cand_exp}\n",
-    "\n",
-    "Company: {comp_name}\n",
-    "Requirements: {comp_req}\n",
-    "\n",
-    "Similarity Score: {similarity_score:.2f}\n",
-    "\n",
-    "Return JSON:\n",
-    "{{\n",
-    "    \"overall_score\": {similarity_score},\n",
-    "    \"match_strengths\": [\"Top 3-5 matching factors\"],\n",
-    "    \"skill_gaps\": [\"Missing skills\"],\n",
-    "    \"recommendation\": \"What candidate should do\",\n",
-    "    \"fit_summary\": \"One sentence summary\"\n",
-    "}}\n",
-    "\"\"\"\n",
-    "\n",
-    "    response = call_llm(prompt, max_tokens=1000)\n",
-    "\n",
-    "    # Returned as-is on failure and used to fill keys the model left out.\n",
-    "    # float() in case the score is a NumPy float32, which json.dumps\n",
-    "    # cannot serialize.\n",
-    "    fallback = {\n",
-    "        \"overall_score\": float(similarity_score),\n",
-    "        \"match_strengths\": [\"Unable to generate\"],\n",
-    "        \"skill_gaps\": [],\n",
-    "        \"recommendation\": \"Review manually\",\n",
-    "        \"fit_summary\": f\"Match score: {similarity_score:.2f}\"\n",
-    "    }\n",
-    "\n",
-    "    try:\n",
-    "        # Strip a Markdown code fence (tagged or untagged) around the answer\n",
-    "        json_str = response.strip()\n",
-    "        if '```json' in json_str:\n",
-    "            json_str = json_str.split('```json')[1].split('```')[0].strip()\n",
-    "        elif '```' in json_str:\n",
-    "            json_str = json_str.split('```')[1].split('```')[0].strip()\n",
-    "\n",
-    "        # Keep only the outermost {...} span so surrounding text is ignored\n",
-    "        if '{' in json_str and '}' in json_str:\n",
-    "            start = json_str.index('{')\n",
-    "            end = json_str.rindex('}') + 1\n",
-    "            json_str = json_str[start:end]\n",
-    "\n",
-    "        data = json.loads(json_str)\n",
-    "        if not isinstance(data, dict):\n",
-    "            raise ValueError(\"LLM reply is not a JSON object\")\n",
-    "        # Values from the reply win; the fallback only fills the gaps\n",
-    "        return {**fallback, **data}\n",
-    "    except Exception:\n",
-    "        # 'except Exception' (not a bare except) lets Ctrl-C stop the cell\n",
-    "        return fallback\n",
-    "\n",
-    "# Test explainability\n",
-    "if LLM_AVAILABLE and cand_vectors is not None and len(candidates) > 0:\n",
-    "    print(\"\ud83d\udca1 Testing match explainability...\\n\")\n",
-    "    matches = find_top_matches(0, top_k=1)\n",
-    "    if matches:\n",
-    "        comp_idx, score = matches[0]\n",
-    "        explanation = explain_match(0, comp_idx, score)\n",
-    "        \n",
-    "        print(\"\ud83d\udcca Match Explanation:\")\n",
-    "        print(json.dumps(explanation, indent=2))\n",
-    "    else:\n",
-    "        # Say so explicitly instead of printing nothing\n",
-    "        print(\"\u26a0\ufe0f  No matches found for candidate 0\")\n",
-    "else:\n",
-    "    print(\"\u26a0\ufe0f  Skipped - requirements not met\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 16: Detailed Match Visualization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# \ud83d\udd0d DETAILED MATCH EXAMPLE\n",
-    "# ============================================================================\n",
-    "\n",
-    "def show_detailed_match_example(candidate_idx=0, top_k=5):\n",
-    "    \"\"\"Print a candidate summary followed by its top company matches.\n",
-    "\n",
-    "    Args:\n",
-    "        candidate_idx: Positional index into `candidates`.\n",
-    "        top_k: Number of matches requested from find_top_matches.\n",
-    "\n",
-    "    Returns:\n",
-    "        The value returned by find_top_matches, or None when candidate_idx\n",
-    "        is not smaller than len(candidates).\n",
-    "    \"\"\"\n",
-    "    print(\"\ud83d\udd0d DETAILED MATCH ANALYSIS\")\n",
-    "    print(\"=\" * 100)\n",
-    "\n",
-    "    # Guard clause: refuse indices beyond the candidate table\n",
-    "    if not candidate_idx < len(candidates):\n",
-    "        print(f\"\u274c ERROR: Candidate {candidate_idx} out of range\")\n",
-    "        return None\n",
-    "\n",
-    "    profile = candidates.iloc[candidate_idx]\n",
-    "    skills_preview = str(profile.get('skills', 'N/A'))[:150]\n",
-    "\n",
-    "    print(f\"\\n\ud83c\udfaf CANDIDATE #{candidate_idx}\")\n",
-    "    print(f\"Resume ID: {profile.get('Resume_ID', 'N/A')}\")\n",
-    "    print(f\"Category: {profile.get('Category', 'N/A')}\")\n",
-    "    print(f\"Skills: {skills_preview}...\\n\")\n",
-    "\n",
-    "    matches = find_top_matches(candidate_idx, top_k=top_k)\n",
-    "    n_companies = len(companies_full)\n",
-    "\n",
-    "    print(f\"\ud83d\udd17 TOP {len(matches)} MATCHES:\\n\")\n",
-    "\n",
-    "    # Ranks follow the position in `matches`, even if an entry is skipped\n",
-    "    for rank, (comp_idx, score) in enumerate(matches, start=1):\n",
-    "        if comp_idx < n_companies:\n",
-    "            company = companies_full.iloc[comp_idx]\n",
-    "            industries = str(company.get('industries_list', 'N/A'))[:60]\n",
-    "            print(f\"#{rank}. {company.get('name', 'N/A')} (Score: {score:.4f})\")\n",
-    "            print(f\"    Industries: {industries}...\")\n",
-    "\n",
-    "    print(\"\\n\" + \"=\" * 100)\n",
-    "    return matches\n",
-    "\n",
-    "# Test\n",
-    "show_detailed_match_example(candidate_idx=0, top_k=5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 17: Bridging Concept Analysis"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# \ud83c\udf09 BRIDGING CONCEPT ANALYSIS\n",
-    "# ============================================================================\n",
-    "\n",
-    "def show_bridging_concept_analysis():\n",
-    "    \"\"\"Print how many companies have a non-empty 'required_skills' value.\n",
-    "\n",
-    "    A company counts as \"WITH postings\" when 'required_skills' is a\n",
-    "    non-blank string; missing (NaN) and whitespace-only values count as\n",
-    "    \"WITHOUT postings\".\n",
-    "\n",
-    "    Returns:\n",
-    "        Tuple (companies_with, companies_without): the two subsets of\n",
-    "        companies_full.\n",
-    "    \"\"\"\n",
-    "    print(\"\ud83c\udf09 THE BRIDGING CONCEPT\")\n",
-    "    print(\"=\" * 90)\n",
-    "\n",
-    "    # NaN compares unequal to '', so normalise before testing for emptiness;\n",
-    "    # otherwise rows without skills are counted as having postings\n",
-    "    has_skills = companies_full['required_skills'].fillna('').astype(str).str.strip() != ''\n",
-    "    companies_with = companies_full[has_skills]\n",
-    "    companies_without = companies_full[~has_skills]\n",
-    "\n",
-    "    total = len(companies_full)\n",
-    "    # Avoid ZeroDivisionError when no companies are loaded\n",
-    "    share_with = len(companies_with) / total * 100 if total else 0.0\n",
-    "\n",
-    "    print(f\"\\n\ud83d\udcca DATA REALITY:\")\n",
-    "    print(f\"   Total companies: {total:,}\")\n",
-    "    print(f\"   WITH postings: {len(companies_with):,} ({share_with:.1f}%)\")\n",
-    "    print(f\"   WITHOUT postings: {len(companies_without):,}\\n\")\n",
-    "\n",
-    "    print(\"\ud83c\udfaf THE PROBLEM:\")\n",
-    "    print(\"   Companies: 'We are in TECH INDUSTRY'\")\n",
-    "    print(\"   Candidates: 'I know PYTHON, AWS'\")\n",
-    "    print(\"   \u2192 Different languages! \ud83d\udeab\\n\")\n",
-    "\n",
-    "    print(\"\ud83c\udf09 THE SOLUTION (BRIDGING):\")\n",
-    "    print(\"   1. Extract from postings: 'Need PYTHON developers'\")\n",
-    "    print(\"   2. Enrich company profile with skills\")\n",
-    "    print(\"   3. Now both speak SKILLS LANGUAGE! \u2705\\n\")\n",
-    "\n",
-    "    print(\"=\" * 90)\n",
-    "    return companies_with, companies_without\n",
-    "\n",
-    "# Test\n",
-    "show_bridging_concept_analysis()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Step 18: Export Results to CSV"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# \ud83d\udcbe EXPORT MATCHES TO CSV\n",
-    "# ============================================================================\n",
-    "\n",
-    "def export_matches_to_csv(num_candidates=100, top_k=10):\n",
-    "    \"\"\"Write the top matches of the first candidates to a CSV file.\n",
-    "\n",
-    "    Args:\n",
-    "        num_candidates: Upper bound on candidates to export; capped at\n",
-    "            len(candidates).\n",
-    "        top_k: Number of matches requested per candidate.\n",
-    "\n",
-    "    Returns:\n",
-    "        DataFrame with one row per exported match. The same rows are\n",
-    "        written to '<Config.RESULTS_PATH>hrhub_matches.csv'.\n",
-    "    \"\"\"\n",
-    "    import os  # local import so the cell does not depend on a later one\n",
-    "\n",
-    "    # Report the number actually processed, not just the requested bound\n",
-    "    total = min(num_candidates, len(candidates))\n",
-    "    print(f\"\ud83d\udcbe Exporting {total} candidates (top {top_k} each)...\\n\")\n",
-    "\n",
-    "    columns = [\n",
-    "        'candidate_id', 'candidate_category', 'company_id',\n",
-    "        'company_name', 'match_rank', 'similarity_score'\n",
-    "    ]\n",
-    "    results = []\n",
-    "\n",
-    "    for i in range(total):\n",
-    "        if i % 50 == 0:\n",
-    "            print(f\"   Processing {i+1}/{total}...\")\n",
-    "\n",
-    "        matches = find_top_matches(i, top_k=top_k)\n",
-    "        cand = candidates.iloc[i]\n",
-    "\n",
-    "        for rank, (comp_idx, score) in enumerate(matches, 1):\n",
-    "            # Skip indices that do not exist in the company table\n",
-    "            if comp_idx >= len(companies_full):\n",
-    "                continue\n",
-    "\n",
-    "            company = companies_full.iloc[comp_idx]\n",
-    "\n",
-    "            results.append({\n",
-    "                'candidate_id': i,\n",
-    "                'candidate_category': cand.get('Category', 'N/A'),\n",
-    "                'company_id': company.get('company_id', 'N/A'),\n",
-    "                'company_name': company.get('name', 'N/A'),\n",
-    "                'match_rank': rank,\n",
-    "                'similarity_score': round(float(score), 4)\n",
-    "            })\n",
-    "\n",
-    "    # Explicit columns keep the CSV header even when nothing was matched\n",
-    "    results_df = pd.DataFrame(results, columns=columns)\n",
-    "    output_file = f'{Config.RESULTS_PATH}hrhub_matches.csv'\n",
-    "\n",
-    "    # Create the target directory if needed so to_csv does not fail\n",
-    "    output_dir = os.path.dirname(output_file)\n",
-    "    if output_dir:\n",
-    "        os.makedirs(output_dir, exist_ok=True)\n",
-    "    results_df.to_csv(output_file, index=False)\n",
-    "\n",
-    "    print(f\"\\n\u2705 Exported {len(results_df):,} matches\")\n",
-    "    print(f\"\ud83d\udcc4 File: {output_file}\\n\")\n",
-    "\n",
-    "    return results_df\n",
-    "\n",
-    "# Export sample\n",
-    "matches_df = export_matches_to_csv(num_candidates=50, top_k=5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Interactive Visualization 1: t-SNE Vector Space\n",
-    "\n",
-    "Project embeddings from \u211d\u00b3\u2078\u2074 \u2192 \u211d\u00b2 to visualize candidates and companies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# \ud83c\udfa8 T-SNE VECTOR SPACE VISUALIZATION\n",
-    "# ============================================================================\n",
-    "\n",
-    "from sklearn.manifold import TSNE\n",
-    "\n",
-    "print(\"\ud83c\udfa8 VECTOR SPACE VISUALIZATION\\n\")\n",
-    "print(\"=\" * 70)\n",
-    "\n",
-    "# Sample for visualization (the first rows of each table)\n",
-    "n_cand_viz = min(500, len(candidates))\n",
-    "n_comp_viz = min(2000, len(companies_full))\n",
-    "\n",
-    "print(f\"\ud83d\udcca Visualizing:\")\n",
-    "print(f\"   \u2022 {n_cand_viz} candidates\")\n",
-    "print(f\"   \u2022 {n_comp_viz} companies\")\n",
-    "print(f\"   \u2022 From \u211d^384 \u2192 \u211d\u00b2 (t-SNE)\\n\")\n",
-    "\n",
-    "# Sample vectors: candidates first, companies after, so the 2-D result\n",
-    "# can be split back at n_cand_viz\n",
-    "cand_sample = cand_vectors[:n_cand_viz]\n",
-    "comp_sample = comp_vectors[:n_comp_viz]\n",
-    "all_vectors = np.vstack([cand_sample, comp_sample])\n",
-    "\n",
-    "# scikit-learn requires perplexity to be smaller than the sample count\n",
-    "tsne_perplexity = min(30, max(1, len(all_vectors) - 1))\n",
-    "\n",
-    "print(\"\ud83d\udd04 Running t-SNE (2-3 minutes)...\")\n",
-    "# The iteration count is left at its default of 1000: the keyword was\n",
-    "# renamed from n_iter to max_iter in scikit-learn 1.5, so passing either\n",
-    "# name explicitly breaks on some installed versions.\n",
-    "tsne = TSNE(\n",
-    "    n_components=2,\n",
-    "    perplexity=tsne_perplexity,\n",
-    "    random_state=42\n",
-    ")\n",
-    "\n",
-    "vectors_2d = tsne.fit_transform(all_vectors)\n",
-    "cand_2d = vectors_2d[:n_cand_viz]\n",
-    "comp_2d = vectors_2d[n_cand_viz:]\n",
-    "\n",
-    "print(\"\\n\u2705 t-SNE complete!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create interactive plot\n",
-    "fig = go.Figure()\n",
-    "\n",
-    "# Hover labels for the plotted companies. str() guards against a missing\n",
-    "# (NaN) name, which cannot be sliced.\n",
-    "company_hover = [\n",
-    "    f\"Company: {str(companies_full.iloc[i].get('name', 'N/A'))[:30]}\"\n",
-    "    for i in range(n_comp_viz)\n",
-    "]\n",
-    "\n",
-    "# Companies (red)\n",
-    "fig.add_trace(go.Scatter(\n",
-    "    x=comp_2d[:, 0],\n",
-    "    y=comp_2d[:, 1],\n",
-    "    mode='markers',\n",
-    "    name='Companies',\n",
-    "    marker=dict(size=6, color='#ff6b6b', opacity=0.6),\n",
-    "    text=company_hover,\n",
-    "    hovertemplate='<b>%{text}</b><extra></extra>'\n",
-    "))\n",
-    "\n",
-    "# Candidates (green)\n",
-    "fig.add_trace(go.Scatter(\n",
-    "    x=cand_2d[:, 0],\n",
-    "    y=cand_2d[:, 1],\n",
-    "    mode='markers',\n",
-    "    name='Candidates',\n",
-    "    marker=dict(\n",
-    "        size=10,\n",
-    "        color='#00ff00',\n",
-    "        opacity=0.8,\n",
-    "        line=dict(width=1, color='white')\n",
-    "    ),\n",
-    "    text=[f\"Candidate {i}\" for i in range(n_cand_viz)],\n",
-    "    hovertemplate='<b>%{text}</b><extra></extra>'\n",
-    "))\n",
-    "\n",
-    "fig.update_layout(\n",
-    "    title='Vector Space: Candidates & Companies (Enriched with Postings)',\n",
-    "    xaxis_title='Dimension 1',\n",
-    "    yaxis_title='Dimension 2',\n",
-    "    width=1200,\n",
-    "    height=800,\n",
-    "    plot_bgcolor='#1a1a1a',\n",
-    "    paper_bgcolor='#0d0d0d',\n",
-    "    font=dict(color='white')\n",
-    ")\n",
-    "\n",
-    "fig.show()\n",
-    "\n",
-    "print(\"\\n\u2705 Visualization complete!\")\n",
-    "print(\"\ud83d\udca1 If green & red OVERLAP \u2192 Alignment worked!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udcca Interactive Visualization 2: Highlighted Match Network\n",
-    "\n",
-    "Show candidate and their top matches with connection lines"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# \ud83d\udd0d HIGHLIGHTED MATCH NETWORK\n",
-    "# ============================================================================\n",
-    "\n",
-    "target_candidate = 0\n",
-    "\n",
-    "print(f\"\ud83d\udd0d Analyzing Candidate #{target_candidate}...\\n\")\n",
-    "\n",
-    "matches = find_top_matches(target_candidate, top_k=10)\n",
-    "\n",
-    "# Only companies inside the t-SNE sample can be drawn. Rank and score are\n",
-    "# kept together with each surviving index so the hover text stays correct\n",
-    "# when some matches are filtered out.\n",
-    "visible_matches = [\n",
-    "    (rank, comp_idx, score)\n",
-    "    for rank, (comp_idx, score) in enumerate(matches, 1)\n",
-    "    if comp_idx < n_comp_viz\n",
-    "]\n",
-    "match_indices = [comp_idx for _, comp_idx, _ in visible_matches]\n",
-    "\n",
-    "# Create highlighted plot\n",
-    "fig2 = go.Figure()\n",
-    "\n",
-    "# All companies (background)\n",
-    "fig2.add_trace(go.Scatter(\n",
-    "    x=comp_2d[:, 0],\n",
-    "    y=comp_2d[:, 1],\n",
-    "    mode='markers',\n",
-    "    name='All Companies',\n",
-    "    marker=dict(size=4, color='#ff6b6b', opacity=0.3),\n",
-    "    showlegend=True\n",
-    "))\n",
-    "\n",
-    "# Top matches (highlighted)\n",
-    "if match_indices:\n",
-    "    match_positions = comp_2d[match_indices]\n",
-    "    # str() guards against a missing (NaN) name, which cannot be sliced\n",
-    "    match_hover = [\n",
-    "        f\"Match #{rank}: {str(companies_full.iloc[comp_idx].get('name', 'N/A'))[:30]}<br>Score: {score:.3f}\"\n",
-    "        for rank, comp_idx, score in visible_matches\n",
-    "    ]\n",
-    "    fig2.add_trace(go.Scatter(\n",
-    "        x=match_positions[:, 0],\n",
-    "        y=match_positions[:, 1],\n",
-    "        mode='markers',\n",
-    "        name='Top Matches',\n",
-    "        marker=dict(\n",
-    "            size=15,\n",
-    "            color='#ff0000',\n",
-    "            line=dict(width=2, color='white')\n",
-    "        ),\n",
-    "        text=match_hover,\n",
-    "        hovertemplate='<b>%{text}</b><extra></extra>'\n",
-    "    ))\n",
-    "\n",
-    "# Target candidate (star)\n",
-    "fig2.add_trace(go.Scatter(\n",
-    "    x=[cand_2d[target_candidate, 0]],\n",
-    "    y=[cand_2d[target_candidate, 1]],\n",
-    "    mode='markers',\n",
-    "    name=f'Candidate #{target_candidate}',\n",
-    "    marker=dict(\n",
-    "        size=25,\n",
-    "        color='#00ff00',\n",
-    "        symbol='star',\n",
-    "        line=dict(width=3, color='white')\n",
-    "    )\n",
-    "))\n",
-    "\n",
-    "# Connection lines (top 5)\n",
-    "for match_idx in match_indices[:5]:\n",
-    "    fig2.add_trace(go.Scatter(\n",
-    "        x=[cand_2d[target_candidate, 0], comp_2d[match_idx, 0]],\n",
-    "        y=[cand_2d[target_candidate, 1], comp_2d[match_idx, 1]],\n",
-    "        mode='lines',\n",
-    "        line=dict(color='yellow', width=1, dash='dot'),\n",
-    "        opacity=0.5,\n",
-    "        showlegend=False\n",
-    "    ))\n",
-    "\n",
-    "fig2.update_layout(\n",
-    "    title=f'Candidate #{target_candidate} and Top Matches',\n",
-    "    xaxis_title='Dimension 1',\n",
-    "    yaxis_title='Dimension 2',\n",
-    "    width=1200,\n",
-    "    height=800,\n",
-    "    plot_bgcolor='#1a1a1a',\n",
-    "    paper_bgcolor='#0d0d0d',\n",
-    "    font=dict(color='white')\n",
-    ")\n",
-    "\n",
-    "fig2.show()\n",
-    "\n",
-    "print(\"\\n\u2705 Highlighted visualization created!\")\n",
-    "print(f\"   \u2b50 Green star = Candidate #{target_candidate}\")\n",
-    "print(f\"   \ud83d\udd34 Red dots = Top matches\")\n",
-    "print(f\"   \ud83d\udc9b Yellow lines = Connections\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83c\udf10 Interactive Visualization 3: Network Graph (PyVis)\n",
-    "\n",
-    "Interactive network showing candidate-company connections with nodes & edges"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# \ud83c\udf10 NETWORK GRAPH WITH PYVIS\n",
-    "# ============================================================================\n",
-    "\n",
-    "from pyvis.network import Network\n",
-    "import webbrowser\n",
-    "import os\n",
-    "\n",
-    "print(\"\ud83c\udf10 Creating interactive network graph...\\n\")\n",
-    "\n",
-    "target_candidate = 0\n",
-    "top_k_network = 10\n",
-    "\n",
-    "# Get matches\n",
-    "matches = find_top_matches(target_candidate, top_k=top_k_network)\n",
-    "\n",
-    "# Create network\n",
-    "net = Network(\n",
-    "    height='800px',\n",
-    "    width='100%',\n",
-    "    bgcolor='#1a1a1a',\n",
-    "    font_color='white',\n",
-    "    directed=False\n",
-    ")\n",
-    "\n",
-    "# Configure physics\n",
-    "net.barnes_hut(\n",
-    "    gravity=-5000,\n",
-    "    central_gravity=0.3,\n",
-    "    spring_length=100,\n",
-    "    spring_strength=0.01\n",
-    ")\n",
-    "\n",
-    "# Add candidate node (center)\n",
-    "cand = candidates.iloc[target_candidate]\n",
-    "cand_label = f\"Candidate #{target_candidate}\"\n",
-    "net.add_node(\n",
-    "    f'cand_{target_candidate}',\n",
-    "    label=cand_label,\n",
-    "    title=f\"{cand.get('Category', 'N/A')}<br>Skills: {str(cand.get('skills', 'N/A'))[:100]}\",\n",
-    "    color='#00ff00',\n",
-    "    size=40,\n",
-    "    shape='star'\n",
-    ")\n",
-    "\n",
-    "# Add company nodes + edges\n",
-    "for rank, (comp_idx, score) in enumerate(matches, 1):\n",
-    "    if comp_idx >= len(companies_full):\n",
-    "        continue\n",
-    "\n",
-    "    company = companies_full.iloc[comp_idx]\n",
-    "    # str() guards against a missing (NaN) name, which cannot be sliced\n",
-    "    comp_name = str(company.get('name', f'Company {comp_idx}'))[:30]\n",
-    "    # Use a plain Python float for every value handed to pyvis, as the\n",
-    "    # edge below already does; a NumPy scalar may not serialize to JSON\n",
-    "    score = float(score)\n",
-    "\n",
-    "    # Color by score\n",
-    "    if score > 0.7:\n",
-    "        color = '#ff0000'  # Red (strong match)\n",
-    "    elif score > 0.5:\n",
-    "        color = '#ff6b6b'  # Light red (good match)\n",
-    "    else:\n",
-    "        color = '#ffaaaa'  # Pink (weak match)\n",
-    "\n",
-    "    # Add company node\n",
-    "    net.add_node(\n",
-    "        f'comp_{comp_idx}',\n",
-    "        label=f\"#{rank}. {comp_name}\",\n",
-    "        title=f\"Score: {score:.3f}<br>Industries: {str(company.get('industries_list', 'N/A'))[:50]}<br>Required: {str(company.get('required_skills', 'N/A'))[:100]}\",\n",
-    "        color=color,\n",
-    "        size=20 + (score * 20)  # Size by score\n",
-    "    )\n",
-    "\n",
-    "    # Add edge\n",
-    "    net.add_edge(\n",
-    "        f'cand_{target_candidate}',\n",
-    "        f'comp_{comp_idx}',\n",
-    "        value=score,\n",
-    "        title=f\"Similarity: {score:.3f}\",\n",
-    "        color='yellow'\n",
-    "    )\n",
-    "\n",
-    "# Save\n",
-    "output_file = f'{Config.RESULTS_PATH}network_graph.html'\n",
-    "net.save_graph(output_file)\n",
-    "\n",
-    "print(f\"\u2705 Network graph created!\")\n",
-    "print(f\"\ud83d\udcc4 Saved: {output_file}\")\n",
-    "print(f\"\\n\ud83d\udca1 LEGEND:\")\n",
-    "print(f\"   \u2b50 Green star = Candidate #{target_candidate}\")\n",
-    "print(f\"   \ud83d\udd34 Red nodes = Companies (size = match score)\")\n",
-    "print(f\"   \ud83d\udc9b Yellow edges = Connections\")\n",
-    "print(f\"\\n\u2139\ufe0f  Hover over nodes to see details\")\n",
-    "print(f\"   Drag nodes to rearrange\")\n",
-    "print(f\"   Zoom with mouse wheel\\n\")\n",
-    "\n",
-    "# Display in notebook\n",
-    "from IPython.display import IFrame\n",
-    "IFrame(output_file, width=1000, height=800)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### \ud83d\udcca Network Node Data\n",
-    "\n",
-    "Detailed information about nodes and connections"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# DISPLAY NODE DATA\n",
-    "# ============================================================================\n",
-    "# Uses `matches`, `cand` and `target_candidate` set by the network graph cell.\n",
-    "\n",
-    "print(\"\ud83d\udcca NETWORK DATA SUMMARY\")\n",
-    "print(\"=\" * 80)\n",
-    "print(f\"\\nTotal nodes: {1 + len(matches)}\")\n",
-    "print(f\"   - 1 candidate node (green star)\")\n",
-    "print(f\"   - {len(matches)} company nodes (red circles)\")\n",
-    "print(f\"\\nTotal edges: {len(matches)}\")\n",
-    "print(f\"\\n\" + \"=\" * 80)\n",
-    "\n",
-    "# Show node details\n",
-    "print(f\"\\n\ud83c\udfaf CANDIDATE NODE:\")\n",
-    "print(f\"   ID: cand_{target_candidate}\")\n",
-    "print(f\"   Category: {cand.get('Category', 'N/A')}\")\n",
-    "print(f\"   Skills: {str(cand.get('skills', 'N/A'))[:100]}...\")\n",
-    "\n",
-    "print(f\"\\n\ud83c\udfe2 COMPANY NODES (Top 5):\")\n",
-    "for rank, (comp_idx, score) in enumerate(matches[:5], 1):\n",
-    "    if comp_idx < len(companies_full):\n",
-    "        company = companies_full.iloc[comp_idx]\n",
-    "        # str() guards against a missing (NaN) name, which cannot be sliced\n",
-    "        print(f\"\\n   #{rank}. {str(company.get('name', 'N/A'))[:40]}\")\n",
-    "        print(f\"       ID: comp_{comp_idx}\")\n",
-    "        print(f\"       Score: {score:.4f}\")\n",
-    "        print(f\"       Industries: {str(company.get('industries_list', 'N/A'))[:60]}...\")\n",
-    "\n",
-    "print(f\"\\n\" + \"=\" * 80)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udd0d Visualization 4: Display Node Data\n",
-    "\n",
-    "Inspect detailed information about candidates and companies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# DISPLAY NODE DATA - See what's behind the graph\n",
-    "# ============================================================================\n",
-    "\n",
-    "def display_node_data(node_id):\n",
-    "    \"\"\"Print key fields for a single graph node.\n",
-    "\n",
-    "    node_id: 'C<idx>' selects row <idx> of `candidates`, 'J<idx>' selects\n",
-    "    row <idx> of `companies_full` (positional index). If the index is past\n",
-    "    the end of the frame, a not-found message is printed and the function\n",
-    "    returns early. Any other prefix prints only the separator lines.\n",
-    "    \"\"\"\n",
-    "    print(\"=\" * 80)\n",
-    "\n",
-    "    if node_id.startswith('C'):\n",
-    "        # CANDIDATE\n",
-    "        cand_idx = int(node_id[1:])\n",
-    "\n",
-    "        if cand_idx >= len(candidates):\n",
-    "            print(f\"\u274c Candidate {cand_idx} not found!\")\n",
-    "            return\n",
-    "\n",
-    "        candidate = candidates.iloc[cand_idx]\n",
-    "\n",
-    "        print(f\"\ud83d\udfe2 CANDIDATE #{cand_idx}\")\n",
-    "        print(\"=\" * 80)\n",
-    "        print(f\"\\n\ud83d\udcca KEY INFORMATION:\\n\")\n",
-    "        print(f\"Resume ID: {candidate.get('Resume_ID', 'N/A')}\")\n",
-    "        print(f\"Category: {candidate.get('Category', 'N/A')}\")\n",
-    "        print(f\"Skills: {str(candidate.get('skills', 'N/A'))[:200]}\")\n",
-    "        print(f\"Career Objective: {str(candidate.get('career_objective', 'N/A'))[:200]}\")\n",
-    "\n",
-    "    elif node_id.startswith('J'):\n",
-    "        # COMPANY\n",
-    "        comp_idx = int(node_id[1:])\n",
-    "\n",
-    "        if comp_idx >= len(companies_full):\n",
-    "            print(f\"\u274c Company {comp_idx} not found!\")\n",
-    "            return\n",
-    "\n",
-    "        company = companies_full.iloc[comp_idx]\n",
-    "\n",
-    "        print(f\"\ud83d\udd34 COMPANY #{comp_idx}\")\n",
-    "        print(\"=\" * 80)\n",
-    "        print(f\"\\n\ud83d\udcca COMPANY INFORMATION:\\n\")\n",
-    "        print(f\"Name: {company.get('name', 'N/A')}\")\n",
-    "        print(f\"Industries: {str(company.get('industries_list', 'N/A'))[:200]}\")\n",
-    "        print(f\"Required Skills: {str(company.get('required_skills', 'N/A'))[:200]}\")\n",
-    "        print(f\"Posted Jobs: {str(company.get('posted_job_titles', 'N/A'))[:200]}\")\n",
-    "\n",
-    "    print(\"\\n\" + \"=\" * 80 + \"\\n\")\n",
-    "\n",
-    "def display_node_with_connections(node_id, top_k=10):\n",
-    "    \"\"\"Print a node's data and, for candidate nodes, its top matches.\n",
-    "\n",
-    "    node_id: same format as display_node_data ('C<idx>' or 'J<idx>').\n",
-    "    top_k: number of matches requested from find_top_matches.\n",
-    "    Only ids starting with 'C' get a match list; other ids print node\n",
-    "    data only.\n",
-    "    \"\"\"\n",
-    "    display_node_data(node_id)\n",
-    "\n",
-    "    if node_id.startswith('C'):\n",
-    "        cand_idx = int(node_id[1:])\n",
-    "\n",
-    "        print(f\"\ud83c\udfaf TOP {top_k} MATCHES:\")\n",
-    "        print(\"=\" * 80)\n",
-    "\n",
-    "        matches = find_top_matches(cand_idx, top_k=top_k)\n",
-    "\n",
-    "        # FIXED: Validate indices before accessing\n",
-    "        valid_matches = 0\n",
-    "        for rank, (comp_idx, score) in enumerate(matches, 1):\n",
-    "            # Check if index is valid\n",
-    "            if comp_idx >= len(companies_full):\n",
-    "                print(f\"\u26a0\ufe0f  Match #{rank}: Index {comp_idx} out of range (skipping)\")\n",
-    "                continue\n",
-    "\n",
-    "            company = companies_full.iloc[comp_idx]\n",
-    "            # str() first: a non-string name (e.g. NaN) cannot be sliced\n",
-    "            print(f\"#{rank}. {str(company.get('name', 'N/A'))[:40]} (Score: {score:.4f})\")\n",
-    "            valid_matches += 1\n",
-    "\n",
-    "        if valid_matches == 0:\n",
-    "            print(\"\u26a0\ufe0f  No valid matches found (all indices out of bounds)\")\n",
-    "            print(\"\\n\ud83d\udca1 SOLUTION: Regenerate embeddings after deduplication!\")\n",
-    "\n",
-    "        print(\"\\n\" + \"=\" * 80)\n",
-    "\n",
-    "# Example usage\n",
-    "display_node_with_connections('C0', top_k=5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udd78\ufe0f Visualization 5: NetworkX Graph\n",
-    "\n",
-    "Network graph using NetworkX + Plotly with force-directed layout"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# NETWORK GRAPH WITH NETWORKX + PLOTLY\n",
-    "# ============================================================================\n",
-    "\n",
-    "import networkx as nx\n",
-    "\n",
-    "print(\"\ud83d\udd78\ufe0f  Creating NETWORK GRAPH...\\n\")\n",
-    "\n",
-    "# Create graph\n",
-    "G = nx.Graph()\n",
-    "\n",
-    "# Sample\n",
-    "n_cand_sample = min(20, len(candidates))\n",
-    "top_k_per_cand = 5\n",
-    "\n",
-    "print(f\"\ud83d\udcca Network size:\")\n",
-    "print(f\"   \u2022 {n_cand_sample} candidates\")\n",
-    "print(f\"   \u2022 {top_k_per_cand} companies per candidate\\n\")\n",
-    "\n",
-    "# Add nodes + edges\n",
-    "# One node per sampled candidate, one node per distinct matched company,\n",
-    "# and one edge per (candidate, company) match weighted by the match score.\n",
-    "companies_in_graph = set()\n",
-    "\n",
-    "for i in range(n_cand_sample):\n",
-    "    G.add_node(f\"C{i}\", node_type='candidate', label=f\"C{i}\")\n",
-    "    \n",
-    "    matches = find_top_matches(i, top_k=top_k_per_cand)\n",
-    "    \n",
-    "    for comp_idx, score in matches:\n",
-    "        # Skip indices that do not exist in companies_full; .iloc would\n",
-    "        # raise IndexError and abort the whole graph build otherwise\n",
-    "        if comp_idx >= len(companies_full):\n",
-    "            continue\n",
-    "        \n",
-    "        comp_id = f\"J{comp_idx}\"\n",
-    "        \n",
-    "        if comp_id not in companies_in_graph:\n",
-    "            # str() first: a non-string name (e.g. NaN) cannot be sliced\n",
-    "            company_name = str(companies_full.iloc[comp_idx].get('name', 'N/A'))[:20]\n",
-    "            G.add_node(comp_id, node_type='company', label=company_name)\n",
-    "            companies_in_graph.add(comp_id)\n",
-    "        \n",
-    "        G.add_edge(f\"C{i}\", comp_id, weight=float(score))\n",
-    "\n",
-    "print(f\"\u2705 Network created!\")\n",
-    "print(f\"   Nodes: {G.number_of_nodes()}\")\n",
-    "print(f\"   Edges: {G.number_of_edges()}\\n\")\n",
-    "\n",
-    "# Calculate layout\n",
-    "print(\"\ud83d\udd04 Calculating layout...\")\n",
-    "pos = nx.spring_layout(G, k=2, iterations=50, seed=42)\n",
-    "print(\"\u2705 Layout done!\\n\")\n",
-    "\n",
-    "# Create edge traces\n",
-    "edge_trace = []\n",
-    "for edge in G.edges(data=True):\n",
-    "    x0, y0 = pos[edge[0]]\n",
-    "    x1, y1 = pos[edge[1]]\n",
-    "    weight = edge[2]['weight']\n",
-    "    \n",
-    "    edge_trace.append(go.Scatter(\n",
-    "        x=[x0, x1, None],\n",
-    "        y=[y0, y1, None],\n",
-    "        mode='lines',\n",
-    "        line=dict(width=weight*3, color='rgba(255,255,255,0.3)'),\n",
-    "        hoverinfo='none',\n",
-    "        showlegend=False\n",
-    "    ))\n",
-    "\n",
-    "# Candidate nodes\n",
-    "cand_nodes = [n for n, d in G.nodes(data=True) if d['node_type']=='candidate']\n",
-    "cand_x = [pos[n][0] for n in cand_nodes]\n",
-    "cand_y = [pos[n][1] for n in cand_nodes]\n",
-    "cand_labels = [G.nodes[n]['label'] for n in cand_nodes]\n",
-    "\n",
-    "candidate_trace = go.Scatter(\n",
-    "    x=cand_x, y=cand_y,\n",
-    "    mode='markers+text',\n",
-    "    name='Candidates',\n",
-    "    marker=dict(size=25, color='#00ff00', line=dict(width=2, color='white')),\n",
-    "    text=cand_labels,\n",
-    "    textposition='top center',\n",
-    "    hovertemplate='<b>%{text}</b><extra></extra>'\n",
-    ")\n",
-    "\n",
-    "# Company nodes\n",
-    "comp_nodes = [n for n, d in G.nodes(data=True) if d['node_type']=='company']\n",
-    "comp_x = [pos[n][0] for n in comp_nodes]\n",
-    "comp_y = [pos[n][1] for n in comp_nodes]\n",
-    "comp_labels = [G.nodes[n]['label'] for n in comp_nodes]\n",
-    "\n",
-    "company_trace = go.Scatter(\n",
-    "    x=comp_x, y=comp_y,\n",
-    "    mode='markers+text',\n",
-    "    name='Companies',\n",
-    "    marker=dict(size=15, color='#ff6b6b', symbol='square'),\n",
-    "    text=comp_labels,\n",
-    "    textposition='top center',\n",
-    "    hovertemplate='<b>%{text}</b><extra></extra>'\n",
-    ")\n",
-    "\n",
-    "# Create figure\n",
-    "fig = go.Figure(data=edge_trace + [candidate_trace, company_trace])\n",
-    "\n",
-    "fig.update_layout(\n",
-    "    title='Network Graph: Candidates \u2194 Companies',\n",
-    "    showlegend=True,\n",
-    "    width=1400, height=900,\n",
-    "    plot_bgcolor='#1a1a1a',\n",
-    "    paper_bgcolor='#0d0d0d',\n",
-    "    font=dict(color='white'),\n",
-    "    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
-    "    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)\n",
-    ")\n",
-    "\n",
-    "fig.show()\n",
-    "\n",
-    "print(\"\u2705 NetworkX graph created!\")\n",
-    "print(\"   \ud83d\udfe2 Green = Candidates\")\n",
-    "print(\"   \ud83d\udd34 Red = Companies\")\n",
-    "print(\"   Lines = Connections (thicker = stronger)\\n\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83d\udc1b DEBUG: Why aren't candidates & companies overlapping?\n",
-    "\n",
-    "Investigating the embedding space alignment"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# DEBUG: CHECK EMBEDDING ALIGNMENT\n",
-    "# ============================================================================\n",
-    "\n",
-    "print(\"\ud83d\udc1b DEBUGGING EMBEDDING SPACE\")\n",
-    "print(\"=\" * 80)\n",
-    "\n",
-    "# 1. Check if vectors loaded correctly\n",
-    "print(f\"\\n1\ufe0f\u20e3 VECTOR SHAPES:\")\n",
-    "print(f\"   Candidates: {cand_vectors.shape}\")\n",
-    "print(f\"   Companies: {comp_vectors.shape}\")\n",
-    "\n",
-    "# 2. Check vector norms\n",
-    "print(f\"\\n2\ufe0f\u20e3 VECTOR NORMS (should be ~1.0 if normalized):\")\n",
-    "cand_norms = np.linalg.norm(cand_vectors, axis=1)\n",
-    "comp_norms = np.linalg.norm(comp_vectors, axis=1)\n",
-    "print(f\"   Candidates: mean={cand_norms.mean():.4f}, min={cand_norms.min():.4f}, max={cand_norms.max():.4f}\")\n",
-    "print(f\"   Companies: mean={comp_norms.mean():.4f}, min={comp_norms.min():.4f}, max={comp_norms.max():.4f}\")\n",
-    "\n",
-    "# 3. Sample similarity\n",
-    "print(f\"\\n3\ufe0f\u20e3 SAMPLE SIMILARITIES:\")\n",
-    "sample_cand = 0\n",
-    "matches = find_top_matches(sample_cand, top_k=5)\n",
-    "print(f\"   Candidate #{sample_cand} top 5 matches:\")\n",
-    "for rank, (comp_idx, score) in enumerate(matches, 1):\n",
-    "    print(f\"      #{rank}. Company {comp_idx}: {score:.4f}\")\n",
-    "\n",
-    "# 4. Check text representations\n",
-    "print(f\"\\n4\ufe0f\u20e3 TEXT REPRESENTATION SAMPLES:\")\n",
-    "print(f\"\\n   \ud83d\udccb CANDIDATE #{sample_cand}:\")\n",
-    "cand = candidates.iloc[sample_cand]\n",
-    "print(f\"      Skills: {str(cand.get('skills', 'N/A'))[:100]}\")\n",
-    "print(f\"      Category: {cand.get('Category', 'N/A')}\")\n",
-    "\n",
-    "top_company_idx = matches[0][0]\n",
-    "print(f\"\\n   \ud83c\udfe2 TOP MATCH COMPANY #{top_company_idx}:\")\n",
-    "company = companies_full.iloc[top_company_idx]\n",
-    "print(f\"      Name: {company.get('name', 'N/A')}\")\n",
-    "print(f\"      Required Skills: {str(company.get('required_skills', 'N/A'))[:100]}\")\n",
-    "print(f\"      Industries: {str(company.get('industries_list', 'N/A'))[:100]}\")\n",
-    "\n",
-    "# 5. Check if postings enrichment worked\n",
-    "print(f\"\\n5\ufe0f\u20e3 POSTINGS ENRICHMENT CHECK:\")\n",
-    "companies_with_postings = companies_full[companies_full['required_skills'] != ''].shape[0]\n",
-    "companies_without = companies_full[companies_full['required_skills'] == ''].shape[0]\n",
-    "print(f\"   WITH postings: {companies_with_postings:,} ({companies_with_postings/len(companies_full)*100:.1f}%)\")\n",
-    "print(f\"   WITHOUT postings: {companies_without:,}\")\n",
-    "\n",
-    "# 6. HYPOTHESIS\n",
-    "print(f\"\\n\u2753 HYPOTHESIS:\")\n",
-    "if companies_without > companies_with_postings:\n",
-    "    print(f\"   \u26a0\ufe0f  Most companies DON'T have postings!\")\n",
-    "    print(f\"   \u26a0\ufe0f  They only have: industries, specialties, description\")\n",
-    "    print(f\"   \u26a0\ufe0f  This creates DIFFERENT language than candidates\")\n",
-    "    print(f\"\\n   \ud83d\udca1 SOLUTION:\")\n",
-    "    print(f\"      Option A: Filter to only companies WITH postings\")\n",
-    "    print(f\"      Option B: Use LLM to translate industries \u2192 skills\")\n",
-    "else:\n",
-    "    print(f\"   \u2705 Most companies have postings\")\n",
-    "    print(f\"   \u2753 Need to check if embeddings were generated AFTER enrichment\")\n",
-    "\n",
-    "print(f\"\\n\" + \"=\" * 80)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n## \ud83d\udcca Step 19: Summary\n\n### What We Built"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "======================================================================\n",
-      "\ud83c\udfaf HRHUB v2.1 - SUMMARY\n",
-      "======================================================================\n",
-      "\n",
-      "\u2705 IMPLEMENTED:\n",
-      "  1. Zero-Shot Job Classification (Entry/Mid/Senior/Executive)\n",
-      "  2. Few-Shot Learning with Examples\n",
-      "  3. Structured Skills Extraction (Pydantic schemas)\n",
-      "  4. Match Explainability (LLM-generated reasoning)\n",
-      "  5. FREE LLM Integration (Hugging Face)\n",
-      "  6. Flexible Data Loading (Upload OR Google Drive)\n",
-      "\n",
-      "\ud83d\udcb0 COST: $0.00 (completely free!)\n",
-      "\n",
-      "\ud83d\udcc8 COURSE ALIGNMENT:\n",
-      "  \u2705 LLMs for structured output\n",
-      "  \u2705 Pydantic schemas\n",
-      "  \u2705 Classification pipelines\n",
-      "  \u2705 Zero-shot & few-shot learning\n",
-      "  \u2705 JSON extraction\n",
-      "  \u2705 Transformer architecture (embeddings)\n",
-      "  \u2705 API deployment strategies\n",
-      "\n",
-      "======================================================================\n",
-      "\ud83d\ude80 READY TO MOVE TO VS CODE!\n",
-      "======================================================================\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"=\"*70)\n",
-    "print(\"\ud83c\udfaf HRHUB v2.1 - SUMMARY\")\n",
-    "print(\"=\"*70)\n",
-    "print(\"\")\n",
-    "print(\"\u2705 IMPLEMENTED:\")\n",
-    "print(\"  1. Zero-Shot Job Classification (Entry/Mid/Senior/Executive)\")\n",
-    "print(\"  2. Few-Shot Learning with Examples\")\n",
-    "print(\"  3. Structured Skills Extraction (Pydantic schemas)\")\n",
-    "print(\"  4. Match Explainability (LLM-generated reasoning)\")\n",
-    "print(\"  5. FREE LLM Integration (Hugging Face)\")\n",
-    "print(\"  6. Flexible Data Loading (Upload OR Google Drive)\")\n",
-    "print(\"\")\n",
-    "print(\"\ud83d\udcb0 COST: $0.00 (completely free!)\")\n",
-    "print(\"\")\n",
-    "print(\"\ud83d\udcc8 COURSE ALIGNMENT:\")\n",
-    "print(\"  \u2705 LLMs for structured output\")\n",
-    "print(\"  \u2705 Pydantic schemas\")\n",
-    "print(\"  \u2705 Classification pipelines\")\n",
-    "print(\"  \u2705 Zero-shot & few-shot learning\")\n",
-    "print(\"  \u2705 JSON extraction\")\n",
-    "print(\"  \u2705 Transformer architecture (embeddings)\")\n",
-    "print(\"  \u2705 API deployment strategies\")\n",
-    "print(\"\")\n",
-    "print(\"=\"*70)\n",
-    "print(\"\ud83d\ude80 READY TO MOVE TO VS CODE!\")\n",
-    "print(\"=\"*70)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83c\udfaf Step 7.5: Collaborative Filtering for Companies\n",
-    "\n",
-    "**THE GENIUS SOLUTION!**\n",
-    "\n",
-    "Companies WITHOUT postings can inherit skills from similar companies WITH postings!\n",
-    "\n",
-    "Like Netflix recommendations:\n",
-    "- Company A (no postings) similar to Company B (has postings)\n",
-    "- \u2192 Company A inherits Company B's required skills!"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# COLLABORATIVE FILTERING: Companies without postings\n",
-    "# ============================================================================\n",
-    "\n",
-    "print(\"\ud83c\udfaf COLLABORATIVE FILTERING FOR COMPANIES\")\n",
-    "print(\"=\" * 80)\n",
-    "print(\"\\nLike Netflix: Similar companies \u2192 Similar skills needed!\\n\")\n",
-    "\n",
-    "# Step 1: Separate companies\n",
-    "companies_with_postings = companies_full[companies_full['required_skills'] != ''].copy()\n",
-    "companies_without_postings = companies_full[companies_full['required_skills'] == ''].copy()\n",
-    "\n",
-    "print(f\"\ud83d\udcca DATA SPLIT:\")\n",
-    "print(f\"   WITH postings: {len(companies_with_postings):,} companies\")\n",
-    "print(f\"   WITHOUT postings: {len(companies_without_postings):,} companies\")\n",
-    "print(f\"\\n\ud83d\udca1 Goal: Infer skills for {len(companies_without_postings):,} companies\\n\")\n",
-    "\n",
-    "# Step 2: Build company profile vectors (BEFORE postings)\n",
-    "# Using ONLY: industries, specialties, employee_count, description\n",
-    "print(\"\ud83d\udd27 Building company profile vectors...\")\n",
-    "\n",
-    "def build_company_profile_text(row):\n",
-    "    \"\"\"Build text representation WITHOUT postings data\"\"\"\n",
-    "    parts = []\n",
-    "    \n",
-    "    if row.get('name'):\n",
-    "        parts.append(f\"Company: {row['name']}\")\n",
-    "    \n",
-    "    if row.get('description'):\n",
-    "        parts.append(f\"Description: {row['description']}\")\n",
-    "    \n",
-    "    if row.get('industries_list'):\n",
-    "        parts.append(f\"Industries: {row['industries_list']}\")\n",
-    "    \n",
-    "    if row.get('specialties_list'):\n",
-    "        parts.append(f\"Specialties: {row['specialties_list']}\")\n",
-    "    \n",
-    "    if row.get('employee_count'):\n",
-    "        parts.append(f\"Size: {row['employee_count']} employees\")\n",
-    "    \n",
-    "    return ' '.join(parts)\n",
-    "\n",
-    "# Generate profile embeddings\n",
-    "with_postings_profiles = companies_with_postings.apply(build_company_profile_text, axis=1).tolist()\n",
-    "without_postings_profiles = companies_without_postings.apply(build_company_profile_text, axis=1).tolist()\n",
-    "\n",
-    "print(f\"   Encoding {len(with_postings_profiles):,} companies WITH postings...\")\n",
-    "with_postings_embeddings = model.encode(\n",
-    "    with_postings_profiles,\n",
-    "    show_progress_bar=True,\n",
-    "    batch_size=32,\n",
-    "    normalize_embeddings=True\n",
-    ")\n",
-    "\n",
-    "print(f\"   Encoding {len(without_postings_profiles):,} companies WITHOUT postings...\")\n",
-    "without_postings_embeddings = model.encode(\n",
-    "    without_postings_profiles,\n",
-    "    show_progress_bar=True,\n",
-    "    batch_size=32,\n",
-    "    normalize_embeddings=True\n",
-    ")\n",
-    "\n",
-    "print(f\"\\n\u2705 Profile embeddings created!\")\n",
-    "print(f\"   Shape WITH: {with_postings_embeddings.shape}\")\n",
-    "print(f\"   Shape WITHOUT: {without_postings_embeddings.shape}\\n\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# STEP 3: Find Similar Companies & Inherit Skills\n",
-    "# ============================================================================\n",
-    "\n",
-    "print(\"\ud83d\udd0d Finding similar companies for skill inheritance...\\n\")\n",
-    "\n",
-    "# For each company WITHOUT postings, find top-K similar WITH postings\n",
-    "TOP_K_SIMILAR = 5  # Use top 5 similar companies\n",
-    "\n",
-    "print(f\"\ud83d\udcca Method: Top-{TOP_K_SIMILAR} Collaborative Filtering\\n\")\n",
-    "\n",
-    "inferred_skills = []\n",
-    "inferred_titles = []\n",
-    "inferred_levels = []\n",
-    "\n",
-    "# Calculate similarities (batch processing)\n",
-    "print(\"\u2699\ufe0f  Calculating company-to-company similarities...\")\n",
-    "similarities = cosine_similarity(without_postings_embeddings, with_postings_embeddings)\n",
-    "\n",
-    "print(f\"\u2705 Similarity matrix: {similarities.shape}\\n\")\n",
-    "print(f\"\ud83d\udd04 Inferring skills for {len(companies_without_postings):,} companies...\\n\")\n",
-    "\n",
-    "for i in range(len(companies_without_postings)):\n",
-    "    if i % 10000 == 0:\n",
-    "        print(f\"   Progress: {i:,}/{len(companies_without_postings):,}\")\n",
-    "    \n",
-    "    # Get top-K similar companies WITH postings\n",
-    "    top_k_indices = np.argsort(similarities[i])[::-1][:TOP_K_SIMILAR]\n",
-    "    \n",
-    "    # Collect skills from similar companies\n",
-    "    similar_skills = []\n",
-    "    similar_titles = []\n",
-    "    similar_levels = []\n",
-    "    \n",
-    "    for idx in top_k_indices:\n",
-    "        similar_company = companies_with_postings.iloc[idx]\n",
-    "        \n",
-    "        if similar_company.get('required_skills'):\n",
-    "            similar_skills.append(str(similar_company['required_skills']))\n",
-    "        \n",
-    "        if similar_company.get('posted_job_titles'):\n",
-    "            similar_titles.append(str(similar_company['posted_job_titles']))\n",
-    "        \n",
-    "        if similar_company.get('experience_levels'):\n",
-    "            similar_levels.append(str(similar_company['experience_levels']))\n",
-    "    \n",
-    "    # Aggregate (simple concatenation)\n",
-    "    inferred_skills.append(' | '.join(similar_skills) if similar_skills else '')\n",
-    "    inferred_titles.append(' | '.join(similar_titles) if similar_titles else '')\n",
-    "    inferred_levels.append(' | '.join(similar_levels) if similar_levels else '')\n",
-    "\n",
-    "print(f\"\\n\u2705 Skill inference complete!\\n\")\n",
-    "\n",
-    "# Add to companies_without_postings\n",
-    "companies_without_postings['required_skills'] = inferred_skills\n",
-    "companies_without_postings['posted_job_titles'] = inferred_titles\n",
-    "companies_without_postings['experience_levels'] = inferred_levels\n",
-    "\n",
-    "# Mark as inferred\n",
-    "companies_without_postings['skills_source'] = 'inferred_cf'\n",
-    "companies_with_postings['skills_source'] = 'actual_postings'\n",
-    "\n",
-    "print(f\"\ud83d\udcca RESULTS:\")\n",
-    "non_empty = sum(1 for s in inferred_skills if s != '')\n",
-    "print(f\"   Successfully inferred skills: {non_empty:,}/{len(inferred_skills):,} ({non_empty/len(inferred_skills)*100:.1f}%)\\n\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# STEP 4: Rebuild companies_full with INFERRED skills\n",
-    "# ============================================================================\n",
-    "\n",
-    "print(\"\ud83d\udd04 Rebuilding companies_full with inferred skills...\\n\")\n",
-    "\n",
-    "# Combine\n",
-    "companies_full_enhanced = pd.concat([\n",
-    "    companies_with_postings,\n",
-    "    companies_without_postings\n",
-    "], ignore_index=False).sort_index()\n",
-    "\n",
-    "print(f\"\u2705 Enhanced dataset created!\")\n",
-    "print(f\"   Total companies: {len(companies_full_enhanced):,}\")\n",
-    "print(f\"   With actual postings: {len(companies_with_postings):,}\")\n",
-    "print(f\"   With inferred skills: {len(companies_without_postings):,}\")\n",
-    "\n",
-    "# Verify\n",
-    "total_with_skills = companies_full_enhanced[companies_full_enhanced['required_skills'] != ''].shape[0]\n",
-    "print(f\"\\n\ud83d\udcc8 IMPROVEMENT:\")\n",
-    "print(f\"   BEFORE: {len(companies_with_postings):,} companies with skills ({len(companies_with_postings)/len(companies_full)*100:.1f}%)\")\n",
-    "print(f\"   AFTER: {total_with_skills:,} companies with skills ({total_with_skills/len(companies_full_enhanced)*100:.1f}%)\")\n",
-    "print(f\"   \ud83d\udcca Increase: +{total_with_skills - len(companies_with_postings):,} companies!\\n\")\n",
-    "\n",
-    "# Replace companies_full\n",
-    "companies_full = companies_full_enhanced\n",
-    "\n",
-    "print(f\"\u2705 companies_full updated with collaborative filtering!\\n\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# STEP 5: Regenerate Company Embeddings with INFERRED skills\n",
-    "# ============================================================================\n",
-    "\n",
-    "print(\"\ud83d\udd04 Regenerating company embeddings with inferred skills...\\n\")\n",
-    "\n",
-    "def build_company_text_enhanced(row):\n",
-    "    \"\"\"Build company text WITH inferred/actual skills\"\"\"\n",
-    "    parts = []\n",
-    "    \n",
-    "    if row.get('name'):\n",
-    "        parts.append(f\"Company: {row['name']}\")\n",
-    "    \n",
-    "    if row.get('description'):\n",
-    "        parts.append(f\"Description: {row['description']}\")\n",
-    "    \n",
-    "    if row.get('industries_list'):\n",
-    "        parts.append(f\"Industries: {row['industries_list']}\")\n",
-    "    \n",
-    "    if row.get('specialties_list'):\n",
-    "        parts.append(f\"Specialties: {row['specialties_list']}\")\n",
-    "    \n",
-    "    # NOW INCLUDES INFERRED SKILLS!\n",
-    "    if row.get('required_skills'):\n",
-    "        parts.append(f\"Required Skills: {row['required_skills']}\")\n",
-    "    \n",
-    "    if row.get('posted_job_titles'):\n",
-    "        parts.append(f\"Job Titles: {row['posted_job_titles']}\")\n",
-    "    \n",
-    "    if row.get('experience_levels'):\n",
-    "        parts.append(f\"Experience: {row['experience_levels']}\")\n",
-    "    \n",
-    "    return ' '.join(parts)\n",
-    "\n",
-    "# Build texts\n",
-    "company_texts_enhanced = companies_full.apply(build_company_text_enhanced, axis=1).tolist()\n",
-    "\n",
-    "print(f\"\ud83d\udcdd Encoding {len(company_texts_enhanced):,} enhanced company profiles...\\n\")\n",
-    "\n",
-    "comp_vectors_enhanced = model.encode(\n",
-    "    company_texts_enhanced,\n",
-    "    show_progress_bar=True,\n",
-    "    batch_size=32,\n",
-    "    normalize_embeddings=True\n",
-    ")\n",
-    "\n",
-    "print(f\"\\n\u2705 Enhanced embeddings created!\")\n",
-    "print(f\"   Shape: {comp_vectors_enhanced.shape}\")\n",
-    "\n",
-    "# Replace global comp_vectors\n",
-    "comp_vectors = comp_vectors_enhanced\n",
-    "\n",
-    "print(f\"\\n\ud83c\udfaf NOW candidates & companies speak the SAME LANGUAGE!\")\n",
-    "print(f\"   All companies have skill information (actual or inferred)\")\n",
-    "print(f\"   Ready for matching!\\n\")\n",
-    "\n",
-    "# Save\n",
-    "np.save(f'{Config.PROCESSED_PATH}company_embeddings_cf_enhanced.npy', comp_vectors)\n",
-    "print(f\"\ud83d\udcbe Saved: company_embeddings_cf_enhanced.npy\\n\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### \ud83d\udd0d Example: Check Inferred Skills"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# EXAMPLE: See skill inference in action\n",
-    "# ============================================================================\n",
-    "\n",
-    "print(\"\ud83d\udd0d COLLABORATIVE FILTERING EXAMPLE\")\n",
-    "print(\"=\" * 80)\n",
-    "\n",
-    "# Find a company that got inferred skills\n",
-    "inferred_companies = companies_full[companies_full['skills_source'] == 'inferred_cf']\n",
-    "\n",
-    "if len(inferred_companies) > 0:\n",
-    "    example = inferred_companies.iloc[0]\n",
-    "    \n",
-    "    print(f\"\\n\ud83d\udccb COMPANY (skills were INFERRED):\")\n",
-    "    print(f\"   Name: {example.get('name', 'N/A')}\")\n",
-    "    print(f\"   Industries: {str(example.get('industries_list', 'N/A'))[:100]}\")\n",
-    "    print(f\"   Specialties: {str(example.get('specialties_list', 'N/A'))[:100]}\")\n",
-    "    print(f\"\\n   \ud83c\udfaf INFERRED Required Skills:\")\n",
-    "    print(f\"      {str(example.get('required_skills', 'N/A'))[:200]}\")\n",
-    "    print(f\"\\n   \ud83d\udcbc INFERRED Job Titles:\")\n",
-    "    print(f\"      {str(example.get('posted_job_titles', 'N/A'))[:200]}\")\n",
-    "    \n",
-    "    print(f\"\\n\ud83d\udca1 These skills were inherited from similar companies!\")\n",
-    "else:\n",
-    "    print(\"\\n\u26a0\ufe0f  No inferred companies found\")\n",
-    "\n",
-    "print(\"\\n\" + \"=\" * 80)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "## \ud83e\udde0 Step 8: Generate OR Load Embeddings\n",
-    "\n",
-    "**Smart pipeline:**\n",
-    "- First run: Generate embeddings (slow ~5 min)\n",
-    "- Subsequent runs: Load from file (fast <5 sec)\n",
-    "\n",
-    "**CRITICAL:** Embeddings must be generated AFTER deduplication so that each embedding row stays aligned with its dataset row!"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# EMBEDDING GENERATION + SAVE/LOAD PIPELINE\n",
-    "# ============================================================================\n",
-    "# Prepares the cache directory and file paths, reports which embedding caches\n",
-    "# already exist on disk, and loads the sentence-embedding model.\n",
-    "\n",
-    "import os\n",
-    "from pathlib import Path\n",
-    "\n",
-    "print(\"\ud83e\udde0 EMBEDDING PIPELINE\")\n",
-    "print(\"=\" * 80)\n",
-    "print()\n",
-    "\n",
-    "# Ensure processed directory exists\n",
-    "Path(Config.PROCESSED_PATH).mkdir(parents=True, exist_ok=True)\n",
-    "\n",
-    "# Define file paths. os.path.join only inserts a separator when\n",
-    "# Config.PROCESSED_PATH lacks a trailing one (or is a Path object), so the\n",
-    "# files always land inside the processed directory instead of next to it\n",
-    "# with a fused name such as 'processedcandidate_embeddings.npy'.\n",
-    "CAND_EMBEDDINGS_FILE = os.path.join(Config.PROCESSED_PATH, 'candidate_embeddings.npy')\n",
-    "COMP_EMBEDDINGS_FILE = os.path.join(Config.PROCESSED_PATH, 'company_embeddings.npy')\n",
-    "CAND_METADATA_FILE = os.path.join(Config.PROCESSED_PATH, 'candidates_metadata.pkl')\n",
-    "COMP_METADATA_FILE = os.path.join(Config.PROCESSED_PATH, 'companies_metadata.pkl')\n",
-    "\n",
-    "# Check if embeddings already exist (later cells flip these flags to force\n",
-    "# regeneration when the cached row count does not match the dataset)\n",
-    "cand_exists = os.path.exists(CAND_EMBEDDINGS_FILE)\n",
-    "comp_exists = os.path.exists(COMP_EMBEDDINGS_FILE)\n",
-    "\n",
-    "print(f\"\ud83d\udcc1 Checking for existing embeddings...\")\n",
-    "print(f\"   Candidates: {'\u2705 Found' if cand_exists else '\u274c Not found'}\")\n",
-    "print(f\"   Companies: {'\u2705 Found' if comp_exists else '\u274c Not found'}\")\n",
-    "print()\n",
-    "\n",
-    "# Load model (needed by the generation branches of the following cells)\n",
-    "print(f\"\ud83d\udd27 Loading embedding model: {Config.EMBEDDING_MODEL}\")\n",
-    "model = SentenceTransformer(Config.EMBEDDING_MODEL)\n",
-    "embedding_dim = model.get_sentence_embedding_dimension()\n",
-    "print(f\"\u2705 Model loaded! Dimension: {embedding_dim}\\n\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# CANDIDATE EMBEDDINGS - Generate or Load\n",
-    "# ============================================================================\n",
-    "# Reuses the cached .npy matrix when its row count matches `candidates`;\n",
-    "# otherwise encodes every candidate row and rewrites the cache files.\n",
-    "\n",
-    "if cand_exists:\n",
-    "    print(\"\ud83d\udce5 LOADING candidate embeddings from file...\")\n",
-    "    cand_vectors = np.load(CAND_EMBEDDINGS_FILE)\n",
-    "    print(f\"\u2705 Loaded: {cand_vectors.shape}\")\n",
-    "    \n",
-    "    # Verify alignment (row count only; content changes are not detected)\n",
-    "    if len(cand_vectors) != len(candidates):\n",
-    "        print(f\"\\n\u26a0\ufe0f  WARNING: Size mismatch!\")\n",
-    "        print(f\"   Embeddings: {len(cand_vectors):,}\")\n",
-    "        print(f\"   Dataset: {len(candidates):,}\")\n",
-    "        print(f\"\\n\ud83d\udd04 Regenerating...\")\n",
-    "        cand_exists = False\n",
-    "\n",
-    "if not cand_exists:\n",
-    "    print(\"\ud83d\udd04 GENERATING candidate embeddings...\")\n",
-    "    print(f\"   Processing {len(candidates):,} candidates...\\n\")\n",
-    "    \n",
-    "    # Build text representations\n",
-    "    def build_candidate_text(row):\n",
-    "        \"\"\"Join the populated profile fields of one candidate row into a labelled string.\"\"\"\n",
-    "        labelled_columns = [\n",
-    "            ('Category', 'Job Category'),\n",
-    "            ('skills', 'Skills'),\n",
-    "            ('career_objective', 'Objective'),\n",
-    "            ('degree_names', 'Education'),\n",
-    "            ('positions', 'Experience'),\n",
-    "        ]\n",
-    "        parts = []\n",
-    "        for column, label in labelled_columns:\n",
-    "            value = row.get(column)\n",
-    "            # NaN is truthy, so a bare `if value:` would embed the literal\n",
-    "            # text 'nan' for missing cells; skip None/NaN explicitly.\n",
-    "            if value is None or (isinstance(value, float) and value != value):\n",
-    "                continue\n",
-    "            if hasattr(value, '__len__'):\n",
-    "                # Strings, lists and arrays: skip when empty. len() also avoids\n",
-    "                # the ambiguous-truth-value error numpy arrays raise in `if`.\n",
-    "                if len(value) == 0:\n",
-    "                    continue\n",
-    "            elif not value:\n",
-    "                continue\n",
-    "            parts.append(f\"{label}: {value}\")\n",
-    "        return ' '.join(parts)\n",
-    "    \n",
-    "    candidate_texts = candidates.apply(build_candidate_text, axis=1).tolist()\n",
-    "    \n",
-    "    # Generate embeddings\n",
-    "    cand_vectors = model.encode(\n",
-    "        candidate_texts,\n",
-    "        show_progress_bar=True,\n",
-    "        batch_size=32,\n",
-    "        normalize_embeddings=True,\n",
-    "        convert_to_numpy=True\n",
-    "    )\n",
-    "    \n",
-    "    # Save\n",
-    "    np.save(CAND_EMBEDDINGS_FILE, cand_vectors)\n",
-    "    candidates.to_pickle(CAND_METADATA_FILE)\n",
-    "    \n",
-    "    print(f\"\\n\ud83d\udcbe Saved:\")\n",
-    "    print(f\"   {CAND_EMBEDDINGS_FILE}\")\n",
-    "    print(f\"   {CAND_METADATA_FILE}\")\n",
-    "\n",
-    "print(f\"\\n\u2705 CANDIDATE EMBEDDINGS READY\")\n",
-    "print(f\"   Shape: {cand_vectors.shape}\")\n",
-    "print(f\"   Dataset size: {len(candidates):,}\")\n",
-    "print(f\"   Alignment: {'\u2705 PERFECT' if len(cand_vectors) == len(candidates) else '\u274c MISMATCH'}\\n\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# COMPANY EMBEDDINGS - Generate or Load\n",
-    "# ============================================================================\n",
-    "# Reuses the cached .npy matrix when its row count matches `companies_full`;\n",
-    "# otherwise encodes every company row and rewrites the cache files.\n",
-    "\n",
-    "if comp_exists:\n",
-    "    print(\"\ud83d\udce5 LOADING company embeddings from file...\")\n",
-    "    comp_vectors = np.load(COMP_EMBEDDINGS_FILE)\n",
-    "    print(f\"\u2705 Loaded: {comp_vectors.shape}\")\n",
-    "    \n",
-    "    # Verify alignment (row count only; content changes are not detected)\n",
-    "    if len(comp_vectors) != len(companies_full):\n",
-    "        print(f\"\\n\u26a0\ufe0f  WARNING: Size mismatch!\")\n",
-    "        print(f\"   Embeddings: {len(comp_vectors):,}\")\n",
-    "        print(f\"   Dataset: {len(companies_full):,}\")\n",
-    "        print(f\"\\n\ud83d\udd04 Regenerating...\")\n",
-    "        comp_exists = False\n",
-    "\n",
-    "if not comp_exists:\n",
-    "    print(\"\ud83d\udd04 GENERATING company embeddings...\")\n",
-    "    print(f\"   Processing {len(companies_full):,} companies...\")\n",
-    "    print(f\"   IMPORTANT: Generated AFTER deduplication for alignment!\\n\")\n",
-    "    \n",
-    "    # Build text representations\n",
-    "    def build_company_text(row):\n",
-    "        \"\"\"Join the populated profile and job-posting fields of one company row into a labelled string.\"\"\"\n",
-    "        labelled_columns = [\n",
-    "            ('name', 'Company'),\n",
-    "            ('description', 'Description'),\n",
-    "            ('industries_list', 'Industries'),\n",
-    "            ('specialties_list', 'Specialties'),\n",
-    "            # Include job postings data (THE BRIDGE!)\n",
-    "            ('required_skills', 'Required Skills'),\n",
-    "            ('posted_job_titles', 'Job Titles'),\n",
-    "            ('experience_levels', 'Experience Levels'),\n",
-    "        ]\n",
-    "        parts = []\n",
-    "        for column, label in labelled_columns:\n",
-    "            value = row.get(column)\n",
-    "            # NaN is truthy, so a bare `if value:` would embed the literal\n",
-    "            # text 'nan' for missing cells; skip None/NaN explicitly.\n",
-    "            if value is None or (isinstance(value, float) and value != value):\n",
-    "                continue\n",
-    "            if hasattr(value, '__len__'):\n",
-    "                # Strings, lists and arrays: skip when empty. len() also avoids\n",
-    "                # the ambiguous-truth-value error numpy arrays raise in `if`.\n",
-    "                if len(value) == 0:\n",
-    "                    continue\n",
-    "            elif not value:\n",
-    "                continue\n",
-    "            parts.append(f\"{label}: {value}\")\n",
-    "        return ' '.join(parts)\n",
-    "    \n",
-    "    company_texts = companies_full.apply(build_company_text, axis=1).tolist()\n",
-    "    \n",
-    "    # Generate embeddings\n",
-    "    comp_vectors = model.encode(\n",
-    "        company_texts,\n",
-    "        show_progress_bar=True,\n",
-    "        batch_size=32,\n",
-    "        normalize_embeddings=True,\n",
-    "        convert_to_numpy=True\n",
-    "    )\n",
-    "    \n",
-    "    # Save\n",
-    "    np.save(COMP_EMBEDDINGS_FILE, comp_vectors)\n",
-    "    companies_full.to_pickle(COMP_METADATA_FILE)\n",
-    "    \n",
-    "    print(f\"\\n\ud83d\udcbe Saved:\")\n",
-    "    print(f\"   {COMP_EMBEDDINGS_FILE}\")\n",
-    "    print(f\"   {COMP_METADATA_FILE}\")\n",
-    "\n",
-    "print(f\"\\n\u2705 COMPANY EMBEDDINGS READY\")\n",
-    "print(f\"   Shape: {comp_vectors.shape}\")\n",
-    "print(f\"   Dataset size: {len(companies_full):,}\")\n",
-    "print(f\"   Alignment: {'\u2705 PERFECT' if len(comp_vectors) == len(companies_full) else '\u274c MISMATCH'}\\n\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ============================================================================\n",
-    "# FINAL VERIFICATION\n",
-    "# ============================================================================\n",
-    "# A dataset is aligned when it has exactly one embedding vector per row.\n",
-    "\n",
-    "candidates_aligned = len(candidates) == len(cand_vectors)\n",
-    "companies_aligned = len(companies_full) == len(comp_vectors)\n",
-    "candidates_status = '\u2705 ALIGNED' if candidates_aligned else '\u274c MISALIGNED'\n",
-    "companies_status = '\u2705 ALIGNED' if companies_aligned else '\u274c MISALIGNED'\n",
-    "\n",
-    "print(\"\ud83d\udd0d FINAL ALIGNMENT CHECK\")\n",
-    "print(\"=\" * 80)\n",
-    "print()\n",
-    "\n",
-    "# Per-dataset report: row count, vector count, verdict\n",
-    "print(f\"\ud83d\udcca CANDIDATES:\")\n",
-    "print(f\"   Dataset rows: {len(candidates):,}\")\n",
-    "print(f\"   Embedding vectors: {len(cand_vectors):,}\")\n",
-    "print(f\"   Status: {candidates_status}\")\n",
-    "print()\n",
-    "\n",
-    "print(f\"\ud83d\udcca COMPANIES:\")\n",
-    "print(f\"   Dataset rows: {len(companies_full):,}\")\n",
-    "print(f\"   Embedding vectors: {len(comp_vectors):,}\")\n",
-    "print(f\"   Status: {companies_status}\")\n",
-    "print()\n",
-    "\n",
-    "# Overall verdict: both datasets must be aligned\n",
-    "if not (candidates_aligned and companies_aligned):\n",
-    "    print(\"\u26a0\ufe0f  ALIGNMENT ISSUE DETECTED\")\n",
-    "    print(\"   Delete .npy files and regenerate\")\n",
-    "else:\n",
-    "    print(\"\ud83c\udfaf PERFECT ALIGNMENT! Ready for matching!\")\n",
-    "    print(\"\\n\ud83d\udca1 Next runs will LOAD embeddings (fast!)\")\n",
-    "\n",
-    "print(\"\\n\" + \"=\" * 80)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file