# %% [markdown]
# # HRHUB v2.1 - Enhanced with LLM (FREE VERSION)
#
# ## Project Overview
#
# **Bilateral HR Matching System with LLM-Powered Intelligence**
#
# ### What's New in v2.1:
# - ✅ **FREE LLM**: Using Hugging Face Inference API (no cost)
# - ✅ **Job Level Classification**: Zero-shot & few-shot learning
# - ✅ **Structured Skills Extraction**: Pydantic schemas
# - ✅ **Match Explainability**: LLM-generated reasoning
# - ✅ **Flexible Data Loading**: Upload OR Google Drive
#
# ### Tech Stack:
# ```
# Embeddings: sentence-transformers (local, free)
# LLM: Hugging Face Inference API (free tier)
# Schemas: Pydantic
# Platform: Google Colab → VS Code
# ```
#
# ---
#
# **Master's Thesis - Aalborg University**
# *Business Data Science Program*
# *December 2025*
# %% [markdown]
# ---
# ## ๐ Step 1: Install Dependencies
# %%
# Install required packages
#!pip install -q sentence-transformers huggingface-hub pydantic plotly pyvis nbformat scikit-learn pandas numpy
print("โ
All packages installed!")
# %% [markdown]
# ---
# ## ๐ Step 2: Import Libraries
# %%
import pandas as pd
import numpy as np
import json
import os
from typing import List, Dict, Optional, Literal
import warnings
warnings.filterwarnings('ignore')
# ML & NLP
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# LLM Integration (FREE)
from huggingface_hub import InferenceClient
from pydantic import BaseModel, Field
# Visualization
import plotly.graph_objects as go
from IPython.display import HTML, display
# Configuration Settings
from dotenv import load_dotenv
# Load environment variables from .env
load_dotenv()
print("✅ Environment variables loaded from .env")
print("✅ All libraries imported!")
# %% [markdown]
# ---
# ## ๐ Step 3: Configuration
# %%
class Config:
"""Centralized configuration for VS Code"""
# Paths - VS Code structure
CSV_PATH = '../csv_files/'
PROCESSED_PATH = '../processed/'
RESULTS_PATH = '../results/'
# Embedding Model
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# LLM Settings (FREE - Hugging Face)
HF_TOKEN = os.getenv('HF_TOKEN', '')  # read from .env
LLM_MODEL = 'meta-llama/Llama-3.2-3B-Instruct'
LLM_MAX_TOKENS = 1000
# Matching Parameters
TOP_K_MATCHES = 10
SIMILARITY_THRESHOLD = 0.5
RANDOM_SEED = 42
np.random.seed(Config.RANDOM_SEED)
print("โ
Configuration loaded!")
print(f"๐ง Embedding model: {Config.EMBEDDING_MODEL}")
print(f"๐ค LLM model: {Config.LLM_MODEL}")
print(f"๐ HF Token configured: {'Yes โ
' if Config.HF_TOKEN else 'No โ ๏ธ'}")
print(f"๐ Data path: {Config.CSV_PATH}")
# %% [markdown]
# ---
# ## ๐๏ธ Step 4: Architecture - Text Builders
#
# **HIGH COHESION:** Each class has ONE responsibility
# **LOW COUPLING:** Classes don't depend on each other
# %%
# ============================================================================
# TEXT BUILDER CLASSES - Single Responsibility Principle
# ============================================================================
from abc import ABC, abstractmethod
from typing import List
class TextBuilder(ABC):
"""Abstract base class for text builders"""
@abstractmethod
def build(self, row: pd.Series) -> str:
"""Build text representation from DataFrame row"""
pass
def build_batch(self, df: pd.DataFrame) -> List[str]:
"""Build text representations for entire DataFrame"""
return df.apply(self.build, axis=1).tolist()
class CandidateTextBuilder(TextBuilder):
"""Builds text representation for candidates"""
def __init__(self, fields: List[str] = None):
self.fields = fields or [
'Category',
'skills',
'career_objective',
'degree_names',
'positions'
]
def build(self, row: pd.Series) -> str:
parts = []
if row.get('Category'):
parts.append(f"Job Category: {row['Category']}")
if row.get('skills'):
parts.append(f"Skills: {row['skills']}")
if row.get('career_objective'):
parts.append(f"Objective: {row['career_objective']}")
if row.get('degree_names'):
parts.append(f"Education: {row['degree_names']}")
if row.get('positions'):
parts.append(f"Experience: {row['positions']}")
return ' '.join(parts)
class CompanyTextBuilder(TextBuilder):
"""Builds text representation for companies"""
def __init__(self, include_postings: bool = True):
self.include_postings = include_postings
def build(self, row: pd.Series) -> str:
parts = []
if row.get('name'):
parts.append(f"Company: {row['name']}")
if row.get('description'):
parts.append(f"Description: {row['description']}")
if row.get('industries_list'):
parts.append(f"Industries: {row['industries_list']}")
if row.get('specialties_list'):
parts.append(f"Specialties: {row['specialties_list']}")
# Include job postings data (THE BRIDGE!)
if self.include_postings:
if row.get('required_skills'):
parts.append(f"Required Skills: {row['required_skills']}")
if row.get('posted_job_titles'):
parts.append(f"Job Titles: {row['posted_job_titles']}")
if row.get('experience_levels'):
parts.append(f"Experience: {row['experience_levels']}")
return ' '.join(parts)
print("โ
Text Builder classes loaded")
print(" โข CandidateTextBuilder")
print(" โข CompanyTextBuilder")
# %% [markdown]
# ---
# ## ๐๏ธ Step 5: Architecture - Embedding Manager
#
# **Responsibility:** Generate, save, and load embeddings
# %%
# ============================================================================
# EMBEDDING MANAGER - Handles all embedding operations
# ============================================================================
from pathlib import Path
from typing import Tuple, Optional
class EmbeddingManager:
"""Manages embedding generation, saving, and loading"""
def __init__(self, model: SentenceTransformer, save_dir: str):
self.model = model
self.save_dir = Path(save_dir)
self.save_dir.mkdir(parents=True, exist_ok=True)
def _get_file_paths(self, entity_type: str) -> Tuple[Path, Path]:
"""Get file paths for embeddings and metadata"""
emb_file = self.save_dir / f"{entity_type}_embeddings.npy"
meta_file = self.save_dir / f"{entity_type}_metadata.pkl"
return emb_file, meta_file
def exists(self, entity_type: str) -> bool:
"""Check if embeddings exist for entity type"""
emb_file, _ = self._get_file_paths(entity_type)
return emb_file.exists()
def load(self, entity_type: str) -> Tuple[np.ndarray, pd.DataFrame]:
"""Load embeddings and metadata"""
emb_file, meta_file = self._get_file_paths(entity_type)
if not emb_file.exists():
raise FileNotFoundError(f"Embeddings not found: {emb_file}")
embeddings = np.load(emb_file)
metadata = pd.read_pickle(meta_file) if meta_file.exists() else None
return embeddings, metadata
def generate(self,
texts: List[str],
batch_size: int = 32,
show_progress: bool = True) -> np.ndarray:
"""Generate embeddings from texts"""
return self.model.encode(
texts,
batch_size=batch_size,
show_progress_bar=show_progress,
normalize_embeddings=True,
convert_to_numpy=True
)
def save(self,
entity_type: str,
embeddings: np.ndarray,
metadata: pd.DataFrame) -> None:
"""Save embeddings and metadata"""
emb_file, meta_file = self._get_file_paths(entity_type)
np.save(emb_file, embeddings)
metadata.to_pickle(meta_file)
print(f"๐พ Saved:")
print(f" {emb_file}")
print(f" {meta_file}")
def generate_and_save(self,
entity_type: str,
texts: List[str],
metadata: pd.DataFrame,
batch_size: int = 32) -> np.ndarray:
"""Generate embeddings and save everything"""
print(f"๐ Generating {entity_type} embeddings...")
print(f" Processing {len(texts):,} items...")
embeddings = self.generate(texts, batch_size=batch_size)
self.save(entity_type, embeddings, metadata)
return embeddings
def load_or_generate(self,
entity_type: str,
texts: List[str],
metadata: pd.DataFrame,
force_regenerate: bool = False) -> Tuple[np.ndarray, pd.DataFrame]:
"""Load if exists, generate otherwise"""
if not force_regenerate and self.exists(entity_type):
print(f"๐ฅ Loading {entity_type} embeddings...")
embeddings, saved_metadata = self.load(entity_type)
# Verify alignment
if len(embeddings) != len(metadata):
print(f"โ ๏ธ Size mismatch! Regenerating...")
embeddings = self.generate_and_save(
entity_type, texts, metadata
)
else:
print(f"โ
Loaded: {embeddings.shape}")
else:
embeddings = self.generate_and_save(
entity_type, texts, metadata
)
return embeddings, metadata
print("โ
EmbeddingManager class loaded")
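# %% [markdown]
# Intended call flow for `EmbeddingManager` (illustrative only, shown commented out because `model`, `candidates`, and `companies_full` are loaded in later steps):
# %%
# manager = EmbeddingManager(model=model, save_dir=Config.PROCESSED_PATH)
# cand_texts = CandidateTextBuilder().build_batch(candidates)
# cand_vecs, cand_meta = manager.load_or_generate('candidate', cand_texts, candidates)
# comp_texts = CompanyTextBuilder().build_batch(companies_full)
# comp_vecs, comp_meta = manager.load_or_generate('company', comp_texts, companies_full)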
# %% [markdown]
# ---
# ## ๐๏ธ Step 6: Architecture - Matching Engine
#
# **Responsibility:** Calculate similarities and find matches
# %%
# ============================================================================
# MATCHING ENGINE - Handles similarity calculations
# ============================================================================
class MatchingEngine:
"""Calculates similarities and finds top matches"""
def __init__(self,
candidate_vectors: np.ndarray,
company_vectors: np.ndarray,
candidate_metadata: pd.DataFrame,
company_metadata: pd.DataFrame):
self.cand_vectors = candidate_vectors
self.comp_vectors = company_vectors
self.cand_metadata = candidate_metadata
self.comp_metadata = company_metadata
# Verify alignment
assert len(candidate_vectors) == len(candidate_metadata), \
"Candidate embeddings and metadata size mismatch"
assert len(company_vectors) == len(company_metadata), \
"Company embeddings and metadata size mismatch"
def find_matches(self,
candidate_idx: int,
top_k: int = 10) -> List[Tuple[int, float]]:
"""Find top K company matches for a candidate"""
if candidate_idx >= len(self.cand_vectors):
raise IndexError(f"Candidate index {candidate_idx} out of range")
# Get candidate vector
cand_vec = self.cand_vectors[candidate_idx].reshape(1, -1)
# Calculate similarities
similarities = cosine_similarity(cand_vec, self.comp_vectors)[0]
# Get top K
top_indices = np.argsort(similarities)[::-1][:top_k]
# Return (index, score) tuples
return [(int(idx), float(similarities[idx])) for idx in top_indices]
def get_match_details(self,
candidate_idx: int,
company_idx: int) -> dict:
"""Get detailed match information"""
candidate = self.cand_metadata.iloc[candidate_idx]
company = self.comp_metadata.iloc[company_idx]
# Calculate similarity
cand_vec = self.cand_vectors[candidate_idx].reshape(1, -1)
comp_vec = self.comp_vectors[company_idx].reshape(1, -1)
similarity = float(cosine_similarity(cand_vec, comp_vec)[0][0])
return {
'candidate': candidate.to_dict(),
'company': company.to_dict(),
'similarity_score': similarity
}
def batch_match(self,
candidate_indices: List[int],
top_k: int = 10) -> dict:
"""Find matches for multiple candidates"""
results = {}
for idx in candidate_indices:
results[idx] = self.find_matches(idx, top_k=top_k)
return results
print("โ
MatchingEngine class loaded")
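# %% [markdown]
# Intended usage of `MatchingEngine` once embeddings and metadata are in memory (illustrative only; the cells below use the standalone `find_top_matches` helper instead):
# %%
# engine = MatchingEngine(cand_vectors, comp_vectors, candidates, companies_full)
# top = engine.find_matches(candidate_idx=0, top_k=5)      # [(company_idx, score), ...]
# detail = engine.get_match_details(0, top[0][0])          # dict with candidate, company, similarity_score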
# %% [markdown]
# ---
# ## ๐ Step 7: Load All Datasets
# %%
print("๐ Loading all datasets...\n")
print("=" * 70)
# Load main datasets
candidates = pd.read_csv(f'{Config.CSV_PATH}resume_data.csv')
print(f"✅ Candidates: {len(candidates):,} rows × {len(candidates.columns)} columns")
companies_base = pd.read_csv(f'{Config.CSV_PATH}companies.csv')
print(f"✅ Companies (base): {len(companies_base):,} rows")
company_industries = pd.read_csv(f'{Config.CSV_PATH}company_industries.csv')
print(f"✅ Company industries: {len(company_industries):,} rows")
company_specialties = pd.read_csv(f'{Config.CSV_PATH}company_specialities.csv')
print(f"✅ Company specialties: {len(company_specialties):,} rows")
employee_counts = pd.read_csv(f'{Config.CSV_PATH}employee_counts.csv')
print(f"✅ Employee counts: {len(employee_counts):,} rows")
postings = pd.read_csv(f'{Config.CSV_PATH}postings.csv', on_bad_lines='skip', engine='python')
print(f"✅ Postings: {len(postings):,} rows × {len(postings.columns)} columns")
# Optional datasets
try:
    job_skills = pd.read_csv(f'{Config.CSV_PATH}job_skills.csv')
    print(f"✅ Job skills: {len(job_skills):,} rows")
except FileNotFoundError:
    job_skills = None
    print("⚠️ Job skills not found (optional)")
try:
    job_industries = pd.read_csv(f'{Config.CSV_PATH}job_industries.csv')
    print(f"✅ Job industries: {len(job_industries):,} rows")
except FileNotFoundError:
    job_industries = None
    print("⚠️ Job industries not found (optional)")
print("\n" + "=" * 70)
print("✅ All datasets loaded successfully!\n")
# %% [markdown]
# ---
# ## ๐ Step 8: Merge & Enrich Company Data
# %%
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
# CELL 8: Merge & Enrich Company Data + Empty Columns Validation
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
print("๐ ENRICHING COMPANY DATA...")
print("=" * 80)
# ============================================================================
# STEP 1: Aggregate Industries per Company
# ============================================================================
print("\n1๏ธโฃ Aggregating industries...")
industries_grouped = company_industries.groupby('company_id')['industry'].apply(
lambda x: ', '.join(x.dropna().astype(str).unique())
).reset_index()
industries_grouped.columns = ['company_id', 'industries_list']
print(f"โ
Industries aggregated: {len(industries_grouped):,} companies")
# ============================================================================
# STEP 2: Aggregate Specialties per Company
# ============================================================================
print("\n2๏ธโฃ Aggregating specialties...")
specialties_grouped = company_specialties.groupby('company_id')['speciality'].apply(
lambda x: ', '.join(x.dropna().astype(str).unique())
).reset_index()
specialties_grouped.columns = ['company_id', 'specialties_list']
print(f"โ
Specialties aggregated: {len(specialties_grouped):,} companies")
# ============================================================================
# STEP 3: Aggregate Skills from Job Postings
# ============================================================================
print("\n3๏ธโฃ Aggregating job posting skills...")
if job_skills is not None:
skills_df = pd.read_csv(f'{Config.CSV_PATH}skills.csv')
job_skills_enriched = job_skills.merge(
skills_df,
on='skill_abr',
how='left'
)
skills_per_posting = job_skills_enriched.groupby('job_id')['skill_name'].apply(
lambda x: ', '.join(x.dropna().astype(str).unique())
).reset_index()
skills_per_posting.columns = ['job_id', 'required_skills']
print(f"โ
Skills aggregated: {len(skills_per_posting):,} job postings")
else:
skills_per_posting = pd.DataFrame(columns=['job_id', 'required_skills'])
print("โ ๏ธ Job skills not available")
# ============================================================================
# STEP 4: Aggregate Job Posting Data per Company
# ============================================================================
print("\n4๏ธโฃ Aggregating job postings...")
postings_enriched = postings.merge(skills_per_posting, on='job_id', how='left')
job_data_grouped = postings_enriched.groupby('company_id').agg({
'title': lambda x: ', '.join(x.dropna().astype(str).unique()[:10]),
'required_skills': lambda x: ', '.join(x.dropna().astype(str).unique()),
'med_salary': 'mean',
'max_salary': 'mean',
'job_id': 'count'
}).reset_index()
job_data_grouped.columns = [
'company_id', 'posted_job_titles', 'required_skills',
'avg_med_salary', 'avg_max_salary', 'total_postings'
]
print(f"โ
Job data aggregated: {len(job_data_grouped):,} companies")
# ============================================================================
# STEP 5: Merge Everything
# ============================================================================
print("\n5๏ธโฃ Merging all data...")
companies_full = companies_base.copy()
companies_full = companies_full.merge(industries_grouped, on='company_id', how='left')
companies_full = companies_full.merge(specialties_grouped, on='company_id', how='left')
companies_full = companies_full.merge(job_data_grouped, on='company_id', how='left')
print(f"โ
Shape: {companies_full.shape}")
# ============================================================================
# STEP 6: Fill Empty Columns
# ============================================================================
print("\n6๏ธโฃ Filling nulls...")
fill_values = {
'name': 'Unknown Company',
'description': 'No description',
'industries_list': 'General',
'specialties_list': 'Not specified',
'required_skills': 'Not specified',
'posted_job_titles': 'Various',
'avg_med_salary': 0,
'avg_max_salary': 0,
'total_postings': 0
}
for col, val in fill_values.items():
if col in companies_full.columns:
before = companies_full[col].isna().sum()
companies_full[col] = companies_full[col].fillna(val)
if before > 0:
print(f" โ
{col:25s} {before:>6,} โ 0")
# ============================================================================
# STEP 7: Validation
# ============================================================================
print("\n7๏ธโฃ Validation...")
print("=" * 80)
critical = ['name', 'description', 'industries_list', 'specialties_list',
'required_skills', 'posted_job_titles']
ok = True
for col in critical:
if col in companies_full.columns:
issues = companies_full[col].isna().sum() + (companies_full[col] == '').sum()
print(f"{'โ
' if issues == 0 else 'โ'} {col:25s} {issues} issues")
if issues > 0:
ok = False
print("=" * 80)
print(f"{'๐ฏ PERFECT!' if ok else 'โ ๏ธ ISSUES!'}")
print(f"\nTotal: {len(companies_full):,}")
print(f"With postings: {(companies_full['total_postings'] > 0).sum():,}")
# %%
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
# CELL 9: Fill Missing Required Skills via Keyword Matching
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
print("๐ FILLING MISSING REQUIRED SKILLS...")
print("=" * 80)
# Load skills reference
skills_ref = pd.read_csv(f'{Config.CSV_PATH}skills.csv')
skill_names = set(skills_ref['skill_name'].str.lower().unique())
print(f"โ
Loaded {len(skill_names):,} unique skills")
# Find companies with empty required_skills
empty_mask = (companies_full['required_skills'] == 'Not specified') | \
(companies_full['required_skills'].isna())
empty_count = empty_mask.sum()
print(f"๐ Found {empty_count:,} companies with missing skills")
if empty_count > 0:
print(f"\n๐ Extracting skills from job postings text...")
# Get postings for companies with empty skills
empty_companies = companies_full[empty_mask]['company_id'].tolist()
relevant_postings = postings[postings['company_id'].isin(empty_companies)].copy()
print(f" Processing {len(relevant_postings):,} job postings...")
# Extract skills from description
def extract_skills_from_text(text):
if pd.isna(text):
return []
text_lower = str(text).lower()
found_skills = []
for skill in skill_names:
if skill in text_lower:
found_skills.append(skill)
return found_skills
# Extract from description column
relevant_postings['extracted_skills'] = relevant_postings['description'].apply(extract_skills_from_text)
# Aggregate by company
skills_extracted = relevant_postings.groupby('company_id')['extracted_skills'].apply(
lambda x: ', '.join(set([skill for sublist in x for skill in sublist]))
).reset_index()
skills_extracted.columns = ['company_id', 'extracted_skills']
# Update companies_full
for idx, row in skills_extracted.iterrows():
comp_id = row['company_id']
extracted = row['extracted_skills']
if extracted: # Only update if we found skills
mask = companies_full['company_id'] == comp_id
companies_full.loc[mask, 'required_skills'] = extracted
# Final check
still_empty = ((companies_full['required_skills'] == 'Not specified') |
(companies_full['required_skills'].isna())).sum()
filled = empty_count - still_empty
print(f"\nโ
RESULTS:")
print(f" Filled: {filled:,} companies")
print(f" Still empty: {still_empty:,} companies")
print(f" Success rate: {(filled/empty_count*100):.1f}%")
else:
print("โ
No missing skills to fill!")
print("\n" + "=" * 80)
# %%
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
# VALIDATION: Check Job Posting Enrichment
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
print("๐ VALIDATING JOB POSTING ENRICHMENT...")
print("=" * 80)
# Stats
print(f"\n๐ COVERAGE:")
print(f" Total companies: {len(companies_full):,}")
print(f" With postings: {(companies_full['total_postings'] > 0).sum():,}")
print(f" Without postings: {(companies_full['total_postings'] == 0).sum():,}")
print(f" Coverage: {(companies_full['total_postings'] > 0).sum() / len(companies_full) * 100:.1f}%")
# Sample companies
sample = companies_full.sample(5, random_state=42)
print("\n๐ SAMPLE COMPANIES (random 5):")
print("-" * 80)
for idx, row in sample.iterrows():
print(f"\n๐ข {row['name']}")
print(f" Total Postings: {row['total_postings']}")
print(f" Industries: {str(row['industries_list'])[:80]}...")
print(f" Required Skills: {str(row['required_skills'])[:80]}...")
print(f" Job Titles: {str(row['posted_job_titles'])[:80]}...")
# Check if enrichment columns exist and are populated
print("\n\n๐ ENRICHMENT QUALITY CHECK:")
print("-" * 80)
enrichment_cols = ['industries_list', 'specialties_list', 'required_skills', 'posted_job_titles']
for col in enrichment_cols:
empty = (companies_full[col] == 'Not specified') | (companies_full[col] == 'Various') | (companies_full[col] == 'General')
empty_count = empty.sum()
filled_count = len(companies_full) - empty_count
print(f"{col:25s} Filled: {filled_count:>6,} ({filled_count/len(companies_full)*100:>5.1f}%) Empty: {empty_count:>6,}")
print("\n" + "=" * 80)
print("\n๐ฏ CONCLUSION:")
print(" โ
If 'Filled' percentages are high โ Enrichment working!")
print(" โ If 'Empty' counts are high โ Need to fix enrichment")
# %%
companies_full.head()
# %%
## ๐ Data Quality Check - Duplicate Detection
"""
Checking for duplicates in all datasets based on primary keys.
This cell only REPORTS duplicates, does not modify data.
"""
print("=" * 80)
print("๐ DUPLICATE DETECTION REPORT")
print("=" * 80)
print()
# Define primary keys for each dataset
duplicate_report = []
# 1. Candidates
print("โโ ๐ resume_data.csv (Candidates)")
print(f"โ Primary Key: Resume_ID")
cand_total = len(candidates)
cand_unique = candidates['Resume_ID'].nunique() if 'Resume_ID' in candidates.columns else len(candidates)
cand_dups = cand_total - cand_unique
print(f"โ Total rows: {cand_total:,}")
print(f"โ Unique rows: {cand_unique:,}")
print(f"โ Duplicates: {cand_dups:,}")
print(f"โ Status: {'โ
CLEAN' if cand_dups == 0 else '๐ด HAS DUPLICATES'}")
print("โโ\n")
duplicate_report.append(('Candidates', cand_total, cand_unique, cand_dups))
# 2. Companies Base
print("โโ ๐ companies.csv (Companies Base)")
print(f"โ Primary Key: company_id")
comp_total = len(companies_base)
comp_unique = companies_base['company_id'].nunique()
comp_dups = comp_total - comp_unique
print(f"โ Total rows: {comp_total:,}")
print(f"โ Unique rows: {comp_unique:,}")
print(f"โ Duplicates: {comp_dups:,}")
print(f"โ Status: {'โ
CLEAN' if comp_dups == 0 else '๐ด HAS DUPLICATES'}")
if comp_dups > 0:
dup_ids = companies_base[companies_base.duplicated('company_id', keep=False)]['company_id'].value_counts().head(3)
print(f"โ Top duplicates:")
for cid, count in dup_ids.items():
print(f"โ - company_id={cid}: {count} times")
print("โโ\n")
duplicate_report.append(('Companies Base', comp_total, comp_unique, comp_dups))
# 3. Company Industries
print("โโ ๐ company_industries.csv")
print(f"โ Primary Key: company_id + industry")
ci_total = len(company_industries)
ci_unique = len(company_industries.drop_duplicates(subset=['company_id', 'industry']))
ci_dups = ci_total - ci_unique
print(f"โ Total rows: {ci_total:,}")
print(f"โ Unique rows: {ci_unique:,}")
print(f"โ Duplicates: {ci_dups:,}")
print(f"โ Status: {'โ
CLEAN' if ci_dups == 0 else '๐ด HAS DUPLICATES'}")
print("โโ\n")
duplicate_report.append(('Company Industries', ci_total, ci_unique, ci_dups))
# 4. Company Specialties
print("โโ ๐ company_specialities.csv")
print(f"โ Primary Key: company_id + speciality")
cs_total = len(company_specialties)
cs_unique = len(company_specialties.drop_duplicates(subset=['company_id', 'speciality']))
cs_dups = cs_total - cs_unique
print(f"โ Total rows: {cs_total:,}")
print(f"โ Unique rows: {cs_unique:,}")
print(f"โ Duplicates: {cs_dups:,}")
print(f"โ Status: {'โ
CLEAN' if cs_dups == 0 else '๐ด HAS DUPLICATES'}")
print("โโ\n")
duplicate_report.append(('Company Specialties', cs_total, cs_unique, cs_dups))
# 5. Employee Counts
print("โโ ๐ employee_counts.csv")
print(f"โ Primary Key: company_id")
ec_total = len(employee_counts)
ec_unique = employee_counts['company_id'].nunique()
ec_dups = ec_total - ec_unique
print(f"โ Total rows: {ec_total:,}")
print(f"โ Unique rows: {ec_unique:,}")
print(f"โ Duplicates: {ec_dups:,}")
print(f"โ Status: {'โ
CLEAN' if ec_dups == 0 else '๐ด HAS DUPLICATES'}")
print("โโ\n")
duplicate_report.append(('Employee Counts', ec_total, ec_unique, ec_dups))
# 6. Postings
print("โโ ๐ postings.csv (Job Postings)")
print(f"โ Primary Key: job_id")
if 'job_id' in postings.columns:
post_total = len(postings)
post_unique = postings['job_id'].nunique()
post_dups = post_total - post_unique
else:
post_total = len(postings)
post_unique = len(postings.drop_duplicates())
post_dups = post_total - post_unique
print(f"โ Total rows: {post_total:,}")
print(f"โ Unique rows: {post_unique:,}")
print(f"โ Duplicates: {post_dups:,}")
print(f"โ Status: {'โ
CLEAN' if post_dups == 0 else '๐ด HAS DUPLICATES'}")
print("โโ\n")
duplicate_report.append(('Postings', post_total, post_unique, post_dups))
# 7. Companies Full (After Merge)
print("โโ ๐ companies_full (After Enrichment)")
print(f"โ Primary Key: company_id")
cf_total = len(companies_full)
cf_unique = companies_full['company_id'].nunique()
cf_dups = cf_total - cf_unique
print(f"โ Total rows: {cf_total:,}")
print(f"โ Unique rows: {cf_unique:,}")
print(f"โ Duplicates: {cf_dups:,}")
print(f"โ Status: {'โ
CLEAN' if cf_dups == 0 else '๐ด HAS DUPLICATES'}")
if cf_dups > 0:
dup_ids = companies_full[companies_full.duplicated('company_id', keep=False)]['company_id'].value_counts().head(5)
print(f"โ")
print(f"โ Top duplicate company_ids:")
for cid, count in dup_ids.items():
comp_name = companies_full[companies_full['company_id'] == cid]['name'].iloc[0]
print(f"โ - {cid} ({comp_name}): {count} times")
print("โโ\n")
duplicate_report.append(('Companies Full', cf_total, cf_unique, cf_dups))
# Summary
print("=" * 80)
print("๐ SUMMARY")
print("=" * 80)
print()
total_dups = sum(r[3] for r in duplicate_report)
clean_datasets = sum(1 for r in duplicate_report if r[3] == 0)
dirty_datasets = len(duplicate_report) - clean_datasets
print(f"โ
Clean datasets: {clean_datasets}/{len(duplicate_report)}")
print(f"๐ด Datasets with duplicates: {dirty_datasets}/{len(duplicate_report)}")
print(f"๐๏ธ Total duplicates found: {total_dups:,} rows")
print()
if dirty_datasets > 0:
print("โ ๏ธ DUPLICATES DETECTED!")
else:
print("โ
All datasets are clean! No duplicates found.")
print("=" * 80)
# %% [markdown]
# ---
# ## ๐ Step 12a: Load Embedding Model & Pre-computed Vectors
# %%
print("๐ง Loading embedding model...\n")
model = SentenceTransformer(Config.EMBEDDING_MODEL)
embedding_dim = model.get_sentence_embedding_dimension()
print(f"โ
Model loaded: {Config.EMBEDDING_MODEL}")
print(f"๐ Embedding dimension: โ^{embedding_dim}\n")
print("๐ Loading pre-computed embeddings...")
try:
# Try to load from processed folder
cand_vectors = np.load(f'{Config.PROCESSED_PATH}candidate_embeddings.npy')
comp_vectors = np.load(f'{Config.PROCESSED_PATH}company_embeddings.npy')
print(f"โ
Loaded from {Config.PROCESSED_PATH}")
print(f"๐ Candidate vectors: {cand_vectors.shape}")
print(f"๐ Company vectors: {comp_vectors.shape}\n")
except FileNotFoundError:
print("โ ๏ธ Pre-computed embeddings not found!")
print(" Embeddings will need to be generated (takes ~5-10 minutes)")
print(" This is normal if running for the first time.\n")
# You can add embedding generation code here if needed
# For now, we'll skip to keep notebook clean
cand_vectors = None
comp_vectors = None
# %% [markdown]
# ---
# ## ๐ Step 12b: Generate Embeddings & Pre-computed Vectors
# %%
# #last time running:
# from datetime import datetime
# print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
# %%
# # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
# # CELL 9: Generate Embeddings (CPU ONLY)
# # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
# print("๐ง GENERATING EMBEDDINGS...")
# print("=" * 80)
# print(f"\n๐ง Loading model: {Config.EMBEDDING_MODEL} (CPU)")
# model = SentenceTransformer(Config.EMBEDDING_MODEL, device='cpu')
# print(f"โ
Loaded! Dim: {model.get_sentence_embedding_dimension()}")
# # ============================================================================
# # CANDIDATES
# # ============================================================================
# print(f"\n1๏ธโฃ CANDIDATES ({len(candidates):,})")
# cand_builder = CandidateTextBuilder()
# candidate_texts = cand_builder.build_batch(candidates)
# cand_vectors = model.encode(
# candidate_texts,
# show_progress_bar=True,
# batch_size=16,
# normalize_embeddings=True,
# convert_to_numpy=True
# )
# print(f"โ
Shape: {cand_vectors.shape}")
# np.save(f'{Config.PROCESSED_PATH}candidate_embeddings.npy', cand_vectors)
# candidates.to_pickle(f'{Config.PROCESSED_PATH}candidates_metadata.pkl')
# print(f"๐พ Saved")
# # ============================================================================
# # COMPANIES
# # ============================================================================
# print(f"\n2๏ธโฃ COMPANIES ({len(companies_full):,})")
# comp_builder = CompanyTextBuilder()
# company_texts = comp_builder.build_batch(companies_full)
# comp_vectors = model.encode(
# company_texts,
# show_progress_bar=True,
# batch_size=16,
# normalize_embeddings=True,
# convert_to_numpy=True
# )
# print(f"โ
Shape: {comp_vectors.shape}")
# np.save(f'{Config.PROCESSED_PATH}company_embeddings.npy', comp_vectors)
# companies_full.to_pickle(f'{Config.PROCESSED_PATH}companies_metadata.pkl')
# print(f"๐พ Saved")
# # ============================================================================
# # DONE
# # ============================================================================
# print(f"\n{'='*80}")
# print(f"๐ฏ DONE!")
# print(f"Candidates: {cand_vectors.shape}")
# print(f"Companies: {comp_vectors.shape}")
# print(f"{'='*80}")
# %% [markdown]
# ---
# ## ๐ Step 8: Core Matching Function
# %%
# ============================================================================
# CORE MATCHING FUNCTION (SAFE VERSION)
# ============================================================================
def find_top_matches(candidate_idx: int, top_k: int = 10) -> list:
"""
Find top K company matches for a candidate.
SAFE VERSION: Handles index mismatches between embeddings and dataset
Args:
candidate_idx: Index of candidate in candidates DataFrame
top_k: Number of top matches to return
Returns:
List of tuples: [(company_idx, similarity_score), ...]
"""
# Validate candidate index
if candidate_idx >= len(cand_vectors):
print(f"โ Candidate index {candidate_idx} out of range")
return []
# Get candidate vector
cand_vec = cand_vectors[candidate_idx].reshape(1, -1)
# Calculate similarities with all company vectors
similarities = cosine_similarity(cand_vec, comp_vectors)[0]
# CRITICAL FIX: Only use indices that exist in companies_full
max_valid_idx = len(companies_full) - 1
# Truncate similarities to valid range
valid_similarities = similarities[:max_valid_idx + 1]
# Get top K indices from valid range
top_indices = np.argsort(valid_similarities)[::-1][:top_k]
# Return (index, score) tuples
results = [(int(idx), float(valid_similarities[idx])) for idx in top_indices]
return results
# Test function and show diagnostics
print("โ
Safe matching function loaded!")
print(f"\n๐ DIAGNOSTICS:")
print(f" Candidate vectors: {len(cand_vectors):,}")
print(f" Company vectors: {len(comp_vectors):,}")
print(f" Companies dataset: {len(companies_full):,}")
if len(comp_vectors) > len(companies_full):
print(f"\nโ ๏ธ INDEX MISMATCH DETECTED!")
print(f" Embeddings: {len(comp_vectors):,}")
print(f" Dataset: {len(companies_full):,}")
print(f" Missing rows: {len(comp_vectors) - len(companies_full):,}")
print(f"\n๐ก CAUSE: Embeddings generated BEFORE deduplication")
print(f"\n๐ฏ SOLUTIONS:")
print(f" A. Safe functions active (current) โ
")
print(f" B. Regenerate embeddings after dedup")
print(f" C. Run collaborative filtering step")
else:
print(f"\nโ
Embeddings and dataset are aligned!")
# %% [markdown]
# ---
# ## ๐ Step 9: Initialize FREE LLM (Hugging Face)
#
# ### Get your FREE token: https://huggingface.co/settings/tokens
# %%
# Initialize Hugging Face Inference Client (FREE)
if Config.HF_TOKEN:
try:
hf_client = InferenceClient(token=Config.HF_TOKEN)
print("โ
Hugging Face client initialized (FREE)")
print(f"๐ค Model: {Config.LLM_MODEL}")
print("๐ฐ Cost: $0.00 (completely free!)\n")
LLM_AVAILABLE = True
except Exception as e:
print(f"โ ๏ธ Failed to initialize HF client: {e}")
LLM_AVAILABLE = False
else:
print("โ ๏ธ No Hugging Face token configured")
print(" LLM features will be disabled")
print("\n๐ To enable:")
print(" 1. Go to: https://huggingface.co/settings/tokens")
print(" 2. Create a token (free)")
print(" 3. Set: Config.HF_TOKEN = 'your-token-here'\n")
LLM_AVAILABLE = False
hf_client = None
def call_llm(prompt: str, max_tokens: int = 1000) -> str:
"""
Generic LLM call using Hugging Face Inference API (FREE).
"""
if not LLM_AVAILABLE:
return "[LLM not available - check .env file for HF_TOKEN]"
try:
response = hf_client.chat_completion(
messages=[{"role": "user", "content": prompt}],
model=Config.LLM_MODEL,
max_tokens=max_tokens,
temperature=0.7
)
return response.choices[0].message.content  # extract the generated text
except Exception as e:
return f"[Error: {str(e)}]"
print("โ
LLM helper functions ready")
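# %% [markdown]
# Quick smoke test for the helper (only meaningful when `HF_TOKEN` is set; without a token it returns the placeholder string):
# %%
if LLM_AVAILABLE:
    demo_reply = call_llm("Reply with the single word OK.", max_tokens=10)
    print(f"LLM smoke test: {demo_reply[:80]}")
else:
    print("Skipping LLM smoke test (no HF_TOKEN configured)")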
# %% [markdown]
# ---
# ## ๐ Step 10: Pydantic Schemas for Structured Output
# %%
class JobLevelClassification(BaseModel):
"""Job level classification result"""
level: Literal['Entry', 'Mid', 'Senior', 'Executive']
confidence: float = Field(ge=0.0, le=1.0)
reasoning: str
class SkillsTaxonomy(BaseModel):
"""Structured skills extraction"""
technical_skills: List[str] = Field(default_factory=list)
soft_skills: List[str] = Field(default_factory=list)
certifications: List[str] = Field(default_factory=list)
languages: List[str] = Field(default_factory=list)
class MatchExplanation(BaseModel):
"""Match reasoning"""
overall_score: float = Field(ge=0.0, le=1.0)
match_strengths: List[str]
skill_gaps: List[str]
recommendation: str
fit_summary: str = Field(max_length=200)
print("โ
Pydantic schemas defined")
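# %% [markdown]
# The schemas give one place to validate whatever JSON the LLM returns. A minimal sketch with a hand-written payload (not LLM output):
# %%
_example_payload = {
    "technical_skills": ["Python", "SQL"],
    "soft_skills": ["communication"],
    "certifications": [],
    "languages": ["English"]
}
print(SkillsTaxonomy(**_example_payload).model_dump())
# Invalid payloads (e.g. a confidence outside [0, 1] for JobLevelClassification) raise a pydantic ValidationError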
# %% [markdown]
# ---
# ## ๐ Step 11: Job Level Classification (Zero-Shot)
# %%
def classify_job_level_zero_shot(job_description: str) -> Dict:
"""
Zero-shot job level classification.
Returns classification as: Entry, Mid, Senior, or Executive
"""
prompt = f"""Classify this job posting into ONE seniority level.
Levels:
- Entry: 0-2 years experience, junior roles
- Mid: 3-5 years experience, independent work
- Senior: 6-10 years experience, technical leadership
- Executive: 10+ years, strategic leadership, C-level
Job Posting:
{job_description[:500]}
Return ONLY valid JSON:
{{
"level": "Entry|Mid|Senior|Executive",
"confidence": 0.85,
"reasoning": "Brief explanation"
}}
"""
response = call_llm(prompt)
try:
# Extract JSON
json_str = response.strip()
if '```json' in json_str:
json_str = json_str.split('```json')[1].split('```')[0].strip()
elif '```' in json_str:
json_str = json_str.split('```')[1].split('```')[0].strip()
# Find JSON in response
if '{' in json_str and '}' in json_str:
start = json_str.index('{')
end = json_str.rindex('}') + 1
json_str = json_str[start:end]
result = json.loads(json_str)
return result
except:
return {
"level": "Unknown",
"confidence": 0.0,
"reasoning": "Failed to parse response"
}
# Test if LLM available and data loaded
if LLM_AVAILABLE and len(postings) > 0:
print("๐งช Testing zero-shot classification...\n")
sample = postings.iloc[0]['description']
result = classify_job_level_zero_shot(sample)
print("๐ Classification Result:")
print(json.dumps(result, indent=2))
else:
print("โ ๏ธ Skipped - LLM not available or no data")
# %% [markdown]
# ---
# ## ๐ Step 12: Few-Shot Learning
# %%
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
# FEW-SHOT Job Level Classification (FIXED)
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
def classify_job_level_few_shot(job_description: str) -> Dict:
"""Few-shot classification with robust parsing"""
prompt = f"""Classify this job posting using examples.
EXAMPLES:
- "Recent graduate wanted. Python basics." โ Entry
- "5+ years backend. Lead team." โ Senior
- "CTO position. 15+ years strategy." โ Executive
JOB POSTING:
{job_description[:500]}
IMPORTANT: Return ONLY valid JSON in this exact format:
{{"level": "Entry|Mid|Senior|Executive", "confidence": 0.85, "reasoning": "brief explanation"}}
Do not include any other text, markdown, or code blocks."""
response = call_llm(prompt, max_tokens=200)
try:
# Clean response
json_str = response.strip()
# Remove markdown if present
if '```' in json_str:
json_str = json_str.split('```json')[-1].split('```')[0].strip()
if not json_str:
json_str = response.split('```')[-2].strip()
# Extract JSON object
if '{' in json_str and '}' in json_str:
start = json_str.index('{')
end = json_str.rindex('}') + 1
json_str = json_str[start:end]
result = json.loads(json_str)
# Validate fields
if 'level' not in result:
raise ValueError("Missing 'level' field")
# Ensure confidence exists
if 'confidence' not in result:
result['confidence'] = 0.85
return result
except Exception as e:
# Fallback: try to extract level from raw text
response_lower = response.lower()
if 'entry' in response_lower or 'junior' in response_lower:
level = 'Entry'
elif 'senior' in response_lower:
level = 'Senior'
elif 'executive' in response_lower or 'c-level' in response_lower:
level = 'Executive'
elif 'mid' in response_lower:
level = 'Mid'
else:
level = 'Unknown'
return {
"level": level,
"confidence": 0.70 if level != 'Unknown' else 0.0,
"reasoning": f"Extracted from text (parse error: {str(e)[:50]})"
}
print("โ
Few-shot classifier (robust parsing)")
# Test comparison
if LLM_AVAILABLE and len(postings) > 0:
print("\n๐งช Comparing Zero-Shot vs Few-Shot...")
sample = postings.iloc[0]['description']
zero = classify_job_level_zero_shot(sample)
few = classify_job_level_few_shot(sample)
print("\n๐ Comparison:")
print(f"Zero-shot: {zero['level']} (confidence: {zero['confidence']:.2f})")
print(f"Few-shot: {few['level']} (confidence: {few['confidence']:.2f})")
print(f"\n๐ Few-shot reasoning: {few['reasoning'][:100]}...")
else:
print("โ ๏ธ LLM not available")
# %% [markdown]
# ---
# ## ๐ Step 13: Structured Skills Extraction
# %%
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
# FIXED: Skills Extraction (better prompt)
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
def extract_skills_taxonomy(job_description: str) -> Dict:
"""Extract structured skills using LLM + Pydantic validation"""
prompt = f"""Extract ALL skills mentioned in this job posting.
JOB POSTING:
{job_description[:800]}
Analyze the text above and extract:
- Technical skills (programming, tools, platforms)
- Soft skills (teamwork, communication, problem-solving)
- Certifications (if any)
- Languages (if mentioned)
Return ONLY valid JSON with actual skills found in the text:
{{
"technical_skills": ["skill1", "skill2"],
"soft_skills": ["skill1", "skill2"],
"certifications": ["cert1"],
"languages": ["lang1"]
}}
IMPORTANT:
- Extract ONLY skills that are ACTUALLY in the job posting above
- If no skills found in a category, use empty array []
- Do not include example values
"""
response = call_llm(prompt, max_tokens=800)
try:
json_str = response.strip()
# Remove markdown
if '```json' in json_str:
json_str = json_str.split('```json')[1].split('```')[0].strip()
elif '```' in json_str:
json_str = json_str.split('```')[1].split('```')[0].strip()
# Extract JSON
if '{' in json_str and '}' in json_str:
start = json_str.index('{')
end = json_str.rindex('}') + 1
json_str = json_str[start:end]
data = json.loads(json_str)
# Validate with Pydantic
validated = SkillsTaxonomy(**data)
return validated.model_dump()
except Exception as e:
print(f"โ ๏ธ Parse error: {e}")
return {
"technical_skills": [],
"soft_skills": [],
"certifications": [],
"languages": []
}
print("โ
Skills extraction (fixed prompt)")
# Test
if LLM_AVAILABLE and len(postings) > 0:
print("\n๐ Testing skills extraction...")
sample = postings.iloc[0]['description']
print(f"\n๐ Job posting sample:")
print(f" {sample[:200]}...\n")
skills = extract_skills_taxonomy(sample)
print("๐ Extracted Skills:")
print(json.dumps(skills, indent=2))
# Check if actually extracted something
total_skills = sum(len(v) for v in skills.values())
print(f"\n{'โ
' if total_skills > 0 else 'โ ๏ธ '} Total skills found: {total_skills}")
else:
print("โ ๏ธ LLM not available")
# %% [markdown]
# ---
# ## ๐ Step 14: Match Explainability
# %%
def explain_match(candidate_idx: int, company_idx: int, similarity_score: float) -> Dict:
"""
Generate LLM explanation for why candidate matches company.
"""
cand = candidates.iloc[candidate_idx]
comp = companies_full.iloc[company_idx]
cand_skills = str(cand.get('skills', 'N/A'))[:300]
cand_exp = str(cand.get('positions', 'N/A'))[:300]
comp_req = str(comp.get('required_skills', 'N/A'))[:300]
comp_name = comp.get('name', 'Unknown')
prompt = f"""Explain why this candidate matches this company.
Candidate:
Skills: {cand_skills}
Experience: {cand_exp}
Company: {comp_name}
Requirements: {comp_req}
Similarity Score: {similarity_score:.2f}
Return JSON:
{{
"overall_score": {similarity_score},
"match_strengths": ["Top 3-5 matching factors"],
"skill_gaps": ["Missing skills"],
"recommendation": "What candidate should do",
"fit_summary": "One sentence summary"
}}
"""
response = call_llm(prompt, max_tokens=1000)
try:
json_str = response.strip()
if '```json' in json_str:
json_str = json_str.split('```json')[1].split('```')[0].strip()
if '{' in json_str and '}' in json_str:
start = json_str.index('{')
end = json_str.rindex('}') + 1
json_str = json_str[start:end]
data = json.loads(json_str)
return data
except:
return {
"overall_score": similarity_score,
"match_strengths": ["Unable to generate"],
"skill_gaps": [],
"recommendation": "Review manually",
"fit_summary": f"Match score: {similarity_score:.2f}"
}
# Test explainability
if LLM_AVAILABLE and cand_vectors is not None and len(candidates) > 0:
print("๐ก Testing match explainability...\n")
matches = find_top_matches(0, top_k=1)
if matches:
comp_idx, score = matches[0]
explanation = explain_match(0, comp_idx, score)
print("๐ Match Explanation:")
print(json.dumps(explanation, indent=2))
else:
print("โ ๏ธ Skipped - requirements not met")
# %%
# Check if matches make semantic sense
print("๐ MATCH QUALITY CHECK")
print("=" * 80)
cand_0 = candidates.iloc[0]
print(f"\nCandidate 0:")
print(f" Category: {cand_0.get('Category', 'N/A')}")
print(f" Skills: {str(cand_0.get('skills', 'N/A'))[:150]}...")
matches = find_top_matches(0, top_k=3)
print(f"\nTop 3 Company Matches:")
for i, (comp_idx, score) in enumerate(matches, 1):
comp = companies_full.iloc[comp_idx]
print(f"\n{i}. {comp['name']} (score: {score:.3f})")
print(f" Industries: {str(comp['industries_list'])[:100]}...")
print(f" Required Skills: {str(comp['required_skills'])[:100]}...")
print("\n" + "=" * 80)
print("โ Do these matches make SEMANTIC SENSE?")
# %% [markdown]
# ---
# ## ๐ Step 16: Detailed Match Visualization
# %%
# ============================================================================
# ๐ DETAILED MATCH EXAMPLE
# ============================================================================
def show_detailed_match_example(candidate_idx=0, top_k=5):
print("๐ DETAILED MATCH ANALYSIS")
print("=" * 100)
if candidate_idx >= len(candidates):
print(f"โ ERROR: Candidate {candidate_idx} out of range")
return None
cand = candidates.iloc[candidate_idx]
print(f"\n๐ฏ CANDIDATE #{candidate_idx}")
print(f"Resume ID: {cand.get('Resume_ID', 'N/A')}")
print(f"Category: {cand.get('Category', 'N/A')}")
print(f"Skills: {str(cand.get('skills', 'N/A'))[:150]}...\n")
matches = find_top_matches(candidate_idx, top_k=top_k)
print(f"๐ TOP {len(matches)} MATCHES:\n")
for rank, (comp_idx, score) in enumerate(matches, 1):
if comp_idx >= len(companies_full):
continue
company = companies_full.iloc[comp_idx]
print(f"#{rank}. {company.get('name', 'N/A')} (Score: {score:.4f})")
print(f" Industries: {str(company.get('industries_list', 'N/A'))[:60]}...")
print("\n" + "=" * 100)
return matches
# Test
show_detailed_match_example(candidate_idx=9543, top_k=5)
# %% [markdown]
# ---
# ## ๐ Step 17: Bridging Concept Analysis
# %%
# ============================================================================
# ๐ BRIDGING CONCEPT ANALYSIS
# ============================================================================
def show_bridging_concept_analysis():
print("๐ THE BRIDGING CONCEPT")
print("=" * 90)
companies_with = companies_full[companies_full['required_skills'] != '']
companies_without = companies_full[companies_full['required_skills'] == '']
print(f"\n๐ DATA REALITY:")
print(f" Total companies: {len(companies_full):,}")
print(f" WITH postings: {len(companies_with):,} ({len(companies_with)/len(companies_full)*100:.1f}%)")
print(f" WITHOUT postings: {len(companies_without):,}\n")
print("๐ฏ THE PROBLEM:")
print(" Companies: 'We are in TECH INDUSTRY'")
print(" Candidates: 'I know PYTHON, AWS'")
print(" โ Different languages! ๐ซ\n")
print("๐ THE SOLUTION (BRIDGING):")
print(" 1. Extract from postings: 'Need PYTHON developers'")
print(" 2. Enrich company profile with skills")
print(" 3. Now both speak SKILLS LANGUAGE! โ
\n")
print("=" * 90)
return companies_with, companies_without
# Test
show_bridging_concept_analysis()
# %%
# Check what's in required_skills
print("๐ REQUIRED_SKILLS CHECK")
print("=" * 80)
print(f"\nTotal companies: {len(companies_full):,}")
print(f"\nValue counts:")
print(companies_full['required_skills'].value_counts().head(10))
print(f"\nEmpty string: {(companies_full['required_skills'] == '').sum()}")
print(f"'Not specified': {(companies_full['required_skills'] == 'Not specified').sum()}")
print(f"NaN: {companies_full['required_skills'].isna().sum()}")
# Real check
truly_empty = (companies_full['required_skills'] == '') | \
(companies_full['required_skills'] == 'Not specified') | \
(companies_full['required_skills'].isna())
print(f"\n๐ฏ TRULY EMPTY: {truly_empty.sum():,}")
# %% [markdown]
# ---
# ## ๐ Step 18: Export Results to CSV
# %%
# ============================================================================
# ๐พ EXPORT MATCHES TO CSV
# ============================================================================
def export_matches_to_csv(num_candidates=100, top_k=10):
print(f"๐พ Exporting {num_candidates} candidates (top {top_k} each)...\n")
results = []
for i in range(min(num_candidates, len(candidates))):
if i % 50 == 0:
print(f" Processing {i+1}/{num_candidates}...")
matches = find_top_matches(i, top_k=top_k)
cand = candidates.iloc[i]
for rank, (comp_idx, score) in enumerate(matches, 1):
if comp_idx >= len(companies_full):
continue
company = companies_full.iloc[comp_idx]
results.append({
'candidate_id': i,
'candidate_category': cand.get('Category', 'N/A'),
'company_id': company.get('company_id', 'N/A'),
'company_name': company.get('name', 'N/A'),
'match_rank': rank,
'similarity_score': round(float(score), 4)
})
results_df = pd.DataFrame(results)
output_file = f'{Config.RESULTS_PATH}hrhub_matches.csv'
results_df.to_csv(output_file, index=False)
print(f"\nโ
Exported {len(results_df):,} matches")
print(f"๐ File: {output_file}\n")
return results_df
# Export sample
matches_df = export_matches_to_csv(num_candidates=50, top_k=5)
# %% [markdown]
# ---
# ## ๐ Interactive Visualization 1: t-SNE Vector Space
#
# Project embeddings from ℝ³⁸⁴ → ℝ² to visualize candidates and companies
# %%
# ============================================================================
# ๐จ T-SNE VECTOR SPACE VISUALIZATION
# ============================================================================
from sklearn.manifold import TSNE
print("๐จ VECTOR SPACE VISUALIZATION\n")
print("=" * 70)
# Sample for visualization
n_cand_viz = min(500, len(candidates))
n_comp_viz = min(2000, len(companies_full))
print(f"๐ Visualizing:")
print(f" โข {n_cand_viz} candidates")
print(f" โข {n_comp_viz} companies")
print(f" โข From โ^384 โ โยฒ (t-SNE)\n")
# Sample vectors
cand_sample = cand_vectors[:n_cand_viz]
comp_sample = comp_vectors[:n_comp_viz]
all_vectors = np.vstack([cand_sample, comp_sample])
print("๐ Running t-SNE (2-3 minutes)...")
tsne = TSNE(
n_components=2,
perplexity=30,
random_state=42,
n_iter=1000
)
vectors_2d = tsne.fit_transform(all_vectors)
cand_2d = vectors_2d[:n_cand_viz]
comp_2d = vectors_2d[n_cand_viz:]
print("\nโ
t-SNE complete!")
# %%
# Create interactive plot
fig = go.Figure()
# Companies (red)
fig.add_trace(go.Scatter(
x=comp_2d[:, 0],
y=comp_2d[:, 1],
mode='markers',
name='Companies',
marker=dict(size=6, color='#ff6b6b', opacity=0.6),
text=[f"Company: {companies_full.iloc[i].get('name', 'N/A')[:30]}"
for i in range(n_comp_viz)],
    hovertemplate='%{text}<extra></extra>'
))
# Candidates (blue)
fig.add_trace(go.Scatter(
    x=cand_2d[:, 0],
    y=cand_2d[:, 1],
    mode='markers',
    name='Candidates',
    marker=dict(size=6, color='#4dabf7', opacity=0.7),
    text=[f"Candidate: {candidates.iloc[i].get('Category', 'N/A')}"
          for i in range(n_cand_viz)],
    hovertemplate='%{text}<extra></extra>'
))
fig.update_layout(
    title='Candidates and Companies in Shared Embedding Space (t-SNE)',
    template='plotly_dark',
    width=1000, height=700
)
fig.show()
# %% [markdown]
# ---
# ## Visualization: Match Network for One Candidate (pyvis)
# %%
# Interactive pyvis network: one candidate (star) connected to its top matches
from pyvis.network import Network
net = Network(height='800px', width='100%', bgcolor='#1a1a1a', font_color='white', notebook=True)
target_candidate = 0  # candidate index used for this example network
cand = candidates.iloc[target_candidate]
matches = find_top_matches(target_candidate, top_k=10)
# Add candidate node (green star)
net.add_node(
    f'cand_{target_candidate}',
    label=f"Candidate {target_candidate}",
    title=f"Category: {cand.get('Category', 'N/A')}\nSkills: {str(cand.get('skills', 'N/A'))[:100]}",
    color='#00ff00',
    size=40,
    shape='star'
)
# Add company nodes + edges
for rank, (comp_idx, score) in enumerate(matches, 1):
if comp_idx >= len(companies_full):
continue
company = companies_full.iloc[comp_idx]
comp_name = company.get('name', f'Company {comp_idx}')[:30]
# Color by score
if score > 0.7:
color = '#ff0000' # Red (strong match)
elif score > 0.5:
color = '#ff6b6b' # Light red (good match)
else:
color = '#ffaaaa' # Pink (weak match)
# Add company node
net.add_node(
f'comp_{comp_idx}',
label=f"#{rank}. {comp_name}",
title=f"Score: {score:.3f}
Industries: {str(company.get('industries_list', 'N/A'))[:50]}
Required: {str(company.get('required_skills', 'N/A'))[:100]}",
color=color,
size=20 + (score * 20) # Size by score
)
# Add edge
net.add_edge(
f'cand_{target_candidate}',
f'comp_{comp_idx}',
value=float(score),
title=f"Similarity: {score:.3f}",
color='yellow'
)
# Save
output_file = f'{Config.RESULTS_PATH}network_graph.html'
net.save_graph(output_file)
print(f"โ
Network graph created!")
print(f"๐ Saved: {output_file}")
print(f"\n๐ก LEGEND:")
print(f" โญ Green star = Candidate #{target_candidate}")
print(f" ๐ด Red nodes = Companies (size = match score)")
print(f" ๐ Yellow edges = Connections")
print(f"\nโน๏ธ Hover over nodes to see details")
print(f" Drag nodes to rearrange")
print(f" Zoom with mouse wheel\n")
# Display in notebook
from IPython.display import IFrame
IFrame(output_file, width=1000, height=800)
# %% [markdown]
# ### ๐ Network Node Data
#
# Detailed information about nodes and connections
# %%
# ============================================================================
# DISPLAY NODE DATA
# ============================================================================
print("๐ NETWORK DATA SUMMARY")
print("=" * 80)
print(f"\nTotal nodes: {1 + len(matches)}")
print(f" - 1 candidate node (green star)")
print(f" - {len(matches)} company nodes (red circles)")
print(f"\nTotal edges: {len(matches)}")
print(f"\n" + "=" * 80)
# Show node details
print(f"\n๐ฏ CANDIDATE NODE:")
print(f" ID: cand_{target_candidate}")
print(f" Category: {cand.get('Category', 'N/A')}")
print(f" Skills: {str(cand.get('skills', 'N/A'))[:100]}...")
print(f"\n๐ข COMPANY NODES (Top 5):")
for rank, (comp_idx, score) in enumerate(matches[:5], 1):
if comp_idx < len(companies_full):
company = companies_full.iloc[comp_idx]
print(f"\n #{rank}. {company.get('name', 'N/A')[:40]}")
print(f" ID: comp_{comp_idx}")
print(f" Score: {score:.4f}")
print(f" Industries: {str(company.get('industries_list', 'N/A'))[:60]}...")
print(f"\n" + "=" * 80)
# %% [markdown]
# ---
# ## ๐ Visualization 4: Display Node Data
#
# Inspect detailed information about candidates and companies
# %%
# ============================================================================
# DISPLAY NODE DATA - See what's behind the graph
# ============================================================================
def display_node_data(node_id):
print("=" * 80)
if node_id.startswith('C'):
# CANDIDATE
cand_idx = int(node_id[1:])
if cand_idx >= len(candidates):
print(f"โ Candidate {cand_idx} not found!")
return
candidate = candidates.iloc[cand_idx]
print(f"๐ข CANDIDATE #{cand_idx}")
print("=" * 80)
print(f"\n๐ KEY INFORMATION:\n")
print(f"Resume ID: {candidate.get('Resume_ID', 'N/A')}")
print(f"Category: {candidate.get('Category', 'N/A')}")
print(f"Skills: {str(candidate.get('skills', 'N/A'))[:200]}")
print(f"Career Objective: {str(candidate.get('career_objective', 'N/A'))[:200]}")
elif node_id.startswith('J'):
# COMPANY
comp_idx = int(node_id[1:])
if comp_idx >= len(companies_full):
print(f"โ Company {comp_idx} not found!")
return
company = companies_full.iloc[comp_idx]
print(f"๐ด COMPANY #{comp_idx}")
print("=" * 80)
print(f"\n๐ COMPANY INFORMATION:\n")
print(f"Name: {company.get('name', 'N/A')}")
print(f"Industries: {str(company.get('industries_list', 'N/A'))[:200]}")
print(f"Required Skills: {str(company.get('required_skills', 'N/A'))[:200]}")
print(f"Posted Jobs: {str(company.get('posted_job_titles', 'N/A'))[:200]}")
print("\n" + "=" * 80 + "\n")
def display_node_with_connections(node_id, top_k=10):
display_node_data(node_id)
if node_id.startswith('C'):
cand_idx = int(node_id[1:])
print(f"๐ฏ TOP {top_k} MATCHES:")
print("=" * 80)
matches = find_top_matches(cand_idx, top_k=top_k)
# FIXED: Validate indices before accessing
valid_matches = 0
for rank, (comp_idx, score) in enumerate(matches, 1):
# Check if index is valid
if comp_idx >= len(companies_full):
print(f"โ ๏ธ Match #{rank}: Index {comp_idx} out of range (skipping)")
continue
company = companies_full.iloc[comp_idx]
print(f"#{rank}. {company.get('name', 'N/A')[:40]} (Score: {score:.4f})")
valid_matches += 1
if valid_matches == 0:
print("โ ๏ธ No valid matches found (all indices out of bounds)")
print("\n๐ก SOLUTION: Regenerate embeddings after deduplication!")
print("\n" + "=" * 80)
# Example usage
display_node_with_connections('C0', top_k=5)
# %% [markdown]
# ---
# ## ๐ธ๏ธ Visualization 5: NetworkX Graph
#
# Network graph using NetworkX + Plotly with force-directed layout
# %%
# ============================================================================
# NETWORK GRAPH WITH NETWORKX + PLOTLY
# ============================================================================
import networkx as nx
print("๐ธ๏ธ Creating NETWORK GRAPH...\n")
# Create graph
G = nx.Graph()
# Sample
n_cand_sample = min(20, len(candidates))
top_k_per_cand = 5
print(f"๐ Network size:")
print(f" โข {n_cand_sample} candidates")
print(f" โข {top_k_per_cand} companies per candidate\n")
# Add nodes + edges
companies_in_graph = set()
for i in range(n_cand_sample):
G.add_node(f"C{i}", node_type='candidate', label=f"C{i}")
matches = find_top_matches(i, top_k=top_k_per_cand)
for comp_idx, score in matches:
comp_id = f"J{comp_idx}"
if comp_id not in companies_in_graph:
company_name = companies_full.iloc[comp_idx].get('name', 'N/A')[:20]
G.add_node(comp_id, node_type='company', label=company_name)
companies_in_graph.add(comp_id)
G.add_edge(f"C{i}", comp_id, weight=float(score))
print(f"โ
Network created!")
print(f" Nodes: {G.number_of_nodes()}")
print(f" Edges: {G.number_of_edges()}\n")
# Calculate layout
print("๐ Calculating layout...")
pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
print("โ
Layout done!\n")
# Create edge traces
edge_trace = []
for edge in G.edges(data=True):
x0, y0 = pos[edge[0]]
x1, y1 = pos[edge[1]]
weight = edge[2]['weight']
edge_trace.append(go.Scatter(
x=[x0, x1, None],
y=[y0, y1, None],
mode='lines',
line=dict(width=weight*3, color='rgba(255,255,255,0.3)'),
hoverinfo='none',
showlegend=False
))
# Candidate nodes
cand_nodes = [n for n, d in G.nodes(data=True) if d['node_type']=='candidate']
cand_x = [pos[n][0] for n in cand_nodes]
cand_y = [pos[n][1] for n in cand_nodes]
cand_labels = [G.nodes[n]['label'] for n in cand_nodes]
candidate_trace = go.Scatter(
x=cand_x, y=cand_y,
mode='markers+text',
name='Candidates',
marker=dict(size=25, color='#00ff00', line=dict(width=2, color='white')),
text=cand_labels,
textposition='top center',
    hovertemplate='%{text}<extra></extra>'
)
# Company nodes
comp_nodes = [n for n, d in G.nodes(data=True) if d['node_type'] == 'company']
comp_x = [pos[n][0] for n in comp_nodes]
comp_y = [pos[n][1] for n in comp_nodes]
comp_labels = [G.nodes[n]['label'] for n in comp_nodes]
company_trace = go.Scatter(
    x=comp_x, y=comp_y,
    mode='markers',
    name='Companies',
    marker=dict(size=15, color='#ff6b6b', line=dict(width=1, color='white')),
    text=comp_labels,
    hovertemplate='%{text}<extra></extra>'
)
fig = go.Figure(data=edge_trace + [candidate_trace, company_trace])
fig.update_layout(
    title='Candidate-Company Match Network',
    template='plotly_dark',
    showlegend=True,
    width=1000, height=800
)
fig.show()
# %% [markdown]
# ---
# ## Visualization 6: Full Interactive Network (pyvis)
# %%
# Build a larger interactive pyvis network with sampled candidates and companies
from pyvis.network import Network
net = Network(height='800px', width='100%', bgcolor='#1a1a1a', font_color='white', notebook=True)
# Sample sizes (illustrative; adjust as needed)
n_candidates = min(30, len(candidates))
n_companies = min(200, len(companies_full))
print("🟢 Adding candidate nodes...")
# Add candidate nodes (green)
for i in range(n_candidates):
    cand_row = candidates.iloc[i]
    category = cand_row.get('Category', 'N/A')
    skills = str(cand_row.get('skills', 'N/A'))[:100]
    net.add_node(
        f"C{i}",
        label=f"C{i}",
        title=f"Candidate {i}\nCategory: {category}\nSkills: {skills}...",
        color='#2ecc71',
        size=20,
        shape='dot'
    )
print(f"๐ด Adding company nodes...")
# Add company nodes (red)
for i in range(n_companies):
comp = companies_full.iloc[i]
node_id = f"CO{i}"
name = comp.get('name', 'Unknown')
industry = str(comp.get('industries_list', 'N/A'))[:100]
net.add_node(
node_id,
label=name[:20],
title=f"{name}
Industry: {industry}...",
color='#e74c3c',
size=15,
shape='dot'
)
print(f"๐ Adding edges (matches)...")
# Add edges (top 5 matches per candidate)
edge_count = 0
for cand_idx in range(n_candidates):
matches = find_top_matches(cand_idx, top_k=5)
for comp_idx, score in matches:
if comp_idx < n_companies: # Only if company in sample
net.add_edge(
f"C{cand_idx}",
f"CO{comp_idx}",
value=float(score * 10), # Thickness based on score
title=f"Match Score: {score:.3f}",
color={'color': '#95a5a6', 'opacity': 0.3}
)
edge_count += 1
print(f"\nโ
Network built!")
print(f" Nodes: {n_candidates + n_companies}")
print(f" Edges: {edge_count}")
# Save HTML
network_path = f'{Config.RESULTS_PATH}network_interactive.html'
net.save_graph(network_path)
print(f"\n๐พ Saved: {network_path}")
print(f"\n๐ฏ USAGE:")
print(" - Drag nodes to rearrange")
print(" - Hover for details")
print(" - Zoom with mouse wheel")
print(" - Green = Candidates, Red = Companies")
print("=" * 80)
# Show in notebook
net.show(network_path)
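# Fallback for environments where the inline view does not render (e.g. the VS Code
# interactive window): open the saved HTML in the default browser instead.
# `OPEN_IN_BROWSER` is a hypothetical flag added here, not part of the original pipeline.
import webbrowser
OPEN_IN_BROWSER = False
if OPEN_IN_BROWSER:
    webbrowser.open('file://' + os.path.abspath(network_path))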
# %%
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
# CELL 12: Evaluation Metrics (Precision, Bilateral Fairness, Coverage)
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
print("๐ EVALUATION METRICS")
print("=" * 80)
# ============================================================================
# METRIC 1: Match Score Distribution
# ============================================================================
print("\n1๏ธโฃ MATCH SCORE DISTRIBUTION")
# Sample matches
n_sample = min(500, len(candidates))
all_scores = []
for i in range(n_sample):
matches = find_top_matches(i, top_k=10)
scores = [score for _, score in matches]
all_scores.extend(scores)
print(f" Sample size: {n_sample} candidates ร 10 matches = {len(all_scores)} scores")
print(f"\n Statistics:")
print(f" Mean: {np.mean(all_scores):.4f}")
print(f" Median: {np.median(all_scores):.4f}")
print(f" Std: {np.std(all_scores):.4f}")
print(f" Min: {np.min(all_scores):.4f}")
print(f" Max: {np.max(all_scores):.4f}")
# Histogram
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 6), facecolor='#1a1a1a')
ax.set_facecolor('#1a1a1a')
ax.hist(all_scores, bins=50, color='#3498db', alpha=0.7, edgecolor='white')
ax.set_xlabel('Match Score', color='white', fontsize=12)
ax.set_ylabel('Frequency', color='white', fontsize=12)
ax.set_title('Distribution of Match Scores', color='white', fontsize=14, fontweight='bold')
ax.tick_params(colors='white')
ax.grid(True, alpha=0.2)
plt.tight_layout()
plt.savefig(f'{Config.RESULTS_PATH}score_distribution.png', facecolor='#1a1a1a', dpi=150)
plt.show()
print(f"\n   💾 Saved: score_distribution.png")
# ============================================================================
# METRIC 2: Bilateral Fairness Ratio
# ============================================================================
print(f"\n2๏ธโฃ BILATERAL FAIRNESS RATIO")
# Candidate โ Company scores
cand_to_comp_scores = []
for i in range(min(200, len(candidates))):
matches = find_top_matches(i, top_k=5)
avg_score = np.mean([score for _, score in matches])
cand_to_comp_scores.append(avg_score)
# Company โ Candidate scores (sample companies)
comp_to_cand_scores = []
for i in range(min(200, len(companies_full))):
comp_vec = comp_vectors[i].reshape(1, -1)
similarities = cosine_similarity(comp_vec, cand_vectors)[0]
top_5_scores = np.sort(similarities)[-5:]
avg_score = np.mean(top_5_scores)
comp_to_cand_scores.append(avg_score)
cand_avg = np.mean(cand_to_comp_scores)
comp_avg = np.mean(comp_to_cand_scores)
bilateral_fairness = min(cand_avg, comp_avg) / max(cand_avg, comp_avg)
print(f" Candidate โ Company avg: {cand_avg:.4f}")
print(f" Company โ Candidate avg: {comp_avg:.4f}")
print(f" Bilateral Fairness Ratio: {bilateral_fairness:.4f}")
print(f" {'โ
FAIR (>0.85)' if bilateral_fairness > 0.85 else '๐ก Acceptable (>0.70)' if bilateral_fairness > 0.70 else 'โ Imbalanced'}")
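# Worked example of the ratio itself (illustrative numbers only, not taken from the data):
# min(0.62, 0.55) / max(0.62, 0.55) ≈ 0.887, which would fall in the FAIR band (>0.85).
_demo_ratio = min(0.62, 0.55) / max(0.62, 0.55)
assert 0.88 < _demo_ratio < 0.89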
# ============================================================================
# METRIC 3: Job Posting Coverage
# ============================================================================
print(f"\n3๏ธโฃ JOB POSTING COVERAGE")
has_real_skills = companies_full['required_skills'].notna() & ~companies_full['required_skills'].isin(['', 'Not specified'])
with_postings = has_real_skills.sum()
total_companies = len(companies_full)
coverage = (with_postings / total_companies) * 100
print(f" Total companies: {total_companies:,}")
print(f" With job posting skills: {with_postings:,}")
print(f" Without: {total_companies - with_postings:,}")
print(f" Coverage: {coverage:.1f}%")
print(f" {'โ
Excellent (>90%)' if coverage > 90 else '๐ก Good (>70%)' if coverage > 70 else 'โ Poor'}")
# ============================================================================
# METRIC 4: Embedding Quality (Cosine Similarity Stats)
# ============================================================================
print(f"\n4๏ธโฃ EMBEDDING QUALITY")
# Sample similarity matrix
sample_size = min(100, len(cand_vectors), len(comp_vectors))
sim_matrix = cosine_similarity(cand_vectors[:sample_size], comp_vectors[:sample_size])
print(f" Sample: {sample_size}ร{sample_size} matrix")
print(f" Mean similarity: {np.mean(sim_matrix):.4f}")
print(f" Std: {np.std(sim_matrix):.4f}")
print(f" Top 1% scores: {np.percentile(sim_matrix, 99):.4f}")
print(f" {'โ
Good spread' if np.std(sim_matrix) > 0.1 else 'โ ๏ธ Low variance'}")
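# Extra sanity check (a sketch, assuming cand_vectors / comp_vectors hold the raw
# sentence-transformers embeddings): cosine similarity is scale-invariant, but knowing
# whether the vectors are already L2-normalized documents whether plain dot products
# could be used interchangeably downstream.
cand_norms = np.linalg.norm(cand_vectors[:sample_size], axis=1)
comp_norms = np.linalg.norm(comp_vectors[:sample_size], axis=1)
print(f"   Candidate vector norm (mean): {cand_norms.mean():.3f}")
print(f"   Company vector norm (mean):   {comp_norms.mean():.3f}  (1.0 => already normalized)")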
# ============================================================================
# SUMMARY
# ============================================================================
print(f"\n{'='*80}")
print("๐ METRICS SUMMARY")
print(f"{'='*80}")
print(f"โ
Match Score Distribution: Mean={np.mean(all_scores):.3f}, Std={np.std(all_scores):.3f}")
print(f"โ
Bilateral Fairness: {bilateral_fairness:.3f} {'(FAIR)' if bilateral_fairness > 0.85 else '(ACCEPTABLE)'}")
print(f"โ
Job Posting Coverage: {coverage:.1f}%")
print(f"โ
Embedding Quality: Std={np.std(sim_matrix):.3f}")
print(f"{'='*80}")
# %%
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
# CELL 11: PyVis Interactive Network - BROWSER ONLY (Full Info)
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
from pyvis.network import Network
import webbrowser
import os
print("๐ธ๏ธ CREATING INTERACTIVE NETWORK (BROWSER MODE)...")
print("=" * 80)
# ============================================================================
# Configuration
# ============================================================================
n_cand_sample = 20 # 20 candidates
top_k_per_cand = 5 # Top 5 matches each
print(f"\n๐ Network configuration:")
print(f" Candidates: {n_cand_sample}")
print(f" Matches per candidate: {top_k_per_cand}")
print(f" Target: ~{n_cand_sample * top_k_per_cand} connections")
# ============================================================================
# Initialize PyVis Network
# ============================================================================
net = Network(
height='900px',
width='100%',
bgcolor='#1a1a1a',
font_color='white',
notebook=False, # Browser mode
cdn_resources='remote'
)
# Physics for nice layout
net.set_options("""
var options = {
"physics": {
"forceAtlas2Based": {
"gravitationalConstant": -50,
"centralGravity": 0.01,
"springLength": 200,
"springConstant": 0.08,
"avoidOverlap": 1
},
"maxVelocity": 30,
"solver": "forceAtlas2Based",
"timestep": 0.35,
"stabilization": {
"enabled": true,
"iterations": 150
}
},
"nodes": {
"font": {
"size": 16,
"color": "white",
"face": "arial"
},
"borderWidth": 2
},
"edges": {
"smooth": {
"enabled": true,
"type": "continuous"
},
"width": 2
},
"interaction": {
"hover": true,
"tooltipDelay": 50,
"navigationButtons": true,
"keyboard": {
"enabled": true
},
"zoomView": true,
"dragView": true
}
}
""")
print(f"\n๐ต Adding candidate nodes...")
# ============================================================================
# Add Candidate Nodes (GREEN CIRCLES)
# ============================================================================
companies_added = set()
for i in range(min(n_cand_sample, len(candidates))):
cand = candidates.iloc[i]
# Build rich tooltip
category = cand.get('Category', 'Unknown')
    # Keep the raw value so list-type skills can still be detected
    skills = cand.get('skills', 'N/A')
    if isinstance(skills, list):
        skills = ', '.join(str(s) for s in skills[:5])  # First 5 skills
    else:
        skills = str(skills)[:150]
experience = str(cand.get('positions', 'N/A'))[:100]
tooltip = f"""
Category: {category}
Top Skills:
{skills}...
Experience:
{experience}...