{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Processing HTML Files\n",
    "\n",
    "We will use **docling** to convert the crawled files to markdown.\n",
    "\n",
    "References\n",
    "- [docling](https://github.com/DS4SD/docling)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-1: Data\n",
    "\n",
    "We will process data that is downloaded using [1_crawl_site.ipynb](1_crawl_site.ipynb).\n",
    "\n",
    "We have a couple of crawled HTML files in the `input` directory."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-2: Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "## All config is defined here\n",
    "from my_config import MY_CONFIG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Cleared  processed data directory :  workspace/processed\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import shutil\n",
    "import sys\n",
    "\n",
    "# Remove any output left by a previous run so the directory starts empty.\n",
    "# ignore_errors=True keeps this from raising when the directory does not exist yet\n",
    "# (note: it also suppresses any other removal error).\n",
    "shutil.rmtree(MY_CONFIG.PROCESSED_DATA_DIR, ignore_errors=True)\n",
    "\n",
    "# Call os.makedirs directly: `shutil.os` only resolves because shutil imports os internally.\n",
    "os.makedirs(MY_CONFIG.PROCESSED_DATA_DIR, exist_ok=True)\n",
    "print (f\"✅ Cleared  processed data directory :  {MY_CONFIG.PROCESSED_DATA_DIR}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-3: Convert FILES --> MD\n",
    "\n",
    "Process the crawled documents (HTML and PDF) and extract their text in markdown format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time \n",
    "\n",
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "# NOTE(review): docling documents `format_options` as a mapping of InputFormat -> FormatOption.\n",
    "# The string key 'preserve_links' is presumably ignored -- confirm against the installed docling version.\n",
    "converter = DocumentConverter(format_options={\"preserve_links\": True})\n",
    "\n",
    "input_path = Path(MY_CONFIG.CRAWL_DIR)\n",
    "output_path = Path(MY_CONFIG.PROCESSED_DATA_DIR)\n",
    "# Make the cell runnable on its own even if the cleanup cell above was skipped.\n",
    "output_path.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Gather every supported input type. Sorting makes the conversion order (and the log) reproducible;\n",
    "# it also makes it deterministic which file wins when two inputs share a stem\n",
    "# (e.g. page.html and page.pdf both write page.md -- the later one overwrites the earlier).\n",
    "input_files = sorted(\n",
    "    candidate\n",
    "    for pattern in ('*.html', '*.htm', '*.pdf')\n",
    "    for candidate in input_path.glob(pattern)\n",
    ")\n",
    "print (f\"Found {len(input_files)} files to convert\")\n",
    "\n",
    "files_processed = 0\n",
    "errors = 0\n",
    "for input_file in input_files:\n",
    "    try:\n",
    "        result = converter.convert(input_file)\n",
    "        markdown_content = result.document.export_to_markdown()\n",
    "\n",
    "        # Output keeps the input's base name: <stem>.md inside the processed-data directory.\n",
    "        md_file_name = output_path / f\"{input_file.stem}.md\"\n",
    "        md_file_name.write_text(markdown_content, encoding=\"utf-8\")\n",
    "\n",
    "        print (f\"Converted '{input_file}' --> '{md_file_name}'\")\n",
    "        files_processed += 1\n",
    "    except Exception as e:\n",
    "        # Best effort: a single failing document is counted and reported, not allowed to stop the batch.\n",
    "        errors += 1\n",
    "        print (f\"Error processing {input_file}: {e}\")\n",
    "\n",
    "print (f\"✅ Processed {files_processed} files.  Errors: {errors}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "allycat-1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}