{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Processing HTML Files\n",
    "\n",
    "We will be using **docling**\n",
    "\n",
    "References\n",
    "- [docling](https://github.com/DS4SD/docling)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-1: Data\n",
    "\n",
    "We will process data that is downloaded using [1_crawl_site.ipynb](1_crawl_site.ipynb).\n",
    "\n",
    "We have a couple of crawled HTML files in the `input` directory. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-2: Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "## All config is defined here\n",
    "from my_config import MY_CONFIG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Cleared processed data directory : workspace/processed\n"
     ]
    }
   ],
   "source": [
    "import os, sys\n",
    "import shutil\n",
    "\n",
    "# Start from a clean slate: drop any previous output, then recreate the empty directory.\n",
    "# ignore_errors=True makes the removal a no-op when the directory does not exist yet.\n",
    "shutil.rmtree(MY_CONFIG.PROCESSED_DATA_DIR, ignore_errors=True)\n",
    "os.makedirs(MY_CONFIG.PROCESSED_DATA_DIR, exist_ok=True)\n",
    "print (f\"✅ Cleared processed data directory : {MY_CONFIG.PROCESSED_DATA_DIR}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-3: Convert FILES --> MD\n",
    "\n",
    "Process HTML documents (and any PDF files found alongside them) and extract the text in markdown format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "from docling.document_converter import DocumentConverter\n",
    "\n",
    "# NOTE(review): docling documents `format_options` as a mapping of InputFormat -> FormatOption;\n",
    "# confirm that the string key \"preserve_links\" has any effect in the installed version.\n",
    "converter = DocumentConverter(format_options={\"preserve_links\": True})\n",
    "\n",
    "input_path = Path(MY_CONFIG.CRAWL_DIR)\n",
    "# Sorted so files are processed (and logged) in the same order on every run.\n",
    "input_files = sorted(\n",
    "    path\n",
    "    for pattern in ('*.html', '*.htm', '*.pdf')\n",
    "    for path in input_path.glob(pattern)\n",
    ")\n",
    "print (f\"Found {len(input_files)} files to convert\")\n",
    "\n",
    "files_processed = 0\n",
    "errors = 0\n",
    "for input_file in input_files:\n",
    "    try:\n",
    "        result = converter.convert(input_file)\n",
    "        markdown_content = result.document.export_to_markdown()\n",
    "\n",
    "        # Output name is derived from the stem only, so inputs that share a stem\n",
    "        # (e.g. page.html and page.pdf) write to the same .md file.\n",
    "        md_file_name = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, f\"{input_file.stem}.md\")\n",
    "        with open(md_file_name, \"w\", encoding=\"utf-8\") as md_file:\n",
    "            md_file.write(markdown_content)\n",
    "\n",
    "        print (f\"Converted '{input_file}' --> '{md_file_name}'\")\n",
    "        files_processed += 1\n",
    "    except Exception as e:\n",
    "        # Best-effort batch: count the failure and carry on with the remaining files.\n",
    "        errors += 1\n",
    "        print (f\"Error processing {input_file}: {e}\")\n",
    "\n",
    "print (f\"✅ Processed {files_processed} files. Errors: {errors}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "allycat-1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}