Spaces:

niloydebbarma
/

allycat

Runtime error

File size: 8,501 Bytes

a7d2416

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save Markdown text into Vector DB"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-1: Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from my_config import MY_CONFIG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-2: Read Markdown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "\n",
    "pattern = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, '*.md')\n",
    "md_file_count = len(glob.glob(pattern, recursive=True)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 96 documents from 96 files\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core import SimpleDirectoryReader\n",
    "\n",
    "reader = SimpleDirectoryReader(input_dir=MY_CONFIG.PROCESSED_DATA_DIR, recursive=False , required_exts=[\".md\"])\n",
    "documents = reader.load_data()\n",
    "\n",
    "print (f\"Loaded {len(documents)} documents from {md_file_count} files\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doc ID: 20eef2cd-ee21-4dd4-baf6-eda09d5d793b\n",
      "Text: # Building the open future of AI  We are technology developers,\n",
      "researchers, industry leaders and advocates who collaborate to advance\n",
      "safe, responsible AI rooted in open innovation.  ![Conference\n",
      "Speaker](https://images.prismic.io/ai-alliance/Zy08cq8jQArT0jJI_Imagef\n",
      "romNotion.jpeg?auto=format%2Ccompress&fit=max&w=3840)  ![Skills &\n",
      "Education](htt...\n"
     ]
    }
   ],
   "source": [
    "## Inspect a sample doc\n",
    "print (documents[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-3: Create Chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created 223 chunks from 96 documents\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core import Document\n",
    "from llama_index.core.node_parser import SentenceSplitter\n",
    "\n",
    "parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)\n",
    "nodes = parser.get_nodes_from_documents(documents)\n",
    "print(f\"Created {len(nodes)} chunks from {len(documents)} documents\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-4: Setup Embedding Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If connection to https://huggingface.co/ failed, uncomment the following path\n",
    "import os\n",
    "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sujee/apps/anaconda3/envs/allycat-6/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.core import Settings\n",
    "\n",
    "Settings.embed_model = HuggingFaceEmbedding(\n",
    "    model_name = MY_CONFIG.EMBEDDING_MODEL\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-5: Connect to Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected to Milvus instance:  workspace/rag_website_milvus.db\n"
     ]
    }
   ],
   "source": [
    "## Clear up any old data\n",
    "\n",
    "from pymilvus import MilvusClient\n",
    "\n",
    "milvus_client = MilvusClient(MY_CONFIG.DB_URI)\n",
    "print (\"✅ Connected to Milvus instance: \", MY_CONFIG.DB_URI )\n",
    "\n",
    "# if we already have a collection, clear it first\n",
    "if milvus_client.has_collection(collection_name = MY_CONFIG.COLLECTION_NAME):\n",
    "    milvus_client.drop_collection(collection_name = MY_CONFIG.COLLECTION_NAME)\n",
    "    print ('✅ Cleared collection :', MY_CONFIG.COLLECTION_NAME)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-05-12 23:36:12,218 [DEBUG][_create_connection]: Created new connection using: f81ea0e5320b44f7b5ba8b89f6aa43f7 (async_milvus_client.py:600)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected Llama-index to Milvus instance:  workspace/rag_website_milvus.db\n"
     ]
    }
   ],
   "source": [
    "# connect llama-index to vector db\n",
    "\n",
    "from llama_index.core import StorageContext\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
    "\n",
    "vector_store = MilvusVectorStore(\n",
    "    uri = MY_CONFIG.DB_URI ,\n",
    "    dim = MY_CONFIG.EMBEDDING_LENGTH , \n",
    "    collection_name = MY_CONFIG.COLLECTION_NAME,\n",
    "    overwrite=True\n",
    ")\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "\n",
    "print (\"✅ Connected Llama-index to Milvus instance: \", MY_CONFIG.DB_URI )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-6: Save to DB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9 μs, sys: 0 ns, total: 9 μs\n",
      "Wall time: 18.8 μs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "## We save entire md documents into vector store\n",
    "\n",
    "# from llama_index.core import VectorStoreIndex\n",
    "\n",
    "# index = VectorStoreIndex.from_documents(\n",
    "#     documents, storage_context=storage_context\n",
    "# )\n",
    "# print (f\"✅ Saved {len(documents)} documents to db: {MY_CONFIG.DB_URI}\" )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully stored 223 chunks in Milvus collection 'pages'\n",
      "CPU times: user 900 ms, sys: 142 ms, total: 1.04 s\n",
      "Wall time: 807 ms\n"
     ]
    }
   ],
   "source": [
    "%%time \n",
    "\n",
    "# save chunks into vector db\n",
    "\n",
    "from llama_index.core import VectorStoreIndex\n",
    "\n",
    "index = VectorStoreIndex(\n",
    "        nodes=nodes,\n",
    "        storage_context=storage_context,\n",
    "    )\n",
    "\n",
    "print(f\"Successfully stored {len(nodes)} chunks in Milvus collection '{MY_CONFIG.COLLECTION_NAME}'\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "milvus_client.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "allycat-6",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}