{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save Markdown text into Vector DB\n",
    "\n",
    "Reads the processed markdown files, splits them into chunks, embeds the chunks and stores them in a Milvus collection."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-1: Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from my_config import MY_CONFIG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-2: Read Markdown"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "\n",
    "# Count the markdown files sitting directly in PROCESSED_DATA_DIR.\n",
    "# The pattern has no '**', so this is a non-recursive match: the same scope as\n",
    "# SimpleDirectoryReader(recursive=False) in the next cell.\n",
    "pattern = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, '*.md')\n",
    "md_file_count = len(glob.glob(pattern))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 96 documents from 96 files\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core import SimpleDirectoryReader\n",
    "\n",
    "reader = SimpleDirectoryReader(input_dir=MY_CONFIG.PROCESSED_DATA_DIR, recursive=False , required_exts=[\".md\"])\n",
    "documents = reader.load_data()\n",
    "\n",
    "print (f\"Loaded {len(documents)} documents from {md_file_count} files\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Doc ID: 20eef2cd-ee21-4dd4-baf6-eda09d5d793b\n",
      "Text: # Building the open future of AI We are technology developers,\n",
      "researchers, industry leaders and advocates who collaborate to advance\n",
      "safe, responsible AI rooted in open innovation. ![Conference\n",
      "Speaker](https://images.prismic.io/ai-alliance/Zy08cq8jQArT0jJI_Imagef\n",
      "romNotion.jpeg?auto=format%2Ccompress&fit=max&w=3840) ![Skills &\n",
      "Education](htt...\n"
     ]
    }
   ],
   "source": [
    "## Inspect a sample doc\n",
    "print (documents[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-3: Create Chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created 223 chunks from 96 documents\n"
     ]
    }
   ],
   "source": [
    "from llama_index.core import Document\n",
    "from llama_index.core.node_parser import SentenceSplitter\n",
    "\n",
    "# Split every document into chunks; sizes come from the project config.\n",
    "parser = SentenceSplitter(chunk_size=MY_CONFIG.CHUNK_SIZE, chunk_overlap=MY_CONFIG.CHUNK_OVERLAP)\n",
    "nodes = parser.get_nodes_from_documents(documents)\n",
    "print(f\"Created {len(nodes)} chunks from {len(documents)} documents\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-4: Setup Embedding Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Optional Hugging Face mirror.\n",
    "# Leave this off unless https://huggingface.co/ cannot be reached from your network.\n",
    "# In that case set USE_HF_MIRROR = True, restart the kernel and run all cells, so that\n",
    "# HF_ENDPOINT is set before the embedding model is loaded in the next cell.\n",
    "USE_HF_MIRROR = False\n",
    "\n",
    "if USE_HF_MIRROR:\n",
    "    os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/sujee/apps/anaconda3/envs/allycat-6/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.core import Settings\n",
    "\n",
    "# Register the embedding model globally so the index built in Step-6 uses it.\n",
    "Settings.embed_model = HuggingFaceEmbedding(\n",
    "    model_name = MY_CONFIG.EMBEDDING_MODEL\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-5: Connect to Milvus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected to Milvus instance:  workspace/rag_website_milvus.db\n"
     ]
    }
   ],
   "source": [
    "## Clear up any old data\n",
    "\n",
    "from pymilvus import MilvusClient\n",
    "\n",
    "milvus_client = MilvusClient(MY_CONFIG.DB_URI)\n",
    "print (\"✅ Connected to Milvus instance: \", MY_CONFIG.DB_URI )\n",
    "\n",
    "# if we already have a collection, clear it first\n",
    "if milvus_client.has_collection(collection_name = MY_CONFIG.COLLECTION_NAME):\n",
    "    milvus_client.drop_collection(collection_name = MY_CONFIG.COLLECTION_NAME)\n",
    "    print ('✅ Cleared collection :', MY_CONFIG.COLLECTION_NAME)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025-05-12 23:36:12,218 [DEBUG][_create_connection]: Created new connection using: f81ea0e5320b44f7b5ba8b89f6aa43f7 (async_milvus_client.py:600)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Connected Llama-index to Milvus instance:  workspace/rag_website_milvus.db\n"
     ]
    }
   ],
   "source": [
    "# connect llama-index to vector db\n",
    "\n",
    "from llama_index.core import StorageContext\n",
    "from llama_index.vector_stores.milvus import MilvusVectorStore\n",
    "\n",
    "# dim must match the output size of the embedding model configured in Step-4.\n",
    "# overwrite=True asks the store to replace an existing collection of the same name\n",
    "# (the previous cell has already dropped it, so this is a second safeguard).\n",
    "vector_store = MilvusVectorStore(\n",
    "    uri = MY_CONFIG.DB_URI ,\n",
    "    dim = MY_CONFIG.EMBEDDING_LENGTH ,\n",
    "    collection_name = MY_CONFIG.COLLECTION_NAME,\n",
    "    overwrite=True\n",
    ")\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "\n",
    "print (\"✅ Connected Llama-index to Milvus instance: \", MY_CONFIG.DB_URI )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step-6: Save to DB\n",
    "\n",
    "We index the chunks created in Step-3. Alternative: `VectorStoreIndex.from_documents(documents, storage_context=storage_context)` builds the index straight from the loaded documents instead of the pre-built chunks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Successfully stored 223 chunks in Milvus collection 'pages'\n",
      "CPU times: user 900 ms, sys: 142 ms, total: 1.04 s\n",
      "Wall time: 807 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# save chunks into vector db\n",
    "\n",
    "from llama_index.core import VectorStoreIndex\n",
    "\n",
    "index = VectorStoreIndex(\n",
    "    nodes=nodes,\n",
    "    storage_context=storage_context,\n",
    ")\n",
    "\n",
    "print(f\"Successfully stored {len(nodes)} chunks in Milvus collection '{MY_CONFIG.COLLECTION_NAME}'\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Release the client connection opened in Step-5.\n",
    "milvus_client.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "allycat-6",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}