# Save Markdown text into Vector DB

## Step-1: Config

In [1]:
from my_config import MY_CONFIG

## Step-2: Read Markdown

In [2]:
import os
import glob

# Count the markdown files sitting directly inside the processed-data directory.
# The pattern contains no '**' component, so glob's `recursive` flag would have
# no effect; it is omitted so this count visibly matches a non-recursive scan.
pattern = os.path.join(MY_CONFIG.PROCESSED_DATA_DIR, '*.md')
md_file_count = len(glob.glob(pattern))

In [3]:
from llama_index.core import SimpleDirectoryReader

# Read the processed-data directory, restricted to .md files and without
# descending into sub-directories.
reader = SimpleDirectoryReader(
    input_dir=MY_CONFIG.PROCESSED_DATA_DIR,
    recursive=False,
    required_exts=[".md"],
)
documents = reader.load_data()

# Report how many documents were produced next to the on-disk file count
print(f"Loaded {len(documents)} documents from {md_file_count} files")


Loaded 96 documents from 96 files


In [4]:
## Sanity check: show the first loaded document
print(documents[0])

Doc ID: 20eef2cd-ee21-4dd4-baf6-eda09d5d793b
Text: # Building the open future of AI We are technology developers,
researchers, industry leaders and advocates who collaborate to advance
safe, responsible AI rooted in open innovation. ![Conference
Speaker](https://images.prismic.io/ai-alliance/Zy08cq8jQArT0jJI_Imagef
romNotion.jpeg?auto=format%2Ccompress&fit=max&w=3840) ![Skills &
Education](htt...


## Step-3: Create Chunks

In [5]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

# Split the loaded documents into chunks; chunk size and overlap are taken
# from MY_CONFIG.
parser = SentenceSplitter(
    chunk_size=MY_CONFIG.CHUNK_SIZE,
    chunk_overlap=MY_CONFIG.CHUNK_OVERLAP,
)
nodes = parser.get_nodes_from_documents(documents)

print(f"Created {len(nodes)} chunks from {len(documents)} documents")

Created 223 chunks from 96 documents


## Step-4: Setup Embedding Model

In [6]:
# Model downloads normally go to https://huggingface.co/.
# If the connection to https://huggingface.co/ fails from your network, set
# USE_HF_MIRROR = True to route downloads through the mirror below instead.
# The override is opt-in so that a default run does not silently depend on a
# third-party mirror.
# NOTE(review): HF_ENDPOINT is presumably read when the Hugging Face libraries
# are imported, so keep this cell ahead of the embedding-model cell.
import os

USE_HF_MIRROR = False

if USE_HF_MIRROR:
    os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Install the embedding model named in MY_CONFIG on llama-index's Settings object
Settings.embed_model = HuggingFaceEmbedding(model_name=MY_CONFIG.EMBEDDING_MODEL)

 from .autonotebook import tqdm as notebook_tqdm


## Step-5: Connect to Milvus

In [8]:
## Connect to Milvus and remove any collection left over from an earlier run

from pymilvus import MilvusClient

milvus_client = MilvusClient(MY_CONFIG.DB_URI)
print("✅ Connected to Milvus instance: ", MY_CONFIG.DB_URI)

# Drop the target collection when it is already present
if milvus_client.has_collection(collection_name=MY_CONFIG.COLLECTION_NAME):
    milvus_client.drop_collection(collection_name=MY_CONFIG.COLLECTION_NAME)
    print("✅ Cleared collection :", MY_CONFIG.COLLECTION_NAME)
 

✅ Connected to Milvus instance: workspace/rag_website_milvus.db


In [9]:
# Wire llama-index to the Milvus vector store

from llama_index.core import StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

# URI, vector dimension and collection name all come from MY_CONFIG
vector_store = MilvusVectorStore(
    uri=MY_CONFIG.DB_URI,
    dim=MY_CONFIG.EMBEDDING_LENGTH,
    collection_name=MY_CONFIG.COLLECTION_NAME,
    overwrite=True,
)

# Storage context that hands the vector store to llama-index
storage_context = StorageContext.from_defaults(vector_store=vector_store)

print("✅ Connected Llama-index to Milvus instance: ", MY_CONFIG.DB_URI)

2025-05-12 23:36:12,218 [DEBUG][_create_connection]: Created new connection using: f81ea0e5320b44f7b5ba8b89f6aa43f7 (async_milvus_client.py:600)


✅ Connected Llama-index to Milvus instance: workspace/rag_website_milvus.db


## Step-6: Save to DB

In [10]:
%%time

## Disabled alternative, kept for reference: save entire md documents
## (rather than chunks) into the vector store.
## Every statement below is commented out, so running this cell only
## executes the %%time magic and stores nothing.

# from llama_index.core import VectorStoreIndex

# index = VectorStoreIndex.from_documents(
# documents, storage_context=storage_context
# )
# print (f"✅ Saved {len(documents)} documents to db: {MY_CONFIG.DB_URI}" )

CPU times: user 9 μs, sys: 0 ns, total: 9 μs
Wall time: 18.8 μs


In [11]:
%%time 

# save chunks into vector db

from llama_index.core import VectorStoreIndex

index = VectorStoreIndex(
 nodes=nodes,
 storage_context=storage_context,
 )

print(f"Successfully stored {len(nodes)} chunks in Milvus collection '{MY_CONFIG.COLLECTION_NAME}'")


Successfully stored 223 chunks in Milvus collection 'pages'
CPU times: user 900 ms, sys: 142 ms, total: 1.04 s
Wall time: 807 ms


In [None]:
# Close the pymilvus client that was opened for the collection clean-up step
milvus_client.close()