Spaces:

thuonguyenvan
/

Plant-Recognition-with-Q-A-System-Backend

Running

Plant-Recognition-with-Q-A-System-Backend / utils /chunker.py

Thuong Nguyen

Initial deployment: FastAPI backend with data and Dockerfile

f3cb94f 22 days ago

5.17 kB

	"""
	Intelligent Value Chunker for Vietnamese Text
	Sentence-level chunking with semantic preservation
	"""
	import re
	from typing import List, Tuple


	def estimate_tokens(text: str) -> int:
	    """
	    Approximate how many tokens a Vietnamese string occupies.

	    The text is split on whitespace and every resulting word is counted
	    as roughly 1.3 tokens; the product is truncated to an integer.

	    Args:
	        text: Input Vietnamese text

	    Returns:
	        Estimated token count (0 for empty or otherwise falsy input)
	    """
	    # Falsy input (empty string, None) contributes zero words.
	    word_count = len(text.split()) if text else 0
	    return int(word_count * 1.3)


	def split_into_sentences(text: str) -> List[str]:
	    """
	    Split Vietnamese text into sentences.

	    A sentence boundary is a '.', '!' or '?' followed by whitespace,
	    except that a period does NOT end a sentence when:
	    - it directly follows an ASCII uppercase letter (initials and
	      upper-case abbreviations such as "A." or "TP."), or
	    - it sits between two digits ("3. 5"). Decimals written without a
	      space ("3.14") are never boundaries because no whitespace follows
	      the period.

	    Trade-off: a sentence that genuinely ends in an uppercase letter
	    ("... vitamin C. Tiếp theo ...") stays attached to the next sentence.

	    Args:
	        text: Input Vietnamese text

	    Returns:
	        List of stripped, non-empty sentences in their original order
	    """
	    if not text:
	        return []

	    sentences = []
	    start = 0
	    # Boundaries are examined in place instead of being rewritten to a
	    # sentinel string: a protected period keeps its original whitespace,
	    # and input that happens to contain marker-like text is not mis-split.
	    for boundary in re.finditer(r'(?<=[.!?])\s+', text):
	        punct_idx = boundary.start() - 1
	        if text[punct_idx] == '.':
	            before = text[punct_idx - 1] if punct_idx > 0 else ''
	            after = text[boundary.end()] if boundary.end() < len(text) else ''
	            # Upper-case letter before the period: treat as abbreviation.
	            if 'A' <= before <= 'Z':
	                continue
	            # Digit on both sides of the period: treat as part of a number.
	            if before.isdecimal() and after.isdecimal():
	                continue
	        sentence = text[start:boundary.start()].strip()
	        if sentence:
	            sentences.append(sentence)
	        start = boundary.end()

	    # Whatever follows the last accepted boundary is the final sentence.
	    tail = text[start:].strip()
	    if tail:
	        sentences.append(tail)
	    return sentences


	def _split_by_words(text: str, max_tokens: int) -> List[str]:
	    """Greedily group whitespace-separated words into runs that fit max_tokens."""
	    runs = []
	    run = []
	    for word in text.split():
	        # A run always receives at least one word, so the loop makes
	        # progress even when a single word alone exceeds the budget.
	        if run and estimate_tokens(' '.join(run + [word])) > max_tokens:
	            runs.append(' '.join(run))
	            run = [word]
	        else:
	            run.append(word)
	    if run:
	        runs.append(' '.join(run))
	    return runs


	def _split_oversized_sentence(sentence: str, max_tokens: int) -> List[str]:
	    """Break one over-long sentence into comma clauses (or word runs) that fit max_tokens."""
	    raw_parts = sentence.split(',')
	    last_idx = len(raw_parts) - 1
	    pieces = []
	    for idx, raw in enumerate(raw_parts):
	        clause = raw.strip()
	        if not clause:
	            continue
	        # Keep the comma attached to its clause so that joining pieces
	        # with a plain space reproduces the original punctuation.
	        if idx < last_idx:
	            clause += ','
	        if estimate_tokens(clause) <= max_tokens:
	            pieces.append(clause)
	        else:
	            # No comma to split on: fall back to word boundaries.
	            pieces.extend(_split_by_words(clause, max_tokens))
	    return pieces


	def chunk_long_value(
	    key: str,
	    value: str,
	    max_tokens: int = 250,
	    min_tokens: int = 30
	) -> List[Tuple[str, str, int]]:
	    """
	    Sentence-level chunking of a long field value.

	    Values whose estimated size already fits max_tokens are returned as a
	    single chunk. Otherwise the value is split into sentences and the
	    sentences are packed greedily, in order, into chunks. A sentence that
	    is itself larger than max_tokens closes the chunk in progress and is
	    broken at commas (commas stay attached to their clause); a clause
	    that is still too large is broken at word boundaries.

	    The size of a chunk is always estimated on the joined chunk text, so
	    an emitted chunk only exceeds max_tokens when a single word alone
	    does.

	    Args:
	        key: Field key (Vietnamese)
	        value: Field value to chunk
	        max_tokens: Maximum estimated tokens per chunk (default: 250)
	        min_tokens: Accepted for interface compatibility but not
	            enforced: with greedy packing, merging a short trailing
	            chunk into its predecessor would exceed max_tokens.

	    Returns:
	        List of (key, chunk_text, chunk_id) tuples; chunk_id counts up
	        from 0 in output order
	    """
	    # Check if chunking is needed
	    if estimate_tokens(value) <= max_tokens:
	        return [(key, value, 0)]

	    chunks = []
	    pending = []

	    def flush():
	        # Emit the pieces collected so far as one chunk, then start over.
	        if pending:
	            chunks.append((key, ' '.join(pending), len(chunks)))
	            pending.clear()

	    for sentence in split_into_sentences(value):
	        if estimate_tokens(sentence) > max_tokens:
	            # Do not mix fragments of an over-long sentence into the
	            # chunk that holds the preceding complete sentences.
	            flush()
	            pieces = _split_oversized_sentence(sentence, max_tokens)
	        else:
	            pieces = [sentence]

	        for piece in pieces:
	            # Estimate the joined text rather than summing per-piece
	            # estimates: each estimate is truncated, so a sum of many
	            # short pieces undercounts the real chunk size.
	            if pending and estimate_tokens(' '.join(pending + [piece])) > max_tokens:
	                flush()
	            pending.append(piece)

	    # Save remaining
	    flush()

	    return chunks if chunks else [(key, value, 0)]


	# Manual smoke test: chunk a sample passage when executed as a script
	if __name__ == "__main__":
	    test_value = """Ở Ấn Độ, Nepal và Philipin, thân rễ sâm cau được dùng làm thuốc lợi tiểu và kích dục chữa bệnh ngoài da, loét dạ dày tá tràng, trĩ, lậu, bạch đới, hen, vàng da, tiêu chảy và nhức đầu. Ở Ấn Độ, người ta còn dùng thân rễ sâm cau để gây sẩy thai dưới dạng thuốc sắc, hoặc thuốc bột uống với đường trong một cốc sữa. Rễ sâm cau là một thành phần trong bài thuốc cổ truyền Ấn Độ gồm 10 dược liệu trị sỏi niệu."""

	    # Report the size of the input before chunking.
	    print(f"Original text: {len(test_value)} chars, ~{estimate_tokens(test_value)} tokens\n")

	    chunks = chunk_long_value("Làm thuốc", test_value, max_tokens=250)

	    # Summarise every chunk: its id, estimated size and a 100-char preview.
	    print(f"Generated {len(chunks)} chunks:\n")
	    for _field, text, chunk_id in chunks:
	        print(
	            f"Chunk {chunk_id}: {estimate_tokens(text)} tokens",
	            f"{text[:100]}...\n",
	            sep="\n",
	        )