"""
Document Intelligence Module for Advanced Text Analysis and Processing
"""
from pathlib import Path
from typing import List, Dict, Any
import re
import math
from collections import Counter
class DocumentIntelligence:
"""Advanced document intelligence for smart analysis and summarization."""
def __init__(self, docs_root: Path):
self.docs_root = docs_root
def generate_smart_summary(self, content: str, summary_type: str = "medium") -> str:
"""Generate an intelligent summary based on content analysis."""
# Handle PDF page markers
content = self._clean_pdf_content(content)
sentences = self._split_into_sentences(content)
if not sentences:
return "No content available for summarization."
# Score sentences based on multiple factors
sentence_scores = {}
# Factor 1: Word frequency
words = self._extract_words(content)
word_freq = Counter(words)
# Factor 2: Position (early sentences often important)
# Factor 3: Length (moderate length sentences preferred)
# Factor 4: Keywords (technical terms, action words)
for i, sentence in enumerate(sentences):
score = 0
sentence_words = self._extract_words(sentence)
# Word frequency score
for word in sentence_words:
score += word_freq.get(word, 0)
            # Position score (the first three and last two sentences get a bonus)
if i < 3:
score += 5
elif i >= len(sentences) - 2:
score += 3
# Length score (prefer moderate length)
word_count = len(sentence_words)
if 10 <= word_count <= 25:
score += 3
elif 5 <= word_count <= 35:
score += 1
# Keyword bonus
keywords = ['important', 'key', 'main', 'primary', 'essential',
'note', 'must', 'should', 'required', 'configure',
'setup', 'install', 'create', 'build']
for keyword in keywords:
if keyword in sentence.lower():
score += 2
sentence_scores[i] = score / max(len(sentence_words), 1)
# Select top sentences based on summary type
if summary_type == "short":
top_count = min(3, len(sentences))
elif summary_type == "long":
top_count = min(10, len(sentences))
else: # medium
top_count = min(6, len(sentences))
# Get top scoring sentences, maintaining order
top_sentence_indices = sorted(
sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)[:top_count],
key=lambda x: x[0]
)
summary_sentences = [sentences[i] for i, _ in top_sentence_indices]
return ' '.join(summary_sentences)
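
    # Illustrative call (a sketch; `text` is a placeholder variable, not defined
    # in this module):
    #   di = DocumentIntelligence(Path("docs"))
    #   di.generate_smart_summary(text, summary_type="short")
    #   -> roughly the three highest-scoring sentences, joined in original order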
def extract_key_concepts(self, content: str, min_frequency: int = 2) -> List[Dict[str, Any]]:
"""Extract key concepts and terms from content."""
# Clean PDF content for better concept extraction
content = self._clean_pdf_content(content)
concepts = []
# Extract technical terms (words in backticks)
tech_terms = re.findall(r'`([^`]+)`', content)
tech_term_freq = Counter(tech_terms)
for term, freq in tech_term_freq.items():
if freq >= min_frequency:
concepts.append({
'concept': term,
'frequency': freq,
'type': 'technical_term'
})
# Extract important phrases (words in bold)
bold_terms = re.findall(r'\*\*([^*]+)\*\*', content)
bold_term_freq = Counter(bold_terms)
for term, freq in bold_term_freq.items():
if freq >= min_frequency:
concepts.append({
'concept': term,
'frequency': freq,
'type': 'emphasized_term'
})
# Extract capitalized words (potential proper nouns/concepts)
words = re.findall(r'\b[A-Z][a-z]+\b', content)
cap_word_freq = Counter(words)
for word, freq in cap_word_freq.items():
if freq >= min_frequency and len(word) > 3:
concepts.append({
'concept': word,
'frequency': freq,
'type': 'proper_noun'
})
# Sort by frequency and return top concepts
concepts.sort(key=lambda x: x['frequency'], reverse=True)
return concepts[:20]
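
    # For example, a document that wraps `docker run` in backticks twice would
    # yield an entry like {'concept': 'docker run', 'frequency': 2,
    # 'type': 'technical_term'} (illustrative values only).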
def analyze_readability(self, content: str) -> Dict[str, Any]:
"""Analyze content readability using various metrics."""
# Clean PDF content for better analysis
content = self._clean_pdf_content(content)
sentences = self._split_into_sentences(content)
words = self._extract_words(content)
if not sentences or not words:
return {"flesch_score": 0, "grade_level": 0, "complexity": "unknown"}
# Basic counts
sentence_count = len(sentences)
word_count = len(words)
syllable_count = sum(self._count_syllables(word) for word in words)
# Average sentence length
avg_sentence_length = word_count / sentence_count
# Average syllables per word
avg_syllables = syllable_count / word_count if word_count > 0 else 0
# Flesch Reading Ease Score
flesch_score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables)
flesch_score = max(0, min(100, flesch_score)) # Clamp to 0-100
# Grade level estimation
grade_level = 0.39 * avg_sentence_length + 11.8 * avg_syllables - 15.59
grade_level = max(1, grade_level)
# Complexity assessment
if flesch_score >= 70:
complexity = "easy"
elif flesch_score >= 50:
complexity = "moderate"
elif flesch_score >= 30:
complexity = "difficult"
else:
complexity = "very difficult"
return {
"flesch_score": round(flesch_score, 1),
"grade_level": round(grade_level, 1),
"complexity": complexity,
"avg_sentence_length": round(avg_sentence_length, 1),
"avg_syllables_per_word": round(avg_syllables, 2),
"total_sentences": sentence_count,
"total_words": word_count
}
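
    # Worked example of the two formulas above: with avg_sentence_length = 15
    # and avg_syllables_per_word = 1.5,
    #   flesch_score = 206.835 - 1.015 * 15 - 84.6 * 1.5 ≈ 64.7  ("moderate")
    #   grade_level  = 0.39 * 15 + 11.8 * 1.5 - 15.59 ≈ 8.0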
def extract_questions_and_answers(self, content: str) -> List[Dict[str, str]]:
"""Extract Q&A pairs from content."""
qa_pairs = []
# Look for FAQ sections
sections = self._extract_sections(content)
for section in sections:
if any(keyword in section['title'].lower() for keyword in ['faq', 'question', 'q&a', 'troubleshoot']):
pairs = self._extract_qa_from_section(section['content'])
qa_pairs.extend(pairs)
# Look for question patterns throughout the text
question_patterns = [
r'(?:Q:|Question:|Q\d+:)\s*([^?]+\?)\s*(?:A:|Answer:)?\s*([^Q\n]+)',
r'(?:^|\n)([^.!?\n]*\?)\s*\n([^?\n]+)',
r'How (?:do|to|can) ([^?]+\?)\s*([^?\n]+)'
]
for pattern in question_patterns:
matches = re.findall(pattern, content, re.MULTILINE | re.IGNORECASE)
for match in matches:
if len(match) == 2:
question, answer = match
qa_pairs.append({
"question": question.strip(),
"answer": answer.strip()[:300], # Limit answer length
"type": "extracted"
})
return qa_pairs[:15] # Return top 15 Q&A pairs
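
    # Illustrative match for the first pattern above:
    #   "Q: How do I reset the cache?\nA: Delete the .cache folder."
    #   -> {'question': 'How do I reset the cache?',
    #       'answer': 'Delete the .cache folder.', 'type': 'extracted'}
    # (the looser patterns may add overlapping entries for the same text)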
def find_related_content(self, query: str, doc_paths: List[Path], max_results: int = 5) -> List[Dict[str, Any]]:
"""Find documents related to a query using TF-IDF-like scoring."""
query_words = set(self._extract_words(query.lower()))
results = []
for path in doc_paths:
try:
content = path.read_text(encoding='utf-8', errors='ignore')
content_words = self._extract_words(content.lower())
if not content_words:
continue
# Calculate similarity score
word_freq = Counter(content_words)
score = 0
for query_word in query_words:
if query_word in word_freq:
                        # Term-frequency scoring (no true IDF term; longer query words get extra weight below)
tf = word_freq[query_word] / len(content_words)
score += tf * len(query_word) # Longer words get more weight
if score > 0:
# Normalize by document length
normalized_score = score / math.log(len(content_words) + 1)
# Get context snippet
snippet = self._extract_snippet(content, query_words)
results.append({
'path': str(path.relative_to(self.docs_root)),
'relevance_score': normalized_score,
'snippet': snippet,
'word_count': len(content_words)
})
except Exception:
continue
# Sort by relevance and return top results
results.sort(key=lambda x: x['relevance_score'], reverse=True)
return results[:max_results]
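
    # Illustrative call (instance name, paths, and query are placeholders):
    #   hits = di.find_related_content("install docker", list(docs_root.rglob("*.md")))
    #   -> up to max_results dicts with 'path', 'relevance_score', 'snippet', 'word_count'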
def _split_into_sentences(self, content: str) -> List[str]:
"""Split content into sentences."""
# Simple sentence splitting
sentences = re.split(r'[.!?]+', content)
return [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]
def _extract_words(self, text: str) -> List[str]:
"""Extract words from text."""
words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
# Filter out common stop words
stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those', 'it', 'its', 'they', 'them', 'their'}
return [word for word in words if word not in stop_words and len(word) > 2]
def _count_syllables(self, word: str) -> int:
"""Estimate syllable count for a word."""
word = word.lower()
if len(word) <= 3:
return 1
vowels = 'aeiouy'
syllable_count = 0
prev_was_vowel = False
for char in word:
if char in vowels:
if not prev_was_vowel:
syllable_count += 1
prev_was_vowel = True
else:
prev_was_vowel = False
# Handle silent e
if word.endswith('e') and syllable_count > 1:
syllable_count -= 1
return max(1, syllable_count)
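
    # Worked example: "configure" has the vowel groups o / i / u / e (4); the
    # trailing silent 'e' is subtracted, giving an estimate of 3 syllables.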
def _extract_sections(self, content: str) -> List[Dict[str, str]]:
"""Extract sections from markdown content."""
sections = []
lines = content.split('\n')
current_section = None
current_content = []
for line in lines:
if line.strip().startswith('#'):
if current_section:
sections.append({
'title': current_section,
'content': '\n'.join(current_content).strip()
})
current_section = line.strip()
current_content = []
else:
current_content.append(line)
if current_section:
sections.append({
'title': current_section,
'content': '\n'.join(current_content).strip()
})
return sections
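
    # Illustrative split: "# Setup\nInstall deps\n# Usage\nRun it" ->
    #   [{'title': '# Setup', 'content': 'Install deps'},
    #    {'title': '# Usage', 'content': 'Run it'}]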
def _extract_qa_from_section(self, section_content: str) -> List[Dict[str, str]]:
"""Extract Q&A pairs from a section."""
qa_pairs = []
lines = section_content.split('\n')
current_question = None
current_answer = []
for line in lines:
line = line.strip()
if line.endswith('?') and not current_question:
current_question = line
elif current_question and line and not line.endswith('?'):
current_answer.append(line)
elif current_question and (line.endswith('?') or not line):
if current_answer:
qa_pairs.append({
"question": current_question,
"answer": ' '.join(current_answer),
"type": "faq"
})
current_question = line if line.endswith('?') else None
current_answer = []
# Don't forget the last Q&A pair
if current_question and current_answer:
qa_pairs.append({
"question": current_question,
"answer": ' '.join(current_answer),
"type": "faq"
})
return qa_pairs
def _extract_snippet(self, content: str, query_words: set, snippet_length: int = 150) -> str:
"""Extract a relevant snippet containing query words."""
content_lower = content.lower()
# Find the first occurrence of any query word
first_pos = len(content)
for word in query_words:
pos = content_lower.find(word)
if pos != -1:
first_pos = min(first_pos, pos)
if first_pos == len(content):
# No query words found, return beginning
return content[:snippet_length] + "..." if len(content) > snippet_length else content
# Extract snippet around the found position
start = max(0, first_pos - snippet_length // 2)
end = min(len(content), start + snippet_length)
snippet = content[start:end]
if start > 0:
snippet = "..." + snippet
if end < len(content):
snippet = snippet + "..."
return snippet.replace('\n', ' ')
def _clean_pdf_content(self, content: str) -> str:
"""Clean PDF content by removing page markers and fixing formatting."""
# Remove page markers like "--- Page 1 ---"
content = re.sub(r'\n--- Page \d+ ---\n', '\n\n', content)
content = re.sub(r'\n--- Page \d+ \(Error reading:.*?\) ---\n', '\n\n', content)
# Fix common PDF extraction issues
# Remove excessive whitespace
content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)
        # Rejoin words that were hyphenated across line breaks (common in PDF extraction)
content = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', content)
        # Insert a space where words were run together (a lowercase letter immediately followed by an uppercase one)
content = re.sub(r'([a-z])([A-Z])', r'\1 \2', content)
# Remove extra spaces
content = re.sub(r' +', ' ', content)
return content.strip()
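
# ---------------------------------------------------------------------------
# Minimal usage sketch, not a definitive driver: the "docs" root below is a
# placeholder assumption and is never read, since the sample text is in-memory.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample = (
        "Installation Guide. You must install the `cli` tool before you can "
        "configure the project. Run the setup script, then create a config "
        "file. **Important**: the config file is required for every build."
    )
    di = DocumentIntelligence(Path("docs"))  # hypothetical docs root
    print("Summary:", di.generate_smart_summary(sample, summary_type="short"))
    print("Concepts:", di.extract_key_concepts(sample, min_frequency=1))
    print("Readability:", di.analyze_readability(sample))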