"""
Document Intelligence Module for Advanced Text Analysis and Processing
"""
from pathlib import Path
from typing import List, Dict, Any
import re
import math
from collections import Counter


class DocumentIntelligence:
    """Advanced document intelligence for smart analysis and summarization."""

    def __init__(self, docs_root: Path):
        self.docs_root = docs_root

    def generate_smart_summary(self, content: str, summary_type: str = "medium") -> str:
        """Generate an intelligent summary based on content analysis."""
        # Handle PDF page markers
        content = self._clean_pdf_content(content)
        sentences = self._split_into_sentences(content)
        if not sentences:
            return "No content available for summarization."

        # Score sentences on multiple factors:
        #   1. word frequency, 2. position (early sentences are often important),
        #   3. length (moderate-length sentences preferred), 4. keywords.
        sentence_scores = {}
        words = self._extract_words(content)
        word_freq = Counter(words)
        keywords = ['important', 'key', 'main', 'primary', 'essential',
                    'note', 'must', 'should', 'required', 'configure',
                    'setup', 'install', 'create', 'build']

        for i, sentence in enumerate(sentences):
            score = 0
            sentence_words = self._extract_words(sentence)

            # Word frequency score
            for word in sentence_words:
                score += word_freq.get(word, 0)

            # Position score (first and last sentences get a bonus)
            if i < 3:
                score += 5
            elif i >= len(sentences) - 2:
                score += 3

            # Length score (prefer moderate length)
            word_count = len(sentence_words)
            if 10 <= word_count <= 25:
                score += 3
            elif 5 <= word_count <= 35:
                score += 1

            # Keyword bonus
            for keyword in keywords:
                if keyword in sentence.lower():
                    score += 2

            sentence_scores[i] = score / max(len(sentence_words), 1)

        # Select the number of sentences to keep based on summary type
        if summary_type == "short":
            top_count = min(3, len(sentences))
        elif summary_type == "long":
            top_count = min(10, len(sentences))
        else:  # medium
            top_count = min(6, len(sentences))

        # Take the top-scoring sentences, then restore document order
        top_sentence_indices = sorted(
            sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)[:top_count],
            key=lambda x: x[0]
        )
        summary_sentences = [sentences[i] for i, _ in top_sentence_indices]
        return ' '.join(summary_sentences)

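    # A minimal usage sketch (hypothetical input; ``docs`` is an assumed
    # directory name, not something this module creates):
    #
    #     di = DocumentIntelligence(Path("docs"))
    #     summary = di.generate_smart_summary(readme_text, summary_type="short")
    #
    # With "short", at most the 3 best-scoring sentences are returned, in
    # their original document order.
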
    def extract_key_concepts(self, content: str, min_frequency: int = 2) -> List[Dict[str, Any]]:
        """Extract key concepts and terms from content."""
        # Clean PDF content for better concept extraction
        content = self._clean_pdf_content(content)
        concepts = []

        # Technical terms (words in backticks)
        tech_terms = re.findall(r'`([^`]+)`', content)
        for term, freq in Counter(tech_terms).items():
            if freq >= min_frequency:
                concepts.append({
                    'concept': term,
                    'frequency': freq,
                    'type': 'technical_term'
                })

        # Emphasized phrases (words in bold)
        bold_terms = re.findall(r'\*\*([^*]+)\*\*', content)
        for term, freq in Counter(bold_terms).items():
            if freq >= min_frequency:
                concepts.append({
                    'concept': term,
                    'frequency': freq,
                    'type': 'emphasized_term'
                })

        # Capitalized words (potential proper nouns/concepts)
        words = re.findall(r'\b[A-Z][a-z]+\b', content)
        for word, freq in Counter(words).items():
            if freq >= min_frequency and len(word) > 3:
                concepts.append({
                    'concept': word,
                    'frequency': freq,
                    'type': 'proper_noun'
                })

        # Sort by frequency and return the top concepts
        concepts.sort(key=lambda x: x['frequency'], reverse=True)
        return concepts[:20]

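    # Sketch of the expected output shape (illustrative values only, not
    # taken from real documents):
    #
    #     [{'concept': 'Docker', 'frequency': 7, 'type': 'proper_noun'},
    #      {'concept': 'pip install', 'frequency': 3, 'type': 'technical_term'},
    #      ...]
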
    def analyze_readability(self, content: str) -> Dict[str, Any]:
        """Analyze content readability using various metrics."""
        # Clean PDF content for better analysis
        content = self._clean_pdf_content(content)
        sentences = self._split_into_sentences(content)
        words = self._extract_words(content)
        if not sentences or not words:
            return {"flesch_score": 0, "grade_level": 0, "complexity": "unknown"}

        # Basic counts
        sentence_count = len(sentences)
        word_count = len(words)
        syllable_count = sum(self._count_syllables(word) for word in words)

        avg_sentence_length = word_count / sentence_count
        avg_syllables = syllable_count / word_count if word_count > 0 else 0

        # Flesch Reading Ease score, clamped to 0-100
        flesch_score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables)
        flesch_score = max(0, min(100, flesch_score))

        # Flesch-Kincaid grade level estimation
        grade_level = 0.39 * avg_sentence_length + 11.8 * avg_syllables - 15.59
        grade_level = max(1, grade_level)

        # Complexity assessment
        if flesch_score >= 70:
            complexity = "easy"
        elif flesch_score >= 50:
            complexity = "moderate"
        elif flesch_score >= 30:
            complexity = "difficult"
        else:
            complexity = "very difficult"

        return {
            "flesch_score": round(flesch_score, 1),
            "grade_level": round(grade_level, 1),
            "complexity": complexity,
            "avg_sentence_length": round(avg_sentence_length, 1),
            "avg_syllables_per_word": round(avg_syllables, 2),
            "total_sentences": sentence_count,
            "total_words": word_count
        }

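    # Worked example (hand-computed, illustrative): for text averaging
    # 12 words per sentence and 1.4 syllables per word,
    #     206.835 - 1.015 * 12 - 84.6 * 1.4 = 76.2  -> "easy"
    #     0.39 * 12 + 11.8 * 1.4 - 15.59   = 5.6    -> roughly 6th grade
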
    def extract_questions_and_answers(self, content: str) -> List[Dict[str, str]]:
        """Extract Q&A pairs from content."""
        qa_pairs = []

        # Look for FAQ sections
        sections = self._extract_sections(content)
        for section in sections:
            if any(keyword in section['title'].lower() for keyword in ['faq', 'question', 'q&a', 'troubleshoot']):
                pairs = self._extract_qa_from_section(section['content'])
                qa_pairs.extend(pairs)

        # Look for question patterns throughout the text
        question_patterns = [
            r'(?:Q:|Question:|Q\d+:)\s*([^?]+\?)\s*(?:A:|Answer:)?\s*([^Q\n]+)',
            r'(?:^|\n)([^.!?\n]*\?)\s*\n([^?\n]+)',
            r'How (?:do|to|can) ([^?]+\?)\s*([^?\n]+)'
        ]
        for pattern in question_patterns:
            matches = re.findall(pattern, content, re.MULTILINE | re.IGNORECASE)
            for match in matches:
                if len(match) == 2:
                    question, answer = match
                    qa_pairs.append({
                        "question": question.strip(),
                        "answer": answer.strip()[:300],  # Limit answer length
                        "type": "extracted"
                    })

        return qa_pairs[:15]  # Return at most 15 Q&A pairs

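    # Input shape this method handles, as a sketch (markdown with an FAQ
    # heading; the contents are made up for illustration):
    #
    #     ## FAQ
    #     How do I reset my key?
    #     Open Settings and click "Regenerate".
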
    def find_related_content(self, query: str, doc_paths: List[Path], max_results: int = 5) -> List[Dict[str, Any]]:
        """Find documents related to a query using TF-IDF-like scoring."""
        query_words = set(self._extract_words(query.lower()))
        results = []

        for path in doc_paths:
            try:
                content = path.read_text(encoding='utf-8', errors='ignore')
                content_words = self._extract_words(content.lower())
                if not content_words:
                    continue

                # Similarity score: term frequency weighted by word length
                word_freq = Counter(content_words)
                score = 0
                for query_word in query_words:
                    if query_word in word_freq:
                        tf = word_freq[query_word] / len(content_words)
                        score += tf * len(query_word)  # Longer words get more weight

                if score > 0:
                    # Normalize by document length
                    normalized_score = score / math.log(len(content_words) + 1)
                    # Get a context snippet around the first match
                    snippet = self._extract_snippet(content, query_words)
                    results.append({
                        'path': str(path.relative_to(self.docs_root)),
                        'relevance_score': normalized_score,
                        'snippet': snippet,
                        'word_count': len(content_words)
                    })
            except Exception:
                continue

        # Sort by relevance and return the top results
        results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return results[:max_results]

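    # A minimal search sketch (the ``docs/`` tree and the query are
    # assumptions for illustration):
    #
    #     di = DocumentIntelligence(Path("docs"))
    #     paths = list(Path("docs").rglob("*.md"))
    #     for hit in di.find_related_content("install docker", paths):
    #         print(hit['path'], round(hit['relevance_score'], 4))
    #
    # Note this is term frequency with a length penalty rather than full
    # TF-IDF: there is no inverse-document-frequency term across the corpus.
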
    def _split_into_sentences(self, content: str) -> List[str]:
        """Split content into sentences (simple punctuation-based splitting)."""
        sentences = re.split(r'[.!?]+', content)
        return [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]

    def _extract_words(self, text: str) -> List[str]:
        """Extract words from text, dropping stop words and very short words."""
        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
                      'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be',
                      'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                      'would', 'could', 'should', 'may', 'might', 'can', 'this',
                      'that', 'these', 'those', 'it', 'its', 'they', 'them', 'their'}
        return [word for word in words if word not in stop_words and len(word) > 2]

    def _count_syllables(self, word: str) -> int:
        """Estimate syllable count for a word by counting vowel groups."""
        word = word.lower()
        if len(word) <= 3:
            return 1

        vowels = 'aeiouy'
        syllable_count = 0
        prev_was_vowel = False
        for char in word:
            if char in vowels:
                if not prev_was_vowel:
                    syllable_count += 1
                prev_was_vowel = True
            else:
                prev_was_vowel = False

        # Handle silent e
        if word.endswith('e') and syllable_count > 1:
            syllable_count -= 1
        return max(1, syllable_count)

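    # This is a heuristic, not a dictionary lookup. Illustrative results:
    #     _count_syllables("cat") -> 1, "document" -> 3, "table" -> 1
    # "table" is undercounted because the silent-e rule fires; that is an
    # accepted trade-off of the vowel-group heuristic.
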
    def _extract_sections(self, content: str) -> List[Dict[str, str]]:
        """Extract sections from markdown content, keyed by heading lines."""
        sections = []
        lines = content.split('\n')
        current_section = None
        current_content = []

        for line in lines:
            if line.strip().startswith('#'):
                if current_section:
                    sections.append({
                        'title': current_section,
                        'content': '\n'.join(current_content).strip()
                    })
                current_section = line.strip()
                current_content = []
            else:
                current_content.append(line)

        if current_section:
            sections.append({
                'title': current_section,
                'content': '\n'.join(current_content).strip()
            })
        return sections

    def _extract_qa_from_section(self, section_content: str) -> List[Dict[str, str]]:
        """Extract Q&A pairs from a section: a question line followed by answer lines."""
        qa_pairs = []
        lines = section_content.split('\n')
        current_question = None
        current_answer = []

        for line in lines:
            line = line.strip()
            if line.endswith('?') and not current_question:
                current_question = line
            elif current_question and line and not line.endswith('?'):
                current_answer.append(line)
            elif current_question and (line.endswith('?') or not line):
                # A new question or a blank line closes the current pair
                if current_answer:
                    qa_pairs.append({
                        "question": current_question,
                        "answer": ' '.join(current_answer),
                        "type": "faq"
                    })
                current_question = line if line.endswith('?') else None
                current_answer = []

        # Don't forget the last Q&A pair
        if current_question and current_answer:
            qa_pairs.append({
                "question": current_question,
                "answer": ' '.join(current_answer),
                "type": "faq"
            })
        return qa_pairs

    def _extract_snippet(self, content: str, query_words: set, snippet_length: int = 150) -> str:
        """Extract a relevant snippet containing query words."""
        content_lower = content.lower()

        # Find the first occurrence of any query word
        first_pos = len(content)
        for word in query_words:
            pos = content_lower.find(word)
            if pos != -1:
                first_pos = min(first_pos, pos)

        if first_pos == len(content):
            # No query words found; return the beginning
            return content[:snippet_length] + "..." if len(content) > snippet_length else content

        # Extract a snippet centered on the found position
        start = max(0, first_pos - snippet_length // 2)
        end = min(len(content), start + snippet_length)
        snippet = content[start:end]
        if start > 0:
            snippet = "..." + snippet
        if end < len(content):
            snippet = snippet + "..."
        return snippet.replace('\n', ' ')

    def _clean_pdf_content(self, content: str) -> str:
        """Clean PDF content by removing page markers and fixing formatting."""
        # Remove page markers like "--- Page 1 ---"
        content = re.sub(r'\n--- Page \d+ ---\n', '\n\n', content)
        content = re.sub(r'\n--- Page \d+ \(Error reading:.*?\) ---\n', '\n\n', content)

        # Collapse runs of blank lines left by extraction
        content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)
        # Rejoin words hyphenated across line breaks (common in PDF extraction)
        content = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', content)
        # Re-insert spaces lost between words (e.g. "wordAnother")
        content = re.sub(r'([a-z])([A-Z])', r'\1 \2', content)
        # Collapse runs of spaces
        content = re.sub(r' +', ' ', content)
        return content.strip()
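
# Minimal smoke-test sketch, not part of the module's public API. The
# ``docs/`` directory and the sample text below are assumptions for
# illustration only.
if __name__ == "__main__":
    di = DocumentIntelligence(Path("docs"))
    sample = ("Setup is simple. First, install the required packages. "
              "Then configure the key settings in the config file. "
              "Finally, build and run the project to verify the install.")
    print(di.generate_smart_summary(sample, summary_type="short"))
    print(di.analyze_readability(sample))
    # Related-content search only makes sense if a docs tree actually exists.
    docs = Path("docs")
    if docs.exists():
        for hit in di.find_related_content("install setup", list(docs.rglob("*.md")), max_results=3):
            print(hit['path'], round(hit['relevance_score'], 4))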