"""
Document Intelligence Module for Advanced Text Analysis and Processing
"""
from pathlib import Path
from typing import List, Dict, Any
import re
import math
from collections import Counter


class DocumentIntelligence:
    """Advanced document intelligence for smart analysis and summarization."""
    
    def __init__(self, docs_root: Path):
        self.docs_root = docs_root
        
    def generate_smart_summary(self, content: str, summary_type: str = "medium") -> str:
        """Generate an intelligent summary based on content analysis."""
        # Strip PDF page markers and other extraction artifacts
        content = self._clean_pdf_content(content)
        
        sentences = self._split_into_sentences(content)
        
        if not sentences:
            return "No content available for summarization."
        
        # Score sentences based on multiple factors
        sentence_scores = {}
        
        # Factor 1: Word frequency
        words = self._extract_words(content)
        word_freq = Counter(words)
        
        # Factor 2: Position (early sentences often important)
        # Factor 3: Length (moderate length sentences preferred)
        # Factor 4: Keywords (technical terms, action words)
        
        for i, sentence in enumerate(sentences):
            score = 0
            sentence_words = self._extract_words(sentence)
            
            # Word frequency score
            for word in sentence_words:
                score += word_freq.get(word, 0)
            
            # Position score (first three sentences get a larger bonus,
            # the last two a smaller one)
            if i < 3:
                score += 5
            elif i >= len(sentences) - 2:
                score += 3
                
            # Length score (prefer moderate length)
            word_count = len(sentence_words)
            if 10 <= word_count <= 25:
                score += 3
            elif 5 <= word_count <= 35:
                score += 1
                
            # Keyword bonus
            keywords = ['important', 'key', 'main', 'primary', 'essential', 
                       'note', 'must', 'should', 'required', 'configure', 
                       'setup', 'install', 'create', 'build']
            for keyword in keywords:
                if keyword in sentence.lower():
                    score += 2
            
            # Normalize by sentence length so long sentences don't dominate
            sentence_scores[i] = score / max(len(sentence_words), 1)
        
        # Select top sentences based on summary type
        if summary_type == "short":
            top_count = min(3, len(sentences))
        elif summary_type == "long":
            top_count = min(10, len(sentences))
        else:  # medium
            top_count = min(6, len(sentences))
        
        # Rank by score, keep the top sentences, then re-sort the winners
        # by position so the summary reads in document order
        top_sentence_indices = sorted(
            sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)[:top_count],
            key=lambda x: x[0]
        )
        
        summary_sentences = [sentences[i] for i, _ in top_sentence_indices]
        return ' '.join(summary_sentences)
    
    def extract_key_concepts(self, content: str, min_frequency: int = 2) -> List[Dict[str, Any]]:
        """Extract key concepts and terms from content."""
        # Clean PDF content for better concept extraction
        content = self._clean_pdf_content(content)
        
        concepts = []
        
        # Extract technical terms (words in backticks)
        tech_terms = re.findall(r'`([^`]+)`', content)
        tech_term_freq = Counter(tech_terms)
        
        for term, freq in tech_term_freq.items():
            if freq >= min_frequency:
                concepts.append({
                    'concept': term,
                    'frequency': freq,
                    'type': 'technical_term'
                })
        
        # Extract important phrases (words in bold)
        bold_terms = re.findall(r'\*\*([^*]+)\*\*', content)
        bold_term_freq = Counter(bold_terms)
        
        for term, freq in bold_term_freq.items():
            if freq >= min_frequency:
                concepts.append({
                    'concept': term,
                    'frequency': freq,
                    'type': 'emphasized_term'
                })
        
        # Extract capitalized words (potential proper nouns/concepts)
        words = re.findall(r'\b[A-Z][a-z]+\b', content)
        cap_word_freq = Counter(words)
        
        for word, freq in cap_word_freq.items():
            if freq >= min_frequency and len(word) > 3:
                concepts.append({
                    'concept': word,
                    'frequency': freq,
                    'type': 'proper_noun'
                })
        
        # Sort by frequency and return top concepts
        concepts.sort(key=lambda x: x['frequency'], reverse=True)
        return concepts[:20]
    
    def analyze_readability(self, content: str) -> Dict[str, Any]:
        """Analyze content readability using various metrics."""
        # Clean PDF content for better analysis
        content = self._clean_pdf_content(content)
        
        sentences = self._split_into_sentences(content)
        words = self._extract_words(content)
        
        if not sentences or not words:
            return {"flesch_score": 0, "grade_level": 0, "complexity": "unknown"}
        
        # Basic counts
        sentence_count = len(sentences)
        word_count = len(words)
        syllable_count = sum(self._count_syllables(word) for word in words)
        
        # Average sentence length
        avg_sentence_length = word_count / sentence_count
        
        # Average syllables per word
        avg_syllables = syllable_count / word_count if word_count > 0 else 0
        
        # Flesch Reading Ease Score
        flesch_score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables)
        flesch_score = max(0, min(100, flesch_score))  # Clamp to 0-100
        
        # Flesch-Kincaid grade level
        grade_level = 0.39 * avg_sentence_length + 11.8 * avg_syllables - 15.59
        grade_level = max(1, grade_level)
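        # Worked example: 15 words/sentence and 1.5 syllables/word give
        # 206.835 - 1.015*15 - 84.6*1.5 = 64.71 ("moderate") and a grade
        # level of 0.39*15 + 11.8*1.5 - 15.59 = 7.96.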
        
        # Complexity assessment
        if flesch_score >= 70:
            complexity = "easy"
        elif flesch_score >= 50:
            complexity = "moderate"
        elif flesch_score >= 30:
            complexity = "difficult"
        else:
            complexity = "very difficult"
        
        return {
            "flesch_score": round(flesch_score, 1),
            "grade_level": round(grade_level, 1),
            "complexity": complexity,
            "avg_sentence_length": round(avg_sentence_length, 1),
            "avg_syllables_per_word": round(avg_syllables, 2),
            "total_sentences": sentence_count,
            "total_words": word_count
        }
    
    def extract_questions_and_answers(self, content: str) -> List[Dict[str, str]]:
        """Extract Q&A pairs from content."""
        qa_pairs = []
        
        # Look for FAQ sections
        sections = self._extract_sections(content)
        for section in sections:
            if any(keyword in section['title'].lower() for keyword in ['faq', 'question', 'q&a', 'troubleshoot']):
                pairs = self._extract_qa_from_section(section['content'])
                qa_pairs.extend(pairs)
        
        # Look for question patterns throughout the text
        question_patterns = [
            r'(?:Q:|Question:|Q\d+:)\s*([^?]+\?)\s*(?:A:|Answer:)?\s*([^Q\n]+)',
            r'(?:^|\n)([^.!?\n]*\?)\s*\n([^?\n]+)',
            r'How (?:do|to|can) ([^?]+\?)\s*([^?\n]+)'
        ]
        
        for pattern in question_patterns:
            matches = re.findall(pattern, content, re.MULTILINE | re.IGNORECASE)
            for match in matches:
                if len(match) == 2:
                    question, answer = match
                    qa_pairs.append({
                        "question": question.strip(),
                        "answer": answer.strip()[:300],  # Limit answer length
                        "type": "extracted"
                    })
        
        return qa_pairs[:15]  # Cap at the first 15 Q&A pairs found
    
    def find_related_content(self, query: str, doc_paths: List[Path], max_results: int = 5) -> List[Dict[str, Any]]:
        """Find documents related to a query using TF-IDF-like scoring."""
        query_words = set(self._extract_words(query.lower()))
        results = []
        
        for path in doc_paths:
            try:
                content = path.read_text(encoding='utf-8', errors='ignore')
                content_words = self._extract_words(content.lower())
                
                if not content_words:
                    continue
                
                # Calculate similarity score
                word_freq = Counter(content_words)
                score = 0
                
                for query_word in query_words:
                    if query_word in word_freq:
                        # TF-IDF like scoring
                        tf = word_freq[query_word] / len(content_words)
                        score += tf * len(query_word)  # Longer words get more weight
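                        # e.g. a 6-letter query word occurring 3 times in a
                        # 300-word document contributes (3/300) * 6 = 0.06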
                
                if score > 0:
                    # Normalize by document length
                    normalized_score = score / math.log(len(content_words) + 1)
                    
                    # Get context snippet
                    snippet = self._extract_snippet(content, query_words)
                    
                    results.append({
                        'path': str(path.relative_to(self.docs_root)),
                        'relevance_score': normalized_score,
                        'snippet': snippet,
                        'word_count': len(content_words)
                    })
                    
            except Exception:
                # Skip files that cannot be read
                continue
        
        # Sort by relevance and return top results
        results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return results[:max_results]
    
    def _split_into_sentences(self, content: str) -> List[str]:
        """Split content into sentences."""
        # Simple sentence splitting
        sentences = re.split(r'[.!?]+', content)
        return [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]
    
    def _extract_words(self, text: str) -> List[str]:
        """Extract words from text."""
        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
        # Filter out common stop words
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those', 'it', 'its', 'they', 'them', 'their'}
        return [word for word in words if word not in stop_words and len(word) > 2]
    
    def _count_syllables(self, word: str) -> int:
        """Estimate syllable count for a word."""
        word = word.lower()
        if len(word) <= 3:
            return 1
        
        vowels = 'aeiouy'
        syllable_count = 0
        prev_was_vowel = False
        
        for char in word:
            if char in vowels:
                if not prev_was_vowel:
                    syllable_count += 1
                prev_was_vowel = True
            else:
                prev_was_vowel = False
        
        # Handle silent e
        if word.endswith('e') and syllable_count > 1:
            syllable_count -= 1
        
        return max(1, syllable_count)
    
    def _extract_sections(self, content: str) -> List[Dict[str, str]]:
        """Extract sections from markdown content."""
        sections = []
        lines = content.split('\n')
        current_section = None
        current_content = []
        
        for line in lines:
            if line.strip().startswith('#'):
                if current_section:
                    sections.append({
                        'title': current_section,
                        'content': '\n'.join(current_content).strip()
                    })
                current_section = line.strip()
                current_content = []
            else:
                current_content.append(line)
        
        if current_section:
            sections.append({
                'title': current_section,
                'content': '\n'.join(current_content).strip()
            })
        
        return sections
    
    def _extract_qa_from_section(self, section_content: str) -> List[Dict[str, str]]:
        """Extract Q&A pairs from a section."""
        qa_pairs = []
        lines = section_content.split('\n')
        current_question = None
        current_answer = []
        
        for line in lines:
            line = line.strip()
            if line.endswith('?') and not current_question:
                current_question = line
            elif current_question and line and not line.endswith('?'):
                current_answer.append(line)
            elif current_question and (line.endswith('?') or not line):
                if current_answer:
                    qa_pairs.append({
                        "question": current_question,
                        "answer": ' '.join(current_answer),
                        "type": "faq"
                    })
                current_question = line if line.endswith('?') else None
                current_answer = []
        
        # Don't forget the last Q&A pair
        if current_question and current_answer:
            qa_pairs.append({
                "question": current_question,
                "answer": ' '.join(current_answer),
                "type": "faq"
            })
        
        return qa_pairs
    
    def _extract_snippet(self, content: str, query_words: set, snippet_length: int = 150) -> str:
        """Extract a relevant snippet containing query words."""
        content_lower = content.lower()
        
        # Find the first occurrence of any query word
        first_pos = len(content)
        for word in query_words:
            pos = content_lower.find(word)
            if pos != -1:
                first_pos = min(first_pos, pos)
        
        if first_pos == len(content):
            # No query words found, return beginning
            return content[:snippet_length] + "..." if len(content) > snippet_length else content
        
        # Extract snippet around the found position
        start = max(0, first_pos - snippet_length // 2)
        end = min(len(content), start + snippet_length)
        snippet = content[start:end]
        
        if start > 0:
            snippet = "..." + snippet
        if end < len(content):
            snippet = snippet + "..."
        
        return snippet.replace('\n', ' ')
    
    def _clean_pdf_content(self, content: str) -> str:
        """Clean PDF content by removing page markers and fixing formatting."""
        
        # Remove page markers like "--- Page 1 ---"
        content = re.sub(r'\n--- Page \d+ ---\n', '\n\n', content)
        content = re.sub(r'\n--- Page \d+ \(Error reading:.*?\) ---\n', '\n\n', content)
        
        # Fix common PDF extraction issues
        # Remove excessive whitespace
        content = re.sub(r'\n\s*\n\s*\n+', '\n\n', content)
        
        # Rejoin words that were hyphenated across line breaks
        content = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', content)
        
        # Re-insert a space where a lowercase letter runs into an uppercase
        # one (common when extraction drops spaces); note this also splits
        # camelCase identifiers
        content = re.sub(r'([a-z])([A-Z])', r'\1 \2', content)
        
        # Remove extra spaces
        content = re.sub(r' +', ' ', content)
        
        return content.strip()
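

if __name__ == "__main__":
    # Minimal usage sketch. "./docs" is a hypothetical documentation root;
    # the methods exercised below operate on in-memory strings, so nothing
    # is read from disk.
    di = DocumentIntelligence(Path("./docs"))

    sample = (
        "Setup is the first and most important step of the install process. "
        "You must configure the environment before you build the project. "
        "The main entry point should be documented so new users can create "
        "a working deployment quickly."
    )

    print(di.generate_smart_summary(sample, summary_type="short"))
    print(di.analyze_readability(sample))
    print(di.extract_key_concepts(sample, min_frequency=1))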