fix: improve content capture and title extraction

- Optimize clipboard content capture order (Unicode > HTML > Text)
- Add proper HTML content extraction
- Improve title extraction with markdown and topic sentence support
- Add automatic copy operation before content capture
- Fix content encoding issues
2025-01-15 21:57:46 +08:00 · 2025-01-15 21:57:46 +08:00 · 0a5179209d
commit 0a5179209d
parent 595f22c929
2 changed files with 68 additions and 37 deletions
--- a/project/llmclipboard/llmclipboard/app.py
+++ b/project/llmclipboard/llmclipboard/app.py
@@ -46,54 +46,64 @@ class TextCaptureService:
            win32clipboard.OpenClipboard()
            content = None

-            # 尝试获取HTML格式
+            # 首先尝试获取Unicode文本
            try:
-                content = win32clipboard.GetClipboardData(win32con.CF_HTML)
+                content = win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
+                self.logger.info("获取Unicode文本格式内容")
+                if content:
+                    return content
+            except Exception as e:
+                self.logger.debug(f"获取Unicode文本失败: {e}")
+
+            # 如果Unicode获取失败，尝试获取HTML格式
+            try:
+                html_content = win32clipboard.GetClipboardData(win32con.CF_HTML)
                self.logger.info("获取HTML格式内容")
-                h = HTML2Text()
-                h.body_width = 0  # 禁用自动换行
-                h.single_line_break = True  # 使用单行换行
-                h.ignore_emphasis = False
-                h.ignore_images = False
-                h.ignore_links = False
-                h.ignore_tables = False
-                content = h.handle(content).strip()
+                if html_content:
+                    # 解析HTML格式的内容
+                    h = HTML2Text()
+                    h.body_width = 0  # 禁用自动换行
+                    h.single_line_break = True  # 使用单行换行
+                    h.ignore_emphasis = False
+                    h.ignore_images = False
+                    h.ignore_links = False
+                    h.ignore_tables = False
+                    
+                    # 从HTML字符串中提取实际的HTML内容
+                    start = html_content.find('<html>')
+                    end = html_content.find('</html>')
+                    if start != -1 and end != -1:
+                        html_content = html_content[start:end+7]
+                    
+                    content = h.handle(html_content).strip()
+                    if content:
+                        return content
            except Exception as e:
                self.logger.debug(f"获取HTML格式失败: {e}")

-            # 如果HTML格式获取失败，尝试获取Unicode文本
-            if not content:
-                try:
-                    content = win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
-                    self.logger.info("获取Unicode文本格式内容")
-                except Exception as e:
-                    self.logger.debug(f"获取Unicode文本失败: {e}")
+            # 如果HTML获取失败，尝试获取普通文本
+            try:
+                text_content = win32clipboard.GetClipboardData(win32con.CF_TEXT)
+                self.logger.info("获取普通文本格式内容")
+                if text_content:
+                    return text_content.decode('gbk')
+            except Exception as e:
+                self.logger.debug(f"获取普通文本失败: {e}")

-            # 如果Unicode获取失败，尝试获取普通文本
            if not content:
-                try:
-                    content = win32clipboard.GetClipboardData(win32con.CF_TEXT)
-                    content = content.decode('gbk')
-                    self.logger.info("获取普通文本格式内容")
-                except Exception as e:
-                    self.logger.debug(f"获取普通文本失败: {e}")
-
-            win32clipboard.CloseClipboard()
-            
-            if content:
-                self.logger.info("成功获取剪贴板内容")
-                return content
-            else:
                self.logger.warning("剪贴板内容为空")
                return None

+            return content
+
        except Exception as e:
            self.logger.error(f"获取剪贴板内容失败: {e}")
+            return None
+        finally:
            try:
                win32clipboard.CloseClipboard()
            except:
                pass
-            return None

    def on_click(self, x, y, button, pressed):
        if not pressed or not self.running:
@@ -103,6 +113,11 @@ class TextCaptureService:
            current_time = time.time()
            if (current_time - self.last_right_click_time) < self.double_click_threshold:
                self.logger.info("检测到双击右键")
+                # 先执行复制操作
+                self.simulate_copy()
+                time.sleep(0.1)  # 等待复制完成
+                
+                # 获取剪贴板内容
                content = self.get_clipboard_content()
                if content:
                    self.logger.info("开始保存内容")
--- a/project/llmclipboard/llmclipboard/document_processor.py
+++ b/project/llmclipboard/llmclipboard/document_processor.py
@@ -36,17 +36,33 @@ class DocumentProcessor:
    def extract_title(self, content):
        """从内容中提取或生成标题"""
        # 首先尝试从内容的第一行提取标题
-        first_line = content.strip().split('\n')[0]
-        if len(first_line) <= 50:  # 如果第一行不太长，可能是标题
+        lines = content.strip().split('\n')
+        first_line = lines[0].strip()
+        
+        # 如果第一行是markdown标题格式，直接使用
+        if first_line.startswith('#'):
+            return first_line.lstrip('#').strip()
+            
+        # 如果第一行不太长且不是很短，可能是标题
+        if 10 <= len(first_line) <= 100 and not first_line.endswith('：') and not first_line.endswith(':'):
            return first_line
        
+        # 尝试查找文章的主题句
+        for line in lines[:5]:  # 只查看前5行
+            line = line.strip()
+            if '主题' in line or '概要' in line or '总结' in line:
+                # 提取冒号或者是后面的内容
+                if ':' in line or '：' in line:
+                    return line.split(':', 1)[1].strip() if ':' in line else line.split('：', 1)[1].strip()
+                return line
+        
        # 使用结巴分词提取关键词
-        keywords = jieba.analyse.textrank(content, topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
+        keywords = jieba.analyse.textrank(content[:500], topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
        if keywords:
            return ' '.join(keywords)
        
-        # 如果无法提取关键词，使用内容的前20个字符
-        return content[:20].strip() + "..."
+        # 如果无法提取关键词，使用内容的前30个字符
+        return content[:30].strip() + "..."
    
    def extract_tags(self, content):
        """提取内容的标签"""