feat: add intelligent document processing

- Add DocumentProcessor class for smart content analysis
- Implement automatic title generation
- Add automatic tagging system
- Add intelligent categorization
- Add metadata support with YAML front matter
- Update documentation with new features
- Add jieba dependency for Chinese text processing
2025-01-15 21:34:42 +08:00 · 2025-01-15 21:34:42 +08:00 · b2494d439b
commit b2494d439b
parent 46b7da2a77
4 changed files with 190 additions and 7 deletions
--- a/project/llmclipboard/README.md
+++ b/project/llmclipboard/README.md
@ -35,6 +35,60 @@
  - 多语言文本兼容
  - 保持特殊字符格式

+### 智能文档处理
+
+- **智能标题生成**
+  - 自动从内容提取关键信息作为标题
+  - 使用自然语言处理识别主题
+  - 生成简洁有意义的标题
+
+- **自动标签系统**
+  - 基于内容智能提取关键词
+  - 自动识别文档主题
+  - 支持多维度标签分类
+  - 可自定义标签规则
+
+- **智能分类系统**
+  - 自动对文档进行分类
+  - 支持多级目录结构
+  - 预设常用分类模板
+  - 可自定义分类规则
+
+- **文档元数据**
+  - 自动添加创建时间
+  - 记录文档分类信息
+  - 保存标签信息
+  - YAML front matter 格式
+
+### 目录结构
+
+保存的文档会自动按以下结构组织：
+```
+save_location/
+├── 技术/
+│   └── 20250115_210000_Python项目最佳实践.md
+├── 学习/
+│   └── 20250115_210100_机器学习基础概念.md
+├── 工作/
+│   └── 20250115_210200_项目进度报告.md
+├── 想法/
+│   └── 20250115_210300_产品改进建议.md
+└── 资源/
+    └── 20250115_210400_有用的开发工具集合.md
+```
+
+每个文档都包含以下格式的元数据：
+```markdown
+---
+title: 文档标题
+date: 2025-01-15 21:00:00
+tags: 标签1, 标签2, 标签3
+category: 分类名称
+---
+
+文档内容...
+```
+
 ### 格式优化

 - 自动清理冗余空行
--- a/project/llmclipboard/llmclipboard/app.py
+++ b/project/llmclipboard/llmclipboard/app.py
@ -11,6 +11,7 @@ import sys
 import logging
 from PyQt6.QtWidgets import QApplication
 from .gui import MainWindow
+from .document_processor import DocumentProcessor

 class TextCaptureService:
    def __init__(self):
@ -20,6 +21,7 @@ class TextCaptureService:
        self.setup_logging()
        self._mouse_listener = None
        self._keyboard_listener = None
+        self.doc_processor = DocumentProcessor()

    def load_config(self):
        self.config = configparser.ConfigParser()
@ -40,12 +42,18 @@ class TextCaptureService:

     def save_to_markdown(self, content):
+        """Process ``content`` with the document processor and write it to disk.
+
+        The processor is given ``content`` and ``self.save_location`` and
+        returns the target path, the final file text, and the title, category
+        and tags, which are logged after the file is written.  Any exception
+        is logged as an error and not re-raised.
+        """
         try:
-            timestamp = time.strftime("%Y%m%d_%H%M%S")
-            file_path = os.path.join(self.save_location, f'captured_text_{timestamp}.md')
-
-            with open(file_path, 'w', encoding='utf-8') as f:
-                f.write(content)
-            self.logger.info(f"Content saved to: {file_path}")
+            # Run the content through the document processor
+            result = self.doc_processor.process_document(content, self.save_location)
+            
+            # Write the processed content to the path chosen by the processor
+            with open(result['path'], 'w', encoding='utf-8') as f:
+                f.write(result['content'])
+            
+            self.logger.info(f"Content saved to: {result['path']}")
+            self.logger.info(f"Title: {result['title']}")
+            self.logger.info(f"Category: {result['category']}")
+            self.logger.info(f"Tags: {', '.join(result['tags'])}")
+            
         except Exception as e:
             self.logger.error(f"Error saving file: {e}")

--- a/project/llmclipboard/llmclipboard/document_processor.py
+++ b/project/llmclipboard/llmclipboard/document_processor.py
@ -0,0 +1,120 @@
+import json
+import logging
+import os
+import re
+from collections import Counter
+from datetime import datetime
+from pathlib import Path
+
+import jieba
+import jieba.analyse
+
+class DocumentProcessor:
+    """Derive a title, tags, a category and a save path for captured text.
+
+    Category rules map a category name to a list of keywords.  They are
+    loaded from a JSON file (``categories.json`` next to this module unless
+    ``config_path`` is given).  When that file does not exist the built-in
+    defaults are used and written to it on a best-effort basis.
+    """
+
+    # Built-in category rules used when no usable rules file is available.
+    DEFAULT_CATEGORIES = {
+        "技术": ["编程", "开发", "代码", "框架", "算法", "数据库", "API"],
+        "学习": ["教程", "课程", "学习", "笔记", "知识", "总结"],
+        "工作": ["会议", "项目", "计划", "报告", "任务", "进度"],
+        "想法": ["想法", "创意", "思考", "灵感", "观点", "建议"],
+        "资源": ["工具", "资源", "链接", "参考", "文档", "书籍"]
+    }
+    # Category returned when no rule matches.
+    UNCATEGORIZED = "未分类"
+    # Title used when the content is empty or whitespace only.
+    UNTITLED = "未命名"
+    # A first line up to this many characters is taken as the title.
+    MAX_TITLE_LINE = 50
+    # Number of title characters kept in the file name.
+    MAX_FILENAME_TITLE = 30
+
+    # Characters that are invalid in file names on common platforms,
+    # including control characters such as tabs and line breaks.
+    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
+    # Values that cannot be written as a plain (unquoted) YAML scalar:
+    # leading indicator characters, ": " / trailing ":", " #" comments and
+    # leading or trailing whitespace.
+    _YAML_NEEDS_QUOTES = re.compile(
+        r"^[-?:,\[\]{}#&*!|>'\"%@`]|:(?:\s|$)|\s#|^\s|\s$"
+    )
+
+    def __init__(self, config_path=None):
+        """Store the rules file location and load the category rules.
+
+        Args:
+            config_path: Path of the JSON rules file.  Defaults to
+                ``categories.json`` in this module's directory.
+        """
+        self.config_path = config_path or os.path.join(os.path.dirname(__file__), 'categories.json')
+        self.load_categories()
+
+    def load_categories(self):
+        """Load category rules into ``self.categories``.
+
+        Rules are read from ``self.config_path`` when it exists.  If the file
+        is missing, the defaults are used and saved.  If the file exists but
+        cannot be read, is not valid JSON or is not a JSON object, the
+        defaults are used in memory only so the existing file is never
+        overwritten.
+        """
+        log = logging.getLogger(__name__)
+        defaults = {name: list(words) for name, words in self.DEFAULT_CATEGORIES.items()}
+
+        if not os.path.exists(self.config_path):
+            self.categories = defaults
+            self._save_categories()
+            return
+
+        try:
+            with open(self.config_path, 'r', encoding='utf-8') as f:
+                loaded = json.load(f)
+        except (OSError, ValueError) as exc:
+            # ValueError covers both JSON syntax errors and bad encodings.
+            log.warning("Cannot read category rules from %s (%s); using defaults",
+                        self.config_path, exc)
+            self.categories = defaults
+            return
+
+        if isinstance(loaded, dict):
+            self.categories = loaded
+        else:
+            log.warning("Category rules in %s are not a JSON object; using defaults",
+                        self.config_path)
+            self.categories = defaults
+
+    def _save_categories(self):
+        """Write ``self.categories`` to ``self.config_path`` as JSON.
+
+        Saving is best effort: an ``OSError`` (for example a read-only
+        install directory) is logged as a warning and the in-memory rules
+        stay in use.
+        """
+        directory = os.path.dirname(self.config_path)
+        try:
+            # A bare file name has no directory part; os.makedirs('') raises.
+            if directory:
+                os.makedirs(directory, exist_ok=True)
+            with open(self.config_path, 'w', encoding='utf-8') as f:
+                json.dump(self.categories, f, ensure_ascii=False, indent=2)
+        except OSError as exc:
+            logging.getLogger(__name__).warning(
+                "Cannot save category rules to %s (%s)", self.config_path, exc)
+
+    def extract_title(self, content):
+        """Return a single-line title for ``content``.
+
+        The first non-blank line is used when it is at most
+        ``MAX_TITLE_LINE`` characters long.  Otherwise up to three TextRank
+        keywords are joined with spaces, and if none are found the first 20
+        characters followed by ``"..."`` are used.  Empty or whitespace-only
+        content yields ``UNTITLED``.
+        """
+        text = content.strip()
+        if not text:
+            return self.UNTITLED
+
+        # splitlines() also handles "\r\n", so no stray "\r" ends up in the
+        # title (and from there in the file name and front matter).
+        first_line = text.splitlines()[0].strip()
+        if len(first_line) <= self.MAX_TITLE_LINE:
+            return first_line
+
+        keywords = jieba.analyse.textrank(content, topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
+        if keywords:
+            return ' '.join(keywords)
+
+        # The first line is longer than the slice, so it contains no line break.
+        return text[:20].strip() + "..."
+
+    def extract_tags(self, content):
+        """Return the tags for ``content`` in a stable order.
+
+        The tags are up to five keywords from ``jieba.analyse.extract_tags``
+        followed by the name of every category that has at least one keyword
+        occurring in ``content``.  Duplicates are dropped, keeping the first
+        occurrence.
+        """
+        tags = list(dict.fromkeys(jieba.analyse.extract_tags(content, topK=5)))
+
+        for category, keywords in self.categories.items():
+            if category not in tags and any(keyword in content for keyword in keywords):
+                tags.append(category)
+
+        return tags
+
+    def categorize(self, content, tags):
+        """Return the best-scoring category name for ``content``.
+
+        Each tag that is a keyword of a category, or equals the category
+        name, adds 2 to that category.  Each category keyword that occurs in
+        ``content`` adds 1.  The highest score wins; ``UNCATEGORIZED`` is
+        returned when nothing scores.
+        """
+        category_scores = Counter()
+
+        for tag in tags:
+            for category, keywords in self.categories.items():
+                if tag in keywords or tag == category:
+                    category_scores[category] += 2
+
+        for category, keywords in self.categories.items():
+            for keyword in keywords:
+                if keyword in content:
+                    category_scores[category] += 1
+
+        if not category_scores:
+            return self.UNCATEGORIZED
+        return category_scores.most_common(1)[0][0]
+
+    def _safe_filename_part(self, title):
+        """Return ``title`` reduced to a safe, length-limited file name part."""
+        cleaned = self._UNSAFE_FILENAME_CHARS.sub('_', title)[:self.MAX_FILENAME_TITLE]
+        return cleaned or self.UNTITLED
+
+    def _yaml_scalar(self, value):
+        """Return ``value`` as a YAML scalar, double-quoted only when needed."""
+        if not value or self._YAML_NEEDS_QUOTES.search(value):
+            # A JSON string is also a valid YAML double-quoted scalar.
+            return json.dumps(value, ensure_ascii=False)
+        return value
+
+    def process_document(self, content, base_save_path):
+        """Analyse ``content`` and describe where and how to save it.
+
+        Creates the category directory under ``base_save_path`` but does not
+        write the document itself.
+
+        Args:
+            content: The captured text.
+            base_save_path: Root directory under which category directories
+                are created.
+
+        Returns:
+            A dict with the keys ``path`` (target file path), ``content``
+            (text with YAML front matter prepended), ``title``, ``tags``
+            (list of str) and ``category``.
+        """
+        title = self.extract_title(content)
+        tags = self.extract_tags(content)
+        category = self.categorize(content, tags)
+
+        # Take the time once so the file name and the front matter agree.
+        now = datetime.now()
+        filename = f"{now.strftime('%Y%m%d_%H%M%S')}_{self._safe_filename_part(title)}.md"
+
+        save_dir = os.path.join(base_save_path, category)
+        os.makedirs(save_dir, exist_ok=True)
+        save_path = os.path.join(save_dir, filename)
+
+        full_content = (
+            "---\n"
+            f"title: {self._yaml_scalar(title)}\n"
+            f"date: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
+            f"tags: {', '.join(tags)}\n"
+            f"category: {category}\n"
+            "---\n"
+            "\n"
+            f"{content}\n"
+        )
+        return {
+            'path': save_path,
+            'content': full_content,
+            'title': title,
+            'tags': tags,
+            'category': category
+        }
--- a/project/llmclipboard/pyproject.toml
+++ b/project/llmclipboard/pyproject.toml
@ -12,7 +12,8 @@ dependencies = [
    "configparser",
    "PyQt6",
    "darkdetect",
-    "qt-material"
+    "qt-material",
+    "jieba"
 ]

 [project.scripts]