feat: add intelligent document processing

- Add DocumentProcessor class for smart content analysis
- Implement automatic title generation
- Add automatic tagging system
- Add intelligent categorization
- Add metadata support with YAML front matter
- Update documentation with new features
- Add jieba dependency for Chinese text processing
2025-01-15 21:34:42 +08:00 · 2025-01-15 21:34:42 +08:00 · b2494d439b
commit b2494d439b
parent 46b7da2a77
4 changed files with 190 additions and 7 deletions
--- a/project/llmclipboard/README.md
+++ b/project/llmclipboard/README.md
@ -35,6 +35,60 @@
  - 多语言文本兼容
  - 保持特殊字符格式
 ### 智能文档处理
 - **智能标题生成**
  - 自动从内容提取关键信息作为标题
  - 使用自然语言处理识别主题
  - 生成简洁有意义的标题
 - **自动标签系统**
  - 基于内容智能提取关键词
  - 自动识别文档主题
  - 支持多维度标签分类
  - 可自定义标签规则
 - **智能分类系统**
  - 自动对文档进行分类
  - 支持多级目录结构
  - 预设常用分类模板
  - 可自定义分类规则
 - **文档元数据**
  - 自动添加创建时间
  - 记录文档分类信息
  - 保存标签信息
  - YAML front matter 格式
 ### 目录结构
 保存的文档会自动按以下结构组织：
 ```
 save_location/
 ├── 技术/
 │   └── 20250115_210000_Python项目最佳实践.md
 ├── 学习/
 │   └── 20250115_210100_机器学习基础概念.md
 ├── 工作/
 │   └── 20250115_210200_项目进度报告.md
 ├── 想法/
 │   └── 20250115_210300_产品改进建议.md
 └── 资源/
    └── 20250115_210400_有用的开发工具集合.md
 ```
 每个文档都包含以下格式的元数据：
 ```markdown
 ---
 title: 文档标题
 date: 2025-01-15 21:00:00
 tags: 标签1, 标签2, 标签3
 category: 分类名称
 ---
 文档内容...
 ```
 ### 格式优化
 - 自动清理冗余空行
--- a/project/llmclipboard/llmclipboard/app.py
+++ b/project/llmclipboard/llmclipboard/app.py
@ -11,6 +11,7 @@ import sys
 import logging
 from PyQt6.QtWidgets import QApplication
 from .gui import MainWindow
 from .document_processor import DocumentProcessor
 class TextCaptureService:
    def __init__(self):
@ -20,6 +21,7 @@ class TextCaptureService:
        self.setup_logging()
        self._mouse_listener = None
        self._keyboard_listener = None
        self.doc_processor = DocumentProcessor()
    def load_config(self):
        self.config = configparser.ConfigParser()
@ -40,12 +42,18 @@ class TextCaptureService:
    def save_to_markdown(self, content):
        try:
-            timestamp = time.strftime("%Y%m%d_%H%M%S")
+            # 使用文档处理器处理内容
-            file_path = os.path.join(self.save_location, f'captured_text_{timestamp}.md')
+            result = self.doc_processor.process_document(content, self.save_location)
+            # 保存处理后的内容
+            with open(result['path'], 'w', encoding='utf-8') as f:
+                f.write(result['content'])
+            self.logger.info(f"Content saved to: {result['path']}")
+            self.logger.info(f"Title: {result['title']}")
+            self.logger.info(f"Category: {result['category']}")
+            self.logger.info(f"Tags: {', '.join(result['tags'])}")
-            with open(file_path, 'w', encoding='utf-8') as f:
-                f.write(content)
-            self.logger.info(f"Content saved to: {file_path}")
        except Exception as e:
            self.logger.error(f"Error saving file: {e}")
--- a/project/llmclipboard/llmclipboard/document_processor.py
+++ b/project/llmclipboard/llmclipboard/document_processor.py
@ -0,0 +1,120 @@
 import os
 import re
 import jieba
 import jieba.analyse
 from datetime import datetime
 from collections import Counter
 import json
 from pathlib import Path
 class DocumentProcessor:
    """Derive a title, tags, a category and a save path for captured text.

    Category rules are a mapping of ``category name -> list of keywords``.
    They are read from a JSON file (``config_path``); when that file does
    not exist, a built-in default rule set is used and written to it.
    """

    # Characters replaced with "_" in file names: the usual reserved
    # punctuation plus ASCII control characters (newline, tab, CR, ...).
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
    # Leading Markdown ATX heading marker, e.g. "## " in "## Title".
    _HEADING_PREFIX = re.compile(r'^#{1,6}\s+')
    # Patterns that make a value unsafe as an unquoted YAML plain scalar:
    # a leading indicator character, ": " / trailing ":", or " #".
    _YAML_NEEDS_QUOTES = re.compile(r'^[-?:,\[\]{}#&*!|>\'"%@`]|:\s|:$|\s#')
    # Placeholder used when no title can be derived from the content.
    _UNTITLED = "未命名"

    def __init__(self, config_path=None):
        """Create a processor and load its category rules.

        Args:
            config_path: Path of the JSON file holding the category rules.
                Defaults to ``categories.json`` next to this module.
        """
        self.config_path = config_path or os.path.join(os.path.dirname(__file__), 'categories.json')
        self.load_categories()

    def load_categories(self):
        """Load the category rules into ``self.categories``.

        If ``self.config_path`` exists its JSON content is used as-is.
        Otherwise the built-in defaults are installed and saved to
        ``self.config_path`` through ``_save_categories``.
        """
        if os.path.exists(self.config_path):
            with open(self.config_path, 'r', encoding='utf-8') as f:
                self.categories = json.load(f)
        else:
            self.categories = {
                "技术": ["编程", "开发", "代码", "框架", "算法", "数据库", "API"],
                "学习": ["教程", "课程", "学习", "笔记", "知识", "总结"],
                "工作": ["会议", "项目", "计划", "报告", "任务", "进度"],
                "想法": ["想法", "创意", "思考", "灵感", "观点", "建议"],
                "资源": ["工具", "资源", "链接", "参考", "文档", "书籍"]
            }
            self._save_categories()

    def _save_categories(self):
        """Write ``self.categories`` to ``self.config_path`` as JSON."""
        config_dir = os.path.dirname(self.config_path)
        # A bare file name has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError, so only create the directory when there is one.
        if config_dir:
            os.makedirs(config_dir, exist_ok=True)
        with open(self.config_path, 'w', encoding='utf-8') as f:
            json.dump(self.categories, f, ensure_ascii=False, indent=2)

    def extract_title(self, content):
        """Return a short single-line title for ``content``.

        The first non-blank line is used when it is at most 50 characters
        long (a leading Markdown heading marker such as ``## `` is dropped).
        Longer first lines fall back to the top TextRank keywords, then to
        the first 20 characters of that line followed by ``...``. Blank
        content yields a fixed placeholder title.
        """
        # splitlines() also handles "\r\n", so no stray "\r" ends up in the
        # title (and from there in the file name).
        lines = content.strip().splitlines()
        first_line = lines[0].strip() if lines else ''
        first_line = self._HEADING_PREFIX.sub('', first_line).strip()
        if not first_line:
            return self._UNTITLED
        if len(first_line) <= 50:  # short enough to be a title on its own
            return first_line

        keywords = jieba.analyse.textrank(content, topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
        if keywords:
            return ' '.join(keywords)

        # Slice the first line rather than the raw content so the result
        # can never contain leading blank lines or embedded newlines.
        return first_line[:20].strip() + "..."

    def extract_tags(self, content):
        """Return the tags for ``content`` as a list without duplicates.

        Tags are the top 5 keywords reported by ``jieba.analyse.extract_tags``
        in the order jieba returns them, followed by every category that has
        at least one of its keywords occurring in ``content``.
        """
        # dict.fromkeys de-duplicates while keeping order; a plain set would
        # make the tag order vary between runs.
        tags = list(dict.fromkeys(jieba.analyse.extract_tags(content, topK=5)))

        for category, keywords in self.categories.items():
            if category not in tags and any(keyword in content for keyword in keywords):
                tags.append(category)

        return tags

    def categorize(self, content, tags):
        """Return the best matching category name for ``content``.

        Each tag that equals a category name or one of its keywords scores
        2 points for that category; each category keyword found in
        ``content`` scores 1 point. The highest score wins; ``"未分类"`` is
        returned when nothing matches.
        """
        category_scores = Counter()

        # Score by tags.
        for tag in tags:
            for category, keywords in self.categories.items():
                if tag in keywords or tag == category:
                    category_scores[category] += 2

        # Score by keywords that occur in the content.
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in content:
                    category_scores[category] += 1

        return category_scores.most_common(1)[0][0] if category_scores else "未分类"

    @classmethod
    def _yaml_scalar(cls, value):
        """Return ``value`` as a single-line YAML scalar, quoted if needed."""
        text = ' '.join(value.split())  # collapse newlines / whitespace runs
        if not text or cls._YAML_NEEDS_QUOTES.search(text):
            # A JSON string literal is also a valid YAML double-quoted scalar.
            return json.dumps(text, ensure_ascii=False)
        return text

    def _build_filename(self, title, now):
        """Return ``<timestamp>_<sanitized title>.md`` for ``title``."""
        safe_title = self._UNSAFE_FILENAME_CHARS.sub('_', title)[:30].strip()
        if not safe_title:
            safe_title = self._UNTITLED
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        return f"{timestamp}_{safe_title}.md"

    def _render_document(self, title, tags, category, content, now):
        """Return ``content`` prefixed with its YAML front matter block."""
        date_text = now.strftime("%Y-%m-%d %H:%M:%S")
        front_matter = "\n".join([
            "---",
            f"title: {self._yaml_scalar(title)}",
            f"date: {date_text}",
            f"tags: {', '.join(tags)}",
            f"category: {category}",
            "---",
        ])
        return f"{front_matter}\n{content}\n"

    def process_document(self, content, base_save_path):
        """Analyse ``content`` and return everything needed to save it.

        Creates the directory ``<base_save_path>/<category>`` if necessary
        but does not write the document itself.

        Args:
            content: The captured text.
            base_save_path: Root directory under which documents are stored.

        Returns:
            A dict with the keys ``path`` (target file path), ``content``
            (text with YAML front matter), ``title``, ``tags`` (list) and
            ``category``.
        """
        title = self.extract_title(content)
        tags = self.extract_tags(content)
        category = self.categorize(content, tags)

        # Take the time once so the file name and the "date" field agree.
        now = datetime.now()
        filename = self._build_filename(title, now)

        save_dir = os.path.join(base_save_path, category)
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, filename)

        full_content = self._render_document(title, tags, category, content, now)

        return {
            'path': save_path,
            'content': full_content,
            'title': title,
            'tags': tags,
            'category': category
        }
--- a/project/llmclipboard/pyproject.toml
+++ b/project/llmclipboard/pyproject.toml
@ -12,7 +12,8 @@ dependencies = [
    "configparser",
    "PyQt6",
    "darkdetect",
-    "qt-material"
+    "qt-material",
+    "jieba"
 ]
 [project.scripts]