From b2494d439bd5348e04878daf2e5909b4ae0d4291 Mon Sep 17 00:00:00 2001
From: zhukang <274546966@qq.com>
Date: Wed, 15 Jan 2025 21:34:42 +0800
Subject: [PATCH] feat: add intelligent document processing

- Add DocumentProcessor class for smart content analysis
- Implement automatic title generation
- Add automatic tagging system
- Add intelligent categorization
- Add metadata support with YAML front matter
- Update documentation with new features
- Add jieba dependency for Chinese text processing
---
 project/llmclipboard/README.md                |  54 ++++++++
 project/llmclipboard/llmclipboard/app.py      |  20 ++-
 .../llmclipboard/document_processor.py        | 120 ++++++++++++++++++
 project/llmclipboard/pyproject.toml           |   3 +-
 4 files changed, 190 insertions(+), 7 deletions(-)
 create mode 100644 project/llmclipboard/llmclipboard/document_processor.py

diff --git a/project/llmclipboard/README.md b/project/llmclipboard/README.md
index 6a7dd6f..8d07e33 100644
--- a/project/llmclipboard/README.md
+++ b/project/llmclipboard/README.md
@@ -35,6 +35,60 @@
 - 多语言文本兼容
 - 保持特殊字符格式
 
+### 智能文档处理
+
+- **智能标题生成**
+  - 自动从内容提取关键信息作为标题
+  - 使用自然语言处理识别主题
+  - 生成简洁有意义的标题
+
+- **自动标签系统**
+  - 基于内容智能提取关键词
+  - 自动识别文档主题
+  - 支持多维度标签分类
+  - 可自定义标签规则
+
+- **智能分类系统**
+  - 自动对文档进行分类
+  - 支持多级目录结构
+  - 预设常用分类模板
+  - 可自定义分类规则
+
+- **文档元数据**
+  - 自动添加创建时间
+  - 记录文档分类信息
+  - 保存标签信息
+  - YAML front matter 格式
+
+### 目录结构
+
+保存的文档会自动按以下结构组织:
+```
+save_location/
+├── 技术/
+│   └── 20250115_210000_Python项目最佳实践.md
+├── 学习/
+│   └── 20250115_210100_机器学习基础概念.md
+├── 工作/
+│   └── 20250115_210200_项目进度报告.md
+├── 想法/
+│   └── 20250115_210300_产品改进建议.md
+└── 资源/
+    └── 20250115_210400_有用的开发工具集合.md
+```
+
+每个文档都包含以下格式的元数据:
+```markdown
+---
+title: 文档标题
+date: 2025-01-15 21:00:00
+tags: 标签1, 标签2, 标签3
+category: 分类名称
+---
+
+文档内容...
+```
+
 ### 格式优化
 
 - 自动清理冗余空行
diff --git a/project/llmclipboard/llmclipboard/app.py b/project/llmclipboard/llmclipboard/app.py
index 46e0fd9..eea8b3d 100644
--- a/project/llmclipboard/llmclipboard/app.py
+++ b/project/llmclipboard/llmclipboard/app.py
@@ -11,6 +11,7 @@ import sys
 import logging
 from PyQt6.QtWidgets import QApplication
 from .gui import MainWindow
+from .document_processor import DocumentProcessor
 
 class TextCaptureService:
     def __init__(self):
@@ -20,6 +21,7 @@ class TextCaptureService:
         self.setup_logging()
         self._mouse_listener = None
         self._keyboard_listener = None
+        self.doc_processor = DocumentProcessor()
 
     def load_config(self):
         self.config = configparser.ConfigParser()
@@ -40,12 +42,18 @@ class TextCaptureService:
 
     def save_to_markdown(self, content):
         try:
-            timestamp = time.strftime("%Y%m%d_%H%M%S")
-            file_path = os.path.join(self.save_location, f'captured_text_{timestamp}.md')
-
-            with open(file_path, 'w', encoding='utf-8') as f:
-                f.write(content)
-            self.logger.info(f"Content saved to: {file_path}")
+            # 使用文档处理器处理内容
+            result = self.doc_processor.process_document(content, self.save_location)
+
+            # 保存处理后的内容
+            with open(result['path'], 'w', encoding='utf-8') as f:
+                f.write(result['content'])
+
+            self.logger.info(f"Content saved to: {result['path']}")
+            self.logger.info(f"Title: {result['title']}")
+            self.logger.info(f"Category: {result['category']}")
+            self.logger.info(f"Tags: {', '.join(result['tags'])}")
+
         except Exception as e:
             self.logger.error(f"Error saving file: {e}")
 
diff --git a/project/llmclipboard/llmclipboard/document_processor.py b/project/llmclipboard/llmclipboard/document_processor.py
new file mode 100644
index 0000000..4049873
--- /dev/null
+++ b/project/llmclipboard/llmclipboard/document_processor.py
@@ -0,0 +1,120 @@
+import os
+import re
+import jieba
+import jieba.analyse
+from datetime import datetime
+from collections import Counter
+import json
+from pathlib import Path
+
+class DocumentProcessor:
+    def __init__(self, config_path=None):
+        self.config_path = config_path or os.path.join(os.path.dirname(__file__), 'categories.json')
+        self.load_categories()
+
+    def load_categories(self):
+        """加载预定义的分类规则"""
+        if os.path.exists(self.config_path):
+            with open(self.config_path, 'r', encoding='utf-8') as f:
+                self.categories = json.load(f)
+        else:
+            self.categories = {
+                "技术": ["编程", "开发", "代码", "框架", "算法", "数据库", "API"],
+                "学习": ["教程", "课程", "学习", "笔记", "知识", "总结"],
+                "工作": ["会议", "项目", "计划", "报告", "任务", "进度"],
+                "想法": ["想法", "创意", "思考", "灵感", "观点", "建议"],
+                "资源": ["工具", "资源", "链接", "参考", "文档", "书籍"]
+            }
+            self._save_categories()
+
+    def _save_categories(self):
+        """保存分类规则"""
+        os.makedirs(os.path.dirname(self.config_path), exist_ok=True)
+        with open(self.config_path, 'w', encoding='utf-8') as f:
+            json.dump(self.categories, f, ensure_ascii=False, indent=2)
+
+    def extract_title(self, content):
+        """从内容中提取或生成标题"""
+        # 首先尝试从内容的第一行提取标题
+        first_line = content.strip().split('\n')[0]
+        if len(first_line) <= 50:  # 如果第一行不太长,可能是标题
+            return first_line
+
+        # 使用结巴分词提取关键词
+        keywords = jieba.analyse.textrank(content, topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
+        if keywords:
+            return ' '.join(keywords)
+
+        # 如果无法提取关键词,使用内容的前20个字符
+        return content[:20].strip() + "..."
+
+    def extract_tags(self, content):
+        """提取内容的标签"""
+        # 使用结巴分词提取关键词作为标签
+        tags = set(jieba.analyse.extract_tags(content, topK=5))
+
+        # 添加基于规则的标签
+        for category, keywords in self.categories.items():
+            if any(keyword in content for keyword in keywords):
+                tags.add(category)
+
+        return list(tags)
+
+    def categorize(self, content, tags):
+        """对内容进行分类"""
+        # 基于标签和内容关键词进行分类
+        category_scores = Counter()
+
+        # 根据标签评分
+        for tag in tags:
+            for category, keywords in self.categories.items():
+                if tag in keywords or tag == category:
+                    category_scores[category] += 2
+
+        # 根据内容关键词评分
+        for category, keywords in self.categories.items():
+            for keyword in keywords:
+                if keyword in content:
+                    category_scores[category] += 1
+
+        # 返回得分最高的分类,如果没有匹配则返回"未分类"
+        return category_scores.most_common(1)[0][0] if category_scores else "未分类"
+
+    def process_document(self, content, base_save_path):
+        """处理文档内容并返回保存信息"""
+        # 提取标题
+        title = self.extract_title(content)
+
+        # 提取标签
+        tags = self.extract_tags(content)
+
+        # 确定分类
+        category = self.categorize(content, tags)
+
+        # 生成文件名
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)  # 移除不安全的文件名字符
+        filename = f"{timestamp}_{safe_title[:30]}.md"
+
+        # 构建保存路径
+        save_dir = os.path.join(base_save_path, category)
+        os.makedirs(save_dir, exist_ok=True)
+        save_path = os.path.join(save_dir, filename)
+
+        # 构建完整的markdown内容
+        full_content = f"""---
+title: {title}
+date: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
+tags: {', '.join(tags)}
+category: {category}
+---
+
+{content}
+"""
+        return {
+            'path': save_path,
+            'content': full_content,
+            'title': title,
+            'tags': tags,
+            'category': category
+        }
diff --git a/project/llmclipboard/pyproject.toml b/project/llmclipboard/pyproject.toml
index 5f0d6aa..b071cc0 100644
--- a/project/llmclipboard/pyproject.toml
+++ b/project/llmclipboard/pyproject.toml
@@ -12,7 +12,8 @@ dependencies = [
     "configparser",
     "PyQt6",
     "darkdetect",
-    "qt-material"
+    "qt-material",
+    "jieba"
 ]
 
 [project.scripts]