feat: add intelligent document processing

- Add DocumentProcessor class for smart content analysis
- Implement automatic title generation
- Add automatic tagging system
- Add intelligent categorization
- Add metadata support with YAML front matter
- Update documentation with new features
- Add jieba dependency for Chinese text processing
This commit is contained in:
zhukang 2025-01-15 21:34:42 +08:00
parent 46b7da2a77
commit b2494d439b
4 changed files with 190 additions and 7 deletions

View File

@ -35,6 +35,60 @@
- 多语言文本兼容
- 保持特殊字符格式
### 智能文档处理
- **智能标题生成**
- 自动从内容提取关键信息作为标题
- 使用自然语言处理识别主题
- 生成简洁有意义的标题
- **自动标签系统**
- 基于内容智能提取关键词
- 自动识别文档主题
- 支持多维度标签分类
- 可自定义标签规则
- **智能分类系统**
- 自动对文档进行分类
- 支持多级目录结构
- 预设常用分类模板
- 可自定义分类规则
- **文档元数据**
- 自动添加创建时间
- 记录文档分类信息
- 保存标签信息
- YAML front matter 格式
### 目录结构
保存的文档会自动按以下结构组织(无法匹配任何分类的文档保存在 `未分类/` 目录下):
```
save_location/
├── 技术/
│ └── 20250115_210000_Python项目最佳实践.md
├── 学习/
│ └── 20250115_210100_机器学习基础概念.md
├── 工作/
│ └── 20250115_210200_项目进度报告.md
├── 想法/
│ └── 20250115_210300_产品改进建议.md
└── 资源/
└── 20250115_210400_有用的开发工具集合.md
```
每个文档都包含以下格式的元数据:
```markdown
---
title: 文档标题
date: 2025-01-15 21:00:00
tags: 标签1, 标签2, 标签3
category: 分类名称
---
文档内容...
```
### 格式优化
- 自动清理冗余空行

View File

@ -11,6 +11,7 @@ import sys
import logging
from PyQt6.QtWidgets import QApplication
from .gui import MainWindow
from .document_processor import DocumentProcessor
class TextCaptureService:
def __init__(self):
@ -20,6 +21,7 @@ class TextCaptureService:
self.setup_logging()
self._mouse_listener = None
self._keyboard_listener = None
self.doc_processor = DocumentProcessor()
def load_config(self):
self.config = configparser.ConfigParser()
@ -40,12 +42,18 @@ class TextCaptureService:
def save_to_markdown(self, content):
try:
timestamp = time.strftime("%Y%m%d_%H%M%S")
file_path = os.path.join(self.save_location, f'captured_text_{timestamp}.md')
with open(file_path, 'w', encoding='utf-8') as f:
f.write(content)
self.logger.info(f"Content saved to: {file_path}")
# 使用文档处理器处理内容
result = self.doc_processor.process_document(content, self.save_location)
# 保存处理后的内容
with open(result['path'], 'w', encoding='utf-8') as f:
f.write(result['content'])
self.logger.info(f"Content saved to: {result['path']}")
self.logger.info(f"Title: {result['title']}")
self.logger.info(f"Category: {result['category']}")
self.logger.info(f"Tags: {', '.join(result['tags'])}")
except Exception as e:
self.logger.error(f"Error saving file: {e}")

View File

@ -0,0 +1,120 @@
import json
import logging
import os
import re
from collections import Counter
from datetime import datetime
from pathlib import Path

import jieba
import jieba.analyse
class DocumentProcessor:
    """Derive a title, tags, a category and a save path for captured text.

    Category rules map a category name to a list of keywords. They are read
    from the JSON file at ``config_path``; when that file does not exist the
    built-in defaults are used and written to it.
    """

    # Built-in rules used when no rules file exists (or it cannot be read).
    DEFAULT_CATEGORIES = {
        "技术": ["编程", "开发", "代码", "框架", "算法", "数据库", "API"],
        "学习": ["教程", "课程", "学习", "笔记", "知识", "总结"],
        "工作": ["会议", "项目", "计划", "报告", "任务", "进度"],
        "想法": ["想法", "创意", "思考", "灵感", "观点", "建议"],
        "资源": ["工具", "资源", "链接", "参考", "文档", "书籍"],
    }

    # Title returned for empty / whitespace-only content.
    UNTITLED = "未命名"

    # Characters replaced by "_" in file names: the original set plus
    # control characters (tabs, stray carriage returns, ...).
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    # Characters that can change the meaning of an unquoted YAML value.
    # Deliberately conservative: any occurrence triggers quoting.
    _YAML_SPECIAL = re.compile(r'''[:#\[\]{}&*!|>'"%@`\x00-\x1f]|^[-?]''')

    def __init__(self, config_path=None):
        """Load category rules.

        Args:
            config_path: Path of the JSON rules file. Defaults to
                ``categories.json`` next to this module.
        """
        self.config_path = config_path or os.path.join(
            os.path.dirname(__file__), 'categories.json'
        )
        self.load_categories()

    def load_categories(self):
        """Load the category rules into ``self.categories``.

        When the rules file is missing, the defaults are used and written to
        ``config_path`` on a best-effort basis. When the file exists but
        cannot be read or parsed, the defaults are used and the file is left
        untouched so the user can repair it.
        """
        log = logging.getLogger(__name__)
        # Copy the lists so instances never share mutable state with the class.
        defaults = {
            name: list(words) for name, words in self.DEFAULT_CATEGORIES.items()
        }

        if not os.path.exists(self.config_path):
            self.categories = defaults
            try:
                self._save_categories()
            except OSError as exc:
                # e.g. the package directory is read-only; the in-memory
                # defaults are still fully usable.
                log.warning(
                    "Could not write default categories to %s: %s",
                    self.config_path, exc,
                )
            return

        try:
            with open(self.config_path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
            if not isinstance(loaded, dict):
                raise ValueError("top-level JSON value must be an object")
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            log.warning(
                "Could not load categories from %s (%s); using defaults",
                self.config_path, exc,
            )
            loaded = defaults
        self.categories = loaded

    def _save_categories(self):
        """Write ``self.categories`` to ``config_path`` as UTF-8 JSON.

        Raises:
            OSError: If the directory or file cannot be created or written.
        """
        parent = os.path.dirname(self.config_path)
        # A bare file name has no directory part; os.makedirs('') would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(self.config_path, 'w', encoding='utf-8') as f:
            json.dump(self.categories, f, ensure_ascii=False, indent=2)

    def extract_title(self, content):
        """Return a title extracted from, or generated for, ``content``.

        The first non-blank line is used when it is at most 50 characters
        long. Otherwise up to three TextRank keywords are joined with spaces,
        and if no keywords are found the first 20 characters followed by
        ``"..."`` are used. Blank content yields ``UNTITLED``.
        """
        text = content.strip()
        if not text:
            return self.UNTITLED

        # splitlines() also handles "\r\n", so no "\r" leaks into the title.
        first_line = text.splitlines()[0].strip()
        if len(first_line) <= 50:  # short enough to plausibly be a heading
            return first_line

        keywords = jieba.analyse.textrank(
            content, topK=3, allowPOS=('ns', 'n', 'vn', 'v')
        )
        if keywords:
            return ' '.join(keywords)

        return text[:20].strip() + "..."

    def extract_tags(self, content):
        """Return the tags for ``content`` as a list without duplicates.

        Up to five jieba keywords come first, in the order jieba returns
        them, followed by every category that has at least one keyword
        occurring in ``content``, in rule order.
        """
        # dict.fromkeys de-duplicates while keeping order; a set would make
        # the tag order differ between runs.
        tags = list(dict.fromkeys(jieba.analyse.extract_tags(content, topK=5)))
        for category, keywords in self.categories.items():
            if category not in tags and any(word in content for word in keywords):
                tags.append(category)
        return tags

    def categorize(self, content, tags):
        """Return the best-scoring category name, or ``"未分类"`` if none match.

        A category gets 2 points for every tag that equals its name or one of
        its keywords, and 1 point for every keyword found in ``content``.
        """
        category_scores = Counter()

        for tag in tags:
            for category, keywords in self.categories.items():
                if tag in keywords or tag == category:
                    category_scores[category] += 2

        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in content:
                    category_scores[category] += 1

        if not category_scores:
            return "未分类"
        return category_scores.most_common(1)[0][0]

    @staticmethod
    def _yaml_scalar(value):
        """Return ``value`` for use as a YAML value, quoting it only if needed.

        Values without YAML-significant characters are returned unchanged;
        others are emitted as a JSON string.
        """
        if value != value.strip() or DocumentProcessor._YAML_SPECIAL.search(value):
            return json.dumps(value, ensure_ascii=False)
        return value

    def process_document(self, content, base_save_path):
        """Analyse ``content`` and return what is needed to save it.

        Creates the ``<base_save_path>/<category>`` directory if necessary
        but does not write the document itself.

        Args:
            content: Raw captured text.
            base_save_path: Root directory under which category folders live.

        Returns:
            dict with keys ``path`` (target file path), ``content`` (text
            prefixed with YAML front matter), ``title``, ``tags`` (list) and
            ``category``.
        """
        title = self.extract_title(content)
        tags = self.extract_tags(content)
        category = self.categorize(content, tags)

        # Take the time once so the file name and the front matter agree.
        now = datetime.now()
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        safe_title = self._UNSAFE_FILENAME_CHARS.sub('_', title)
        filename = f"{timestamp}_{safe_title[:30]}.md"

        save_dir = os.path.join(base_save_path, category)
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, filename)

        full_content = (
            "---\n"
            f"title: {self._yaml_scalar(title)}\n"
            f"date: {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
            f"tags: {', '.join(tags)}\n"
            f"category: {category}\n"
            "---\n"
            f"{content}\n"
        )

        return {
            'path': save_path,
            'content': full_content,
            'title': title,
            'tags': tags,
            'category': category,
        }

View File

@ -12,7 +12,8 @@ dependencies = [
"configparser",
"PyQt6",
"darkdetect",
"qt-material"
"qt-material",
"jieba"
]
[project.scripts]