feat: add intelligent document processing
- Add DocumentProcessor class for smart content analysis
- Implement automatic title generation
- Add automatic tagging system
- Add intelligent categorization
- Add metadata support with YAML front matter
- Update documentation with new features
- Add jieba dependency for Chinese text processing
This commit is contained in:
parent
46b7da2a77
commit
b2494d439b
@ -35,6 +35,60 @@
|
||||
- 多语言文本兼容
|
||||
- 保持特殊字符格式
|
||||
|
||||
### 智能文档处理
|
||||
|
||||
- **智能标题生成**
|
||||
- 自动从内容提取关键信息作为标题
|
||||
- 使用自然语言处理识别主题
|
||||
- 生成简洁有意义的标题
|
||||
|
||||
- **自动标签系统**
|
||||
- 基于内容智能提取关键词
|
||||
- 自动识别文档主题
|
||||
- 支持多维度标签分类
|
||||
- 可自定义标签规则
|
||||
|
||||
- **智能分类系统**
|
||||
- 自动对文档进行分类
|
||||
- 支持多级目录结构
|
||||
- 预设常用分类模板
|
||||
- 可自定义分类规则
|
||||
|
||||
- **文档元数据**
|
||||
- 自动添加创建时间
|
||||
- 记录文档分类信息
|
||||
- 保存标签信息
|
||||
- YAML front matter 格式
|
||||
|
||||
### 目录结构
|
||||
|
||||
保存的文档会自动按以下结构组织:
|
||||
```
|
||||
save_location/
|
||||
├── 技术/
|
||||
│ └── 20250115_210000_Python项目最佳实践.md
|
||||
├── 学习/
|
||||
│ └── 20250115_210100_机器学习基础概念.md
|
||||
├── 工作/
|
||||
│ └── 20250115_210200_项目进度报告.md
|
||||
├── 想法/
|
||||
│ └── 20250115_210300_产品改进建议.md
|
||||
└── 资源/
|
||||
└── 20250115_210400_有用的开发工具集合.md
|
||||
```
|
||||
|
||||
每个文档都包含以下格式的元数据:
|
||||
```markdown
|
||||
---
|
||||
title: 文档标题
|
||||
date: 2025-01-15 21:00:00
|
||||
tags: 标签1, 标签2, 标签3
|
||||
category: 分类名称
|
||||
---
|
||||
|
||||
文档内容...
|
||||
```
|
||||
|
||||
### 格式优化
|
||||
|
||||
- 自动清理冗余空行
|
||||
|
||||
@ -11,6 +11,7 @@ import sys
|
||||
import logging
|
||||
from PyQt6.QtWidgets import QApplication
|
||||
from .gui import MainWindow
|
||||
from .document_processor import DocumentProcessor
|
||||
|
||||
class TextCaptureService:
|
||||
def __init__(self):
|
||||
@ -20,6 +21,7 @@ class TextCaptureService:
|
||||
self.setup_logging()
|
||||
self._mouse_listener = None
|
||||
self._keyboard_listener = None
|
||||
self.doc_processor = DocumentProcessor()
|
||||
|
||||
def load_config(self):
|
||||
self.config = configparser.ConfigParser()
|
||||
@ -40,12 +42,18 @@ class TextCaptureService:
|
||||
|
||||
def save_to_markdown(self, content):
    """Process captured text and write it as a categorised Markdown file.

    The text is handed to ``self.doc_processor.process_document`` together
    with ``self.save_location``; the returned ``path`` and ``content`` are
    what gets written to disk.  Title, category and tags are logged.

    Only the processed document is written.  The earlier raw dump to
    ``captured_text_<timestamp>.md`` is intentionally gone: keeping it
    would leave a second, uncategorised copy of every capture next to the
    organised one.

    Args:
        content: The captured text to store.

    Errors are logged through ``self.logger`` and not re-raised, so a
    failed save never takes the capture service down.
    """
    try:
        # Let the document processor derive title, tags, category and path.
        result = self.doc_processor.process_document(content, self.save_location)

        # Persist the processed content (front matter + original text).
        with open(result['path'], 'w', encoding='utf-8') as f:
            f.write(result['content'])

        self.logger.info(f"Content saved to: {result['path']}")
        self.logger.info(f"Title: {result['title']}")
        self.logger.info(f"Category: {result['category']}")
        self.logger.info(f"Tags: {', '.join(result['tags'])}")

    except Exception as e:
        self.logger.error(f"Error saving file: {e}")
|
||||
|
||||
|
||||
120
project/llmclipboard/llmclipboard/document_processor.py
Normal file
120
project/llmclipboard/llmclipboard/document_processor.py
Normal file
@ -0,0 +1,120 @@
|
||||
import os
|
||||
import re
|
||||
import jieba
|
||||
import jieba.analyse
|
||||
from datetime import datetime
|
||||
from collections import Counter
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
class DocumentProcessor:
    """Derive a title, tags and a category for a piece of captured text.

    The category rules are a mapping ``category name -> list of keywords``.
    They are read from a JSON file (``categories.json`` next to this module
    unless ``config_path`` is given); when that file does not exist the
    built-in defaults are used and written out so they can be customised.
    """

    # Built-in rules used when no usable rules file is available.
    DEFAULT_CATEGORIES = {
        "技术": ["编程", "开发", "代码", "框架", "算法", "数据库", "API"],
        "学习": ["教程", "课程", "学习", "笔记", "知识", "总结"],
        "工作": ["会议", "项目", "计划", "报告", "任务", "进度"],
        "想法": ["想法", "创意", "思考", "灵感", "观点", "建议"],
        "资源": ["工具", "资源", "链接", "参考", "文档", "书籍"]
    }

    # Characters that must not appear in a file or directory name: the
    # reserved punctuation plus ASCII control characters (e.g. a stray "\r").
    _UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
    # Leading Markdown ATX heading marker ("# ", "## ", ...).
    _HEADING_PREFIX = re.compile(r'^#{1,6}\s+')
    # Title used when the content offers nothing to build one from.
    _UNTITLED = "未命名"
    # Category returned when no rule matches.
    _UNCATEGORIZED = "未分类"

    def __init__(self, config_path=None):
        """Create a processor and load its category rules.

        Args:
            config_path: Path of the JSON rules file.  Defaults to
                ``categories.json`` in this module's directory.
        """
        self.config_path = config_path or os.path.join(os.path.dirname(__file__), 'categories.json')
        self.load_categories()

    def load_categories(self):
        """Load the category rules into ``self.categories``.

        * Rules file exists and holds a JSON object: use it.
        * Rules file exists but is unreadable / not a JSON object: fall back
          to the defaults and leave the file untouched so it can be fixed.
        * Rules file is missing: use the defaults and try to write them out;
          failing to write (e.g. read-only install directory) is not fatal.
        """
        import logging  # local import: only needed on the fallback paths

        logger = logging.getLogger(__name__)
        # Copy so that later edits to self.categories never alter the defaults.
        defaults = {name: list(words) for name, words in self.DEFAULT_CATEGORIES.items()}

        if os.path.exists(self.config_path):
            try:
                with open(self.config_path, 'r', encoding='utf-8') as f:
                    loaded = json.load(f)
            except (OSError, ValueError) as exc:
                # ValueError covers both malformed JSON and bad encoding.
                logger.warning("Cannot read category rules %s: %s", self.config_path, exc)
                loaded = None
            self.categories = loaded if isinstance(loaded, dict) else defaults
            return

        self.categories = defaults
        try:
            self._save_categories()
        except OSError as exc:
            logger.warning("Cannot write default category rules %s: %s", self.config_path, exc)

    def _save_categories(self):
        """Write ``self.categories`` to ``self.config_path`` as UTF-8 JSON.

        Raises:
            OSError: If the directory or file cannot be created.
        """
        parent = os.path.dirname(self.config_path)
        if parent:
            # dirname is '' for a bare file name; makedirs('') would raise.
            os.makedirs(parent, exist_ok=True)
        with open(self.config_path, 'w', encoding='utf-8') as f:
            json.dump(self.categories, f, ensure_ascii=False, indent=2)

    @classmethod
    def _sanitize_name(cls, name, fallback, max_length=None):
        """Return ``name`` made safe for use as one path component.

        Unsafe characters become ``_``, the result is optionally truncated,
        and surrounding spaces and dots are removed (so ``..`` can never
        survive).  ``fallback`` is returned when nothing is left.
        """
        cleaned = cls._UNSAFE_NAME_CHARS.sub('_', name)
        if max_length is not None:
            cleaned = cleaned[:max_length]
        cleaned = cleaned.strip(' .')
        return cleaned or fallback

    def extract_title(self, content):
        """Extract or generate a title for ``content``.

        Order of preference:
        1. the first line (Markdown heading marker removed) if it is at most
           50 characters long;
        2. up to three TextRank keywords joined by spaces;
        3. the first 20 characters of the text followed by ``...``.

        Empty or whitespace-only content yields a fixed placeholder title.
        """
        text = content.strip()
        if not text:
            return self._UNTITLED

        # splitlines() also handles "\r\n", so no "\r" leaks into the title.
        first_line = self._HEADING_PREFIX.sub('', text.splitlines()[0]).strip()
        if first_line and len(first_line) <= 50:  # short enough to be a title
            return first_line

        # Fall back to keyword extraction with jieba.
        keywords = jieba.analyse.textrank(content, topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
        if keywords:
            return ' '.join(keywords)

        # Last resort: the beginning of the text.
        return text[:20].strip() + "..."

    def extract_tags(self, content):
        """Return the tags for ``content`` as a list without duplicates.

        The list starts with up to five jieba keywords in the order jieba
        returned them, followed by every category (in rule order) that has
        at least one keyword occurring in the content.
        """
        tags = list(dict.fromkeys(jieba.analyse.extract_tags(content, topK=5)))
        seen = set(tags)

        # Rule-based tags: the category name itself becomes a tag.
        for category, keywords in self.categories.items():
            if category not in seen and any(keyword in content for keyword in keywords):
                tags.append(category)
                seen.add(category)

        return tags

    def categorize(self, content, tags):
        """Pick the best-scoring category for ``content``.

        Each tag that equals a category name or one of its keywords scores
        2 for that category; each keyword found in the content scores 1.

        Returns:
            The highest-scoring category name, or ``"未分类"`` when nothing
            matched.
        """
        category_scores = Counter()

        # Score by tags.
        for tag in tags:
            for category, keywords in self.categories.items():
                if tag in keywords or tag == category:
                    category_scores[category] += 2

        # Score by keywords appearing in the content.
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in content:
                    category_scores[category] += 1

        if not category_scores:
            return self._UNCATEGORIZED
        return category_scores.most_common(1)[0][0]

    def process_document(self, content, base_save_path):
        """Analyse ``content`` and describe how it should be saved.

        The category directory below ``base_save_path`` is created if it
        does not exist; the document itself is NOT written here.

        Args:
            content: The captured text.
            base_save_path: Root directory under which documents are stored.

        Returns:
            A dict with the keys ``path`` (target file path), ``content``
            (YAML front matter followed by the original text), ``title``,
            ``tags`` (list) and ``category``.
        """
        title = self.extract_title(content)
        tags = self.extract_tags(content)
        category = self.categorize(content, tags)

        # One clock reading so the file name and the "date" field agree.
        now = datetime.now()
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        date_text = now.strftime("%Y-%m-%d %H:%M:%S")

        safe_title = self._sanitize_name(title, self._UNTITLED, max_length=30)
        filename = f"{timestamp}_{safe_title}.md"

        # The category may come from a user-edited rules file, so it is
        # sanitised too before being used as a directory name.
        safe_category = self._sanitize_name(category, self._UNCATEGORIZED)
        save_dir = os.path.join(base_save_path, safe_category)
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, filename)

        # Front matter followed by a blank line and the original text.
        full_content = (
            "---\n"
            f"title: {title}\n"
            f"date: {date_text}\n"
            f"tags: {', '.join(tags)}\n"
            f"category: {category}\n"
            "---\n"
            "\n"
            f"{content}\n"
        )

        return {
            'path': save_path,
            'content': full_content,
            'title': title,
            'tags': tags,
            'category': category
        }
|
||||
@ -12,7 +12,8 @@ dependencies = [
|
||||
"configparser",
|
||||
"PyQt6",
|
||||
"darkdetect",
|
||||
"qt-material"
|
||||
"qt-material",
|
||||
"jieba"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user