feat: add intelligent document processing
- Add DocumentProcessor class for smart content analysis - Implement automatic title generation - Add automatic tagging system - Add intelligent categorization - Add metadata support with YAML front matter - Update documentation with new features - Add jieba dependency for Chinese text processing
This commit is contained in:
parent
46b7da2a77
commit
b2494d439b
@ -35,6 +35,60 @@
|
|||||||
- 多语言文本兼容
|
- 多语言文本兼容
|
||||||
- 保持特殊字符格式
|
- 保持特殊字符格式
|
||||||
|
|
||||||
|
### 智能文档处理
|
||||||
|
|
||||||
|
- **智能标题生成**
|
||||||
|
- 自动从内容提取关键信息作为标题
|
||||||
|
- 使用自然语言处理识别主题
|
||||||
|
- 生成简洁有意义的标题
|
||||||
|
|
||||||
|
- **自动标签系统**
|
||||||
|
- 基于内容智能提取关键词
|
||||||
|
- 自动识别文档主题
|
||||||
|
- 支持多维度标签分类
|
||||||
|
- 可自定义标签规则
|
||||||
|
|
||||||
|
- **智能分类系统**
|
||||||
|
- 自动对文档进行分类
|
||||||
|
- 支持多级目录结构
|
||||||
|
- 预设常用分类模板
|
||||||
|
- 可自定义分类规则
|
||||||
|
|
||||||
|
- **文档元数据**
|
||||||
|
- 自动添加创建时间
|
||||||
|
- 记录文档分类信息
|
||||||
|
- 保存标签信息
|
||||||
|
- YAML front matter 格式
|
||||||
|
|
||||||
|
### 目录结构
|
||||||
|
|
||||||
|
保存的文档会自动按以下结构组织:
|
||||||
|
```
|
||||||
|
save_location/
|
||||||
|
├── 技术/
|
||||||
|
│ └── 20250115_210000_Python项目最佳实践.md
|
||||||
|
├── 学习/
|
||||||
|
│ └── 20250115_210100_机器学习基础概念.md
|
||||||
|
├── 工作/
|
||||||
|
│ └── 20250115_210200_项目进度报告.md
|
||||||
|
├── 想法/
|
||||||
|
│ └── 20250115_210300_产品改进建议.md
|
||||||
|
└── 资源/
|
||||||
|
└── 20250115_210400_有用的开发工具集合.md
|
||||||
|
```
|
||||||
|
|
||||||
|
每个文档都包含以下格式的元数据:
|
||||||
|
```markdown
|
||||||
|
---
|
||||||
|
title: 文档标题
|
||||||
|
date: 2025-01-15 21:00:00
|
||||||
|
tags: 标签1, 标签2, 标签3
|
||||||
|
category: 分类名称
|
||||||
|
---
|
||||||
|
|
||||||
|
文档内容...
|
||||||
|
```
|
||||||
|
|
||||||
### 格式优化
|
### 格式优化
|
||||||
|
|
||||||
- 自动清理冗余空行
|
- 自动清理冗余空行
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import sys
|
|||||||
import logging
|
import logging
|
||||||
from PyQt6.QtWidgets import QApplication
|
from PyQt6.QtWidgets import QApplication
|
||||||
from .gui import MainWindow
|
from .gui import MainWindow
|
||||||
|
from .document_processor import DocumentProcessor
|
||||||
|
|
||||||
class TextCaptureService:
|
class TextCaptureService:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -20,6 +21,7 @@ class TextCaptureService:
|
|||||||
self.setup_logging()
|
self.setup_logging()
|
||||||
self._mouse_listener = None
|
self._mouse_listener = None
|
||||||
self._keyboard_listener = None
|
self._keyboard_listener = None
|
||||||
|
self.doc_processor = DocumentProcessor()
|
||||||
|
|
||||||
def load_config(self):
|
def load_config(self):
|
||||||
self.config = configparser.ConfigParser()
|
self.config = configparser.ConfigParser()
|
||||||
@ -40,12 +42,18 @@ class TextCaptureService:
|
|||||||
|
|
||||||
def save_to_markdown(self, content):
|
def save_to_markdown(self, content):
|
||||||
try:
|
try:
|
||||||
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
# 使用文档处理器处理内容
|
||||||
file_path = os.path.join(self.save_location, f'captured_text_{timestamp}.md')
|
result = self.doc_processor.process_document(content, self.save_location)
|
||||||
|
|
||||||
|
# 保存处理后的内容
|
||||||
|
with open(result['path'], 'w', encoding='utf-8') as f:
|
||||||
|
f.write(result['content'])
|
||||||
|
|
||||||
|
self.logger.info(f"Content saved to: {result['path']}")
|
||||||
|
self.logger.info(f"Title: {result['title']}")
|
||||||
|
self.logger.info(f"Category: {result['category']}")
|
||||||
|
self.logger.info(f"Tags: {', '.join(result['tags'])}")
|
||||||
|
|
||||||
with open(file_path, 'w', encoding='utf-8') as f:
|
|
||||||
f.write(content)
|
|
||||||
self.logger.info(f"Content saved to: {file_path}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"Error saving file: {e}")
|
self.logger.error(f"Error saving file: {e}")
|
||||||
|
|
||||||
|
|||||||
120
project/llmclipboard/llmclipboard/document_processor.py
Normal file
120
project/llmclipboard/llmclipboard/document_processor.py
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import jieba
|
||||||
|
import jieba.analyse
|
||||||
|
from datetime import datetime
|
||||||
|
from collections import Counter
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
class DocumentProcessor:
    """Derive a title, tags and a category for captured text.

    Category rules are a mapping of ``category name -> list of keywords``.
    They are read from a JSON file (``categories.json`` next to this module
    unless ``config_path`` is given); when that file is missing, a built-in
    default rule set is used and written to that path.
    """

    def __init__(self, config_path=None):
        """Remember where the category rules live and load them.

        Args:
            config_path: Path of the JSON rules file. Defaults to
                ``categories.json`` in this module's directory.
        """
        self.config_path = config_path or os.path.join(os.path.dirname(__file__), 'categories.json')
        self.load_categories()

    @staticmethod
    def _default_categories():
        """Return a fresh copy of the built-in category -> keywords rules."""
        return {
            "技术": ["编程", "开发", "代码", "框架", "算法", "数据库", "API"],
            "学习": ["教程", "课程", "学习", "笔记", "知识", "总结"],
            "工作": ["会议", "项目", "计划", "报告", "任务", "进度"],
            "想法": ["想法", "创意", "思考", "灵感", "观点", "建议"],
            "资源": ["工具", "资源", "链接", "参考", "文档", "书籍"]
        }

    @staticmethod
    def _yaml_scalar(value):
        """Return ``value`` as a YAML scalar that is safe after ``key: ``.

        Values that a YAML parser could misread as structure (for example
        ones containing ``": "`` or ``" #"``, or starting with an indicator
        character) are emitted as a JSON double-quoted string, which is also
        a valid YAML double-quoted scalar. Other values are returned as is,
        so ordinary titles keep their unquoted form.
        """
        needs_quoting = (
            not value
            or value != value.strip()
            or value[0] in "-?:,[]{}#&*!|>'\"%@`"
            or ": " in value
            or " #" in value
            or value.endswith(":")
        )
        return json.dumps(value, ensure_ascii=False) if needs_quoting else value

    def load_categories(self):
        """Load the category rules into ``self.categories``.

        If the rules file exists and holds a JSON object, that object is
        used. If it exists but cannot be read or parsed, the built-in
        defaults are used in memory and the file is left untouched. If it
        does not exist, the defaults are used and saved to ``config_path``.
        """
        if os.path.exists(self.config_path):
            loaded = None
            try:
                with open(self.config_path, 'r', encoding='utf-8') as f:
                    loaded = json.load(f)
            except (OSError, ValueError):
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                # A broken rules file must not stop the application from
                # starting, so fall through to the defaults.
                loaded = None
            self.categories = loaded if isinstance(loaded, dict) else self._default_categories()
            return

        self.categories = self._default_categories()
        try:
            self._save_categories()
        except OSError:
            # Deliberately best-effort: the target directory may be read-only
            # (e.g. an installed package). The defaults remain usable in memory.
            pass

    def _save_categories(self):
        """Write ``self.categories`` to ``config_path`` as UTF-8 JSON.

        Raises:
            OSError: If the directory or file cannot be created or written.
        """
        directory = os.path.dirname(self.config_path)
        # dirname is '' for a bare file name, and os.makedirs('') raises.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.config_path, 'w', encoding='utf-8') as f:
            json.dump(self.categories, f, ensure_ascii=False, indent=2)

    def extract_title(self, content):
        """Extract or generate a title for ``content``.

        The first non-blank line is used when it is at most 50 characters
        long. Otherwise up to three TextRank keywords are joined with
        spaces; if none are found, the first 20 characters of that line
        followed by ``...`` are used. Blank content yields ``"未命名"``.
        """
        stripped = content.strip()
        if not stripped:
            return "未命名"

        # splitlines() also handles "\r\n", so no stray "\r" ends up in the title.
        first_line = stripped.splitlines()[0].rstrip()
        if len(first_line) <= 50:  # a short first line is probably a title
            return first_line

        # Fall back to keyword extraction with jieba's TextRank.
        keywords = jieba.analyse.textrank(content, topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
        if keywords:
            return ' '.join(keywords)

        # No keywords found: use the beginning of the text.
        return first_line[:20].strip() + "..."

    def extract_tags(self, content):
        """Return the tags for ``content`` as a list without duplicates.

        Tags are the top five keywords from ``jieba.analyse.extract_tags``
        followed by every category that has at least one keyword occurring
        in ``content``. Order is stable: keywords first, then categories in
        rule order.
        """
        # dict.fromkeys de-duplicates while keeping insertion order; a set
        # would make the tag order vary from run to run.
        tags = dict.fromkeys(jieba.analyse.extract_tags(content, topK=5))

        # Rule-based tags: a category becomes a tag when any keyword matches.
        for category, keywords in self.categories.items():
            if any(keyword in content for keyword in keywords):
                tags[category] = None

        return list(tags)

    def categorize(self, content, tags):
        """Pick the best-scoring category for ``content``.

        Each tag that equals a category name or is one of its keywords adds
        2 points to that category; each category keyword found in
        ``content`` adds 1 point.

        Returns:
            The highest-scoring category name, or ``"未分类"`` when nothing
            scored.
        """
        category_scores = Counter()

        # Score by tags.
        for tag in tags:
            for category, keywords in self.categories.items():
                if tag in keywords or tag == category:
                    category_scores[category] += 2

        # Score by keywords present in the content.
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in content:
                    category_scores[category] += 1

        return category_scores.most_common(1)[0][0] if category_scores else "未分类"

    def process_document(self, content, base_save_path):
        """Analyse ``content`` and describe how it should be saved.

        Creates the directory ``base_save_path/<category>`` if needed but
        does not write the document itself.

        Args:
            content: The captured text.
            base_save_path: Root directory under which category folders live.

        Returns:
            A dict with keys ``path`` (target file path), ``content`` (text
            prefixed with YAML front matter), ``title``, ``tags`` and
            ``category``.
        """
        title = self.extract_title(content)
        tags = self.extract_tags(content)
        category = self.categorize(content, tags)

        # Take the time once so the file name and the "date" field agree.
        now = datetime.now()
        timestamp = now.strftime("%Y%m%d_%H%M%S")
        # Replace characters that are unsafe in file names, including control characters.
        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)
        filename = f"{timestamp}_{safe_title[:30]}.md"

        # Build the target path inside the category directory.
        save_dir = os.path.join(base_save_path, category)
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, filename)

        # Assemble the front matter followed by the original text.
        front_matter = "\n".join([
            "---",
            f"title: {self._yaml_scalar(title)}",
            f"date: {now.strftime('%Y-%m-%d %H:%M:%S')}",
            f"tags: {', '.join(tags)}",
            f"category: {category}",
            "---",
        ])
        full_content = f"{front_matter}\n\n{content}\n"

        return {
            'path': save_path,
            'content': full_content,
            'title': title,
            'tags': tags,
            'category': category
        }
|
||||||
@ -12,7 +12,8 @@ dependencies = [
|
|||||||
"configparser",
|
"configparser",
|
||||||
"PyQt6",
|
"PyQt6",
|
||||||
"darkdetect",
|
"darkdetect",
|
||||||
"qt-material"
|
"qt-material",
|
||||||
|
"jieba"
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user