fix: improve content capture and title extraction

- Optimize clipboard content capture order (Unicode > HTML > Text)
- Add proper HTML content extraction
- Improve title extraction with markdown and topic sentence support
- Add automatic copy operation before content capture
- Fix content encoding issues
This commit is contained in:
zhukang 2025-01-15 21:57:46 +08:00
parent 595f22c929
commit 0a5179209d
2 changed files with 68 additions and 37 deletions

View File

@@ -46,54 +46,64 @@ class TextCaptureService:
win32clipboard.OpenClipboard()
content = None
# 尝试获取HTML格式
# 首先尝试获取Unicode文本
try:
content = win32clipboard.GetClipboardData(win32con.CF_HTML)
content = win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
self.logger.info("获取Unicode文本格式内容")
if content:
return content
except Exception as e:
self.logger.debug(f"获取Unicode文本失败: {e}")
# 如果Unicode获取失败尝试获取HTML格式
try:
html_content = win32clipboard.GetClipboardData(win32con.CF_HTML)
self.logger.info("获取HTML格式内容")
h = HTML2Text()
h.body_width = 0 # 禁用自动换行
h.single_line_break = True # 使用单行换行
h.ignore_emphasis = False
h.ignore_images = False
h.ignore_links = False
h.ignore_tables = False
content = h.handle(content).strip()
if html_content:
# 解析HTML格式的内容
h = HTML2Text()
h.body_width = 0 # 禁用自动换行
h.single_line_break = True # 使用单行换行
h.ignore_emphasis = False
h.ignore_images = False
h.ignore_links = False
h.ignore_tables = False
# 从HTML字符串中提取实际的HTML内容
start = html_content.find('<html>')
end = html_content.find('</html>')
if start != -1 and end != -1:
html_content = html_content[start:end+7]
content = h.handle(html_content).strip()
if content:
return content
except Exception as e:
self.logger.debug(f"获取HTML格式失败: {e}")
# 如果HTML格式获取失败尝试获取Unicode文本
if not content:
try:
content = win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
self.logger.info("获取Unicode文本格式内容")
except Exception as e:
self.logger.debug(f"获取Unicode文本失败: {e}")
# 如果HTML获取失败尝试获取普通文本
try:
text_content = win32clipboard.GetClipboardData(win32con.CF_TEXT)
self.logger.info("获取普通文本格式内容")
if text_content:
return text_content.decode('gbk')
except Exception as e:
self.logger.debug(f"获取普通文本失败: {e}")
# 如果Unicode获取失败尝试获取普通文本
if not content:
try:
content = win32clipboard.GetClipboardData(win32con.CF_TEXT)
content = content.decode('gbk')
self.logger.info("获取普通文本格式内容")
except Exception as e:
self.logger.debug(f"获取普通文本失败: {e}")
win32clipboard.CloseClipboard()
if content:
self.logger.info("成功获取剪贴板内容")
return content
else:
self.logger.warning("剪贴板内容为空")
return None
return content
except Exception as e:
self.logger.error(f"获取剪贴板内容失败: {e}")
return None
finally:
try:
win32clipboard.CloseClipboard()
except:
pass
return None
def on_click(self, x, y, button, pressed):
if not pressed or not self.running:
@@ -103,6 +113,11 @@ class TextCaptureService:
current_time = time.time()
if (current_time - self.last_right_click_time) < self.double_click_threshold:
self.logger.info("检测到双击右键")
# 先执行复制操作
self.simulate_copy()
time.sleep(0.1) # 等待复制完成
# 获取剪贴板内容
content = self.get_clipboard_content()
if content:
self.logger.info("开始保存内容")

View File

@@ -36,17 +36,33 @@ class DocumentProcessor:
def extract_title(self, content):
"""从内容中提取或生成标题"""
# 首先尝试从内容的第一行提取标题
first_line = content.strip().split('\n')[0]
if len(first_line) <= 50: # 如果第一行不太长,可能是标题
lines = content.strip().split('\n')
first_line = lines[0].strip()
# 如果第一行是markdown标题格式直接使用
if first_line.startswith('#'):
return first_line.lstrip('#').strip()
# 如果第一行不太长且不是很短,可能是标题
if 10 <= len(first_line) <= 100 and not first_line.endswith('：') and not first_line.endswith(':'):
return first_line
# 尝试查找文章的主题句
for line in lines[:5]: # 只查看前5行
line = line.strip()
if '主题' in line or '概要' in line or '总结' in line:
# 提取冒号或者是后面的内容
if ':' in line or '：' in line:
return line.split(':', 1)[1].strip() if ':' in line else line.split('：', 1)[1].strip()
return line
# 使用结巴分词提取关键词
keywords = jieba.analyse.textrank(content, topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
keywords = jieba.analyse.textrank(content[:500], topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
if keywords:
return ' '.join(keywords)
# 如果无法提取关键词使用内容的前20个字符
return content[:20].strip() + "..."
# 如果无法提取关键词,使用内容的前30个字符
return content[:30].strip() + "..."
def extract_tags(self, content):
"""提取内容的标签"""