fix: improve content capture and title extraction
- Optimize clipboard content capture order (Unicode > HTML > Text)
- Add proper HTML content extraction
- Improve title extraction with Markdown and topic-sentence support
- Add an automatic copy operation before content capture
- Fix content encoding issues
This commit is contained in:
parent
595f22c929
commit
0a5179209d
@ -46,54 +46,64 @@ class TextCaptureService:
|
||||
win32clipboard.OpenClipboard()
|
||||
content = None
|
||||
|
||||
# 尝试获取HTML格式
|
||||
# 首先尝试获取Unicode文本
|
||||
try:
|
||||
content = win32clipboard.GetClipboardData(win32con.CF_HTML)
|
||||
content = win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
|
||||
self.logger.info("获取Unicode文本格式内容")
|
||||
if content:
|
||||
return content
|
||||
except Exception as e:
|
||||
self.logger.debug(f"获取Unicode文本失败: {e}")
|
||||
|
||||
# 如果Unicode获取失败,尝试获取HTML格式
|
||||
try:
|
||||
html_content = win32clipboard.GetClipboardData(win32con.CF_HTML)
|
||||
self.logger.info("获取HTML格式内容")
|
||||
h = HTML2Text()
|
||||
h.body_width = 0 # 禁用自动换行
|
||||
h.single_line_break = True # 使用单行换行
|
||||
h.ignore_emphasis = False
|
||||
h.ignore_images = False
|
||||
h.ignore_links = False
|
||||
h.ignore_tables = False
|
||||
content = h.handle(content).strip()
|
||||
if html_content:
|
||||
# 解析HTML格式的内容
|
||||
h = HTML2Text()
|
||||
h.body_width = 0 # 禁用自动换行
|
||||
h.single_line_break = True # 使用单行换行
|
||||
h.ignore_emphasis = False
|
||||
h.ignore_images = False
|
||||
h.ignore_links = False
|
||||
h.ignore_tables = False
|
||||
|
||||
# 从HTML字符串中提取实际的HTML内容
|
||||
start = html_content.find('<html>')
|
||||
end = html_content.find('</html>')
|
||||
if start != -1 and end != -1:
|
||||
html_content = html_content[start:end+7]
|
||||
|
||||
content = h.handle(html_content).strip()
|
||||
if content:
|
||||
return content
|
||||
except Exception as e:
|
||||
self.logger.debug(f"获取HTML格式失败: {e}")
|
||||
|
||||
# 如果HTML格式获取失败,尝试获取Unicode文本
|
||||
if not content:
|
||||
try:
|
||||
content = win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
|
||||
self.logger.info("获取Unicode文本格式内容")
|
||||
except Exception as e:
|
||||
self.logger.debug(f"获取Unicode文本失败: {e}")
|
||||
# 如果HTML获取失败,尝试获取普通文本
|
||||
try:
|
||||
text_content = win32clipboard.GetClipboardData(win32con.CF_TEXT)
|
||||
self.logger.info("获取普通文本格式内容")
|
||||
if text_content:
|
||||
return text_content.decode('gbk')
|
||||
except Exception as e:
|
||||
self.logger.debug(f"获取普通文本失败: {e}")
|
||||
|
||||
# 如果Unicode获取失败,尝试获取普通文本
|
||||
if not content:
|
||||
try:
|
||||
content = win32clipboard.GetClipboardData(win32con.CF_TEXT)
|
||||
content = content.decode('gbk')
|
||||
self.logger.info("获取普通文本格式内容")
|
||||
except Exception as e:
|
||||
self.logger.debug(f"获取普通文本失败: {e}")
|
||||
|
||||
win32clipboard.CloseClipboard()
|
||||
|
||||
if content:
|
||||
self.logger.info("成功获取剪贴板内容")
|
||||
return content
|
||||
else:
|
||||
self.logger.warning("剪贴板内容为空")
|
||||
return None
|
||||
|
||||
return content
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"获取剪贴板内容失败: {e}")
|
||||
return None
|
||||
finally:
|
||||
try:
|
||||
win32clipboard.CloseClipboard()
|
||||
except:
|
||||
pass
|
||||
return None
|
||||
|
||||
def on_click(self, x, y, button, pressed):
|
||||
if not pressed or not self.running:
|
||||
@ -103,6 +113,11 @@ class TextCaptureService:
|
||||
current_time = time.time()
|
||||
if (current_time - self.last_right_click_time) < self.double_click_threshold:
|
||||
self.logger.info("检测到双击右键")
|
||||
# 先执行复制操作
|
||||
self.simulate_copy()
|
||||
time.sleep(0.1) # 等待复制完成
|
||||
|
||||
# 获取剪贴板内容
|
||||
content = self.get_clipboard_content()
|
||||
if content:
|
||||
self.logger.info("开始保存内容")
|
||||
|
||||
@ -36,17 +36,33 @@ class DocumentProcessor:
|
||||
def extract_title(self, content):
|
||||
"""从内容中提取或生成标题"""
|
||||
# 首先尝试从内容的第一行提取标题
|
||||
first_line = content.strip().split('\n')[0]
|
||||
if len(first_line) <= 50: # 如果第一行不太长,可能是标题
|
||||
lines = content.strip().split('\n')
|
||||
first_line = lines[0].strip()
|
||||
|
||||
# 如果第一行是markdown标题格式,直接使用
|
||||
if first_line.startswith('#'):
|
||||
return first_line.lstrip('#').strip()
|
||||
|
||||
# 如果第一行不太长且不是很短,可能是标题
|
||||
if 10 <= len(first_line) <= 100 and not first_line.endswith(':') and not first_line.endswith(':'):
|
||||
return first_line
|
||||
|
||||
# 尝试查找文章的主题句
|
||||
for line in lines[:5]: # 只查看前5行
|
||||
line = line.strip()
|
||||
if '主题' in line or '概要' in line or '总结' in line:
|
||||
# 提取冒号或者是后面的内容
|
||||
if ':' in line or ':' in line:
|
||||
return line.split(':', 1)[1].strip() if ':' in line else line.split(':', 1)[1].strip()
|
||||
return line
|
||||
|
||||
# 使用结巴分词提取关键词
|
||||
keywords = jieba.analyse.textrank(content, topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
|
||||
keywords = jieba.analyse.textrank(content[:500], topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
|
||||
if keywords:
|
||||
return ' '.join(keywords)
|
||||
|
||||
# 如果无法提取关键词,使用内容的前20个字符
|
||||
return content[:20].strip() + "..."
|
||||
# 如果无法提取关键词,使用内容的前30个字符
|
||||
return content[:30].strip() + "..."
|
||||
|
||||
def extract_tags(self, content):
|
||||
"""提取内容的标签"""
|
||||
|
||||
Loading…
Reference in New Issue
Block a user