fix: improve content capture and title extraction
- Optimize clipboard content capture order (Unicode > HTML > Text)
- Add proper HTML content extraction
- Improve title extraction with markdown and topic sentence support
- Add automatic copy operation before content capture
- Fix content encoding issues
This commit is contained in:
parent
595f22c929
commit
0a5179209d
@@ -46,54 +46,64 @@ class TextCaptureService:
|
|||||||
win32clipboard.OpenClipboard()
|
win32clipboard.OpenClipboard()
|
||||||
content = None
|
content = None
|
||||||
|
|
||||||
# 尝试获取HTML格式
|
# 首先尝试获取Unicode文本
|
||||||
try:
|
try:
|
||||||
content = win32clipboard.GetClipboardData(win32con.CF_HTML)
|
content = win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
|
||||||
|
self.logger.info("获取Unicode文本格式内容")
|
||||||
|
if content:
|
||||||
|
return content
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.debug(f"获取Unicode文本失败: {e}")
|
||||||
|
|
||||||
|
# 如果Unicode获取失败,尝试获取HTML格式
|
||||||
|
try:
|
||||||
|
html_content = win32clipboard.GetClipboardData(win32con.CF_HTML)
|
||||||
self.logger.info("获取HTML格式内容")
|
self.logger.info("获取HTML格式内容")
|
||||||
h = HTML2Text()
|
if html_content:
|
||||||
h.body_width = 0 # 禁用自动换行
|
# 解析HTML格式的内容
|
||||||
h.single_line_break = True # 使用单行换行
|
h = HTML2Text()
|
||||||
h.ignore_emphasis = False
|
h.body_width = 0 # 禁用自动换行
|
||||||
h.ignore_images = False
|
h.single_line_break = True # 使用单行换行
|
||||||
h.ignore_links = False
|
h.ignore_emphasis = False
|
||||||
h.ignore_tables = False
|
h.ignore_images = False
|
||||||
content = h.handle(content).strip()
|
h.ignore_links = False
|
||||||
|
h.ignore_tables = False
|
||||||
|
|
||||||
|
# 从HTML字符串中提取实际的HTML内容
|
||||||
|
start = html_content.find('<html>')
|
||||||
|
end = html_content.find('</html>')
|
||||||
|
if start != -1 and end != -1:
|
||||||
|
html_content = html_content[start:end+7]
|
||||||
|
|
||||||
|
content = h.handle(html_content).strip()
|
||||||
|
if content:
|
||||||
|
return content
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.debug(f"获取HTML格式失败: {e}")
|
self.logger.debug(f"获取HTML格式失败: {e}")
|
||||||
|
|
||||||
# 如果HTML格式获取失败,尝试获取Unicode文本
|
# 如果HTML获取失败,尝试获取普通文本
|
||||||
if not content:
|
try:
|
||||||
try:
|
text_content = win32clipboard.GetClipboardData(win32con.CF_TEXT)
|
||||||
content = win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
|
self.logger.info("获取普通文本格式内容")
|
||||||
self.logger.info("获取Unicode文本格式内容")
|
if text_content:
|
||||||
except Exception as e:
|
return text_content.decode('gbk')
|
||||||
self.logger.debug(f"获取Unicode文本失败: {e}")
|
except Exception as e:
|
||||||
|
self.logger.debug(f"获取普通文本失败: {e}")
|
||||||
|
|
||||||
# 如果Unicode获取失败,尝试获取普通文本
|
|
||||||
if not content:
|
if not content:
|
||||||
try:
|
|
||||||
content = win32clipboard.GetClipboardData(win32con.CF_TEXT)
|
|
||||||
content = content.decode('gbk')
|
|
||||||
self.logger.info("获取普通文本格式内容")
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.debug(f"获取普通文本失败: {e}")
|
|
||||||
|
|
||||||
win32clipboard.CloseClipboard()
|
|
||||||
|
|
||||||
if content:
|
|
||||||
self.logger.info("成功获取剪贴板内容")
|
|
||||||
return content
|
|
||||||
else:
|
|
||||||
self.logger.warning("剪贴板内容为空")
|
self.logger.warning("剪贴板内容为空")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
return content
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(f"获取剪贴板内容失败: {e}")
|
self.logger.error(f"获取剪贴板内容失败: {e}")
|
||||||
|
return None
|
||||||
|
finally:
|
||||||
try:
|
try:
|
||||||
win32clipboard.CloseClipboard()
|
win32clipboard.CloseClipboard()
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
return None
|
|
||||||
|
|
||||||
def on_click(self, x, y, button, pressed):
|
def on_click(self, x, y, button, pressed):
|
||||||
if not pressed or not self.running:
|
if not pressed or not self.running:
|
||||||
@@ -103,6 +113,11 @@ class TextCaptureService:
|
|||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
if (current_time - self.last_right_click_time) < self.double_click_threshold:
|
if (current_time - self.last_right_click_time) < self.double_click_threshold:
|
||||||
self.logger.info("检测到双击右键")
|
self.logger.info("检测到双击右键")
|
||||||
|
# 先执行复制操作
|
||||||
|
self.simulate_copy()
|
||||||
|
time.sleep(0.1) # 等待复制完成
|
||||||
|
|
||||||
|
# 获取剪贴板内容
|
||||||
content = self.get_clipboard_content()
|
content = self.get_clipboard_content()
|
||||||
if content:
|
if content:
|
||||||
self.logger.info("开始保存内容")
|
self.logger.info("开始保存内容")
|
||||||
|
|||||||
@@ -36,17 +36,33 @@ class DocumentProcessor:
|
|||||||
def extract_title(self, content):
|
def extract_title(self, content):
|
||||||
"""从内容中提取或生成标题"""
|
"""从内容中提取或生成标题"""
|
||||||
# 首先尝试从内容的第一行提取标题
|
# 首先尝试从内容的第一行提取标题
|
||||||
first_line = content.strip().split('\n')[0]
|
lines = content.strip().split('\n')
|
||||||
if len(first_line) <= 50: # 如果第一行不太长,可能是标题
|
first_line = lines[0].strip()
|
||||||
|
|
||||||
|
# 如果第一行是markdown标题格式,直接使用
|
||||||
|
if first_line.startswith('#'):
|
||||||
|
return first_line.lstrip('#').strip()
|
||||||
|
|
||||||
|
# 如果第一行不太长且不是很短,可能是标题
|
||||||
|
if 10 <= len(first_line) <= 100 and not first_line.endswith(':') and not first_line.endswith(':'):
|
||||||
return first_line
|
return first_line
|
||||||
|
|
||||||
|
# 尝试查找文章的主题句
|
||||||
|
for line in lines[:5]: # 只查看前5行
|
||||||
|
line = line.strip()
|
||||||
|
if '主题' in line or '概要' in line or '总结' in line:
|
||||||
|
# 提取冒号或者是后面的内容
|
||||||
|
if ':' in line or ':' in line:
|
||||||
|
return line.split(':', 1)[1].strip() if ':' in line else line.split(':', 1)[1].strip()
|
||||||
|
return line
|
||||||
|
|
||||||
# 使用结巴分词提取关键词
|
# 使用结巴分词提取关键词
|
||||||
keywords = jieba.analyse.textrank(content, topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
|
keywords = jieba.analyse.textrank(content[:500], topK=3, allowPOS=('ns', 'n', 'vn', 'v'))
|
||||||
if keywords:
|
if keywords:
|
||||||
return ' '.join(keywords)
|
return ' '.join(keywords)
|
||||||
|
|
||||||
# 如果无法提取关键词,使用内容的前20个字符
|
# 如果无法提取关键词,使用内容的前30个字符
|
||||||
return content[:20].strip() + "..."
|
return content[:30].strip() + "..."
|
||||||
|
|
||||||
def extract_tags(self, content):
|
def extract_tags(self, content):
|
||||||
"""提取内容的标签"""
|
"""提取内容的标签"""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user