Refine chat history message parsing

feat/daemon-cli
Banghao Chi 2026-03-08 20:52:33 -05:00
parent fa273b810d
commit fd67536ef7
1 changed file with 218 additions and 157 deletions

View File

@ -220,6 +220,8 @@ atexit.register(_cache.cleanup)
# Module-level state. Both contact containers start out as None; the code that
# fills them is not part of this view (presumably loaded lazily -- confirm).
_contact_names = None # {username: display_name}
_contact_full = None # [{username, nick_name, remark}]
# Memoised result of _get_self_username(), which reads and assigns this global.
_self_username = None
# Matches a DTD marker (<!DOCTYPE or <!ENTITY, case-insensitive).
# _parse_xml_root() refuses to parse any content in which this pattern is found.
_XML_UNSAFE_RE = re.compile(r'<!DOCTYPE|<!ENTITY', re.IGNORECASE)
# Longest content, measured with len(), that _parse_xml_root() will hand to the XML parser.
_XML_PARSE_MAX_LEN = 20000
def _load_contacts_from(db_path):
@ -286,6 +288,7 @@ def _split_msg_type(t):
t = int(t)
except (TypeError, ValueError):
return 0, 0
# WeChat packs the base type into the low 32 bits and app subtype into the high 32 bits.
if t > 0xFFFFFFFF:
return t & 0xFFFFFFFF, t >> 32
return t, 0
@ -352,9 +355,12 @@ def _collapse_text(text):
def _get_self_username():
global _self_username
if _self_username is not None:
if _self_username:
return _self_username
if not DB_DIR:
return ''
names = get_contact_names()
account_dir = os.path.basename(os.path.dirname(DB_DIR))
candidates = [account_dir]
@ -368,24 +374,21 @@ def _get_self_username():
_self_username = candidate
return _self_username
_self_username = ''
return _self_username
return ''
# Read every (rowid, user_name) pair from the Name2Id table through `conn` and
# build a rowid -> user_name mapping, skipping rows whose user_name is empty.
# If the query raises sqlite3.Error the function returns early with the
# still-empty mapping(s) instead of propagating the error.
#
# NOTE(review): this span appears to be a rendered diff with indentation
# stripped and old/new lines interleaved. The `username_to_id` statements and
# the tuple-returning `return id_to_username, username_to_id` lines look like
# the superseded version, and the bare `return id_to_username` lines look like
# their replacement (a later call site in this view binds the result to a
# single name). Confirm against the real file before relying on either shape.
def _load_name2id_maps(conn):
id_to_username = {}
username_to_id = {}
try:
rows = conn.execute("SELECT rowid, user_name FROM Name2Id").fetchall()
except sqlite3.Error:
return id_to_username, username_to_id
return id_to_username
for rowid, user_name in rows:
if not user_name:
continue
id_to_username[rowid] = user_name
username_to_id[user_name] = rowid
return id_to_username, username_to_id
return id_to_username
def _display_name_for_username(username, names):
@ -419,26 +422,47 @@ def _resolve_quote_sender_label(ref_user, ref_display_name, is_group, chat_usern
return _display_name_for_username(ref_user, names)
return ref_display_name or ''
self_username = _get_self_username()
if ref_user:
if ref_user == chat_username:
return chat_display_name
if self_username and ref_user == self_username:
return 'me'
return names.get(ref_user, ref_display_name or ref_user)
if ref_display_name:
if ref_display_name == chat_display_name:
return chat_display_name
self_display_name = names.get(self_username, self_username) if self_username else ''
if self_display_name and ref_display_name == self_display_name:
return 'me'
return ref_display_name
return ''
def _parse_xml_root(content):
if not content or len(content) > _XML_PARSE_MAX_LEN or _XML_UNSAFE_RE.search(content):
return None
try:
return ET.fromstring(content)
except ET.ParseError:
return None
def _parse_int(value, fallback=0):
try:
return int(value)
except (TypeError, ValueError):
return fallback
def _format_app_message_text(content, local_type, is_group, chat_username, chat_display_name, names):
if not content or '<appmsg' not in content:
return None
_, sub_type = _split_msg_type(local_type)
try:
root = ET.fromstring(content)
except ET.ParseError:
root = _parse_xml_root(content)
if root is None:
return None
appmsg = root.find('.//appmsg')
@ -447,7 +471,7 @@ def _format_app_message_text(content, local_type, is_group, chat_username, chat_
title = _collapse_text(appmsg.findtext('title') or '')
app_type_text = (appmsg.findtext('type') or '').strip()
app_type = int(app_type_text or sub_type or 0)
app_type = _parse_int(app_type_text, _parse_int(sub_type, 0))
if app_type == 57:
ref = appmsg.find('.//refermsg')
@ -481,6 +505,35 @@ def _format_app_message_text(content, local_type, is_group, chat_username, chat_
return "[链接/文件]"
def _format_voip_message_text(content):
if not content or '<voip' not in content:
return None
root = _parse_xml_root(content)
if root is None:
return "[通话]"
raw_text = _collapse_text(root.findtext('.//msg') or '')
if not raw_text:
return "[通话]"
status_map = {
'Canceled': '已取消',
'Line busy': '对方忙线',
'Already answered elsewhere': '已在其他设备接听',
'Declined on other device': '已在其他设备拒接',
'Call canceled by caller': '主叫已取消',
'Call not answered': '未接听',
"Call wasn't answered": '未接听',
}
if raw_text.startswith('Duration:'):
duration = raw_text.split(':', 1)[1].strip()
return f"[通话] 通话时长 {duration}" if duration else "[通话]"
return f"[通话] {status_map.get(raw_text, raw_text)}"
def _format_message_text(local_id, local_type, content, is_group, chat_username, chat_display_name, names):
sender_from_content, text = _parse_message_content(content, local_type, is_group)
base_type, _ = _split_msg_type(local_type)
@ -489,6 +542,8 @@ def _format_message_text(local_id, local_type, content, is_group, chat_username,
text = f"[图片] (local_id={local_id})"
elif base_type == 47:
text = "[表情]"
elif base_type == 50:
text = _format_voip_message_text(text) or "[通话]"
elif base_type == 49:
text = _format_app_message_text(
text, local_type, is_group, chat_username, chat_display_name, names
@ -500,6 +555,10 @@ def _format_message_text(local_id, local_type, content, is_group, chat_username,
return sender_from_content, text
def _is_safe_msg_table_name(table_name):
return bool(re.fullmatch(r'Msg_[0-9a-f]{32}', table_name))
# 消息 DB 的 rel_keys
# 用 message_\d+\.db$ 匹配,自然排除 message_resource.db / message_fts_*.db
MSG_DB_KEYS = sorted([
@ -513,6 +572,8 @@ def _find_msg_table_for_user(username):
"""在所有 message_N.db 中查找用户的消息表,返回 (db_path, table_name)"""
table_hash = hashlib.md5(username.encode()).hexdigest()
table_name = f"Msg_{table_hash}"
if not _is_safe_msg_table_name(table_name):
return None, None
for rel_key in MSG_DB_KEYS:
path = _cache.get(rel_key)
@ -624,7 +685,7 @@ def get_chat_history(chat_name: str, limit: int = 50) -> str:
conn = sqlite3.connect(db_path)
try:
id_to_username, _ = _load_name2id_maps(conn)
id_to_username = _load_name2id_maps(conn)
rows = conn.execute(f"""
SELECT local_id, local_type, create_time, real_sender_id, message_content,
WCDB_CT_message_content