From 2ace24695253c33eda91882928cd799e7883c62a Mon Sep 17 00:00:00 2001 From: leeguooooo Date: Sat, 18 Apr 2026 21:39:01 +0900 Subject: [PATCH] =?UTF-8?q?feat(sns):=20=E7=94=A8=20roxmltree=20=E6=8A=BD?= =?UTF-8?q?=E5=AE=8C=E6=95=B4=20media[]=20=E5=AD=97=E6=AE=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `sns-feed` / `sns-search` 输出里新增 `media` 数组,每项含 url/thumb 文本 + enc_idx/key/token/md5/size 属性 + video_md5/ video_duration,供下游做图片代理或离线渲染。 - Cargo.toml 新增 `roxmltree = "0.20"` - `parse_post_media` 取代原先 `sns_media_count_re` 的 regex 计数 - `ParsedPost.media_count: i64` 改为 `media: Vec` - `post_to_value` 同时保留 `media_count = media.len()` 以兼容下游 - 字段命名与 artifacts 仓库的 Python 参考实现对齐,便于跨实现 diff --- Cargo.lock | 7 +++ Cargo.toml | 1 + src/daemon/query.rs | 131 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 124 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a83b5ae..210158a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -719,6 +719,12 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "roxmltree" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c20b6793b5c2fa6553b250154b78d6d0db37e72700ae35fad9387a46f487c97" + [[package]] name = "rusqlite" version = "0.31.0" @@ -1315,6 +1321,7 @@ dependencies = [ "md5", "pbkdf2", "regex", + "roxmltree", "rusqlite", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index e8f85eb..b28beb4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,7 @@ md5 = "0.7" # 正则表达式 regex = "1" +roxmltree = "0.20" # IPC Windows named pipe(Unix 直接用 tokio::net::UnixListener) [target.'cfg(windows)'.dependencies] diff --git a/src/daemon/query.rs b/src/daemon/query.rs index b3994c3..f491287 100644 --- a/src/daemon/query.rs +++ b/src/daemon/query.rs @@ -1,6 +1,7 @@ use anyhow::{Context, Result}; use chrono::{Local, TimeZone, Timelike}; use regex::Regex; +use roxmltree::{Document, Node}; use rusqlite::Connection; use serde_json::{json, Value}; use std::collections::HashMap; @@ -1634,12 +1635,6 @@ pub async fn q_sns_notifications( Ok(json!({ "notifications": out, "total": total })) } -fn sns_media_count_re() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - // 只在 里数 开标签,避免匹配到嵌套的其他 字段 - RE.get_or_init(|| Regex::new(r"").unwrap()) -} - fn sns_location_re() -> &'static Regex { static RE: OnceLock = OnceLock::new(); // location 是自闭合标签,poiName 在属性里 @@ -1660,13 +1655,114 @@ fn escape_like_pattern(s: &str) -> String { .replace('_', r"\_") } +fn xml_child<'a, 'input>(node: Node<'a, 'input>, tag: &str) -> Option> { + node.children() + .find(|child| child.is_element() && child.has_tag_name(tag)) +} + +fn xml_text<'a, 'input>(node: Option>) -> Option { + node.and_then(|n| n.text()) + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string) +} + +fn xml_attr<'a, 'input>(node: Option>, attr: &str) -> Option { + node.and_then(|n| n.attribute(attr)) + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string) +} + +fn insert_media_string(out: &mut serde_json::Map, key: &str, value: Option) { + if let Some(value) = value { + out.insert(key.to_string(), Value::String(value)); + } +} + +fn insert_media_i64(out: &mut serde_json::Map, key: &str, value: Option) { + if let Some(value) = value { + out.insert(key.to_string(), Value::from(value)); + } +} + +/// 从 `SnsTimeLine.content` XML 里抽每个 `` 的完整字段。 +/// +/// 字段名与 artifacts 仓库 `wechat_sns_dump.py::_parse_media` 对齐, +/// 便于跨实现 diff。缺失字段直接省略(不输出 null),供下游代理图片 / 离线渲染。 +fn parse_post_media(xml: &str) -> Vec { + let doc = match Document::parse(xml) { + Ok(doc) => doc, + Err(_) => return Vec::new(), + }; + + let Some(media_list) = doc + .descendants() + .find(|node| node.has_tag_name("TimelineObject")) + .and_then(|node| xml_child(node, "ContentObject")) + .and_then(|node| xml_child(node, "mediaList")) + else { + return Vec::new(); + }; + + media_list + .children() + .filter(|node| node.is_element() && node.has_tag_name("media")) + .map(|media| { + let url_el = xml_child(media, "url"); + let thumb_el = xml_child(media, "thumb"); + let size_el = xml_child(media, "size"); + let mut out = serde_json::Map::new(); + + insert_media_string(&mut out, "type", xml_text(xml_child(media, "type"))); + insert_media_string(&mut out, "sub_type", xml_text(xml_child(media, "sub_type"))); + insert_media_string(&mut out, "url", xml_text(url_el)); + insert_media_string(&mut out, "thumb", xml_text(thumb_el)); + insert_media_string(&mut out, "md5", xml_attr(url_el, "md5")); + insert_media_string(&mut out, "url_key", xml_attr(url_el, "key")); + insert_media_string(&mut out, "url_token", xml_attr(url_el, "token")); + insert_media_string(&mut out, "url_enc_idx", xml_attr(url_el, "enc_idx")); + insert_media_string(&mut out, "thumb_key", xml_attr(thumb_el, "key")); + insert_media_string(&mut out, "thumb_token", xml_attr(thumb_el, "token")); + insert_media_string(&mut out, "thumb_enc_idx", xml_attr(thumb_el, "enc_idx")); + insert_media_i64( + &mut out, + "width", + xml_attr(size_el, "width").and_then(|v| v.parse::().ok()), + ); + insert_media_i64( + &mut out, + "height", + xml_attr(size_el, "height").and_then(|v| v.parse::().ok()), + ); + insert_media_i64( + &mut out, + "total_size", + xml_attr(size_el, "totalSize").and_then(|v| v.parse::().ok()), + ); + insert_media_string( + &mut out, + "video_md5", + xml_text(xml_child(media, "videomd5")), + ); + insert_media_i64( + &mut out, + "video_duration", + xml_text(xml_child(media, "videoDuration")).and_then(|v| v.parse::().ok()), + ); + + Value::Object(out) + }) + .collect() +} + /// SnsTimeLine 行解析产物。不含 display name(依赖 Names,需要出 spawn_blocking 再补)。 struct ParsedPost { tid: i64, create_time: i64, author_username: String, content: String, - media_count: i64, + media: Vec, location: String, } @@ -1682,13 +1778,13 @@ fn parse_post_xml(tid: i64, user_name_column: &str, content: &str) -> ParsedPost } else { user_name_column.to_string() }; - let media_count = sns_media_count_re().find_iter(content).count() as i64; + let media = parse_post_media(content); let location = sns_location_re() .captures(content) .and_then(|c| c.get(1)) .map(|m| m.as_str().to_string()) .unwrap_or_default(); - ParsedPost { tid, create_time, author_username, content: text, media_count, location } + ParsedPost { tid, create_time, author_username, content: text, media, location } } fn post_to_value(p: ParsedPost, names: &Names) -> Value { @@ -1704,7 +1800,8 @@ fn post_to_value(p: ParsedPost, names: &Names) -> Value { "author_username": p.author_username, "author": author, "content": p.content, - "media_count": p.media_count, + "media_count": p.media.len() as i64, + "media": p.media, "location": p.location, }) } @@ -1856,14 +1953,18 @@ mod sns_tests { fn make_post_xml(create_time: &str, desc: &str, username_tag: Option<&str>, media: usize, location: Option<&str>) -> String { let username = username_tag.map(|u| format!("{}", u)).unwrap_or_default(); - let media_tags = "...".repeat(media); - let media_list = if media > 0 { format!("{}", media_tags) } else { String::new() }; + let media_tags = "2".repeat(media); + let content_object = if media > 0 { + format!("{}", media_tags) + } else { + String::new() + }; let loc = location .map(|p| format!(r#""#, p)) .unwrap_or_default(); format!( "{}{}{}{}{}", - username, create_time, desc, media_list, loc + username, create_time, desc, content_object, loc ) } @@ -1874,7 +1975,7 @@ mod sns_tests { assert_eq!(p.author_username, "wxid_column"); assert_eq!(p.create_time, 1700000000); assert_eq!(p.content, "hello"); - assert_eq!(p.media_count, 0); + assert_eq!(p.media.len(), 0); assert_eq!(p.location, ""); } @@ -1897,7 +1998,7 @@ mod sns_tests { fn parse_counts_media_and_extracts_location() { let xml = make_post_xml("1700000002", "post", None, 3, Some("Wuxi")); let p = parse_post_xml(4, "wxid", &xml); - assert_eq!(p.media_count, 3); + assert_eq!(p.media.len(), 3); assert_eq!(p.location, "Wuxi"); }