From 2b5d872f0be7922131f72a80208d5adca216955e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E7=AB=8Blee?= Date: Sun, 19 Apr 2026 02:22:55 +0800 Subject: [PATCH] =?UTF-8?q?feat(sns):=20sns-feed=20/=20sns-search=20?= =?UTF-8?q?=E8=BE=93=E5=87=BA=E5=AE=8C=E6=95=B4=20media[]=20=E5=AD=97?= =?UTF-8?q?=E6=AE=B5=20(#15)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在 #14 之上增量:把 sns-feed / sns-search 的 media_count 升级成完整 media[] 数组(含 url/thumb/key/token/md5/enc_idx/size + video_md5/duration),下游可直接做图片代理或离线渲染。 - 用 roxmltree(pure Rust,无 C 依赖)替代 regex 抽属性 - 字段命名对齐 artifacts 仓库 Python _parse_media,跨实现 diff 友好 - 14 个 sns 单测:作者新增 6 个 fixture(单图/三图/视频/纯文字/malformed/缺 totalSize)+ 已有 8 个保持 - 与之前 PR #14 的 --user XML fallback 修复 / SNS_MAX_LIMIT / SNS_MAX_SCAN / escape_like_pattern 完全兼容 Author: leeguooooo Co-fixed-by: wx-cli-coder (rebase + 冲突解决 + 测试模块合并 + media_count 语义文档补充) --- Cargo.lock | 7 + Cargo.toml | 1 + README.md | 2 +- SKILL.md | 2 +- src/daemon/query.rs | 341 ++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 336 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a83b5ae..210158a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -719,6 +719,12 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "roxmltree" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c20b6793b5c2fa6553b250154b78d6d0db37e72700ae35fad9387a46f487c97" + [[package]] name = "rusqlite" version = "0.31.0" @@ -1315,6 +1321,7 @@ dependencies = [ "md5", "pbkdf2", "regex", + "roxmltree", "rusqlite", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index e8f85eb..b28beb4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,7 @@ md5 = "0.7" # 正则表达式 regex = "1" +roxmltree = "0.20" # IPC Windows named pipe(Unix 直接用 tokio::net::UnixListener) 
[target.'cfg(windows)'.dependencies] diff --git a/README.md b/README.md index 50b7394..e0f06da 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ wx sns-search "婚礼" --user "李四" --since 2023-01-01 ``` - **sns-notifications** 返回互动通知:`type`(`like`/`comment`)、`from_nickname`、`content`(评论正文)、`feed_preview` + `feed_author`(对应原帖) -- **sns-feed** / **sns-search** 返回朋友圈帖子:`author`、`content`(正文)、`media_count`、`location`、`timestamp` +- **sns-feed** / **sns-search** 返回朋友圈帖子:`author`、`content`(正文)、`media`、`media_count`、`location`、`timestamp`;`media` 字段含每张图的 url/thumb/key/token/md5/enc_idx/size,供下游做图片代理或离线渲染。`media_count = media.len()`,按 DOM 解析的合法 `<media>` 子节点计数(malformed XML 返回 0) 朋友圈数据只覆盖你本地刷到过的帖子(微信 app 按需下载)。 diff --git a/SKILL.md b/SKILL.md index 9a2f605..4ce28c3 100644 --- a/SKILL.md +++ b/SKILL.md @@ -170,7 +170,7 @@ wx sns-search "婚礼" --user "李四" --since 2023-01-01 -n 50 **字段区分**: - `sns-notifications` 返回"通知"条目:`type`(`like`/`comment`)、`from_nickname`、`content`(评论正文,点赞为空)、`feed_preview` + `feed_author`(对应的原帖) -- `sns-feed` / `sns-search` 返回"帖子"条目:`author`、`content`(朋友圈正文)、`media_count`(图片/视频数)、`location`、`timestamp` +- `sns-feed` / `sns-search` 返回"帖子"条目:`author`、`content`(朋友圈正文)、`media`、`media_count`(图片/视频数)、`location`、`timestamp`;`media` 字段含每张图的 url/thumb/key/token/md5/enc_idx/size,供下游做图片代理或离线渲染。`media_count = media.len()`,按 DOM 解析的合法 `<media>` 子节点计数(malformed XML 返回 0) > 只保存你本地刷到过的朋友圈(微信 app 按需下载)。没刷到过的帖子不在本地,任何命令都拿不到。 diff --git a/src/daemon/query.rs b/src/daemon/query.rs index b3994c3..fa2124c 100644 --- a/src/daemon/query.rs +++ b/src/daemon/query.rs @@ -1,6 +1,7 @@ use anyhow::{Context, Result}; use chrono::{Local, TimeZone, Timelike}; use regex::Regex; +use roxmltree::{Document, Node}; use rusqlite::Connection; use serde_json::{json, Value}; use std::collections::HashMap; @@ -1634,12 +1635,6 @@ pub async fn q_sns_notifications( Ok(json!({ "notifications": out, "total": total })) } -fn sns_media_count_re() -> &'static Regex { - static RE: OnceLock = OnceLock::new(); - 
// 只在 里数 开标签,避免匹配到嵌套的其他 字段 - RE.get_or_init(|| Regex::new(r"").unwrap()) -} - fn sns_location_re() -> &'static Regex { static RE: OnceLock = OnceLock::new(); // location 是自闭合标签,poiName 在属性里 @@ -1660,13 +1655,114 @@ fn escape_like_pattern(s: &str) -> String { .replace('_', r"\_") } +fn xml_child<'a, 'input>(node: Node<'a, 'input>, tag: &str) -> Option> { + node.children() + .find(|child| child.is_element() && child.has_tag_name(tag)) +} + +fn xml_text<'a, 'input>(node: Option>) -> Option { + node.and_then(|n| n.text()) + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string) +} + +fn xml_attr<'a, 'input>(node: Option>, attr: &str) -> Option { + node.and_then(|n| n.attribute(attr)) + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(str::to_string) +} + +fn insert_media_string(out: &mut serde_json::Map, key: &str, value: Option) { + if let Some(value) = value { + out.insert(key.to_string(), Value::String(value)); + } +} + +fn insert_media_i64(out: &mut serde_json::Map, key: &str, value: Option) { + if let Some(value) = value { + out.insert(key.to_string(), Value::from(value)); + } +} + +/// 从 `SnsTimeLine.content` XML 里抽每个 `` 的完整字段。 +/// +/// 字段名与 artifacts 仓库 `wechat_sns_dump.py::_parse_media` 对齐, +/// 便于跨实现 diff。缺失字段直接省略(不输出 null),供下游代理图片 / 离线渲染。 +fn parse_post_media(xml: &str) -> Vec { + let doc = match Document::parse(xml) { + Ok(doc) => doc, + Err(_) => return Vec::new(), + }; + + let Some(media_list) = doc + .descendants() + .find(|node| node.has_tag_name("TimelineObject")) + .and_then(|node| xml_child(node, "ContentObject")) + .and_then(|node| xml_child(node, "mediaList")) + else { + return Vec::new(); + }; + + media_list + .children() + .filter(|node| node.is_element() && node.has_tag_name("media")) + .map(|media| { + let url_el = xml_child(media, "url"); + let thumb_el = xml_child(media, "thumb"); + let size_el = xml_child(media, "size"); + let mut out = serde_json::Map::new(); + + insert_media_string(&mut out, "type", 
xml_text(xml_child(media, "type"))); + insert_media_string(&mut out, "sub_type", xml_text(xml_child(media, "sub_type"))); + insert_media_string(&mut out, "url", xml_text(url_el)); + insert_media_string(&mut out, "thumb", xml_text(thumb_el)); + insert_media_string(&mut out, "md5", xml_attr(url_el, "md5")); + insert_media_string(&mut out, "url_key", xml_attr(url_el, "key")); + insert_media_string(&mut out, "url_token", xml_attr(url_el, "token")); + insert_media_string(&mut out, "url_enc_idx", xml_attr(url_el, "enc_idx")); + insert_media_string(&mut out, "thumb_key", xml_attr(thumb_el, "key")); + insert_media_string(&mut out, "thumb_token", xml_attr(thumb_el, "token")); + insert_media_string(&mut out, "thumb_enc_idx", xml_attr(thumb_el, "enc_idx")); + insert_media_i64( + &mut out, + "width", + xml_attr(size_el, "width").and_then(|v| v.parse::().ok()), + ); + insert_media_i64( + &mut out, + "height", + xml_attr(size_el, "height").and_then(|v| v.parse::().ok()), + ); + insert_media_i64( + &mut out, + "total_size", + xml_attr(size_el, "totalSize").and_then(|v| v.parse::().ok()), + ); + insert_media_string( + &mut out, + "video_md5", + xml_text(xml_child(media, "videomd5")), + ); + insert_media_i64( + &mut out, + "video_duration", + xml_text(xml_child(media, "videoDuration")).and_then(|v| v.parse::().ok()), + ); + + Value::Object(out) + }) + .collect() +} + /// SnsTimeLine 行解析产物。不含 display name(依赖 Names,需要出 spawn_blocking 再补)。 struct ParsedPost { tid: i64, create_time: i64, author_username: String, content: String, - media_count: i64, + media: Vec, location: String, } @@ -1682,13 +1778,13 @@ fn parse_post_xml(tid: i64, user_name_column: &str, content: &str) -> ParsedPost } else { user_name_column.to_string() }; - let media_count = sns_media_count_re().find_iter(content).count() as i64; + let media = parse_post_media(content); let location = sns_location_re() .captures(content) .and_then(|c| c.get(1)) .map(|m| m.as_str().to_string()) .unwrap_or_default(); - ParsedPost { 
tid, create_time, author_username, content: text, media_count, location } + ParsedPost { tid, create_time, author_username, content: text, media, location } } fn post_to_value(p: ParsedPost, names: &Names) -> Value { @@ -1704,7 +1800,8 @@ fn post_to_value(p: ParsedPost, names: &Names) -> Value { "author_username": p.author_username, "author": author, "content": p.content, - "media_count": p.media_count, + "media_count": p.media.len() as i64, + "media": p.media, "location": p.location, }) } @@ -1856,14 +1953,18 @@ mod sns_tests { fn make_post_xml(create_time: &str, desc: &str, username_tag: Option<&str>, media: usize, location: Option<&str>) -> String { let username = username_tag.map(|u| format!("{}", u)).unwrap_or_default(); - let media_tags = "...".repeat(media); - let media_list = if media > 0 { format!("{}", media_tags) } else { String::new() }; + let media_tags = "2".repeat(media); + let content_object = if media > 0 { + format!("{}", media_tags) + } else { + String::new() + }; let loc = location .map(|p| format!(r#""#, p)) .unwrap_or_default(); format!( "{}{}{}{}{}", - username, create_time, desc, media_list, loc + username, create_time, desc, content_object, loc ) } @@ -1874,7 +1975,7 @@ mod sns_tests { assert_eq!(p.author_username, "wxid_column"); assert_eq!(p.create_time, 1700000000); assert_eq!(p.content, "hello"); - assert_eq!(p.media_count, 0); + assert_eq!(p.media.len(), 0); assert_eq!(p.location, ""); } @@ -1897,7 +1998,7 @@ mod sns_tests { fn parse_counts_media_and_extracts_location() { let xml = make_post_xml("1700000002", "post", None, 3, Some("Wuxi")); let p = parse_post_xml(4, "wxid", &xml); - assert_eq!(p.media_count, 3); + assert_eq!(p.media.len(), 3); assert_eq!(p.location, "Wuxi"); } @@ -1929,4 +2030,214 @@ mod sns_tests { assert_eq!(escape_like_pattern("中文关键词"), "中文关键词"); assert_eq!(escape_like_pattern(""), ""); } + + fn media_object(value: &Value) -> &serde_json::Map { + value.as_object().expect("media entry should be an object") + } + + 
#[test] + fn single_image_media() { + let xml = r#" + + + + + + 2 + https://szmmsns.qpic.cn/<redacted>/image.jpg + https://szmmsns.qpic.cn/<redacted>/thumb.jpg + + + + + + + "#; + + let media = parse_post_media(xml); + assert_eq!(media.len(), 1); + + let item = media_object(&media[0]); + assert_eq!(item.get("type").and_then(Value::as_str), Some("2")); + assert_eq!( + item.get("url").and_then(Value::as_str), + Some("https://szmmsns.qpic.cn//image.jpg") + ); + assert_eq!( + item.get("thumb").and_then(Value::as_str), + Some("https://szmmsns.qpic.cn//thumb.jpg") + ); + assert_eq!(item.get("url_enc_idx").and_then(Value::as_str), Some("1")); + assert_eq!( + item.get("url_key").and_then(Value::as_str), + Some("placeholder-key") + ); + assert_eq!( + item.get("url_token").and_then(Value::as_str), + Some("placeholder-token") + ); + assert_eq!( + item.get("md5").and_then(Value::as_str), + Some("placeholder-md5") + ); + assert_eq!(item.get("width").and_then(Value::as_i64), Some(1440)); + assert_eq!(item.get("height").and_then(Value::as_i64), Some(1080)); + assert_eq!(item.get("total_size").and_then(Value::as_i64), Some(123456)); + } + + #[test] + fn three_images_media() { + let xml = r#" + + + + + + 2 + 10 + https://szmmsns.qpic.cn/<redacted>/image-1.jpg + https://szmmsns.qpic.cn/<redacted>/thumb-1.jpg + + + + 2 + 11 + https://szmmsns.qpic.cn/<redacted>/image-2.jpg + https://szmmsns.qpic.cn/<redacted>/thumb-2.jpg + + + + 6 + https://szmmsns.qpic.cn/<redacted>/image-3.jpg + https://szmmsns.qpic.cn/<redacted>/thumb-3.jpg + + + + + + + "#; + + let media = parse_post_media(xml); + assert_eq!(media.len(), 3); + + let first = media_object(&media[0]); + assert_eq!(first.get("sub_type").and_then(Value::as_str), Some("10")); + assert_eq!( + first.get("url_key").and_then(Value::as_str), + Some("placeholder-key-1") + ); + + let second = media_object(&media[1]); + assert_eq!(second.get("sub_type").and_then(Value::as_str), Some("11")); + 
assert_eq!(second.get("width").and_then(Value::as_i64), Some(300)); + + let third = media_object(&media[2]); + assert_eq!(third.get("type").and_then(Value::as_str), Some("6")); + assert_eq!( + third.get("thumb_key").and_then(Value::as_str), + Some("placeholder-thumb-key-3") + ); + } + + #[test] + fn video_media() { + let xml = r#" + + + + + + 15 + https://szmmsns.qpic.cn/<redacted>/video.mp4 + https://szmmsns.qpic.cn/<redacted>/video-thumb.jpg + + <placeholder-video-md5> + 37 + + + + + + "#; + + let media = parse_post_media(xml); + assert_eq!(media.len(), 1); + + let item = media_object(&media[0]); + assert_eq!( + item.get("video_md5").and_then(Value::as_str), + Some("") + ); + assert_eq!(item.get("video_duration").and_then(Value::as_i64), Some(37)); + assert!(!item.contains_key("total_size")); + } + + #[test] + fn text_only_post() { + let without_media_list = r#" + + + + 1 + + + + "#; + let empty_media_list = r#" + + + + + + + + "#; + + assert!(parse_post_media(without_media_list).is_empty()); + assert!(parse_post_media(empty_media_list).is_empty()); + } + + #[test] + fn malformed_xml() { + let xml = r#" + + + + + + 2 + + + + + "#; + + assert!(parse_post_media(xml).is_empty()); + } + + #[test] + fn size_without_total_size_omits_total_size_key() { + let xml = r#" + + + + + + 2 + + + + + + + "#; + + let media = parse_post_media(xml); + assert_eq!(media.len(), 1); + let item = media_object(&media[0]); + assert_eq!(item.get("width").and_then(Value::as_i64), Some(640)); + assert_eq!(item.get("height").and_then(Value::as_i64), Some(480)); + assert!(!item.contains_key("total_size")); + } }