feat(biz): add biz-articles command to query public account messages

加载 biz_message_0.db 提取公众号推送(标题/url/作者/时间)。

- daemon 端通过 DbCache 按需解密 biz_message_0.db(密钥已在 all_keys.json 中)
- 新增 IPC 变体 BizArticles(limit/account/since/until 参数)
- 新增 query 处理器 q_biz_articles:
  - 通过 Name2Id 反查 gh_* username → md5 → Msg_<hash> 表映射
  - 过滤 local_type & 0xFFFFFFFF = 49(appmsg 公众号文章)
  - zstd 解压 + extract_cdata 解析 <mmreader>/<item> XML
  - 支持多文章推送(一条消息含多篇文章)
  - 输出字段:time/timestamp/recv_time/recv_time_str/account/account_username/title/url/digest/cover_url
- 新增 CLI 子命令 wx biz-articles,参数:-n / --account / --since / --until / --json
- 新增工具函数 extract_cdata(CDATA 块解析)和 parse_biz_xml_items
- 新增 8 个单测(biz_tests 模块)覆盖 CDATA 解析和多文章场景

支持工作流:
  wx biz-articles --since today --json | jq ".[].url" | xargs opencli weixin download

Verified: 返朴 ADHD 文章、Datawhale Claude Code 文章、土猛员外知识引擎文章均已正确提取。
pull/33/head
ChenyqThu 2026-05-10 20:49:19 -07:00 committed by jackwener
parent c284b4ade6
commit a6700362fc
5 changed files with 408 additions and 0 deletions

View File

@ -0,0 +1,28 @@
use anyhow::Result;
use crate::ipc::Request;
use super::history::{parse_time, parse_time_end};
use super::transport;
use super::output::{resolve, print_value};
pub fn cmd_biz_articles(
limit: usize,
account: Option<String>,
since: Option<String>,
until: Option<String>,
json: bool,
) -> Result<()> {
let since_ts = since.as_deref().map(parse_time).transpose()?;
let until_ts = until.as_deref().map(parse_time_end).transpose()?;
let req = Request::BizArticles {
limit,
account,
since: since_ts,
until: until_ts,
};
let resp = transport::send(req)?;
let data = resp.data.get("articles")
.cloned()
.unwrap_or(serde_json::Value::Array(vec![]));
print_value(&data, &resolve(json))
}

View File

@ -1,4 +1,5 @@
mod init;
pub mod biz_articles;
pub mod sessions;
pub mod history;
pub mod search;
@ -220,6 +221,24 @@ enum Commands {
#[arg(long)]
json: bool,
},
/// 查询公众号文章推送(本地缓存)
BizArticles {
/// 显示数量
#[arg(short = 'n', long, default_value = "50")]
limit: usize,
/// 限定公众号(名称模糊匹配)
#[arg(long)]
account: Option<String>,
/// 起始时间 YYYY-MM-DD
#[arg(long)]
since: Option<String>,
/// 结束时间 YYYY-MM-DD
#[arg(long)]
until: Option<String>,
/// 输出 JSON默认 YAML
#[arg(long)]
json: bool,
},
/// 朋友圈全文搜索:匹配正文关键词
SnsSearch {
/// 关键词
@ -304,6 +323,9 @@ fn dispatch(cli: Cli) -> Result<()> {
Commands::SnsSearch { keyword, limit, since, until, user, json } => {
sns_search::cmd_sns_search(keyword, limit, since, until, user, json)
}
Commands::BizArticles { limit, account, since, until, json } => {
biz_articles::cmd_biz_articles(limit, account, since, until, json)
}
Commands::Daemon { cmd } => daemon_cmd::cmd_daemon(cmd),
}
}

View File

@ -2945,6 +2945,346 @@ pub async fn q_sns_search(
Ok(json!({ "keyword": keyword, "posts": posts, "total": total }))
}
// ─── 公众号文章查询 ───────────────────────────────────────────────────────────
/// One article parsed out of a public-account (biz) push message.
#[derive(Debug)]
struct BizArticle {
/// Timestamp at which the push was received (the message's `create_time`).
recv_time: i64,
/// Username of the public account.
account_username: String,
/// Article title.
title: String,
/// Article link.
url: String,
/// Article digest (summary).
digest: String,
/// Cover image.
cover: String,
/// Article publish time (`pub_time`), in seconds.
pub_time: i64,
}
/// Parses every article entry out of the XML of a single biz-message row.
///
/// The text is scanned left to right for `<item>…</item>` spans. For each
/// span the title, URL, digest and cover are read with `extract_cdata`, and
/// `pub_time` is read with `extract_xml_text`. An item whose title or URL is
/// missing or empty is skipped (e.g. payment entries). When `pub_time` is
/// absent or is not a valid integer, `recv_time` is used in its place.
///
/// Scanning stops at the first `<item>` that has no closing `</item>`.
/// Articles are returned in document order, each tagged with `recv_time` and
/// `account_username`.
fn parse_biz_xml_items(recv_time: i64, account_username: &str, xml: &str) -> Vec<BizArticle> {
    const ITEM_OPEN: &str = "<item>";
    const ITEM_CLOSE: &str = "</item>";

    let mut articles = Vec::new();
    // Unscanned tail of the document; shrinks past each item that is consumed.
    let mut remaining = xml;

    while let Some(open_at) = remaining.find(ITEM_OPEN) {
        let from_open = &remaining[open_at..];
        let Some(close_at) = from_open.find(ITEM_CLOSE) else {
            break;
        };
        let (item_xml, tail) = from_open.split_at(close_at + ITEM_CLOSE.len());
        remaining = tail;

        let title = extract_cdata(item_xml, "title").unwrap_or_default();
        let url = extract_cdata(item_xml, "url").unwrap_or_default();
        if title.is_empty() || url.is_empty() {
            continue;
        }

        let pub_time = extract_xml_text(item_xml, "pub_time")
            .and_then(|raw| raw.parse::<i64>().ok())
            .unwrap_or(recv_time);

        articles.push(BizArticle {
            recv_time,
            account_username: account_username.to_string(),
            title,
            url,
            digest: extract_cdata(item_xml, "digest").unwrap_or_default(),
            cover: extract_cdata(item_xml, "cover").unwrap_or_default(),
            pub_time,
        });
    }

    articles
}
/// Returns the trimmed content of the first `<tag>…</tag>` element in `xml`.
///
/// Two payload shapes are recognised:
///
/// * `<tag><![CDATA[text]]></tag>` — the CDATA wrapper is removed and the
///   text is returned as-is (it is not passed through `unescape_html`). A
///   trailing `]]>` is stripped; failing that a bare `]]` is stripped; if
///   neither is present the remainder is kept unchanged.
/// * `<tag>text</tag>` — the text is passed through `unescape_html`.
///
/// The element's content is everything up to the first `</tag>` after the
/// opening tag. `None` is returned when the opening or closing tag is not
/// found, or when the content is empty after trimming.
fn extract_cdata(xml: &str, tag: &str) -> Option<String> {
    const CDATA_OPEN: &str = "<![CDATA[";
    const CDATA_CLOSE: &str = "]]>";
    const CDATA_CLOSE_SHORT: &str = "]]";

    let open_tag = format!("<{}>", tag);
    let close_tag = format!("</{}>", tag);

    let after_open = &xml[xml.find(&open_tag)? + open_tag.len()..];
    let payload = after_open[..after_open.find(&close_tag)?].trim();

    match payload.strip_prefix(CDATA_OPEN) {
        Some(body) => {
            let text = body
                .strip_suffix(CDATA_CLOSE)
                .or_else(|| body.strip_suffix(CDATA_CLOSE_SHORT))
                .unwrap_or(body)
                .trim();
            if text.is_empty() {
                None
            } else {
                Some(text.to_string())
            }
        }
        None if payload.is_empty() => None,
        None => Some(unescape_html(payload)),
    }
}
/// 查询公众号文章推送biz_message_0.db
///
/// 每条消息可能包含多篇文章(多图文推送)。返回所有文章展开就的平底列表。
pub async fn q_biz_articles(
db: &DbCache,
names: &Names,
limit: usize,
account: Option<String>,
since: Option<i64>,
until: Option<i64>,
) -> Result<Value> {
let biz_path = db.get("message/biz_message_0.db").await?
.context("无法解密 biz_message_0.db请确认 all_keys.json 包含对应密鑰")?
;
// 1. 从 Name2Id 表获取 rowid -> username 映射,再推导 md5 -> username
let biz_path2 = biz_path.clone();
let id2username: HashMap<i64, String> = tokio::task::spawn_blocking(move || {
let conn = Connection::open(&biz_path2)?;
let mut stmt = conn.prepare("SELECT rowid, user_name FROM Name2Id WHERE user_name LIKE 'gh_%'")?
;
let rows = stmt.query_map([], |row| {
Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?))
})?
.collect::<rusqlite::Result<Vec<_>>>()?;
Ok::<_, anyhow::Error>(rows.into_iter().collect())
}).await??;
// 构建 md5(username) -> username 映射
let md5_to_uname: HashMap<String, String> = id2username.values()
.map(|u| (format!("{:x}", md5::compute(u.as_bytes())), u.clone()))
.collect();
// 2. 如果 指定了 --account找到匹配的 username 列表
let account_low = account.as_deref().map(|s| s.to_lowercase());
let target_usernames: Option<Vec<String>> = account_low.as_ref().map(|low| {
id2username.values()
.filter(|u| {
let display = names.display(u);
display.to_lowercase().contains(low.as_str())
|| u.to_lowercase().contains(low.as_str())
})
.cloned()
.collect()
});
// 3. 进行数据库查询
let biz_path3 = biz_path.clone();
let since2 = since;
let until2 = until;
let target_hashes: Option<Vec<String>> = target_usernames.as_ref().map(|unames| {
unames.iter()
.map(|u| format!("{:x}", md5::compute(u.as_bytes())))
.collect()
});
let rows: Vec<(String, i64, i64, Vec<u8>, i64)> = tokio::task::spawn_blocking(move || {
let conn = Connection::open(&biz_path3)?;
// 列出所有 Msg_<hash> 表
let mut stmt = conn.prepare(
"SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'Msg_%'"
)?;
let table_names: Vec<String> = stmt.query_map([], |row| row.get(0))?
.filter_map(|r| r.ok())
.collect();
let re = regex::Regex::new(r"^Msg_[0-9a-f]{32}$").unwrap();
let mut all_rows: Vec<(String, i64, i64, Vec<u8>, i64)> = Vec::new();
for tname in &table_names {
if !re.is_match(tname) { continue; }
let hash = &tname[4..];
// account 过滤
if let Some(ref hashes) = target_hashes {
if !hashes.iter().any(|h| h == hash) { continue; }
}
let username = md5_to_uname.get(hash).cloned().unwrap_or_default();
// 构建过滤条件
let mut clauses: Vec<String> = Vec::new();
let mut params: Vec<Box<dyn rusqlite::types::ToSql>> = Vec::new();
// local_type & 0xFFFFFFFF = 49 是 appmsg公众号文章
clauses.push("(local_type & 4294967295) = 49".to_string());
if let Some(s) = since2 {
clauses.push("create_time >= ?".to_string());
params.push(Box::new(s));
}
if let Some(u) = until2 {
clauses.push("create_time <= ?".to_string());
params.push(Box::new(u));
}
let where_clause = format!("WHERE {}", clauses.join(" AND "));
let sql = format!(
"SELECT create_time, WCDB_CT_message_content, message_content \
FROM [{}] {} ORDER BY create_time DESC",
tname, where_clause
);
let params_ref: Vec<&dyn rusqlite::types::ToSql> = params.iter().map(|p| p.as_ref()).collect();
if let Ok(mut inner_stmt) = conn.prepare(&sql) {
let msg_rows: Vec<_> = inner_stmt
.query_map(params_ref.as_slice(), |row| {
Ok((
username.clone(),
row.get::<_, i64>(0)?,
row.get::<_, i64>(1).unwrap_or(0),
get_content_bytes(row, 2),
0i64,
))
})
.map(|it| it.filter_map(|r| r.ok()).collect())
.unwrap_or_default();
all_rows.extend(msg_rows);
}
}
Ok::<_, anyhow::Error>(all_rows)
}).await??;
// 4. 解压并解析 XML
let mut articles: Vec<BizArticle> = Vec::new();
for (username, recv_time, ct, content_bytes, _) in rows {
let content = decompress_message(&content_bytes, ct);
if content.is_empty() { continue; }
let items = parse_biz_xml_items(recv_time, &username, &content);
articles.extend(items);
}
// 5. 按 pub_time DESC 排序,取前 N 条
articles.sort_by_key(|a| std::cmp::Reverse(a.pub_time));
articles.truncate(limit);
let results: Vec<Value> = articles.into_iter().map(|a| {
let account_display = names.display(&a.account_username);
json!({
"time": fmt_time(a.pub_time, "%Y-%m-%d %H:%M"),
"timestamp": a.pub_time,
"recv_time": a.recv_time,
"recv_time_str": fmt_time(a.recv_time, "%Y-%m-%d %H:%M"),
"account": account_display,
"account_username": a.account_username,
"title": a.title,
"url": a.url,
"digest": a.digest,
"cover_url": a.cover,
})
}).collect();
Ok(json!({ "count": results.len(), "articles": results }))
}
// Unit tests for the biz-article XML helpers: `extract_cdata` and
// `parse_biz_xml_items`.
#[cfg(test)]
mod biz_tests {
use super::*;
// A CDATA-wrapped value is returned without the wrapper.
#[test]
fn extract_cdata_normal() {
let xml = "<title><![CDATA[TencentResearch]]></title>";
assert_eq!(extract_cdata(xml, "title"), Some("TencentResearch".into()));
}
// An empty CDATA section yields `None`, not an empty string.
#[test]
fn extract_cdata_empty() {
let xml = "<cover><![CDATA[]]></cover>";
assert_eq!(extract_cdata(xml, "cover"), None);
}
// A URL inside CDATA comes back with no wrapper residue.
#[test]
fn extract_cdata_url() {
let xml = "<url><![CDATA[http://mp.weixin.qq.com/s?__biz=abc&mid=123]]></url>";
let result = extract_cdata(xml, "url");
assert!(result.is_some());
let url = result.unwrap();
assert!(url.starts_with("http://mp.weixin.qq.com"));
assert!(!url.contains("CDATA"));
}
// Plain element text (no CDATA wrapper) is also accepted.
#[test]
fn extract_cdata_no_cdata_wrapper() {
let xml = "<pub_time>1700000000</pub_time>";
assert_eq!(extract_cdata(xml, "pub_time"), Some("1700000000".into()));
}
// A single <item> produces one article with every field populated.
#[test]
fn parse_biz_xml_items_single_article() {
let xml = r#"<msg><appmsg><mmreader><category><item>
<title><![CDATA[Test Article Title]]></title>
<url><![CDATA[http://mp.weixin.qq.com/s?test=1]]></url>
<digest><![CDATA[Test Digest]]></digest>
<cover><![CDATA[https://example.com/cover.jpg]]></cover>
<pub_time>1700000000</pub_time>
</item></category></mmreader></appmsg></msg>"#;
let items = parse_biz_xml_items(1699999999, "gh_test123", xml);
assert_eq!(items.len(), 1);
assert_eq!(items[0].title, "Test Article Title");
assert_eq!(items[0].url, "http://mp.weixin.qq.com/s?test=1");
assert_eq!(items[0].digest, "Test Digest");
assert_eq!(items[0].pub_time, 1700000000);
assert_eq!(items[0].account_username, "gh_test123");
}
// An item with a title but an empty URL is dropped.
#[test]
fn parse_biz_xml_items_skips_no_url() {
let xml = r#"<msg><mmreader><category><item>
<title><![CDATA[Has Title No URL]]></title>
<url><![CDATA[]]></url>
<pub_time>1700000001</pub_time>
</item></category></mmreader></msg>"#;
let items = parse_biz_xml_items(1700000001, "gh_test", xml);
assert_eq!(items.len(), 0);
}
// A multi-article push yields one entry per <item>, in document order.
#[test]
fn parse_biz_xml_items_multi_article() {
let xml = r#"<msg><mmreader><category>
<item>
<title><![CDATA[Article 1]]></title>
<url><![CDATA[http://mp.weixin.qq.com/s?a=1]]></url>
<pub_time>1700000010</pub_time>
</item>
<item>
<title><![CDATA[Article 2]]></title>
<url><![CDATA[http://mp.weixin.qq.com/s?a=2]]></url>
<pub_time>1700000020</pub_time>
</item>
</category></mmreader></msg>"#;
let items = parse_biz_xml_items(1700000000, "gh_multi", xml);
assert_eq!(items.len(), 2);
assert_eq!(items[0].title, "Article 1");
assert_eq!(items[1].title, "Article 2");
}
// Without a <pub_time> element the article takes the message's receive time.
#[test]
fn parse_biz_xml_items_pub_time_fallback() {
// When pub_time is missing, should fall back to recv_time
let xml = r#"<item>
<title><![CDATA[No PubTime]]></title>
<url><![CDATA[http://mp.weixin.qq.com/s?x=1]]></url>
</item>"#;
let items = parse_biz_xml_items(1700000099, "gh_fallback", xml);
assert_eq!(items.len(), 1);
assert_eq!(items[0].pub_time, 1700000099); // falls back to recv_time
}
}
#[cfg(test)]
mod group_nickname_tests {
use super::*;

View File

@ -234,5 +234,11 @@ async fn dispatch(
ReloadConfig => {
Response::ok(serde_json::json!({ "reloading": true }))
}
BizArticles { limit, account, since, until } => {
match query::q_biz_articles(db, &names_arc, limit, account, since, until).await {
Ok(v) => Response::ok(v),
Err(e) => Response::err(e.to_string()),
}
}
}
}

View File

@ -102,6 +102,18 @@ pub enum Request {
#[serde(skip_serializing_if = "Option::is_none")]
user: Option<String>,
},
/// 查询公众号文章推送biz_message_0.db
BizArticles {
#[serde(default = "default_limit_50")]
limit: usize,
/// 公众号名称过滤(模糊匹配 display nameNone = 全部)
#[serde(skip_serializing_if = "Option::is_none")]
account: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
since: Option<i64>,
#[serde(skip_serializing_if = "Option::is_none")]
until: Option<i64>,
},
/// 朋友圈全文搜索(匹配 contentDesc
SnsSearch {
keyword: String,