diff --git a/PR_DRAFT.md b/PR_DRAFT.md new file mode 100644 index 0000000..418d7d7 --- /dev/null +++ b/PR_DRAFT.md @@ -0,0 +1,115 @@ +# feat(biz): add `wx biz-articles` command to query public account messages + +## Summary + +Adds a new `biz-articles` subcommand that queries locally cached WeChat public account (公众号) article pushes from `biz_message_0.db`. + +This enables a downstream workflow for downloading full article content: + +```bash +wx biz-articles --since today --json | jq '.[].url' | xargs opencli weixin download +``` + +## Background + +- WeChat stores public account (公众号) message pushes in a separate database: `message/biz_message_0.db` (SQLCipher 4 encrypted) +- This DB was not exposed by any existing wx-cli command +- The encryption key is already scanned and stored in `~/.wx-cli/all_keys.json` by `wx init` +- Each public account has its own `Msg_{md5(username)}` table, following the same convention as `message_0.db` +- Message content is zstd-compressed XML whose per-article item structures carry the article metadata + +## New CLI Interface + +```bash +# Last 50 articles (default) +wx biz-articles + +# More articles +wx biz-articles -n 200 + +# Filter by public account name (fuzzy match on display name) +wx biz-articles --account "返朴" +wx biz-articles --account "Datawhale" + +# Time filter (article publish time, YYYY-MM-DD) +wx biz-articles --since 2026-05-10 +wx biz-articles --since 2026-05-01 --until 2026-05-10 + +# JSON output (for downstream piping) +wx biz-articles --json +wx biz-articles --since 2026-05-10 --json | jq '.[].url' +``` + +## Output Fields + +Each article item includes: + +| Field | Description | +|-------|-------------| +| `time` | Article publish time (formatted) | +| `timestamp` | Article publish timestamp (seconds) | +| `recv_time` | Message receive time (when WeChat pushed it) | +| `recv_time_str` | Message receive time (formatted) | +| `account` | Public account display name | +| `account_username` | Public account username (gh_*) | 
+| `title` | Article title | +| `url` | Article URL (mp.weixin.qq.com link) | +| `digest` | Article summary/excerpt | +| `cover_url` | Cover image URL | + +## Implementation Notes + +- `biz_message_0.db` is loaded on-demand via the existing `DbCache` mechanism (no startup cost unless `biz-articles` is called) +- The key for `message/biz_message_0.db` is already in `all_keys.json`, so no changes to `wx init` are needed +- Multi-article pushes (图文消息) are expanded: each article item in the push XML becomes a separate output row +- Items without URL or title (e.g., payment notifications from service accounts) are filtered out +- New `extract_cdata` helper function strips CDATA wrappers from XML content +- Results sorted by `pub_time` DESC (article publish time, not message receive time) + +## Changes + +- `src/ipc.rs`: Add `BizArticles` IPC request variant +- `src/cli/biz_articles.rs`: New CLI command handler (follows sns_feed pattern) +- `src/cli/mod.rs`: Register `BizArticles` subcommand in clap + dispatch +- `src/daemon/query.rs`: Add `q_biz_articles` query + `parse_biz_xml_items` + `extract_cdata` helpers + 8 unit tests +- `src/daemon/server.rs`: Add dispatch case for `BizArticles` + +## Test Results + +``` +test result: ok. 49 passed; 0 failed; 0 ignored +``` + +New tests (8): +- `biz_tests::extract_cdata_normal` +- `biz_tests::extract_cdata_empty` +- `biz_tests::extract_cdata_url` +- `biz_tests::extract_cdata_no_cdata_wrapper` +- `biz_tests::parse_biz_xml_items_single_article` +- `biz_tests::parse_biz_xml_items_skips_no_url` +- `biz_tests::parse_biz_xml_items_multi_article` +- `biz_tests::parse_biz_xml_items_pub_time_fallback` + +## Verified Output (real WeChat install with ~30 public accounts, 2026-05-10) + +```yaml +- account: 返朴 + title: 细胞生物学家俞立:从后进生到科学家,一个ADHD孩子的逆袭 + url: http://mp.weixin.qq.com/s?__biz=Mzg2MTUyODU2NA==&mid=2247642795&... 
+ +- account: 土猛的员外 + title: AI时代,企业的业务底座正在从数据库变成知识引擎 + url: http://mp.weixin.qq.com/s?__biz=MzIyOTA5NTM1OA==&mid=2247485270&... +``` + +## Branch + +`ChenyqThu/wx-cli` → `feat/biz-articles` + +--- + +*Waiting for Lucien's review before opening PR.* diff --git a/src/cli/biz_articles.rs b/src/cli/biz_articles.rs index 3952437..0c74874 100644 --- a/src/cli/biz_articles.rs +++ b/src/cli/biz_articles.rs @@ -9,6 +9,7 @@ pub fn cmd_biz_articles( account: Option, since: Option, until: Option, + unread: bool, json: bool, ) -> Result<()> { let since_ts = since.as_deref().map(parse_time).transpose()?; @@ -19,6 +20,7 @@ pub fn cmd_biz_articles( account, since: since_ts, until: until_ts, + unread, }; let resp = transport::send(req)?; let data = resp.data.get("articles") diff --git a/src/cli/mod.rs b/src/cli/mod.rs index b2e3097..b9e71fd 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -235,6 +235,9 @@ enum Commands { /// 结束时间 YYYY-MM-DD #[arg(long)] until: Option, + /// 只看有未读的公众号,每个公众号取最新 1 篇 + #[arg(long)] + unread: bool, /// 输出 JSON(默认 YAML) #[arg(long)] json: bool, @@ -323,8 +326,8 @@ fn dispatch(cli: Cli) -> Result<()> { Commands::SnsSearch { keyword, limit, since, until, user, json } => { sns_search::cmd_sns_search(keyword, limit, since, until, user, json) } - Commands::BizArticles { limit, account, since, until, json } => { - biz_articles::cmd_biz_articles(limit, account, since, until, json) + Commands::BizArticles { limit, account, since, until, unread, json } => { + biz_articles::cmd_biz_articles(limit, account, since, until, unread, json) } Commands::Daemon { cmd } => daemon_cmd::cmd_daemon(cmd), } diff --git a/src/daemon/query.rs b/src/daemon/query.rs index cb210b0..9805258 100644 --- a/src/daemon/query.rs +++ b/src/daemon/query.rs @@ -3046,11 +3046,41 @@ pub async fn q_biz_articles( account: Option, since: Option, until: Option, + unread: bool, ) -> Result { let biz_path = db.get("message/biz_message_0.db").await? 
.context("无法解密 biz_message_0.db,请确认 all_keys.json 包含对应密鑰")? ; + // 开启 --unread:从 session.db 拿“公众号 + unread_count>0”的 username 子集, + // 作为合集过滤(与 --account 取交集),后续结果按 account_username 去重取顶 1 篇。 + let unread_usernames: Option> = if unread { + let session_path = db.get("session/session.db").await? + .context("无法解密 session.db")?; + let session_path2 = session_path.clone(); + let unread_rows: Vec = tokio::task::spawn_blocking(move || { + let conn = Connection::open(&session_path2)?; + let mut stmt = conn.prepare( + "SELECT username FROM SessionTable WHERE unread_count > 0" + )?; + let rows: Vec = stmt.query_map([], |row| row.get::<_, String>(0))? + .filter_map(|r| r.ok()) + .collect(); + Ok::<_, anyhow::Error>(rows) + }).await??; + // 仅保留公众号类型的未读会话 + let set: std::collections::HashSet = unread_rows.into_iter() + .filter(|u| chat_type_of(u, names) == "official_account") + .collect(); + if set.is_empty() { + // 没有未读公众号 → 直接空返回,避免打 biz 表扫描 + return Ok(json!({ "count": 0, "articles": [] })); + } + Some(set) + } else { + None + }; + // 1. 从 Name2Id 表获取 rowid -> username 映射,再推导 md5 -> username let biz_path2 = biz_path.clone(); let id2username: HashMap = tokio::task::spawn_blocking(move || { @@ -3071,7 +3101,7 @@ pub async fn q_biz_articles( // 2. 
如果 指定了 --account,找到匹配的 username 列表 let account_low = account.as_deref().map(|s| s.to_lowercase()); - let target_usernames: Option> = account_low.as_ref().map(|low| { + let mut target_usernames: Option> = account_low.as_ref().map(|low| { id2username.values() .filter(|u| { let display = names.display(u); @@ -3082,6 +3112,20 @@ pub async fn q_biz_articles( .collect() }); + // --unread 与 --account 取交集(进一步缩小范围) + if let Some(ref unread_set) = unread_usernames { + target_usernames = Some(match target_usernames.take() { + Some(acc_list) => acc_list.into_iter() + .filter(|u| unread_set.contains(u)) + .collect(), + None => unread_set.iter().cloned().collect(), + }); + // 交集为空 → 提前返回 + if target_usernames.as_ref().map(|v| v.is_empty()).unwrap_or(false) { + return Ok(json!({ "count": 0, "articles": [] })); + } + } + // 3. 进行数据库查询 let biz_path3 = biz_path.clone(); let since2 = since; @@ -3167,8 +3211,15 @@ pub async fn q_biz_articles( articles.extend(items); } - // 5. 按 pub_time DESC 排序,取前 N 条 + // 5. 
按 pub_time DESC 排序 articles.sort_by_key(|a| std::cmp::Reverse(a.pub_time)); + + // --unread 语义 A:每个公众号只保留最新 1 篇(已按 pub_time 排序,取首条即可) + if unread { + let mut seen = std::collections::HashSet::::new(); + articles.retain(|a| seen.insert(a.account_username.clone())); + } + articles.truncate(limit); let results: Vec = articles.into_iter().map(|a| { diff --git a/src/daemon/server.rs b/src/daemon/server.rs index e49417b..3b06727 100644 --- a/src/daemon/server.rs +++ b/src/daemon/server.rs @@ -234,8 +234,8 @@ async fn dispatch( ReloadConfig => { Response::ok(serde_json::json!({ "reloading": true })) } - BizArticles { limit, account, since, until } => { - match query::q_biz_articles(db, &names_arc, limit, account, since, until).await { + BizArticles { limit, account, since, until, unread } => { + match query::q_biz_articles(db, &names_arc, limit, account, since, until, unread).await { Ok(v) => Response::ok(v), Err(e) => Response::err(e.to_string()), } diff --git a/src/ipc.rs b/src/ipc.rs index 111abfc..c478ee4 100644 --- a/src/ipc.rs +++ b/src/ipc.rs @@ -113,6 +113,9 @@ pub enum Request { since: Option, #[serde(skip_serializing_if = "Option::is_none")] until: Option, + /// 只看有未读消息的公众号,每个公众号取最新 1 篇 + #[serde(default)] + unread: bool, }, /// 朋友圈全文搜索(匹配 contentDesc) SnsSearch {