diff --git a/src/cli/biz_articles.rs b/src/cli/biz_articles.rs new file mode 100644 index 0000000..3952437 --- /dev/null +++ b/src/cli/biz_articles.rs @@ -0,0 +1,28 @@ +use anyhow::Result; +use crate::ipc::Request; +use super::history::{parse_time, parse_time_end}; +use super::transport; +use super::output::{resolve, print_value}; + +pub fn cmd_biz_articles( + limit: usize, + account: Option, + since: Option, + until: Option, + json: bool, +) -> Result<()> { + let since_ts = since.as_deref().map(parse_time).transpose()?; + let until_ts = until.as_deref().map(parse_time_end).transpose()?; + + let req = Request::BizArticles { + limit, + account, + since: since_ts, + until: until_ts, + }; + let resp = transport::send(req)?; + let data = resp.data.get("articles") + .cloned() + .unwrap_or(serde_json::Value::Array(vec![])); + print_value(&data, &resolve(json)) +} diff --git a/src/cli/mod.rs b/src/cli/mod.rs index 3a28060..b2e3097 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -1,4 +1,5 @@ mod init; +pub mod biz_articles; pub mod sessions; pub mod history; pub mod search; @@ -220,6 +221,24 @@ enum Commands { #[arg(long)] json: bool, }, + /// 查询公众号文章推送(本地缓存) + BizArticles { + /// 显示数量 + #[arg(short = 'n', long, default_value = "50")] + limit: usize, + /// 限定公众号(名称模糊匹配) + #[arg(long)] + account: Option, + /// 起始时间 YYYY-MM-DD + #[arg(long)] + since: Option, + /// 结束时间 YYYY-MM-DD + #[arg(long)] + until: Option, + /// 输出 JSON(默认 YAML) + #[arg(long)] + json: bool, + }, /// 朋友圈全文搜索:匹配正文关键词 SnsSearch { /// 关键词 @@ -304,6 +323,9 @@ fn dispatch(cli: Cli) -> Result<()> { Commands::SnsSearch { keyword, limit, since, until, user, json } => { sns_search::cmd_sns_search(keyword, limit, since, until, user, json) } + Commands::BizArticles { limit, account, since, until, json } => { + biz_articles::cmd_biz_articles(limit, account, since, until, json) + } Commands::Daemon { cmd } => daemon_cmd::cmd_daemon(cmd), } } diff --git a/src/daemon/query.rs b/src/daemon/query.rs index 
2d33e97..cb210b0 100644 --- a/src/daemon/query.rs +++ b/src/daemon/query.rs @@ -2945,6 +2945,346 @@ pub async fn q_sns_search( Ok(json!({ "keyword": keyword, "posts": posts, "total": total })) } +// ─── Official-account article queries ─────────────────────────────────────────────────────────── + +/// One article parsed out of an official-account push message. +#[derive(Debug)] +struct BizArticle { + /// Timestamp at which the push was received (the message's create_time). + recv_time: i64, + /// Username of the official account. + account_username: String, + /// Article title. + title: String, + /// Article link. + url: String, + /// Digest (summary). + digest: String, + /// Cover image. + cover: String, + /// Article publish time (pub_time, in seconds). + pub_time: i64, +} + +/// Parses every article item out of a single message XML from the biz_message table; items with an empty url or title are skipped, and pub_time falls back to recv_time when it is missing or cannot be parsed. +fn parse_biz_xml_items(recv_time: i64, account_username: &str, xml: &str) -> Vec { + let mut items = Vec::new(); + let mut search_from = 0; + loop { + let Some(item_start) = xml[search_from..].find("") else { break; }; + let abs_start = search_from + item_start; + let Some(item_end) = xml[abs_start..].find("") else { break; }; + let abs_end = abs_start + item_end + 7; + let item_xml = &xml[abs_start..abs_end]; + + let title = extract_cdata(item_xml, "title").unwrap_or_default(); + let url = extract_cdata(item_xml, "url").unwrap_or_default(); + // Skip items with no URL or empty title (e.g. 
payment entries) + if url.is_empty() || title.is_empty() { + search_from = abs_end; + continue; + } + let digest = extract_cdata(item_xml, "digest").unwrap_or_default(); + let cover = extract_cdata(item_xml, "cover").unwrap_or_default(); + let pub_time = extract_xml_text(item_xml, "pub_time") + .and_then(|s| s.parse::().ok()) + .unwrap_or(recv_time); + + items.push(BizArticle { + recv_time, + account_username: account_username.to_string(), + title, + url, + digest, + cover, + pub_time, + }); + search_from = abs_end; + } + items +} + +/// Extracts the trimmed text between the opening `<tag>` and the first match of the closing pattern after it (presumably `</tag>`; TODO confirm). +/// +/// NOTE(review): when that text starts with what is presumably the 9-byte CDATA opener, those 9 bytes are skipped, a trailing `]]>` (or a bare `]]`) is removed and the remainder is trimmed; an empty remainder yields `None`. +/// Otherwise an empty text yields `None` and anything else is returned through `unescape_html`. +fn extract_cdata(xml: &str, tag: &str) -> Option { + let open = format!("<{}>", tag); + let close = format!("", tag); + let start = xml.find(&open)? + open.len(); + let end = xml[start..].find(&close)?; + let inner = xml[start..start + end].trim(); + if inner.starts_with("` → strip 9-char `` suffix + let body = &inner[9..]; + // Strip `]]>` (normal) or `]]` (edge case) + let cdata_end = b"]]>"; + let cdata_end2 = b"]]"; + let content: &str = if body.as_bytes().ends_with(cdata_end) { + &body[..body.len() - 3] + } else if body.as_bytes().ends_with(cdata_end2) { + &body[..body.len() - 2] + } else { + body + }; + let content = content.trim(); + if content.is_empty() { None } else { Some(content.to_string()) } + } else if inner.is_empty() { + None + } else { + Some(unescape_html(inner)) + } +} + +/// Queries official-account article pushes (biz_message_0.db). +/// +/// A single message may carry several articles (multi-article push); the result is the flattened list of all articles, sorted by pub_time descending and truncated to `limit`. NOTE(review): `since`/`until` are compared against create_time (the receive time) rather than pub_time, every matching row is loaded before truncation, and rows or tables that fail to read are skipped silently. +pub async fn q_biz_articles( + db: &DbCache, + names: &Names, + limit: usize, + account: Option, + since: Option, + until: Option, +) -> Result { + let biz_path = db.get("message/biz_message_0.db").await? + .context("无法解密 biz_message_0.db,请确认 all_keys.json 包含对应密鑰")? +; + + // 1. 
从 Name2Id 表获取 rowid -> username 映射,再推导 md5 -> username + let biz_path2 = biz_path.clone(); + let id2username: HashMap = tokio::task::spawn_blocking(move || { + let conn = Connection::open(&biz_path2)?; + let mut stmt = conn.prepare("SELECT rowid, user_name FROM Name2Id WHERE user_name LIKE 'gh_%'")? + ; + let rows = stmt.query_map([], |row| { + Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?)) + })? + .collect::>>()?; + Ok::<_, anyhow::Error>(rows.into_iter().collect()) + }).await??; + + // Build the md5(username) -> username map + let md5_to_uname: HashMap = id2username.values() + .map(|u| (format!("{:x}", md5::compute(u.as_bytes())), u.clone())) + .collect(); + + // 2. If --account was given, collect the usernames whose display name or username contains it (case-insensitive) + let account_low = account.as_deref().map(|s| s.to_lowercase()); + let target_usernames: Option> = account_low.as_ref().map(|low| { + id2username.values() + .filter(|u| { + let display = names.display(u); + display.to_lowercase().contains(low.as_str()) + || u.to_lowercase().contains(low.as_str()) + }) + .cloned() + .collect() + }); + + // 3. Run the database query on a blocking thread + let biz_path3 = biz_path.clone(); + let since2 = since; + let until2 = until; + let target_hashes: Option> = target_usernames.as_ref().map(|unames| { + unames.iter() + .map(|u| format!("{:x}", md5::compute(u.as_bytes()))) + .collect() + }); + + let rows: Vec<(String, i64, i64, Vec, i64)> = tokio::task::spawn_blocking(move || { + let conn = Connection::open(&biz_path3)?; + + // List every Msg_ table + let mut stmt = conn.prepare( + "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'Msg_%'" + )?; + let table_names: Vec = stmt.query_map([], |row| row.get(0))? 
+ .filter_map(|r| r.ok()) + .collect(); + + let re = regex::Regex::new(r"^Msg_[0-9a-f]{32}$").unwrap(); + let mut all_rows: Vec<(String, i64, i64, Vec, i64)> = Vec::new(); + + for tname in &table_names { + if !re.is_match(tname) { continue; } + let hash = &tname[4..]; + + // Account filter: skip tables whose hash is not among the targets + if let Some(ref hashes) = target_hashes { + if !hashes.iter().any(|h| h == hash) { continue; } + } + + let username = md5_to_uname.get(hash).cloned().unwrap_or_default(); + + // Build the WHERE clauses and their bound parameters + let mut clauses: Vec = Vec::new(); + let mut params: Vec> = Vec::new(); + // local_type & 0xFFFFFFFF = 49 is appmsg (official-account article) + clauses.push("(local_type & 4294967295) = 49".to_string()); + if let Some(s) = since2 { + clauses.push("create_time >= ?".to_string()); + params.push(Box::new(s)); + } + if let Some(u) = until2 { + clauses.push("create_time <= ?".to_string()); + params.push(Box::new(u)); + } + let where_clause = format!("WHERE {}", clauses.join(" AND ")); + + let sql = format!( + "SELECT create_time, WCDB_CT_message_content, message_content \ + FROM [{}] {} ORDER BY create_time DESC", + tname, where_clause + ); + + let params_ref: Vec<&dyn rusqlite::types::ToSql> = params.iter().map(|p| p.as_ref()).collect(); + if let Ok(mut inner_stmt) = conn.prepare(&sql) { + let msg_rows: Vec<_> = inner_stmt + .query_map(params_ref.as_slice(), |row| { + Ok(( + username.clone(), + row.get::<_, i64>(0)?, + row.get::<_, i64>(1).unwrap_or(0), + get_content_bytes(row, 2), + 0i64, + )) + }) + .map(|it| it.filter_map(|r| r.ok()).collect()) + .unwrap_or_default(); + all_rows.extend(msg_rows); + } + } + Ok::<_, anyhow::Error>(all_rows) + }).await??; + + // 4. Decompress and parse the XML + let mut articles: Vec = Vec::new(); + for (username, recv_time, ct, content_bytes, _) in rows { + let content = decompress_message(&content_bytes, ct); + if content.is_empty() { continue; } + let items = parse_biz_xml_items(recv_time, &username, &content); + articles.extend(items); + } + + // 5. 
按 pub_time DESC 排序,取前 N 条 + articles.sort_by_key(|a| std::cmp::Reverse(a.pub_time)); + articles.truncate(limit); + + let results: Vec = articles.into_iter().map(|a| { + let account_display = names.display(&a.account_username); + json!({ + "time": fmt_time(a.pub_time, "%Y-%m-%d %H:%M"), + "timestamp": a.pub_time, + "recv_time": a.recv_time, + "recv_time_str": fmt_time(a.recv_time, "%Y-%m-%d %H:%M"), + "account": account_display, + "account_username": a.account_username, + "title": a.title, + "url": a.url, + "digest": a.digest, + "cover_url": a.cover, + }) + }).collect(); + + Ok(json!({ "count": results.len(), "articles": results })) +} + +#[cfg(test)] +mod biz_tests { + use super::*; + + #[test] + fn extract_cdata_normal() { + let xml = "<![CDATA[TencentResearch]]>"; + assert_eq!(extract_cdata(xml, "title"), Some("TencentResearch".into())); + } + + #[test] + fn extract_cdata_empty() { + let xml = ""; + assert_eq!(extract_cdata(xml, "cover"), None); + } + + #[test] + fn extract_cdata_url() { + let xml = ""; + let result = extract_cdata(xml, "url"); + assert!(result.is_some()); + let url = result.unwrap(); + assert!(url.starts_with("http://mp.weixin.qq.com")); + assert!(!url.contains("CDATA")); + } + + #[test] + fn extract_cdata_no_cdata_wrapper() { + let xml = "1700000000"; + assert_eq!(extract_cdata(xml, "pub_time"), Some("1700000000".into())); + } + + #[test] + fn parse_biz_xml_items_single_article() { + let xml = r#" + <![CDATA[Test Article Title]]> + + + + 1700000000 + "#; + + let items = parse_biz_xml_items(1699999999, "gh_test123", xml); + assert_eq!(items.len(), 1); + assert_eq!(items[0].title, "Test Article Title"); + assert_eq!(items[0].url, "http://mp.weixin.qq.com/s?test=1"); + assert_eq!(items[0].digest, "Test Digest"); + assert_eq!(items[0].pub_time, 1700000000); + assert_eq!(items[0].account_username, "gh_test123"); + } + + #[test] + fn parse_biz_xml_items_skips_no_url() { + let xml = r#" + <![CDATA[Has Title No URL]]> + + 1700000001 + "#; + let items = 
parse_biz_xml_items(1700000001, "gh_test", xml); + assert_eq!(items.len(), 0); + } + + #[test] + fn parse_biz_xml_items_multi_article() { + let xml = r#" + + <![CDATA[Article 1]]> + + 1700000010 + + + <![CDATA[Article 2]]> + + 1700000020 + + "#; + let items = parse_biz_xml_items(1700000000, "gh_multi", xml); + assert_eq!(items.len(), 2); + assert_eq!(items[0].title, "Article 1"); + assert_eq!(items[1].title, "Article 2"); + } + + #[test] + fn parse_biz_xml_items_pub_time_fallback() { + // When pub_time is missing, should fall back to recv_time + let xml = r#" + <![CDATA[No PubTime]]> + + "#; + let items = parse_biz_xml_items(1700000099, "gh_fallback", xml); + assert_eq!(items.len(), 1); + assert_eq!(items[0].pub_time, 1700000099); // falls back to recv_time + } +} + #[cfg(test)] mod group_nickname_tests { use super::*; diff --git a/src/daemon/server.rs b/src/daemon/server.rs index 4d7fd54..e49417b 100644 --- a/src/daemon/server.rs +++ b/src/daemon/server.rs @@ -234,5 +234,11 @@ async fn dispatch( ReloadConfig => { Response::ok(serde_json::json!({ "reloading": true })) } + BizArticles { limit, account, since, until } => { + match query::q_biz_articles(db, &names_arc, limit, account, since, until).await { + Ok(v) => Response::ok(v), + Err(e) => Response::err(e.to_string()), + } + } } } diff --git a/src/ipc.rs b/src/ipc.rs index 32e0a8f..111abfc 100644 --- a/src/ipc.rs +++ b/src/ipc.rs @@ -102,6 +102,18 @@ pub enum Request { #[serde(skip_serializing_if = "Option::is_none")] user: Option, }, + /// Query official-account article pushes (biz_message_0.db) + BizArticles { + #[serde(default = "default_limit_50")] + limit: usize, + /// Account name filter (fuzzy match on display name; None = all) + #[serde(skip_serializing_if = "Option::is_none")] + account: Option, + #[serde(skip_serializing_if = "Option::is_none")] + since: Option, + #[serde(skip_serializing_if = "Option::is_none")] + until: Option, + }, /// 朋友圈全文搜索(匹配 contentDesc) SnsSearch { keyword: String,