review: tighten attachment extraction scope

pull/57/head
jackwener 2026-05-14 19:10:03 +08:00
parent 7feacc6371
commit b63589b368
6 changed files with 112 additions and 26 deletions

View File

@ -211,14 +211,14 @@ wx biz-articles --json | jq '.[].url' # 下游消费 URL
每条返回:`account` / `account_username` / `title` / `url` / `digest` / `cover_url` / `time` / `timestamp` / `recv_time_str`。多图文推送会展开成多行。
### 附件提取(图片 / 视频 / 文件 / 语音)
### 附件提取(图片)
聊天里的附件本体存在 `xwechat_files/<wxid>/msg/attach/...` 下的 `.dat` 文件,需要按消息所在 `message_resource.db` 的 md5 + 平台相关 image key 解码才能拿到原图。
```bash
# 1) 列出会话里的附件,先拿到不透明的 attachment_id(默认 image,可多选)
# 1) 列出会话里的图片附件,先拿到不透明的 attachment_id
wx attachments "张三"
wx attachments "AI群" --kind image --kind video -n 100
wx attachments "AI群" --kind image -n 100
wx attachments "AI群" --since 2026-04-01 --until 2026-04-15
# 2) 把单个 attachment_id 解密写出去(扩展名建议保留 .jpg / .mp4 等)
@ -226,7 +226,7 @@ wx extract <attachment_id> -o ~/Desktop/photo.jpg
wx extract <attachment_id> -o /tmp/x.jpg --overwrite
```
`attachments` 输出每条带:`attachment_id` / `kind` / `type` / `local_id` / `timestamp` / `time`,群聊里还有 `sender`。
`attachments` 输出每条带:`attachment_id` / `kind` / `type` / `local_id` / `timestamp` / `time`,群聊里还有 `sender`。当前 `kind` 固定为 `image`;命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。
`extract` 输出报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`(实际识别出的图片格式:jpg / png / gif / webp / hevc 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。

View File

@ -242,14 +242,14 @@ wx biz-articles --since 2026-05-10 --json | jq '.[].url'
每条返回的字段:`account` / `account_username`(`gh_*`)/ `title` / `url`(`mp.weixin.qq.com` 链接)/ `digest` / `cover_url` / `time` + `timestamp`(文章发布时间)/ `recv_time_str` + `recv_time`(微信接收推送的时间)。多图文推送会展开为多行。
### 附件提取(图片 / 视频 / 文件 / 语音)
### 附件提取(图片)
聊天里的图片/视频/文件本体在 `xwechat_files/<wxid>/msg/attach/...` 下加密存储(`.dat`),需要按消息所在 `message_resource.db` 的 md5 + 平台相关 image key 才能解码。两步走:
聊天里的图片本体在 `xwechat_files/<wxid>/msg/attach/...` 下加密存储(`.dat`),需要按消息所在 `message_resource.db` 的 md5 + 平台相关 image key 才能解码。两步走:
```bash
# 1) 先列出附件,拿到不透明的 attachment_id(默认 image,可多选)
# 1) 先列出图片附件,拿到不透明的 attachment_id
wx attachments "张三"
wx attachments "AI群" --kind image --kind video -n 100
wx attachments "AI群" --kind image -n 100
wx attachments "AI群" --since 2026-04-01 --until 2026-04-15
# 2) 用 attachment_id 把单个资源解密写到指定路径
@ -257,7 +257,7 @@ wx extract <attachment_id> -o ~/Desktop/photo.jpg
wx extract <attachment_id> -o /tmp/x.jpg --overwrite
```
`attachments` 输出每条带:`attachment_id` / `kind`(image/voice/video/file)/ `type` / `local_id` / `timestamp` / `time`,群聊里另带 `sender`。
`attachments` 输出每条带:`attachment_id` / `kind`(当前固定 `image`)/ `type` / `local_id` / `timestamp` / `time`,群聊里另带 `sender`。命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。
`extract` 报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`(实际识别出的图片格式:jpg / png / gif / webp / hevc 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。

View File

@ -48,6 +48,7 @@ pub fn lookup_md5_blocking(
resource_db_path: &Path,
chat: &str,
local_id: i64,
create_time: i64,
msg_local_type_lo32: i64,
) -> Result<Option<AttachmentMetadata>> {
let conn = Connection::open_with_flags(
@ -68,9 +69,25 @@ pub fn lookup_md5_blocking(
return Ok(None);
};
// 2) MessageResourceInfo: 同 chat 内 local_id 也会复用,按 create_time DESC 取最新
// 2) MessageResourceInfo:
// 同 chat 内 local_id 会复用,所以先用 create_time 精确命中;
// 若资源库里的时间戳跟 message_N.db 不完全对齐,再 fallback 到“同 local_id/type 取最新”
// message_local_type 高 32 bit 是版本/会话 flag,低 32 bit 才是真实类型
let packed: Option<Vec<u8>> = conn
let packed_exact: Option<Vec<u8>> = conn
.query_row(
"SELECT packed_info FROM MessageResourceInfo
WHERE chat_id = ?1
AND message_local_id = ?2
AND (message_local_type = ?3 OR message_local_type % 4294967296 = ?3)
AND message_create_time = ?4
ORDER BY rowid DESC
LIMIT 1",
rusqlite::params![chat_id, local_id, msg_local_type_lo32, create_time],
|row| row.get(0),
)
.ok();
let packed: Option<Vec<u8>> = packed_exact.or_else(|| conn
.query_row(
"SELECT packed_info FROM MessageResourceInfo
WHERE chat_id = ?1
@ -81,7 +98,7 @@ pub fn lookup_md5_blocking(
rusqlite::params![chat_id, local_id, msg_local_type_lo32],
|row| row.get(0),
)
.ok();
.ok());
let Some(blob) = packed else {
return Ok(None);
@ -235,7 +252,13 @@ pub fn resolve_blocking(
super::AttachmentKind::File => 49,
};
let meta = lookup_md5_blocking(resource_db_path, &id.chat, id.local_id, lo32_type)?
let meta = lookup_md5_blocking(
resource_db_path,
&id.chat,
id.local_id,
id.create_time,
lo32_type,
)?
.ok_or_else(|| {
anyhow!(
"message_resource.db 中找不到 chat={} local_id={} type={} 的资源行(可能是非附件消息或资源库未同步)",
@ -306,6 +329,69 @@ mod tests {
assert!(extract_md5_from_packed_info(&blob).is_none());
}
// Regression test: when two MessageResourceInfo rows share the same chat_id,
// message_local_id and message_local_type but differ in message_create_time,
// `lookup_md5_blocking` must return the row whose create_time matches the
// caller-supplied value instead of always returning the most recent row.
//
// NOTE(review): both lookups below pass a create_time that exists in the table,
// so only the exact-match query is exercised; the "latest row" fallback path is
// not covered by this test.
#[test]
fn lookup_md5_prefers_exact_create_time_over_latest_reuse() {
// Build a throwaway resource database on disk.
let dir = tempdir_for_test();
let db_path = dir.join("message_resource.db");
let conn = Connection::open(&db_path).unwrap();
// Minimal ChatName2Id table with a single chat; the rowid is set explicitly
// to 1 so that it lines up with chat_id = 1 in the resource rows below.
// (presumably the lookup resolves the chat name to this rowid — confirm
// against the first query in `lookup_md5_blocking`.)
conn.execute(
"CREATE TABLE ChatName2Id (user_name TEXT)",
[],
)
.unwrap();
conn.execute(
"INSERT INTO ChatName2Id (rowid, user_name) VALUES (1, 'room@chatroom')",
[],
)
.unwrap();
// Only the columns referenced by the lookup queries are created.
conn.execute(
"CREATE TABLE MessageResourceInfo (
chat_id INTEGER,
message_local_id INTEGER,
message_local_type INTEGER,
message_create_time INTEGER,
packed_info BLOB
)",
[],
)
.unwrap();
// Each packed_info blob is a 4-byte prefix followed by a 32-character ASCII
// string; the assertions at the end expect that string back as `md5`.
// (the prefix looks like a nested length-delimited header — TODO confirm
// against `extract_md5_from_packed_info`.)
let old_blob = {
let mut blob = vec![0x12, 0x22, 0x0A, 0x20];
blob.extend_from_slice(b"11111111111111111111111111111111");
blob
};
let new_blob = {
let mut blob = vec![0x12, 0x22, 0x0A, 0x20];
blob.extend_from_slice(b"22222222222222222222222222222222");
blob
};
// Two rows that collide on (chat_id = 1, local_id = 7, type = 3) and differ
// only in create_time (1000 vs 2000), simulating a reused local_id.
conn.execute(
"INSERT INTO MessageResourceInfo
(chat_id, message_local_id, message_local_type, message_create_time, packed_info)
VALUES (?1, ?2, ?3, ?4, ?5)",
rusqlite::params![1i64, 7i64, 3i64, 1000i64, old_blob],
)
.unwrap();
conn.execute(
"INSERT INTO MessageResourceInfo
(chat_id, message_local_id, message_local_type, message_create_time, packed_info)
VALUES (?1, ?2, ?3, ?4, ?5)",
rusqlite::params![1i64, 7i64, 3i64, 2000i64, new_blob],
)
.unwrap();
// Same chat / local_id / type, different create_time: each call must resolve
// to its own row. The first unwrap checks the Result, the second the Option.
let old = lookup_md5_blocking(&db_path, "room@chatroom", 7, 1000, 3)
.unwrap()
.unwrap();
let new = lookup_md5_blocking(&db_path, "room@chatroom", 7, 2000, 3)
.unwrap()
.unwrap();
assert_eq!(old.md5, "11111111111111111111111111111111");
assert_eq!(new.md5, "22222222222222222222222222222222");
}
#[test]
fn three_month_candidates_includes_prev_curr_next() {
// 2025-08-15 (mid-month) → 2025-07, 2025-08, 2025-09

View File

@ -264,13 +264,13 @@ enum Commands {
#[arg(long)]
json: bool,
},
/// 列出某会话的附件(图片 / 视频 / 文件 / 语音),返回不透明 attachment_id
/// 列出某会话的图片附件,返回不透明 attachment_id
Attachments {
/// 会话名称(联系人显示名 / wxid / @chatroom username 都可以)
chat: String,
/// 类型(多选,默认 image。可选image / voice / video / file
/// 类型(当前仅支持 image
#[arg(long = "kind", value_name = "KIND",
value_parser = ["image", "voice", "video", "file", "audio", "img"])]
value_parser = ["image", "img"])]
kinds: Vec<String>,
/// 显示数量
#[arg(short = 'n', long, default_value = "50")]

View File

@ -3285,7 +3285,7 @@ pub async fn q_biz_articles(
Ok(json!({ "count": results.len(), "articles": results }))
}
// ─── 附件(图片 / 视频 / 文件 / 语音)查询与提取 ─────────────────────────────────
// ─── 附件(当前先支持图片)查询与提取 ─────────────────────────────────
//
// 设计要点:
// - `q_attachments` 只走 `Msg_<chat_md5>` 表,按 `local_type & 0xFFFFFFFF IN (...)` 过滤
@ -3296,7 +3296,7 @@ pub async fn q_biz_articles(
// - V2 image AES key 通过 `image_key::default_provider()` 拿(codex 后续填实现)。
// 缺 key 时 V2 解码会返回明确错误,CLI 直接抛给用户。
/// 列出某会话内的附件消息(默认 image可多选)。返回每条的 `attachment_id`
/// 列出某会话内的附件消息(当前仅 image)。返回每条的 `attachment_id`
/// 后续传给 `Extract` 才真正读 message_resource.db + 解密 .dat。
pub async fn q_attachments(
db: &DbCache,
@ -3319,7 +3319,7 @@ pub async fn q_attachments(
// 解析 kinds → 低 32 bit local_type 集合
let kind_filters: Vec<(AttachmentKind, i64)> = parse_attachment_kinds(kinds.as_deref())?;
if kind_filters.is_empty() {
anyhow::bail!("kinds 为空 — 至少传一种 image/video/file/voice");
anyhow::bail!("kinds 为空 — 当前至少传一种 image");
}
let lo32_types: Vec<i64> = kind_filters.iter().map(|(_, t)| *t).collect();
// local_type → AttachmentKind 反查mask 完后定 kind
@ -3569,7 +3569,7 @@ pub async fn q_extract(
}
/// 解析 `kinds` 参数到 `(AttachmentKind, lo32_local_type)` 列表。
/// 缺省None / 空)按 image 处理
/// 当前只支持 image命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI
fn parse_attachment_kinds(
kinds: Option<&[String]>,
) -> Result<Vec<(crate::attachment::AttachmentKind, i64)>> {
@ -3583,10 +3583,10 @@ fn parse_attachment_kinds(
for k in raw {
let (kind, t): (AttachmentKind, i64) = match k.to_ascii_lowercase().as_str() {
"image" | "img" => (AttachmentKind::Image, 3),
"voice" | "audio" => (AttachmentKind::Voice, 34),
"video" => (AttachmentKind::Video, 43),
"file" => (AttachmentKind::File, 49),
other => anyhow::bail!("未知附件类型:{}支持 image/voice/video/file", other),
"voice" | "audio" | "video" | "file" => {
anyhow::bail!("当前只支持 image 提取video/file/voice 的资源路径与 decoder 还没接通")
}
other => anyhow::bail!("未知附件类型:{}当前仅支持 image", other),
};
if seen.insert(kind.as_str()) {
out.push((kind, t));

View File

@ -131,11 +131,11 @@ pub enum Request {
},
/// 重新加载配置和密钥init --force 后 daemon 不会自动重读)
ReloadConfig,
/// 列出某个会话里的附件(图片 / 视频 / 文件 / 语音)
/// 列出某个会话里的图片附件
/// 输出每条带 `attachment_id`(不透明 base64url 句柄),传给 `Extract` 时取回本体
Attachments {
chat: String,
/// 类型过滤:image / video / file / voice多选缺省返回 image
/// 类型过滤:当前仅支持 image
#[serde(default, skip_serializing_if = "Option::is_none")]
kinds: Option<Vec<String>>,
#[serde(default = "default_limit_50")]