From c3cb372c8c6bf47c7cc24eaa58fa855c27614c97 Mon Sep 17 00:00:00 2001 From: Richard Liu <1625351+richardzone@users.noreply.github.com> Date: Tue, 9 Jun 2026 11:58:12 +0800 Subject: [PATCH] feat: transcode WXGF image attachments --- README.md | 5 +- SKILL.md | 5 +- src/attachment/decoder/mod.rs | 1 + src/attachment/decoder/wxgf.rs | 199 +++++++++++++++++++++++++++++++++ src/cli/extract.rs | 2 + src/cli/mod.rs | 6 +- src/daemon/query.rs | 51 ++++++++- src/daemon/server.rs | 3 +- src/ipc.rs | 3 + 9 files changed, 265 insertions(+), 10 deletions(-) create mode 100644 src/attachment/decoder/wxgf.rs diff --git a/README.md b/README.md index 1c1c7b5..63e3e67 100644 --- a/README.md +++ b/README.md @@ -243,11 +243,14 @@ wx attachments "AI群" --since 2026-04-01 --until 2026-04-15 # 2) 把单个 attachment_id 解密写出去(扩展名建议保留 .jpg / .mp4 等) wx extract <attachment_id> -o ~/Desktop/photo.jpg wx extract <attachment_id> -o /tmp/x.jpg --overwrite +wx extract <attachment_id> -o /tmp/raw.wxgf --raw # 保留原始 WXGF/HEVC 容器 ``` `attachments` 输出每条带:`attachment_id` / `kind` / `type` / `local_id` / `timestamp` / `time`,群聊里还有 `sender` 以及稳定身份三件套 `sender_username` / `sender_contact_display` / `sender_group_nickname`(语义同 `history` / `search` / `new-messages`:`sender_username` 是 wxid,用于两个同名成员之间的稳定区分;解析不到 wxid 时这三字段不输出)。当前 `kind` 固定为 `image`;命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。 -`extract` 输出报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`(实际识别出的图片格式:jpg / png / gif / webp / hevc 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。 +`extract` 输出报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`(实际写出的图片格式:jpg / png / gif / webp 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。 + +微信 4 会把部分图片保存成内部 `WXGF/WXAM` 容器(解码后头部为 `wxgf`,报告里的 `source_format` 为 `hevc`)。默认 `wx extract` 会从 WXGF 中提取最大的 HEVC partition,并调用 `ffmpeg` 转成 JPG;报告会额外带 `source_format` / `source_size` / `transcoder` / `wxgf_partition_*`。如果本机没有 `ffmpeg`,请安装后重试,或用 `WX_FFMPEG=/path/to/ffmpeg` 指定路径;确实需要原始容器时传 
`--raw`。如果微信里从未点开过该图片,本地通常只有 `_t.dat` 缩略图,先在微信客户端点开图片让它下载完整 `.dat`,再重新执行 `wx extract`。 支持的解码档位: - **legacy XOR**:早期单字节 XOR,无 magic(按文件首字节探测格式自动反推) diff --git a/SKILL.md b/SKILL.md index 61082fe..f169e2d 100644 --- a/SKILL.md +++ b/SKILL.md @@ -280,11 +280,14 @@ wx attachments "AI群" --since 2026-04-01 --until 2026-04-15 # 2) 用 attachment_id 把单个资源解密写到指定路径 wx extract <attachment_id> -o ~/Desktop/photo.jpg wx extract <attachment_id> -o /tmp/x.jpg --overwrite +wx extract <attachment_id> -o /tmp/raw.wxgf --raw # 保留原始 WXGF/HEVC 容器 ``` `attachments` 输出每条带:`attachment_id` / `kind`(当前固定 `image`)/ `type` / `local_id` / `timestamp` / `time`,群聊里另带 `sender` 和稳定身份三件套(同上文)。命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。 -`extract` 报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`(实际识别出的图片格式:jpg / png / gif / webp / hevc 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。 +`extract` 报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`(实际写出的图片格式:jpg / png / gif / webp 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。 + +微信 4 图片可能解码成 `WXGF/WXAM` 容器(头部 `wxgf`,内部是 HEVC)。默认 `wx extract` 会自动提取 WXGF 里的最大 HEVC partition 并用 `ffmpeg` 转 JPG;报告会带 `source_format: hevc`、`source_size`、`transcoder` 和 `wxgf_partition_*`。如果只需要原始容器,传 `--raw`。如果本地只拿到 `_t.dat` 缩略图,输出会很小且文字图片不可读;让用户先在微信客户端点开图片,等完整 `.dat` 下载到本地后再重新 `wx extract`。 支持的解码档位: - **legacy XOR**:早期单字节 XOR,无 magic(按文件首字节探测格式自动反推) diff --git a/src/attachment/decoder/mod.rs b/src/attachment/decoder/mod.rs index a5723c5..7e4aad1 100644 --- a/src/attachment/decoder/mod.rs +++ b/src/attachment/decoder/mod.rs @@ -13,6 +13,7 @@ use anyhow::{anyhow, Result}; pub mod v1_xor; pub mod v2; +pub mod wxgf; /// 完整 V2 magic:`\x07\x08V2\x08\x07` pub const V2_MAGIC: [u8; 6] = [0x07, 0x08, b'V', b'2', 0x08, 0x07]; diff --git a/src/attachment/decoder/wxgf.rs b/src/attachment/decoder/wxgf.rs new file mode 100644 index 0000000..5c2bd25 --- /dev/null +++ b/src/attachment/decoder/wxgf.rs @@ -0,0 +1,199 @@ +//! WeChat 4 WXGF/WXAM image container support. +//! 
+//! `wxgf` is not a normal image format. It is a private WeChat container whose
+//! largest data partition is usually an Annex B HEVC bitstream. We keep the
+//! parser tiny: find HEVC start codes after the WXGF header, validate the
+//! 4-byte big-endian length immediately before the start code, then hand the
+//! largest partition to ffmpeg.
+
+use anyhow::{bail, Context, Result};
+use std::path::PathBuf;
+use std::process::Command;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::{SystemTime, UNIX_EPOCH};
+
+/// Magic bytes expected at the very start of a WXGF/WXAM container.
+const WXGF_MAGIC: &[u8; 4] = b"wxgf";
+/// Name of the environment variable that overrides the ffmpeg executable.
+const FFMPEG_ENV: &str = "WX_FFMPEG";
+
+/// Location of one length-prefixed HEVC partition inside a WXGF container.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct WxgfPartition {
+    /// Byte offset of the partition's HEVC start code within the container.
+    pub offset: usize,
+    /// Partition byte length, including the HEVC start code at `offset`.
+    pub size: usize,
+    /// `size` divided by the total container length.
+    pub ratio: f64,
+}
+
+/// Result of transcoding a WXGF container to JPEG.
+#[derive(Debug)]
+pub struct WxgfJpeg {
+    /// JPEG bytes read back from the ffmpeg output file.
+    pub data: Vec<u8>,
+    /// The partition that was handed to ffmpeg.
+    pub partition: WxgfPartition,
+    /// The ffmpeg command that was executed.
+    pub ffmpeg: String,
+}
+
+/// Temporary ffmpeg input/output paths; both files are deleted on drop.
+struct TempPaths {
+    input: PathBuf,
+    output: PathBuf,
+}
+
+impl Drop for TempPaths {
+    fn drop(&mut self) {
+        // Best-effort cleanup: either file may not exist (e.g. ffmpeg never
+        // produced output), so removal errors are deliberately ignored.
+        let _ = std::fs::remove_file(&self.input);
+        let _ = std::fs::remove_file(&self.output);
+    }
+}
+
+/// Return the largest HEVC Annex B partition inside a WXGF/WXAM container.
+///
+/// The byte at index 4 is taken as the header length. The bytes after the
+/// header are scanned for Annex B start codes, and a start code only counts as
+/// a partition when the 4 bytes right before it decode (big-endian) to a
+/// non-zero length that fits inside `data`. 4-byte start codes are tried
+/// first; 3-byte start codes are only considered when that pass finds nothing.
+///
+/// # Errors
+///
+/// Fails when `data` is shorter than 15 bytes, does not start with `wxgf`,
+/// declares a header length that is not smaller than `data.len()`, or contains
+/// no valid partition.
+pub fn largest_partition(data: &[u8]) -> Result<WxgfPartition> {
+    if data.len() < 15 || &data[..4] != WXGF_MAGIC {
+        bail!("invalid WXGF image container");
+    }
+
+    let header_len = data[4] as usize;
+    if header_len >= data.len() {
+        bail!("invalid WXGF header length {}", header_len);
+    }
+
+    for pattern in [&[0x00, 0x00, 0x00, 0x01][..], &[0x00, 0x00, 0x01][..]] {
+        let mut best: Option<WxgfPartition> = None;
+        // Absolute position where the next start-code search begins.
+        let mut cursor = header_len;
+
+        while cursor < data.len() {
+            let Some(idx) = find_subslice(&data[cursor..], pattern) else {
+                break;
+            };
+            let start = cursor + idx;
+
+            // Decode the length prefix stored in the 4 bytes before the start
+            // code; `None` means this start code is not a partition boundary.
+            let size = start.checked_sub(4).and_then(|len_at| {
+                let prefix: [u8; 4] = data[len_at..start].try_into().ok()?;
+                let size = u32::from_be_bytes(prefix) as usize;
+                let end = start.checked_add(size)?;
+                (size > 0 && end <= data.len()).then_some(size)
+            });
+
+            match size {
+                Some(size) => {
+                    // `>=` lets the later partition win when sizes are equal.
+                    if best.map_or(true, |b| size >= b.size) {
+                        best = Some(WxgfPartition {
+                            offset: start,
+                            size,
+                            ratio: size as f64 / data.len() as f64,
+                        });
+                    }
+                    // Skip the whole partition so start codes inside its
+                    // payload are not mistaken for new partitions.
+                    cursor = start + size;
+                }
+                // False positive: resume the search one byte further on.
+                None => cursor = start + 1,
+            }
+        }
+
+        if let Some(partition) = best {
+            return Ok(partition);
+        }
+    }
+
+    bail!("WXGF image has no valid HEVC partition")
+}
+
+/// Convert a WXGF/WXAM image to JPEG through ffmpeg.
+///
+/// The ffmpeg path is resolved from `WX_FFMPEG`, then falls back to `ffmpeg` in
+/// PATH. This avoids adding Python or native HEVC decoder dependencies.
+///
+/// An empty or whitespace-only `WX_FFMPEG` is treated as unset.
+///
+/// # Errors
+///
+/// Fails when no HEVC partition can be located, when the temporary input
+/// cannot be written, when ffmpeg cannot be started or exits unsuccessfully,
+/// or when the output file cannot be read or is empty.
+pub fn transcode_to_jpeg(data: &[u8]) -> Result<WxgfJpeg> {
+    let partition = largest_partition(data)?;
+    // In bounds: `largest_partition` only returns ranges that fit inside `data`.
+    let hevc = &data[partition.offset..partition.offset + partition.size];
+    // An empty override would otherwise be used as the program name and fail
+    // with a misleading "cannot start ffmpeg" error.
+    let ffmpeg = std::env::var(FFMPEG_ENV)
+        .ok()
+        .filter(|path| !path.trim().is_empty())
+        .unwrap_or_else(|| "ffmpeg".to_string());
+    // Dropping `paths` removes both temp files on every exit path below.
+    let paths = temp_paths();
+
+    std::fs::write(&paths.input, hevc)
+        .with_context(|| format!("写出 WXGF/HEVC 临时输入失败:{}", paths.input.display()))?;
+
+    // Decode the raw HEVC stream and encode its first frame as MJPEG.
+    let output = Command::new(&ffmpeg)
+        .args(["-y", "-hide_banner", "-loglevel", "error", "-f", "hevc", "-i"])
+        .arg(&paths.input)
+        .args(["-vframes", "1", "-c:v", "mjpeg", "-q:v", "4"])
+        .arg(&paths.output)
+        .output()
+        .with_context(|| {
+            format!(
+                "启动 ffmpeg 失败;请安装 ffmpeg 或用 {FFMPEG_ENV} 指定路径,或用 wx extract --raw 导出原始 WXGF"
+            )
+        })?;
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        // Cap the echoed stderr at 800 characters to keep the error readable.
+        bail!(
+            "ffmpeg 转码 WXGF/HEVC 失败:{}",
+            stderr.trim().chars().take(800).collect::<String>()
+        );
+    }
+
+    let data = std::fs::read(&paths.output)
+        .with_context(|| format!("读取 ffmpeg 输出失败:{}", paths.output.display()))?;
+    if data.is_empty() {
+        bail!("ffmpeg 转码 WXGF/HEVC 成功但没有输出 JPEG 数据");
+    }
+
+    Ok(WxgfJpeg {
+        data,
+        partition,
+        ffmpeg,
+    })
+}
+
+/// Build a fresh pair of temp-file paths (`.hevc` input, `.jpg` output).
+///
+/// The shared stem combines the process id, the current time in nanoseconds
+/// and a per-process counter. No file is created here.
+fn temp_paths() -> TempPaths {
+    static COUNTER: AtomicU64 = AtomicU64::new(0);
+    // A clock set before the Unix epoch degrades to 0; the pid and counter
+    // still vary the name.
+    let nanos = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .map(|d| d.as_nanos())
+        .unwrap_or(0);
+    let seq = COUNTER.fetch_add(1, Ordering::Relaxed);
+    let stem = format!("wx-cli-wxgf-{}-{}-{}", std::process::id(), nanos, seq);
+    let dir = std::env::temp_dir();
+    TempPaths {
+        input: dir.join(format!("{}.hevc", stem)),
+        output: dir.join(format!("{}.jpg", stem)),
+    }
+}
+
+/// Return the index of the first occurrence of `needle` in `haystack`.
+///
+/// Returns `None` when `needle` is empty, longer than `haystack`, or absent.
+fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
+    if needle.is_empty() || needle.len() > haystack.len() {
+        return None;
+    }
+    haystack.windows(needle.len()).position(|w| w == needle)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn
finds_largest_partition() { + let mut data = b"wxgf".to_vec(); + data.push(19); // header length + data.extend_from_slice(&[0; 14]); + + data.extend_from_slice(&8u32.to_be_bytes()); + data.extend_from_slice(&[0, 0, 0, 1]); + data.extend_from_slice(&[1, 2, 3, 4]); + + let second_offset = data.len() + 4; + data.extend_from_slice(&12u32.to_be_bytes()); + data.extend_from_slice(&[0, 0, 0, 1]); + data.extend_from_slice(&[5, 6, 7, 8, 9, 10, 11, 12]); + + let p = largest_partition(&data).unwrap(); + assert_eq!(p.offset, second_offset); + assert_eq!(p.size, 12); + } + + #[test] + fn rejects_non_wxgf() { + let err = largest_partition(b"not wxgf").unwrap_err().to_string(); + assert!(err.contains("WXGF")); + } +} diff --git a/src/cli/extract.rs b/src/cli/extract.rs index a0eba0d..bfee72e 100644 --- a/src/cli/extract.rs +++ b/src/cli/extract.rs @@ -13,12 +13,14 @@ pub fn cmd_extract( attachment_id: String, output: String, overwrite: bool, + raw: bool, json: bool, ) -> Result<()> { let req = Request::Extract { attachment_id, output, overwrite, + raw, }; let resp = transport::send(req)?; print_value(&resp.data, &resolve(json)) diff --git a/src/cli/mod.rs b/src/cli/mod.rs index b4d6cf4..7681e25 100644 --- a/src/cli/mod.rs +++ b/src/cli/mod.rs @@ -305,6 +305,9 @@ enum Commands { /// 目标已存在时覆盖 #[arg(long)] overwrite: bool, + /// 原样导出解码后的附件数据;WXGF/HEVC 图片不转 JPG + #[arg(long)] + raw: bool, /// 输出 JSON(默认 YAML) #[arg(long)] json: bool, @@ -518,8 +521,9 @@ fn dispatch(cli: Cli) -> Result<()> { attachment_id, output, overwrite, + raw, json, - } => extract::cmd_extract(attachment_id, output, overwrite, json), + } => extract::cmd_extract(attachment_id, output, overwrite, raw, json), Commands::Daemon { cmd } => daemon_cmd::cmd_daemon(cmd), } } diff --git a/src/daemon/query.rs b/src/daemon/query.rs index ac9ec0d..ba1fb49 100644 --- a/src/daemon/query.rs +++ b/src/daemon/query.rs @@ -4483,6 +4483,7 @@ pub async fn q_extract( attachment_id: &str, output: &str, overwrite: bool, + raw: bool, ) 
-> Result { use crate::attachment::{ attachment_id::AttachmentId, @@ -4573,25 +4574,63 @@ pub async fn q_extract( }; let decoded = decoder::dispatch(&dat_bytes, v2_key)?; + let source_format = decoded.format; + let source_size = decoded.data.len(); + let mut output_format = source_format.to_string(); + let mut decoder_name = decoded.decoder.to_string(); + let mut output_data = decoded.data; + let mut wxgf_partition_offset: Option = None; + let mut wxgf_partition_size: Option = None; + let mut wxgf_partition_ratio: Option = None; + let mut transcoder: Option = None; + + if source_format == "hevc" && !raw { + let jpg = decoder::wxgf::transcode_to_jpeg(&output_data) + .context("WXGF/HEVC 图片转 JPG 失败;可安装 ffmpeg 或用 wx extract --raw 导出原始 WXGF")?; + wxgf_partition_offset = Some(jpg.partition.offset); + wxgf_partition_size = Some(jpg.partition.size); + wxgf_partition_ratio = Some(jpg.partition.ratio); + transcoder = Some(format!("ffmpeg:{}", jpg.ffmpeg)); + output_data = jpg.data; + output_format = "jpg".to_string(); + decoder_name.push_str("+wxgf_ffmpeg"); + } // 写盘 - std::fs::write(&output_path2, &decoded.data) + std::fs::write(&output_path2, &output_data) .with_context(|| format!("写出文件失败:{}", output_path2.display()))?; // 注意:不要在这里塞 `ok: true`。dispatch 会用 Response::ok(v) 包一层, // Response 的 `data: Value` 字段是 #[serde(flatten)] 写出的,本 payload // 的 `ok` 会和 Response 自带的 `ok` 在线上拼成两个同名 key,CLI 反序列化时 // serde_json 直接报 "duplicate field",业务请求看上去像 daemon 解析失败。 - Ok(json!({ + let mut report = json!({ "kind": id_for_task.kind.as_str(), "md5": resolved.md5, "dat_path": resolved.dat_path.display().to_string(), "dat_size": resolved.size, "output": output_path2.display().to_string(), - "output_size": decoded.data.len(), - "format": decoded.format, - "decoder": decoded.decoder, - })) + "output_size": output_data.len(), + "format": output_format, + "decoder": decoder_name, + }); + if source_format != report["format"].as_str().unwrap_or_default() { + report["source_format"] = 
json!(source_format); + report["source_size"] = json!(source_size); + } + if let Some(transcoder) = transcoder { + report["transcoder"] = json!(transcoder); + } + if let Some(offset) = wxgf_partition_offset { + report["wxgf_partition_offset"] = json!(offset); + } + if let Some(size) = wxgf_partition_size { + report["wxgf_partition_size"] = json!(size); + } + if let Some(ratio) = wxgf_partition_ratio { + report["wxgf_partition_ratio"] = json!(ratio); + } + Ok(report) }) .await??; diff --git a/src/daemon/server.rs b/src/daemon/server.rs index 242edc1..546432e 100644 --- a/src/daemon/server.rs +++ b/src/daemon/server.rs @@ -353,7 +353,8 @@ async fn dispatch(req: Request, db: &DbCache, names: &tokio::sync::RwLock match query::q_extract(db, &names_arc, &attachment_id, &output, overwrite).await { + raw, + } => match query::q_extract(db, &names_arc, &attachment_id, &output, overwrite, raw).await { Ok(v) => Response::ok(v), Err(e) => Response::err(e.to_string()), }, diff --git a/src/ipc.rs b/src/ipc.rs index 93306fb..4565016 100644 --- a/src/ipc.rs +++ b/src/ipc.rs @@ -184,6 +184,9 @@ pub enum Request { /// 已存在时是否覆盖 #[serde(default)] overwrite: bool, + /// 原样导出解码后的附件数据;图片为 WXGF/HEVC 时不调用 ffmpeg 转 JPG + #[serde(default, skip_serializing_if = "is_false")] + raw: bool, }, }