feat: transcode WXGF image attachments

pull/103/head
Richard Liu 2026-06-09 11:58:12 +08:00
parent 08af894594
commit c3cb372c8c
9 changed files with 265 additions and 10 deletions

View File

@ -243,11 +243,14 @@ wx attachments "AI群" --since 2026-04-01 --until 2026-04-15
# 2) 把单个 attachment_id 解密写出去(扩展名建议保留 .jpg / .mp4 等) # 2) 把单个 attachment_id 解密写出去(扩展名建议保留 .jpg / .mp4 等)
wx extract <attachment_id> -o ~/Desktop/photo.jpg wx extract <attachment_id> -o ~/Desktop/photo.jpg
wx extract <attachment_id> -o /tmp/x.jpg --overwrite wx extract <attachment_id> -o /tmp/x.jpg --overwrite
wx extract <attachment_id> -o /tmp/raw.wxgf --raw # 保留原始 WXGF/HEVC 容器
``` ```
`attachments` 输出每条带:`attachment_id` / `kind` / `type` / `local_id` / `timestamp` / `time`,群聊里还有 `sender` 以及稳定身份三件套 `sender_username` / `sender_contact_display` / `sender_group_nickname`(语义同 `history` / `search` / `new-messages``sender_username` 是 wxid用于两个同名成员之间的稳定区分解析不到 wxid 时这三字段不输出)。当前 `kind` 固定为 `image`;命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。 `attachments` 输出每条带:`attachment_id` / `kind` / `type` / `local_id` / `timestamp` / `time`,群聊里还有 `sender` 以及稳定身份三件套 `sender_username` / `sender_contact_display` / `sender_group_nickname`(语义同 `history` / `search` / `new-messages``sender_username` 是 wxid用于两个同名成员之间的稳定区分解析不到 wxid 时这三字段不输出)。当前 `kind` 固定为 `image`;命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。
`extract` 输出报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`实际识别出的图片格式jpg / png / gif / webp / hevc 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。 `extract` 输出报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`实际写出的图片格式jpg / png / gif / webp 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。
微信 4 会把部分图片保存成内部 `WXGF/WXAM` 容器(解码后头部为 `wxgf`,报告里的 `source_format` 为 `hevc`)。默认 `wx extract` 会从 WXGF 中提取最大的 HEVC partition,并调用 `ffmpeg` 转成 JPG;报告会额外带 `source_format` / `source_size` / `transcoder` / `wxgf_partition_*`。如果本机没有 `ffmpeg`,请安装后重试,或用 `WX_FFMPEG=/path/to/ffmpeg` 指定路径;确实需要原始容器时传 `--raw`。如果微信里从未点开过该图片,本地通常只有 `_t.dat` 缩略图,先在微信客户端点开图片让它下载完整 `.dat`,再重新执行 `wx extract`。
支持的解码档位: 支持的解码档位:
- **legacy XOR**:早期单字节 XOR无 magic按文件首字节探测格式自动反推 - **legacy XOR**:早期单字节 XOR无 magic按文件首字节探测格式自动反推

View File

@ -280,11 +280,14 @@ wx attachments "AI群" --since 2026-04-01 --until 2026-04-15
# 2) 用 attachment_id 把单个资源解密写到指定路径 # 2) 用 attachment_id 把单个资源解密写到指定路径
wx extract <attachment_id> -o ~/Desktop/photo.jpg wx extract <attachment_id> -o ~/Desktop/photo.jpg
wx extract <attachment_id> -o /tmp/x.jpg --overwrite wx extract <attachment_id> -o /tmp/x.jpg --overwrite
wx extract <attachment_id> -o /tmp/raw.wxgf --raw # 保留原始 WXGF/HEVC 容器
``` ```
`attachments` 输出每条带:`attachment_id` / `kind`(当前固定 `image`/ `type` / `local_id` / `timestamp` / `time`,群聊里另带 `sender` 和稳定身份三件套(同上文)。命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。 `attachments` 输出每条带:`attachment_id` / `kind`(当前固定 `image`/ `type` / `local_id` / `timestamp` / `time`,群聊里另带 `sender` 和稳定身份三件套(同上文)。命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。
`extract` 报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`实际识别出的图片格式jpg / png / gif / webp / hevc 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。 `extract` 报告里带:`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`实际写出的图片格式jpg / png / gif / webp 等)/ `decoder`(实际选用的解码器:`legacy_xor` / `v1_aes` / `v2`)。
微信 4 图片可能解码成 `WXGF/WXAM` 容器(头部 `wxgf`,内部是 HEVC)。默认 `wx extract` 会自动提取 WXGF 里的最大 HEVC partition 并用 `ffmpeg` 转 JPG,报告会带 `source_format: hevc`、`source_size`、`transcoder` 和 `wxgf_partition_*`。如果只需要原始容器,传 `--raw`。如果本地只拿到 `_t.dat` 缩略图,输出会很小且文字图片不可读;让用户先在微信客户端点开图片,等完整 `.dat` 下载到本地后再重新 `wx extract`。
支持的解码档位: 支持的解码档位:
- **legacy XOR**:早期单字节 XOR无 magic按文件首字节探测格式自动反推 - **legacy XOR**:早期单字节 XOR无 magic按文件首字节探测格式自动反推

View File

@ -13,6 +13,7 @@ use anyhow::{anyhow, Result};
pub mod v1_xor; pub mod v1_xor;
pub mod v2; pub mod v2;
pub mod wxgf;
/// 完整 V2 magic`\x07\x08V2\x08\x07` /// 完整 V2 magic`\x07\x08V2\x08\x07`
pub const V2_MAGIC: [u8; 6] = [0x07, 0x08, b'V', b'2', 0x08, 0x07]; pub const V2_MAGIC: [u8; 6] = [0x07, 0x08, b'V', b'2', 0x08, 0x07];

View File

@ -0,0 +1,199 @@
//! WeChat 4 WXGF/WXAM image container support.
//!
//! `wxgf` is not a normal image format. It is a private WeChat container whose
//! largest data partition is usually an Annex B HEVC bitstream. We keep the
//! parser tiny: find HEVC start codes after the WXGF header, validate the
//! 4-byte big-endian length immediately before the start code, then hand the
//! largest partition to ffmpeg.
use anyhow::{bail, Context, Result};
use std::path::PathBuf;
use std::process::Command;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{SystemTime, UNIX_EPOCH};
const WXGF_MAGIC: &[u8; 4] = b"wxgf";
const FFMPEG_ENV: &str = "WX_FFMPEG";
/// Location of one length-prefixed HEVC partition inside a WXGF/WXAM buffer.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct WxgfPartition {
    /// Byte index, measured from the start of the container, of the
    /// partition's HEVC start code.
    pub offset: usize,
    /// Byte length of the partition counted from `offset`, i.e. the start code
    /// itself is included.
    pub size: usize,
    /// `size` expressed as a fraction of the whole container length.
    pub ratio: f64,
}
/// Outcome of a successful WXGF -> JPEG transcode.
#[derive(Debug)]
pub struct WxgfJpeg {
    /// Bytes read back from the file ffmpeg wrote.
    pub data: Vec<u8>,
    /// The partition of the source container that was fed to ffmpeg.
    pub partition: WxgfPartition,
    /// The ffmpeg program name or path that was executed.
    pub ffmpeg: String,
}
/// Pair of scratch files used for one ffmpeg run; both are deleted when the
/// value goes out of scope.
struct TempPaths {
    /// File the extracted HEVC bitstream is written to.
    input: PathBuf,
    /// File ffmpeg is asked to write its result to.
    output: PathBuf,
}

impl Drop for TempPaths {
    /// Best-effort cleanup: a removal error (for example because the file was
    /// never created) is deliberately ignored.
    fn drop(&mut self) {
        for scratch in [&self.input, &self.output] {
            let _ = std::fs::remove_file(scratch);
        }
    }
}
/// Locate the biggest length-prefixed HEVC Annex B partition in a WXGF/WXAM
/// container.
///
/// The scan starts at the offset given by the header-length byte (`data[4]`).
/// Every occurrence of a start code is treated as a candidate; it is accepted
/// when the big-endian `u32` stored in the four bytes right before it is
/// non-zero and the partition it describes fits inside `data`. The 4-byte
/// start code is tried first, and the 3-byte form is only consulted when the
/// first pass accepted nothing. Among equally large candidates the one found
/// last is returned.
///
/// # Errors
///
/// Fails when `data` is shorter than 15 bytes or lacks the `wxgf` magic, when
/// the header length points at or past the end of `data`, or when neither
/// start code yields a valid partition.
pub fn largest_partition(data: &[u8]) -> Result<WxgfPartition> {
    const START_CODES: [&[u8]; 2] = [&[0x00, 0x00, 0x00, 0x01], &[0x00, 0x00, 0x01]];

    let total = data.len();
    if total < 15 || !data.starts_with(WXGF_MAGIC) {
        bail!("invalid WXGF image container");
    }
    let header_len = usize::from(data[4]);
    if header_len >= total {
        bail!("invalid WXGF header length {}", header_len);
    }

    for start_code in START_CODES {
        let mut best: Option<WxgfPartition> = None;
        // Absolute position at which the next search begins.
        let mut cursor = header_len;

        while cursor < total {
            let start = match find_subslice(&data[cursor..], start_code) {
                Some(rel) => cursor + rel,
                None => break,
            };
            if start < 4 {
                // No room for a length prefix in front of this match.
                cursor = start + 1;
                continue;
            }

            let declared = u32::from_be_bytes([
                data[start - 4],
                data[start - 3],
                data[start - 2],
                data[start - 1],
            ]) as usize;
            let end = start.checked_add(declared).filter(|&end| end <= total);

            match end {
                Some(end) if declared > 0 => {
                    let candidate = WxgfPartition {
                        offset: start,
                        size: declared,
                        ratio: declared as f64 / total as f64,
                    };
                    // `>=` so that a later partition of equal size wins.
                    if best.map_or(true, |current| candidate.size >= current.size) {
                        best = Some(candidate);
                    }
                    // Resume right after the accepted partition.
                    cursor = end;
                }
                // Implausible length prefix: step one byte past this match.
                _ => cursor = start + 1,
            }
        }

        if let Some(found) = best {
            return Ok(found);
        }
    }

    bail!("WXGF image has no valid HEVC partition")
}
/// Convert a WXGF/WXAM image to JPEG through ffmpeg.
///
/// The ffmpeg path is resolved from `WX_FFMPEG`, then falls back to `ffmpeg` in
/// PATH. This avoids adding Python or native HEVC decoder dependencies.
pub fn transcode_to_jpeg(data: &[u8]) -> Result<WxgfJpeg> {
let partition = largest_partition(data)?;
let hevc = &data[partition.offset..partition.offset + partition.size];
let ffmpeg = std::env::var(FFMPEG_ENV).unwrap_or_else(|_| "ffmpeg".to_string());
let paths = temp_paths();
std::fs::write(&paths.input, hevc)
.with_context(|| format!("写出 WXGF/HEVC 临时输入失败:{}", paths.input.display()))?;
let output = Command::new(&ffmpeg)
.arg("-y")
.arg("-hide_banner")
.arg("-loglevel")
.arg("error")
.arg("-f")
.arg("hevc")
.arg("-i")
.arg(&paths.input)
.arg("-vframes")
.arg("1")
.arg("-c:v")
.arg("mjpeg")
.arg("-q:v")
.arg("4")
.arg(&paths.output)
.output()
.with_context(|| {
format!(
"启动 ffmpeg 失败;请安装 ffmpeg 或用 {FFMPEG_ENV} 指定路径,或用 wx extract --raw 导出原始 WXGF"
)
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
bail!(
"ffmpeg 转码 WXGF/HEVC 失败:{}",
stderr.trim().chars().take(800).collect::<String>()
);
}
let data = std::fs::read(&paths.output)
.with_context(|| format!("读取 ffmpeg 输出失败:{}", paths.output.display()))?;
if data.is_empty() {
bail!("ffmpeg 转码 WXGF/HEVC 成功但没有输出 JPEG 数据");
}
Ok(WxgfJpeg {
data,
partition,
ffmpeg,
})
}
/// Build a fresh pair of scratch file paths inside the system temp directory.
///
/// The shared file stem combines the process id, the current time in
/// nanoseconds since the Unix epoch (0 if the clock reads earlier than the
/// epoch) and a per-process counter. No file is created here.
fn temp_paths() -> TempPaths {
    static NEXT_SEQ: AtomicU64 = AtomicU64::new(0);

    let nanos = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos(),
        Err(_) => 0,
    };
    let seq = NEXT_SEQ.fetch_add(1, Ordering::Relaxed);
    let pid = std::process::id();
    let stem = format!("wx-cli-wxgf-{}-{}-{}", pid, nanos, seq);

    let base = std::env::temp_dir();
    let input = base.join(format!("{}.hevc", stem));
    let output = base.join(format!("{}.jpg", stem));
    TempPaths { input, output }
}
/// Index of the first occurrence of `needle` in `haystack`.
///
/// Returns `None` when there is no occurrence, and also when `needle` is empty
/// or longer than `haystack`.
fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    let width = needle.len();
    if width == 0 || haystack.len() < width {
        return None;
    }
    let last_start = haystack.len() - width;
    (0..=last_start).find(|&start| &haystack[start..start + width] == needle)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Append `<u32 BE length><00 00 00 01><payload>` to `buf` and return the
    /// index of the start code. The length covers start code plus payload.
    fn append_partition(buf: &mut Vec<u8>, payload: &[u8]) -> usize {
        const START_CODE: [u8; 4] = [0, 0, 0, 1];
        let declared = (START_CODE.len() + payload.len()) as u32;
        buf.extend_from_slice(&declared.to_be_bytes());
        let start = buf.len();
        buf.extend_from_slice(&START_CODE);
        buf.extend_from_slice(payload);
        start
    }

    #[test]
    fn finds_largest_partition() {
        // 19-byte header: magic, header-length byte, zero padding.
        let mut data = Vec::with_capacity(64);
        data.extend_from_slice(b"wxgf");
        data.push(19);
        data.resize(19, 0);

        append_partition(&mut data, &[1, 2, 3, 4]);
        let second_offset = append_partition(&mut data, &[5, 6, 7, 8, 9, 10, 11, 12]);

        let found = largest_partition(&data).unwrap();
        assert_eq!(found.offset, second_offset);
        assert_eq!(found.size, 12);
    }

    #[test]
    fn rejects_non_wxgf() {
        let message = match largest_partition(b"not wxgf") {
            Ok(p) => panic!("unexpectedly accepted: {:?}", p),
            Err(e) => e.to_string(),
        };
        assert!(message.contains("WXGF"));
    }
}

View File

@ -13,12 +13,14 @@ pub fn cmd_extract(
attachment_id: String, attachment_id: String,
output: String, output: String,
overwrite: bool, overwrite: bool,
raw: bool,
json: bool, json: bool,
) -> Result<()> { ) -> Result<()> {
let req = Request::Extract { let req = Request::Extract {
attachment_id, attachment_id,
output, output,
overwrite, overwrite,
raw,
}; };
let resp = transport::send(req)?; let resp = transport::send(req)?;
print_value(&resp.data, &resolve(json)) print_value(&resp.data, &resolve(json))

View File

@ -305,6 +305,9 @@ enum Commands {
/// 目标已存在时覆盖 /// 目标已存在时覆盖
#[arg(long)] #[arg(long)]
overwrite: bool, overwrite: bool,
/// 原样导出解码后的附件数据WXGF/HEVC 图片不转 JPG
#[arg(long)]
raw: bool,
/// 输出 JSON默认 YAML /// 输出 JSON默认 YAML
#[arg(long)] #[arg(long)]
json: bool, json: bool,
@ -518,8 +521,9 @@ fn dispatch(cli: Cli) -> Result<()> {
attachment_id, attachment_id,
output, output,
overwrite, overwrite,
raw,
json, json,
} => extract::cmd_extract(attachment_id, output, overwrite, json), } => extract::cmd_extract(attachment_id, output, overwrite, raw, json),
Commands::Daemon { cmd } => daemon_cmd::cmd_daemon(cmd), Commands::Daemon { cmd } => daemon_cmd::cmd_daemon(cmd),
} }
} }

View File

@ -4483,6 +4483,7 @@ pub async fn q_extract(
attachment_id: &str, attachment_id: &str,
output: &str, output: &str,
overwrite: bool, overwrite: bool,
raw: bool,
) -> Result<Value> { ) -> Result<Value> {
use crate::attachment::{ use crate::attachment::{
attachment_id::AttachmentId, attachment_id::AttachmentId,
@ -4573,25 +4574,63 @@ pub async fn q_extract(
}; };
let decoded = decoder::dispatch(&dat_bytes, v2_key)?; let decoded = decoder::dispatch(&dat_bytes, v2_key)?;
let source_format = decoded.format;
let source_size = decoded.data.len();
let mut output_format = source_format.to_string();
let mut decoder_name = decoded.decoder.to_string();
let mut output_data = decoded.data;
let mut wxgf_partition_offset: Option<usize> = None;
let mut wxgf_partition_size: Option<usize> = None;
let mut wxgf_partition_ratio: Option<f64> = None;
let mut transcoder: Option<String> = None;
if source_format == "hevc" && !raw {
let jpg = decoder::wxgf::transcode_to_jpeg(&output_data)
.context("WXGF/HEVC 图片转 JPG 失败;可安装 ffmpeg 或用 wx extract --raw 导出原始 WXGF")?;
wxgf_partition_offset = Some(jpg.partition.offset);
wxgf_partition_size = Some(jpg.partition.size);
wxgf_partition_ratio = Some(jpg.partition.ratio);
transcoder = Some(format!("ffmpeg:{}", jpg.ffmpeg));
output_data = jpg.data;
output_format = "jpg".to_string();
decoder_name.push_str("+wxgf_ffmpeg");
}
// 写盘 // 写盘
std::fs::write(&output_path2, &decoded.data) std::fs::write(&output_path2, &output_data)
.with_context(|| format!("写出文件失败:{}", output_path2.display()))?; .with_context(|| format!("写出文件失败:{}", output_path2.display()))?;
// 注意:不要在这里塞 `ok: true`。dispatch 会用 Response::ok(v) 包一层, // 注意:不要在这里塞 `ok: true`。dispatch 会用 Response::ok(v) 包一层,
// Response 的 `data: Value` 字段是 #[serde(flatten)] 写出的,本 payload // Response 的 `data: Value` 字段是 #[serde(flatten)] 写出的,本 payload
// 的 `ok` 会和 Response 自带的 `ok` 在线上拼成两个同名 keyCLI 反序列化时 // 的 `ok` 会和 Response 自带的 `ok` 在线上拼成两个同名 keyCLI 反序列化时
// serde_json 直接报 "duplicate field",业务请求看上去像 daemon 解析失败。 // serde_json 直接报 "duplicate field",业务请求看上去像 daemon 解析失败。
Ok(json!({ let mut report = json!({
"kind": id_for_task.kind.as_str(), "kind": id_for_task.kind.as_str(),
"md5": resolved.md5, "md5": resolved.md5,
"dat_path": resolved.dat_path.display().to_string(), "dat_path": resolved.dat_path.display().to_string(),
"dat_size": resolved.size, "dat_size": resolved.size,
"output": output_path2.display().to_string(), "output": output_path2.display().to_string(),
"output_size": decoded.data.len(), "output_size": output_data.len(),
"format": decoded.format, "format": output_format,
"decoder": decoded.decoder, "decoder": decoder_name,
})) });
if source_format != report["format"].as_str().unwrap_or_default() {
report["source_format"] = json!(source_format);
report["source_size"] = json!(source_size);
}
if let Some(transcoder) = transcoder {
report["transcoder"] = json!(transcoder);
}
if let Some(offset) = wxgf_partition_offset {
report["wxgf_partition_offset"] = json!(offset);
}
if let Some(size) = wxgf_partition_size {
report["wxgf_partition_size"] = json!(size);
}
if let Some(ratio) = wxgf_partition_ratio {
report["wxgf_partition_ratio"] = json!(ratio);
}
Ok(report)
}) })
.await??; .await??;

View File

@ -353,7 +353,8 @@ async fn dispatch(req: Request, db: &DbCache, names: &tokio::sync::RwLock<Arc<Na
attachment_id, attachment_id,
output, output,
overwrite, overwrite,
} => match query::q_extract(db, &names_arc, &attachment_id, &output, overwrite).await { raw,
} => match query::q_extract(db, &names_arc, &attachment_id, &output, overwrite, raw).await {
Ok(v) => Response::ok(v), Ok(v) => Response::ok(v),
Err(e) => Response::err(e.to_string()), Err(e) => Response::err(e.to_string()),
}, },

View File

@ -184,6 +184,9 @@ pub enum Request {
/// 已存在时是否覆盖 /// 已存在时是否覆盖
#[serde(default)] #[serde(default)]
overwrite: bool, overwrite: bool,
/// 原样导出解码后的附件数据;图片为 WXGF/HEVC 时不调用 ffmpeg 转 JPG
#[serde(default, skip_serializing_if = "is_false")]
raw: bool,
}, },
} }