From d7bd351cecc356ee574c306242c37593af67ba1c Mon Sep 17 00:00:00 2001
From: Richard Liu <1625351+richardzone@users.noreply.github.com>
Date: Tue, 9 Jun 2026 13:12:17 +0800
Subject: [PATCH] feat: transcribe WeChat voice attachments

---
 README.md                  |  33 +-
 SKILL.md                   |  30 +-
 src/attachment/resolver.rs | 618 +++++++++++++++++++++++++++++++++++--
 src/cli/attachments.rs     |   3 +-
 src/cli/extract.rs         |   8 +-
 src/cli/mod.rs             |  57 +++-
 src/cli/transcribe.rs      | 467 ++++++++++++++++++++++++++++
 src/daemon/query.rs        | 246 ++++++++++++---
 src/ipc.rs                 |   6 +-
 9 files changed, 1373 insertions(+), 95 deletions(-)
 create mode 100644 src/cli/transcribe.rs
diff --git a/README.md b/README.md
index 1c1c7b5..af5ff9b 100644
--- a/README.md
+++ b/README.md
@@ -230,9 +230,9 @@ wx biz-articles --json | jq '.[].url'             # 下游消费 URL
 
 每条返回：`account` / `account_username` / `title` / `url` / `digest` / `cover_url` / `time` / `timestamp` / `recv_time_str`。多图文推送会展开成多行。
 
-### 附件提取（图片）
+### 附件提取（图片；语音 POC）
 
-聊天里的附件本体存在 `xwechat_files/<wxid>/msg/attach/...` 下的 `.dat` 文件，需要按消息所在 `message_resource.db` 的 md5 + 平台相关 image key 解码才能拿到原图。
+聊天里的附件本体存在本地数据库或 `xwechat_files/<wxid>/msg/attach/...` 下的资源文件。图片需要按消息所在 `message_resource.db` 的 md5 + 平台相关 image key 解码才能拿到原图；语音目前是 POC，优先从 `message/media_0.db::VoiceInfo` 导出 `voice_data`，未命中时再尝试本地文件缓存；`extract` 只做原样复制，不做转码或转文字（转文字见下文 `wx transcribe`）。
 
 ```bash
 # 1) 列出会话里的图片附件，先拿到不透明的 attachment_id
@@ -240,14 +240,37 @@ wx attachments "张三"
 wx attachments "AI群" --kind image -n 100
 wx attachments "AI群" --since 2026-04-01 --until 2026-04-15
 
-# 2) 把单个 attachment_id 解密写出去（扩展名建议保留 .jpg / .mp4 等）
+# POC: 列出语音消息资源
+wx attachments "张三" --kind voice -n 20
+
+# 2) 把单个 attachment_id 写出去（图片会解码；语音 POC 原样复制）
 wx extract <attachment_id> -o ~/Desktop/photo.jpg
+wx extract <voice_attachment_id> -o /tmp/voice.aud
 wx extract <attachment_id> -o /tmp/x.jpg --overwrite
 ```
 
-`attachments` 输出每条带：`attachment_id` / `kind` / `type` / `local_id` / `timestamp` / `time`，群聊里还有 `sender` 以及稳定身份三件套 `sender_username` / `sender_contact_display` / `sender_group_nickname`（语义同 `history` / `search` / `new-messages`：`sender_username` 是 wxid，用于两个同名成员之间的稳定区分；解析不到 wxid 时这三字段不输出）。当前 `kind` 固定为 `image`；命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。
+`attachments` 输出每条带：`attachment_id` / `kind` / `type` / `local_id` / `timestamp` / `time`，群聊里还有 `sender` 以及稳定身份三件套 `sender_username` / `sender_contact_display` / `sender_group_nickname`（语义同 `history` / `search` / `new-messages`：`sender_username` 是 wxid，用于两个同名成员之间的稳定区分；解析不到 wxid 时这三字段不输出）。默认 `kind` 是 `image`；`--kind voice` / `--kind audio` 是实验能力，依赖本地 `media_0.db` 或语音文件缓存仍可读取。
 
-`extract` 输出报告里带：`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`（实际识别出的图片格式：jpg / png / gif / webp / hevc 等）/ `decoder`（实际选用的解码器：`legacy_xor` / `v1_aes` / `v2`）。
+`extract` 输出报告里带：`output` / `output_size` / `format` / `decoder`；从本地附件文件命中时还带 `md5` / `dat_path` / `dat_size`。图片的 `format` 是实际识别出的图片格式（jpg / png / gif / webp / hevc 等），`decoder` 是 `legacy_xor` / `v1_aes` / `v2`；语音 POC 的 `decoder` 是 `media_0_voice_data` 或 `raw_copy`。
+
+#### 语音转文字 POC
+
+`wx transcribe` 会把语音 `attachment_id` 走完整本地链路：导出 WeChat 原始语音 bytes → SILK v3 decoder 转 PCM → `ffmpeg` 转 16k mono WAV → `whisper.cpp` 本地 ASR。wx-cli 不内置模型，也不下载依赖；所有工具都在本机执行。`--keep-temp` 会保留中间音频文件，目录权限保持 `0700`，但这些文件仍然是私密语音数据，只应在调试时使用。
+
+```bash
+# 依赖示例：
+# 1) kn007/silk-v3-decoder 编译得到 silk/decoder
+# 2) whisper.cpp 编译得到 whisper-cli，并下载 ggml 多语种模型
+# 3) ffmpeg 在 PATH 中
+
+wx transcribe <voice_attachment_id> \
+  --silk-decoder /path/to/silk-v3-decoder/silk/decoder \
+  --whisper-bin /path/to/whisper.cpp/build/bin/whisper-cli \
+  --model /path/to/whisper.cpp/models/ggml-large-v3-turbo.bin \
+  --language zh
+```
+
+也可用环境变量减少参数：`WX_SILK_DECODER` / `WX_WHISPER_BIN` / `WX_WHISPER_MODEL` / `WX_FFMPEG`。
 
 支持的解码档位：
 - **legacy XOR**：早期单字节 XOR，无 magic（按文件首字节探测格式自动反推）
diff --git a/SKILL.md b/SKILL.md
index 61082fe..857a0e1 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -267,24 +267,42 @@ wx biz-articles --since 2026-05-10 --json | jq '.[].url'
 
 每条返回的字段：`account` / `account_username`（`gh_*`）/ `title` / `url`（`mp.weixin.qq.com` 链接）/ `digest` / `cover_url` / `time` + `timestamp`（文章发布时间）/ `recv_time_str` + `recv_time`（微信接收推送的时间）。多图文推送会展开为多行。
 
-### 附件提取（图片）
+### 附件提取（图片；语音 POC）
 
-聊天里的图片本体在 `xwechat_files/<wxid>/msg/attach/...` 下加密存储（`.dat`），需要按消息所在 `message_resource.db` 的 md5 + 平台相关 image key 才能解码。两步走：
+聊天里的附件本体存在本地数据库或 `xwechat_files/<wxid>/msg/attach/...` 下的资源文件。图片需要按消息所在 `message_resource.db` 的 md5 + 平台相关 image key 解码才能拿到原图；语音目前是 POC，优先从 `message/media_0.db::VoiceInfo` 导出 `voice_data`，未命中时再尝试本地文件缓存；`extract` 只做原样复制，不做转码或转文字（转文字见下文 `wx transcribe`）。
 
 ```bash
-# 1) 先列出图片附件，拿到不透明的 attachment_id
+# 1) 先列出附件，拿到不透明的 attachment_id
 wx attachments "张三"
 wx attachments "AI群" --kind image -n 100
 wx attachments "AI群" --since 2026-04-01 --until 2026-04-15
 
-# 2) 用 attachment_id 把单个资源解密写到指定路径
+# POC: 列出语音消息资源
+wx attachments "张三" --kind voice -n 20
+
+# 2) 用 attachment_id 把单个资源写到指定路径
 wx extract <attachment_id> -o ~/Desktop/photo.jpg
+wx extract <voice_attachment_id> -o /tmp/voice.aud
 wx extract <attachment_id> -o /tmp/x.jpg --overwrite
 ```
 
-`attachments` 输出每条带：`attachment_id` / `kind`（当前固定 `image`）/ `type` / `local_id` / `timestamp` / `time`，群聊里另带 `sender` 和稳定身份三件套（同上文）。命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。
+`attachments` 输出每条带：`attachment_id` / `kind` / `type` / `local_id` / `timestamp` / `time`，群聊里另带 `sender` 和稳定身份三件套（同上文）。默认 `kind` 是 `image`；`--kind voice` / `--kind audio` 是 POC，优先从 `message/media_0.db::VoiceInfo` 导出 `voice_data`，未命中时再尝试本地文件缓存，只做原样复制，不做转码或转文字。
 
-`extract` 报告里带：`md5` / `dat_path` / `dat_size` / `output` / `output_size` / `format`（实际识别出的图片格式：jpg / png / gif / webp / hevc 等）/ `decoder`（实际选用的解码器：`legacy_xor` / `v1_aes` / `v2`）。
+`extract` 报告里带：`output` / `output_size` / `format` / `decoder`；从本地附件文件命中时还带 `md5` / `dat_path` / `dat_size`。图片的 `decoder` 是 `legacy_xor` / `v1_aes` / `v2`；语音 POC 的 `decoder` 是 `media_0_voice_data` 或 `raw_copy`。
+
+#### 语音转文字 POC
+
+`wx transcribe` 会把语音 `attachment_id` 走完整本地链路：导出 WeChat 原始语音 bytes → SILK v3 decoder 转 PCM → `ffmpeg` 转 16k mono WAV → `whisper.cpp` 本地 ASR。wx-cli 不内置模型，也不下载依赖；所有工具都在本机执行。`--keep-temp` 会保留中间音频文件，目录权限保持 `0700`，但这些文件仍然是私密语音数据，只应在调试时使用。
+
+```bash
+wx transcribe <voice_attachment_id> \
+  --silk-decoder /path/to/silk-v3-decoder/silk/decoder \
+  --whisper-bin /path/to/whisper.cpp/build/bin/whisper-cli \
+  --model /path/to/whisper.cpp/models/ggml-large-v3-turbo.bin \
+  --language zh
+```
+
+也可用环境变量减少参数：`WX_SILK_DECODER` / `WX_WHISPER_BIN` / `WX_WHISPER_MODEL` / `WX_FFMPEG`。
 
 支持的解码档位：
 - **legacy XOR**：早期单字节 XOR，无 magic（按文件首字节探测格式自动反推）
diff --git a/src/attachment/resolver.rs b/src/attachment/resolver.rs
index 8db4f41..282eee9 100644
--- a/src/attachment/resolver.rs
+++ b/src/attachment/resolver.rs
@@ -17,9 +17,10 @@
 use anyhow::{anyhow, Context, Result};
 use chrono::TimeZone;
 use rusqlite::Connection;
+use std::collections::HashSet;
 use std::path::{Path, PathBuf};
 
-use super::AttachmentId;
+use super::{AttachmentId, AttachmentKind};
 
 /// 单条 attachment 在资源库 + 本地 attach 树下的解析结果。
 #[derive(Debug, Clone)]
@@ -40,6 +41,14 @@ pub struct AttachmentMetadata {
     pub md5: String,
 }
 
+/// One voice resource resolved from `message/media_0.db::VoiceInfo`.
+#[derive(Debug, Clone)]
+pub struct ResolvedVoiceMedia {
+    pub data: Vec<u8>, // `voice_data` of every matching row, concatenated in chunk-index order
+    pub chunks: usize, // number of `VoiceInfo` rows that were concatenated into `data`
+    pub svr_id: Option<i64>, // first readable `svr_id` among those rows; `None` if there is none
+}
+
 /// 用 `(chat, local_id)` 查 message_resource.db 拿 file md5。
 ///
 /// 调用方传已经解密好的 `message_resource.db` 路径（由 daemon 的 `DBCache` 准备）。
@@ -87,8 +96,8 @@ pub fn lookup_md5_blocking(
         )
         .ok();
 
-    let packed: Option<Vec<u8>> = packed_exact.or_else(|| conn
-        .query_row(
+    let packed: Option<Vec<u8>> = packed_exact.or_else(|| {
+        conn.query_row(
             "SELECT packed_info FROM MessageResourceInfo
              WHERE chat_id = ?1
                AND message_local_id = ?2
@@ -98,7 +107,8 @@ pub fn lookup_md5_blocking(
             rusqlite::params![chat_id, local_id, msg_local_type_lo32],
             |row| row.get(0),
         )
-        .ok());
+        .ok()
+    });
 
     let Some(blob) = packed else {
         return Ok(None);
@@ -106,6 +116,170 @@ pub fn lookup_md5_blocking(
     Ok(extract_md5_from_packed_info(&blob).map(|md5| AttachmentMetadata { md5 }))
 }
 
+/// Reads a voice BLOB from the `VoiceInfo` table of `message/media_0.db`.
+///
+/// WeChat 4.x voice messages do not always reach `message_resource.db`; a common layout is
+/// `media_0.db::VoiceInfo(local_id, create_time, voice_data, data_index)`.
+/// `data_index` leaves room for chunking, so all chunks of one message are concatenated in `data_index` order.
+pub fn lookup_voice_media_blocking(
+    media_db_path: &Path,
+    chat: &str,
+    local_id: i64,
+    create_time: i64,
+) -> Result<Option<ResolvedVoiceMedia>> {
+    let conn = Connection::open_with_flags(
+        media_db_path,
+        rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY | rusqlite::OpenFlags::SQLITE_OPEN_URI,
+    )
+    .with_context(|| format!("打开 media_0.db {:?}", media_db_path))?;
+
+    let has_voice_info: bool = conn
+        .query_row(
+            "SELECT 1 FROM sqlite_master WHERE type='table' AND name='VoiceInfo'",
+            [],
+            |_| Ok(()),
+        )
+        .is_ok(); // a failing probe query is treated the same as a missing table
+    if !has_voice_info {
+        return Ok(None);
+    }
+
+    let columns = table_columns(&conn, "VoiceInfo")?; // schema varies, so every column is probed first
+    if !columns.contains("voice_data") {
+        return Ok(None);
+    }
+    let data_index_expr = if columns.contains("data_index") {
+        "CAST(COALESCE(data_index, '0') AS INTEGER)" // a NULL data_index counts as chunk 0
+    } else {
+        "0" // no chunk column: every row is chunk 0
+    };
+    let svr_id_expr = if columns.contains("svr_id") {
+        "svr_id"
+    } else {
+        "NULL" // keeps the SELECT shape fixed; read back as `None`
+    };
+
+    let mut rows = Vec::new();
+
+    if columns.contains("local_id") { // schema keyed by (chat_name_id, local_id[, create_time])
+        if columns.contains("chat_name_id") {
+            let chat_id: Option<i64> = conn
+                .query_row(
+                    "SELECT rowid FROM Name2Id WHERE user_name = ?1",
+                    [chat],
+                    |row| row.get(0),
+                )
+                .ok(); // any lookup error (including a missing Name2Id table) becomes `None`
+
+            let Some(chat_id) = chat_id else {
+                return Ok(None); // unknown chat: stop here, the msgid-keyed path is not tried
+            };
+
+            if columns.contains("create_time") {
+                rows = query_voice_rows(
+                    &conn,
+                    "chat_name_id = ?1 AND local_id = ?2 AND create_time = ?3",
+                    rusqlite::params![chat_id, local_id, create_time],
+                    data_index_expr,
+                    svr_id_expr,
+                )?;
+            }
+            if rows.is_empty() && !columns.contains("create_time") { // time filter dropped only if the column is absent
+                rows = query_voice_rows(
+                    &conn,
+                    "chat_name_id = ?1 AND local_id = ?2",
+                    rusqlite::params![chat_id, local_id],
+                    data_index_expr,
+                    svr_id_expr,
+                )?;
+            }
+        }
+    }
+
+    if rows.is_empty() && columns.contains("msgid") { // schema keyed by (user_name, msgid[, msgtime])
+        if !columns.contains("user_name") {
+            return Ok(None); // cannot scope the lookup to one chat without user_name
+        }
+        if columns.contains("msgtime") {
+            rows = query_voice_rows(
+                &conn,
+                "user_name = ?1 AND msgid = ?2 AND msgtime = ?3",
+                rusqlite::params![chat, local_id, create_time],
+                data_index_expr,
+                svr_id_expr,
+            )?;
+        }
+        if rows.is_empty() && !columns.contains("msgtime") { // time filter dropped only if the column is absent
+            rows = query_voice_rows(
+                &conn,
+                "user_name = ?1 AND msgid = ?2",
+                rusqlite::params![chat, local_id],
+                data_index_expr,
+                svr_id_expr,
+            )?;
+        }
+    }
+
+    if rows.is_empty() {
+        return Ok(None);
+    }
+
+    rows.sort_by_key(|row| row.0); // stable sort by chunk index; the query already orders the rows
+    let svr_id = rows.iter().find_map(|row| row.2); // first row that carries a readable svr_id
+    let chunks = rows.len();
+    let total_len: usize = rows.iter().map(|row| row.1.len()).sum();
+    if total_len == 0 {
+        return Ok(None); // rows matched but carried no bytes: reported as not found
+    }
+    let mut data = Vec::with_capacity(total_len);
+    for (_idx, chunk, _svr_id) in rows {
+        data.extend_from_slice(&chunk);
+    }
+
+    Ok(Some(ResolvedVoiceMedia {
+        data,
+        chunks,
+        svr_id,
+    }))
+}
+
+/// Returns the set of column names that `PRAGMA table_info` reports for `table`.
+fn table_columns(conn: &Connection, table: &str) -> Result<HashSet<String>> {
+    let pragma = format!("PRAGMA table_info({table})");
+    let mut statement = conn.prepare(&pragma)?;
+    let names = statement.query_map([], |info| info.get::<_, String>(1))?;
+    Ok(names.collect::<rusqlite::Result<HashSet<String>>>()?)
+}
+
+/// Runs one `VoiceInfo` lookup and yields `(chunk index, voice bytes, svr_id)` per matching row.
+/// A column that cannot be read as the expected type degrades to `0` / empty bytes / `None`.
+fn query_voice_rows<P: rusqlite::Params>(
+    conn: &Connection,
+    where_clause: &str,
+    params: P,
+    data_index_expr: &str,
+    svr_id_expr: &str,
+) -> Result<Vec<(i64, Vec<u8>, Option<i64>)>> {
+    let sql = format!(
+        "SELECT {data_index_expr} AS voice_index, voice_data, {svr_id_expr} AS voice_svr_id
+         FROM VoiceInfo
+         WHERE {where_clause}
+         ORDER BY voice_index, rowid"
+    );
+    let mut statement = conn.prepare(&sql)?;
+    let mapped = statement.query_map(params, |record| {
+        let chunk_index: i64 = record.get(0).unwrap_or(0);
+        let voice_bytes: Vec<u8> = record.get(1).unwrap_or_default();
+        let svr_id: Option<i64> = record.get::<_, i64>(2).ok();
+        Ok((chunk_index, voice_bytes, svr_id))
+    })?;
+    let mut collected = Vec::new();
+    for item in mapped {
+        collected.push(item?);
+    }
+    Ok(collected)
+}
+
 /// 从 `MessageResourceInfo.packed_info` (protobuf) 提取 32 字节 ASCII hex md5。
 ///
 /// 主路径：搜 4 字节 marker `12 22 0a 20`（field=2 LEN, length=34, sub field=1 LEN, length=32），
@@ -145,12 +319,10 @@ fn find_subslice(haystack: &[u8], needle: &[u8]) -> Option<usize> {
     if needle.is_empty() || needle.len() > haystack.len() {
         return None;
     }
-    haystack
-        .windows(needle.len())
-        .position(|w| w == needle)
+    haystack.windows(needle.len()).position(|w| w == needle)
 }
 
-/// 在 `<attach_root>/<md5(chat)>/<YYYY-MM>/Img/<md5>[_t|_h].dat` 下找文件。
+/// 在 `<attach_root>/<md5(chat)>/<YYYY-MM>/Img/<md5>[_t|_h].dat` 下找图片文件。
 ///
 /// 优先级：full > `_h`（HD thumbnail）> `_t`（thumbnail）。返回最优的一个；
 /// 找不到返回 None。
@@ -163,18 +335,40 @@ pub fn find_dat_file(
     chat: &str,
     file_md5: &str,
     create_time: i64,
+) -> Option<PathBuf> {
+    find_media_file(
+        attach_root,
+        chat,
+        file_md5,
+        create_time,
+        AttachmentKind::Image,
+    )
+}
+
+/// 在本地附件树中定位指定 kind 的媒体文件。
+///
+/// image 走已经验证过的 `Img/<md5>[_h|_t].dat` 规则；voice 是 POC 路径，优先试
+/// `Voice` / `Audio` / `Aud` 目录里的 md5 同名文件，最后在 `msg/attach` 下按 md5 前缀递归兜底。
+pub fn find_media_file(
+    attach_root: &Path,
+    chat: &str,
+    file_md5: &str,
+    create_time: i64,
+    kind: AttachmentKind,
 ) -> Option<PathBuf> {
     let chat_hash = format!("{:x}", md5::compute(chat.as_bytes()));
     let chat_dir = attach_root.join(&chat_hash);
     if !chat_dir.is_dir() {
-        return None;
+        return match kind {
+            AttachmentKind::Voice => find_by_md5_recursive(attach_root, file_md5, kind),
+            _ => None,
+        };
     }
 
     // 第一步：试 create_time 当月 + 前后各一个月（共 3 个候选目录）
     let candidates_ym: Vec<String> = three_month_candidates(create_time);
     for ym in &candidates_ym {
-        let img_dir = chat_dir.join(ym).join("Img");
-        if let Some(p) = pick_best_in_img_dir(&img_dir, file_md5) {
+        if let Some(p) = pick_best_in_month_dir(&chat_dir.join(ym), file_md5, kind) {
             return Some(p);
         }
     }
@@ -189,12 +383,37 @@ pub fn find_dat_file(
     // 已经试过的 3 个候选可以跳过，但成本极小；保留全量扫
     all_months.sort();
     for month_dir in all_months {
-        let img_dir = month_dir.join("Img");
-        if let Some(p) = pick_best_in_img_dir(&img_dir, file_md5) {
+        if let Some(p) = pick_best_in_month_dir(&month_dir, file_md5, kind) {
             return Some(p);
         }
     }
-    None
+
+    // POC fallback：Mac 4.x 的语音路径未完全验证。若上面的目录名猜错，仍按资源 md5
+    // 在 attach 树下递归找一次，避免因为 `Voice`/`Audio` 布局差异直接失败。
+    match kind {
+        AttachmentKind::Voice => find_by_md5_recursive(attach_root, file_md5, kind),
+        _ => None,
+    }
+}
+
+/// Looks for `file_md5` in the per-kind subdirectory (or subdirectories) of one month directory.
+/// Returns the first hit, or `None` when nothing matches.
+fn pick_best_in_month_dir(
+    month_dir: &Path,
+    file_md5: &str,
+    kind: AttachmentKind,
+) -> Option<PathBuf> {
+    // Every kind other than image goes through the generic `pick_best_media_file` lookup.
+    let lookup = |media_dir: &Path| pick_best_media_file(media_dir, file_md5, kind);
+    match kind {
+        AttachmentKind::Image => pick_best_in_img_dir(&month_dir.join("Img"), file_md5),
+        // Several candidate directory names are probed for voice, in this fixed order.
+        AttachmentKind::Voice => ["Voice", "Audio", "Aud"]
+            .iter()
+            .find_map(|subdir| lookup(&month_dir.join(subdir))),
+        AttachmentKind::Video => lookup(&month_dir.join("Video")),
+        AttachmentKind::File => lookup(month_dir),
+    }
+}
 
 fn pick_best_in_img_dir(img_dir: &Path, file_md5: &str) -> Option<PathBuf> {
@@ -216,6 +435,94 @@ fn pick_best_in_img_dir(img_dir: &Path, file_md5: &str) -> Option<PathBuf> {
     None
 }
 
+/// Picks the file in `media_dir` that best matches `file_md5`, or `None` if nothing matches.
+///
+/// Exact names from `exact_media_names` win, in that order; otherwise the largest regular file
+/// whose name starts with `file_md5` is returned.
+fn pick_best_media_file(media_dir: &Path, file_md5: &str, kind: AttachmentKind) -> Option<PathBuf> {
+    // An empty md5 is a prefix of every file name and must not match arbitrary files.
+    if file_md5.is_empty() || !media_dir.is_dir() {
+        return None;
+    }
+
+    let exact_hit = exact_media_names(file_md5, kind)
+        .into_iter()
+        .map(|name| media_dir.join(name))
+        .find(|path| path.is_file());
+    if exact_hit.is_some() {
+        return exact_hit;
+    }
+
+    let mut candidates: Vec<PathBuf> = media_dir
+        .read_dir()
+        .ok()?
+        .filter_map(|e| e.ok().map(|e| e.path()))
+        .filter(|p| {
+            let name = p.file_name().and_then(|s| s.to_str());
+            p.is_file() && name.map_or(false, |name| name.starts_with(file_md5))
+        })
+        .collect();
+    // Stat each candidate once; the sort is stable, so equal sizes keep directory order.
+    candidates.sort_by_cached_key(|p| std::cmp::Reverse(p.metadata().map_or(0, |m| m.len())));
+    candidates.into_iter().next()
+}
+
+/// Lists the exact file names to probe for `file_md5`, in probing order.
+///
+/// Image names are `<md5>.dat`, `<md5>_h.dat`, `<md5>_t.dat`; voice and video names are the md5
+/// followed by a known extension; a generic file is just the bare md5.
+fn exact_media_names(file_md5: &str, kind: AttachmentKind) -> Vec<String> {
+    // Suffixes appended to the md5; "" stands for a bare md5 file name without extension.
+    let suffixes: &[&str] = match kind {
+        AttachmentKind::Image => &[".dat", "_h.dat", "_t.dat"],
+        AttachmentKind::Voice => &["", ".aud", ".amr", ".silk", ".wav", ".m4a", ".mp3", ".dat"],
+        AttachmentKind::Video => &[".mp4", ".mov", ".m4v", ".dat"],
+        AttachmentKind::File => &[""],
+    };
+    let mut names = Vec::with_capacity(suffixes.len());
+    for suffix in suffixes {
+        names.push(format!("{file_md5}{suffix}"));
+    }
+    names
+}
+
+/// Walks `root` recursively for the largest regular file whose name starts with `file_md5`.
+///
+/// Unreadable directories are skipped. Returns `None` if `root` is not a directory, `file_md5`
+/// is empty, or nothing matches.
+fn find_by_md5_recursive(root: &Path, file_md5: &str, kind: AttachmentKind) -> Option<PathBuf> {
+    // An empty md5 is a prefix of every file name and must not match arbitrary files.
+    if file_md5.is_empty() || !root.is_dir() {
+        return None;
+    }
+    // Built once up front rather than re-allocated for every file visited.
+    let exact_names = exact_media_names(file_md5, kind);
+    let mut stack = vec![root.to_path_buf()];
+    let mut matches = Vec::new();
+    while let Some(dir) = stack.pop() {
+        let Ok(entries) = std::fs::read_dir(&dir) else {
+            continue;
+        };
+        for entry in entries.filter_map(|e| e.ok()) {
+            let path = entry.path();
+            if path.is_dir() {
+                stack.push(path);
+                continue;
+            }
+            let Some(name) = path.file_name().and_then(|s| s.to_str()) else {
+                continue;
+            };
+            let wanted = exact_names.iter().any(|n| n == name) || name.starts_with(file_md5);
+            if wanted && path.is_file() {
+                matches.push(path);
+            }
+        }
+    }
+    // Stat each match once; the sort is stable, so equal sizes keep traversal order.
+    matches.sort_by_cached_key(|p| std::cmp::Reverse(p.metadata().map_or(0, |m| m.len())));
+    matches.into_iter().next()
+}
+
 fn three_month_candidates(unix_ts: i64) -> Vec<String> {
     use chrono::{Datelike, Duration};
     let dt = match chrono::Local.timestamp_opt(unix_ts, 0).single() {
@@ -268,19 +575,26 @@ pub fn resolve_blocking(
             )
         })?;
 
-    let dat_path = find_dat_file(attach_root, &id.chat, &meta.md5, id.create_time).ok_or_else(
-        || {
-            anyhow!(
-                "找不到本地 .dat（md5={} chat={} create_time={}）— 微信可能尚未下载该附件，或附件已被清理",
-                meta.md5,
-                id.chat,
-                id.create_time
-            )
-        },
-    )?;
+    let dat_path =
+        find_media_file(attach_root, &id.chat, &meta.md5, id.create_time, id.kind).ok_or_else(
+            || {
+                anyhow!(
+                    "找不到本地附件文件（kind={} md5={} chat={} create_time={}）— 微信可能尚未下载该附件，或附件已被清理",
+                    id.kind.as_str(),
+                    meta.md5,
+                    id.chat,
+                    id.create_time
+                )
+            },
+        )?;
     let size = std::fs::metadata(&dat_path).map(|m| m.len()).unwrap_or(0);
 
-    Ok(ResolvedAttachment { id: id.clone(), md5: meta.md5, dat_path, size })
+    Ok(ResolvedAttachment {
+        id: id.clone(),
+        md5: meta.md5,
+        dat_path,
+        size,
+    })
 }
 
 #[cfg(test)]
@@ -334,11 +648,8 @@ mod tests {
         let dir = tempdir_for_test();
         let db_path = dir.join("message_resource.db");
         let conn = Connection::open(&db_path).unwrap();
-        conn.execute(
-            "CREATE TABLE ChatName2Id (user_name TEXT)",
-            [],
-        )
-        .unwrap();
+        conn.execute("CREATE TABLE ChatName2Id (user_name TEXT)", [])
+            .unwrap();
         conn.execute(
             "INSERT INTO ChatName2Id (rowid, user_name) VALUES (1, 'room@chatroom')",
             [],
@@ -392,6 +703,208 @@ mod tests {
         assert_eq!(new.md5, "22222222222222222222222222222222");
     }
 
+    #[test]
+    fn lookup_voice_media_reads_chunks_from_media_db() {
+        let dir = tempdir_for_test();
+        let db_path = dir.join("media_0.db");
+        let conn = Connection::open(&db_path).unwrap();
+        conn.execute("CREATE TABLE Name2Id (user_name TEXT)", [])
+            .unwrap();
+        conn.execute(
+            "INSERT INTO Name2Id (rowid, user_name) VALUES (9, 'room@chatroom')",
+            [],
+        )
+        .unwrap();
+        conn.execute(
+            "CREATE TABLE VoiceInfo (
+                chat_name_id INTEGER,
+                create_time INTEGER,
+                local_id INTEGER,
+                svr_id INTEGER,
+                voice_data BLOB,
+                data_index TEXT DEFAULT '0'
+            )",
+            [],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT INTO VoiceInfo
+             (chat_name_id, create_time, local_id, svr_id, voice_data, data_index)
+             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+            rusqlite::params![9i64, 2000i64, 7i64, 123i64, b"two", "2"], // inserted first, higher data_index
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT INTO VoiceInfo
+             (chat_name_id, create_time, local_id, svr_id, voice_data, data_index)
+             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+            rusqlite::params![9i64, 2000i64, 7i64, 123i64, b"one", "1"], // inserted second, lower data_index
+        )
+        .unwrap();
+
+        let media = lookup_voice_media_blocking(&db_path, "room@chatroom", 7, 2000)
+            .unwrap()
+            .unwrap();
+        assert_eq!(media.data, b"onetwo"); // concatenated by data_index, not by insertion order
+        assert_eq!(media.chunks, 2);
+        assert_eq!(media.svr_id, Some(123));
+    }
+
+    #[test]
+    fn lookup_voice_media_keeps_rows_scoped_to_chat() {
+        let dir = tempdir_for_test();
+        let db_path = dir.join("media_0.db");
+        let conn = Connection::open(&db_path).unwrap();
+        conn.execute("CREATE TABLE Name2Id (user_name TEXT)", [])
+            .unwrap();
+        conn.execute(
+            "INSERT INTO Name2Id (rowid, user_name) VALUES (9, 'room@chatroom')",
+            [],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT INTO Name2Id (rowid, user_name) VALUES (10, 'other@chatroom')",
+            [],
+        )
+        .unwrap();
+        conn.execute(
+            "CREATE TABLE VoiceInfo (
+                chat_name_id INTEGER,
+                create_time INTEGER,
+                local_id INTEGER,
+                svr_id INTEGER,
+                voice_data BLOB,
+                data_index TEXT DEFAULT '0'
+            )",
+            [],
+        )
+        .unwrap();
+        for (chat_id, data) in [(10i64, b"wrong".as_slice()), (9i64, b"right".as_slice())] {
+            conn.execute(
+                "INSERT INTO VoiceInfo
+                 (chat_name_id, create_time, local_id, svr_id, voice_data, data_index)
+                 VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+                rusqlite::params![chat_id, 2000i64, 7i64, 123i64, data, "0"], // rows differ only in chat_name_id
+            )
+            .unwrap();
+        }
+
+        let media = lookup_voice_media_blocking(&db_path, "room@chatroom", 7, 2000)
+            .unwrap()
+            .unwrap();
+        assert_eq!(media.data, b"right"); // the row stored under chat id 10 must not be returned
+    }
+
+    #[test]
+    fn lookup_voice_media_uses_create_time_to_disambiguate_reused_local_id() {
+        let dir = tempdir_for_test();
+        let db_path = dir.join("media_0.db");
+        let conn = Connection::open(&db_path).unwrap();
+        conn.execute("CREATE TABLE Name2Id (user_name TEXT)", [])
+            .unwrap();
+        conn.execute(
+            "INSERT INTO Name2Id (rowid, user_name) VALUES (9, 'room@chatroom')",
+            [],
+        )
+        .unwrap();
+        conn.execute(
+            "CREATE TABLE VoiceInfo (
+                chat_name_id INTEGER,
+                create_time INTEGER,
+                local_id INTEGER,
+                svr_id INTEGER,
+                voice_data BLOB,
+                data_index TEXT DEFAULT '0'
+            )",
+            [],
+        )
+        .unwrap();
+        for (create_time, data) in [(1000i64, b"old".as_slice()), (2000i64, b"new".as_slice())] {
+            conn.execute(
+                "INSERT INTO VoiceInfo
+                 (chat_name_id, create_time, local_id, svr_id, voice_data, data_index)
+                 VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+                rusqlite::params![9i64, create_time, 7i64, 123i64, data, "0"], // rows differ only in create_time
+            )
+            .unwrap();
+        }
+
+        let media = lookup_voice_media_blocking(&db_path, "room@chatroom", 7, 2000)
+            .unwrap()
+            .unwrap();
+        assert_eq!(media.data, b"new"); // create_time 2000 selects the second row only
+        assert!(
+            lookup_voice_media_blocking(&db_path, "room@chatroom", 7, 3000) // no row has create_time 3000
+                .unwrap()
+                .is_none()
+        );
+    }
+
+    #[test]
+    fn lookup_voice_media_reads_legacy_schema_without_chunk_columns() {
+        let dir = tempdir_for_test();
+        let db_path = dir.join("media_0.db");
+        let conn = Connection::open(&db_path).unwrap();
+        conn.execute( // msgid-keyed schema: no Name2Id table, no data_index, no svr_id
+            "CREATE TABLE VoiceInfo (
+                user_name TEXT,
+                msgid INTEGER,
+                msgtime INTEGER,
+                voice_data BLOB
+            )",
+            [],
+        )
+        .unwrap();
+        conn.execute(
+            "INSERT INTO VoiceInfo (user_name, msgid, msgtime, voice_data)
+             VALUES (?1, ?2, ?3, ?4)",
+            rusqlite::params!["room@chatroom", 7i64, 2000i64, b"voice"],
+        )
+        .unwrap();
+
+        let media = lookup_voice_media_blocking(&db_path, "room@chatroom", 7, 2000)
+            .unwrap()
+            .unwrap();
+        assert_eq!(media.data, b"voice");
+        assert_eq!(media.chunks, 1); // a single row counts as one chunk
+        assert_eq!(media.svr_id, None); // the table has no svr_id column
+    }
+
+    #[test]
+    fn lookup_voice_media_legacy_schema_uses_msgtime_to_disambiguate_reused_msgid() {
+        let dir = tempdir_for_test();
+        let db_path = dir.join("media_0.db");
+        let conn = Connection::open(&db_path).unwrap();
+        conn.execute(
+            "CREATE TABLE VoiceInfo (
+                user_name TEXT,
+                msgid INTEGER,
+                msgtime INTEGER,
+                voice_data BLOB
+            )",
+            [],
+        )
+        .unwrap();
+        for (msgtime, data) in [(1000i64, b"old".as_slice()), (2000i64, b"new".as_slice())] {
+            conn.execute(
+                "INSERT INTO VoiceInfo (user_name, msgid, msgtime, voice_data)
+                 VALUES (?1, ?2, ?3, ?4)",
+                rusqlite::params!["room@chatroom", 7i64, msgtime, data], // rows differ only in msgtime
+            )
+            .unwrap();
+        }
+
+        let media = lookup_voice_media_blocking(&db_path, "room@chatroom", 7, 2000)
+            .unwrap()
+            .unwrap();
+        assert_eq!(media.data, b"new"); // msgtime 2000 selects the second row only
+        assert!(
+            lookup_voice_media_blocking(&db_path, "room@chatroom", 7, 3000) // no row has msgtime 3000
+                .unwrap()
+                .is_none()
+        );
+    }
+
     #[test]
     fn three_month_candidates_includes_prev_curr_next() {
         // 2025-08-15 (mid-month) → 2025-07, 2025-08, 2025-09
@@ -415,17 +928,58 @@ mod tests {
         std::fs::write(img.join(format!("{}_h.dat", md5)), b"hd").unwrap();
         // 只有 _t / _h 时取 _h
         assert_eq!(
-            pick_best_in_img_dir(&img, md5).unwrap().file_name().unwrap(),
+            pick_best_in_img_dir(&img, md5)
+                .unwrap()
+                .file_name()
+                .unwrap(),
             format!("{}_h.dat", md5).as_str()
         );
         // 加 full 后取 full
         std::fs::write(img.join(format!("{}.dat", md5)), b"full").unwrap();
         assert_eq!(
-            pick_best_in_img_dir(&img, md5).unwrap().file_name().unwrap(),
+            pick_best_in_img_dir(&img, md5)
+                .unwrap()
+                .file_name()
+                .unwrap(),
             format!("{}.dat", md5).as_str()
         );
     }
 
+    #[test]
+    fn find_media_file_finds_voice_by_month_voice_dir() {
+        let tmp = tempdir_for_test();
+        let chat = "room@chatroom";
+        let chat_hash = format!("{:x}", md5::compute(chat.as_bytes()));
+        let ts = chrono::Local
+            .with_ymd_and_hms(2026, 6, 9, 12, 0, 0)
+            .unwrap()
+            .timestamp();
+        let voice_dir = tmp.join(chat_hash).join("2026-06").join("Voice"); // <root>/<md5(chat)>/<YYYY-MM>/Voice
+        std::fs::create_dir_all(&voice_dir).unwrap();
+        let md5 = "00112233445566778899aabbccddeeff";
+        std::fs::write(voice_dir.join(format!("{}.aud", md5)), b"voice").unwrap();
+
+        let found = find_media_file(&tmp, chat, md5, ts, AttachmentKind::Voice).unwrap();
+        assert_eq!(found.file_name().unwrap(), format!("{}.aud", md5).as_str()); // the `<md5>.aud` file is found
+    }
+
+    #[test]
+    fn find_media_file_voice_recurses_when_layout_unknown() {
+        let tmp = tempdir_for_test();
+        let chat = "room@chatroom";
+        let ts = chrono::Local
+            .with_ymd_and_hms(2026, 6, 9, 12, 0, 0)
+            .unwrap()
+            .timestamp();
+        let odd_dir = tmp.join("somehash").join("2026-06").join("NotVoice"); // not md5(chat), not a probed subdir
+        std::fs::create_dir_all(&odd_dir).unwrap();
+        let md5 = "abcdefabcdefabcdefabcdefabcdefab";
+        std::fs::write(odd_dir.join(format!("{}.silk", md5)), b"voice").unwrap();
+
+        let found = find_media_file(&tmp, chat, md5, ts, AttachmentKind::Voice).unwrap();
+        assert_eq!(found.file_name().unwrap(), format!("{}.silk", md5).as_str()); // only the recursive fallback finds it
+    }
+
     fn tempdir_for_test() -> PathBuf {
         let pid = std::process::id();
         let nanos = std::time::SystemTime::now()
diff --git a/src/cli/attachments.rs b/src/cli/attachments.rs
index 87e4434..c693d81 100644
--- a/src/cli/attachments.rs
+++ b/src/cli/attachments.rs
@@ -8,7 +8,8 @@ use crate::ipc::Request;
 /// `wx attachments` — 列出指定会话的附件消息（默认 image，可多选）。
 ///
 /// 输出每条 `attachment_id`，再传给 `wx extract` 才真正读 message_resource.db
-/// 与本地 .dat 解码。这一步只查 `Msg_<chat>` 表，几千条群聊也能秒返。
+/// 与本地资源文件。POC 中 image 解码，voice/audio 原样复制；这一步只查
+/// `Msg_<chat>` 表，几千条群聊也能秒返。
 pub fn cmd_attachments(
     chat: String,
     kinds: Vec<String>,
diff --git a/src/cli/extract.rs b/src/cli/extract.rs
index a0eba0d..d0dd41a 100644
--- a/src/cli/extract.rs
+++ b/src/cli/extract.rs
@@ -1,14 +1,14 @@
 use anyhow::Result;
 
-use crate::ipc::Request;
 use super::output::{print_value, resolve};
 use super::transport;
+use crate::ipc::Request;
 
-/// `wx extract` — 把单个 `attachment_id` 对应的资源解密写到指定路径。
+/// `wx extract` — 把单个 `attachment_id` 对应的资源写到指定路径。
 ///
 /// daemon 端：解析 `attachment_id` → 查 `message_resource.db` 拿 file md5 →
-/// 在 `<wxchat_base>/msg/attach/...` 找 .dat → 按 magic 分发到 v1/v2 解码器 →
-/// 写出真实图片/文件。
+/// 在 `<wxchat_base>/msg/attach/...` 找资源文件。image 按 magic 分发到 v1/v2
+/// 解码器，voice/audio POC 原样复制。
 pub fn cmd_extract(
     attachment_id: String,
     output: String,
diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index b4d6cf4..c6d3ab3 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -16,12 +16,14 @@ pub mod sns_feed;
 pub mod sns_notifications;
 pub mod sns_search;
 pub mod stats;
+pub mod transcribe;
 pub mod transport;
 pub mod unread;
 
 use self::output::OutputOpts;
 use anyhow::Result;
 use clap::{Parser, Subcommand};
+use std::path::PathBuf;
 
 /// wx — 微信本地数据 CLI
 #[derive(Parser)]
@@ -271,13 +273,13 @@ enum Commands {
         #[arg(long)]
         json: bool,
     },
-    /// 列出某会话的图片附件，返回不透明 attachment_id
+    /// 列出某会话的附件，返回不透明 attachment_id
     Attachments {
         /// 会话名称（联系人显示名 / wxid / @chatroom username 都可以）
         chat: String,
-        /// 类型（当前仅支持 image）
+        /// 类型（POC 支持 image / voice）
         #[arg(long = "kind", value_name = "KIND",
-              value_parser = ["image", "img"])]
+              value_parser = ["image", "img", "voice", "audio"])]
         kinds: Vec<String>,
         /// 显示数量
         #[arg(short = 'n', long, default_value = "50")]
@@ -295,11 +297,11 @@ enum Commands {
         #[arg(long)]
         json: bool,
     },
-    /// 把单个 attachment_id 对应的资源解密写到指定文件路径
+    /// 把单个 attachment_id 对应的资源写到指定文件路径
     Extract {
         /// 由 `wx attachments` 输出的不透明 ID（base64url 字符串）
         attachment_id: String,
-        /// 输出文件路径（绝对或相对当前工作目录均可；扩展名建议保留为 .jpg 等）
+        /// 输出文件路径（图片建议 .jpg/.png；语音 POC 建议先保留原始扩展名）
         #[arg(short = 'o', long)]
         output: String,
         /// 目标已存在时覆盖
@@ -309,6 +311,32 @@ enum Commands {
         #[arg(long)]
         json: bool,
     },
+    /// 转写单个语音 attachment_id（SILK -> WAV -> whisper.cpp）
+    Transcribe {
+        /// 由 `wx attachments --kind voice` 输出的不透明 ID（base64url 字符串）
+        attachment_id: String,
+        /// whisper.cpp 模型路径；也可用 WX_WHISPER_MODEL
+        #[arg(long, value_name = "PATH")]
+        model: Option<PathBuf>,
+        /// whisper.cpp 的 whisper-cli 路径；默认找 WX_WHISPER_BIN 或 PATH 里的 whisper-cli
+        #[arg(long = "whisper-bin", value_name = "PATH")]
+        whisper_bin: Option<PathBuf>,
+        /// SILK v3 decoder 路径；默认找 WX_SILK_DECODER 或 PATH 里的 silk-decoder/silk_v3_decoder/silk_decoder
+        #[arg(long = "silk-decoder", value_name = "PATH")]
+        silk_decoder: Option<PathBuf>,
+        /// ffmpeg 路径；默认找 WX_FFMPEG 或 PATH 里的 ffmpeg
+        #[arg(long, value_name = "PATH")]
+        ffmpeg: Option<PathBuf>,
+        /// 语音语言，传给 whisper.cpp -l；普通话建议 zh，自动识别用 auto
+        #[arg(short = 'l', long = "language", default_value = "zh")]
+        language: String,
+        /// 保留中间文件（raw/silk/pcm/wav），用于调试转码质量；目录权限保持 0700
+        #[arg(long)]
+        keep_temp: bool,
+        /// 输出 JSON（默认 YAML）
+        #[arg(long)]
+        json: bool,
+    },
     /// 管理 wx-daemon
     Daemon {
         #[command(subcommand)]
@@ -520,6 +548,25 @@ fn dispatch(cli: Cli) -> Result<()> {
             overwrite,
             json,
         } => extract::cmd_extract(attachment_id, output, overwrite, json),
+        Commands::Transcribe {
+            attachment_id,
+            model,
+            whisper_bin,
+            silk_decoder,
+            ffmpeg,
+            language,
+            keep_temp,
+            json,
+        } => transcribe::cmd_transcribe(
+            attachment_id,
+            model,
+            whisper_bin,
+            silk_decoder,
+            ffmpeg,
+            language,
+            keep_temp,
+            json,
+        ),
         Commands::Daemon { cmd } => daemon_cmd::cmd_daemon(cmd),
     }
 }
diff --git a/src/cli/transcribe.rs b/src/cli/transcribe.rs
new file mode 100644
index 0000000..3d80635
--- /dev/null
+++ b/src/cli/transcribe.rs
@@ -0,0 +1,467 @@
+use anyhow::{anyhow, Context, Result};
+use serde_json::{json, Value};
+use std::ffi::OsStr;
+use std::io::Write;
+use std::path::{Path, PathBuf};
+use std::process::Command;
+
+use super::output::{print_value, resolve};
+use super::transport;
+use crate::ipc::Request;
+
+/// `wx transcribe` — export the audio behind a voice attachment_id and run local ASR on it.
+///
+/// Pipeline:
+/// 1. the daemon `Extract` request writes out the raw WeChat voice bytes
+/// 2. SILK v3: normalize the `#!SILK` header, then the decoder emits s16le PCM
+/// 3. ffmpeg converts to the 16 kHz mono WAV that whisper.cpp expects
+/// 4. whisper-cli performs the local ASR
+pub fn cmd_transcribe(
+    attachment_id: String,
+    model: Option<PathBuf>,
+    whisper_bin: Option<PathBuf>,
+    silk_decoder: Option<PathBuf>,
+    ffmpeg: Option<PathBuf>,
+    language: String,
+    keep_temp: bool,
+    json_out: bool,
+) -> Result<()> {
+    let model = resolve_required_model(model)?;
+    let whisper_bin = resolve_tool(
+        whisper_bin,
+        "WX_WHISPER_BIN",
+        &["whisper-cli"],
+        "找不到 whisper.cpp 的 whisper-cli；请用 --whisper-bin 指定路径，或设置 WX_WHISPER_BIN",
+    )?;
+    let ffmpeg = resolve_tool(
+        ffmpeg,
+        "WX_FFMPEG",
+        &["ffmpeg"],
+        "找不到 ffmpeg；请安装 ffmpeg，或用 --ffmpeg 指定路径",
+    )?;
+
+    let work = WorkDir::new(keep_temp)?;
+    let raw_path = work.path.join("voice.aud");
+    let silk_path = work.path.join("voice.silk");
+    let pcm_path = work.path.join("voice.pcm");
+    let wav_path = work.path.join("voice.wav");
+
+    let extract_report = extract_voice(&attachment_id, &raw_path)?;
+    let kind = extract_report
+        .get("kind")
+        .and_then(Value::as_str)
+        .unwrap_or("");
+    if kind != "voice" {
+        return Err(anyhow!(
+            "attachment_id 不是语音资源（kind={}），请先用 `wx attachments CHAT --kind voice` 获取语音 ID",
+            kind
+        ));
+    }
+
+    let raw_bytes = std::fs::read(&raw_path)
+        .with_context(|| format!("读取语音文件失败：{}", raw_path.display()))?;
+    let format = detect_audio_format(
+        extract_report
+            .get("format")
+            .and_then(Value::as_str)
+            .unwrap_or_default(),
+        &raw_bytes,
+        &raw_path,
+    );
+
+    let mut silk_header_offset: Option<usize> = None;
+    let decode_stage = if format == "silk" {
+        let silk_decoder = resolve_tool(
+            silk_decoder,
+            "WX_SILK_DECODER",
+            &["silk-decoder", "silk_v3_decoder", "silk_decoder"],
+            "找不到 SILK v3 decoder；请用 --silk-decoder 指定 kn007/silk-v3-decoder 的 silk/decoder 路径，或设置 WX_SILK_DECODER",
+        )?;
+        silk_header_offset = Some(write_normalized_silk(&raw_bytes, &silk_path)?);
+        run_silk_decoder(&silk_decoder, &silk_path, &pcm_path)?;
+        run_ffmpeg_pcm_to_wav(&ffmpeg, &pcm_path, &wav_path)?;
+        json!({
+            "input_format": "silk",
+            "silk_header_offset": silk_header_offset,
+            "silk_decoder": silk_decoder.display().to_string(),
+        })
+    } else {
+        run_ffmpeg_audio_to_wav(&ffmpeg, &raw_path, &wav_path)?;
+        json!({
+            "input_format": format,
+            "silk_header_offset": silk_header_offset,
+        })
+    };
+
+    let whisper = run_whisper(&whisper_bin, &model, &wav_path, &language)?;
+    let transcript = clean_whisper_stdout(&whisper.stdout);
+
+    let mut report = json!({
+        "transcript": transcript,
+        "language": language,
+        "engine": "whisper.cpp",
+        "model": model.display().to_string(),
+        "whisper_bin": whisper_bin.display().to_string(),
+        "ffmpeg": ffmpeg.display().to_string(),
+        "audio": {
+            "source": extract_report.get("source").cloned(),
+            "format": format,
+            "decoder": extract_report.get("decoder").cloned(),
+            "output_size": extract_report.get("output_size").cloned(),
+        },
+        "decode": decode_stage,
+        "whisper": {
+            "stderr": whisper.stderr.trim(),
+        },
+        "kept_temp": keep_temp,
+    });
+
+    if keep_temp {
+        report["temp_dir"] = json!(work.path.display().to_string());
+        report["files"] = json!({
+            "raw": raw_path.display().to_string(),
+            "silk": if silk_path.exists() { Some(silk_path.display().to_string()) } else { None },
+            "pcm": if pcm_path.exists() { Some(pcm_path.display().to_string()) } else { None },
+            "wav": wav_path.display().to_string(),
+        });
+    }
+
+    print_value(&report, &resolve(json_out))
+}
+
+fn extract_voice(attachment_id: &str, raw_path: &Path) -> Result<Value> {
+    let resp = transport::send(Request::Extract {
+        attachment_id: attachment_id.to_string(),
+        output: raw_path.display().to_string(),
+        overwrite: true,
+    })?;
+    set_private_file_permissions(raw_path)?;
+    Ok(resp.data)
+}
+
+fn resolve_required_model(model: Option<PathBuf>) -> Result<PathBuf> {
+    if let Some(path) = model {
+        return require_existing_file(path, "--model");
+    }
+    if let Ok(path) = std::env::var("WX_WHISPER_MODEL") {
+        return require_existing_file(PathBuf::from(path), "WX_WHISPER_MODEL");
+    }
+    Err(anyhow!(
+        "缺少 whisper.cpp 模型路径；请传 --model /path/to/ggml-large-v3-turbo.bin，或设置 WX_WHISPER_MODEL"
+    ))
+}
+
+/// Resolves an external tool: an explicit CLI path wins, then the `env_name` variable,
+/// then the first of `candidates` found on `PATH`. Explicit and env-provided paths must
+/// be existing files; `missing_msg` becomes the error when nothing is found.
+fn resolve_tool(
+    explicit: Option<PathBuf>,
+    env_name: &str,
+    candidates: &[&str],
+    missing_msg: &str,
+) -> Result<PathBuf> {
+    if let Some(path) = explicit {
+        // This path came from the command line, so do not blame the env var for it.
+        return require_existing_file(path, "命令行参数");
+    }
+    if let Ok(path) = std::env::var(env_name) {
+        return require_existing_file(PathBuf::from(path), env_name);
+    }
+    let on_path = candidates.iter().find_map(|name| find_in_path(name));
+    on_path.ok_or_else(|| anyhow!(missing_msg.to_string()))
+}
+
+fn require_existing_file(path: PathBuf, label: &str) -> Result<PathBuf> {
+    if path.is_file() {
+        Ok(path)
+    } else {
+        Err(anyhow!("{} 指向的文件不存在：{}", label, path.display()))
+    }
+}
+
+/// Locates `name` as a regular file on disk.
+///
+/// A `name` with more than one path component is first tried as a path in its own right;
+/// after that, each directory listed in `PATH` is probed in order.
+fn find_in_path(name: &str) -> Option<PathBuf> {
+    let direct = Path::new(name);
+    let has_dir_part = direct.components().count() > 1;
+    if has_dir_part && direct.is_file() {
+        return Some(direct.to_path_buf());
+    }
+    let search_path = std::env::var_os("PATH")?;
+    let mut probes = std::env::split_paths(&search_path).map(|dir| dir.join(name));
+    probes.find(|probe| probe.is_file())
+}
+
+fn detect_audio_format<'a>(reported: &'a str, bytes: &[u8], path: &Path) -> &'a str {
+    if find_subslice_prefix(bytes, b"#!SILK", 8).is_some() {
+        return "silk";
+    }
+    if bytes.starts_with(b"#!AMR") {
+        return "amr";
+    }
+    if bytes.len() >= 12 && &bytes[..4] == b"RIFF" && &bytes[8..12] == b"WAVE" {
+        return "wav";
+    }
+    if bytes.starts_with(b"ID3") || bytes.starts_with(&[0xFF, 0xFB]) {
+        return "mp3";
+    }
+    if bytes.len() >= 12 && &bytes[4..8] == b"ftyp" {
+        return "m4a";
+    }
+    if !reported.is_empty() && reported != "bin" && reported != "dat" {
+        return reported;
+    }
+    match path.extension().and_then(OsStr::to_str).unwrap_or_default() {
+        "amr" => "amr",
+        "wav" => "wav",
+        "m4a" => "m4a",
+        "mp3" => "mp3",
+        "silk" | "slk" => "silk",
+        _ => "bin",
+    }
+}
+
+fn write_normalized_silk(bytes: &[u8], silk_path: &Path) -> Result<usize> {
+    let offset = find_subslice_prefix(bytes, b"#!SILK", 8).ok_or_else(|| {
+        anyhow!("语音报告为 SILK，但前 8 字节内找不到 #!SILK header，无法调用 SILK decoder")
+    })?;
+    write_private_file(silk_path, &bytes[offset..])
+        .with_context(|| format!("写出 SILK 中间文件失败：{}", silk_path.display()))?;
+    Ok(offset)
+}
+
+/// Returns the first index in `0..=max_offset` at which `needle` starts inside `haystack`.
+fn find_subslice_prefix(haystack: &[u8], needle: &[u8], max_offset: usize) -> Option<usize> {
+    // `windows` panics on a zero width, so an empty needle is rejected up front.
+    let width = std::num::NonZeroUsize::new(needle.len())?.get();
+    let mut probes = haystack.windows(width).take(max_offset.saturating_add(1));
+    probes.position(|window| window == needle)
+}
+
+/// Runs the external SILK decoder as `<decoder> <silk_path> <pcm_path>`.
+///
+/// Success requires both a successful exit status and `pcm_path` existing afterwards;
+/// the PCM file is then made private via `set_private_file_permissions`.
+fn run_silk_decoder(decoder: &Path, silk_path: &Path, pcm_path: &Path) -> Result<()> {
+    let mut cmd = Command::new(decoder);
+    cmd.arg(silk_path).arg(pcm_path);
+    let finished = cmd
+        .output()
+        .with_context(|| format!("启动 SILK decoder 失败：{}", decoder.display()))?;
+    if finished.status.success() && pcm_path.is_file() {
+        return set_private_file_permissions(pcm_path);
+    }
+    let stderr = String::from_utf8_lossy(&finished.stderr);
+    Err(anyhow!("SILK decoder 失败：{}\n{}", finished.status, stderr.trim()))
+}
+
+/// Wraps the headerless PCM produced by the SILK decoder into the WAV handed to whisper.
+///
+/// ffmpeg is told the input is s16le, 24 kHz, mono, and writes 16 kHz mono `pcm_s16le`,
+/// overwriting `wav_path` if it exists (`-y`). The result is then made private via
+/// `set_private_file_permissions`.
+///
+/// # Errors
+///
+/// Fails when ffmpeg cannot be started, exits unsuccessfully, or the permission change
+/// on `wav_path` fails.
+fn run_ffmpeg_pcm_to_wav(ffmpeg: &Path, pcm_path: &Path, wav_path: &Path) -> Result<()> {
+    // `-y` plus the description of the raw input stream, ending with `-i`.
+    const INPUT_OPTS: [&str; 8] = ["-y", "-f", "s16le", "-ar", "24000", "-ac", "1", "-i"];
+    // Output description; goes between the input path and the output path.
+    const OUTPUT_OPTS: [&str; 6] = ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le"];
+    let mut cmd = Command::new(ffmpeg);
+    cmd.args(INPUT_OPTS)
+        .arg(pcm_path)
+        .args(OUTPUT_OPTS)
+        .arg(wav_path);
+    run_command(&mut cmd, "ffmpeg PCM -> WAV")?;
+    set_private_file_permissions(wav_path)
+}
+
+/// Converts an already-containerized audio file into the WAV handed to whisper.
+///
+/// No input format is forced on ffmpeg; the output is 16 kHz mono `pcm_s16le`, and
+/// `wav_path` is overwritten if it exists (`-y`). The result is then made private via
+/// `set_private_file_permissions`.
+fn run_ffmpeg_audio_to_wav(ffmpeg: &Path, input_path: &Path, wav_path: &Path) -> Result<()> {
+    // Output description; goes between the input path and the output path.
+    const OUTPUT_OPTS: [&str; 6] = ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le"];
+    let mut cmd = Command::new(ffmpeg);
+    cmd.arg("-y")
+        .arg("-i")
+        .arg(input_path)
+        .args(OUTPUT_OPTS)
+        .arg(wav_path);
+    run_command(&mut cmd, "ffmpeg audio -> WAV")?;
+    set_private_file_permissions(wav_path)
+}
+
+/// Runs whisper-cli on `wav_path` and captures both of its output streams.
+///
+/// The tool is invoked as `-m <model> -f <wav_path> -l <language> -nt -np`. An
+/// unsuccessful exit status becomes an error carrying the status and trimmed stderr;
+/// otherwise stdout and stderr are returned as lossily decoded strings.
+fn run_whisper(
+    whisper_bin: &Path,
+    model: &Path,
+    wav_path: &Path,
+    language: &str,
+) -> Result<CommandOutput> {
+    let mut cmd = Command::new(whisper_bin);
+    cmd.arg("-m").arg(model).arg("-f").arg(wav_path);
+    cmd.arg("-l").arg(language).args(["-nt", "-np"]);
+    let finished = cmd
+        .output()
+        .with_context(|| format!("启动 whisper-cli 失败：{}", whisper_bin.display()))?;
+
+    // Lossy decoding keeps this step infallible even if the tool emits invalid UTF-8.
+    let stderr = String::from_utf8_lossy(&finished.stderr);
+    if !finished.status.success() {
+        let status = finished.status;
+        return Err(anyhow!("whisper-cli 失败：{}\n{}", status, stderr.trim()));
+    }
+    Ok(CommandOutput {
+        stdout: String::from_utf8_lossy(&finished.stdout).to_string(),
+        stderr: stderr.to_string(),
+    })
+}
+
+/// Runs `cmd` to completion with its output captured and maps failure to an error.
+///
+/// `stage` names the pipeline step in error messages. An unsuccessful exit status yields
+/// an error that includes the status and the trimmed, lossily decoded stderr.
+fn run_command(cmd: &mut Command, stage: &str) -> Result<()> {
+    let finished = cmd
+        .output()
+        .with_context(|| format!("启动 {} 失败", stage))?;
+    if finished.status.success() {
+        return Ok(());
+    }
+    let status = finished.status;
+    let stderr = String::from_utf8_lossy(&finished.stderr);
+    Err(anyhow!("{} 失败：{}\n{}", stage, status, stderr.trim()))
+}
+
+fn write_private_file(path: &Path, bytes: &[u8]) -> Result<()> {
+    let mut options = std::fs::OpenOptions::new();
+    options.write(true).create_new(true);
+    #[cfg(unix)]
+    {
+        use std::os::unix::fs::OpenOptionsExt;
+        options.mode(0o600);
+    }
+    let mut file = options
+        .open(path)
+        .with_context(|| format!("创建私有文件失败：{}", path.display()))?;
+    file.write_all(bytes)
+        .with_context(|| format!("写入私有文件失败：{}", path.display()))?;
+    set_private_file_permissions(path)
+}
+
+fn set_private_file_permissions(path: &Path) -> Result<()> {
+    #[cfg(unix)]
+    {
+        use std::os::unix::fs::PermissionsExt;
+        std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o600))
+            .with_context(|| format!("设置文件权限失败：{}", path.display()))?;
+    }
+    Ok(())
+}
+
+/// Normalizes whisper-cli stdout into the transcript text.
+///
+/// Each line is trimmed, blank lines are dropped, and the rest are joined with `\n`.
+fn clean_whisper_stdout(stdout: &str) -> String {
+    let trimmed = stdout.lines().map(str::trim);
+    let kept: Vec<&str> = trimmed.filter(|line| !line.is_empty()).collect();
+    kept.join("\n")
+}
+
+struct CommandOutput {
+    stdout: String,
+    stderr: String,
+}
+
+struct WorkDir {
+    path: PathBuf,
+    keep: bool,
+}
+
+impl WorkDir {
+    fn new(keep: bool) -> Result<Self> {
+        for attempt in 0..128u32 {
+            let nanos = std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap_or_default()
+                .as_nanos();
+            let path = std::env::temp_dir().join(format!(
+                "wx-transcribe-{}-{}-{}",
+                std::process::id(),
+                nanos,
+                attempt
+            ));
+            match create_private_dir(&path) {
+                Ok(()) => {
+                    return Ok(Self { path, keep });
+                }
+                Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => continue,
+                Err(e) => {
+                    return Err(e).with_context(|| format!("创建临时目录失败：{}", path.display()));
+                }
+            }
+        }
+        Err(anyhow!("创建临时目录失败：连续 128 次命名冲突"))
+    }
+}
+
+fn create_private_dir(path: &Path) -> std::io::Result<()> {
+    #[cfg(unix)]
+    {
+        use std::os::unix::fs::DirBuilderExt;
+        std::fs::DirBuilder::new().mode(0o700).create(path)
+    }
+    #[cfg(not(unix))]
+    {
+        std::fs::create_dir(path)
+    }
+}
+
+impl Drop for WorkDir {
+    fn drop(&mut self) {
+        if !self.keep {
+            let _ = std::fs::remove_dir_all(&self.path);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn find_silk_header_after_wechat_prefix() {
+        assert_eq!(
+            find_subslice_prefix(b"\x02#!SILK_V3", b"#!SILK", 8),
+            Some(1)
+        );
+        assert_eq!(find_subslice_prefix(b"#!SILK_V3", b"#!SILK", 8), Some(0));
+    }
+
+    #[test]
+    fn clean_whisper_stdout_keeps_non_empty_lines() {
+        assert_eq!(clean_whisper_stdout("\n  你好  \n\n世界\n"), "你好\n世界");
+    }
+
+    #[cfg(unix)]
+    #[test]
+    fn workdir_is_private() {
+        use std::os::unix::fs::PermissionsExt;
+        let work = WorkDir::new(true).unwrap();
+        let mode = std::fs::metadata(&work.path).unwrap().permissions().mode() & 0o777;
+        assert_eq!(mode, 0o700);
+        std::fs::remove_dir_all(&work.path).unwrap();
+    }
+}
diff --git a/src/daemon/query.rs b/src/daemon/query.rs
index ac9ec0d..3868271 100644
--- a/src/daemon/query.rs
+++ b/src/daemon/query.rs
@@ -925,7 +925,8 @@ fn query_messages(
     let mut result = Vec::new();
     for (local_id, local_type, ts, real_sender_id, content_bytes, ct) in rows {
         let content = decompress_message(&content_bytes, ct);
-        let sender_username = sender_username(real_sender_id, &content, is_group, chat_username, &id2u);
+        let sender_username =
+            sender_username(real_sender_id, &content, is_group, chat_username, &id2u);
         let sender = sender_label(
             real_sender_id,
             &content,
@@ -946,7 +947,13 @@ fn query_messages(
             "type": fmt_type(local_type),
             "local_id": local_id,
         });
-        add_sender_identity(&mut msg, is_group, &sender_username, names_map, group_nicknames);
+        add_sender_identity(
+            &mut msg,
+            is_group,
+            &sender_username,
+            names_map,
+            group_nicknames,
+        );
         if let Some(u) = url {
             msg["url"] = serde_json::Value::String(u);
         }
@@ -1032,7 +1039,8 @@ fn search_in_table(
     let mut result = Vec::new();
     for (local_id, local_type, ts, real_sender_id, content_bytes, ct) in rows {
         let content = decompress_message(&content_bytes, ct);
-        let sender_username = sender_username(real_sender_id, &content, is_group, chat_username, &id2u);
+        let sender_username =
+            sender_username(real_sender_id, &content, is_group, chat_username, &id2u);
         let sender = sender_label(
             real_sender_id,
             &content,
@@ -1057,7 +1065,13 @@ fn search_in_table(
             "content": text,
             "type": fmt_type(local_type),
         });
-        add_sender_identity(&mut msg, is_group, &sender_username, names_map, group_nicknames);
+        add_sender_identity(
+            &mut msg,
+            is_group,
+            &sender_username,
+            names_map,
+            group_nicknames,
+        );
         if let Some(u) = url {
             msg["url"] = serde_json::Value::String(u);
         }
@@ -1558,11 +1572,13 @@ fn add_sender_identity(
     }
     row["sender_username"] = Value::String(username.to_string());
     row["sender_contact_display"] = Value::String(
-        names.get(username).cloned().unwrap_or_else(|| username.to_string())
-    );
-    row["sender_group_nickname"] = Value::String(
-        group_nicknames.get(username).cloned().unwrap_or_default()
+        names
+            .get(username)
+            .cloned()
+            .unwrap_or_else(|| username.to_string()),
     );
+    row["sender_group_nickname"] =
+        Value::String(group_nicknames.get(username).cloned().unwrap_or_default());
 }
 
 fn sender_label(
@@ -2193,14 +2209,7 @@ mod appmsg_tests {
             .expect("create message table");
             conn.execute(
                 "INSERT INTO Msg_test VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
-                rusqlite::params![
-                    1_i64,
-                    1_i64,
-                    1775146911_i64,
-                    42_i64,
-                    "hello",
-                    0_i64
-                ],
+                rusqlite::params![1_i64, 1_i64, 1775146911_i64, 42_i64, "hello", 0_i64],
             )
             .expect("insert text message");
         }
@@ -2227,7 +2236,10 @@ mod appmsg_tests {
         assert_eq!(rows.len(), 1);
         assert_eq!(rows[0]["sender"].as_str(), Some("同名"));
         assert_eq!(rows[0]["sender_username"].as_str(), Some("wxid_alice"));
-        assert_eq!(rows[0]["sender_contact_display"].as_str(), Some("Alice Contact"));
+        assert_eq!(
+            rows[0]["sender_contact_display"].as_str(),
+            Some("Alice Contact")
+        );
         assert_eq!(rows[0]["sender_group_nickname"].as_str(), Some("同名"));
     }
 
@@ -2284,7 +2296,10 @@ mod appmsg_tests {
         assert_eq!(rows.len(), 1);
         assert_eq!(rows[0]["sender"].as_str(), Some("同名"));
         assert_eq!(rows[0]["sender_username"].as_str(), Some("wxid_alice"));
-        assert_eq!(rows[0]["sender_contact_display"].as_str(), Some("Alice Contact"));
+        assert_eq!(
+            rows[0]["sender_contact_display"].as_str(),
+            Some("Alice Contact")
+        );
         assert_eq!(rows[0]["sender_group_nickname"].as_str(), Some("同名"));
     }
 
@@ -2314,7 +2329,10 @@ mod appmsg_tests {
         add_sender_identity(&mut alice_row, true, "wxid_alice", &names, &group_nicknames);
         assert_eq!(alice_row["sender"].as_str(), Some("同名"));
         assert_eq!(alice_row["sender_username"].as_str(), Some("wxid_alice"));
-        assert_eq!(alice_row["sender_contact_display"].as_str(), Some("Alice Contact"));
+        assert_eq!(
+            alice_row["sender_contact_display"].as_str(),
+            Some("Alice Contact")
+        );
         assert_eq!(alice_row["sender_group_nickname"].as_str(), Some("同名"));
 
         let mut bob_row = json!({
@@ -2336,7 +2354,13 @@ mod appmsg_tests {
 
         // 非群 chat 不该追加 identity 字段（行为对齐 history/search/new-messages）
         let mut private_row = json!({"attachment_id": "ghi", "sender": ""});
-        add_sender_identity(&mut private_row, false, "wxid_alice", &names, &group_nicknames);
+        add_sender_identity(
+            &mut private_row,
+            false,
+            "wxid_alice",
+            &names,
+            &group_nicknames,
+        );
         assert!(private_row.get("sender_username").is_none());
         assert!(private_row.get("sender_contact_display").is_none());
         assert!(private_row.get("sender_group_nickname").is_none());
@@ -2992,7 +3016,8 @@ pub async fn q_new_messages(
                 let mut result = Vec::new();
                 for (local_id, local_type, ts, real_sender_id, content_bytes, ct) in rows {
                     let content = decompress_message(&content_bytes, ct);
-                    let sender_username = sender_username(real_sender_id, &content, is_group, &uname2, &id2u);
+                    let sender_username =
+                        sender_username(real_sender_id, &content, is_group, &uname2, &id2u);
                     let sender = sender_label(
                         real_sender_id,
                         &content,
@@ -3015,7 +3040,13 @@ pub async fn q_new_messages(
                         "content": text,
                         "type": fmt_type(local_type),
                     });
-                    add_sender_identity(&mut msg, is_group, &sender_username, &names_map, &group_nicknames2);
+                    add_sender_identity(
+                        &mut msg,
+                        is_group,
+                        &sender_username,
+                        &names_map,
+                        &group_nicknames2,
+                    );
                     if let Some(u) = url {
                         msg["url"] = serde_json::Value::String(u);
                     }
@@ -4393,18 +4424,21 @@ pub async fn q_attachments(
                                     &names_map,
                                     &group_nicknames2,
                                 ),
-                                sender_username(
-                                    real_sender_id,
-                                    &content,
-                                    true,
-                                    &uname,
-                                    &id2u,
-                                ),
+                                sender_username(real_sender_id, &content, true, &uname, &id2u),
                             )
                         } else {
                             (String::new(), String::new())
                         };
-                        Ok((local_id, lo32, ts, real_sender_id, sender, sender_uname, ts, db_idx2))
+                        Ok((
+                            local_id,
+                            lo32,
+                            ts,
+                            real_sender_id,
+                            sender,
+                            sender_uname,
+                            ts,
+                            db_idx2,
+                        ))
                     })?
                     .filter_map(|r| r.ok())
                     .collect();
@@ -4449,7 +4483,13 @@ pub async fn q_attachments(
         if is_group && !sender.is_empty() {
             row["sender"] = Value::String(sender);
         }
-        add_sender_identity(&mut row, is_group, &sender_uname, &names.map, &group_nicknames);
+        add_sender_identity(
+            &mut row,
+            is_group,
+            &sender_uname,
+            &names.map,
+            &group_nicknames,
+        );
         results.push(row);
     }
     let unknown_shards = current_unknown_shards(db, names);
@@ -4476,7 +4516,9 @@ pub async fn q_attachments(
     }))
 }
 
-/// 解码 attachment_id → 查 message_resource.db → 找本地 .dat → 解密 → 写盘。
+/// 解码 attachment_id → 写出附件资源。
+/// image: message_resource.db → 本地 .dat → 解码。
+/// voice POC: 优先 media_0.db::VoiceInfo → 原样写出 SILK/音频 bytes；未命中再走资源文件 fallback。
 pub async fn q_extract(
     db: &DbCache,
     _names: &Names,
@@ -4487,7 +4529,7 @@ pub async fn q_extract(
     use crate::attachment::{
         attachment_id::AttachmentId,
         decoder::{self, V2KeyMaterial},
-        image_key, resolver,
+        image_key, resolver, AttachmentKind,
     };
 
     let id = AttachmentId::decode(attachment_id)
@@ -4508,6 +4550,44 @@ pub async fn q_extract(
         }
     }
 
+    if id.kind == AttachmentKind::Voice {
+        if let Some(media_path) = db.get("message/media_0.db").await? {
+            let id_for_task = id.clone();
+            let output_path2 = output_path.clone();
+            let report = tokio::task::spawn_blocking(move || -> Result<Option<Value>> {
+                let Some(voice) = resolver::lookup_voice_media_blocking(
+                    &media_path,
+                    &id_for_task.chat,
+                    id_for_task.local_id,
+                    id_for_task.create_time,
+                )?
+                else {
+                    return Ok(None);
+                };
+
+                std::fs::write(&output_path2, &voice.data)
+                    .with_context(|| format!("写出文件失败：{}", output_path2.display()))?;
+                Ok(Some(json!({
+                    "kind": id_for_task.kind.as_str(),
+                    "source": "message/media_0.db",
+                    "local_id": id_for_task.local_id,
+                    "create_time": id_for_task.create_time,
+                    "chunks": voice.chunks,
+                    "svr_id": voice.svr_id,
+                    "output": output_path2.display().to_string(),
+                    "output_size": voice.data.len(),
+                    "format": raw_media_format(&output_path2, &voice.data),
+                    "decoder": "media_0_voice_data",
+                    "poc": true,
+                })))
+            })
+            .await??;
+            if let Some(report) = report {
+                return Ok(report);
+            }
+        }
+    }
+
     // 1) 拿 message_resource.db
     let resource_path = db
         .get("message/message_resource.db")
@@ -4535,6 +4615,22 @@ pub async fn q_extract(
         let dat_bytes = std::fs::read(&resolved.dat_path)
             .with_context(|| format!("读取 .dat 失败：{}", resolved.dat_path.display()))?;
 
+        if id_for_task.kind != AttachmentKind::Image {
+            std::fs::write(&output_path2, &dat_bytes)
+                .with_context(|| format!("写出文件失败：{}", output_path2.display()))?;
+            return Ok(json!({
+                "kind": id_for_task.kind.as_str(),
+                "md5": resolved.md5,
+                "dat_path": resolved.dat_path.display().to_string(),
+                "dat_size": resolved.size,
+                "output": output_path2.display().to_string(),
+                "output_size": dat_bytes.len(),
+                "format": raw_media_format(&resolved.dat_path, &dat_bytes),
+                "decoder": "raw_copy",
+                "poc": true,
+            }));
+        }
+
         // V2 image key — 平台相关。`ImageKeyMaterial` 同时给 aes_key + xor_key。
         // xor_key 不能硬编码 0x88：实测 macOS 真实账号上是 `uin & 0xff` 派生的（0xa2 等），
         // 所以这里桥接时必须把 provider 的 xor_key 透传给 V2KeyMaterial。
@@ -4599,7 +4695,7 @@ pub async fn q_extract(
 }
 
 /// 解析 `kinds` 参数到 `(AttachmentKind, lo32_local_type)` 列表。
-/// 当前只支持 image；命令名保留成 `attachments` 是为了后续扩到其他附件类型时不 break CLI。
+/// 默认 image；voice/audio 是 POC：可以枚举并 raw-copy 本地语音文件，但不做转码/转写。
 fn parse_attachment_kinds(
     kinds: Option<&[String]>,
 ) -> Result<Vec<(crate::attachment::AttachmentKind, i64)>> {
@@ -4613,12 +4709,13 @@ fn parse_attachment_kinds(
     for k in raw {
         let (kind, t): (AttachmentKind, i64) = match k.to_ascii_lowercase().as_str() {
             "image" | "img" => (AttachmentKind::Image, 3),
-            "voice" | "audio" | "video" | "file" => {
+            "voice" | "audio" => (AttachmentKind::Voice, 34),
+            "video" | "file" => {
                 anyhow::bail!(
-                    "当前只支持 image 提取；video/file/voice 的资源路径与 decoder 还没接通"
+                    "当前只支持 image 和 voice POC；video/file 的资源路径与 decoder 还没接通"
                 )
             }
-            other => anyhow::bail!("未知附件类型：{}（当前仅支持 image）", other),
+            other => anyhow::bail!("未知附件类型：{}（当前支持 image / voice POC）", other),
         };
         if seen.insert(kind.as_str()) {
             out.push((kind, t));
@@ -4627,10 +4724,75 @@ fn parse_attachment_kinds(
     Ok(out)
 }
 
+/// Guess a short format label for a raw-copied media file.
+///
+/// Content sniffing wins over the file name: known magic bytes are checked
+/// first, and the (ASCII case-insensitive) extension is only a fallback.
+/// Returns `"bin"` when neither the bytes nor the extension are recognised.
+fn raw_media_format(path: &std::path::Path, bytes: &[u8]) -> &'static str {
+    // The SILK magic may sit behind a few prefix bytes (e.g. `\x02#!SILK_V3`),
+    // so probe window offsets 0..=7; offset 0 covers the unprefixed case.
+    const SILK: &[u8] = b"#!SILK";
+    if bytes.windows(SILK.len()).take(8).any(|w| w == SILK) {
+        return "silk";
+    }
+    if bytes.starts_with(b"#!AMR") {
+        return "amr";
+    }
+    if bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WAVE" {
+        return "wav";
+    }
+    if bytes.starts_with(b"ID3") || bytes.starts_with(&[0xFF, 0xFB]) {
+        return "mp3";
+    }
+    if bytes.len() >= 12 && &bytes[4..8] == b"ftyp" {
+        return "m4a";
+    }
+    let ext = path.extension().and_then(|s| s.to_str());
+    match ext.unwrap_or_default().to_ascii_lowercase().as_str() {
+        "aud" => "aud",
+        "amr" => "amr",
+        "silk" => "silk",
+        "wav" => "wav",
+        "m4a" => "m4a",
+        "mp3" => "mp3",
+        "dat" => "dat",
+        _ => "bin",
+    }
+}
+
 #[cfg(test)]
 mod biz_tests {
     use super::*;
 
+    #[test]
+    fn parse_attachment_kinds_accepts_voice_aliases() {
+        // "voice" and "audio" are aliases of one kind, so they must dedupe to one entry.
+        let aliases: Vec<String> = ["voice", "audio"].iter().map(|s| s.to_string()).collect();
+        let parsed = parse_attachment_kinds(Some(&aliases[..])).unwrap();
+        let summary: Vec<_> = parsed.iter().map(|(k, t)| (k.as_str(), *t)).collect();
+        assert_eq!(summary, [("voice", 34)]);
+    }
+
+    #[test]
+    fn raw_media_format_detects_common_audio_headers() {
+        // Shorthand: sniff `data` as if it had been read from a file called `name`.
+        fn sniff(name: &str, data: &[u8]) -> &'static str {
+            raw_media_format(std::path::Path::new(name), data)
+        }
+        // Magic bytes win over whatever extension the file carries.
+        assert_eq!(sniff("x.bin", b"#!SILK_V3"), "silk");
+        assert_eq!(sniff("x.aud", b"\x02#!SILK_V3"), "silk");
+        assert_eq!(sniff("x.bin", b"#!AMR\n"), "amr");
+        assert_eq!(sniff("x.bin", b"RIFF0000WAVE\0\0\0\0\0\0\0\0"), "wav");
+        assert_eq!(sniff("x.bin", b"ID3\x04\x00"), "mp3");
+        assert_eq!(sniff("x.bin", b"\0\0\0\x20ftypM4A "), "m4a");
+        // Unrecognised or empty content falls back to the extension, then "bin".
+        assert_eq!(sniff("x.amr", b"\0\0\0\0"), "amr");
+        assert_eq!(sniff("x.aud", b""), "aud");
+        assert_eq!(sniff("voice", b""), "bin");
+    }
+
     #[test]
     fn extract_cdata_normal() {
         let xml = "<title><![CDATA[TencentResearch]]></title>";
@@ -4837,12 +4999,18 @@ mod group_nickname_tests {
         assert_eq!(top.len(), 2);
         assert_eq!(top[0]["sender"].as_str(), Some("同名"));
         assert_eq!(top[0]["sender_username"].as_str(), Some("wxid_alice"));
-        assert_eq!(top[0]["sender_contact_display"].as_str(), Some("Alice Contact"));
+        assert_eq!(
+            top[0]["sender_contact_display"].as_str(),
+            Some("Alice Contact")
+        );
         assert_eq!(top[0]["sender_group_nickname"].as_str(), Some("同名"));
         assert_eq!(top[0]["count"].as_i64(), Some(7));
         assert_eq!(top[1]["sender"].as_str(), Some("同名"));
         assert_eq!(top[1]["sender_username"].as_str(), Some("wxid_bob"));
-        assert_eq!(top[1]["sender_contact_display"].as_str(), Some("Bob Contact"));
+        assert_eq!(
+            top[1]["sender_contact_display"].as_str(),
+            Some("Bob Contact")
+        );
         assert_eq!(top[1]["sender_group_nickname"].as_str(), Some("同名"));
         assert_eq!(top[1]["count"].as_i64(), Some(3));
     }
diff --git a/src/ipc.rs b/src/ipc.rs
index 93306fb..d382b46 100644
--- a/src/ipc.rs
+++ b/src/ipc.rs
@@ -155,11 +155,11 @@ pub enum Request {
     },
     /// 重新加载配置和密钥（init --force 后 daemon 不会自动重读）
     ReloadConfig,
-    /// 列出某个会话里的图片附件
+    /// 列出某个会话里的附件
     /// 输出每条带 `attachment_id`（不透明 base64url 句柄），传给 `Extract` 时取回本体
     Attachments {
         chat: String,
-        /// 类型过滤：当前仅支持 image
+        /// 类型过滤：默认 image；POC 支持 voice/audio
         #[serde(default, skip_serializing_if = "Option::is_none")]
         kinds: Option<Vec<String>>,
         #[serde(default = "default_limit_50")]
@@ -175,7 +175,7 @@ pub enum Request {
         #[serde(default, skip_serializing_if = "is_false")]
         debug_source: bool,
     },
-    /// 提取（解密）单个附件的本体到指定路径
+    /// 提取单个附件的本体到指定路径；图片解码，语音 POC 原样复制
     Extract {
         /// `Attachments` 返回的不透明 ID
         attachment_id: String,