Initial commit
This commit is contained in:
commit
8199769093
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/target
|
||||
1762
Cargo.lock
generated
Normal file
1762
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
26
Cargo.toml
Normal file
26
Cargo.toml
Normal file
@ -0,0 +1,26 @@
|
||||
[package]
|
||||
name = "voice-ime"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
windows = { version = "0.61", features = [
|
||||
"Win32_UI_WindowsAndMessaging",
|
||||
"Win32_UI_Input_KeyboardAndMouse",
|
||||
"Win32_UI_Shell",
|
||||
"Win32_Graphics_Gdi",
|
||||
"Win32_System_LibraryLoader",
|
||||
"Win32_Foundation",
|
||||
"Win32_UI_Controls",
|
||||
] }
|
||||
cpal = "0.15"
|
||||
tokio = { version = "1", features = ["rt-multi-thread", "sync", "macros", "time"] }
|
||||
tokio-tungstenite = { version = "0.26", features = ["native-tls"] }
|
||||
futures-util = "0.3"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
base64 = "0.22"
|
||||
rodio = "0.20"
|
||||
|
||||
[build-dependencies]
|
||||
winres = "0.1"
|
||||
79
README.md
Normal file
79
README.md
Normal file
@ -0,0 +1,79 @@
|
||||
# Voice IME 语音输入法
|
||||
|
||||
Windows 系统托盘语音输入工具。按下快捷键开始录音,通过阿里云 Qwen ASR 实时语音识别 API 将语音转为文字,自动输入到当前光标位置。
|
||||
|
||||
## 功能
|
||||
|
||||
- **快捷键切换录音**:默认 F10,按一次开始,再按一次停止
|
||||
- **流式语音识别**:使用 Qwen3 ASR Realtime API,支持 VAD 自动断句,边说边输入
|
||||
- **增量文本插入**:对识别结果做 diff,仅输入变化部分,不影响输入框已有内容
|
||||
- **系统托盘**:托盘图标显示当前状态(空闲/录音中),右键菜单提供设置
|
||||
- **音效提示**:录音开始和停止时播放提示音
|
||||
- **暂停媒体播放**:录音时可自动暂停系统媒体播放(可关闭)
|
||||
- **可自定义配置**:
|
||||
- 快捷键
|
||||
- API Key
|
||||
- ASR 模型
|
||||
- 媒体暂停开关
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 获取 API Key
|
||||
|
||||
前往 [阿里云百炼](https://bailian.console.aliyun.com/) 开通 Qwen ASR 服务并获取 API Key。
|
||||
|
||||
### 运行
|
||||
|
||||
```
|
||||
cargo build --release
|
||||
./target/release/voice-ime.exe
|
||||
```
|
||||
|
||||
首次启动会弹窗要求输入 API Key。输入后程序最小化到系统托盘。
|
||||
|
||||
### 操作
|
||||
|
||||
| 操作 | 说明 |
|
||||
|------|------|
|
||||
| 按下快捷键(默认 F10) | 开始/停止录音 |
|
||||
| 右键托盘图标 | 打开设置菜单 |
|
||||
|
||||
### 右键菜单
|
||||
|
||||
- **设置快捷键** — 按下任意键即可更换
|
||||
- **录音时暂停媒体播放** — 勾选开关
|
||||
- **设置 API Key** — 修改 ASR 服务密钥
|
||||
- **设置模型** — 修改 ASR 模型名称
|
||||
- **退出**
|
||||
|
||||
### 配置文件
|
||||
|
||||
配置保存在 `%APPDATA%\voice-ime\config.json`,格式示例:
|
||||
|
||||
```json
|
||||
{
|
||||
"hotkey_vk": 121,
|
||||
"media_pause_enabled": true,
|
||||
"api_key": "sk-xxxxxxxx",
|
||||
"model": "qwen3-asr-flash-realtime-2026-02-10"
|
||||
}
|
||||
```
|
||||
|
||||
## 技术栈
|
||||
|
||||
- **Rust 2024 Edition**
|
||||
- **windows** crate — Win32 API(托盘图标、热键、SendInput 文字输入)
|
||||
- **cpal** — WASAPI 麦克风采集
|
||||
- **tokio + tokio-tungstenite** — 异步 WebSocket 客户端
|
||||
- **rodio** — 音效播放
|
||||
- **serde** — 配置序列化
|
||||
|
||||
## 系统要求
|
||||
|
||||
- Windows 10/11
|
||||
- 麦克风
|
||||
- 网络连接(用于访问阿里云 ASR API)
|
||||
|
||||
## 许可证
|
||||
|
||||
MIT
|
||||
BIN
assets/idle.ico
Normal file
BIN
assets/idle.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 90 KiB |
BIN
assets/recording.ico
Normal file
BIN
assets/recording.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 110 KiB |
BIN
assets/start.mp3
Normal file
BIN
assets/start.mp3
Normal file
Binary file not shown.
BIN
assets/stop.mp3
Normal file
BIN
assets/stop.mp3
Normal file
Binary file not shown.
7
build.rs
Normal file
7
build.rs
Normal file
@ -0,0 +1,7 @@
|
||||
fn main() {
|
||||
if std::env::var_os("CARGO_CFG_TARGET_OS").as_deref() == Some(std::ffi::OsStr::new("windows")) {
|
||||
let mut res = winres::WindowsResource::new();
|
||||
res.set_icon("assets/idle.ico");
|
||||
res.compile().expect("Failed to compile Windows resources");
|
||||
}
|
||||
}
|
||||
6
convert_icon.py
Normal file
6
convert_icon.py
Normal file
@ -0,0 +1,6 @@
|
||||
from PIL import Image
|
||||
for name in ['idle', 'recording']:
|
||||
img = Image.open(f'{name}.png').convert('RGBA')
|
||||
sizes = [(16,16),(24,24),(32,32),(48,48),(64,64),(128,128),(256,256)]
|
||||
img.save(f'{name}.ico', format='ICO', sizes=sizes)
|
||||
print(f'{name}.ico created')
|
||||
140
src/audio.rs
Normal file
140
src/audio.rs
Normal file
@ -0,0 +1,140 @@
|
||||
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
|
||||
use cpal::{SampleFormat, Stream, StreamConfig};
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
const TARGET_SAMPLE_RATE: u32 = 16000;
|
||||
|
||||
pub struct AudioCapture {
|
||||
stream: Stream,
|
||||
}
|
||||
|
||||
pub struct AudioCaptureConfig {
|
||||
#[allow(dead_code)]
|
||||
pub sample_rate: u32,
|
||||
}
|
||||
|
||||
/// Linear interpolation resampling from `from_rate` to `to_rate`.
|
||||
fn resample(samples: &[i16], from_rate: u32, to_rate: u32) -> Vec<i16> {
|
||||
if from_rate == to_rate {
|
||||
return samples.to_vec();
|
||||
}
|
||||
let ratio = from_rate as f64 / to_rate as f64;
|
||||
let out_len = (samples.len() as f64 / ratio) as usize;
|
||||
let mut output = Vec::with_capacity(out_len);
|
||||
for i in 0..out_len {
|
||||
let src_pos = i as f64 * ratio;
|
||||
let idx = src_pos as usize;
|
||||
let frac = src_pos - idx as f64;
|
||||
let s = if idx + 1 < samples.len() {
|
||||
let a = samples[idx] as f64;
|
||||
let b = samples[idx + 1] as f64;
|
||||
(a + frac * (b - a)) as i16
|
||||
} else {
|
||||
samples[idx.min(samples.len().saturating_sub(1))]
|
||||
};
|
||||
output.push(s);
|
||||
}
|
||||
output
|
||||
}
|
||||
|
||||
/// Mix multi-channel i16 to mono, resample to 16kHz, return PCM bytes.
|
||||
fn process_i16(data: &[i16], channels: u16, source_rate: u32) -> Vec<u8> {
|
||||
let ch = channels as usize;
|
||||
let mono: Vec<i16> = data
|
||||
.chunks(ch)
|
||||
.map(|frame| {
|
||||
let sum: i32 = frame.iter().map(|&s| s as i32).sum();
|
||||
(sum / ch as i32) as i16
|
||||
})
|
||||
.collect();
|
||||
let resampled = resample(&mono, source_rate, TARGET_SAMPLE_RATE);
|
||||
resampled.iter().flat_map(|s| s.to_le_bytes()).collect()
|
||||
}
|
||||
|
||||
/// Mix multi-channel f32 to mono, resample to 16kHz, return PCM bytes.
|
||||
fn process_f32(data: &[f32], channels: u16, source_rate: u32) -> Vec<u8> {
|
||||
let ch = channels as usize;
|
||||
let mono: Vec<i16> = data
|
||||
.chunks(ch)
|
||||
.map(|frame| {
|
||||
let sum: f32 = frame.iter().sum();
|
||||
let m = sum / ch as f32;
|
||||
(m * 32768.0).clamp(-32768.0, 32767.0) as i16
|
||||
})
|
||||
.collect();
|
||||
let resampled = resample(&mono, source_rate, TARGET_SAMPLE_RATE);
|
||||
resampled.iter().flat_map(|s| s.to_le_bytes()).collect()
|
||||
}
|
||||
|
||||
impl AudioCapture {
|
||||
/// Start capturing audio from the default input device.
|
||||
/// Audio data is always resampled to 16kHz mono PCM i16 LE.
|
||||
pub fn start(tx: mpsc::UnboundedSender<Vec<u8>>) -> Result<(Self, AudioCaptureConfig), String> {
|
||||
let host = cpal::default_host();
|
||||
let device = host
|
||||
.default_input_device()
|
||||
.ok_or_else(|| "No input device available".to_string())?;
|
||||
|
||||
let default_config = device
|
||||
.default_input_config()
|
||||
.map_err(|e| format!("Failed to get default input config: {e}"))?;
|
||||
|
||||
let source_rate = default_config.sample_rate().0;
|
||||
let channels = default_config.channels();
|
||||
let sample_format = default_config.sample_format();
|
||||
let config: StreamConfig = default_config.into();
|
||||
|
||||
eprintln!("[voice-ime] Audio device: {source_rate}Hz, {channels}ch, {sample_format:?} → resampling to {TARGET_SAMPLE_RATE}Hz mono");
|
||||
|
||||
let stream = match sample_format {
|
||||
SampleFormat::I16 => {
|
||||
let ch = channels;
|
||||
let rate = source_rate;
|
||||
device
|
||||
.build_input_stream(
|
||||
&config,
|
||||
move |data: &[i16], _: &cpal::InputCallbackInfo| {
|
||||
let bytes = process_i16(data, ch, rate);
|
||||
let _ = tx.send(bytes);
|
||||
},
|
||||
|err| eprintln!("Audio capture error: {err}"),
|
||||
None,
|
||||
)
|
||||
.map_err(|e| format!("Failed to build i16 input stream: {e}"))?
|
||||
}
|
||||
SampleFormat::F32 => {
|
||||
let ch = channels;
|
||||
let rate = source_rate;
|
||||
device
|
||||
.build_input_stream(
|
||||
&config,
|
||||
move |data: &[f32], _: &cpal::InputCallbackInfo| {
|
||||
let bytes = process_f32(data, ch, rate);
|
||||
let _ = tx.send(bytes);
|
||||
},
|
||||
|err| eprintln!("Audio capture error: {err}"),
|
||||
None,
|
||||
)
|
||||
.map_err(|e| format!("Failed to build f32 input stream: {e}"))?
|
||||
}
|
||||
_ => return Err(format!("Unsupported sample format: {sample_format:?}")),
|
||||
};
|
||||
|
||||
stream
|
||||
.play()
|
||||
.map_err(|e| format!("Failed to start audio stream: {e}"))?;
|
||||
|
||||
Ok((
|
||||
AudioCapture { stream },
|
||||
AudioCaptureConfig {
|
||||
sample_rate: TARGET_SAMPLE_RATE,
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for AudioCapture {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.stream.pause();
|
||||
}
|
||||
}
|
||||
116
src/config.rs
Normal file
116
src/config.rs
Normal file
@ -0,0 +1,116 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cell::RefCell;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
|
||||
pub const DEFAULT_MODEL: &str = "qwen3-asr-flash-realtime-2026-02-10";
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct Config {
|
||||
/// Virtual key code for the hotkey (default: VK_F10 = 0x79)
|
||||
pub hotkey_vk: u16,
|
||||
/// Whether to send media play/pause when toggling recording
|
||||
pub media_pause_enabled: bool,
|
||||
/// API key for Qwen3 ASR service
|
||||
#[serde(default)]
|
||||
pub api_key: String,
|
||||
/// ASR model name
|
||||
#[serde(default = "default_model")]
|
||||
pub model: String,
|
||||
}
|
||||
|
||||
fn default_model() -> String {
|
||||
DEFAULT_MODEL.to_string()
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Config {
|
||||
hotkey_vk: 0x79, // VK_F10
|
||||
media_pause_enabled: true,
|
||||
api_key: String::new(),
|
||||
model: DEFAULT_MODEL.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn config_path() -> PathBuf {
|
||||
let dir = std::env::var_os("APPDATA")
|
||||
.map(PathBuf::from)
|
||||
.unwrap_or_else(|| {
|
||||
dirs_fallback_appdata()
|
||||
})
|
||||
.join("voice-ime");
|
||||
let _ = fs::create_dir_all(&dir);
|
||||
dir.join("config.json")
|
||||
}
|
||||
|
||||
/// Fallback: %USERPROFILE%\AppData\Roaming
|
||||
fn dirs_fallback_appdata() -> PathBuf {
|
||||
std::env::var_os("USERPROFILE")
|
||||
.map(|p| PathBuf::from(p).join("AppData").join("Roaming"))
|
||||
.unwrap_or_else(|| PathBuf::from("."))
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
static CONFIG: RefCell<Config> = RefCell::new(Config::default());
|
||||
}
|
||||
|
||||
pub fn load() {
|
||||
let path = config_path();
|
||||
if let Ok(data) = fs::read_to_string(&path) {
|
||||
if let Ok(cfg) = serde_json::from_str::<Config>(&data) {
|
||||
CONFIG.with(|c| *c.borrow_mut() = cfg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn save() {
|
||||
let path = config_path();
|
||||
CONFIG.with(|c| {
|
||||
if let Ok(json) = serde_json::to_string_pretty(&*c.borrow()) {
|
||||
let _ = fs::write(&path, json);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
pub fn get() -> Config {
|
||||
CONFIG.with(|c| c.borrow().clone())
|
||||
}
|
||||
|
||||
pub fn set_hotkey_vk(vk: u16) {
|
||||
CONFIG.with(|c| c.borrow_mut().hotkey_vk = vk);
|
||||
save();
|
||||
}
|
||||
|
||||
pub fn set_media_pause(enabled: bool) {
|
||||
CONFIG.with(|c| c.borrow_mut().media_pause_enabled = enabled);
|
||||
save();
|
||||
}
|
||||
|
||||
pub fn set_api_key(key: String) {
|
||||
CONFIG.with(|c| c.borrow_mut().api_key = key);
|
||||
save();
|
||||
}
|
||||
|
||||
pub fn set_model(model: String) {
|
||||
CONFIG.with(|c| c.borrow_mut().model = model);
|
||||
save();
|
||||
}
|
||||
|
||||
/// Return a display name for a virtual key code.
|
||||
pub fn vk_name(vk: u16) -> String {
|
||||
match vk {
|
||||
0x70..=0x87 => format!("F{}", vk - 0x70 + 1),
|
||||
0x21 => "PageUp".into(),
|
||||
0x22 => "PageDown".into(),
|
||||
0x23 => "End".into(),
|
||||
0x24 => "Home".into(),
|
||||
0x2D => "Insert".into(),
|
||||
0x2E => "Delete".into(),
|
||||
0x13 => "Pause".into(),
|
||||
0x91 => "ScrollLock".into(),
|
||||
0xC0 => "`".into(),
|
||||
_ => format!("VK(0x{vk:02X})"),
|
||||
}
|
||||
}
|
||||
139
src/input.rs
Normal file
139
src/input.rs
Normal file
@ -0,0 +1,139 @@
|
||||
use std::mem;
|
||||
use windows::Win32::UI::Input::KeyboardAndMouse::{
|
||||
SendInput, INPUT, INPUT_KEYBOARD, KEYBDINPUT, KEYEVENTF_KEYUP, KEYEVENTF_UNICODE,
|
||||
VIRTUAL_KEY, VK_BACK, VK_MEDIA_PLAY_PAUSE,
|
||||
};
|
||||
|
||||
fn send_inputs(inputs: &[INPUT]) {
|
||||
if inputs.is_empty() {
|
||||
return;
|
||||
}
|
||||
unsafe {
|
||||
SendInput(inputs, mem::size_of::<INPUT>() as i32);
|
||||
}
|
||||
}
|
||||
|
||||
fn make_unicode_key_down(scan: u16) -> INPUT {
|
||||
INPUT {
|
||||
r#type: INPUT_KEYBOARD,
|
||||
Anonymous: windows::Win32::UI::Input::KeyboardAndMouse::INPUT_0 {
|
||||
ki: KEYBDINPUT {
|
||||
wVk: VIRTUAL_KEY(0),
|
||||
wScan: scan,
|
||||
dwFlags: KEYEVENTF_UNICODE,
|
||||
time: 0,
|
||||
dwExtraInfo: 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn make_unicode_key_up(scan: u16) -> INPUT {
|
||||
INPUT {
|
||||
r#type: INPUT_KEYBOARD,
|
||||
Anonymous: windows::Win32::UI::Input::KeyboardAndMouse::INPUT_0 {
|
||||
ki: KEYBDINPUT {
|
||||
wVk: VIRTUAL_KEY(0),
|
||||
wScan: scan,
|
||||
dwFlags: KEYEVENTF_UNICODE | KEYEVENTF_KEYUP,
|
||||
time: 0,
|
||||
dwExtraInfo: 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn make_vk_key_down(vk: VIRTUAL_KEY) -> INPUT {
|
||||
INPUT {
|
||||
r#type: INPUT_KEYBOARD,
|
||||
Anonymous: windows::Win32::UI::Input::KeyboardAndMouse::INPUT_0 {
|
||||
ki: KEYBDINPUT {
|
||||
wVk: vk,
|
||||
wScan: 0,
|
||||
dwFlags: windows::Win32::UI::Input::KeyboardAndMouse::KEYBD_EVENT_FLAGS(0),
|
||||
time: 0,
|
||||
dwExtraInfo: 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn make_vk_key_up(vk: VIRTUAL_KEY) -> INPUT {
|
||||
INPUT {
|
||||
r#type: INPUT_KEYBOARD,
|
||||
Anonymous: windows::Win32::UI::Input::KeyboardAndMouse::INPUT_0 {
|
||||
ki: KEYBDINPUT {
|
||||
wVk: vk,
|
||||
wScan: 0,
|
||||
dwFlags: KEYEVENTF_KEYUP,
|
||||
time: 0,
|
||||
dwExtraInfo: 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Send N backspace key presses.
|
||||
fn send_backspaces(count: usize) {
|
||||
if count == 0 {
|
||||
return;
|
||||
}
|
||||
let mut inputs = Vec::with_capacity(count * 2);
|
||||
for _ in 0..count {
|
||||
inputs.push(make_vk_key_down(VK_BACK));
|
||||
inputs.push(make_vk_key_up(VK_BACK));
|
||||
}
|
||||
send_inputs(&inputs);
|
||||
}
|
||||
|
||||
/// Type a string using SendInput with KEYEVENTF_UNICODE.
|
||||
/// Handles surrogate pairs for characters outside BMP.
|
||||
fn send_unicode_string(text: &str) {
|
||||
if text.is_empty() {
|
||||
return;
|
||||
}
|
||||
let mut inputs = Vec::new();
|
||||
for c in text.chars() {
|
||||
let mut buf = [0u16; 2];
|
||||
let encoded = c.encode_utf16(&mut buf);
|
||||
for &code_unit in encoded.iter() {
|
||||
inputs.push(make_unicode_key_down(code_unit));
|
||||
inputs.push(make_unicode_key_up(code_unit));
|
||||
}
|
||||
}
|
||||
send_inputs(&inputs);
|
||||
}
|
||||
|
||||
/// Compute the common prefix length (in chars) between two strings.
|
||||
fn common_prefix_chars(a: &str, b: &str) -> usize {
|
||||
a.chars()
|
||||
.zip(b.chars())
|
||||
.take_while(|(ca, cb)| ca == cb)
|
||||
.count()
|
||||
}
|
||||
|
||||
/// Given the previously inserted text and the new full text from ASR,
|
||||
/// send the minimal backspaces + new characters to update the input field.
|
||||
/// Returns the new "last inserted" text (i.e. `current`).
|
||||
pub fn apply_text_update(last: &str, current: &str) -> String {
|
||||
let prefix_len = common_prefix_chars(last, current);
|
||||
let last_char_count = last.chars().count();
|
||||
let backspace_count = last_char_count - prefix_len;
|
||||
|
||||
// Get the byte offset where the common prefix ends in `current`
|
||||
let new_suffix: String = current.chars().skip(prefix_len).collect();
|
||||
|
||||
send_backspaces(backspace_count);
|
||||
send_unicode_string(&new_suffix);
|
||||
|
||||
current.to_string()
|
||||
}
|
||||
|
||||
/// Simulate pressing the media Play/Pause key.
|
||||
pub fn send_media_play_pause() {
|
||||
let inputs = [
|
||||
make_vk_key_down(VK_MEDIA_PLAY_PAUSE),
|
||||
make_vk_key_up(VK_MEDIA_PLAY_PAUSE),
|
||||
];
|
||||
send_inputs(&inputs);
|
||||
}
|
||||
679
src/main.rs
Normal file
679
src/main.rs
Normal file
@ -0,0 +1,679 @@
|
||||
#![windows_subsystem = "windows"]
|
||||
|
||||
mod audio;
|
||||
mod config;
|
||||
mod input;
|
||||
mod session;
|
||||
mod sound;
|
||||
mod ws;
|
||||
|
||||
use session::RecordingSession;
|
||||
use std::cell::RefCell;
|
||||
use windows::core::{w, PCWSTR};
|
||||
use windows::Win32::Foundation::{HWND, LPARAM, LRESULT, WPARAM};
|
||||
use windows::Win32::Graphics::Gdi::{GetStockObject, BLACK_BRUSH};
|
||||
use windows::Win32::System::LibraryLoader::GetModuleHandleW;
|
||||
use windows::Win32::UI::Input::KeyboardAndMouse::{
|
||||
RegisterHotKey, UnregisterHotKey, HOT_KEY_MODIFIERS,
|
||||
};
|
||||
use windows::Win32::UI::Shell::{
|
||||
Shell_NotifyIconW, NIF_ICON, NIF_MESSAGE, NIF_TIP, NIM_ADD, NIM_DELETE, NIM_MODIFY,
|
||||
NOTIFYICONDATAW,
|
||||
};
|
||||
use windows::Win32::UI::WindowsAndMessaging::*;
|
||||
|
||||
static ICO_IDLE: &[u8] = include_bytes!("../assets/idle.ico");
|
||||
static ICO_RECORDING: &[u8] = include_bytes!("../assets/recording.ico");
|
||||
|
||||
/// Parse an ICO file and load the best-matching icon entry as an HICON.
|
||||
fn load_icon_from_ico(data: &[u8]) -> HICON {
|
||||
// ICO header: reserved(2) + type(2) + count(2) = 6 bytes
|
||||
// Each entry: width(1) + height(1) + colorCount(1) + reserved(1)
|
||||
// + planes(2) + bitCount(2) + bytesInRes(4) + imageOffset(4) = 16 bytes
|
||||
if data.len() < 6 {
|
||||
return HICON::default();
|
||||
}
|
||||
let count = u16::from_le_bytes([data[4], data[5]]) as usize;
|
||||
if count == 0 || data.len() < 6 + count * 16 {
|
||||
return HICON::default();
|
||||
}
|
||||
|
||||
// Get system small icon size for tray
|
||||
let desired = unsafe { GetSystemMetrics(SM_CXSMICON) } as u32;
|
||||
|
||||
// Find best entry: prefer exact match on desired size, else closest larger, else largest
|
||||
let mut best_idx = 0;
|
||||
let mut best_w = 0u32;
|
||||
for i in 0..count {
|
||||
let off = 6 + i * 16;
|
||||
let w = if data[off] == 0 { 256 } else { data[off] as u32 };
|
||||
if w == desired {
|
||||
best_idx = i;
|
||||
best_w = w;
|
||||
break;
|
||||
}
|
||||
if (w >= desired && (best_w < desired || w < best_w))
|
||||
|| (best_w < desired && w > best_w)
|
||||
{
|
||||
best_idx = i;
|
||||
best_w = w;
|
||||
}
|
||||
}
|
||||
|
||||
let off = 6 + best_idx * 16;
|
||||
let bytes_in_res = u32::from_le_bytes([data[off + 8], data[off + 9], data[off + 10], data[off + 11]]) as usize;
|
||||
let image_offset = u32::from_le_bytes([data[off + 12], data[off + 13], data[off + 14], data[off + 15]]) as usize;
|
||||
|
||||
if data.len() < image_offset + bytes_in_res {
|
||||
return HICON::default();
|
||||
}
|
||||
|
||||
let image_data = &data[image_offset..image_offset + bytes_in_res];
|
||||
unsafe {
|
||||
CreateIconFromResourceEx(
|
||||
image_data,
|
||||
true,
|
||||
0x00030000,
|
||||
desired as i32,
|
||||
desired as i32,
|
||||
LR_DEFAULTCOLOR,
|
||||
)
|
||||
.unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
const WM_TRAYICON: u32 = WM_APP + 1;
|
||||
const HOTKEY_ID: i32 = 1;
|
||||
const IDM_EXIT: usize = 1001;
|
||||
const IDM_CHANGE_HOTKEY: usize = 1002;
|
||||
const IDM_TOGGLE_MEDIA_PAUSE: usize = 1003;
|
||||
const IDM_SET_API_KEY: usize = 1004;
|
||||
const IDM_SET_MODEL: usize = 1005;
|
||||
|
||||
thread_local! {
|
||||
static SESSION: RefCell<Option<RecordingSession>> = RefCell::new(None);
|
||||
static HWND_MAIN: RefCell<HWND> = RefCell::new(HWND::default());
|
||||
/// When true, next key press in the dialog sets the hotkey.
|
||||
static PICKING_HOTKEY: RefCell<bool> = RefCell::new(false);
|
||||
}
|
||||
|
||||
fn set_tray_tooltip(hwnd: HWND, tip: &str, recording: bool) {
|
||||
let mut nid = NOTIFYICONDATAW {
|
||||
cbSize: std::mem::size_of::<NOTIFYICONDATAW>() as u32,
|
||||
hWnd: hwnd,
|
||||
uID: 1,
|
||||
uFlags: NIF_TIP | NIF_ICON,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Set tooltip
|
||||
let tip_wide: Vec<u16> = tip.encode_utf16().chain(std::iter::once(0)).collect();
|
||||
let len = tip_wide.len().min(nid.szTip.len());
|
||||
nid.szTip[..len].copy_from_slice(&tip_wide[..len]);
|
||||
|
||||
nid.hIcon = if recording {
|
||||
load_icon_from_ico(ICO_RECORDING)
|
||||
} else {
|
||||
load_icon_from_ico(ICO_IDLE)
|
||||
};
|
||||
|
||||
unsafe {
|
||||
let _ = Shell_NotifyIconW(NIM_MODIFY, &nid);
|
||||
}
|
||||
}
|
||||
|
||||
fn toggle_recording(hwnd: HWND) {
|
||||
SESSION.with(|s| {
|
||||
let mut session = s.borrow_mut();
|
||||
let cfg = config::get();
|
||||
if session.is_some() {
|
||||
// Stop recording
|
||||
session.take().unwrap().stop();
|
||||
set_tray_tooltip(hwnd, "语音输入 - 空闲", false);
|
||||
sound::play_stop();
|
||||
if cfg.media_pause_enabled {
|
||||
input::send_media_play_pause();
|
||||
}
|
||||
} else {
|
||||
// Start recording
|
||||
if cfg.media_pause_enabled {
|
||||
input::send_media_play_pause();
|
||||
}
|
||||
match RecordingSession::start() {
|
||||
Ok(s) => {
|
||||
*session = Some(s);
|
||||
set_tray_tooltip(hwnd, "语音输入 - 录音中...", true);
|
||||
sound::play_start();
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("[voice-ime] Failed to start recording: {e}");
|
||||
set_tray_tooltip(hwnd, &format!("语音输入 - 错误: {e}"), false);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Show a generic single-line text input dialog. Returns Some(text) if confirmed, None if cancelled.
|
||||
fn show_text_input_dialog(hwnd: HWND, title: &str, label: &str, initial: &str) -> Option<String> {
|
||||
unsafe {
|
||||
let instance = GetModuleHandleW(None).unwrap();
|
||||
|
||||
let wc = WNDCLASSEXW {
|
||||
cbSize: std::mem::size_of::<WNDCLASSEXW>() as u32,
|
||||
lpfnWndProc: Some(textinput_wnd_proc),
|
||||
hInstance: instance.into(),
|
||||
lpszClassName: w!("VoiceIMETextInput"),
|
||||
hbrBackground: std::mem::transmute(GetStockObject(
|
||||
windows::Win32::Graphics::Gdi::WHITE_BRUSH,
|
||||
)),
|
||||
..Default::default()
|
||||
};
|
||||
RegisterClassExW(&wc);
|
||||
|
||||
let screen_w = GetSystemMetrics(SM_CXSCREEN);
|
||||
let screen_h = GetSystemMetrics(SM_CYSCREEN);
|
||||
let dlg_w = 420;
|
||||
let dlg_h = 160;
|
||||
|
||||
TEXTINPUT_RESULT.with(|r| *r.borrow_mut() = None);
|
||||
TEXTINPUT_LABEL.with(|r| *r.borrow_mut() = label.to_string());
|
||||
TEXTINPUT_INITIAL.with(|r| *r.borrow_mut() = initial.to_string());
|
||||
|
||||
let mut title_w: Vec<u16> = title.encode_utf16().collect();
|
||||
title_w.push(0);
|
||||
|
||||
let dlg = CreateWindowExW(
|
||||
WS_EX_TOPMOST | WS_EX_TOOLWINDOW,
|
||||
w!("VoiceIMETextInput"),
|
||||
PCWSTR(title_w.as_ptr()),
|
||||
WS_POPUP | WS_CAPTION | WS_SYSMENU,
|
||||
(screen_w - dlg_w) / 2,
|
||||
(screen_h - dlg_h) / 2,
|
||||
dlg_w,
|
||||
dlg_h,
|
||||
Some(hwnd),
|
||||
None,
|
||||
Some(instance.into()),
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let _ = ShowWindow(dlg, SW_SHOW);
|
||||
let _ = SetForegroundWindow(dlg);
|
||||
|
||||
let mut msg = MSG::default();
|
||||
while GetMessageW(&mut msg, None, 0, 0).as_bool() {
|
||||
if !IsWindow(Some(dlg)).as_bool() {
|
||||
break;
|
||||
}
|
||||
let _ = TranslateMessage(&msg);
|
||||
DispatchMessageW(&msg);
|
||||
}
|
||||
|
||||
TEXTINPUT_RESULT.with(|r| r.borrow().clone()).filter(|s| !s.is_empty())
|
||||
}
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
static TEXTINPUT_RESULT: RefCell<Option<String>> = RefCell::new(None);
|
||||
static TEXTINPUT_LABEL: RefCell<String> = RefCell::new(String::new());
|
||||
static TEXTINPUT_INITIAL: RefCell<String> = RefCell::new(String::new());
|
||||
static TEXTINPUT_EDIT_HWND: RefCell<HWND> = RefCell::new(HWND::default());
|
||||
}
|
||||
|
||||
const IDC_TEXTINPUT_EDIT: i32 = 3001;
|
||||
const IDC_TEXTINPUT_OK: i32 = 3002;
|
||||
|
||||
unsafe extern "system" fn textinput_wnd_proc(
|
||||
hwnd: HWND,
|
||||
msg: u32,
|
||||
wparam: WPARAM,
|
||||
lparam: LPARAM,
|
||||
) -> LRESULT {
|
||||
match msg {
|
||||
WM_CREATE => {
|
||||
let instance = unsafe { GetModuleHandleW(None).unwrap() };
|
||||
|
||||
let label_text = TEXTINPUT_LABEL.with(|r| r.borrow().clone());
|
||||
let mut label_w: Vec<u16> = label_text.encode_utf16().collect();
|
||||
label_w.push(0);
|
||||
unsafe {
|
||||
let _ = CreateWindowExW(
|
||||
WINDOW_EX_STYLE::default(),
|
||||
w!("STATIC"),
|
||||
PCWSTR(label_w.as_ptr()),
|
||||
WS_CHILD | WS_VISIBLE,
|
||||
15, 15, 380, 20,
|
||||
Some(hwnd), None, Some(instance.into()), None,
|
||||
);
|
||||
}
|
||||
|
||||
let init_text = TEXTINPUT_INITIAL.with(|r| r.borrow().clone());
|
||||
let mut init_w: Vec<u16> = init_text.encode_utf16().collect();
|
||||
init_w.push(0);
|
||||
let edit = unsafe {
|
||||
CreateWindowExW(
|
||||
WS_EX_CLIENTEDGE,
|
||||
w!("EDIT"),
|
||||
PCWSTR(init_w.as_ptr()),
|
||||
WS_CHILD | WS_VISIBLE | WINDOW_STYLE(0x0080),
|
||||
15, 42, 375, 25,
|
||||
Some(hwnd), Some(HMENU(IDC_TEXTINPUT_EDIT as _)),
|
||||
Some(instance.into()), None,
|
||||
).unwrap()
|
||||
};
|
||||
TEXTINPUT_EDIT_HWND.with(|h| *h.borrow_mut() = edit);
|
||||
|
||||
unsafe {
|
||||
let _ = CreateWindowExW(
|
||||
WINDOW_EX_STYLE::default(),
|
||||
w!("BUTTON"),
|
||||
w!("确定"),
|
||||
WS_CHILD | WS_VISIBLE | WINDOW_STYLE(0x0001),
|
||||
160, 80, 90, 30,
|
||||
Some(hwnd), Some(HMENU(IDC_TEXTINPUT_OK as _)),
|
||||
Some(instance.into()), None,
|
||||
);
|
||||
}
|
||||
|
||||
unsafe {
|
||||
let _ = SendMessageW(edit, 0x00B1, Some(WPARAM(0)), Some(LPARAM(-1)));
|
||||
let _ = windows::Win32::UI::Input::KeyboardAndMouse::SetFocus(Some(edit));
|
||||
}
|
||||
LRESULT(0)
|
||||
}
|
||||
WM_COMMAND => {
|
||||
let id = (wparam.0 & 0xFFFF) as i32;
|
||||
if id == IDC_TEXTINPUT_OK {
|
||||
let edit = TEXTINPUT_EDIT_HWND.with(|h| *h.borrow());
|
||||
let len = unsafe { GetWindowTextLengthW(edit) } as usize;
|
||||
let mut buf = vec![0u16; len + 1];
|
||||
unsafe { GetWindowTextW(edit, &mut buf) };
|
||||
let text = String::from_utf16_lossy(&buf[..len]);
|
||||
TEXTINPUT_RESULT.with(|r| *r.borrow_mut() = Some(text.trim().to_string()));
|
||||
unsafe { let _ = DestroyWindow(hwnd); }
|
||||
}
|
||||
LRESULT(0)
|
||||
}
|
||||
WM_CLOSE => {
|
||||
TEXTINPUT_RESULT.with(|r| *r.borrow_mut() = Some(String::new()));
|
||||
unsafe { let _ = DestroyWindow(hwnd); }
|
||||
LRESULT(0)
|
||||
}
|
||||
WM_DESTROY => {
|
||||
unsafe { PostQuitMessage(0); }
|
||||
LRESULT(0)
|
||||
}
|
||||
_ => unsafe { DefWindowProcW(hwnd, msg, wparam, lparam) },
|
||||
}
|
||||
}
|
||||
|
||||
/// Show API key input dialog. Returns true if user provided a key.
|
||||
fn show_api_key_dialog(hwnd: HWND) -> bool {
|
||||
let current = config::get().api_key;
|
||||
if let Some(key) = show_text_input_dialog(hwnd, "设置 API Key", "请输入 Qwen ASR API Key:", ¤t) {
|
||||
config::set_api_key(key);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Show model input dialog.
|
||||
fn show_model_dialog(hwnd: HWND) {
|
||||
let current = config::get().model;
|
||||
if let Some(model) = show_text_input_dialog(hwnd, "设置模型", "请输入 ASR 模型名称:", ¤t) {
|
||||
config::set_model(model);
|
||||
}
|
||||
}
|
||||
|
||||
fn show_context_menu(hwnd: HWND) {
|
||||
unsafe {
|
||||
let cfg = config::get();
|
||||
let menu = CreatePopupMenu().unwrap();
|
||||
|
||||
// Hotkey item
|
||||
let hotkey_label: Vec<u16> = format!("设置快捷键 (当前: {})\0", config::vk_name(cfg.hotkey_vk))
|
||||
.encode_utf16()
|
||||
.collect();
|
||||
AppendMenuW(menu, MF_STRING, IDM_CHANGE_HOTKEY, PCWSTR(hotkey_label.as_ptr())).unwrap();
|
||||
|
||||
// Media pause toggle
|
||||
let pause_label: Vec<u16> = "录音时暂停媒体播放\0".encode_utf16().collect();
|
||||
let flags = if cfg.media_pause_enabled {
|
||||
MF_STRING | MF_CHECKED
|
||||
} else {
|
||||
MF_STRING | MF_UNCHECKED
|
||||
};
|
||||
AppendMenuW(menu, flags, IDM_TOGGLE_MEDIA_PAUSE, PCWSTR(pause_label.as_ptr())).unwrap();
|
||||
|
||||
// API Key
|
||||
let api_key_label: Vec<u16> = "设置 API Key...\0".encode_utf16().collect();
|
||||
AppendMenuW(menu, MF_STRING, IDM_SET_API_KEY, PCWSTR(api_key_label.as_ptr())).unwrap();
|
||||
|
||||
// Model
|
||||
let model_label: Vec<u16> = format!("设置模型 ({})...\0", cfg.model).encode_utf16().collect();
|
||||
AppendMenuW(menu, MF_STRING, IDM_SET_MODEL, PCWSTR(model_label.as_ptr())).unwrap();
|
||||
|
||||
// Separator
|
||||
AppendMenuW(menu, MF_SEPARATOR, 0, None).unwrap();
|
||||
|
||||
// Exit
|
||||
let exit_label: Vec<u16> = "退出\0".encode_utf16().collect();
|
||||
AppendMenuW(menu, MF_STRING, IDM_EXIT, PCWSTR(exit_label.as_ptr())).unwrap();
|
||||
|
||||
let mut pt = windows::Win32::Foundation::POINT::default();
|
||||
let _ = GetCursorPos(&mut pt);
|
||||
|
||||
// Required for the menu to disappear when clicking outside
|
||||
let _ = SetForegroundWindow(hwnd);
|
||||
|
||||
let _ = TrackPopupMenu(menu, TPM_BOTTOMALIGN | TPM_LEFTALIGN, pt.x, pt.y, Some(0), hwnd, None);
|
||||
let _ = DestroyMenu(menu);
|
||||
}
|
||||
}
|
||||
|
||||
/// Register the global hotkey using the current config.
|
||||
fn register_configured_hotkey(hwnd: HWND) -> bool {
|
||||
let vk = config::get().hotkey_vk;
|
||||
unsafe {
|
||||
RegisterHotKey(
|
||||
Some(hwnd),
|
||||
HOTKEY_ID,
|
||||
HOT_KEY_MODIFIERS(0),
|
||||
vk as u32,
|
||||
)
|
||||
.is_ok()
|
||||
}
|
||||
}
|
||||
|
||||
/// Show a small dialog that captures the next key press as the new hotkey.
|
||||
fn show_hotkey_picker(hwnd: HWND) {
|
||||
// Create a small popup window for key capture
|
||||
unsafe {
|
||||
let instance = GetModuleHandleW(None).unwrap();
|
||||
|
||||
// Register class for picker window (once is fine, re-register is harmless)
|
||||
let wc = WNDCLASSEXW {
|
||||
cbSize: std::mem::size_of::<WNDCLASSEXW>() as u32,
|
||||
lpfnWndProc: Some(picker_wnd_proc),
|
||||
hInstance: instance.into(),
|
||||
lpszClassName: w!("VoiceIMEPicker"),
|
||||
hbrBackground: std::mem::transmute(GetStockObject(
|
||||
windows::Win32::Graphics::Gdi::WHITE_BRUSH,
|
||||
)),
|
||||
..Default::default()
|
||||
};
|
||||
RegisterClassExW(&wc);
|
||||
|
||||
// Get screen center
|
||||
let screen_w = GetSystemMetrics(SM_CXSCREEN);
|
||||
let screen_h = GetSystemMetrics(SM_CYSCREEN);
|
||||
let dlg_w = 320;
|
||||
let dlg_h = 120;
|
||||
|
||||
let picker = CreateWindowExW(
|
||||
WS_EX_TOPMOST | WS_EX_TOOLWINDOW,
|
||||
w!("VoiceIMEPicker"),
|
||||
w!("设置快捷键"),
|
||||
WS_POPUP | WS_CAPTION | WS_SYSMENU,
|
||||
(screen_w - dlg_w) / 2,
|
||||
(screen_h - dlg_h) / 2,
|
||||
dlg_w,
|
||||
dlg_h,
|
||||
Some(hwnd),
|
||||
None,
|
||||
Some(instance.into()),
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Unregister current hotkey so the key can be captured
|
||||
let _ = UnregisterHotKey(Some(hwnd), HOTKEY_ID);
|
||||
PICKING_HOTKEY.with(|p| *p.borrow_mut() = true);
|
||||
|
||||
let _ = ShowWindow(picker, SW_SHOW);
|
||||
let _ = SetForegroundWindow(picker);
|
||||
|
||||
// Run a modal message loop for the picker
|
||||
let mut msg = MSG::default();
|
||||
while GetMessageW(&mut msg, None, 0, 0).as_bool() {
|
||||
let _ = TranslateMessage(&msg);
|
||||
DispatchMessageW(&msg);
|
||||
if !PICKING_HOTKEY.with(|p| *p.borrow()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Re-register hotkey with (possibly new) config
|
||||
if !register_configured_hotkey(hwnd) {
|
||||
let text: Vec<u16> = format!(
|
||||
"无法注册快捷键 {},可能被其他程序占用。\0",
|
||||
config::vk_name(config::get().hotkey_vk)
|
||||
)
|
||||
.encode_utf16()
|
||||
.collect();
|
||||
let title: Vec<u16> = "语音输入\0".encode_utf16().collect();
|
||||
MessageBoxW(
|
||||
Some(hwnd),
|
||||
PCWSTR(text.as_ptr()),
|
||||
PCWSTR(title.as_ptr()),
|
||||
MB_OK | MB_ICONWARNING,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe extern "system" fn picker_wnd_proc(
|
||||
hwnd: HWND,
|
||||
msg: u32,
|
||||
wparam: WPARAM,
|
||||
lparam: LPARAM,
|
||||
) -> LRESULT {
|
||||
match msg {
|
||||
WM_CREATE => {
|
||||
// Create a static label
|
||||
let text: Vec<u16> = "请按下新的快捷键...\0".encode_utf16().collect();
|
||||
let instance = unsafe { GetModuleHandleW(None).unwrap() };
|
||||
unsafe {
|
||||
let _ = CreateWindowExW(
|
||||
WINDOW_EX_STYLE::default(),
|
||||
w!("STATIC"),
|
||||
PCWSTR(text.as_ptr()),
|
||||
WS_CHILD | WS_VISIBLE | WINDOW_STYLE(0x01),
|
||||
0,
|
||||
25,
|
||||
320,
|
||||
40,
|
||||
Some(hwnd),
|
||||
None,
|
||||
Some(instance.into()),
|
||||
None,
|
||||
);
|
||||
}
|
||||
LRESULT(0)
|
||||
}
|
||||
WM_KEYDOWN | WM_SYSKEYDOWN => {
|
||||
let vk = (wparam.0 & 0xFF) as u16;
|
||||
// Ignore modifier-only keys
|
||||
if !matches!(vk, 0x10 | 0x11 | 0x12 | 0xA0..=0xA5) {
|
||||
config::set_hotkey_vk(vk);
|
||||
PICKING_HOTKEY.with(|p| *p.borrow_mut() = false);
|
||||
unsafe { let _ = DestroyWindow(hwnd); }
|
||||
}
|
||||
LRESULT(0)
|
||||
}
|
||||
WM_CLOSE => {
|
||||
PICKING_HOTKEY.with(|p| *p.borrow_mut() = false);
|
||||
unsafe { let _ = DestroyWindow(hwnd); }
|
||||
LRESULT(0)
|
||||
}
|
||||
WM_DESTROY => {
|
||||
PICKING_HOTKEY.with(|p| *p.borrow_mut() = false);
|
||||
LRESULT(0)
|
||||
}
|
||||
_ => unsafe { DefWindowProcW(hwnd, msg, wparam, lparam) },
|
||||
}
|
||||
}
|
||||
|
||||
unsafe extern "system" fn wnd_proc(
|
||||
hwnd: HWND,
|
||||
msg: u32,
|
||||
wparam: WPARAM,
|
||||
lparam: LPARAM,
|
||||
) -> LRESULT {
|
||||
match msg {
|
||||
WM_HOTKEY => {
|
||||
if wparam.0 == HOTKEY_ID as usize {
|
||||
toggle_recording(hwnd);
|
||||
}
|
||||
LRESULT(0)
|
||||
}
|
||||
WM_COMMAND => {
|
||||
let id = (wparam.0 & 0xFFFF) as usize;
|
||||
if id == IDM_EXIT {
|
||||
// Clean up and exit
|
||||
SESSION.with(|s| {
|
||||
let mut session = s.borrow_mut();
|
||||
if let Some(mut sess) = session.take() {
|
||||
sess.stop();
|
||||
}
|
||||
});
|
||||
unsafe { DestroyWindow(hwnd).unwrap() };
|
||||
} else if id == IDM_CHANGE_HOTKEY {
|
||||
show_hotkey_picker(hwnd);
|
||||
// Update tray tooltip with new hotkey name
|
||||
set_tray_tooltip(hwnd, &format!("语音输入 ({})", config::vk_name(config::get().hotkey_vk)), false);
|
||||
} else if id == IDM_TOGGLE_MEDIA_PAUSE {
|
||||
let enabled = config::get().media_pause_enabled;
|
||||
config::set_media_pause(!enabled);
|
||||
} else if id == IDM_SET_API_KEY {
|
||||
show_api_key_dialog(hwnd);
|
||||
} else if id == IDM_SET_MODEL {
|
||||
show_model_dialog(hwnd);
|
||||
}
|
||||
LRESULT(0)
|
||||
}
|
||||
x if x == WM_TRAYICON => {
|
||||
let event = (lparam.0 & 0xFFFF) as u32;
|
||||
if event == WM_RBUTTONUP {
|
||||
show_context_menu(hwnd);
|
||||
}
|
||||
LRESULT(0)
|
||||
}
|
||||
WM_DESTROY => {
|
||||
// Remove tray icon
|
||||
let nid = NOTIFYICONDATAW {
|
||||
cbSize: std::mem::size_of::<NOTIFYICONDATAW>() as u32,
|
||||
hWnd: hwnd,
|
||||
uID: 1,
|
||||
..Default::default()
|
||||
};
|
||||
unsafe { let _ = Shell_NotifyIconW(NIM_DELETE, &nid); };
|
||||
|
||||
// Unregister hotkey
|
||||
unsafe { let _ = UnregisterHotKey(Some(hwnd), HOTKEY_ID); };
|
||||
|
||||
unsafe { PostQuitMessage(0) };
|
||||
LRESULT(0)
|
||||
}
|
||||
_ => unsafe { DefWindowProcW(hwnd, msg, wparam, lparam) },
|
||||
}
|
||||
}
|
||||
|
||||
fn create_hidden_window() -> HWND {
|
||||
unsafe {
|
||||
let instance = GetModuleHandleW(None).unwrap();
|
||||
|
||||
let wc = WNDCLASSEXW {
|
||||
cbSize: std::mem::size_of::<WNDCLASSEXW>() as u32,
|
||||
lpfnWndProc: Some(wnd_proc),
|
||||
hInstance: instance.into(),
|
||||
lpszClassName: w!("VoiceIMEWindow"),
|
||||
hbrBackground: std::mem::transmute(GetStockObject(BLACK_BRUSH)),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
RegisterClassExW(&wc);
|
||||
|
||||
let hwnd = CreateWindowExW(
|
||||
WINDOW_EX_STYLE::default(),
|
||||
w!("VoiceIMEWindow"),
|
||||
w!("Voice IME"),
|
||||
WS_OVERLAPPEDWINDOW,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
Some(HWND_MESSAGE), // Message-only window
|
||||
None,
|
||||
Some(instance.into()),
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
hwnd
|
||||
}
|
||||
}
|
||||
|
||||
fn create_tray_icon(hwnd: HWND) {
|
||||
let mut nid = NOTIFYICONDATAW {
|
||||
cbSize: std::mem::size_of::<NOTIFYICONDATAW>() as u32,
|
||||
hWnd: hwnd,
|
||||
uID: 1,
|
||||
uFlags: NIF_MESSAGE | NIF_ICON | NIF_TIP,
|
||||
uCallbackMessage: WM_TRAYICON,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
nid.hIcon = load_icon_from_ico(ICO_IDLE);
|
||||
|
||||
let tip = "语音输入 - 空闲 (F10)";
|
||||
let tip_wide: Vec<u16> = tip.encode_utf16().chain(std::iter::once(0)).collect();
|
||||
let len = tip_wide.len().min(nid.szTip.len());
|
||||
nid.szTip[..len].copy_from_slice(&tip_wide[..len]);
|
||||
|
||||
unsafe {
|
||||
let _ = Shell_NotifyIconW(NIM_ADD, &nid);
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
config::load();
|
||||
|
||||
let hwnd = create_hidden_window();
|
||||
|
||||
HWND_MAIN.with(|h| *h.borrow_mut() = hwnd);
|
||||
|
||||
// Prompt for API key if not configured
|
||||
if config::get().api_key.is_empty() {
|
||||
if !show_api_key_dialog(hwnd) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Register global hotkey from config
|
||||
if !register_configured_hotkey(hwnd) {
|
||||
eprintln!("[voice-ime] Failed to register hotkey. Is another instance running?");
|
||||
return;
|
||||
}
|
||||
|
||||
create_tray_icon(hwnd);
|
||||
|
||||
let hotkey_name = config::vk_name(config::get().hotkey_vk);
|
||||
set_tray_tooltip(hwnd, &format!("语音输入 ({})", hotkey_name), false);
|
||||
|
||||
eprintln!("[voice-ime] Running. Press {} to toggle recording.", hotkey_name);
|
||||
|
||||
// Message loop
|
||||
unsafe {
|
||||
let mut msg = MSG::default();
|
||||
while GetMessageW(&mut msg, None, 0, 0).as_bool() {
|
||||
let _ = TranslateMessage(&msg);
|
||||
DispatchMessageW(&msg);
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("[voice-ime] Exiting.");
|
||||
}
|
||||
99
src/session.rs
Normal file
99
src/session.rs
Normal file
@ -0,0 +1,99 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::thread;
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
use crate::audio::AudioCapture;
|
||||
use crate::config;
|
||||
use crate::input;
|
||||
use crate::ws;
|
||||
|
||||
/// Tracks text state across multiple speech segments in a VAD session.
|
||||
struct TextState {
|
||||
/// Concatenation of all completed segment transcripts.
|
||||
completed_text: String,
|
||||
/// Current segment's partial preview (text + stash from latest .text event).
|
||||
current_partial: String,
|
||||
/// The full text we last typed into the input field.
|
||||
last_displayed: String,
|
||||
}
|
||||
|
||||
pub struct RecordingSession {
|
||||
stop_tx: Option<mpsc::Sender<()>>,
|
||||
thread_handle: Option<thread::JoinHandle<()>>,
|
||||
_capture: Option<AudioCapture>,
|
||||
}
|
||||
|
||||
impl RecordingSession {
|
||||
pub fn start() -> Result<Self, String> {
|
||||
let (stop_tx, stop_rx) = mpsc::channel::<()>(1);
|
||||
let (audio_tx, audio_rx) = mpsc::unbounded_channel::<Vec<u8>>();
|
||||
|
||||
let (capture, _audio_cfg) = AudioCapture::start(audio_tx)?;
|
||||
|
||||
let cfg = config::get();
|
||||
let api_key = cfg.api_key.clone();
|
||||
let model = cfg.model.clone();
|
||||
|
||||
let thread_handle = thread::spawn(move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create tokio runtime");
|
||||
|
||||
let state = Arc::new(Mutex::new(TextState {
|
||||
completed_text: String::new(),
|
||||
current_partial: String::new(),
|
||||
last_displayed: String::new(),
|
||||
}));
|
||||
|
||||
let on_event = {
|
||||
let state = state.clone();
|
||||
move |event: ws::AsrEvent| {
|
||||
let mut st = state.lock().unwrap();
|
||||
match event {
|
||||
ws::AsrEvent::Partial { text, stash } => {
|
||||
st.current_partial = format!("{text}{stash}");
|
||||
let full = format!("{}{}", st.completed_text, st.current_partial);
|
||||
st.last_displayed = input::apply_text_update(&st.last_displayed, &full);
|
||||
}
|
||||
ws::AsrEvent::SegmentCompleted { transcript } => {
|
||||
st.completed_text.push_str(&transcript);
|
||||
st.current_partial.clear();
|
||||
let full = st.completed_text.clone();
|
||||
st.last_displayed = input::apply_text_update(&st.last_displayed, &full);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let result = rt.block_on(ws::run_ws_session(&api_key, &model, audio_rx, stop_rx, on_event));
|
||||
|
||||
if let Err(e) = result {
|
||||
eprintln!("[voice-ime] Recording session error: {e}");
|
||||
}
|
||||
});
|
||||
|
||||
Ok(RecordingSession {
|
||||
stop_tx: Some(stop_tx),
|
||||
thread_handle: Some(thread_handle),
|
||||
_capture: Some(capture),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn stop(&mut self) {
|
||||
self._capture.take();
|
||||
|
||||
if let Some(tx) = self.stop_tx.take() {
|
||||
let _ = tx.blocking_send(());
|
||||
}
|
||||
if let Some(handle) = self.thread_handle.take() {
|
||||
let _ = handle.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RecordingSession {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
31
src/sound.rs
Normal file
31
src/sound.rs
Normal file
@ -0,0 +1,31 @@
|
||||
use rodio::{Decoder, OutputStream, Sink};
|
||||
use std::io::Cursor;
|
||||
|
||||
static SND_START: &[u8] = include_bytes!("../assets/start.mp3");
|
||||
static SND_STOP: &[u8] = include_bytes!("../assets/stop.mp3");
|
||||
|
||||
/// Play an embedded sound in a background thread (non-blocking).
|
||||
fn play_bytes(data: &'static [u8]) {
|
||||
std::thread::spawn(move || {
|
||||
let Ok((_stream, handle)) = OutputStream::try_default() else {
|
||||
return;
|
||||
};
|
||||
let Ok(sink) = Sink::try_new(&handle) else {
|
||||
return;
|
||||
};
|
||||
let cursor = Cursor::new(data);
|
||||
let Ok(source) = Decoder::new(cursor) else {
|
||||
return;
|
||||
};
|
||||
sink.append(source);
|
||||
sink.sleep_until_end();
|
||||
});
|
||||
}
|
||||
|
||||
pub fn play_start() {
|
||||
play_bytes(SND_START);
|
||||
}
|
||||
|
||||
pub fn play_stop() {
|
||||
play_bytes(SND_STOP);
|
||||
}
|
||||
230
src/ws.rs
Normal file
230
src/ws.rs
Normal file
@ -0,0 +1,230 @@
|
||||
use base64::{engine::general_purpose::STANDARD, Engine as _};
|
||||
use futures_util::{SinkExt, StreamExt};
|
||||
use serde_json::Value;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_tungstenite::{
|
||||
connect_async_tls_with_config,
|
||||
tungstenite::{http::Request, Message},
|
||||
};
|
||||
|
||||
const WS_BASE_URL: &str = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=";
|
||||
|
||||
/// Events emitted by the ASR WebSocket session.
|
||||
pub enum AsrEvent {
|
||||
/// Partial result for current speech segment. Full preview = text + stash.
|
||||
Partial { text: String, stash: String },
|
||||
/// Final result for a completed speech segment.
|
||||
SegmentCompleted { transcript: String },
|
||||
}
|
||||
|
||||
/// Run a WebSocket ASR session using the Qwen3 ASR Realtime API.
|
||||
pub async fn run_ws_session(
|
||||
api_key: &str,
|
||||
model: &str,
|
||||
mut audio_rx: mpsc::UnboundedReceiver<Vec<u8>>,
|
||||
mut stop_rx: mpsc::Receiver<()>,
|
||||
on_event: impl Fn(AsrEvent) + Send + 'static,
|
||||
) -> Result<(), String> {
|
||||
let ws_url = format!("{WS_BASE_URL}{model}");
|
||||
// Build request with auth headers
|
||||
let request = Request::builder()
|
||||
.uri(&ws_url)
|
||||
.header("Authorization", format!("Bearer {api_key}"))
|
||||
.header("OpenAI-Beta", "realtime=v1")
|
||||
.header("Host", "dashscope.aliyuncs.com")
|
||||
.header("Connection", "Upgrade")
|
||||
.header("Upgrade", "websocket")
|
||||
.header("Sec-WebSocket-Version", "13")
|
||||
.header(
|
||||
"Sec-WebSocket-Key",
|
||||
tokio_tungstenite::tungstenite::handshake::client::generate_key(),
|
||||
)
|
||||
.body(())
|
||||
.map_err(|e| format!("Build request failed: {e}"))?;
|
||||
|
||||
let (ws_stream, _) = connect_async_tls_with_config(request, None, false, None)
|
||||
.await
|
||||
.map_err(|e| format!("WebSocket connect failed: {e}"))?;
|
||||
|
||||
let (mut write, mut read) = ws_stream.split();
|
||||
|
||||
// 1. Wait for session.created
|
||||
wait_for_type(&mut read, "session.created").await?;
|
||||
eprintln!("[voice-ime] WS: session.created");
|
||||
|
||||
// 2. Send session.update (VAD mode)
|
||||
let session_update = serde_json::json!({
|
||||
"event_id": "evt_session_update",
|
||||
"type": "session.update",
|
||||
"session": {
|
||||
"modalities": ["text"],
|
||||
"input_audio_format": "pcm16",
|
||||
"sample_rate": 16000,
|
||||
"input_audio_transcription": {
|
||||
"language": "zh"
|
||||
},
|
||||
"turn_detection": {
|
||||
"type": "server_vad",
|
||||
"threshold": 0.0,
|
||||
"silence_duration_ms": 400
|
||||
}
|
||||
}
|
||||
});
|
||||
write
|
||||
.send(Message::Text(session_update.to_string().into()))
|
||||
.await
|
||||
.map_err(|e| format!("Send session.update failed: {e}"))?;
|
||||
|
||||
// 3. Wait for session.updated
|
||||
wait_for_type(&mut read, "session.updated").await?;
|
||||
eprintln!("[voice-ime] WS: session.updated, streaming audio...");
|
||||
|
||||
// 4. Spawn sender task: forwards audio as base64 JSON events
|
||||
let (finish_tx, mut finish_rx) = mpsc::channel::<()>(1);
|
||||
|
||||
let send_task = tokio::spawn(async move {
|
||||
let mut seq = 0u64;
|
||||
loop {
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = stop_rx.recv() => {
|
||||
// Drain remaining audio
|
||||
while let Ok(chunk) = audio_rx.try_recv() {
|
||||
let _ = send_audio(&mut write, &chunk, &mut seq).await;
|
||||
}
|
||||
// Send session.finish
|
||||
let finish = serde_json::json!({
|
||||
"event_id": "evt_finish",
|
||||
"type": "session.finish"
|
||||
});
|
||||
let _ = write.send(Message::Text(finish.to_string().into())).await;
|
||||
let _ = finish_tx.send(()).await;
|
||||
break;
|
||||
}
|
||||
chunk = audio_rx.recv() => {
|
||||
match chunk {
|
||||
Some(data) => {
|
||||
if send_audio(&mut write, &data, &mut seq).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Audio channel closed
|
||||
let finish = serde_json::json!({
|
||||
"event_id": "evt_finish",
|
||||
"type": "session.finish"
|
||||
});
|
||||
let _ = write.send(Message::Text(finish.to_string().into())).await;
|
||||
let _ = finish_tx.send(()).await;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 5. Receive task: process server events
|
||||
let recv_task = tokio::spawn(async move {
|
||||
let mut finish_sent = false;
|
||||
loop {
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = finish_rx.recv(), if !finish_sent => {
|
||||
finish_sent = true;
|
||||
// Keep reading until session.finished
|
||||
}
|
||||
msg = read.next() => {
|
||||
match msg {
|
||||
Some(Ok(Message::Text(text))) => {
|
||||
let v: Value = match serde_json::from_str(&text) {
|
||||
Ok(v) => v,
|
||||
Err(_) => continue,
|
||||
};
|
||||
let event_type = v["type"].as_str().unwrap_or("");
|
||||
match event_type {
|
||||
"conversation.item.input_audio_transcription.text" => {
|
||||
let text_part = v["text"].as_str().unwrap_or("").to_string();
|
||||
let stash = v["stash"].as_str().unwrap_or("").to_string();
|
||||
on_event(AsrEvent::Partial { text: text_part, stash });
|
||||
}
|
||||
"conversation.item.input_audio_transcription.completed" => {
|
||||
let transcript = v["transcript"].as_str().unwrap_or("").to_string();
|
||||
eprintln!("[voice-ime] Segment completed: {transcript}");
|
||||
on_event(AsrEvent::SegmentCompleted { transcript });
|
||||
}
|
||||
"session.finished" => {
|
||||
eprintln!("[voice-ime] WS: session.finished");
|
||||
return;
|
||||
}
|
||||
"error" => {
|
||||
let msg = v["error"]["message"].as_str().unwrap_or("unknown");
|
||||
eprintln!("[voice-ime] ASR error: {msg}");
|
||||
return;
|
||||
}
|
||||
_ => {} // ignore speech_started, speech_stopped, committed, etc.
|
||||
}
|
||||
}
|
||||
Some(Ok(_)) => {} // ping/pong/binary
|
||||
Some(Err(e)) => {
|
||||
eprintln!("[voice-ime] WS read error: {e}");
|
||||
return;
|
||||
}
|
||||
None => return,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let _ = send_task.await;
|
||||
let _ = recv_task.await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Send a PCM audio chunk as a base64-encoded input_audio_buffer.append event.
|
||||
async fn send_audio<S>(write: &mut S, pcm_bytes: &[u8], seq: &mut u64) -> Result<(), String>
|
||||
where
|
||||
S: futures_util::Sink<Message> + Unpin,
|
||||
S::Error: std::fmt::Display,
|
||||
{
|
||||
let encoded = STANDARD.encode(pcm_bytes);
|
||||
*seq += 1;
|
||||
let event = serde_json::json!({
|
||||
"event_id": format!("evt_audio_{seq}"),
|
||||
"type": "input_audio_buffer.append",
|
||||
"audio": encoded
|
||||
});
|
||||
write
|
||||
.send(Message::Text(event.to_string().into()))
|
||||
.await
|
||||
.map_err(|e| format!("Send audio failed: {e}"))
|
||||
}
|
||||
|
||||
/// Read messages until one matches the expected type.
|
||||
async fn wait_for_type<S>(read: &mut S, expected: &str) -> Result<Value, String>
|
||||
where
|
||||
S: futures_util::Stream<Item = Result<Message, tokio_tungstenite::tungstenite::Error>> + Unpin,
|
||||
{
|
||||
loop {
|
||||
match read.next().await {
|
||||
Some(Ok(Message::Text(text))) => {
|
||||
let v: Value = serde_json::from_str(&text)
|
||||
.map_err(|e| format!("Parse JSON failed: {e}"))?;
|
||||
let t = v["type"].as_str().unwrap_or("");
|
||||
if t == expected {
|
||||
return Ok(v);
|
||||
}
|
||||
if t == "error" {
|
||||
let msg = v["error"]["message"].as_str().unwrap_or("unknown");
|
||||
return Err(format!("Server error: {msg}"));
|
||||
}
|
||||
// Ignore other event types while waiting
|
||||
}
|
||||
Some(Ok(_)) => {} // ignore non-text
|
||||
Some(Err(e)) => return Err(format!("WS read error: {e}")),
|
||||
None => return Err("Connection closed unexpectedly".to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user