Initial commit
This commit is contained in:
commit
8199769093
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
/target
|
||||||
1762
Cargo.lock
generated
Normal file
1762
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
26
Cargo.toml
Normal file
26
Cargo.toml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
[package]
|
||||||
|
name = "voice-ime"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2024"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
windows = { version = "0.61", features = [
|
||||||
|
"Win32_UI_WindowsAndMessaging",
|
||||||
|
"Win32_UI_Input_KeyboardAndMouse",
|
||||||
|
"Win32_UI_Shell",
|
||||||
|
"Win32_Graphics_Gdi",
|
||||||
|
"Win32_System_LibraryLoader",
|
||||||
|
"Win32_Foundation",
|
||||||
|
"Win32_UI_Controls",
|
||||||
|
] }
|
||||||
|
cpal = "0.15"
|
||||||
|
tokio = { version = "1", features = ["rt-multi-thread", "sync", "macros", "time"] }
|
||||||
|
tokio-tungstenite = { version = "0.26", features = ["native-tls"] }
|
||||||
|
futures-util = "0.3"
|
||||||
|
serde = { version = "1", features = ["derive"] }
|
||||||
|
serde_json = "1"
|
||||||
|
base64 = "0.22"
|
||||||
|
rodio = "0.20"
|
||||||
|
|
||||||
|
[build-dependencies]
|
||||||
|
winres = "0.1"
|
||||||
79
README.md
Normal file
79
README.md
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
# Voice IME 语音输入法
|
||||||
|
|
||||||
|
Windows 系统托盘语音输入工具。按下快捷键开始录音,通过阿里云 Qwen ASR 实时语音识别 API 将语音转为文字,自动输入到当前光标位置。
|
||||||
|
|
||||||
|
## 功能
|
||||||
|
|
||||||
|
- **快捷键切换录音**:默认 F10,按一次开始,再按一次停止
|
||||||
|
- **流式语音识别**:使用 Qwen3 ASR Realtime API,支持 VAD 自动断句,边说边输入
|
||||||
|
- **增量文本插入**:对识别结果做 diff,仅输入变化部分,不影响输入框已有内容
|
||||||
|
- **系统托盘**:托盘图标显示当前状态(空闲/录音中),右键菜单提供设置
|
||||||
|
- **音效提示**:录音开始和停止时播放提示音
|
||||||
|
- **暂停媒体播放**:录音时可自动暂停系统媒体播放(可关闭)
|
||||||
|
- **可自定义配置**:
|
||||||
|
- 快捷键
|
||||||
|
- API Key
|
||||||
|
- ASR 模型
|
||||||
|
- 媒体暂停开关
|
||||||
|
|
||||||
|
## 使用方法
|
||||||
|
|
||||||
|
### 获取 API Key
|
||||||
|
|
||||||
|
前往 [阿里云百炼](https://bailian.console.aliyun.com/) 开通 Qwen ASR 服务并获取 API Key。
|
||||||
|
|
||||||
|
### 运行
|
||||||
|
|
||||||
|
```
|
||||||
|
cargo build --release
|
||||||
|
./target/release/voice-ime.exe
|
||||||
|
```
|
||||||
|
|
||||||
|
首次启动会弹窗要求输入 API Key。输入后程序最小化到系统托盘。
|
||||||
|
|
||||||
|
### 操作
|
||||||
|
|
||||||
|
| 操作 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| 按下快捷键(默认 F10) | 开始/停止录音 |
|
||||||
|
| 右键托盘图标 | 打开设置菜单 |
|
||||||
|
|
||||||
|
### 右键菜单
|
||||||
|
|
||||||
|
- **设置快捷键** — 按下任意键即可更换
|
||||||
|
- **录音时暂停媒体播放** — 勾选开关
|
||||||
|
- **设置 API Key** — 修改 ASR 服务密钥
|
||||||
|
- **设置模型** — 修改 ASR 模型名称
|
||||||
|
- **退出**
|
||||||
|
|
||||||
|
### 配置文件
|
||||||
|
|
||||||
|
配置保存在 `%APPDATA%\voice-ime\config.json`,格式示例:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"hotkey_vk": 121,
|
||||||
|
"media_pause_enabled": true,
|
||||||
|
"api_key": "sk-xxxxxxxx",
|
||||||
|
"model": "qwen3-asr-flash-realtime-2026-02-10"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 技术栈
|
||||||
|
|
||||||
|
- **Rust 2024 Edition**
|
||||||
|
- **windows** crate — Win32 API(托盘图标、热键、SendInput 文字输入)
|
||||||
|
- **cpal** — WASAPI 麦克风采集
|
||||||
|
- **tokio + tokio-tungstenite** — 异步 WebSocket 客户端
|
||||||
|
- **rodio** — 音效播放
|
||||||
|
- **serde** — 配置序列化
|
||||||
|
|
||||||
|
## 系统要求
|
||||||
|
|
||||||
|
- Windows 10/11
|
||||||
|
- 麦克风
|
||||||
|
- 网络连接(用于访问阿里云 ASR API)
|
||||||
|
|
||||||
|
## 许可证
|
||||||
|
|
||||||
|
MIT
|
||||||
BIN
assets/idle.ico
Normal file
BIN
assets/idle.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 90 KiB |
BIN
assets/recording.ico
Normal file
BIN
assets/recording.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 110 KiB |
BIN
assets/start.mp3
Normal file
BIN
assets/start.mp3
Normal file
Binary file not shown.
BIN
assets/stop.mp3
Normal file
BIN
assets/stop.mp3
Normal file
Binary file not shown.
7
build.rs
Normal file
7
build.rs
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
fn main() {
|
||||||
|
if std::env::var_os("CARGO_CFG_TARGET_OS").as_deref() == Some(std::ffi::OsStr::new("windows")) {
|
||||||
|
let mut res = winres::WindowsResource::new();
|
||||||
|
res.set_icon("assets/idle.ico");
|
||||||
|
res.compile().expect("Failed to compile Windows resources");
|
||||||
|
}
|
||||||
|
}
|
||||||
6
convert_icon.py
Normal file
6
convert_icon.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# Convert the tray icon PNGs in the current working directory into
# multi-resolution .ico files written next to them.
from PIL import Image

# Square resolutions embedded in every generated icon.
ICON_SIZES = tuple((side, side) for side in (16, 24, 32, 48, 64, 128, 256))

for stem in ('idle', 'recording'):
    # Force RGBA before saving as ICO.
    picture = Image.open(f'{stem}.png').convert('RGBA')
    picture.save(f'{stem}.ico', format='ICO', sizes=list(ICON_SIZES))
    print(f'{stem}.ico created')
|
||||||
140
src/audio.rs
Normal file
140
src/audio.rs
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
|
||||||
|
use cpal::{SampleFormat, Stream, StreamConfig};
|
||||||
|
use tokio::sync::mpsc;
|
||||||
|
|
||||||
|
/// Sample rate, in Hz, that captured audio is resampled to.
const TARGET_SAMPLE_RATE: u32 = 16_000;
|
||||||
|
|
||||||
|
pub struct AudioCapture {
|
||||||
|
stream: Stream,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Describes the format of the audio produced by [`AudioCapture::start`].
pub struct AudioCaptureConfig {
    /// Sample rate of the delivered PCM data, in Hz.
    // Allowed to be unused: nothing visible in this module reads it back.
    #[allow(dead_code)]
    pub sample_rate: u32,
}
|
||||||
|
|
||||||
|
/// Linear-interpolation resampling of mono samples from `from_rate` to `to_rate`.
///
/// * Equal rates return a copy of `samples`.
/// * Empty input, or a rate of zero on either side, yields an empty vector
///   (a zero `from_rate` previously produced an unbounded output length and
///   panicked while allocating).
/// * Output positions that fall at or beyond the last input sample repeat the
///   last sample.
///
/// Each call is independent: no interpolation state is carried between calls.
fn resample(samples: &[i16], from_rate: u32, to_rate: u32) -> Vec<i16> {
    if from_rate == to_rate {
        return samples.to_vec();
    }
    let Some(&last) = samples.last() else {
        return Vec::new();
    };
    if from_rate == 0 || to_rate == 0 {
        return Vec::new();
    }

    let ratio = f64::from(from_rate) / f64::from(to_rate);
    let out_len = (samples.len() as f64 / ratio) as usize;

    (0..out_len)
        .map(|i| {
            let src_pos = i as f64 * ratio;
            let idx = src_pos as usize;
            match (samples.get(idx), samples.get(idx + 1)) {
                (Some(&a), Some(&b)) => {
                    let (a, b) = (f64::from(a), f64::from(b));
                    let frac = src_pos - idx as f64;
                    (a + frac * (b - a)) as i16
                }
                // No right-hand neighbour to interpolate towards.
                _ => last,
            }
        })
        .collect()
}
|
||||||
|
|
||||||
|
/// Mix multi-channel i16 to mono, resample to 16kHz, return PCM bytes.
|
||||||
|
fn process_i16(data: &[i16], channels: u16, source_rate: u32) -> Vec<u8> {
|
||||||
|
let ch = channels as usize;
|
||||||
|
let mono: Vec<i16> = data
|
||||||
|
.chunks(ch)
|
||||||
|
.map(|frame| {
|
||||||
|
let sum: i32 = frame.iter().map(|&s| s as i32).sum();
|
||||||
|
(sum / ch as i32) as i16
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let resampled = resample(&mono, source_rate, TARGET_SAMPLE_RATE);
|
||||||
|
resampled.iter().flat_map(|s| s.to_le_bytes()).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mix multi-channel f32 to mono, resample to 16kHz, return PCM bytes.
|
||||||
|
fn process_f32(data: &[f32], channels: u16, source_rate: u32) -> Vec<u8> {
|
||||||
|
let ch = channels as usize;
|
||||||
|
let mono: Vec<i16> = data
|
||||||
|
.chunks(ch)
|
||||||
|
.map(|frame| {
|
||||||
|
let sum: f32 = frame.iter().sum();
|
||||||
|
let m = sum / ch as f32;
|
||||||
|
(m * 32768.0).clamp(-32768.0, 32767.0) as i16
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let resampled = resample(&mono, source_rate, TARGET_SAMPLE_RATE);
|
||||||
|
resampled.iter().flat_map(|s| s.to_le_bytes()).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AudioCapture {
|
||||||
|
/// Start capturing audio from the default input device.
|
||||||
|
/// Audio data is always resampled to 16kHz mono PCM i16 LE.
|
||||||
|
pub fn start(tx: mpsc::UnboundedSender<Vec<u8>>) -> Result<(Self, AudioCaptureConfig), String> {
|
||||||
|
let host = cpal::default_host();
|
||||||
|
let device = host
|
||||||
|
.default_input_device()
|
||||||
|
.ok_or_else(|| "No input device available".to_string())?;
|
||||||
|
|
||||||
|
let default_config = device
|
||||||
|
.default_input_config()
|
||||||
|
.map_err(|e| format!("Failed to get default input config: {e}"))?;
|
||||||
|
|
||||||
|
let source_rate = default_config.sample_rate().0;
|
||||||
|
let channels = default_config.channels();
|
||||||
|
let sample_format = default_config.sample_format();
|
||||||
|
let config: StreamConfig = default_config.into();
|
||||||
|
|
||||||
|
eprintln!("[voice-ime] Audio device: {source_rate}Hz, {channels}ch, {sample_format:?} → resampling to {TARGET_SAMPLE_RATE}Hz mono");
|
||||||
|
|
||||||
|
let stream = match sample_format {
|
||||||
|
SampleFormat::I16 => {
|
||||||
|
let ch = channels;
|
||||||
|
let rate = source_rate;
|
||||||
|
device
|
||||||
|
.build_input_stream(
|
||||||
|
&config,
|
||||||
|
move |data: &[i16], _: &cpal::InputCallbackInfo| {
|
||||||
|
let bytes = process_i16(data, ch, rate);
|
||||||
|
let _ = tx.send(bytes);
|
||||||
|
},
|
||||||
|
|err| eprintln!("Audio capture error: {err}"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.map_err(|e| format!("Failed to build i16 input stream: {e}"))?
|
||||||
|
}
|
||||||
|
SampleFormat::F32 => {
|
||||||
|
let ch = channels;
|
||||||
|
let rate = source_rate;
|
||||||
|
device
|
||||||
|
.build_input_stream(
|
||||||
|
&config,
|
||||||
|
move |data: &[f32], _: &cpal::InputCallbackInfo| {
|
||||||
|
let bytes = process_f32(data, ch, rate);
|
||||||
|
let _ = tx.send(bytes);
|
||||||
|
},
|
||||||
|
|err| eprintln!("Audio capture error: {err}"),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.map_err(|e| format!("Failed to build f32 input stream: {e}"))?
|
||||||
|
}
|
||||||
|
_ => return Err(format!("Unsupported sample format: {sample_format:?}")),
|
||||||
|
};
|
||||||
|
|
||||||
|
stream
|
||||||
|
.play()
|
||||||
|
.map_err(|e| format!("Failed to start audio stream: {e}"))?;
|
||||||
|
|
||||||
|
Ok((
|
||||||
|
AudioCapture { stream },
|
||||||
|
AudioCaptureConfig {
|
||||||
|
sample_rate: TARGET_SAMPLE_RATE,
|
||||||
|
},
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for AudioCapture {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
let _ = self.stream.pause();
|
||||||
|
}
|
||||||
|
}
|
||||||
116
src/config.rs
Normal file
116
src/config.rs
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::cell::RefCell;
|
||||||
|
use std::fs;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
/// ASR model name used when the configuration does not specify one.
pub const DEFAULT_MODEL: &'static str = "qwen3-asr-flash-realtime-2026-02-10";
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize, Clone)]
|
||||||
|
pub struct Config {
|
||||||
|
/// Virtual key code for the hotkey (default: VK_F10 = 0x79)
|
||||||
|
pub hotkey_vk: u16,
|
||||||
|
/// Whether to send media play/pause when toggling recording
|
||||||
|
pub media_pause_enabled: bool,
|
||||||
|
/// API key for Qwen3 ASR service
|
||||||
|
#[serde(default)]
|
||||||
|
pub api_key: String,
|
||||||
|
/// ASR model name
|
||||||
|
#[serde(default = "default_model")]
|
||||||
|
pub model: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_model() -> String {
|
||||||
|
DEFAULT_MODEL.to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for Config {
|
||||||
|
fn default() -> Self {
|
||||||
|
Config {
|
||||||
|
hotkey_vk: 0x79, // VK_F10
|
||||||
|
media_pause_enabled: true,
|
||||||
|
api_key: String::new(),
|
||||||
|
model: DEFAULT_MODEL.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn config_path() -> PathBuf {
|
||||||
|
let dir = std::env::var_os("APPDATA")
|
||||||
|
.map(PathBuf::from)
|
||||||
|
.unwrap_or_else(|| {
|
||||||
|
dirs_fallback_appdata()
|
||||||
|
})
|
||||||
|
.join("voice-ime");
|
||||||
|
let _ = fs::create_dir_all(&dir);
|
||||||
|
dir.join("config.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fallback base directory: `%USERPROFILE%\AppData\Roaming`, or the current
/// directory (`.`) when `USERPROFILE` is not set.
fn dirs_fallback_appdata() -> PathBuf {
    match std::env::var_os("USERPROFILE") {
        Some(profile) => {
            let mut roaming = PathBuf::from(profile);
            roaming.push("AppData");
            roaming.push("Roaming");
            roaming
        }
        None => PathBuf::from("."),
    }
}
|
||||||
|
|
||||||
|
thread_local! {
|
||||||
|
static CONFIG: RefCell<Config> = RefCell::new(Config::default());
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn load() {
|
||||||
|
let path = config_path();
|
||||||
|
if let Ok(data) = fs::read_to_string(&path) {
|
||||||
|
if let Ok(cfg) = serde_json::from_str::<Config>(&data) {
|
||||||
|
CONFIG.with(|c| *c.borrow_mut() = cfg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn save() {
|
||||||
|
let path = config_path();
|
||||||
|
CONFIG.with(|c| {
|
||||||
|
if let Ok(json) = serde_json::to_string_pretty(&*c.borrow()) {
|
||||||
|
let _ = fs::write(&path, json);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get() -> Config {
|
||||||
|
CONFIG.with(|c| c.borrow().clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Store a new hotkey virtual key code and save the configuration.
pub fn set_hotkey_vk(vk: u16) {
    CONFIG.with(|cell| {
        let mut cfg = cell.borrow_mut();
        cfg.hotkey_vk = vk;
    });
    save();
}
|
||||||
|
|
||||||
|
/// Enable or disable the media play/pause behaviour and save the configuration.
pub fn set_media_pause(enabled: bool) {
    CONFIG.with(|cell| {
        let mut cfg = cell.borrow_mut();
        cfg.media_pause_enabled = enabled;
    });
    save();
}
|
||||||
|
|
||||||
|
/// Store a new API key and save the configuration.
pub fn set_api_key(key: String) {
    CONFIG.with(|cell| {
        let mut cfg = cell.borrow_mut();
        cfg.api_key = key;
    });
    save();
}
|
||||||
|
|
||||||
|
/// Store a new ASR model name and save the configuration.
pub fn set_model(model: String) {
    CONFIG.with(|cell| {
        let mut cfg = cell.borrow_mut();
        cfg.model = model;
    });
    save();
}
|
||||||
|
|
||||||
|
/// Human-readable name for a virtual key code.
///
/// `0x70..=0x87` map to `F1`..`F24`, a handful of other keys have fixed names,
/// and everything else is rendered as `VK(0xNN)`.
pub fn vk_name(vk: u16) -> String {
    const FIRST_FUNCTION_KEY: u16 = 0x70;
    const LAST_FUNCTION_KEY: u16 = 0x87;

    if (FIRST_FUNCTION_KEY..=LAST_FUNCTION_KEY).contains(&vk) {
        return format!("F{}", vk - FIRST_FUNCTION_KEY + 1);
    }

    let fixed = match vk {
        0x13 => "Pause",
        0x21 => "PageUp",
        0x22 => "PageDown",
        0x23 => "End",
        0x24 => "Home",
        0x2D => "Insert",
        0x2E => "Delete",
        0x91 => "ScrollLock",
        0xC0 => "`",
        _ => return format!("VK(0x{vk:02X})"),
    };
    fixed.to_string()
}
|
||||||
139
src/input.rs
Normal file
139
src/input.rs
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
use std::mem;
|
||||||
|
use windows::Win32::UI::Input::KeyboardAndMouse::{
|
||||||
|
SendInput, INPUT, INPUT_KEYBOARD, KEYBDINPUT, KEYEVENTF_KEYUP, KEYEVENTF_UNICODE,
|
||||||
|
VIRTUAL_KEY, VK_BACK, VK_MEDIA_PLAY_PAUSE,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn send_inputs(inputs: &[INPUT]) {
|
||||||
|
if inputs.is_empty() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
unsafe {
|
||||||
|
SendInput(inputs, mem::size_of::<INPUT>() as i32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_unicode_key_down(scan: u16) -> INPUT {
|
||||||
|
INPUT {
|
||||||
|
r#type: INPUT_KEYBOARD,
|
||||||
|
Anonymous: windows::Win32::UI::Input::KeyboardAndMouse::INPUT_0 {
|
||||||
|
ki: KEYBDINPUT {
|
||||||
|
wVk: VIRTUAL_KEY(0),
|
||||||
|
wScan: scan,
|
||||||
|
dwFlags: KEYEVENTF_UNICODE,
|
||||||
|
time: 0,
|
||||||
|
dwExtraInfo: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_unicode_key_up(scan: u16) -> INPUT {
|
||||||
|
INPUT {
|
||||||
|
r#type: INPUT_KEYBOARD,
|
||||||
|
Anonymous: windows::Win32::UI::Input::KeyboardAndMouse::INPUT_0 {
|
||||||
|
ki: KEYBDINPUT {
|
||||||
|
wVk: VIRTUAL_KEY(0),
|
||||||
|
wScan: scan,
|
||||||
|
dwFlags: KEYEVENTF_UNICODE | KEYEVENTF_KEYUP,
|
||||||
|
time: 0,
|
||||||
|
dwExtraInfo: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_vk_key_down(vk: VIRTUAL_KEY) -> INPUT {
|
||||||
|
INPUT {
|
||||||
|
r#type: INPUT_KEYBOARD,
|
||||||
|
Anonymous: windows::Win32::UI::Input::KeyboardAndMouse::INPUT_0 {
|
||||||
|
ki: KEYBDINPUT {
|
||||||
|
wVk: vk,
|
||||||
|
wScan: 0,
|
||||||
|
dwFlags: windows::Win32::UI::Input::KeyboardAndMouse::KEYBD_EVENT_FLAGS(0),
|
||||||
|
time: 0,
|
||||||
|
dwExtraInfo: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_vk_key_up(vk: VIRTUAL_KEY) -> INPUT {
|
||||||
|
INPUT {
|
||||||
|
r#type: INPUT_KEYBOARD,
|
||||||
|
Anonymous: windows::Win32::UI::Input::KeyboardAndMouse::INPUT_0 {
|
||||||
|
ki: KEYBDINPUT {
|
||||||
|
wVk: vk,
|
||||||
|
wScan: 0,
|
||||||
|
dwFlags: KEYEVENTF_KEYUP,
|
||||||
|
time: 0,
|
||||||
|
dwExtraInfo: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send N backspace key presses.
|
||||||
|
fn send_backspaces(count: usize) {
|
||||||
|
if count == 0 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let mut inputs = Vec::with_capacity(count * 2);
|
||||||
|
for _ in 0..count {
|
||||||
|
inputs.push(make_vk_key_down(VK_BACK));
|
||||||
|
inputs.push(make_vk_key_up(VK_BACK));
|
||||||
|
}
|
||||||
|
send_inputs(&inputs);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Type a string using SendInput with KEYEVENTF_UNICODE.
|
||||||
|
/// Handles surrogate pairs for characters outside BMP.
|
||||||
|
fn send_unicode_string(text: &str) {
|
||||||
|
if text.is_empty() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let mut inputs = Vec::new();
|
||||||
|
for c in text.chars() {
|
||||||
|
let mut buf = [0u16; 2];
|
||||||
|
let encoded = c.encode_utf16(&mut buf);
|
||||||
|
for &code_unit in encoded.iter() {
|
||||||
|
inputs.push(make_unicode_key_down(code_unit));
|
||||||
|
inputs.push(make_unicode_key_up(code_unit));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
send_inputs(&inputs);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Number of leading `char`s that `a` and `b` have in common.
fn common_prefix_chars(a: &str, b: &str) -> usize {
    let mut right = b.chars();
    a.chars()
        .take_while(|&left| right.next() == Some(left))
        .count()
}
|
||||||
|
|
||||||
|
/// Given the previously inserted text and the new full text from ASR,
|
||||||
|
/// send the minimal backspaces + new characters to update the input field.
|
||||||
|
/// Returns the new "last inserted" text (i.e. `current`).
|
||||||
|
pub fn apply_text_update(last: &str, current: &str) -> String {
|
||||||
|
let prefix_len = common_prefix_chars(last, current);
|
||||||
|
let last_char_count = last.chars().count();
|
||||||
|
let backspace_count = last_char_count - prefix_len;
|
||||||
|
|
||||||
|
// Get the byte offset where the common prefix ends in `current`
|
||||||
|
let new_suffix: String = current.chars().skip(prefix_len).collect();
|
||||||
|
|
||||||
|
send_backspaces(backspace_count);
|
||||||
|
send_unicode_string(&new_suffix);
|
||||||
|
|
||||||
|
current.to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Simulate pressing the media Play/Pause key.
|
||||||
|
pub fn send_media_play_pause() {
|
||||||
|
let inputs = [
|
||||||
|
make_vk_key_down(VK_MEDIA_PLAY_PAUSE),
|
||||||
|
make_vk_key_up(VK_MEDIA_PLAY_PAUSE),
|
||||||
|
];
|
||||||
|
send_inputs(&inputs);
|
||||||
|
}
|
||||||
679
src/main.rs
Normal file
679
src/main.rs
Normal file
@ -0,0 +1,679 @@
|
|||||||
|
#![windows_subsystem = "windows"]
|
||||||
|
|
||||||
|
mod audio;
|
||||||
|
mod config;
|
||||||
|
mod input;
|
||||||
|
mod session;
|
||||||
|
mod sound;
|
||||||
|
mod ws;
|
||||||
|
|
||||||
|
use session::RecordingSession;
|
||||||
|
use std::cell::RefCell;
|
||||||
|
use windows::core::{w, PCWSTR};
|
||||||
|
use windows::Win32::Foundation::{HWND, LPARAM, LRESULT, WPARAM};
|
||||||
|
use windows::Win32::Graphics::Gdi::{GetStockObject, BLACK_BRUSH};
|
||||||
|
use windows::Win32::System::LibraryLoader::GetModuleHandleW;
|
||||||
|
use windows::Win32::UI::Input::KeyboardAndMouse::{
|
||||||
|
RegisterHotKey, UnregisterHotKey, HOT_KEY_MODIFIERS,
|
||||||
|
};
|
||||||
|
use windows::Win32::UI::Shell::{
|
||||||
|
Shell_NotifyIconW, NIF_ICON, NIF_MESSAGE, NIF_TIP, NIM_ADD, NIM_DELETE, NIM_MODIFY,
|
||||||
|
NOTIFYICONDATAW,
|
||||||
|
};
|
||||||
|
use windows::Win32::UI::WindowsAndMessaging::*;
|
||||||
|
|
||||||
|
static ICO_IDLE: &[u8] = include_bytes!("../assets/idle.ico");
|
||||||
|
static ICO_RECORDING: &[u8] = include_bytes!("../assets/recording.ico");
|
||||||
|
|
||||||
|
/// Parse an ICO file and load the best-matching icon entry as an HICON.
|
||||||
|
fn load_icon_from_ico(data: &[u8]) -> HICON {
|
||||||
|
// ICO header: reserved(2) + type(2) + count(2) = 6 bytes
|
||||||
|
// Each entry: width(1) + height(1) + colorCount(1) + reserved(1)
|
||||||
|
// + planes(2) + bitCount(2) + bytesInRes(4) + imageOffset(4) = 16 bytes
|
||||||
|
if data.len() < 6 {
|
||||||
|
return HICON::default();
|
||||||
|
}
|
||||||
|
let count = u16::from_le_bytes([data[4], data[5]]) as usize;
|
||||||
|
if count == 0 || data.len() < 6 + count * 16 {
|
||||||
|
return HICON::default();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get system small icon size for tray
|
||||||
|
let desired = unsafe { GetSystemMetrics(SM_CXSMICON) } as u32;
|
||||||
|
|
||||||
|
// Find best entry: prefer exact match on desired size, else closest larger, else largest
|
||||||
|
let mut best_idx = 0;
|
||||||
|
let mut best_w = 0u32;
|
||||||
|
for i in 0..count {
|
||||||
|
let off = 6 + i * 16;
|
||||||
|
let w = if data[off] == 0 { 256 } else { data[off] as u32 };
|
||||||
|
if w == desired {
|
||||||
|
best_idx = i;
|
||||||
|
best_w = w;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (w >= desired && (best_w < desired || w < best_w))
|
||||||
|
|| (best_w < desired && w > best_w)
|
||||||
|
{
|
||||||
|
best_idx = i;
|
||||||
|
best_w = w;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let off = 6 + best_idx * 16;
|
||||||
|
let bytes_in_res = u32::from_le_bytes([data[off + 8], data[off + 9], data[off + 10], data[off + 11]]) as usize;
|
||||||
|
let image_offset = u32::from_le_bytes([data[off + 12], data[off + 13], data[off + 14], data[off + 15]]) as usize;
|
||||||
|
|
||||||
|
if data.len() < image_offset + bytes_in_res {
|
||||||
|
return HICON::default();
|
||||||
|
}
|
||||||
|
|
||||||
|
let image_data = &data[image_offset..image_offset + bytes_in_res];
|
||||||
|
unsafe {
|
||||||
|
CreateIconFromResourceEx(
|
||||||
|
image_data,
|
||||||
|
true,
|
||||||
|
0x00030000,
|
||||||
|
desired as i32,
|
||||||
|
desired as i32,
|
||||||
|
LR_DEFAULTCOLOR,
|
||||||
|
)
|
||||||
|
.unwrap_or_default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const WM_TRAYICON: u32 = WM_APP + 1;
|
||||||
|
const HOTKEY_ID: i32 = 1;
|
||||||
|
const IDM_EXIT: usize = 1001;
|
||||||
|
const IDM_CHANGE_HOTKEY: usize = 1002;
|
||||||
|
const IDM_TOGGLE_MEDIA_PAUSE: usize = 1003;
|
||||||
|
const IDM_SET_API_KEY: usize = 1004;
|
||||||
|
const IDM_SET_MODEL: usize = 1005;
|
||||||
|
|
||||||
|
thread_local! {
|
||||||
|
static SESSION: RefCell<Option<RecordingSession>> = RefCell::new(None);
|
||||||
|
static HWND_MAIN: RefCell<HWND> = RefCell::new(HWND::default());
|
||||||
|
/// When true, next key press in the dialog sets the hotkey.
|
||||||
|
static PICKING_HOTKEY: RefCell<bool> = RefCell::new(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_tray_tooltip(hwnd: HWND, tip: &str, recording: bool) {
|
||||||
|
let mut nid = NOTIFYICONDATAW {
|
||||||
|
cbSize: std::mem::size_of::<NOTIFYICONDATAW>() as u32,
|
||||||
|
hWnd: hwnd,
|
||||||
|
uID: 1,
|
||||||
|
uFlags: NIF_TIP | NIF_ICON,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Set tooltip
|
||||||
|
let tip_wide: Vec<u16> = tip.encode_utf16().chain(std::iter::once(0)).collect();
|
||||||
|
let len = tip_wide.len().min(nid.szTip.len());
|
||||||
|
nid.szTip[..len].copy_from_slice(&tip_wide[..len]);
|
||||||
|
|
||||||
|
nid.hIcon = if recording {
|
||||||
|
load_icon_from_ico(ICO_RECORDING)
|
||||||
|
} else {
|
||||||
|
load_icon_from_ico(ICO_IDLE)
|
||||||
|
};
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
let _ = Shell_NotifyIconW(NIM_MODIFY, &nid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn toggle_recording(hwnd: HWND) {
|
||||||
|
SESSION.with(|s| {
|
||||||
|
let mut session = s.borrow_mut();
|
||||||
|
let cfg = config::get();
|
||||||
|
if session.is_some() {
|
||||||
|
// Stop recording
|
||||||
|
session.take().unwrap().stop();
|
||||||
|
set_tray_tooltip(hwnd, "语音输入 - 空闲", false);
|
||||||
|
sound::play_stop();
|
||||||
|
if cfg.media_pause_enabled {
|
||||||
|
input::send_media_play_pause();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Start recording
|
||||||
|
if cfg.media_pause_enabled {
|
||||||
|
input::send_media_play_pause();
|
||||||
|
}
|
||||||
|
match RecordingSession::start() {
|
||||||
|
Ok(s) => {
|
||||||
|
*session = Some(s);
|
||||||
|
set_tray_tooltip(hwnd, "语音输入 - 录音中...", true);
|
||||||
|
sound::play_start();
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("[voice-ime] Failed to start recording: {e}");
|
||||||
|
set_tray_tooltip(hwnd, &format!("语音输入 - 错误: {e}"), false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Show a generic single-line text input dialog. Returns Some(text) if confirmed, None if cancelled.
|
||||||
|
fn show_text_input_dialog(hwnd: HWND, title: &str, label: &str, initial: &str) -> Option<String> {
|
||||||
|
unsafe {
|
||||||
|
let instance = GetModuleHandleW(None).unwrap();
|
||||||
|
|
||||||
|
let wc = WNDCLASSEXW {
|
||||||
|
cbSize: std::mem::size_of::<WNDCLASSEXW>() as u32,
|
||||||
|
lpfnWndProc: Some(textinput_wnd_proc),
|
||||||
|
hInstance: instance.into(),
|
||||||
|
lpszClassName: w!("VoiceIMETextInput"),
|
||||||
|
hbrBackground: std::mem::transmute(GetStockObject(
|
||||||
|
windows::Win32::Graphics::Gdi::WHITE_BRUSH,
|
||||||
|
)),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
RegisterClassExW(&wc);
|
||||||
|
|
||||||
|
let screen_w = GetSystemMetrics(SM_CXSCREEN);
|
||||||
|
let screen_h = GetSystemMetrics(SM_CYSCREEN);
|
||||||
|
let dlg_w = 420;
|
||||||
|
let dlg_h = 160;
|
||||||
|
|
||||||
|
TEXTINPUT_RESULT.with(|r| *r.borrow_mut() = None);
|
||||||
|
TEXTINPUT_LABEL.with(|r| *r.borrow_mut() = label.to_string());
|
||||||
|
TEXTINPUT_INITIAL.with(|r| *r.borrow_mut() = initial.to_string());
|
||||||
|
|
||||||
|
let mut title_w: Vec<u16> = title.encode_utf16().collect();
|
||||||
|
title_w.push(0);
|
||||||
|
|
||||||
|
let dlg = CreateWindowExW(
|
||||||
|
WS_EX_TOPMOST | WS_EX_TOOLWINDOW,
|
||||||
|
w!("VoiceIMETextInput"),
|
||||||
|
PCWSTR(title_w.as_ptr()),
|
||||||
|
WS_POPUP | WS_CAPTION | WS_SYSMENU,
|
||||||
|
(screen_w - dlg_w) / 2,
|
||||||
|
(screen_h - dlg_h) / 2,
|
||||||
|
dlg_w,
|
||||||
|
dlg_h,
|
||||||
|
Some(hwnd),
|
||||||
|
None,
|
||||||
|
Some(instance.into()),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let _ = ShowWindow(dlg, SW_SHOW);
|
||||||
|
let _ = SetForegroundWindow(dlg);
|
||||||
|
|
||||||
|
let mut msg = MSG::default();
|
||||||
|
while GetMessageW(&mut msg, None, 0, 0).as_bool() {
|
||||||
|
if !IsWindow(Some(dlg)).as_bool() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let _ = TranslateMessage(&msg);
|
||||||
|
DispatchMessageW(&msg);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEXTINPUT_RESULT.with(|r| r.borrow().clone()).filter(|s| !s.is_empty())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
thread_local! {
|
||||||
|
static TEXTINPUT_RESULT: RefCell<Option<String>> = RefCell::new(None);
|
||||||
|
static TEXTINPUT_LABEL: RefCell<String> = RefCell::new(String::new());
|
||||||
|
static TEXTINPUT_INITIAL: RefCell<String> = RefCell::new(String::new());
|
||||||
|
static TEXTINPUT_EDIT_HWND: RefCell<HWND> = RefCell::new(HWND::default());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Control id of the text input dialog's edit box.
const IDC_TEXTINPUT_EDIT: i32 = 3001;

/// Control id of the text input dialog's OK button.
const IDC_TEXTINPUT_OK: i32 = 3002;
|
||||||
|
|
||||||
|
unsafe extern "system" fn textinput_wnd_proc(
|
||||||
|
hwnd: HWND,
|
||||||
|
msg: u32,
|
||||||
|
wparam: WPARAM,
|
||||||
|
lparam: LPARAM,
|
||||||
|
) -> LRESULT {
|
||||||
|
match msg {
|
||||||
|
WM_CREATE => {
|
||||||
|
let instance = unsafe { GetModuleHandleW(None).unwrap() };
|
||||||
|
|
||||||
|
let label_text = TEXTINPUT_LABEL.with(|r| r.borrow().clone());
|
||||||
|
let mut label_w: Vec<u16> = label_text.encode_utf16().collect();
|
||||||
|
label_w.push(0);
|
||||||
|
unsafe {
|
||||||
|
let _ = CreateWindowExW(
|
||||||
|
WINDOW_EX_STYLE::default(),
|
||||||
|
w!("STATIC"),
|
||||||
|
PCWSTR(label_w.as_ptr()),
|
||||||
|
WS_CHILD | WS_VISIBLE,
|
||||||
|
15, 15, 380, 20,
|
||||||
|
Some(hwnd), None, Some(instance.into()), None,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let init_text = TEXTINPUT_INITIAL.with(|r| r.borrow().clone());
|
||||||
|
let mut init_w: Vec<u16> = init_text.encode_utf16().collect();
|
||||||
|
init_w.push(0);
|
||||||
|
let edit = unsafe {
|
||||||
|
CreateWindowExW(
|
||||||
|
WS_EX_CLIENTEDGE,
|
||||||
|
w!("EDIT"),
|
||||||
|
PCWSTR(init_w.as_ptr()),
|
||||||
|
WS_CHILD | WS_VISIBLE | WINDOW_STYLE(0x0080),
|
||||||
|
15, 42, 375, 25,
|
||||||
|
Some(hwnd), Some(HMENU(IDC_TEXTINPUT_EDIT as _)),
|
||||||
|
Some(instance.into()), None,
|
||||||
|
).unwrap()
|
||||||
|
};
|
||||||
|
TEXTINPUT_EDIT_HWND.with(|h| *h.borrow_mut() = edit);
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
let _ = CreateWindowExW(
|
||||||
|
WINDOW_EX_STYLE::default(),
|
||||||
|
w!("BUTTON"),
|
||||||
|
w!("确定"),
|
||||||
|
WS_CHILD | WS_VISIBLE | WINDOW_STYLE(0x0001),
|
||||||
|
160, 80, 90, 30,
|
||||||
|
Some(hwnd), Some(HMENU(IDC_TEXTINPUT_OK as _)),
|
||||||
|
Some(instance.into()), None,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
let _ = SendMessageW(edit, 0x00B1, Some(WPARAM(0)), Some(LPARAM(-1)));
|
||||||
|
let _ = windows::Win32::UI::Input::KeyboardAndMouse::SetFocus(Some(edit));
|
||||||
|
}
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
WM_COMMAND => {
|
||||||
|
let id = (wparam.0 & 0xFFFF) as i32;
|
||||||
|
if id == IDC_TEXTINPUT_OK {
|
||||||
|
let edit = TEXTINPUT_EDIT_HWND.with(|h| *h.borrow());
|
||||||
|
let len = unsafe { GetWindowTextLengthW(edit) } as usize;
|
||||||
|
let mut buf = vec![0u16; len + 1];
|
||||||
|
unsafe { GetWindowTextW(edit, &mut buf) };
|
||||||
|
let text = String::from_utf16_lossy(&buf[..len]);
|
||||||
|
TEXTINPUT_RESULT.with(|r| *r.borrow_mut() = Some(text.trim().to_string()));
|
||||||
|
unsafe { let _ = DestroyWindow(hwnd); }
|
||||||
|
}
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
WM_CLOSE => {
|
||||||
|
TEXTINPUT_RESULT.with(|r| *r.borrow_mut() = Some(String::new()));
|
||||||
|
unsafe { let _ = DestroyWindow(hwnd); }
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
WM_DESTROY => {
|
||||||
|
unsafe { PostQuitMessage(0); }
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
_ => unsafe { DefWindowProcW(hwnd, msg, wparam, lparam) },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Show API key input dialog. Returns true if user provided a key.
|
||||||
|
fn show_api_key_dialog(hwnd: HWND) -> bool {
|
||||||
|
let current = config::get().api_key;
|
||||||
|
if let Some(key) = show_text_input_dialog(hwnd, "设置 API Key", "请输入 Qwen ASR API Key:", ¤t) {
|
||||||
|
config::set_api_key(key);
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Show model input dialog.
|
||||||
|
fn show_model_dialog(hwnd: HWND) {
|
||||||
|
let current = config::get().model;
|
||||||
|
if let Some(model) = show_text_input_dialog(hwnd, "设置模型", "请输入 ASR 模型名称:", ¤t) {
|
||||||
|
config::set_model(model);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn show_context_menu(hwnd: HWND) {
|
||||||
|
unsafe {
|
||||||
|
let cfg = config::get();
|
||||||
|
let menu = CreatePopupMenu().unwrap();
|
||||||
|
|
||||||
|
// Hotkey item
|
||||||
|
let hotkey_label: Vec<u16> = format!("设置快捷键 (当前: {})\0", config::vk_name(cfg.hotkey_vk))
|
||||||
|
.encode_utf16()
|
||||||
|
.collect();
|
||||||
|
AppendMenuW(menu, MF_STRING, IDM_CHANGE_HOTKEY, PCWSTR(hotkey_label.as_ptr())).unwrap();
|
||||||
|
|
||||||
|
// Media pause toggle
|
||||||
|
let pause_label: Vec<u16> = "录音时暂停媒体播放\0".encode_utf16().collect();
|
||||||
|
let flags = if cfg.media_pause_enabled {
|
||||||
|
MF_STRING | MF_CHECKED
|
||||||
|
} else {
|
||||||
|
MF_STRING | MF_UNCHECKED
|
||||||
|
};
|
||||||
|
AppendMenuW(menu, flags, IDM_TOGGLE_MEDIA_PAUSE, PCWSTR(pause_label.as_ptr())).unwrap();
|
||||||
|
|
||||||
|
// API Key
|
||||||
|
let api_key_label: Vec<u16> = "设置 API Key...\0".encode_utf16().collect();
|
||||||
|
AppendMenuW(menu, MF_STRING, IDM_SET_API_KEY, PCWSTR(api_key_label.as_ptr())).unwrap();
|
||||||
|
|
||||||
|
// Model
|
||||||
|
let model_label: Vec<u16> = format!("设置模型 ({})...\0", cfg.model).encode_utf16().collect();
|
||||||
|
AppendMenuW(menu, MF_STRING, IDM_SET_MODEL, PCWSTR(model_label.as_ptr())).unwrap();
|
||||||
|
|
||||||
|
// Separator
|
||||||
|
AppendMenuW(menu, MF_SEPARATOR, 0, None).unwrap();
|
||||||
|
|
||||||
|
// Exit
|
||||||
|
let exit_label: Vec<u16> = "退出\0".encode_utf16().collect();
|
||||||
|
AppendMenuW(menu, MF_STRING, IDM_EXIT, PCWSTR(exit_label.as_ptr())).unwrap();
|
||||||
|
|
||||||
|
let mut pt = windows::Win32::Foundation::POINT::default();
|
||||||
|
let _ = GetCursorPos(&mut pt);
|
||||||
|
|
||||||
|
// Required for the menu to disappear when clicking outside
|
||||||
|
let _ = SetForegroundWindow(hwnd);
|
||||||
|
|
||||||
|
let _ = TrackPopupMenu(menu, TPM_BOTTOMALIGN | TPM_LEFTALIGN, pt.x, pt.y, Some(0), hwnd, None);
|
||||||
|
let _ = DestroyMenu(menu);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Register the global hotkey using the current config.
|
||||||
|
fn register_configured_hotkey(hwnd: HWND) -> bool {
|
||||||
|
let vk = config::get().hotkey_vk;
|
||||||
|
unsafe {
|
||||||
|
RegisterHotKey(
|
||||||
|
Some(hwnd),
|
||||||
|
HOTKEY_ID,
|
||||||
|
HOT_KEY_MODIFIERS(0),
|
||||||
|
vk as u32,
|
||||||
|
)
|
||||||
|
.is_ok()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Show a small dialog that captures the next key press as the new hotkey.
|
||||||
|
fn show_hotkey_picker(hwnd: HWND) {
|
||||||
|
// Create a small popup window for key capture
|
||||||
|
unsafe {
|
||||||
|
let instance = GetModuleHandleW(None).unwrap();
|
||||||
|
|
||||||
|
// Register class for picker window (once is fine, re-register is harmless)
|
||||||
|
let wc = WNDCLASSEXW {
|
||||||
|
cbSize: std::mem::size_of::<WNDCLASSEXW>() as u32,
|
||||||
|
lpfnWndProc: Some(picker_wnd_proc),
|
||||||
|
hInstance: instance.into(),
|
||||||
|
lpszClassName: w!("VoiceIMEPicker"),
|
||||||
|
hbrBackground: std::mem::transmute(GetStockObject(
|
||||||
|
windows::Win32::Graphics::Gdi::WHITE_BRUSH,
|
||||||
|
)),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
RegisterClassExW(&wc);
|
||||||
|
|
||||||
|
// Get screen center
|
||||||
|
let screen_w = GetSystemMetrics(SM_CXSCREEN);
|
||||||
|
let screen_h = GetSystemMetrics(SM_CYSCREEN);
|
||||||
|
let dlg_w = 320;
|
||||||
|
let dlg_h = 120;
|
||||||
|
|
||||||
|
let picker = CreateWindowExW(
|
||||||
|
WS_EX_TOPMOST | WS_EX_TOOLWINDOW,
|
||||||
|
w!("VoiceIMEPicker"),
|
||||||
|
w!("设置快捷键"),
|
||||||
|
WS_POPUP | WS_CAPTION | WS_SYSMENU,
|
||||||
|
(screen_w - dlg_w) / 2,
|
||||||
|
(screen_h - dlg_h) / 2,
|
||||||
|
dlg_w,
|
||||||
|
dlg_h,
|
||||||
|
Some(hwnd),
|
||||||
|
None,
|
||||||
|
Some(instance.into()),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Unregister current hotkey so the key can be captured
|
||||||
|
let _ = UnregisterHotKey(Some(hwnd), HOTKEY_ID);
|
||||||
|
PICKING_HOTKEY.with(|p| *p.borrow_mut() = true);
|
||||||
|
|
||||||
|
let _ = ShowWindow(picker, SW_SHOW);
|
||||||
|
let _ = SetForegroundWindow(picker);
|
||||||
|
|
||||||
|
// Run a modal message loop for the picker
|
||||||
|
let mut msg = MSG::default();
|
||||||
|
while GetMessageW(&mut msg, None, 0, 0).as_bool() {
|
||||||
|
let _ = TranslateMessage(&msg);
|
||||||
|
DispatchMessageW(&msg);
|
||||||
|
if !PICKING_HOTKEY.with(|p| *p.borrow()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Re-register hotkey with (possibly new) config
|
||||||
|
if !register_configured_hotkey(hwnd) {
|
||||||
|
let text: Vec<u16> = format!(
|
||||||
|
"无法注册快捷键 {},可能被其他程序占用。\0",
|
||||||
|
config::vk_name(config::get().hotkey_vk)
|
||||||
|
)
|
||||||
|
.encode_utf16()
|
||||||
|
.collect();
|
||||||
|
let title: Vec<u16> = "语音输入\0".encode_utf16().collect();
|
||||||
|
MessageBoxW(
|
||||||
|
Some(hwnd),
|
||||||
|
PCWSTR(text.as_ptr()),
|
||||||
|
PCWSTR(title.as_ptr()),
|
||||||
|
MB_OK | MB_ICONWARNING,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe extern "system" fn picker_wnd_proc(
|
||||||
|
hwnd: HWND,
|
||||||
|
msg: u32,
|
||||||
|
wparam: WPARAM,
|
||||||
|
lparam: LPARAM,
|
||||||
|
) -> LRESULT {
|
||||||
|
match msg {
|
||||||
|
WM_CREATE => {
|
||||||
|
// Create a static label
|
||||||
|
let text: Vec<u16> = "请按下新的快捷键...\0".encode_utf16().collect();
|
||||||
|
let instance = unsafe { GetModuleHandleW(None).unwrap() };
|
||||||
|
unsafe {
|
||||||
|
let _ = CreateWindowExW(
|
||||||
|
WINDOW_EX_STYLE::default(),
|
||||||
|
w!("STATIC"),
|
||||||
|
PCWSTR(text.as_ptr()),
|
||||||
|
WS_CHILD | WS_VISIBLE | WINDOW_STYLE(0x01),
|
||||||
|
0,
|
||||||
|
25,
|
||||||
|
320,
|
||||||
|
40,
|
||||||
|
Some(hwnd),
|
||||||
|
None,
|
||||||
|
Some(instance.into()),
|
||||||
|
None,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
WM_KEYDOWN | WM_SYSKEYDOWN => {
|
||||||
|
let vk = (wparam.0 & 0xFF) as u16;
|
||||||
|
// Ignore modifier-only keys
|
||||||
|
if !matches!(vk, 0x10 | 0x11 | 0x12 | 0xA0..=0xA5) {
|
||||||
|
config::set_hotkey_vk(vk);
|
||||||
|
PICKING_HOTKEY.with(|p| *p.borrow_mut() = false);
|
||||||
|
unsafe { let _ = DestroyWindow(hwnd); }
|
||||||
|
}
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
WM_CLOSE => {
|
||||||
|
PICKING_HOTKEY.with(|p| *p.borrow_mut() = false);
|
||||||
|
unsafe { let _ = DestroyWindow(hwnd); }
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
WM_DESTROY => {
|
||||||
|
PICKING_HOTKEY.with(|p| *p.borrow_mut() = false);
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
_ => unsafe { DefWindowProcW(hwnd, msg, wparam, lparam) },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe extern "system" fn wnd_proc(
|
||||||
|
hwnd: HWND,
|
||||||
|
msg: u32,
|
||||||
|
wparam: WPARAM,
|
||||||
|
lparam: LPARAM,
|
||||||
|
) -> LRESULT {
|
||||||
|
match msg {
|
||||||
|
WM_HOTKEY => {
|
||||||
|
if wparam.0 == HOTKEY_ID as usize {
|
||||||
|
toggle_recording(hwnd);
|
||||||
|
}
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
WM_COMMAND => {
|
||||||
|
let id = (wparam.0 & 0xFFFF) as usize;
|
||||||
|
if id == IDM_EXIT {
|
||||||
|
// Clean up and exit
|
||||||
|
SESSION.with(|s| {
|
||||||
|
let mut session = s.borrow_mut();
|
||||||
|
if let Some(mut sess) = session.take() {
|
||||||
|
sess.stop();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
unsafe { DestroyWindow(hwnd).unwrap() };
|
||||||
|
} else if id == IDM_CHANGE_HOTKEY {
|
||||||
|
show_hotkey_picker(hwnd);
|
||||||
|
// Update tray tooltip with new hotkey name
|
||||||
|
set_tray_tooltip(hwnd, &format!("语音输入 ({})", config::vk_name(config::get().hotkey_vk)), false);
|
||||||
|
} else if id == IDM_TOGGLE_MEDIA_PAUSE {
|
||||||
|
let enabled = config::get().media_pause_enabled;
|
||||||
|
config::set_media_pause(!enabled);
|
||||||
|
} else if id == IDM_SET_API_KEY {
|
||||||
|
show_api_key_dialog(hwnd);
|
||||||
|
} else if id == IDM_SET_MODEL {
|
||||||
|
show_model_dialog(hwnd);
|
||||||
|
}
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
x if x == WM_TRAYICON => {
|
||||||
|
let event = (lparam.0 & 0xFFFF) as u32;
|
||||||
|
if event == WM_RBUTTONUP {
|
||||||
|
show_context_menu(hwnd);
|
||||||
|
}
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
WM_DESTROY => {
|
||||||
|
// Remove tray icon
|
||||||
|
let nid = NOTIFYICONDATAW {
|
||||||
|
cbSize: std::mem::size_of::<NOTIFYICONDATAW>() as u32,
|
||||||
|
hWnd: hwnd,
|
||||||
|
uID: 1,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
unsafe { let _ = Shell_NotifyIconW(NIM_DELETE, &nid); };
|
||||||
|
|
||||||
|
// Unregister hotkey
|
||||||
|
unsafe { let _ = UnregisterHotKey(Some(hwnd), HOTKEY_ID); };
|
||||||
|
|
||||||
|
unsafe { PostQuitMessage(0) };
|
||||||
|
LRESULT(0)
|
||||||
|
}
|
||||||
|
_ => unsafe { DefWindowProcW(hwnd, msg, wparam, lparam) },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_hidden_window() -> HWND {
|
||||||
|
unsafe {
|
||||||
|
let instance = GetModuleHandleW(None).unwrap();
|
||||||
|
|
||||||
|
let wc = WNDCLASSEXW {
|
||||||
|
cbSize: std::mem::size_of::<WNDCLASSEXW>() as u32,
|
||||||
|
lpfnWndProc: Some(wnd_proc),
|
||||||
|
hInstance: instance.into(),
|
||||||
|
lpszClassName: w!("VoiceIMEWindow"),
|
||||||
|
hbrBackground: std::mem::transmute(GetStockObject(BLACK_BRUSH)),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
RegisterClassExW(&wc);
|
||||||
|
|
||||||
|
let hwnd = CreateWindowExW(
|
||||||
|
WINDOW_EX_STYLE::default(),
|
||||||
|
w!("VoiceIMEWindow"),
|
||||||
|
w!("Voice IME"),
|
||||||
|
WS_OVERLAPPEDWINDOW,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
Some(HWND_MESSAGE), // Message-only window
|
||||||
|
None,
|
||||||
|
Some(instance.into()),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
hwnd
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_tray_icon(hwnd: HWND) {
|
||||||
|
let mut nid = NOTIFYICONDATAW {
|
||||||
|
cbSize: std::mem::size_of::<NOTIFYICONDATAW>() as u32,
|
||||||
|
hWnd: hwnd,
|
||||||
|
uID: 1,
|
||||||
|
uFlags: NIF_MESSAGE | NIF_ICON | NIF_TIP,
|
||||||
|
uCallbackMessage: WM_TRAYICON,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
nid.hIcon = load_icon_from_ico(ICO_IDLE);
|
||||||
|
|
||||||
|
let tip = "语音输入 - 空闲 (F10)";
|
||||||
|
let tip_wide: Vec<u16> = tip.encode_utf16().chain(std::iter::once(0)).collect();
|
||||||
|
let len = tip_wide.len().min(nid.szTip.len());
|
||||||
|
nid.szTip[..len].copy_from_slice(&tip_wide[..len]);
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
let _ = Shell_NotifyIconW(NIM_ADD, &nid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
config::load();
|
||||||
|
|
||||||
|
let hwnd = create_hidden_window();
|
||||||
|
|
||||||
|
HWND_MAIN.with(|h| *h.borrow_mut() = hwnd);
|
||||||
|
|
||||||
|
// Prompt for API key if not configured
|
||||||
|
if config::get().api_key.is_empty() {
|
||||||
|
if !show_api_key_dialog(hwnd) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register global hotkey from config
|
||||||
|
if !register_configured_hotkey(hwnd) {
|
||||||
|
eprintln!("[voice-ime] Failed to register hotkey. Is another instance running?");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
create_tray_icon(hwnd);
|
||||||
|
|
||||||
|
let hotkey_name = config::vk_name(config::get().hotkey_vk);
|
||||||
|
set_tray_tooltip(hwnd, &format!("语音输入 ({})", hotkey_name), false);
|
||||||
|
|
||||||
|
eprintln!("[voice-ime] Running. Press {} to toggle recording.", hotkey_name);
|
||||||
|
|
||||||
|
// Message loop
|
||||||
|
unsafe {
|
||||||
|
let mut msg = MSG::default();
|
||||||
|
while GetMessageW(&mut msg, None, 0, 0).as_bool() {
|
||||||
|
let _ = TranslateMessage(&msg);
|
||||||
|
DispatchMessageW(&msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
eprintln!("[voice-ime] Exiting.");
|
||||||
|
}
|
||||||
99
src/session.rs
Normal file
99
src/session.rs
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
use std::thread;
|
||||||
|
use tokio::sync::mpsc;
|
||||||
|
|
||||||
|
use crate::audio::AudioCapture;
|
||||||
|
use crate::config;
|
||||||
|
use crate::input;
|
||||||
|
use crate::ws;
|
||||||
|
|
||||||
|
/// Text bookkeeping for one recording session, which may span several
/// VAD-delimited speech segments.
struct TextState {
    /// Transcripts of all finished segments, appended in arrival order.
    completed_text: String,
    /// Preview of the segment still in progress: `text` followed by `stash`
    /// from the most recent partial event.
    current_partial: String,
    /// The complete text most recently typed into the input field.
    last_displayed: String,
}
|
||||||
|
|
||||||
|
pub struct RecordingSession {
|
||||||
|
stop_tx: Option<mpsc::Sender<()>>,
|
||||||
|
thread_handle: Option<thread::JoinHandle<()>>,
|
||||||
|
_capture: Option<AudioCapture>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RecordingSession {
|
||||||
|
pub fn start() -> Result<Self, String> {
|
||||||
|
let (stop_tx, stop_rx) = mpsc::channel::<()>(1);
|
||||||
|
let (audio_tx, audio_rx) = mpsc::unbounded_channel::<Vec<u8>>();
|
||||||
|
|
||||||
|
let (capture, _audio_cfg) = AudioCapture::start(audio_tx)?;
|
||||||
|
|
||||||
|
let cfg = config::get();
|
||||||
|
let api_key = cfg.api_key.clone();
|
||||||
|
let model = cfg.model.clone();
|
||||||
|
|
||||||
|
let thread_handle = thread::spawn(move || {
|
||||||
|
let rt = tokio::runtime::Builder::new_current_thread()
|
||||||
|
.enable_all()
|
||||||
|
.build()
|
||||||
|
.expect("Failed to create tokio runtime");
|
||||||
|
|
||||||
|
let state = Arc::new(Mutex::new(TextState {
|
||||||
|
completed_text: String::new(),
|
||||||
|
current_partial: String::new(),
|
||||||
|
last_displayed: String::new(),
|
||||||
|
}));
|
||||||
|
|
||||||
|
let on_event = {
|
||||||
|
let state = state.clone();
|
||||||
|
move |event: ws::AsrEvent| {
|
||||||
|
let mut st = state.lock().unwrap();
|
||||||
|
match event {
|
||||||
|
ws::AsrEvent::Partial { text, stash } => {
|
||||||
|
st.current_partial = format!("{text}{stash}");
|
||||||
|
let full = format!("{}{}", st.completed_text, st.current_partial);
|
||||||
|
st.last_displayed = input::apply_text_update(&st.last_displayed, &full);
|
||||||
|
}
|
||||||
|
ws::AsrEvent::SegmentCompleted { transcript } => {
|
||||||
|
st.completed_text.push_str(&transcript);
|
||||||
|
st.current_partial.clear();
|
||||||
|
let full = st.completed_text.clone();
|
||||||
|
st.last_displayed = input::apply_text_update(&st.last_displayed, &full);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let result = rt.block_on(ws::run_ws_session(&api_key, &model, audio_rx, stop_rx, on_event));
|
||||||
|
|
||||||
|
if let Err(e) = result {
|
||||||
|
eprintln!("[voice-ime] Recording session error: {e}");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(RecordingSession {
|
||||||
|
stop_tx: Some(stop_tx),
|
||||||
|
thread_handle: Some(thread_handle),
|
||||||
|
_capture: Some(capture),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn stop(&mut self) {
|
||||||
|
self._capture.take();
|
||||||
|
|
||||||
|
if let Some(tx) = self.stop_tx.take() {
|
||||||
|
let _ = tx.blocking_send(());
|
||||||
|
}
|
||||||
|
if let Some(handle) = self.thread_handle.take() {
|
||||||
|
let _ = handle.join();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for RecordingSession {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
self.stop();
|
||||||
|
}
|
||||||
|
}
|
||||||
31
src/sound.rs
Normal file
31
src/sound.rs
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
use rodio::{Decoder, OutputStream, Sink};
|
||||||
|
use std::io::Cursor;
|
||||||
|
|
||||||
|
static SND_START: &[u8] = include_bytes!("../assets/start.mp3");
|
||||||
|
static SND_STOP: &[u8] = include_bytes!("../assets/stop.mp3");
|
||||||
|
|
||||||
|
/// Play an embedded sound in a background thread (non-blocking).
|
||||||
|
fn play_bytes(data: &'static [u8]) {
|
||||||
|
std::thread::spawn(move || {
|
||||||
|
let Ok((_stream, handle)) = OutputStream::try_default() else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
let Ok(sink) = Sink::try_new(&handle) else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
let cursor = Cursor::new(data);
|
||||||
|
let Ok(source) = Decoder::new(cursor) else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
sink.append(source);
|
||||||
|
sink.sleep_until_end();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Plays the embedded "recording started" sound without blocking the caller.
pub fn play_start() {
    play_bytes(SND_START)
}
|
||||||
|
|
||||||
|
/// Plays the embedded "recording stopped" sound without blocking the caller.
pub fn play_stop() {
    play_bytes(SND_STOP)
}
|
||||||
230
src/ws.rs
Normal file
230
src/ws.rs
Normal file
@ -0,0 +1,230 @@
|
|||||||
|
use base64::{engine::general_purpose::STANDARD, Engine as _};
|
||||||
|
use futures_util::{SinkExt, StreamExt};
|
||||||
|
use serde_json::Value;
|
||||||
|
use tokio::sync::mpsc;
|
||||||
|
use tokio_tungstenite::{
|
||||||
|
connect_async_tls_with_config,
|
||||||
|
tungstenite::{http::Request, Message},
|
||||||
|
};
|
||||||
|
|
||||||
|
const WS_BASE_URL: &str = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime?model=";
|
||||||
|
|
||||||
|
/// Events emitted by the ASR WebSocket session.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum AsrEvent {
    /// Partial result for the current speech segment. Full preview = `text` + `stash`.
    Partial { text: String, stash: String },
    /// Final result for a completed speech segment.
    SegmentCompleted { transcript: String },
}
|
||||||
|
|
||||||
|
/// Run a WebSocket ASR session using the Qwen3 ASR Realtime API.
|
||||||
|
pub async fn run_ws_session(
|
||||||
|
api_key: &str,
|
||||||
|
model: &str,
|
||||||
|
mut audio_rx: mpsc::UnboundedReceiver<Vec<u8>>,
|
||||||
|
mut stop_rx: mpsc::Receiver<()>,
|
||||||
|
on_event: impl Fn(AsrEvent) + Send + 'static,
|
||||||
|
) -> Result<(), String> {
|
||||||
|
let ws_url = format!("{WS_BASE_URL}{model}");
|
||||||
|
// Build request with auth headers
|
||||||
|
let request = Request::builder()
|
||||||
|
.uri(&ws_url)
|
||||||
|
.header("Authorization", format!("Bearer {api_key}"))
|
||||||
|
.header("OpenAI-Beta", "realtime=v1")
|
||||||
|
.header("Host", "dashscope.aliyuncs.com")
|
||||||
|
.header("Connection", "Upgrade")
|
||||||
|
.header("Upgrade", "websocket")
|
||||||
|
.header("Sec-WebSocket-Version", "13")
|
||||||
|
.header(
|
||||||
|
"Sec-WebSocket-Key",
|
||||||
|
tokio_tungstenite::tungstenite::handshake::client::generate_key(),
|
||||||
|
)
|
||||||
|
.body(())
|
||||||
|
.map_err(|e| format!("Build request failed: {e}"))?;
|
||||||
|
|
||||||
|
let (ws_stream, _) = connect_async_tls_with_config(request, None, false, None)
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("WebSocket connect failed: {e}"))?;
|
||||||
|
|
||||||
|
let (mut write, mut read) = ws_stream.split();
|
||||||
|
|
||||||
|
// 1. Wait for session.created
|
||||||
|
wait_for_type(&mut read, "session.created").await?;
|
||||||
|
eprintln!("[voice-ime] WS: session.created");
|
||||||
|
|
||||||
|
// 2. Send session.update (VAD mode)
|
||||||
|
let session_update = serde_json::json!({
|
||||||
|
"event_id": "evt_session_update",
|
||||||
|
"type": "session.update",
|
||||||
|
"session": {
|
||||||
|
"modalities": ["text"],
|
||||||
|
"input_audio_format": "pcm16",
|
||||||
|
"sample_rate": 16000,
|
||||||
|
"input_audio_transcription": {
|
||||||
|
"language": "zh"
|
||||||
|
},
|
||||||
|
"turn_detection": {
|
||||||
|
"type": "server_vad",
|
||||||
|
"threshold": 0.0,
|
||||||
|
"silence_duration_ms": 400
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
write
|
||||||
|
.send(Message::Text(session_update.to_string().into()))
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Send session.update failed: {e}"))?;
|
||||||
|
|
||||||
|
// 3. Wait for session.updated
|
||||||
|
wait_for_type(&mut read, "session.updated").await?;
|
||||||
|
eprintln!("[voice-ime] WS: session.updated, streaming audio...");
|
||||||
|
|
||||||
|
// 4. Spawn sender task: forwards audio as base64 JSON events
|
||||||
|
let (finish_tx, mut finish_rx) = mpsc::channel::<()>(1);
|
||||||
|
|
||||||
|
let send_task = tokio::spawn(async move {
|
||||||
|
let mut seq = 0u64;
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
biased;
|
||||||
|
_ = stop_rx.recv() => {
|
||||||
|
// Drain remaining audio
|
||||||
|
while let Ok(chunk) = audio_rx.try_recv() {
|
||||||
|
let _ = send_audio(&mut write, &chunk, &mut seq).await;
|
||||||
|
}
|
||||||
|
// Send session.finish
|
||||||
|
let finish = serde_json::json!({
|
||||||
|
"event_id": "evt_finish",
|
||||||
|
"type": "session.finish"
|
||||||
|
});
|
||||||
|
let _ = write.send(Message::Text(finish.to_string().into())).await;
|
||||||
|
let _ = finish_tx.send(()).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
chunk = audio_rx.recv() => {
|
||||||
|
match chunk {
|
||||||
|
Some(data) => {
|
||||||
|
if send_audio(&mut write, &data, &mut seq).await.is_err() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// Audio channel closed
|
||||||
|
let finish = serde_json::json!({
|
||||||
|
"event_id": "evt_finish",
|
||||||
|
"type": "session.finish"
|
||||||
|
});
|
||||||
|
let _ = write.send(Message::Text(finish.to_string().into())).await;
|
||||||
|
let _ = finish_tx.send(()).await;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// 5. Receive task: process server events
|
||||||
|
let recv_task = tokio::spawn(async move {
|
||||||
|
let mut finish_sent = false;
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
biased;
|
||||||
|
_ = finish_rx.recv(), if !finish_sent => {
|
||||||
|
finish_sent = true;
|
||||||
|
// Keep reading until session.finished
|
||||||
|
}
|
||||||
|
msg = read.next() => {
|
||||||
|
match msg {
|
||||||
|
Some(Ok(Message::Text(text))) => {
|
||||||
|
let v: Value = match serde_json::from_str(&text) {
|
||||||
|
Ok(v) => v,
|
||||||
|
Err(_) => continue,
|
||||||
|
};
|
||||||
|
let event_type = v["type"].as_str().unwrap_or("");
|
||||||
|
match event_type {
|
||||||
|
"conversation.item.input_audio_transcription.text" => {
|
||||||
|
let text_part = v["text"].as_str().unwrap_or("").to_string();
|
||||||
|
let stash = v["stash"].as_str().unwrap_or("").to_string();
|
||||||
|
on_event(AsrEvent::Partial { text: text_part, stash });
|
||||||
|
}
|
||||||
|
"conversation.item.input_audio_transcription.completed" => {
|
||||||
|
let transcript = v["transcript"].as_str().unwrap_or("").to_string();
|
||||||
|
eprintln!("[voice-ime] Segment completed: {transcript}");
|
||||||
|
on_event(AsrEvent::SegmentCompleted { transcript });
|
||||||
|
}
|
||||||
|
"session.finished" => {
|
||||||
|
eprintln!("[voice-ime] WS: session.finished");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
"error" => {
|
||||||
|
let msg = v["error"]["message"].as_str().unwrap_or("unknown");
|
||||||
|
eprintln!("[voice-ime] ASR error: {msg}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
_ => {} // ignore speech_started, speech_stopped, committed, etc.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(Ok(_)) => {} // ping/pong/binary
|
||||||
|
Some(Err(e)) => {
|
||||||
|
eprintln!("[voice-ime] WS read error: {e}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
None => return,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let _ = send_task.await;
|
||||||
|
let _ = recv_task.await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send a PCM audio chunk as a base64-encoded input_audio_buffer.append event.
|
||||||
|
async fn send_audio<S>(write: &mut S, pcm_bytes: &[u8], seq: &mut u64) -> Result<(), String>
|
||||||
|
where
|
||||||
|
S: futures_util::Sink<Message> + Unpin,
|
||||||
|
S::Error: std::fmt::Display,
|
||||||
|
{
|
||||||
|
let encoded = STANDARD.encode(pcm_bytes);
|
||||||
|
*seq += 1;
|
||||||
|
let event = serde_json::json!({
|
||||||
|
"event_id": format!("evt_audio_{seq}"),
|
||||||
|
"type": "input_audio_buffer.append",
|
||||||
|
"audio": encoded
|
||||||
|
});
|
||||||
|
write
|
||||||
|
.send(Message::Text(event.to_string().into()))
|
||||||
|
.await
|
||||||
|
.map_err(|e| format!("Send audio failed: {e}"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read messages until one matches the expected type.
|
||||||
|
async fn wait_for_type<S>(read: &mut S, expected: &str) -> Result<Value, String>
|
||||||
|
where
|
||||||
|
S: futures_util::Stream<Item = Result<Message, tokio_tungstenite::tungstenite::Error>> + Unpin,
|
||||||
|
{
|
||||||
|
loop {
|
||||||
|
match read.next().await {
|
||||||
|
Some(Ok(Message::Text(text))) => {
|
||||||
|
let v: Value = serde_json::from_str(&text)
|
||||||
|
.map_err(|e| format!("Parse JSON failed: {e}"))?;
|
||||||
|
let t = v["type"].as_str().unwrap_or("");
|
||||||
|
if t == expected {
|
||||||
|
return Ok(v);
|
||||||
|
}
|
||||||
|
if t == "error" {
|
||||||
|
let msg = v["error"]["message"].as_str().unwrap_or("unknown");
|
||||||
|
return Err(format!("Server error: {msg}"));
|
||||||
|
}
|
||||||
|
// Ignore other event types while waiting
|
||||||
|
}
|
||||||
|
Some(Ok(_)) => {} // ignore non-text
|
||||||
|
Some(Err(e)) => return Err(format!("WS read error: {e}")),
|
||||||
|
None => return Err("Connection closed unexpectedly".to_string()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user