import fs from "fs";
import OpenAI from "openai";
import prompt from "./prompt.md";
import { prisma } from "@/lib/prisma";
import { downloadFile } from "@/lib/minio";
import { extractAudio } from "../media";
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
import { ProxyAgent, setGlobalDispatcher } from "undici";

// Route all outbound HTTP through a proxy when one is configured in the env.
const proxy = process.env.HTTPS_PROXY || process.env.HTTP_PROXY;
if (proxy) {
  setGlobalDispatcher(new ProxyAgent(proxy));
}

const client = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
  baseURL: process.env.OPENAI_API_BASE_URL,
});

// Zod schema aligned with prompt.md
export const SttSchema = z
  .object({
    speech_detected: z.boolean(),
    language: z.string().min(1).nullable(),
    audio_type: z.string().nullable(),
    transcript: z.array(z.string()),
    non_speech_summary: z.string().nullable(),
  })
  .strict();

// FIX: original was a bare `z.infer` (no type argument), which does not compile.
export type SttResult = z.infer<typeof SttSchema>;

/**
 * Run speech-to-text on an MP3 audio buffer (or a filesystem path to one)
 * via the chat-completions API, constrained to the {@link SttSchema} shape.
 *
 * @param audio - MP3 audio bytes, or a path to an MP3 file to read.
 * @returns The schema-validated STT result.
 * @throws Error when the model returns no content, or when the returned
 *   content is not valid JSON matching {@link SttSchema}.
 */
async function transcriptAudio(audio: Buffer | string): Promise<SttResult> {
  // Avoid reassigning the parameter; read the file when given a path.
  const audioBuffer = typeof audio === "string" ? fs.readFileSync(audio) : audio;
  const base64Audio = Buffer.from(audioBuffer).toString("base64");

  const completion = await client.chat.completions.create({
    model: "gemini-2.5-flash",
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          {
            type: "input_audio",
            input_audio: { data: base64Audio, format: "mp3" },
          },
        ],
      },
    ],
    response_format: zodResponseFormat(SttSchema, "stt_result"),
  });

  const message = completion.choices?.[0]?.message;
  // FIX: guard BEFORE logging — the original dereferenced `message.content`
  // first, which could throw a TypeError instead of the intended error.
  if (!message || !message.content) {
    throw new Error("No STT result returned from model");
  }
  console.log("转写结果", message.content);

  try {
    // FIX: validate against the schema instead of a blind `as SttResult`
    // cast — the model output is untrusted input.
    return SttSchema.parse(JSON.parse(message.content));
  } catch (e) {
    // FIX: preserve the underlying cause instead of swallowing it.
    const reason = e instanceof Error ? e.message : String(e);
    throw new Error(`Failed to parse STT result JSON: ${reason}`);
  }
}

/**
 * Transcribe the audio track of a stored video post and persist the result.
 *
 * Looks up the video row, downloads its file, extracts an MP3 audio track,
 * runs {@link transcriptAudio}, and upserts the transcript keyed by video id.
 *
 * @param awemeId - Primary id of the video post (`aweme_id`).
 * @returns The STT result that was persisted.
 * @throws Error when the video row is missing, audio extraction fails,
 *   or transcription fails.
 */
export async function transcriptAweme(awemeId: string): Promise<SttResult> {
  const aweme = await prisma.video.findUnique({
    where: { aweme_id: awemeId },
  });
  if (!aweme) {
    throw new Error("Aweme not found or aweme is not a video post");
  }

  const buffer = await downloadFile(aweme.video_url);
  const audioDat = await extractAudio(buffer, { format: "mp3", bitrateKbps: 128 });
  if (!audioDat || !audioDat.buffer) {
    throw new Error("Failed to extract audio from video");
  }

  // Run the STT model on the extracted audio.
  const result = await transcriptAudio(audioDat.buffer);

  // Persist the transcript, overwriting any previous one for this video.
  // Shared field set keeps `update` and `create` in sync.
  const fields = {
    speech_detected: result.speech_detected,
    language: result.language,
    audio_type: result.audio_type,
    transcript: result.transcript,
    non_speech_summary: result.non_speech_summary,
  };
  await prisma.videoTranscript.upsert({
    where: { videoId: awemeId },
    update: fields,
    create: { videoId: awemeId, ...fields },
  });

  return result;
}