103 lines
2.8 KiB
TypeScript

import fs from "fs";
import OpenAI from "openai";
import prompt from "./prompt.md";
import { prisma } from "@/lib/prisma";
import { downloadFile } from "@/lib/minio";
import { extractAudio } from "../media";
import { z } from "zod";
import { zodResponseFormat } from "openai/helpers/zod";
// Shared chat-completions client for this module. Both the credential and the
// endpoint come from the environment; the base URL override presumably points at
// an OpenAI-compatible gateway (the request below asks for a Gemini model) —
// confirm against deployment config.
const client = new OpenAI({
  baseURL: process.env.OPENAI_API_BASE_URL,
  apiKey: process.env.OPENAI_API_KEY,
});
/**
 * Shape of the structured speech-to-text result the model is asked to return.
 * Kept in sync with the instructions in prompt.md. The object is strict, so any
 * key not listed here causes validation to fail.
 */
export const SttSchema = z.strictObject({
  // Whether any speech was found in the audio.
  speech_detected: z.boolean(),
  // Non-empty language string, or null.
  language: z.string().min(1).nullable(),
  // Free-form audio classification, or null.
  audio_type: z.string().nullable(),
  // Transcript as a list of strings (may be empty).
  transcript: z.array(z.string()),
  // Description of non-speech content, or null.
  non_speech_summary: z.string().nullable(),
});

/** Static type of a validated speech-to-text result. */
export type SttResult = z.infer<typeof SttSchema>;
/**
 * Sends an audio clip to the chat-completions endpoint together with the
 * transcription prompt and returns the model's structured answer, validated
 * against `SttSchema`.
 *
 * @param audio - The audio bytes, or a filesystem path to read them from. The
 *   payload is declared to the model as "mp3".
 * @returns The validated transcription result.
 * @throws Error if the response carries no message or no text content, if the
 *   content is not valid JSON, or if the JSON does not satisfy `SttSchema`.
 */
async function transcriptAudio(audio: Buffer | string): Promise<SttResult> {
  // A string argument is treated as a path; read it without blocking the event loop.
  const audioBytes =
    typeof audio === "string" ? await fs.promises.readFile(audio) : audio;
  const base64Audio = Buffer.from(audioBytes).toString("base64");

  const completion = await client.chat.completions.create({
    model: "gemini-2.5-flash",
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          {
            type: "input_audio",
            input_audio: { data: base64Audio, format: "mp3" },
          },
        ],
      },
    ],
    response_format: zodResponseFormat(SttSchema, "stt_result"),
  });

  const message = completion.choices?.[0]?.message;
  if (!message) {
    throw new Error("No STT result returned from model");
  }

  // `create()` returns the structured output as a JSON *string*; it must be
  // decoded before schema validation, otherwise zod rejects it as a non-object.
  const content = message.content;
  if (typeof content !== "string" || content.trim() === "") {
    throw new Error("No STT result returned from model");
  }

  let payload: unknown;
  try {
    payload = JSON.parse(content);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `Failed to parse STT result with zod: response is not valid JSON (${reason})`,
    );
  }

  const parsed = SttSchema.safeParse(payload);
  if (!parsed.success) {
    throw new Error(
      `Failed to parse STT result with zod: ${parsed.error.message}`,
    );
  }
  return parsed.data;
}
/**
 * Transcribes the audio track of a stored video and saves the result.
 *
 * Steps: look up the video row by `aweme_id`, fetch the file referenced by its
 * `video_url`, extract a 128 kbps MP3 track, run it through `transcriptAudio`,
 * then upsert the outcome into `videoTranscript` keyed by `videoId`.
 *
 * @param awemeId - Identifier used both to find the video and as the transcript's `videoId`.
 * @returns The transcription result that was persisted.
 * @throws Error if no video row matches, or if audio extraction yields no buffer.
 *   Errors from the download, the model call, or the database propagate unchanged.
 */
export async function transcriptAweme(awemeId: string): Promise<SttResult> {
  const video = await prisma.video.findUnique({ where: { aweme_id: awemeId } });
  if (!video) {
    throw new Error("Aweme not found or aweme is not a video post");
  }

  const videoBytes = await downloadFile(video.video_url);
  const extracted = await extractAudio(videoBytes, {
    format: "mp3",
    bitrateKbps: 128,
  });
  if (!extracted?.buffer) {
    throw new Error("Failed to extract audio from video");
  }

  // Ask the model for the transcription.
  const result = await transcriptAudio(extracted.buffer);

  // Persist the transcription; the same column values are written whether the
  // row is created or updated.
  const { speech_detected, language, audio_type, transcript, non_speech_summary } =
    result;
  const columns = {
    speech_detected,
    language,
    audio_type,
    transcript,
    non_speech_summary,
  };
  await prisma.videoTranscript.upsert({
    where: { videoId: awemeId },
    update: columns,
    create: { videoId: awemeId, ...columns },
  });

  return result;
}