import fs from "fs";
|
|
import OpenAI from "openai";
|
|
import prompt from "./prompt.md";
|
|
import { prisma } from "@/lib/prisma";
|
|
import { downloadFile } from "@/lib/minio";
|
|
import { extractAudio } from "../media";
|
|
import { z } from "zod";
|
|
import { zodResponseFormat } from "openai/helpers/zod";
|
|
|
|
// OpenAI-compatible chat-completions client.
// NOTE(review): the model used below is "gemini-2.5-flash", so
// OPENAI_API_BASE_URL presumably points at a Gemini-compatible
// gateway/proxy rather than api.openai.com — confirm in deployment config.
const client = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
  baseURL: process.env.OPENAI_API_BASE_URL,
});
|
|
|
|
// Zod schema aligned with prompt.md.
// Validates the structured JSON object the model is instructed to return
// (field semantics and allowed values are defined in prompt.md — verify there).
export const SttSchema = z
  .object({
    // true when any speech was detected in the audio
    speech_detected: z.boolean(),
    // language of the detected speech; null when no speech
    // (exact format — name vs. code — is set by prompt.md; confirm)
    language: z.string().min(1).nullable(),
    // category of the audio content; null when not determined
    // NOTE(review): allowed values presumably enumerated in prompt.md — verify
    audio_type: z.string().nullable(),
    // transcript segments in order; empty array when no speech
    transcript: z.array(z.string()),
    // description of non-speech audio; null when not applicable
    non_speech_summary: z.string().nullable(),
  })
  // .strict() rejects unknown keys so model-output drift fails loudly
  .strict();

// Static type inferred from the schema above.
export type SttResult = z.infer<typeof SttSchema>;
|
|
|
|
async function transcriptAudio(audio: Buffer | string) {
|
|
if (typeof audio === "string") {
|
|
audio = fs.readFileSync(audio);
|
|
}
|
|
const base64Audio = Buffer.from(audio).toString("base64");
|
|
|
|
const completion = await client.chat.completions.create({
|
|
model: "gemini-2.5-flash",
|
|
messages: [
|
|
{
|
|
role: "user",
|
|
content: [
|
|
{ type: "text", text: prompt },
|
|
{
|
|
type: "input_audio",
|
|
input_audio: { data: base64Audio, format: "mp3" },
|
|
},
|
|
],
|
|
},
|
|
],
|
|
response_format: zodResponseFormat(SttSchema, "stt_result"),
|
|
});
|
|
|
|
const data = completion.choices?.[0]?.message
|
|
if (!data) {
|
|
throw new Error("No STT result returned from model");
|
|
}
|
|
const parsed = SttSchema.safeParse(data?.content).data;
|
|
if (!parsed) {
|
|
throw new Error("Failed to parse STT result with zod");
|
|
}
|
|
return parsed;
|
|
}
|
|
|
|
export async function transcriptAweme(awemeId: string): Promise<SttResult> {
|
|
const aweme = await prisma.video.findUnique({
|
|
where: { aweme_id: awemeId },
|
|
});
|
|
if (!aweme) {
|
|
throw new Error("Aweme not found or aweme is not a video post");
|
|
}
|
|
const vPath = aweme.video_url
|
|
const buffer = await downloadFile(vPath);
|
|
|
|
const audioDat = await extractAudio(buffer, { format: "mp3", bitrateKbps: 128 });
|
|
|
|
if (!audioDat || !audioDat.buffer) {
|
|
throw new Error("Failed to extract audio from video");
|
|
}
|
|
|
|
// 调用大模型生成转写结果
|
|
const result = await transcriptAudio(audioDat.buffer);
|
|
|
|
// 将转写结果持久化到数据库
|
|
await prisma.videoTranscript.upsert({
|
|
where: { videoId: awemeId },
|
|
update: {
|
|
speech_detected: result.speech_detected,
|
|
language: result.language,
|
|
audio_type: result.audio_type,
|
|
transcript: result.transcript,
|
|
non_speech_summary: result.non_speech_summary,
|
|
},
|
|
create: {
|
|
videoId: awemeId,
|
|
speech_detected: result.speech_detected,
|
|
language: result.language,
|
|
audio_type: result.audio_type,
|
|
transcript: result.transcript,
|
|
non_speech_summary: result.non_speech_summary,
|
|
},
|
|
});
|
|
|
|
return result;
|
|
}
|