import fs from "fs";
|
|
import OpenAI from "openai";
|
|
import prompt from "./prompt.md";
|
|
import { prisma } from "@/lib/prisma";
|
|
import { downloadFile } from "@/lib/minio";
|
|
import { extractAudio } from "../media";
|
|
import { z } from "zod";
|
|
import { zodResponseFormat } from "openai/helpers/zod";
|
|
|
|
// OpenAI-compatible chat-completions client.
// NOTE(review): the model used below is "gemini-2.5-flash", so
// OPENAI_API_BASE_URL presumably points at a Gemini-compatible
// gateway/proxy rather than api.openai.com — confirm in deployment config.
const client = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
  baseURL: process.env.OPENAI_API_BASE_URL,
});
|
|
|
|
// Zod schema aligned with prompt.md.
// Validates the structured JSON object the model is instructed to return
// (field semantics and allowed values are defined in prompt.md — verify there).
export const SttSchema = z
  .object({
    // true when any speech was detected in the audio
    speech_detected: z.boolean(),
    // language of the detected speech; null when no speech
    // (exact format — name vs. code — is set by prompt.md; confirm)
    language: z.string().min(1).nullable(),
    // category of the audio content; null when not determined
    // NOTE(review): allowed values presumably enumerated in prompt.md — verify
    audio_type: z.string().nullable(),
    // transcript segments in order; empty array when no speech
    transcript: z.array(z.string()),
    // description of non-speech audio; null when not applicable
    non_speech_summary: z.string().nullable(),
  })
  // .strict() rejects unknown keys so model-output drift fails loudly
  .strict();

// Static type inferred from the schema above.
export type SttResult = z.infer<typeof SttSchema>;
|
|
|
|
async function transcriptAudio(audio: Buffer | string) {
|
|
if (typeof audio === "string") {
|
|
audio = fs.readFileSync(audio);
|
|
}
|
|
const base64Audio = Buffer.from(audio).toString("base64");
|
|
|
|
const completion = await client.chat.completions.create({
|
|
model: "gemini-2.5-flash",
|
|
messages: [
|
|
{
|
|
role: "user",
|
|
content: [
|
|
{ type: "text", text: prompt },
|
|
{
|
|
type: "input_audio",
|
|
input_audio: { data: base64Audio, format: "mp3" },
|
|
},
|
|
],
|
|
},
|
|
],
|
|
response_format: zodResponseFormat(SttSchema, "stt_result"),
|
|
});
|
|
|
|
const data = completion.choices?.[0]?.message
|
|
if (!data) {
|
|
throw new Error("No STT result returned from model");
|
|
}
|
|
const parsed = SttSchema.safeParse(data?.content).data;
|
|
if (!parsed) {
|
|
throw new Error("Failed to parse STT result with zod");
|
|
}
|
|
return parsed;
|
|
}
|
|
|
|
export async function transcriptAweme(awemeId: string): Promise<SttResult> {
|
|
const aweme = await prisma.video.findUnique({
|
|
where: { aweme_id: awemeId },
|
|
});
|
|
if (!aweme) {
|
|
throw new Error("Aweme not found or aweme is not a video post");
|
|
}
|
|
const vPath = aweme.video_url
|
|
const buffer = await downloadFile(vPath);
|
|
|
|
const audioDat = await extractAudio(buffer, { format: "mp3", bitrateKbps: 128 });
|
|
|
|
if (!audioDat || !audioDat.buffer) {
|
|
throw new Error("Failed to extract audio from video");
|
|
}
|
|
|
|
// 调用大模型生成转写结果
|
|
const result = await transcriptAudio(audioDat.buffer);
|
|
|
|
// 将转写结果持久化到数据库
|
|
await prisma.videoTranscript.upsert({
|
|
where: { videoId: awemeId },
|
|
update: {
|
|
speech_detected: result.speech_detected,
|
|
language: result.language,
|
|
audio_type: result.audio_type,
|
|
transcript: result.transcript,
|
|
non_speech_summary: result.non_speech_summary,
|
|
},
|
|
create: {
|
|
videoId: awemeId,
|
|
speech_detected: result.speech_detected,
|
|
language: result.language,
|
|
audio_type: result.audio_type,
|
|
transcript: result.transcript,
|
|
non_speech_summary: result.non_speech_summary,
|
|
},
|
|
});
|
|
|
|
return result;
|
|
}
|