// src/scrapeDouyin.ts
import { chromium, type Page, type Response } from 'playwright';
import { prisma } from '@/lib/prisma';
import { uploadFile, generateUniqueFileName } from '@/lib/minio';
import { createCamelCompatibleProxy } from '@/app/fetcher/utils';
import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBinary } from '@/app/fetcher/network';
import { pickBestPlayAddr, extractFirstFrame } from '@/app/fetcher/media';
import { handleImagePost } from '@/app/fetcher/uploader';
import { saveToDB, saveImagePostToDB } from '@/app/fetcher/persist';

const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';
const COMMENT_PATH = '/aweme/v1/web/comment/list/';
const POST_PATH = '/aweme/v1/web/aweme/post/';

/**
 * Returns the last non-empty path segment of a page URL.
 *
 * Query string, hash fragment and a trailing slash are ignored, so
 * `https://host/note/123/?from=share` yields `123`.
 */
function lastPathSegment(pageUrl: string): string | undefined {
  let path: string;
  try {
    path = new URL(pageUrl).pathname;
  } catch {
    // Not an absolute URL: strip query/hash by hand and fall through.
    path = pageUrl.split(/[?#]/)[0] ?? '';
  }
  return path.split('/').filter(Boolean).at(-1);
}

/**
 * Parses the comment-list response, falling back to an empty comment payload
 * when no response was captured or its body could not be parsed.
 */
async function loadComments(commentRes: Response | null | undefined) {
  const parsed = commentRes ? await safeJson(commentRes) : null;
  return parsed ?? { comments: [], total: 0, status_code: 0 };
}

/**
 * Reads the work ("aweme") detail out of the chunks captured from the page's
 * `__pace_f` queue by the init script installed in `scrapeDouyin`.
 *
 * The captured payload uses camelCase keys, so the result is wrapped with
 * `createCamelCompatibleProxy` and `authorInfo` is mirrored onto `author`
 * before it is handed to the upload/persist helpers.
 *
 * @throws Error when no captured chunk contains an `"awemeId":` payload or
 *   the payload has no `aweme.detail`.
 */
async function readAwemeFromPageMemory(page: Page) {
  const detail = await page.evaluate(() => {
    const captured: unknown = (window as any).__pace_captured__;
    if (!Array.isArray(captured)) return null;

    const entry = captured.find(
      (item: any) => typeof item?.[1] === 'string' && item[1].includes('"awemeId":'),
    );
    if (!entry) return null;

    // The chunk has a non-JSON prefix before the first "{" and stray "]\n"
    // sequences; strip both before parsing.
    const raw: string = entry[1];
    const parsed = JSON.parse(raw.slice(raw.indexOf('{')).replaceAll(']\n', ''));
    return parsed?.aweme?.detail ?? null;
  });

  if (!detail) throw new Error('页面内存数据中未找到作品详情');

  detail.author = detail.authorInfo;
  return createCamelCompatibleProxy(detail as DouyinImageAweme);
}

/**
 * Scrapes a single Douyin work (video or image post), uploads its media to
 * MinIO and persists it together with its comments.
 *
 * The work type is decided by whichever API response arrives first: the
 * detail endpoint (video) or the post endpoint (image post). If neither is
 * seen within the timeout, the data embedded in the page is used instead and
 * the work is treated as an image post.
 *
 * @param url Page URL of the Douyin work.
 * @returns The persisted record spread together with `type: "image"` or
 *   `type: "video"`.
 * @throws Error when the work data cannot be located in any source, or when
 *   navigation, download, upload or persistence fails.
 */
export async function scrapeDouyin(url: string) {
  console.log("Launch chromium");
  // launchPersistentContext starts its own browser; no separate chromium.launch() is needed.
  const context = await chromium.launchPersistentContext('chrome-profile/douyin', { headless: true });

  try {
    const page = await context.newPage();

    await page.addInitScript(() => {
      // Global container holding everything the page pushes into __pace_f.
      (window as any).__pace_captured__ = [];
      const captured = (window as any).__pace_captured__;

      // Wrap an array in a Proxy so that every push() is mirrored into `captured`.
      const proxyArr = new Proxy([] as any[], {
        get(target, prop, receiver) {
          if (prop === 'push') {
            return (...items: any[]) => {
              // Best effort: capturing must never break the page's own push.
              try { captured.push(...items); } catch { }
              return Array.prototype.push.apply(target, items);
            };
          }
          return Reflect.get(target, prop, receiver);
        },
        set(target, prop, value, receiver) {
          // Plain pass-through for all writes (including `length`).
          return Reflect.set(target, prop, value, receiver);
        }
      });

      // Point the queue on both self and window at the proxy;
      // some sites use self, others use window.
      (self as any).__pace_f = proxyArr;
      (window as any).__pace_f = proxyArr;
    });

    // Register the "first one wins" listener before navigating so no response is missed.
    const firstTypePromise = waitForFirstResponse(context, [
      { key: 'detail', test: (r: Response) => r.url().includes(DETAIL_PATH) && r.status() === 200 },
      { key: 'post', test: (r: Response) => r.url().includes(POST_PATH) && r.status() === 200 },
    ], 20_000); // single overall 20s timeout, not one per matcher

    // Comments are optional: use them if they show up quickly, otherwise move on.
    const commentPromise = waitForResponseWithTimeout(
      context,
      (r: Response) => r.url().includes(COMMENT_PATH) && r.status() === 200,
      8_000
    ).catch(() => null);

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60_000 });

    const firstType = await firstTypePromise; // { key, response } | null
    const commentRes = await commentPromise;  // Response | null

    if (!firstType) {
      // Neither endpoint was captured: fall back to the data embedded in the page.
      console.warn('无法判定作品类型(未捕获详情或图文接口)');
      const aweme = await readAwemeFromPageMemory(page);
      const comments = await loadComments(commentRes);
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      return { type: "image", ...saved };
    }

    // Branch: video or image post (only one of them matches, first one wins).
    if (firstType.key === 'post') {
      // Image post.
      const postJson = await safeJson(firstType.response);
      if (!postJson?.aweme_list?.length) throw new Error('图文作品响应为空');

      const target_aweme_id = lastPathSegment(page.url());
      const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
      let aweme = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === target_aweme_id);

      if (!aweme) {
        console.warn(`图文作品响应中未找到对应作品,look for aweme_id=${target_aweme_id}, have ${awemeList.map(pt => pt.aweme_id).join(', ')}`);
        // Not in the list response: read it from the page's embedded data,
        // normalised the same way as the no-response fallback above.
        aweme = await readAwemeFromPageMemory(page);
      }
      console.log(aweme);

      const comments = await loadComments(commentRes);
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      return { type: "image", ...saved };
    }

    if (firstType.key !== 'detail') {
      throw new Error('无法判定作品类型(未命中详情或图文接口)');
    }

    // Video.
    const detail = await safeJson(firstType.response);
    if (!detail) throw new Error('视频作品响应为空');
    const comments = await loadComments(commentRes);

    // Pick the play address with the highest bit rate.
    const bestPlayAddr = pickBestPlayAddr(detail?.aweme_detail?.video?.bit_rate);
    const bestVUrl = bestPlayAddr?.url_list?.[0];
    console.log('Best video URL:', bestVUrl);

    // Download the video, upload it to MinIO and keep the resulting URL.
    let uploadedUrl: string | undefined;
    let coverUrl: string | undefined;
    if (bestVUrl && detail?.aweme_detail) {
      const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
      const awemeId = detail.aweme_detail.aweme_id;
      const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');
      uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
      console.log('Uploaded to MinIO:', uploadedUrl);

      // Extract the first frame as the cover; a failure here only skips the cover.
      try {
        const cover = await extractFirstFrame(buffer);
        if (cover) {
          const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
          coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
          console.log('Cover uploaded to MinIO:', coverUrl);
        }
      } catch (e) {
        console.warn('Extract first frame failed, skip cover:', (e as Error)?.message || e);
      }
    }

    const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl);
    return { type: "video", ...saved };
  } finally {
    // Always release the DB connection, even if closing the browser context fails.
    try {
      await context.close();
    } finally {
      await prisma.$disconnect();
    }
  }
}