2025-10-20 13:06:06 +08:00

171 lines
8.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// src/scrapeDouyin.ts
import { chromium, type Response } from 'playwright';
import { prisma } from '@/lib/prisma';
import { uploadFile, generateUniqueFileName } from '@/lib/minio';
import { createCamelCompatibleProxy } from '@/app/fetcher/utils';
import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBinary } from '@/app/fetcher/network';
import { pickBestPlayAddr, extractFirstFrame } from '@/app/fetcher/media';
import { handleImagePost } from '@/app/fetcher/uploader';
import { saveToDB, saveImagePostToDB } from '@/app/fetcher/persist';
/** URL fragment matched against responses to detect the single-work detail API call. */
const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';

/** URL fragment matched against responses to detect the comment list API call. */
const COMMENT_PATH = '/aweme/v1/web/comment/list/';

/** URL fragment matched against responses to detect the post list API call. */
const POST_PATH = '/aweme/v1/web/aweme/post/';
/**
 * Opens a Douyin work page in a persistent headless Chromium context, waits
 * for the first matching API response (detail or post list), and hands the
 * captured data to the upload/persist helpers.
 *
 * Three paths exist:
 *  - no detail/post response was captured: the work is read from the data the
 *    page pushed into `__pace_f` (captured by the init script) and handled as
 *    an image post;
 *  - the post list response arrived first: the work whose id matches the last
 *    path segment of the page URL is handled as an image post, falling back to
 *    the in-page data when the list does not contain it;
 *  - the detail response arrived first: the work is handled as a video; the
 *    best play address is downloaded and uploaded, and a first-frame cover is
 *    uploaded on a best-effort basis.
 *
 * @param url Address of the work page to open.
 * @returns `{ type: "image", ...saved }` or `{ type: "video", ...saved }`,
 *   where `saved` is whatever the persist helper returned.
 * @throws Error when no work data can be found in the page memory, when the
 *   post list or detail response is empty, or when the first response has an
 *   unknown key. Errors from navigation, download, upload and persistence
 *   propagate unchanged.
 */
export async function scrapeDouyin(url: string) {
  console.log("Launch chromium");
  // launchPersistentContext starts its own browser; a separate
  // chromium.launch() would only spawn an unused second process.
  const context = await chromium.launchPersistentContext('chrome-profile/douyin', { headless: true });
  try {
    const page = await context.newPage();
    await page.addInitScript(() => {
      // Global container for everything the page pushes into its queue.
      (window as any).__pace_captured__ = [];
      const captured = (window as any).__pace_captured__;
      // Wrap an array in a Proxy so that every push is mirrored into `captured`.
      const proxyArr = new Proxy([] as any[], {
        get(target, prop, receiver) {
          if (prop === 'push') {
            return (...items: any[]) => {
              // Capturing is best-effort and must never break the page's own push.
              try { captured.push(...items); } catch { }
              return Array.prototype.push.apply(target, items);
            };
          }
          return Reflect.get(target, prop, receiver);
        },
        set(target, prop, value, receiver) {
          return Reflect.set(target, prop, value, receiver);
        }
      });
      // Point the queue on both self and window at the proxy,
      // since some sites use self and others use window.
      (self as any).__pace_f = proxyArr;
      (window as any).__pace_f = proxyArr;
    });

    // Reads the work detail from the data captured by the init script.
    // Returns null (instead of throwing inside the page) when nothing usable
    // was captured, so callers can raise a meaningful error.
    const readAwemeFromMemory = async () => {
      const parsed = await page.evaluate(() => {
        // This callback runs in the browser and must stay self-contained.
        const entries: any[] = (window as any).__pace_captured__ ?? [];
        for (const entry of entries) {
          const payload = entry?.[1];
          if (typeof payload !== 'string' || !payload.includes(`"awemeId":`)) continue;
          const start = payload.indexOf("{");
          if (start < 0) continue;
          try {
            // Drop the `]\n` sequences around the JSON object before parsing.
            return JSON.parse(payload.slice(start).replace(/\]\n/g, ''));
          } catch {
            // Not parseable; try the next captured entry.
          }
        }
        return null;
      });
      const detail = parsed?.aweme?.detail as DouyinImageAweme | undefined;
      if (!detail) return null;
      // The in-page data exposes the author as `authorInfo`; mirror it to
      // `author`, keeping any existing `author` if `authorInfo` is absent.
      const loose = detail as any;
      loose.author = loose.authorInfo ?? loose.author;
      return createCamelCompatibleProxy(detail);
    };

    // Register the "first one wins" listeners before navigating so that no
    // response is missed.
    const firstTypePromise = waitForFirstResponse(context, [
      { key: 'detail', test: (r: Response) => r.url().includes(DETAIL_PATH) && r.status() === 200 },
      { key: 'post', test: (r: Response) => r.url().includes(POST_PATH) && r.status() === 200 },
    ], 20_000); // single 20s overall timeout rather than one per matcher
    // Comments are optional: use them if they arrive in time, otherwise move on.
    const commentPromise = waitForResponseWithTimeout(
      context,
      (r: Response) => r.url().includes(COMMENT_PATH) && r.status() === 200,
      8_000
    ).catch(() => null);

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60_000 });
    const firstType = await firstTypePromise; // { key, response } | null
    const commentRes = await commentPromise; // Response | null

    // Falls back to an empty comment list when no comment response was
    // captured or its body could not be parsed.
    const loadComments = async () => {
      const parsed = commentRes ? await safeJson<DouyinCommentResponse>(commentRes) : null;
      return parsed ?? { comments: [], total: 0, status_code: 0 };
    };

    if (!firstType) {
      console.warn('无法判定作品类型(未捕获详情或图文接口)');
      const aweme = await readAwemeFromMemory();
      if (!aweme) throw new Error('页面内存数据中未找到作品详情');
      const comments = await loadComments();
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      return { type: "image", ...saved };
    }

    // Branch: video or image post (only one of the two matches, first one wins).
    if (firstType.key === 'post') {
      // Image post.
      const postJson = await safeJson<DouyinPostListResponse>(firstType.response);
      if (!postJson?.aweme_list?.length) throw new Error('图文作品响应为空');
      // Use the last non-empty path segment so that a query string or a
      // trailing slash does not corrupt the id.
      const target_aweme_id = new URL(page.url()).pathname.split('/').filter(Boolean).pop();
      const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
      const listed = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === target_aweme_id);
      if (!listed) {
        console.warn(`图文作品响应中未找到对应作品look for aweme_id=${target_aweme_id}, have ${postJson.aweme_list.map(pt => pt.aweme_id).join(', ')}`);
      }
      // Fall back to the in-page data, normalised the same way as in the
      // "no response captured" path above.
      const aweme = listed ?? (await readAwemeFromMemory());
      if (!aweme) throw new Error('页面内存数据中未找到作品详情');
      console.log(aweme);
      const comments = await loadComments();
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      return { type: "image", ...saved };
    } else if (firstType.key === 'detail') {
      // Video.
      const detail = await safeJson<DouyinVideoDetailResponse>(firstType.response);
      if (!detail) throw new Error('视频作品响应为空');
      const comments = await loadComments();
      const awemeDetail = detail.aweme_detail;
      // Pick the play address to download from the available bit rates.
      const bestPlayAddr = pickBestPlayAddr(
        awemeDetail?.video?.bit_rate
      );
      const bestVUrl = bestPlayAddr?.url_list?.[0];
      console.log('Best video URL:', bestVUrl);
      // Download the video and upload it to MinIO to obtain an external link.
      let uploadedUrl: string | undefined;
      let coverUrl: string | undefined;
      if (bestVUrl && awemeDetail) {
        const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
        const awemeId = awemeDetail.aweme_id;
        const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');
        uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
        console.log('Uploaded to MinIO:', uploadedUrl);
        // Extract the first frame as the cover and upload it; a failure here
        // only skips the cover and does not fail the scrape.
        try {
          const cover = await extractFirstFrame(buffer);
          if (cover) {
            const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
            coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
            console.log('Cover uploaded to MinIO:', coverUrl);
          }
        } catch (e) {
          console.warn('Extract first frame failed, skip cover:', (e as Error)?.message || e);
        }
      }
      const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl);
      return { type: "video", ...saved };
    } else {
      throw new Error('无法判定作品类型(未命中详情或图文接口)');
    }
  } finally {
    // Always release the database connection, even if closing the browser
    // context fails.
    try {
      await context.close();
    } finally {
      await prisma.$disconnect();
    }
  }
}