2025-10-21 14:40:27 +08:00

205 lines
9.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// src/scrapeDouyin.ts
import { BrowserContext, chromium, Page, type Response } from 'playwright';
import { prisma } from '@/lib/prisma';
import { uploadFile, generateUniqueFileName } from '@/lib/minio';
import { createCamelCompatibleProxy } from '@/app/fetcher/utils';
import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBinary } from '@/app/fetcher/network';
import { pickBestPlayAddr, extractFirstFrame } from '@/app/fetcher/media';
import { handleImagePost } from '@/app/fetcher/uploader';
import { saveToDB, saveImagePostToDB } from '@/app/fetcher/persist';
import chalk from 'chalk';
/** URL fragment matched against responses to spot the single-work detail endpoint (treated as a video work by `scrapeDouyin`). */
const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';

/** URL fragment matched against responses to spot the comment list endpoint. */
const COMMENT_PATH = '/aweme/v1/web/comment/list/';

/** URL fragment matched against responses to spot the post list endpoint (treated as an image post by `scrapeDouyin`). */
const POST_PATH = '/aweme/v1/web/aweme/post/';
/**
 * Reads the work detail (and comments, when present) from the data that the
 * init script collected into `window.__pace_captured__` inside the page.
 *
 * @param context - Browser context of the page (not used by this function).
 * @param page - Page whose captured in-memory data should be read.
 * @returns `aweme` wrapped by `createCamelCompatibleProxy`, and `comments`
 *   wrapped the same way, or `null` when the payload has no `comment` field.
 * @throws Error when no captured entry yields an `aweme.detail` object
 *   (including when the in-page extraction itself fails).
 */
async function readPostMem(context: BrowserContext, page: Page) {
  // Executed inside the page, so it must stay self-contained (no outer variables).
  const extractCapturedPayload = () => {
    const capturedEntries = (window as any).__pace_captured__;
    // Pick the first captured entry whose second element mentions `"awemeId":`.
    // When nothing matches, `hit` is undefined and the access below throws;
    // that rejection is converted to `null` by the `.catch` on the caller side.
    const hit = capturedEntries.find((entry: any) => entry[1] && entry[1].includes(`"awemeId":`));
    const raw = hit[1];
    // Drop everything before the first `{`, strip every `]` + newline pair,
    // then parse what remains as JSON.
    const jsonStart = raw.indexOf("{");
    const jsonText = raw.slice(jsonStart).replaceAll("]\n", '');
    return JSON.parse(jsonText);
  };

  // Any failure inside the page (missing container, no match, bad JSON) becomes null.
  const payload = await page.evaluate(extractCapturedPayload).catch(() => null);

  const detail = payload?.aweme?.detail as DouyinImageAweme;
  if (!detail) {
    throw new Error('页面内存数据中未找到作品详情');
  }

  // Expose the in-memory `authorInfo` under the `author` key as well.
  // NOTE(review): presumably `authorInfo` is not declared on DouyinImageAweme,
  // hence the suppression — confirm against the type definition.
  // @ts-ignore
  detail.author = detail.authorInfo;

  const comments = payload.comment
    ? createCamelCompatibleProxy<DouyinCommentResponse>(payload.comment)
    : null;
  const aweme = createCamelCompatibleProxy(detail);

  return { aweme, comments };
}
/**
 * Scrapes a single Douyin work (image post or video), uploads its media and
 * persists it together with its comments.
 *
 * Flow:
 *  1. Open the URL in a persistent Chromium context, with an init script that
 *     mirrors everything pushed to `__pace_f` into `window.__pace_captured__`.
 *  2. Try the in-memory data first (`readPostMem`) and save it as an image post.
 *  3. If that path fails for any reason, fall back to whichever of the
 *     detail / post-list network responses was observed first.
 *
 * @param url - Page URL of the work to scrape.
 * @returns The persisted record spread together with `type: "image"` or `type: "video"`.
 * @throws Error when neither the in-memory data nor a matching network response
 *   is available, when the post list is empty or lacks the target work, or when
 *   a download / upload / persistence step fails.
 */
export async function scrapeDouyin(url: string) {
  console.log(chalk.blue('🚀 启动 Chromium 浏览器...'));
  // The persistent context owns its own browser instance, so no separate
  // `chromium.launch()` is needed (the original launched one that was never used).
  const context = await chromium.launchPersistentContext('chrome-profile/douyin', { headless: false });
  try {
    // Page setup lives inside the try so the context is closed even if it fails.
    const page = await context.newPage();
    console.log(chalk.cyan(`📄 正在访问: ${chalk.underline(url)}`));
    await page.addInitScript(() => {
      // Global container that stores everything captured from `__pace_f`.
      (window as any).__pace_captured__ = [];
      const captured = (window as any).__pace_captured__;
      // Wrap an array in a Proxy so every `push` is mirrored into `captured`.
      const proxyArr = new Proxy([] as any[], {
        get(target, prop, receiver) {
          if (prop === 'push') {
            return (...items: any[]) => {
              // Best effort: capturing must never break the page's own push.
              try { captured.push(...items); } catch { }
              return Array.prototype.push.apply(target, items);
            };
          }
          return Reflect.get(target, prop, receiver);
        },
        set(target, prop, value, receiver) {
          // Plain pass-through for every property (including `length`).
          // NOTE(review): if the site reassigns `self.__pace_f` wholesale,
          // this proxy is replaced and nothing further is captured.
          return Reflect.set(target, prop, value, receiver);
        }
      });
      (self as any).__pace_f = proxyArr;
      (window as any).__pace_f = proxyArr;
    });

    // Register the "first one wins" listener before navigating so no response is missed.
    const firstTypePromise = waitForFirstResponse(context, [
      { key: 'detail', test: (r: Response) => r.url().includes(DETAIL_PATH) && r.status() === 200 },
      { key: 'post', test: (r: Response) => r.url().includes(POST_PATH) && r.status() === 200 },
    ], 9_000); // overall 9s fallback timeout, not a per-matcher wait
    // Comments are optional: listen briefly, use them if they arrive, otherwise move on.
    const commentPromise = waitForResponseWithTimeout(
      context, (r: Response) => r.url().includes(COMMENT_PATH) && r.status() === 200, 8_000
    ).catch(() => null);

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60_000 });

    try {
      // Preferred path: read the image-post data straight from page memory.
      const { aweme, comments: memoryComments } = await readPostMem(context, page);
      console.log(chalk.green('✓ 从内存读取图文数据成功'));
      const uploads = await handleImagePost(context, aweme);
      let comments = memoryComments;
      if (!comments) {
        console.warn(chalk.yellow('⚠ 从内存读取评论数据失败,尝试从网络请求获取评论数据'));
        const commentRes = await commentPromise;
        comments = commentRes && await safeJson<DouyinCommentResponse>(commentRes);
        if (!comments) {
          console.warn(chalk.yellow('⚠ 无法从内存读取评论数据,且网络请求也未返回评论数据'));
          comments = { comments: [], total: 0, status_code: 0 };
        } else {
          console.log(chalk.green('✓ 从网络请求获取评论数据成功'));
        }
      } else {
        console.log(chalk.green('✓ 从内存读取评论数据成功'));
      }
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } catch (e) {
      // Still fall back to the network path, but surface why the memory path
      // failed instead of swallowing it (this also covers upload / DB errors).
      console.warn(chalk.yellow(`⚠ 内存数据路径失败,回退到网络响应: ${(e as Error)?.message || e}`));
    }

    // Fallback path: rely on the captured network responses.
    const commentRes = await commentPromise;
    const firstType = await firstTypePromise;
    if (!firstType) {
      console.error(chalk.red('✗ 既无法从内存读取数据,也无法从网络获得数据'));
      throw new Error('既无法从内存读取数据,也无法从网络获得数据。');
    }
    console.log(chalk.cyan(`📡 检测到作品类型: ${chalk.bold(firstType.key === 'post' ? '图文' : '视频')}`));

    let comments = commentRes && await safeJson<DouyinCommentResponse>(commentRes);
    if (!comments) {
      console.warn(chalk.yellow('⚠ 无法从内存读取评论数据,且网络请求也未返回评论数据'));
      comments = { comments: [], total: 0, status_code: 0 };
    }

    // Branch on the response that arrived first: image post vs. video.
    if (firstType.key === 'post') {
      // Image post: locate the target work inside the returned post list.
      const postJson = await safeJson<DouyinPostListResponse>(firstType.response);
      if (!postJson?.aweme_list?.length) throw new Error('图文作品响应为空');
      // Use the URL path only, so a query string, hash or trailing slash
      // cannot corrupt the id used for the lookup.
      const targetAwemeId = extractAwemeIdFromUrl(page.url());
      const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
      const aweme = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === targetAwemeId);
      if (!aweme) {
        throw new Error('既无法从内存读取数据Post 列表中也不包含需要爬取的作品。');
      }
      const uploads = await handleImagePost(context, aweme);
      // The full post-list JSON is forwarded to the persistence layer as well.
      const saved = await saveImagePostToDB(context, aweme, comments, uploads, postJson);
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } else if (firstType.key === 'detail') {
      // Video work.
      const detail = (await safeJson<DouyinVideoDetailResponse>(firstType.response))!;
      // Choose the play address via pickBestPlayAddr (highest bitrate, per the original note).
      const bestPlayAddr = pickBestPlayAddr(
        detail?.aweme_detail?.video.bit_rate
      );
      const bestVUrl = bestPlayAddr?.url_list?.[0];
      const fps = bestPlayAddr?.FPS ?? null;
      console.log(chalk.cyan(`📹 最佳视频 URL: ${chalk.dim(bestVUrl)}`));
      console.log(chalk.cyan(`🎞️  视频帧率: ${chalk.bold(fps || 'N/A')} FPS`));
      if (bestPlayAddr?.width && bestPlayAddr?.height) {
        console.log(chalk.cyan(`📐 视频分辨率: ${chalk.bold(`${bestPlayAddr.width}x${bestPlayAddr.height}`)}`));
      }

      // Download the video and upload it to MinIO to obtain an external link.
      let uploadedUrl: string | undefined;
      let coverUrl: string | undefined;
      if (bestVUrl && detail?.aweme_detail) {
        console.log(chalk.blue('⬇️  正在下载视频...'));
        const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
        const awemeId = detail.aweme_detail.aweme_id;
        const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');
        console.log(chalk.blue('⬆️  正在上传视频到 MinIO...'));
        uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
        console.log(chalk.green(`✓ 视频上传成功: ${chalk.underline(uploadedUrl)}`));

        // Extract the first frame as a cover; a failure here is non-fatal.
        try {
          console.log(chalk.blue('🖼️  正在提取视频封面...'));
          const cover = await extractFirstFrame(buffer);
          if (cover) {
            const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
            coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
            console.log(chalk.green(`✓ 封面上传成功: ${chalk.underline(coverUrl)}`));
          }
        } catch (e) {
          console.warn(chalk.yellow(`⚠ 提取封面失败,跳过: ${(e as Error)?.message || e}`));
        }
      }

      const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl, fps ?? undefined);
      console.log(chalk.green.bold('✓ 视频作品保存成功'));
      return { type: "video", ...saved };
    } else {
      throw new Error('无法判定作品类型(未命中详情或图文接口)');
    }
  } finally {
    console.log(chalk.gray('🧹 清理资源...'));
    try {
      await context.close();
    } finally {
      // Disconnect the database even if closing the context throws.
      await prisma.$disconnect();
    }
    console.log(chalk.gray('✓ 资源清理完成'));
  }
}

/**
 * Returns the last non-empty path segment of a page URL, ignoring any query
 * string, hash fragment or trailing slash.
 *
 * @param pageUrl - URL to inspect.
 * @returns The last path segment, or `undefined` when the path has none.
 */
function extractAwemeIdFromUrl(pageUrl: string): string | undefined {
  let pathname: string;
  try {
    pathname = new URL(pageUrl).pathname;
  } catch {
    // Not an absolute URL: strip the query string / hash by hand.
    pathname = pageUrl.split(/[?#]/, 1)[0] ?? '';
  }
  return pathname.split('/').filter(Boolean).at(-1);
}