350 lines
15 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// NOTE(review): this looks like the Next.js route segment `runtime` option selecting the
// Node.js runtime — confirm this module is actually consumed as a route file.
export const runtime = 'nodejs'
// src/scrapeDouyin.ts
import { BrowserContext, Page, type Response } from 'playwright';
import { chromium } from 'playwright-extra';
import { prisma } from '@/lib/prisma';
import { uploadFile, generateUniqueFileName } from '@/lib/minio';
import { createCamelCompatibleProxy } from '@/app/api/fetcher/utils';
import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBinary, collectResponsesWithinTime } from '@/app/api/fetcher/network';
import { pickBestPlayAddr } from '@/app/api/fetcher/media';
import { handleImagePost } from '@/app/api/fetcher/uploader';
import { saveToDB, saveImagePostToDB } from '@/app/api/fetcher/persist';
import chalk from 'chalk';
import { acquireIsolatedContext, releaseIsolatedContext } from '@/app/api/fetcher/browser';
import { extractFirstFrame } from '@/app/api/media';
import { transcriptAweme } from '../stt';
// URL path fragments used to recognize Douyin web API responses; each is matched as a
// substring of the response URL.
// Aweme detail endpoint: when it is the first hit, the post is handled as a video.
const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';
// Comment list endpoint: its responses are collected while the page is being scrolled.
const COMMENT_PATH = '/aweme/v1/web/comment/list/';
// Post list endpoint: when it is the first hit, the post is handled as an image post.
const POST_PATH = '/aweme/v1/web/aweme/post/'
/**
 * Scrolls the page with the mouse wheel while gathering every comment-list response
 * that arrives during the collection window.
 *
 * @param context Browser context handed to the response collector
 * @param page Page being scrolled; only responses whose request frame belongs to it are kept
 * @param durationMs Length of the collection window in milliseconds
 * @returns All comment responses gathered by the collector
 */
async function scrollAndCollectComments(
  context: BrowserContext,
  page: Page,
  durationMs: number = 10_000
): Promise<Response[]> {
  const COMMENT_LIST_SELECTOR = "div[data-e2e='comment-list']";
  const WHEEL_DELTA_Y = 1500;
  const PAUSE_BETWEEN_SCROLLS_MS = 500;

  console.log(chalk.blue(`📜 开始滚动页面收集评论(持续 ${durationMs / 1000} 秒)...`));

  // Start the collector first so that responses triggered by the scrolling below are not missed.
  const isCommentResponseOfPage = (r: Response) =>
    r.url().includes(COMMENT_PATH) && r.status() === 200 && r.request().frame()?.page() === page;
  const pendingResponses = collectResponsesWithinTime(context, isCommentResponseOfPage, durationMs);

  const startedAt = Date.now();
  const elapsedMs = () => Date.now() - startedAt;
  // Stop scrolling 500ms before the window closes to leave a small buffer.
  const scrollBudgetMs = durationMs - 500;

  // Wait for the comment list to be visible, bring it into view, then park the mouse over it.
  await page.waitForSelector(COMMENT_LIST_SELECTOR, { state: 'visible', timeout: 5000 });
  const commentList = page.locator(COMMENT_LIST_SELECTOR);
  await commentList.scrollIntoViewIfNeeded();
  // Hover is best-effort and deliberately not awaited; failures are ignored.
  commentList.hover({ timeout: 5000 }).catch(() => { });

  let completedScrolls = 0;
  while (elapsedMs() < scrollBudgetMs) {
    try {
      // Re-hover (not awaited) before every wheel event.
      commentList.hover({ timeout: 2000 }).catch(() => { });
      await page.mouse.wheel(0, WHEEL_DELTA_Y);
      completedScrolls += 1;
      console.log(chalk.gray(` ↓ 第 ${completedScrolls} 次滚动`));
      // Pause between scrolls before the next wheel event.
      await page.waitForTimeout(PAUSE_BETWEEN_SCROLLS_MS);
    } catch (e) {
      console.warn(chalk.yellow(` ⚠ 滚动时出现警告: ${(e as Error)?.message}`));
    }
  }

  // Wait for the collector to finish.
  const collected = await pendingResponses;
  console.log(chalk.green(`✓ 评论收集完成,共收集到 ${collected.length} 个评论响应`));
  return collected;
}
/**
 * Reads the post detail (and comments, if present) from the data captured in
 * `window.__pace_captured__` inside the page.
 *
 * @param context Browser context (not used by this function)
 * @param page Page whose captured in-memory data is read
 * @returns The post wrapped by `createCamelCompatibleProxy`, plus the wrapped comments or `null`
 * @throws Error when no post detail can be found in the captured data
 */
async function readPostMem(context: BrowserContext, page: Page) {
  const snapshot = await page
    .evaluate(() => {
      // Pick the first captured entry whose second element contains an `"awemeId":` key.
      const capturedEntries = (window as any).__pace_captured__;
      const payload = capturedEntries.find((i: any) => i[1] && i[1].includes(`"awemeId":`))[1];
      // Parse from the first `{` onwards, after removing every "]\n" sequence.
      const jsonStart = payload.indexOf("{");
      return JSON.parse(payload.slice(jsonStart).replaceAll("]\n", ''));
    })
    // Any failure inside the page is treated as "no data".
    .catch(() => null);

  const awemeDetail = snapshot?.aweme?.detail as DouyinImageAweme;
  if (!awemeDetail) {
    throw new Error('页面内存数据中未找到作品详情');
  }

  // Alias the in-memory field names to the ones used by the rest of the pipeline.
  const mutableDetail = awemeDetail as any;
  mutableDetail.author = mutableDetail.authorInfo;
  mutableDetail.statistics = mutableDetail.stats;

  const comments = snapshot.comment
    ? createCamelCompatibleProxy<DouyinCommentResponse>(snapshot.comment)
    : null;
  const aweme = createCamelCompatibleProxy(awemeDetail);
  return { aweme, comments };
}
/**
 * Error raised by the scraper, carrying an HTTP-style status code and an optional
 * machine-readable error code alongside the message.
 */
export class ScrapeError extends Error {
  /** HTTP-style status code; 500 when not specified. */
  public statusCode: number;
  /** Optional machine-readable error code. */
  public code?: string;

  /**
   * @param message Human-readable description of the failure
   * @param statusCode HTTP-style status code (defaults to 500)
   * @param code Optional machine-readable error code
   */
  constructor(message: string, statusCode: number = 500, code?: string) {
    super(message);
    this.statusCode = statusCode;
    this.code = code;
    this.name = 'ScrapeError';
  }
}
/**
 * Installs an init script that records everything the page pushes onto `__pace_f`.
 *
 * The script replaces `self.__pace_f` / `window.__pace_f` with a Proxy-wrapped array whose
 * `push` also appends the pushed items to `window.__pace_captured__`.
 */
async function installPaceCaptureHook(page: Page): Promise<void> {
  await page.addInitScript(() => {
    // Global container holding the captured items.
    (window as any).__pace_captured__ = [];
    const captured = (window as any).__pace_captured__;
    const proxyArr = new Proxy([] as any[], {
      get(target, prop, receiver) {
        if (prop === 'push') {
          return (...items: any[]) => {
            // Capturing must never break the page's own push.
            try { captured.push(...items); } catch { }
            return Array.prototype.push.apply(target, items);
          };
        }
        return Reflect.get(target, prop, receiver);
      },
      set(target, prop, value, receiver) {
        return Reflect.set(target, prop, value, receiver);
      }
    });
    (self as any).__pace_f = proxyArr;
    (window as any).__pace_f = proxyArr;
  });
}

/**
 * Returns the last non-empty path segment of a URL, ignoring query string, hash and
 * trailing slashes, or `undefined` when the path has no segments.
 */
function extractAwemeIdFromUrl(rawUrl: string): string | undefined {
  let pathname: string;
  try {
    pathname = new URL(rawUrl).pathname;
  } catch {
    // Not an absolute URL: strip query string / hash by hand.
    pathname = rawUrl.split(/[?#]/)[0] ?? '';
  }
  return pathname.split('/').filter(Boolean).at(-1);
}

/**
 * Maps any thrown value to a `ScrapeError`; an existing `ScrapeError` is returned untouched,
 * everything else is logged and classified by its name/message.
 */
function toScrapeError(error: unknown): ScrapeError {
  if (error instanceof ScrapeError) {
    return error;
  }
  const errMsg = (error as Error)?.message || String(error);
  console.error(chalk.red(`✗ 爬取失败: ${errMsg}`));

  // Playwright timeout messages start with a capital "Timeout", so match case-insensitively
  // and also accept the TimeoutError class name.
  const isTimeout =
    (error as Error)?.name === 'TimeoutError' ||
    errMsg.toLowerCase().includes('timeout') ||
    errMsg.includes('超时');
  if (isTimeout) {
    return new ScrapeError('请求超时,请稍后重试', 408, 'TIMEOUT');
  }
  if (errMsg.includes('页面内存数据中未找到作品详情')) {
    return new ScrapeError('作品数据加载失败', 404, 'DATA_NOT_LOADED');
  }
  if (errMsg.includes('net::')) {
    return new ScrapeError('网络连接失败', 503, 'NETWORK_ERROR');
  }
  return new ScrapeError(errMsg || '爬取过程中发生未知错误', 500, 'UNKNOWN_ERROR');
}

/**
 * Scrapes a single Douyin post (video or image post) together with its comments,
 * uploads the media and persists the result.
 *
 * @param url Address of the Douyin post page to open
 * @returns The saved record spread into an object tagged with `type: "image"` or `type: "video"`
 * @throws ScrapeError for every failure after the browser context has been acquired
 */
export async function scrapeDouyin(url: string) {
  console.log(chalk.blue('🚀 启动共享 Chromium 浏览器...'));
  const context = await acquireIsolatedContext();
  let page: Page | null = null;
  try {
    // Page creation happens inside the try so the context is released even if it fails.
    const activePage = await context.newPage();
    page = activePage;
    console.log(chalk.cyan(`📄 正在访问: ${chalk.underline(url)}`));
    await installPaceCaptureHook(activePage);

    // Register the "first response wins" listeners before navigating so nothing is missed.
    const isOwnOkResponse = (r: Response, path: string) =>
      r.url().includes(path) && r.status() === 200 && r.request().frame()?.page() === activePage;
    const firstTypePromise = waitForFirstResponse(context, [
      { key: 'detail', test: (r: Response) => isOwnOkResponse(r, DETAIL_PATH) },
      { key: 'post', test: (r: Response) => isOwnOkResponse(r, POST_PATH) },
    ], 10_000);
    await activePage.goto(url, { waitUntil: 'domcontentloaded', timeout: 20_000 });

    // Look for the "video does not exist" notice on the page.
    let isNotFound = false;
    try {
      isNotFound = (await activePage.locator('text=视频不存在').count()) > 0;
    } catch {
      isNotFound = false;
    }
    if (isNotFound) {
      console.error(chalk.red('✗ 视频不存在或已被删除'));
      throw new ScrapeError('视频不存在或已被删除', 404, 'VIDEO_NOT_FOUND');
    }

    // Wait until the post type has been determined from the network.
    const firstType = await firstTypePromise;

    // Try the in-page captured data; on failure fall back to the network responses.
    let memoryData: { aweme: any; comments: DouyinCommentResponse | null } | null = null;
    try {
      memoryData = await readPostMem(context, activePage);
      console.log(chalk.green('✓ 从内存读取图文数据成功'));
    } catch {
      // Memory read failed; the network branch below is used instead.
    }
    if (!firstType && !memoryData) {
      console.error(chalk.red('✗ 既无法从内存读取数据,也无法从网络获得数据'));
      throw new ScrapeError('无法获取作品数据,可能是网络问题或作品已下架', 404, 'NO_DATA');
    }
    console.log(chalk.cyan(`📡 检测到作品类型: ${chalk.bold(firstType?.key === 'post' || memoryData ? '图文' : '视频')}`));

    // Comment collection is best-effort: any failure only produces a warning.
    const allComments: DouyinComment[] = [];
    try {
      const commentResponses = await scrollAndCollectComments(context, activePage);
      for (const commentRes of commentResponses) {
        try {
          const commentData = await safeJson<DouyinCommentResponse>(commentRes);
          if (commentData?.comments?.length) {
            allComments.push(...commentData.comments);
          }
        } catch (e) {
          console.warn(chalk.yellow(`⚠ 解析评论响应失败: ${(e as Error)?.message}`));
        }
      }
    } catch (error) {
      console.warn(chalk.yellow(`⚠ 评论收集失败: ${(error as Error)?.message}`));
    }

    // De-duplicate comments by cid.
    const uniqueComments = Array.from(
      new Map(allComments.map(c => [c.cid, c])).values()
    );
    console.log(chalk.green(`✓ 共收集到 ${uniqueComments.length} 条独立评论(去重前: ${allComments.length})`));

    // Comments found in memory are merged in as a fallback; network comments win on conflict.
    let comments: DouyinCommentResponse;
    if (memoryData?.comments?.comments?.length) {
      console.log(chalk.blue(`📝 合并内存中的 ${memoryData.comments.comments.length} 条评论`));
      const memComments = memoryData.comments.comments;
      const mergedMap = new Map(uniqueComments.map(c => [c.cid, c]));
      for (const c of memComments) {
        if (!mergedMap.has(c.cid)) {
          mergedMap.set(c.cid, c);
        }
      }
      comments = {
        comments: Array.from(mergedMap.values()),
        total: mergedMap.size,
        status_code: 0
      };
      console.log(chalk.green(`✓ 合并后共 ${comments.comments.length} 条评论`));
    } else {
      comments = {
        comments: uniqueComments,
        total: uniqueComments.length,
        status_code: 0
      };
    }

    // Branching: in-memory data is preferred and handled as an image post.
    if (memoryData) {
      const aweme = memoryData.aweme;
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } else if (firstType?.key === 'post') {
      // Image post obtained from the network.
      const postJson = await safeJson<DouyinPostListResponse>(firstType.response);
      if (!postJson?.aweme_list?.length) throw new ScrapeError('图文作品响应为空', 404, 'EMPTY_POST_RESPONSE');
      // Take the id from the URL path only, so a query string or trailing slash cannot corrupt it.
      const targetAwemeId = extractAwemeIdFromUrl(activePage.url());
      const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
      const aweme = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === targetAwemeId);
      if (!aweme) {
        throw new ScrapeError('无法找到目标作品,可能已被删除', 404, 'POST_NOT_FOUND');
      }
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads, postJson); // pass the full JSON along
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } else if (firstType?.key === 'detail') {
      // Video post.
      const detail = await safeJson<DouyinVideoDetailResponse>(firstType.response);
      const awemeDetail = detail?.aweme_detail;
      // Fail early with a clear error instead of saving an empty record and crashing afterwards.
      if (!detail || !awemeDetail) {
        throw new ScrapeError('视频作品响应为空', 404, 'EMPTY_DETAIL_RESPONSE');
      }
      // Let pickBestPlayAddr choose among the available bit rates.
      const bestPlayAddr = pickBestPlayAddr(awemeDetail.video?.bit_rate);
      const bestVUrl = bestPlayAddr?.url_list?.[0];
      const fps = bestPlayAddr?.FPS ?? null;
      console.log(chalk.cyan(`📹 最佳视频 URL: ${chalk.dim(bestVUrl)}`));
      console.log(chalk.cyan(`🎞️ 视频帧率: ${chalk.bold(fps || 'N/A')} FPS`));
      if (bestPlayAddr?.width && bestPlayAddr?.height) {
        console.log(chalk.cyan(`📐 视频分辨率: ${chalk.bold(`${bestPlayAddr.width}x${bestPlayAddr.height}`)}`));
      }

      // Download the video and upload it to MinIO to obtain an external link.
      let uploadedUrl: string | undefined;
      let coverUrl: string | undefined;
      if (bestVUrl) {
        console.log(chalk.blue('⬇️ 正在下载视频...'));
        const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
        const awemeId = awemeDetail.aweme_id;
        const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');
        console.log(chalk.blue('⬆️ 正在上传视频到 MinIO...'));
        uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
        console.log(chalk.green(`✓ 视频上传成功: ${chalk.underline(uploadedUrl)}`));
        // Cover extraction is best-effort: a failure is logged and skipped.
        try {
          console.log(chalk.blue('🖼️ 正在提取视频封面...'));
          const cover = await extractFirstFrame(buffer);
          if (cover) {
            const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
            coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
            console.log(chalk.green(`✓ 封面上传成功: ${chalk.underline(coverUrl)}`));
          }
        } catch (e) {
          console.warn(chalk.yellow(`⚠ 提取封面失败,跳过: ${(e as Error)?.message || e}`));
        }
      }
      const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl, fps ?? undefined);
      console.log(chalk.green.bold('✓ 视频作品保存成功'));
      // Transcription runs in the background and never blocks or fails the scrape.
      void transcriptAweme(awemeDetail.aweme_id).catch((e: unknown) => {
        console.warn(chalk.yellow(`⚠ 转写任务失败: ${(e as Error)?.message || e}`));
      });
      return { type: "video", ...saved };
    } else {
      throw new ScrapeError('无法判定作品类型,接口响应异常', 500, 'UNKNOWN_TYPE');
    }
  } catch (error) {
    throw toScrapeError(error);
  } finally {
    console.log(chalk.gray('🧹 清理资源...'));
    if (page) {
      try { await page.close({ runBeforeUnload: true }); } catch { }
    }
    // Cleanup failures are logged rather than thrown so they cannot mask the scrape outcome.
    try {
      await releaseIsolatedContext(context);
    } catch (e) {
      console.warn(chalk.yellow(`⚠ 释放浏览器上下文失败: ${(e as Error)?.message || e}`));
    }
    // NOTE(review): this disconnects the shared Prisma client after every scrape — confirm
    // that is intended if this runs inside a long-lived server process.
    try {
      await prisma.$disconnect();
    } catch (e) {
      console.warn(chalk.yellow(`⚠ 断开数据库连接失败: ${(e as Error)?.message || e}`));
    }
    console.log(chalk.gray('✓ 资源清理完成'));
  }
}