export const runtime = 'nodejs'
// src/scrapeDouyin.ts
import { BrowserContext, Page, type Response } from 'playwright';
import { chromium } from 'playwright-extra';
import { prisma } from '@/lib/prisma';
import { uploadFile, generateUniqueFileName } from '@/lib/minio';
import { createCamelCompatibleProxy } from '@/app/api/fetcher/utils';
import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBinary, collectResponsesWithinTime } from '@/app/api/fetcher/network';
import { pickBestPlayAddr, extractFirstFrame } from '@/app/api/fetcher/media';
import { handleImagePost } from '@/app/api/fetcher/uploader';
import { saveToDB, saveImagePostToDB } from '@/app/api/fetcher/persist';
import chalk from 'chalk';
import { acquireIsolatedContext, releaseIsolatedContext } from '@/app/api/fetcher/browser';

// URL fragments used to recognise the Douyin web API responses this module cares about.
const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';
const COMMENT_PATH = '/aweme/v1/web/comment/list/';
const POST_PATH = '/aweme/v1/web/aweme/post/';

/**
 * Scrolls the comment list of `page` with the mouse wheel while collecting
 * every successful comment-list response issued by that page.
 *
 * The response collector is started first so that nothing is missed, then the
 * comment list is brought into view and the wheel is scrolled every 500 ms
 * until roughly 500 ms before `durationMs` elapses.
 *
 * @param context Browser context handed to the response collector.
 * @param page Page whose comment list is scrolled; responses from other pages are ignored.
 * @param durationMs Collection window in milliseconds.
 * @returns Whatever the collector gathered during the window.
 * @throws If the comment list does not become visible within 5 seconds
 *         (the error comes from `page.waitForSelector`).
 */
async function scrollAndCollectComments(
  context: BrowserContext,
  page: Page,
  durationMs: number = 10_000
): Promise<Response[]> {
  console.log(chalk.blue(`📜 开始滚动页面收集评论(持续 ${durationMs / 1000} 秒)...`));

  // Start collecting comment responses before any scrolling happens.
  const commentResponsesPromise = collectResponsesWithinTime(
    context,
    (r: Response) =>
      r.url().includes(COMMENT_PATH) &&
      r.status() === 200 &&
      r.request().frame()?.page() === page,
    durationMs
  );

  const startTime = Date.now();
  const scrollInterval = 500;
  const scrollAmount = 1500;
  let scrollCount = 0;
  const selector = "div[data-e2e='comment-list']";

  // 1) Wait for the comment list to be present and visible.
  await page.waitForSelector(selector, { state: 'visible', timeout: 5000 });

  // 2) Make sure it is scrolled into the viewport.
  const list = page.locator(selector);
  await list.scrollIntoViewIfNeeded();

  // 3) Hover over the list so the wheel events land on it. Deliberately
  //    fire-and-forget: a failed hover must not abort the collection.
  void list.hover({ timeout: 5000 }).catch(() => { });

  // Stop 500 ms early to leave a buffer before the collector finishes.
  while (Date.now() - startTime < durationMs - 500) {
    try {
      void list.hover({ timeout: 2000 }).catch(() => { });

      // Scroll a large distance each time using Playwright's mouse wheel.
      await page.mouse.wheel(0, scrollAmount);
      scrollCount++;
      console.log(chalk.gray(` ↓ 第 ${scrollCount} 次滚动`));

      // Give the page some time to load more comments.
      await page.waitForTimeout(scrollInterval);
    } catch (e) {
      console.warn(chalk.yellow(` ⚠ 滚动时出现警告: ${(e as Error)?.message}`));
    }
  }

  // Wait for the collector to finish its window.
  const commentResponses = await commentResponsesPromise;
  console.log(chalk.green(`✓ 评论收集完成,共收集到 ${commentResponses.length} 个评论响应`));

  return commentResponses;
}

/**
 * Reads the post detail (and comments, when present) from the data captured
 * in `window.__pace_captured__` by the init script installed in `scrapeDouyin`.
 *
 * It picks the first captured entry whose second element contains
 * `"awemeId":`, strips everything before the first `{`, removes `"]\n"`
 * sequences and parses the remainder as JSON.
 *
 * @param context Unused; kept so the signature stays compatible with callers.
 * @param page Page to evaluate in.
 * @returns The post detail and comments, both wrapped with `createCamelCompatibleProxy`
 *          (`comments` is `null` when the payload has no `comment` field).
 * @throws Error when no detail could be found or parsed.
 */
async function readPostMem(context: BrowserContext, page: Page) {
  const md = await page.evaluate(() => {
    // @ts-ignore
    const data = window.__pace_captured__.find(i => i[1] && i[1].includes(`"awemeId":`))[1]
    return JSON.parse(data.slice(data.indexOf("{")).replaceAll("]\n", ''))
  }).catch(() => null); // any failure inside the page (no match, bad JSON) becomes "no data"

  const aweme_mem = md?.aweme?.detail as DouyinImageAweme;
  if (!aweme_mem) throw new Error('页面内存数据中未找到作品详情');

  // The in-memory payload exposes the author as `authorInfo`; mirror it to `author`.
  // @ts-ignore
  aweme_mem.author = aweme_mem.authorInfo
  const comments = md.comment ? createCamelCompatibleProxy(md.comment) : null;
  const aweme = createCamelCompatibleProxy(aweme_mem);
  return { aweme, comments }
}

/**
 * Error thrown by `scrapeDouyin`, carrying an HTTP-like status code and an
 * optional machine-readable error code.
 */
export class ScrapeError extends Error {
  constructor(
    message: string,
    public statusCode: number = 500,
    public code?: string
  ) {
    super(message);
    this.name = 'ScrapeError';
  }
}

/**
 * Installs an init script that replaces `__pace_f` (on both `self` and
 * `window`) with a Proxy-wrapped array whose `push` also copies every pushed
 * item into `window.__pace_captured__`, where `readPostMem` later reads it.
 */
async function installPaceCapture(page: Page): Promise<void> {
  await page.addInitScript(() => {
    // Global container holding everything that was pushed.
    (window as any).__pace_captured__ = [];
    const captured = (window as any).__pace_captured__;

    // Wrap an array in a Proxy and intercept `push`.
    const proxyArr = new Proxy([] as any[], {
      get(target, prop, receiver) {
        if (prop === 'push') {
          return (...items: any[]) => {
            try { captured.push(...items); } catch { }
            return Array.prototype.push.apply(target, items);
          };
        }
        return Reflect.get(target, prop, receiver);
      },
      set(target, prop, value, receiver) {
        // Writes (including `length`) are forwarded to the backing array unchanged.
        return Reflect.set(target, prop, value, receiver);
      }
    });

    (self as any).__pace_f = proxyArr;
    (window as any).__pace_f = proxyArr;
  });
}

/**
 * Returns the last non-empty path segment of `rawUrl`, ignoring any query
 * string, hash or trailing slash. Returns `undefined` when there is none.
 */
function extractAwemeIdFromUrl(rawUrl: string): string | undefined {
  let pathname: string;
  try {
    pathname = new URL(rawUrl).pathname;
  } catch {
    // Not an absolute URL: strip query/hash by hand.
    pathname = rawUrl.split(/[?#]/)[0] ?? '';
  }
  return pathname.split('/').filter(Boolean).at(-1);
}

/**
 * Converts an arbitrary failure into a `ScrapeError`. Existing `ScrapeError`s
 * are returned untouched; everything else is logged and mapped to a status
 * code based on its name/message.
 */
function toScrapeError(error: unknown): ScrapeError {
  if (error instanceof ScrapeError) {
    return error;
  }

  const errMsg = (error as Error)?.message || String(error);
  console.error(chalk.red(`✗ 爬取失败: ${errMsg}`));

  // Playwright timeouts are named `TimeoutError` and their message starts with
  // a capital "Timeout ...", so the match must not be case-sensitive.
  const isTimeout =
    (error instanceof Error && error.name === 'TimeoutError') ||
    /timeout/i.test(errMsg) ||
    errMsg.includes('超时');
  if (isTimeout) {
    return new ScrapeError('请求超时,请稍后重试', 408, 'TIMEOUT');
  }
  if (errMsg.includes('页面内存数据中未找到作品详情')) {
    return new ScrapeError('作品数据加载失败', 404, 'DATA_NOT_LOADED');
  }
  if (errMsg.includes('net::')) {
    return new ScrapeError('网络连接失败', 503, 'NETWORK_ERROR');
  }
  // Default: generic server error.
  return new ScrapeError(errMsg || '爬取过程中发生未知错误', 500, 'UNKNOWN_ERROR');
}

/**
 * Scrapes a single Douyin post (video or image post) and persists it.
 *
 * Flow: acquire an isolated browser context, install the in-page capture
 * script, navigate to `url`, determine the post type from the first matching
 * API response and/or the in-page data, collect comments by scrolling, upload
 * the media and save everything through the persistence helpers.
 *
 * @param url Address of the post page to open.
 * @returns `{ type: "image", ...saved }` or `{ type: "video", ...saved }`, where
 *          `saved` is whatever the persistence helper returned.
 * @throws ScrapeError for every failure after the context was acquired
 *         (`statusCode`/`code` describe the cause).
 */
export async function scrapeDouyin(url: string) {
  console.log(chalk.blue('🚀 启动共享 Chromium 浏览器...'));
  const context: BrowserContext = await acquireIsolatedContext();
  let openedPage: Page | null = null;

  try {
    // Page creation happens inside the try so the context is released even if it fails.
    const page = await context.newPage();
    openedPage = page;

    console.log(chalk.cyan(`📄 正在访问: ${chalk.underline(url)}`));
    await installPaceCapture(page);

    // Register the "first one wins" listener before navigating so no response is missed.
    const firstTypePromise = waitForFirstResponse(context, [
      { key: 'detail', test: (r: Response) => r.url().includes(DETAIL_PATH) && r.status() === 200 && r.request().frame()?.page() === page },
      { key: 'post', test: (r: Response) => r.url().includes(POST_PATH) && r.status() === 200 && r.request().frame()?.page() === page },
    ], 10_000);

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 20_000 });

    // Look for the "video does not exist" notice on the page.
    let isNotFound = false;
    try {
      isNotFound = (await page.locator('text=视频不存在').count()) > 0;
    } catch {
      isNotFound = false;
    }
    if (isNotFound) {
      console.error(chalk.red('✗ 视频不存在或已被删除'));
      throw new ScrapeError('视频不存在或已被删除', 404, 'VIDEO_NOT_FOUND');
    }

    // Wait for the post type to be determined from the network.
    const firstType = await firstTypePromise;

    // Try to read the post data captured in the page (used for image posts).
    let memoryData: { aweme: any; comments: DouyinCommentResponse | null } | null = null;
    try {
      memoryData = await readPostMem(context, page);
      console.log(chalk.green('✓ 从内存读取图文数据成功'));
    } catch {
      // In-page data unavailable; fall back to the network response below.
    }

    if (!firstType && !memoryData) {
      console.error(chalk.red('✗ 既无法从内存读取数据,也无法从网络获得数据'));
      throw new ScrapeError('无法获取作品数据,可能是网络问题或作品已下架', 404, 'NO_DATA');
    }

    console.log(chalk.cyan(`📡 检测到作品类型: ${chalk.bold(firstType?.key === 'post' || memoryData ? '图文' : '视频')}`));

    // Scroll and collect comments; any failure here is non-fatal.
    const allComments: DouyinComment[] = [];
    try {
      const commentResponses = await scrollAndCollectComments(context, page);

      for (const commentRes of commentResponses) {
        try {
          const commentData = await safeJson(commentRes);
          if (commentData?.comments?.length) {
            allComments.push(...commentData.comments);
          }
        } catch (e) {
          console.warn(chalk.yellow(`⚠ 解析评论响应失败: ${(e as Error)?.message}`));
        }
      }
    } catch (error) {
      console.warn(chalk.yellow(`⚠ 评论收集失败: ${(error as Error)?.message}`));
    }

    // De-duplicate comments by `cid`.
    const uniqueComments = Array.from(
      new Map(allComments.map(c => [c.cid, c])).values()
    );
    console.log(chalk.green(`✓ 共收集到 ${uniqueComments.length} 条独立评论(去重前: ${allComments.length})`));

    // Merge in comments found in the in-page data as a fallback; network comments win on conflict.
    let comments: DouyinCommentResponse;
    if (memoryData?.comments?.comments?.length) {
      console.log(chalk.blue(`📝 合并内存中的 ${memoryData.comments.comments.length} 条评论`));
      const memComments = memoryData.comments.comments;
      const mergedMap = new Map(uniqueComments.map(c => [c.cid, c]));
      for (const c of memComments) {
        if (!mergedMap.has(c.cid)) {
          mergedMap.set(c.cid, c);
        }
      }
      comments = {
        comments: Array.from(mergedMap.values()),
        total: mergedMap.size,
        status_code: 0
      };
      console.log(chalk.green(`✓ 合并后共 ${comments.comments.length} 条评论`));
    } else {
      comments = {
        comments: uniqueComments,
        total: uniqueComments.length,
        status_code: 0
      };
    }

    // Branch on post type. In-page data takes priority and is handled as an image post.
    if (memoryData) {
      const aweme = memoryData.aweme;
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } else if (firstType?.key === 'post') {
      // Image post obtained from the network.
      const postJson = await safeJson(firstType.response);
      if (!postJson?.aweme_list?.length) throw new ScrapeError('图文作品响应为空', 404, 'EMPTY_POST_RESPONSE');

      // The target id is the last path segment of the current URL; the query
      // string, hash and trailing slash must not be part of it.
      const target_aweme_id = extractAwemeIdFromUrl(page.url());
      const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
      const aweme = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === target_aweme_id);
      if (!aweme) {
        throw new ScrapeError('无法找到目标作品,可能已被删除', 404, 'POST_NOT_FOUND');
      }

      const uploads = await handleImagePost(context, aweme);
      // Pass the full JSON along as well.
      const saved = await saveImagePostToDB(context, aweme, comments, uploads, postJson);
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } else if (firstType?.key === 'detail') {
      // Video post.
      const detail = await safeJson(firstType.response);
      if (!detail) throw new ScrapeError('视频作品响应为空', 404, 'EMPTY_DETAIL_RESPONSE');

      // Pick the best play address from the available bit rates.
      const bestPlayAddr = pickBestPlayAddr(
        detail?.aweme_detail?.video?.bit_rate
      );
      const bestVUrl = bestPlayAddr?.url_list?.[0];
      const fps = bestPlayAddr?.FPS ?? null;

      console.log(chalk.cyan(`📹 最佳视频 URL: ${chalk.dim(bestVUrl)}`));
      console.log(chalk.cyan(`🎞️ 视频帧率: ${chalk.bold(fps || 'N/A')} FPS`));
      if (bestPlayAddr?.width && bestPlayAddr?.height) {
        console.log(chalk.cyan(`📐 视频分辨率: ${chalk.bold(`${bestPlayAddr.width}x${bestPlayAddr.height}`)}`));
      }

      // Download the video, upload it to MinIO and keep the resulting link.
      let uploadedUrl: string | undefined;
      let coverUrl: string | undefined;
      if (bestVUrl && detail?.aweme_detail) {
        console.log(chalk.blue('⬇️ 正在下载视频...'));
        const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
        const awemeId = detail.aweme_detail.aweme_id;
        const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');
        console.log(chalk.blue('⬆️ 正在上传视频到 MinIO...'));
        uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
        console.log(chalk.green(`✓ 视频上传成功: ${chalk.underline(uploadedUrl)}`));

        // Extract the first frame as cover and upload it; failures are skipped.
        try {
          console.log(chalk.blue('🖼️ 正在提取视频封面...'));
          const cover = await extractFirstFrame(buffer);
          if (cover) {
            const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
            coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
            console.log(chalk.green(`✓ 封面上传成功: ${chalk.underline(coverUrl)}`));
          }
        } catch (e) {
          console.warn(chalk.yellow(`⚠ 提取封面失败,跳过: ${(e as Error)?.message || e}`));
        }
      }

      const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl, fps ?? undefined);
      console.log(chalk.green.bold('✓ 视频作品保存成功'));
      return { type: "video", ...saved };
    } else {
      throw new ScrapeError('无法判定作品类型,接口响应异常', 500, 'UNKNOWN_TYPE');
    }
  } catch (error) {
    // ScrapeErrors pass through unchanged; everything else is logged and mapped.
    throw toScrapeError(error);
  } finally {
    console.log(chalk.gray('🧹 清理资源...'));
    // Every cleanup step is guarded on its own so that one failure neither
    // skips the remaining steps nor replaces the error being propagated.
    try {
      await openedPage?.close({ runBeforeUnload: true });
    } catch { }
    try {
      // Release the isolated context acquired for this task.
      await releaseIsolatedContext(context);
    } catch (e) {
      console.warn(chalk.yellow(`⚠ 释放浏览器上下文失败: ${(e as Error)?.message || e}`));
    }
    try {
      await prisma.$disconnect();
    } catch (e) {
      console.warn(chalk.yellow(`⚠ 断开数据库连接失败: ${(e as Error)?.message || e}`));
    }
    console.log(chalk.gray('✓ 资源清理完成'));
  }
}