// 347 lines · 15 KiB · TypeScript
// Exported runtime selector for this module.
// NOTE(review): presumably read by Next.js as a route segment config so the
// module runs on the Node.js runtime (Playwright needs Node APIs) — confirm
// against the framework version in use.
export const runtime = 'nodejs'
|
||
// src/scrapeDouyin.ts
|
||
import { BrowserContext, Page, type Response } from 'playwright';
|
||
import { chromium } from 'playwright-extra';
|
||
import { prisma } from '@/lib/prisma';
|
||
import { uploadFile, generateUniqueFileName } from '@/lib/minio';
|
||
import { createCamelCompatibleProxy } from '@/app/api/fetcher/utils';
|
||
import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBinary, collectResponsesWithinTime } from '@/app/api/fetcher/network';
|
||
import { pickBestPlayAddr, extractFirstFrame } from '@/app/api/fetcher/media';
|
||
import { handleImagePost } from '@/app/api/fetcher/uploader';
|
||
import { saveToDB, saveImagePostToDB } from '@/app/api/fetcher/persist';
|
||
import chalk from 'chalk';
|
||
import { acquireIsolatedContext, releaseIsolatedContext } from '@/app/api/fetcher/browser';
|
||
|
||
/** URL path fragment matched against responses to detect the single-work detail API call. */
const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';

/** URL path fragment matched against responses to detect comment-list API calls. */
const COMMENT_PATH = '/aweme/v1/web/comment/list/';

/** URL path fragment matched against responses to detect the work-list ("post") API call. */
const POST_PATH = '/aweme/v1/web/aweme/post/';
|
||
|
||
/**
 * Scrolls the comment list for a fixed time window while gathering every
 * comment-list API response issued by the given page.
 *
 * The response collector is started first so nothing is missed, then the
 * comment list is located and the mouse wheel is driven in a loop until
 * 500 ms before the window closes.
 *
 * @param context    Browser context handed to the response collector.
 * @param page       Page to scroll; only responses whose request frame belongs
 *                   to this page are kept.
 * @param durationMs Length of the collection window in milliseconds.
 * @returns The comment responses yielded by the collector.
 * @throws Whatever `waitForSelector` / `scrollIntoViewIfNeeded` throw when the
 *         comment list does not become visible within 5 s.
 */
async function scrollAndCollectComments(
  context: BrowserContext,
  page: Page,
  durationMs: number = 10_000
): Promise<Response[]> {
  console.log(chalk.blue(`📜 开始滚动页面收集评论(持续 ${durationMs / 1000} 秒)...`));

  // Keep only successful comment-list responses that originate from *this* page.
  const isCommentResponseOfPage = (r: Response) =>
    r.url().includes(COMMENT_PATH) && r.status() === 200 && r.request().frame()?.page() === page;

  // Start listening before any scrolling happens.
  const pendingResponses = collectResponsesWithinTime(context, isCommentResponseOfPage, durationMs);

  const startedAt = Date.now();
  const pauseBetweenScrollsMs = 500;
  const wheelDeltaY = 1500;
  const commentListSelector = "div[data-e2e='comment-list']";
  let scrollsDone = 0;

  // 1) Wait for the comment list to be present and visible.
  await page.waitForSelector(commentListSelector, { state: 'visible', timeout: 5000 });

  // 2) Bring it into the viewport.
  const commentList = page.locator(commentListSelector);
  await commentList.scrollIntoViewIfNeeded();

  // 3) Best-effort hover: intentionally not awaited, failures are ignored.
  void commentList.hover({ timeout: 5000 }).catch(() => { });

  // Stop 500 ms early so the last batch of responses can still arrive.
  while (Date.now() - startedAt < durationMs - 500) {
    try {
      // Re-issue the best-effort hover (fire-and-forget) before each wheel event.
      void commentList.hover({ timeout: 2000 }).catch(() => { });

      // Scroll a large distance per step via the mouse wheel.
      await page.mouse.wheel(0, wheelDeltaY);

      scrollsDone += 1;
      console.log(chalk.gray(` ↓ 第 ${scrollsDone} 次滚动`));

      // Give the page time to load the next batch of comments.
      await page.waitForTimeout(pauseBetweenScrollsMs);
    } catch (e) {
      console.warn(chalk.yellow(` ⚠ 滚动时出现警告: ${(e as Error)?.message}`));
    }
  }

  // Wait for the collector to finish its window.
  const commentResponses = await pendingResponses;
  console.log(chalk.green(`✓ 评论收集完成,共收集到 ${commentResponses.length} 个评论响应`));

  return commentResponses;
}
|
||
|
||
/**
 * Reads the work payload that the page pushed into `window.__pace_captured__`
 * and returns it wrapped in camel-case-compatible proxies.
 *
 * @param context Browser context (not used by this function).
 * @param page    Page whose in-memory captured data is inspected.
 * @returns `aweme` (the work detail, with `author` / `statistics` aliased from
 *          `authorInfo` / `stats`) and `comments` (proxy of the embedded
 *          comment payload, or `null` when the payload has none).
 * @throws Error when no work detail is found in the captured data.
 */
async function readPostMem(context: BrowserContext, page: Page) {
  // Runs inside the browser. Any failure (no matching entry, malformed JSON, …)
  // rejects the evaluate call and is collapsed to `null` below.
  const snapshot = await page
    .evaluate(() => {
      // @ts-ignore
      const entry = window.__pace_captured__.find(i => i[1] && i[1].includes(`"awemeId":`));
      const raw = entry[1];
      // Drop everything before the first "{" and strip the "]\n" sequences before parsing.
      const jsonText = raw.slice(raw.indexOf("{")).replaceAll("]\n", '');
      return JSON.parse(jsonText);
    })
    .catch(() => null);

  const awemeFromMemory = snapshot?.aweme?.detail as DouyinImageAweme;
  if (!awemeFromMemory) {
    throw new Error('页面内存数据中未找到作品详情');
  }

  // Alias the in-memory field names to the names the rest of the pipeline reads.
  // @ts-ignore
  awemeFromMemory.author = awemeFromMemory.authorInfo;
  // @ts-ignore
  awemeFromMemory.statistics = awemeFromMemory.stats;

  const comments = snapshot.comment
    ? createCamelCompatibleProxy<DouyinCommentResponse>(snapshot.comment)
    : null;
  const aweme = createCamelCompatibleProxy(awemeFromMemory);

  return { aweme, comments };
}
|
||
|
||
|
||
/**
 * Error type thrown by the scraper, carrying an HTTP-style status code and an
 * optional machine-readable error code alongside the message.
 */
export class ScrapeError extends Error {
  /**
   * @param message    Human-readable description of the failure.
   * @param statusCode HTTP-style status to report (defaults to 500).
   * @param code       Optional machine-readable identifier, e.g. `'TIMEOUT'`.
   */
  constructor(
    message: string,
    public statusCode: number = 500,
    public code?: string
  ) {
    super(message);
    // When compiled to ES5, `super()` returns a plain Error and the subclass
    // prototype is lost, which would break `instanceof ScrapeError` checks.
    // Restoring it explicitly is a no-op on modern targets.
    Object.setPrototypeOf(this, new.target.prototype);
    this.name = 'ScrapeError';
  }
}
|
||
|
||
/**
 * Returns the last non-empty path segment of a URL, ignoring the query string,
 * the hash fragment and any trailing slash.
 */
function extractTrailingPathSegment(rawUrl: string): string | undefined {
  let pathOnly: string;
  try {
    pathOnly = new URL(rawUrl).pathname;
  } catch {
    // Not an absolute URL: strip query/hash by hand.
    pathOnly = rawUrl.replace(/[?#][\s\S]*$/, '');
  }
  return pathOnly.split('/').filter(Boolean).at(-1);
}

/**
 * Scrapes one Douyin work (video or image post), uploads its media and
 * persists the result together with the collected comments.
 *
 * Flow: acquire an isolated browser context, install an init script that
 * mirrors everything pushed to `__pace_f` into `__pace_captured__`, navigate,
 * decide the work type from the first matching API response and/or the
 * captured in-memory data, scroll to collect comments, then branch into the
 * image-post or video pipeline.
 *
 * @param url Page URL of the work to scrape.
 * @returns The object returned by the persistence helper, extended with
 *          `type: "image"` or `type: "video"`.
 * @throws ScrapeError Always a `ScrapeError` for failures after the context was
 *         acquired: `VIDEO_NOT_FOUND`, `NO_DATA`, `EMPTY_POST_RESPONSE`,
 *         `POST_NOT_FOUND`, `UNKNOWN_TYPE`, or a mapped `TIMEOUT` (408),
 *         `DATA_NOT_LOADED` (404), `NETWORK_ERROR` (503), `UNKNOWN_ERROR` (500).
 */
export async function scrapeDouyin(url: string) {
  console.log(chalk.blue('🚀 启动共享 Chromium 浏览器...'));
  const context = await acquireIsolatedContext();
  // Tracked separately so `finally` can close the page even if setup fails midway.
  let openedPage: Page | null = null;

  try {
    // Page creation lives inside the try so a failure here still releases the context.
    const page = await context.newPage();
    openedPage = page;
    console.log(chalk.cyan(`📄 正在访问: ${chalk.underline(url)}`));

    await page.addInitScript(() => {
      // Global container that receives a copy of everything pushed to `__pace_f`.
      (window as any).__pace_captured__ = [];
      const captured = (window as any).__pace_captured__;

      // Array proxy whose `push` mirrors the pushed items into `captured`.
      // NOTE(review): if the site assigns a brand-new array to `__pace_f`
      // the proxy itself is replaced and nothing further is captured.
      const proxyArr = new Proxy([] as any[], {
        get(target, prop, receiver) {
          if (prop === 'push') {
            return (...items: any[]) => {
              try { captured.push(...items); } catch { }
              return Array.prototype.push.apply(target, items);
            };
          }
          return Reflect.get(target, prop, receiver);
        },
        set(target, prop, value, receiver) {
          return Reflect.set(target, prop, value, receiver);
        }
      });

      (self as any).__pace_f = proxyArr;
      (window as any).__pace_f = proxyArr;
    });

    // Register the "first response wins" listener before navigating so no response is missed.
    const firstTypePromise = waitForFirstResponse(context, [
      { key: 'detail', test: (r: Response) => r.url().includes(DETAIL_PATH) && r.status() === 200 && r.request().frame()?.page() === page },
      { key: 'post', test: (r: Response) => r.url().includes(POST_PATH) && r.status() === 200 && r.request().frame()?.page() === page },
    ], 10_000);

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 20_000 });

    // Look for the "video does not exist" notice on the page.
    const isNotFound = await page.locator('text=视频不存在').count().then(count => count > 0).catch(() => false);
    if (isNotFound) {
      console.error(chalk.red('✗ 视频不存在或已被删除'));
      throw new ScrapeError('视频不存在或已被删除', 404, 'VIDEO_NOT_FOUND');
    }

    // Wait until the work type has been decided by the network listener.
    const firstType = await firstTypePromise;

    // Try to read the work from the captured in-memory data (image posts).
    let memoryData: { aweme: any; comments: DouyinCommentResponse | null } | null = null;
    try {
      memoryData = await readPostMem(context, page);
      console.log(chalk.green('✓ 从内存读取图文数据成功'));
    } catch {
      // In-memory read failed; the network response is used instead.
    }

    if (!firstType && !memoryData) {
      console.error(chalk.red('✗ 既无法从内存读取数据,也无法从网络获得数据'));
      throw new ScrapeError('无法获取作品数据,可能是网络问题或作品已下架', 404, 'NO_DATA');
    }

    console.log(chalk.cyan(`📡 检测到作品类型: ${chalk.bold(firstType?.key === 'post' || memoryData ? '图文' : '视频')}`));

    const allComments: DouyinComment[] = [];
    try {
      // Scroll the page and collect comment responses.
      const commentResponses = await scrollAndCollectComments(context, page);

      // Parse every collected response; one bad response must not discard the rest.
      for (const commentRes of commentResponses) {
        try {
          const commentData = await safeJson<DouyinCommentResponse>(commentRes);
          if (commentData?.comments?.length) {
            allComments.push(...commentData.comments);
          }
        } catch (e) {
          console.warn(chalk.yellow(`⚠ 解析评论响应失败: ${(e as Error)?.message}`));
        }
      }
    } catch (error) {
      // Comment collection is best-effort: the work itself is still saved.
      console.warn(chalk.yellow(`⚠ 评论收集失败: ${(error as Error)?.message}`));
    }

    // De-duplicate comments by `cid`.
    const uniqueComments = Array.from(
      new Map(allComments.map(c => [c.cid, c])).values()
    );

    console.log(chalk.green(`✓ 共收集到 ${uniqueComments.length} 条独立评论(去重前: ${allComments.length})`));

    // Merge comments found in memory as a fallback; network comments win on conflict.
    let comments: DouyinCommentResponse;
    if (memoryData?.comments?.comments?.length) {
      console.log(chalk.blue(`📝 合并内存中的 ${memoryData.comments.comments.length} 条评论`));
      const memComments = memoryData.comments.comments;
      const mergedMap = new Map(uniqueComments.map(c => [c.cid, c]));
      for (const c of memComments) {
        if (!mergedMap.has(c.cid)) {
          mergedMap.set(c.cid, c);
        }
      }
      comments = {
        comments: Array.from(mergedMap.values()),
        total: mergedMap.size,
        status_code: 0
      };
      console.log(chalk.green(`✓ 合并后共 ${comments.comments.length} 条评论`));
    } else {
      comments = {
        comments: uniqueComments,
        total: uniqueComments.length,
        status_code: 0
      };
    }

    // Branch on work type. In-memory data takes priority and is treated as an image post.
    if (memoryData) {
      const aweme = memoryData.aweme;
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } else if (firstType?.key === 'post') {
      // Image post obtained from the network.
      const postJson = await safeJson<DouyinPostListResponse>(firstType.response);
      if (!postJson?.aweme_list?.length) throw new ScrapeError('图文作品响应为空', 404, 'EMPTY_POST_RESPONSE');

      // Use the URL *path* only: a query string, hash or trailing slash must not
      // end up in the id, otherwise the lookup below can never match.
      const targetAwemeId = extractTrailingPathSegment(page.url());
      const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
      const aweme = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === targetAwemeId);
      if (!aweme) {
        throw new ScrapeError('无法找到目标作品,可能已被删除', 404, 'POST_NOT_FOUND');
      }

      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads, postJson); // pass the full JSON along
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } else if (firstType?.key === 'detail') {
      // Video work.
      const detail = (await safeJson<DouyinVideoDetailResponse>(firstType.response))!;

      // Pick the play address chosen by `pickBestPlayAddr` from the bit-rate list.
      const bestPlayAddr = pickBestPlayAddr(
        detail?.aweme_detail?.video?.bit_rate
      );
      const bestVUrl = bestPlayAddr?.url_list?.[0];
      const fps = bestPlayAddr?.FPS ?? null;

      console.log(chalk.cyan(`📹 最佳视频 URL: ${chalk.dim(bestVUrl)}`));
      console.log(chalk.cyan(`🎞️ 视频帧率: ${chalk.bold(fps || 'N/A')} FPS`));
      if (bestPlayAddr?.width && bestPlayAddr?.height) {
        console.log(chalk.cyan(`📐 视频分辨率: ${chalk.bold(`${bestPlayAddr.width}x${bestPlayAddr.height}`)}`));
      }

      // Download the video, upload it to MinIO and keep the resulting URL.
      let uploadedUrl: string | undefined;
      let coverUrl: string | undefined;
      if (bestVUrl && detail?.aweme_detail) {
        console.log(chalk.blue('⬇️ 正在下载视频...'));
        const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
        const awemeId = detail.aweme_detail.aweme_id;
        const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');

        console.log(chalk.blue('⬆️ 正在上传视频到 MinIO...'));
        uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
        console.log(chalk.green(`✓ 视频上传成功: ${chalk.underline(uploadedUrl)}`));

        // Extract the first frame as cover; a failure here only skips the cover.
        try {
          console.log(chalk.blue('🖼️ 正在提取视频封面...'));
          const cover = await extractFirstFrame(buffer);
          if (cover) {
            const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
            coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
            console.log(chalk.green(`✓ 封面上传成功: ${chalk.underline(coverUrl)}`));
          }
        } catch (e) {
          console.warn(chalk.yellow(`⚠ 提取封面失败,跳过: ${(e as Error)?.message || e}`));
        }
      }

      const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl, fps ?? undefined);
      console.log(chalk.green.bold('✓ 视频作品保存成功'));
      return { type: "video", ...saved };
    } else {
      throw new ScrapeError('无法判定作品类型,接口响应异常', 500, 'UNKNOWN_TYPE');
    }
  } catch (error) {
    // Our own errors already carry the right status/code.
    if (error instanceof ScrapeError) {
      throw error;
    }

    const errMsg = (error as Error)?.message || String(error);
    console.error(chalk.red(`✗ 爬取失败: ${errMsg}`));

    // Playwright reports timeouts as `TimeoutError` with a capitalised
    // "Timeout …ms exceeded" message, so match by name and case-insensitively.
    const isTimeout =
      (error instanceof Error && error.name === 'TimeoutError') ||
      /timeout/i.test(errMsg) ||
      errMsg.includes('超时');
    if (isTimeout) {
      throw new ScrapeError('请求超时,请稍后重试', 408, 'TIMEOUT');
    }
    if (errMsg.includes('页面内存数据中未找到作品详情')) {
      throw new ScrapeError('作品数据加载失败', 404, 'DATA_NOT_LOADED');
    }
    if (errMsg.includes('net::')) {
      throw new ScrapeError('网络连接失败', 503, 'NETWORK_ERROR');
    }

    // Anything else is reported as a generic server error.
    throw new ScrapeError(errMsg || '爬取过程中发生未知错误', 500, 'UNKNOWN_ERROR');
  } finally {
    console.log(chalk.gray('🧹 清理资源...'));
    // Each cleanup step is isolated: a failure must neither skip the remaining
    // steps nor replace the result/error produced by the try block.
    if (openedPage) {
      try { await openedPage.close({ runBeforeUnload: true }); } catch { }
    }
    try {
      await releaseIsolatedContext(context);
    } catch (e) {
      console.warn(chalk.yellow(`⚠ 释放浏览器上下文失败: ${(e as Error)?.message || e}`));
    }
    try {
      await prisma.$disconnect();
    } catch (e) {
      console.warn(chalk.yellow(`⚠ 断开数据库连接失败: ${(e as Error)?.message || e}`));
    }
    console.log(chalk.gray('✓ 资源清理完成'));
  }
}
|
||
|