247 lines
12 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// src/scrapeDouyin.ts
import { BrowserContext, Page, chromium, type Response } from 'playwright';
import { prisma } from '@/lib/prisma';
import { uploadFile, generateUniqueFileName } from '@/lib/minio';
import { createCamelCompatibleProxy } from '@/app/api/fetcher/utils';
import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBinary } from '@/app/api/fetcher/network';
import { pickBestPlayAddr, extractFirstFrame } from '@/app/api/fetcher/media';
import { handleImagePost } from '@/app/api/fetcher/uploader';
import { saveToDB, saveImagePostToDB } from '@/app/api/fetcher/persist';
import chalk from 'chalk';
import { acquireBrowserContext, releaseBrowserContext } from '@/app/api/fetcher/browser';
/** URL fragment matched against responses to recognise the single-work detail endpoint. */
const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';

/** URL fragment matched against responses to recognise the comment list endpoint. */
const COMMENT_PATH = '/aweme/v1/web/comment/list/';

/** URL fragment matched against responses to recognise the post list endpoint. */
const POST_PATH = '/aweme/v1/web/aweme/post/';
/**
 * Reads the work detail (and comments, when present) from the data the page
 * itself pushed into `window.__pace_captured__`.
 *
 * @param context Browser context; not used by this function.
 * @param page Page that has already loaded the work and run the capture init script.
 * @returns The work detail and, if the captured payload contained a `comment`
 *   field, the comments — both passed through `createCamelCompatibleProxy`;
 *   `comments` is `null` otherwise.
 * @throws {Error} When no captured entry yields `aweme.detail`.
 */
async function readPostMem(context: BrowserContext, page: Page) {
  const captured = await page
    .evaluate(() => {
      // @ts-ignore
      const entry = window.__pace_captured__.find(item => item[1] && item[1].includes(`"awemeId":`));
      // A missing entry makes this access throw inside the page; the rejection
      // is turned into `null` by the catch below.
      const raw = entry[1];
      const jsonStart = raw.indexOf("{");
      const jsonText = raw.slice(jsonStart).replaceAll("]\n", '');
      return JSON.parse(jsonText);
    })
    .catch(() => null);

  const detail = captured?.aweme?.detail as DouyinImageAweme;
  if (!detail) {
    throw new Error('页面内存数据中未找到作品详情');
  }

  // Expose the in-memory `authorInfo` under the `author` key as well.
  // @ts-ignore
  detail.author = detail.authorInfo;

  const comments = captured.comment
    ? createCamelCompatibleProxy<DouyinCommentResponse>(captured.comment)
    : null;
  const aweme = createCamelCompatibleProxy(detail);

  return { aweme, comments };
}
/**
 * Error raised by the scraper, carrying an HTTP-style status code and an
 * optional machine-readable error code alongside the message.
 */
export class ScrapeError extends Error {
  /** HTTP-style status code; 500 when not supplied. */
  public statusCode: number;

  /** Optional machine-readable identifier such as `'VIDEO_NOT_FOUND'`. */
  public code?: string;

  constructor(message: string, statusCode: number = 500, code?: string) {
    super(message);
    this.statusCode = statusCode;
    this.code = code;
    this.name = 'ScrapeError';
  }
}
/**
 * Scrapes one Douyin work (image post or video) and persists it.
 *
 * Steps:
 * 1. Open `url` in a page of the shared browser context, with an init script
 *    that mirrors everything pushed to `__pace_f` into `__pace_captured__`.
 * 2. Try the in-memory path first (`readPostMem`); on success the work is saved
 *    as an image post.
 * 3. Otherwise fall back to whichever of the detail / post-list responses
 *    arrived first and save a video or an image post accordingly.
 *
 * @param url Page URL of the work to scrape.
 * @returns The saved record spread together with `type: "image"` or `type: "video"`.
 * @throws {ScrapeError} For every failure after the browser context has been
 *   acquired: 404 for missing work/data, 408 for timeouts, 503 for `net::`
 *   errors, 500 otherwise. A failure of `acquireBrowserContext()` itself
 *   propagates unchanged.
 */
export async function scrapeDouyin(url: string) {
  console.log(chalk.blue('🚀 启动共享 Chromium 浏览器...'));
  const context = await acquireBrowserContext();

  // Declared out here so `finally` can close it, but created inside the `try`
  // so that a failing newPage()/addInitScript() still releases the context.
  let page: Page | undefined;

  try {
    page = await context.newPage();
    console.log(chalk.cyan(`📄 正在访问: ${chalk.underline(url)}`));

    await page.addInitScript(() => {
      // Global container that collects everything the site pushes.
      (window as any).__pace_captured__ = [];
      const captured = (window as any).__pace_captured__;

      // Array wrapped in a Proxy so that `push` calls can be intercepted.
      const proxyArr = new Proxy([] as any[], {
        get(target, prop, receiver) {
          if (prop === 'push') {
            return (...items: any[]) => {
              // Mirroring must never break the site's own push.
              try { captured.push(...items); } catch { }
              return Array.prototype.push.apply(target, items);
            };
          }
          return Reflect.get(target, prop, receiver);
        },
        set(target, prop, value, receiver) {
          // All writes (including `length`) are forwarded unchanged.
          return Reflect.set(target, prop, value, receiver);
        }
      });
      (self as any).__pace_f = proxyArr;
      (window as any).__pace_f = proxyArr;
    });

    // Register the "first one wins" listener before navigating so no response is missed.
    const firstTypePromise = waitForFirstResponse(context, [
      { key: 'detail', test: (r: Response) => r.url().includes(DETAIL_PATH) && r.status() === 200 },
      { key: 'post', test: (r: Response) => r.url().includes(POST_PATH) && r.status() === 200 },
    ], 40_000);

    // Comments are optional: a failed wait resolves to null instead of throwing.
    const commentPromise = waitForResponseWithTimeout(
      context, (r: Response) => r.url().includes(COMMENT_PATH) && r.status() === 200, 40_000
    ).catch(() => null);

    await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60_000 });

    // Look for the "video does not exist" notice on the page.
    const notFoundHits = await page.locator('text=视频不存在').count().catch(() => 0);
    if (notFoundHits > 0) {
      console.error(chalk.red('✗ 视频不存在或已被删除'));
      throw new ScrapeError('视频不存在或已被删除', 404, 'VIDEO_NOT_FOUND');
    }

    try {
      // Preferred path: read the image-post data straight from page memory.
      let { aweme, comments } = await readPostMem(context, page);
      console.log(chalk.green('✓ 从内存读取图文数据成功'));
      const uploads = await handleImagePost(context, aweme);
      if (!comments) {
        console.warn(chalk.yellow('⚠ 从内存读取评论数据失败,尝试从网络请求获取评论数据'));
        const commentRes = await commentPromise;
        comments = commentRes && await safeJson<DouyinCommentResponse>(commentRes);
        if (!comments) {
          console.warn(chalk.yellow('⚠ 无法从内存读取评论数据,且网络请求也未返回评论数据'));
          comments = { comments: [], total: 0, status_code: 0 };
        } else {
          console.log(chalk.green('✓ 从网络请求获取评论数据成功'));
        }
      } else {
        console.log(chalk.green('✓ 从内存读取评论数据成功'));
      }
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } catch (memoryError) {
      // Still best-effort (we fall back to the network responses below), but
      // the reason is logged so upload/DB failures are not silently hidden.
      const reason = memoryError instanceof Error ? memoryError.message : String(memoryError);
      console.warn(chalk.yellow(`⚠ 内存数据路径失败,回退到网络响应: ${reason}`));
    }

    const commentRes = await commentPromise;
    const firstType = await firstTypePromise;
    if (!firstType) {
      console.error(chalk.red('✗ 既无法从内存读取数据,也无法从网络获得数据'));
      throw new ScrapeError('无法获取作品数据,可能是网络问题或作品已下架', 404, 'NO_DATA');
    }
    console.log(chalk.cyan(`📡 检测到作品类型: ${chalk.bold(firstType.key === 'post' ? '图文' : '视频')}`));

    let comments = commentRes && await safeJson<DouyinCommentResponse>(commentRes);
    if (!comments) {
      console.warn(chalk.yellow('⚠ 无法从内存读取评论数据,且网络请求也未返回评论数据'));
      comments = { comments: [], total: 0, status_code: 0 };
    }

    // Branch on which response won the race: post list (image) or detail (video).
    if (firstType.key === 'post') {
      // Image post
      const postJson = await safeJson<DouyinPostListResponse>(firstType.response);
      if (!postJson?.aweme_list?.length) throw new ScrapeError('图文作品响应为空', 404, 'EMPTY_POST_RESPONSE');

      // Use only the pathname so a query string, hash or trailing slash in the
      // final URL cannot leak into the id we compare against `aweme_id`.
      const targetAwemeId = new URL(page.url()).pathname.split('/').filter(Boolean).at(-1);
      const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
      const aweme = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === targetAwemeId);
      if (!aweme) {
        throw new ScrapeError('无法找到目标作品,可能已被删除', 404, 'POST_NOT_FOUND');
      }
      const uploads = await handleImagePost(context, aweme);
      // The full post-list JSON is handed to the persistence layer as well.
      const saved = await saveImagePostToDB(context, aweme, comments, uploads, postJson);
      console.log(chalk.green.bold('✓ 图文作品保存成功'));
      return { type: "image", ...saved };
    } else if (firstType.key === 'detail') {
      // Video
      const detail = (await safeJson<DouyinVideoDetailResponse>(firstType.response))!;

      // Pick the best play address; `video` is accessed optionally so a detail
      // payload without it does not raise a TypeError here.
      const bestPlayAddr = pickBestPlayAddr(
        detail?.aweme_detail?.video?.bit_rate
      );
      const bestVUrl = bestPlayAddr?.url_list?.[0];
      const fps = bestPlayAddr?.FPS ?? null;
      console.log(chalk.cyan(`📹 最佳视频 URL: ${chalk.dim(bestVUrl)}`));
      console.log(chalk.cyan(`🎞️ 视频帧率: ${chalk.bold(fps || 'N/A')} FPS`));
      if (bestPlayAddr?.width && bestPlayAddr?.height) {
        console.log(chalk.cyan(`📐 视频分辨率: ${chalk.bold(`${bestPlayAddr.width}x${bestPlayAddr.height}`)}`));
      }

      // Download the video and upload it to MinIO to obtain an external URL.
      let uploadedUrl: string | undefined;
      let coverUrl: string | undefined;
      if (bestVUrl && detail?.aweme_detail) {
        console.log(chalk.blue('⬇️ 正在下载视频...'));
        const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
        const awemeId = detail.aweme_detail.aweme_id;
        const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');
        console.log(chalk.blue('⬆️ 正在上传视频到 MinIO...'));
        uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
        console.log(chalk.green(`✓ 视频上传成功: ${chalk.underline(uploadedUrl)}`));

        // Extract the first frame as the cover; a failure here is non-fatal.
        try {
          console.log(chalk.blue('🖼️ 正在提取视频封面...'));
          const cover = await extractFirstFrame(buffer);
          if (cover) {
            const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
            coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
            console.log(chalk.green(`✓ 封面上传成功: ${chalk.underline(coverUrl)}`));
          }
        } catch (e) {
          console.warn(chalk.yellow(`⚠ 提取封面失败,跳过: ${(e as Error)?.message || e}`));
        }
      }

      const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl, fps ?? undefined);
      console.log(chalk.green.bold('✓ 视频作品保存成功'));
      return { type: "video", ...saved };
    } else {
      throw new ScrapeError('无法判定作品类型,接口响应异常', 500, 'UNKNOWN_TYPE');
    }
  } catch (error) {
    // Our own errors already carry the right status; pass them through.
    if (error instanceof ScrapeError) {
      throw error;
    }

    const errMsg = (error as Error)?.message || String(error);
    console.error(chalk.red(`✗ 爬取失败: ${errMsg}`));

    // Playwright reports timeouts as a `TimeoutError` whose message starts with
    // a capital "Timeout", so match by name and case-insensitively by message.
    const isTimeout = (error as Error)?.name === 'TimeoutError' || /timeout|超时/i.test(errMsg);
    if (isTimeout) {
      throw new ScrapeError('请求超时,请稍后重试', 408, 'TIMEOUT');
    }
    if (errMsg.includes('页面内存数据中未找到作品详情')) {
      throw new ScrapeError('作品数据加载失败', 404, 'DATA_NOT_LOADED');
    }
    if (errMsg.includes('net::')) {
      throw new ScrapeError('网络连接失败', 503, 'NETWORK_ERROR');
    }
    // Anything else is reported as a generic server error.
    throw new ScrapeError(errMsg || '爬取过程中发生未知错误', 500, 'UNKNOWN_ERROR');
  } finally {
    console.log(chalk.gray('🧹 清理资源...'));

    // Plain close(): it skips beforeunload handlers (which could raise a dialog
    // that keeps the page open) and waits until the page is actually closed.
    if (page) {
      try { await page.close(); } catch { /* best effort */ }
    }

    // Each cleanup step is isolated so that one failing step neither skips the
    // next nor replaces the function's real result/error.
    // Only the reference to the shared context is released; the window stays open.
    try {
      await releaseBrowserContext();
    } catch (releaseError) {
      console.warn(chalk.yellow(`⚠ 释放浏览器上下文失败: ${(releaseError as Error)?.message || releaseError}`));
    }
    try {
      await prisma.$disconnect();
    } catch (disconnectError) {
      console.warn(chalk.yellow(`⚠ 断开数据库连接失败: ${(disconnectError as Error)?.message || disconnectError}`));
    }

    console.log(chalk.gray('✓ 资源清理完成'));
  }
}