优化爬取
This commit is contained in:
parent
9b45c4e3e8
commit
9d1598a5ab
@ -1,5 +1,5 @@
|
||||
// src/scrapeDouyin.ts
|
||||
import { chromium, type Response } from 'playwright';
|
||||
import { BrowserContext, chromium, Page, type Response } from 'playwright';
|
||||
import { prisma } from '@/lib/prisma';
|
||||
import { uploadFile, generateUniqueFileName } from '@/lib/minio';
|
||||
import { createCamelCompatibleProxy } from '@/app/fetcher/utils';
|
||||
@ -7,16 +7,39 @@ import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBin
|
||||
import { pickBestPlayAddr, extractFirstFrame } from '@/app/fetcher/media';
|
||||
import { handleImagePost } from '@/app/fetcher/uploader';
|
||||
import { saveToDB, saveImagePostToDB } from '@/app/fetcher/persist';
|
||||
import chalk from 'chalk';
|
||||
|
||||
const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';
|
||||
const COMMENT_PATH = '/aweme/v1/web/comment/list/';
|
||||
const POST_PATH = '/aweme/v1/web/aweme/post/'
|
||||
|
||||
/**
 * Reads the work detail (and comments, when present) from the data the page
 * has collected in `window.__pace_captured__`.
 *
 * Inside the page, the first captured entry whose second element contains
 * `"awemeId":` is selected; its text from the first `{` onward, with every
 * `]\n` sequence removed, is parsed as JSON. Any failure during that in-page
 * extraction is treated as "no data available".
 *
 * @param context Browser context; not used by this function.
 * @param page Page whose captured in-memory data is read.
 * @returns `aweme` — the `aweme.detail` object (with `author` copied from
 *   `authorInfo`) wrapped by `createCamelCompatibleProxy`; `comments` — the
 *   wrapped `comment` payload, or `null` when the parsed data has none.
 * @throws Error when no work detail can be found in the page's memory data.
 */
async function readPostMem(context: BrowserContext, page: Page) {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- shape of the parsed page data is not declared here
  let captured: any = null;
  try {
    captured = await page.evaluate(() => {
      // @ts-ignore
      const payload = window.__pace_captured__.find(i => i[1] && i[1].includes(`"awemeId":`))[1];
      // Skip whatever precedes the JSON object, then drop the `]\n` sequences before parsing.
      const jsonText = payload.slice(payload.indexOf("{")).replaceAll("]\n", '');
      return JSON.parse(jsonText);
    });
  } catch {
    // Extraction is best-effort: a failed evaluate is handled below as missing data.
    captured = null;
  }

  const detail = captured?.aweme?.detail as DouyinImageAweme;
  if (!detail) {
    throw new Error('页面内存数据中未找到作品详情');
  }

  // The in-memory detail carries the author under `authorInfo`; mirror it onto `author`.
  // @ts-ignore
  detail.author = detail.authorInfo;

  const comments = captured.comment
    ? createCamelCompatibleProxy<DouyinCommentResponse>(captured.comment)
    : null;
  const aweme = createCamelCompatibleProxy(detail);

  return { aweme, comments };
}
|
||||
|
||||
|
||||
export async function scrapeDouyin(url: string) {
|
||||
const browser = await chromium.launch({ headless: true });
|
||||
console.log("Launch chromium");
|
||||
console.log(chalk.blue('🚀 启动 Chromium 浏览器...'));
|
||||
|
||||
const context = await chromium.launchPersistentContext('chrome-profile/douyin', { headless: true });
|
||||
const context = await chromium.launchPersistentContext('chrome-profile/douyin', { headless: false });
|
||||
const page = await context.newPage();
|
||||
console.log(chalk.cyan(`📄 正在访问: ${chalk.underline(url)}`));
|
||||
|
||||
await page.addInitScript(() => {
|
||||
// 建一个全局容器存捕获的数据
|
||||
@ -41,8 +64,6 @@ export async function scrapeDouyin(url: string) {
|
||||
}
|
||||
});
|
||||
|
||||
// 把 self/window 上的同名队列都指向我们的 proxy
|
||||
// 有些站点用 self,有些用 window
|
||||
(self as any).__pace_f = proxyArr;
|
||||
(window as any).__pace_f = proxyArr;
|
||||
});
|
||||
@ -52,40 +73,56 @@ export async function scrapeDouyin(url: string) {
|
||||
const firstTypePromise = waitForFirstResponse(context, [
|
||||
{ key: 'detail', test: (r: Response) => r.url().includes(DETAIL_PATH) && r.status() === 200 },
|
||||
{ key: 'post', test: (r: Response) => r.url().includes(POST_PATH) && r.status() === 200 },
|
||||
], 20_000); // 整体 20s 兜底超时,不逐个等待
|
||||
], 9_000); // 整体 9s 兜底超时,不逐个等待
|
||||
|
||||
// 评论只做短时“有就用、没有不等”的监听
|
||||
const commentPromise = waitForResponseWithTimeout(
|
||||
context,
|
||||
(r: Response) => r.url().includes(COMMENT_PATH) && r.status() === 200,
|
||||
8_000
|
||||
context, (r: Response) => r.url().includes(COMMENT_PATH) && r.status() === 200, 8_000
|
||||
).catch(() => null);
|
||||
|
||||
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60_000 });
|
||||
|
||||
const firstType = await firstTypePromise; // { key, response } | null
|
||||
const commentRes = await commentPromise; // Response | null
|
||||
|
||||
if (!firstType) {
|
||||
console.warn('无法判定作品类型(未捕获详情或图文接口)');
|
||||
const md = await page.evaluate(() => {
|
||||
// @ts-ignore
|
||||
let data = window.__pace_captured__.find(i => i[1] && i[1].includes(`"awemeId":`))[1]
|
||||
return JSON.parse(data.slice(data.indexOf("{")).replaceAll("]\n", ''))
|
||||
// return {aweme: { detail: {} } };
|
||||
});
|
||||
let aweme_mem = md.aweme.detail as DouyinImageAweme;
|
||||
if (!aweme_mem) throw new Error('页面内存数据中未找到作品详情');
|
||||
|
||||
//@ts-ignore
|
||||
aweme_mem.author = aweme_mem.authorInfo
|
||||
const comments = commentRes ? (await safeJson<DouyinCommentResponse>(commentRes))! : { comments: [], total: 0, status_code: 0 };
|
||||
|
||||
const aweme = createCamelCompatibleProxy(aweme_mem);
|
||||
try {
|
||||
// 优先尝试从内存读取图文数据
|
||||
let { aweme, comments } = await readPostMem(context, page);
|
||||
console.log(chalk.green('✓ 从内存读取图文数据成功'));
|
||||
|
||||
const uploads = await handleImagePost(context, aweme);
|
||||
const saved = await saveImagePostToDB(context, aweme, comments, uploads);
|
||||
if (!comments) {
|
||||
console.warn(chalk.yellow('⚠ 从内存读取评论数据失败,尝试从网络请求获取评论数据'));
|
||||
|
||||
const commentRes = await commentPromise;
|
||||
comments = commentRes && await safeJson<DouyinCommentResponse>(commentRes);
|
||||
if (!comments) {
|
||||
console.warn(chalk.yellow('⚠ 无法从内存读取评论数据,且网络请求也未返回评论数据'));
|
||||
comments = { comments: [], total: 0, status_code: 0 };
|
||||
} else {
|
||||
console.log(chalk.green('✓ 从网络请求获取评论数据成功'));
|
||||
}
|
||||
} else {
|
||||
console.log(chalk.green('✓ 从内存读取评论数据成功'));
|
||||
}
|
||||
const saved = await saveImagePostToDB(context, aweme, comments, uploads); // 传递完整 JSON
|
||||
console.log(chalk.green.bold('✓ 图文作品保存成功'));
|
||||
return { type: "image", ...saved };
|
||||
} catch {
|
||||
|
||||
}
|
||||
|
||||
const commentRes = await commentPromise;
|
||||
const firstType = await firstTypePromise;
|
||||
|
||||
if (!firstType) {
|
||||
console.error(chalk.red('✗ 既无法从内存读取数据,也无法从网络获得数据'));
|
||||
throw new Error('既无法从内存读取数据,也无法从网络获得数据。');
|
||||
}
|
||||
|
||||
console.log(chalk.cyan(`📡 检测到作品类型: ${chalk.bold(firstType.key === 'post' ? '图文' : '视频')}`));
|
||||
|
||||
let comments = commentRes && await safeJson<DouyinCommentResponse>(commentRes);
|
||||
if (!comments) {
|
||||
console.warn(chalk.yellow('⚠ 无法从内存读取评论数据,且网络请求也未返回评论数据'));
|
||||
comments = { comments: [], total: 0, status_code: 0 };
|
||||
}
|
||||
|
||||
// 分支:视频 or 图文(两者只会有一个命中,先到先得)
|
||||
@ -99,72 +136,69 @@ export async function scrapeDouyin(url: string) {
|
||||
const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
|
||||
let aweme = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === target_aweme_id);
|
||||
if (!aweme) {
|
||||
console.warn(`图文作品响应中未找到对应作品,look for aweme_id=${target_aweme_id}, have ${postJson.aweme_list.map(pt => pt.aweme_id).join(', ')}`);
|
||||
// Try read from memory
|
||||
// await new Promise(resolve => setTimeout(resolve, 1000000));
|
||||
const md = await page.evaluate(() => {
|
||||
// @ts-ignore
|
||||
let data = window.__pace_captured__.find(i => i[1] && i[1].includes(`"awemeId":`))[1]
|
||||
return JSON.parse(data.slice(data.indexOf("{")).replaceAll("]\n", ''))
|
||||
// return {aweme: { detail: {} } };
|
||||
});
|
||||
aweme = md.aweme.detail as DouyinImageAweme;
|
||||
throw new Error('既无法从内存读取数据,Post 列表中也不包含需要爬取的作品。');
|
||||
}
|
||||
// console.log(aweme);
|
||||
// await new Promise(resolve => setTimeout(resolve, 1000000));
|
||||
console.log(aweme);
|
||||
|
||||
|
||||
const comments = commentRes ? (await safeJson<DouyinCommentResponse>(commentRes))! : { comments: [], total: 0, status_code: 0 };
|
||||
|
||||
const uploads = await handleImagePost(context, aweme);
|
||||
const saved = await saveImagePostToDB(context, aweme, comments, uploads);
|
||||
const saved = await saveImagePostToDB(context, aweme, comments, uploads, postJson); // 传递完整 JSON
|
||||
console.log(chalk.green.bold('✓ 图文作品保存成功'));
|
||||
return { type: "image", ...saved };
|
||||
} else if (firstType.key === 'detail') {
|
||||
// 视频作品
|
||||
const detail = (await safeJson<DouyinVideoDetailResponse>(firstType.response))!;
|
||||
const comments = commentRes ? (await safeJson<DouyinCommentResponse>(commentRes))! : { comments: [], total: 0, status_code: 0 };
|
||||
|
||||
// 找到比特率最高的 url
|
||||
const bestPlayAddr = pickBestPlayAddr(
|
||||
detail?.aweme_detail?.video.bit_rate
|
||||
);
|
||||
const bestVUrl = bestPlayAddr?.url_list?.[0];
|
||||
const fps = bestPlayAddr?.FPS ?? null; // 提取 FPS
|
||||
|
||||
console.log('Best video URL:', bestVUrl);
|
||||
console.log(chalk.cyan(`📹 最佳视频 URL: ${chalk.dim(bestVUrl)}`));
|
||||
console.log(chalk.cyan(`🎞️ 视频帧率: ${chalk.bold(fps || 'N/A')} FPS`));
|
||||
if (bestPlayAddr?.width && bestPlayAddr?.height) {
|
||||
console.log(chalk.cyan(`📐 视频分辨率: ${chalk.bold(`${bestPlayAddr.width}x${bestPlayAddr.height}`)}`));
|
||||
}
|
||||
|
||||
// 下载视频并上传至 MinIO,获取外链
|
||||
let uploadedUrl: string | undefined;
|
||||
let coverUrl: string | undefined;
|
||||
if (bestVUrl && detail?.aweme_detail) {
|
||||
console.log(chalk.blue('⬇️ 正在下载视频...'));
|
||||
const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
|
||||
const awemeId = detail.aweme_detail.aweme_id;
|
||||
const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');
|
||||
|
||||
console.log(chalk.blue('⬆️ 正在上传视频到 MinIO...'));
|
||||
uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
|
||||
console.log('Uploaded to MinIO:', uploadedUrl);
|
||||
console.log(chalk.green(`✓ 视频上传成功: ${chalk.underline(uploadedUrl)}`));
|
||||
|
||||
// 提取首帧作为封面并上传
|
||||
try {
|
||||
console.log(chalk.blue('🖼️ 正在提取视频封面...'));
|
||||
const cover = await extractFirstFrame(buffer);
|
||||
if (cover) {
|
||||
const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
|
||||
coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
|
||||
console.log('Cover uploaded to MinIO:', coverUrl);
|
||||
console.log(chalk.green(`✓ 封面上传成功: ${chalk.underline(coverUrl)}`));
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('Extract first frame failed, skip cover:', (e as Error)?.message || e);
|
||||
console.warn(chalk.yellow(`⚠ 提取封面失败,跳过: ${(e as Error)?.message || e}`));
|
||||
}
|
||||
}
|
||||
|
||||
const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl);
|
||||
const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl, fps ?? undefined);
|
||||
console.log(chalk.green.bold('✓ 视频作品保存成功'));
|
||||
return { type: "video", ...saved };
|
||||
} else {
|
||||
throw new Error('无法判定作品类型(未命中详情或图文接口)');
|
||||
}
|
||||
} finally {
|
||||
console.log(chalk.gray('🧹 清理资源...'));
|
||||
await context.close();
|
||||
await browser.close();
|
||||
await prisma.$disconnect();
|
||||
console.log(chalk.gray('✓ 资源清理完成'));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -10,7 +10,8 @@ export async function saveToDB(
|
||||
videoUrl?: string,
|
||||
width?: number,
|
||||
height?: number,
|
||||
coverUrl?: string
|
||||
coverUrl?: string,
|
||||
fps?: number
|
||||
) {
|
||||
if (!detailResp?.aweme_detail) throw new Error('视频详情为空');
|
||||
const d = detailResp.aweme_detail;
|
||||
@ -63,6 +64,8 @@ export async function saveToDB(
|
||||
width: width ?? null,
|
||||
height: height ?? null,
|
||||
cover_url: coverUrl ?? null,
|
||||
fps: fps ?? null,
|
||||
raw_json: detailResp as any, // 保存完整接口 JSON
|
||||
},
|
||||
update: {
|
||||
desc: d.desc,
|
||||
@ -79,6 +82,8 @@ export async function saveToDB(
|
||||
...(width ? { width } : {}),
|
||||
...(height ? { height } : {}),
|
||||
...(coverUrl ? { cover_url: coverUrl } : {}),
|
||||
...(fps ? { fps } : {}),
|
||||
raw_json: detailResp as any, // 更新完整接口 JSON
|
||||
},
|
||||
});
|
||||
|
||||
@ -133,7 +138,8 @@ export async function saveImagePostToDB(
|
||||
context: BrowserContext,
|
||||
aweme: DouyinImageAweme,
|
||||
commentResp: DouyinCommentResponse,
|
||||
uploads: { images: { url: string; width?: number; height?: number }[]; musicUrl?: string }
|
||||
uploads: { images: { url: string; width?: number; height?: number }[]; musicUrl?: string },
|
||||
rawJson?: any
|
||||
) {
|
||||
if (!aweme?.author?.sec_uid) throw new Error('作者 sec_uid 缺失');
|
||||
|
||||
@ -180,6 +186,7 @@ export async function saveImagePostToDB(
|
||||
authorId: author.sec_uid,
|
||||
tags: (aweme.video_tag?.map(t => t.tag_name) ?? []),
|
||||
music_url: uploads.musicUrl ?? null,
|
||||
raw_json: rawJson ?? null, // 保存完整接口 JSON
|
||||
},
|
||||
update: {
|
||||
desc: aweme.desc,
|
||||
@ -192,6 +199,7 @@ export async function saveImagePostToDB(
|
||||
authorId: author.sec_uid,
|
||||
tags: (aweme.video_tag?.map(t => t.tag_name) ?? []),
|
||||
music_url: uploads.musicUrl ?? undefined,
|
||||
raw_json: rawJson ?? undefined, // 更新完整接口 JSON
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
3
app/fetcher/types.d.ts
vendored
3
app/fetcher/types.d.ts
vendored
@ -83,6 +83,9 @@ interface PlayVariant {
|
||||
width: number;
|
||||
height: number;
|
||||
data_size: number;
|
||||
FPS: number;
|
||||
is_bytevc1: number; // 0 or 1
|
||||
is_h265: number; // 0 or 1
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
3
bun.lock
3
bun.lock
@ -5,6 +5,7 @@
|
||||
"name": "douyin-archive",
|
||||
"dependencies": {
|
||||
"@prisma/client": "^6.16.3",
|
||||
"chalk": "^5.6.2",
|
||||
"lucide-react": "^0.546.0",
|
||||
"minio": "^8.0.6",
|
||||
"next": "15.5.6",
|
||||
@ -199,6 +200,8 @@
|
||||
|
||||
"caniuse-lite": ["caniuse-lite@1.0.30001751", "", {}, "sha512-A0QJhug0Ly64Ii3eIqHu5X51ebln3k4yTUkY1j8drqpWHVreg/VLijN48cZ1bYPiqOQuqpkIKnzr/Ul8V+p6Cw=="],
|
||||
|
||||
"chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="],
|
||||
|
||||
"chokidar": ["chokidar@4.0.3", "", { "dependencies": { "readdirp": "^4.0.1" } }, "sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA=="],
|
||||
|
||||
"chownr": ["chownr@3.0.0", "", {}, "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g=="],
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@prisma/client": "^6.16.3",
|
||||
"chalk": "^5.6.2",
|
||||
"lucide-react": "^0.546.0",
|
||||
"minio": "^8.0.6",
|
||||
"next": "15.5.6",
|
||||
|
||||
@ -0,0 +1,6 @@
|
||||
-- AlterTable
|
||||
ALTER TABLE "ImagePost" ADD COLUMN "raw_json" JSONB;
|
||||
|
||||
-- AlterTable
|
||||
ALTER TABLE "Video" ADD COLUMN "fps" INTEGER,
|
||||
ADD COLUMN "raw_json" JSONB;
|
||||
@ -42,6 +42,9 @@ model Video {
|
||||
width Int?
|
||||
height Int?
|
||||
|
||||
// 视频帧率
|
||||
fps Int?
|
||||
|
||||
// 视频封面(首帧提取后上传到 MinIO 的外链)
|
||||
cover_url String?
|
||||
|
||||
@ -53,6 +56,9 @@ model Video {
|
||||
tags String[] // 视频标签列表
|
||||
video_url String // 视频文件 URL
|
||||
|
||||
// 保存完整的接口原始 JSON 数据(用于备份和后续分析)
|
||||
raw_json Json?
|
||||
|
||||
createdAt DateTime @default(now())
|
||||
updatedAt DateTime @updatedAt
|
||||
|
||||
@ -117,6 +123,9 @@ model ImagePost {
|
||||
images ImageFile[]
|
||||
comments Comment[]
|
||||
|
||||
// 保存完整的接口原始 JSON 数据(用于备份和后续分析)
|
||||
raw_json Json?
|
||||
|
||||
createdAt DateTime @default(now())
|
||||
updatedAt DateTime @updatedAt
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user