diff --git a/app/fetcher/index.ts b/app/fetcher/index.ts
index 48e4a56..71f4d46 100644
--- a/app/fetcher/index.ts
+++ b/app/fetcher/index.ts
@@ -1,5 +1,5 @@
 // src/scrapeDouyin.ts
-import { chromium, type Response } from 'playwright';
+import { type BrowserContext, chromium, type Page, type Response } from 'playwright';
 import { prisma } from '@/lib/prisma';
 import { uploadFile, generateUniqueFileName } from '@/lib/minio';
 import { createCamelCompatibleProxy } from '@/app/fetcher/utils';
@@ -7,16 +7,51 @@ import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBin
 import { pickBestPlayAddr, extractFirstFrame } from '@/app/fetcher/media';
 import { handleImagePost } from '@/app/fetcher/uploader';
 import { saveToDB, saveImagePostToDB } from '@/app/fetcher/persist';
+import chalk from 'chalk';
 
 const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';
 const COMMENT_PATH = '/aweme/v1/web/comment/list/';
 const POST_PATH = '/aweme/v1/web/aweme/post/'
+
+/**
+ * Reads the work detail (and comments, if any) out of `window.__pace_captured__`
+ * in the page: takes the first captured entry whose second element contains
+ * `"awemeId":`, slices it from its first `{`, drops `]`+newline runs and parses it.
+ *
+ * @param context Browser context; currently unused by this helper.
+ * @param page Page whose captured data is read.
+ * @returns `aweme` and `comments` wrapped with `createCamelCompatibleProxy`;
+ *   `comments` is `null` when the parsed payload has no `comment` field.
+ * @throws Error when the parsed payload has no `aweme.detail`, including when
+ *   the in-page read itself fails (that case resolves to `null` first).
+ */
+async function readPostMem(context: BrowserContext, page: Page) {
+  const md = await page.evaluate(() => {
+    // @ts-ignore
+    let data = window.__pace_captured__.find(i => i[1] && i[1].includes(`"awemeId":`))[1]
+    return JSON.parse(data.slice(data.indexOf("{")).replaceAll("]\n", ''))
+    // return {aweme: { detail: {} } };
+  }).catch(() => null);
+
+  let aweme_mem = md?.aweme?.detail as DouyinImageAweme;
+  if (!aweme_mem) throw new Error('页面内存数据中未找到作品详情');
+
+  // @ts-ignore
+  aweme_mem.author = aweme_mem.authorInfo
+  const comments = md.comment ? createCamelCompatibleProxy(md.comment) : null;
+  const aweme = createCamelCompatibleProxy(aweme_mem);
+
+  return { aweme, comments }
+}
+
+
 export async function scrapeDouyin(url: string) {
   const browser = await chromium.launch({ headless: true });
-  console.log("Launch chromium");
+  console.log(chalk.blue('🚀 启动 Chromium 浏览器...'));
 
-  const context = await chromium.launchPersistentContext('chrome-profile/douyin', { headless: true });
+  const context = await chromium.launchPersistentContext('chrome-profile/douyin', { headless: false });
   const page = await context.newPage();
+  console.log(chalk.cyan(`📄 正在访问: ${chalk.underline(url)}`));
 
   await page.addInitScript(() => {
     // 建一个全局容器存捕获的数据
@@ -41,8 +76,6 @@ export async function scrapeDouyin(url: string) {
       }
     });
 
-    // 把 self/window 上的同名队列都指向我们的 proxy
-    // 有些站点用 self,有些用 window
     (self as any).__pace_f = proxyArr;
     (window as any).__pace_f = proxyArr;
   });
@@ -52,40 +85,56 @@ export async function scrapeDouyin(url: string) {
     const firstTypePromise = waitForFirstResponse(context, [
       { key: 'detail', test: (r: Response) => r.url().includes(DETAIL_PATH) && r.status() === 200 },
       { key: 'post', test: (r: Response) => r.url().includes(POST_PATH) && r.status() === 200 },
-    ], 20_000); // 整体 20s 兜底超时,不逐个等待
+    ], 9_000); // overall 9s safety-net timeout; the candidates are not awaited one by one
 
     // 评论只做短时“有就用、没有不等”的监听
     const commentPromise = waitForResponseWithTimeout(
-      context,
-      (r: Response) => r.url().includes(COMMENT_PATH) && r.status() === 200,
-      8_000
+      context, (r: Response) => r.url().includes(COMMENT_PATH) && r.status() === 200, 8_000
     ).catch(() => null);
 
     await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60_000 });
 
-    const firstType = await firstTypePromise; // { key, response } | null
-    const commentRes = await commentPromise; // Response | null
-
-    if (!firstType) {
-      console.warn('无法判定作品类型(未捕获详情或图文接口)');
-      const md = await page.evaluate(() => {
-        // @ts-ignore
-        let data = window.__pace_captured__.find(i => i[1] && i[1].includes(`"awemeId":`))[1]
-        return JSON.parse(data.slice(data.indexOf("{")).replaceAll("]\n", ''))
-        // return {aweme: { detail: {} } };
-      });
-      let aweme_mem = md.aweme.detail as DouyinImageAweme;
-      if (!aweme_mem) throw new Error('页面内存数据中未找到作品详情');
-
-      //@ts-ignore
-      aweme_mem.author = aweme_mem.authorInfo
-      const comments = commentRes ? (await safeJson(commentRes))! : { comments: [], total: 0, status_code: 0 };
-
-      const aweme = createCamelCompatibleProxy(aweme_mem);
+    try {
+      // Prefer the image-post data captured in page memory; the network path below is the fallback
+      let { aweme, comments } = await readPostMem(context, page);
+      console.log(chalk.green('✓ 从内存读取图文数据成功'));
 
       const uploads = await handleImagePost(context, aweme);
-      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
+      if (!comments) {
+        console.warn(chalk.yellow('⚠ 从内存读取评论数据失败,尝试从网络请求获取评论数据'));
+
+        const commentRes = await commentPromise;
+        comments = commentRes && await safeJson(commentRes);
+        if (!comments) {
+          console.warn(chalk.yellow('⚠ 无法从内存读取评论数据,且网络请求也未返回评论数据'));
+          comments = { comments: [], total: 0, status_code: 0 };
+        } else {
+          console.log(chalk.green('✓ 从网络请求获取评论数据成功'));
+        }
+      } else {
+        console.log(chalk.green('✓ 从内存读取评论数据成功'));
+      }
+      const saved = await saveImagePostToDB(context, aweme, comments, uploads); // no raw API JSON exists on the in-memory path
+      console.log(chalk.green.bold('✓ 图文作品保存成功'));
       return { type: "image", ...saved };
+    } catch (e) {
+      console.warn(chalk.yellow(`⚠ 内存读取路径失败,回退到网络接口: ${(e as Error)?.message || e}`));
+    }
+
+    const commentRes = await commentPromise;
+    const firstType = await firstTypePromise;
+
+    if (!firstType) {
+      console.error(chalk.red('✗ 既无法从内存读取数据,也无法从网络获得数据'));
+      throw new Error('既无法从内存读取数据,也无法从网络获得数据。');
+    }
+
+    console.log(chalk.cyan(`📡 检测到作品类型: ${chalk.bold(firstType.key === 'post' ? '图文' : '视频')}`));
+
+    let comments = commentRes && await safeJson(commentRes);
+    if (!comments) {
+      console.warn(chalk.yellow('⚠ 无法从内存读取评论数据,且网络请求也未返回评论数据'));
+      comments = { comments: [], total: 0, status_code: 0 };
     }
 
     // 分支:视频 or 图文(两者只会有一个命中,先到先得)
@@ -99,72 +148,69 @@ export async function scrapeDouyin(url: string) {
       const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
       let aweme = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === target_aweme_id);
       if (!aweme) {
-        console.warn(`图文作品响应中未找到对应作品,look for aweme_id=${target_aweme_id}, have ${postJson.aweme_list.map(pt => pt.aweme_id).join(', ')}`);
-        // Try read from memory
-        // await new Promise(resolve => setTimeout(resolve, 1000000));
-        const md = await page.evaluate(() => {
-          // @ts-ignore
-          let data = window.__pace_captured__.find(i => i[1] && i[1].includes(`"awemeId":`))[1]
-          return JSON.parse(data.slice(data.indexOf("{")).replaceAll("]\n", ''))
-          // return {aweme: { detail: {} } };
-        });
-        aweme = md.aweme.detail as DouyinImageAweme;
+        throw new Error('既无法从内存读取数据,Post 列表中也不包含需要爬取的作品。');
       }
 
-      // console.log(aweme);
-      // await new Promise(resolve => setTimeout(resolve, 1000000));
-      console.log(aweme);
-
-
-      const comments = commentRes ? (await safeJson(commentRes))! : { comments: [], total: 0, status_code: 0 };
 
       const uploads = await handleImagePost(context, aweme);
-      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
+      const saved = await saveImagePostToDB(context, aweme, comments, uploads, postJson); // pass the full post-list response JSON
+      console.log(chalk.green.bold('✓ 图文作品保存成功'));
       return { type: "image", ...saved };
     } else if (firstType.key === 'detail') {
       // 视频作品
       const detail = (await safeJson(firstType.response))!;
-      const comments = commentRes ? (await safeJson(commentRes))! : { comments: [], total: 0, status_code: 0 };
 
       // 找到比特率最高的 url
       const bestPlayAddr = pickBestPlayAddr(
         detail?.aweme_detail?.video.bit_rate
       );
       const bestVUrl = bestPlayAddr?.url_list?.[0];
+      const fps = bestPlayAddr?.FPS ?? null; // frame rate reported for the selected variant
 
-      console.log('Best video URL:', bestVUrl);
+      console.log(chalk.cyan(`📹 最佳视频 URL: ${chalk.dim(bestVUrl)}`));
+      console.log(chalk.cyan(`🎞️ 视频帧率: ${chalk.bold(fps || 'N/A')} FPS`));
+      if (bestPlayAddr?.width && bestPlayAddr?.height) {
+        console.log(chalk.cyan(`📐 视频分辨率: ${chalk.bold(`${bestPlayAddr.width}x${bestPlayAddr.height}`)}`));
+      }
 
       // 下载视频并上传至 MinIO,获取外链
       let uploadedUrl: string | undefined;
       let coverUrl: string | undefined;
       if (bestVUrl && detail?.aweme_detail) {
+        console.log(chalk.blue('⬇️ 正在下载视频...'));
         const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
         const awemeId = detail.aweme_detail.aweme_id;
         const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');
+
+        console.log(chalk.blue('⬆️ 正在上传视频到 MinIO...'));
         uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
-        console.log('Uploaded to MinIO:', uploadedUrl);
+        console.log(chalk.green(`✓ 视频上传成功: ${chalk.underline(uploadedUrl)}`));
 
         // 提取首帧作为封面并上传
         try {
+          console.log(chalk.blue('🖼️ 正在提取视频封面...'));
           const cover = await extractFirstFrame(buffer);
           if (cover) {
             const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
             coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
-            console.log('Cover uploaded to MinIO:', coverUrl);
+            console.log(chalk.green(`✓ 封面上传成功: ${chalk.underline(coverUrl)}`));
           }
         } catch (e) {
-          console.warn('Extract first frame failed, skip cover:', (e as Error)?.message || e);
+          console.warn(chalk.yellow(`⚠ 提取封面失败,跳过: ${(e as Error)?.message || e}`));
         }
       }
 
-      const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl);
+      const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl, fps ?? undefined);
+      console.log(chalk.green.bold('✓ 视频作品保存成功'));
       return { type: "video", ...saved };
     } else {
       throw new Error('无法判定作品类型(未命中详情或图文接口)');
     }
   } finally {
+    console.log(chalk.gray('🧹 清理资源...'));
     await context.close();
     await browser.close();
     await prisma.$disconnect();
+    console.log(chalk.gray('✓ 资源清理完成'));
   }
 }
diff --git a/app/fetcher/persist.ts b/app/fetcher/persist.ts
index 9dd27db..70512c5 100644
--- a/app/fetcher/persist.ts
+++ b/app/fetcher/persist.ts
@@ -10,7 +10,8 @@ export async function saveToDB(
   videoUrl?: string,
   width?: number,
   height?: number,
-  coverUrl?: string
+  coverUrl?: string,
+  fps?: number
 ) {
   if (!detailResp?.aweme_detail) throw new Error('视频详情为空');
   const d = detailResp.aweme_detail;
@@ -63,6 +64,8 @@ export async function saveToDB(
       width: width ?? null,
       height: height ?? null,
       cover_url: coverUrl ?? null,
+      fps: fps ?? null,
+      raw_json: detailResp as any, // store the full detail response JSON
     },
     update: {
       desc: d.desc,
@@ -79,6 +82,8 @@ export async function saveToDB(
       ...(width ? { width } : {}),
       ...(height ? { height } : {}),
       ...(coverUrl ? { cover_url: coverUrl } : {}),
+      ...(fps ? { fps } : {}),
+      raw_json: detailResp as any, // refresh the stored detail response JSON
     },
   });
 
@@ -133,7 +138,8 @@ export async function saveImagePostToDB(
   context: BrowserContext,
   aweme: DouyinImageAweme,
   commentResp: DouyinCommentResponse,
-  uploads: { images: { url: string; width?: number; height?: number }[]; musicUrl?: string }
+  uploads: { images: { url: string; width?: number; height?: number }[]; musicUrl?: string },
+  rawJson?: any
 ) {
   if (!aweme?.author?.sec_uid) throw new Error('作者 sec_uid 缺失');
 
@@ -180,6 +186,7 @@ export async function saveImagePostToDB(
       authorId: author.sec_uid,
       tags: (aweme.video_tag?.map(t => t.tag_name) ?? []),
       music_url: uploads.musicUrl ?? null,
+      raw_json: rawJson ?? undefined, // omit when absent: a nullable Json input does not take a bare null
     },
     update: {
       desc: aweme.desc,
@@ -192,6 +199,7 @@ export async function saveImagePostToDB(
       authorId: author.sec_uid,
       tags: (aweme.video_tag?.map(t => t.tag_name) ?? []),
       music_url: uploads.musicUrl ?? undefined,
+      raw_json: rawJson ?? undefined, // refresh the stored JSON only when one was supplied
     },
   });
 
diff --git a/app/fetcher/types.d.ts b/app/fetcher/types.d.ts
index 15fff6c..404c151 100644
--- a/app/fetcher/types.d.ts
+++ b/app/fetcher/types.d.ts
@@ -83,6 +83,9 @@ interface PlayVariant {
     width: number;
     height: number;
     data_size: number;
+    FPS: number;
+    is_bytevc1: number; // 0 or 1
+    is_h265: number; // 0 or 1
   };
 }
 
diff --git a/bun.lock b/bun.lock
index b667c8b..1fba115 100644
--- a/bun.lock
+++ b/bun.lock
@@ -5,6 +5,7 @@
       "name": "douyin-archive",
       "dependencies": {
         "@prisma/client": "^6.16.3",
+        "chalk": "^5.6.2",
         "lucide-react": "^0.546.0",
         "minio": "^8.0.6",
         "next": "15.5.6",
@@ -199,6 +200,8 @@
 
     "caniuse-lite": ["caniuse-lite@1.0.30001751", "", {}, "sha512-A0QJhug0Ly64Ii3eIqHu5X51ebln3k4yTUkY1j8drqpWHVreg/VLijN48cZ1bYPiqOQuqpkIKnzr/Ul8V+p6Cw=="],
 
+    "chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="],
+
     "chokidar": ["chokidar@4.0.3", "", { "dependencies": { "readdirp": "^4.0.1" } }, "sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA=="],
 
     "chownr": ["chownr@3.0.0", "", {}, "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g=="],
diff --git a/package.json b/package.json
index 0ad222e..dd04cd5 100644
--- a/package.json
+++ b/package.json
@@ -11,6 +11,7 @@
   },
   "dependencies": {
     "@prisma/client": "^6.16.3",
+    "chalk": "^5.6.2",
     "lucide-react": "^0.546.0",
     "minio": "^8.0.6",
     "next": "15.5.6",
diff --git a/prisma/migrations/20251021014140_add_raw_json_and_fps/migration.sql b/prisma/migrations/20251021014140_add_raw_json_and_fps/migration.sql
new file mode 100644
index 0000000..fef5c94
--- /dev/null
+++ b/prisma/migrations/20251021014140_add_raw_json_and_fps/migration.sql
@@ -0,0 +1,6 @@
+-- AlterTable
+ALTER TABLE "ImagePost" ADD COLUMN "raw_json" JSONB;
+
+-- AlterTable
+ALTER TABLE "Video" ADD COLUMN "fps" INTEGER,
+ADD COLUMN "raw_json" JSONB;
diff --git a/prisma/schema.prisma b/prisma/schema.prisma
index 808fd18..1d3182d 100644
--- a/prisma/schema.prisma
+++ b/prisma/schema.prisma
@@ -42,6 +42,9 @@ model Video {
   width  Int?
   height Int?
 
+  // Video frame rate
+  fps Int?
+
   // 视频封面(首帧提取后上传到 MinIO 的外链)
   cover_url String?
 
@@ -53,6 +56,9 @@ model Video {
   tags String[] // 视频标签列表
   video_url String // 视频文件 URL
 
+  // Full raw API JSON payload (kept for backup and later analysis)
+  raw_json Json?
+
   createdAt DateTime @default(now())
   updatedAt DateTime @updatedAt
 
@@ -117,6 +123,9 @@ model ImagePost {
   images ImageFile[]
   comments Comment[]
 
+  // Full raw API JSON payload (kept for backup and later analysis)
+  raw_json Json?
+
   createdAt DateTime @default(now())
   updatedAt DateTime @updatedAt
 