// src/scrapeDouyin.ts
import { chromium, type Response } from 'playwright';
import type { Page } from 'playwright';

import { prisma } from '@/lib/prisma';
import { uploadFile, generateUniqueFileName } from '@/lib/minio';
import { createCamelCompatibleProxy } from '@/app/fetcher/utils';
import { waitForFirstResponse, waitForResponseWithTimeout, safeJson, downloadBinary } from '@/app/fetcher/network';
import { pickBestPlayAddr, extractFirstFrame } from '@/app/fetcher/media';
import { handleImagePost } from '@/app/fetcher/uploader';
import { saveToDB, saveImagePostToDB } from '@/app/fetcher/persist';
/** URL fragment used to recognise the aweme detail response (handled below as a video post). */
const DETAIL_PATH = '/aweme/v1/web/aweme/detail/';

/** URL fragment used to recognise the comment list response. */
const COMMENT_PATH = '/aweme/v1/web/comment/list/';

/** URL fragment used to recognise the aweme post list response (handled below as an image post). */
const POST_PATH = '/aweme/v1/web/aweme/post/';
/** Page globals installed by the init script to record everything pushed to `__pace_f`. */
type PaceGlobals = {
  __pace_captured__?: unknown[];
  __pace_f?: unknown[];
};

/**
 * Registers an init script that points `self.__pace_f` / `window.__pace_f` at a
 * Proxy-wrapped array whose `push` also copies every pushed item into
 * `window.__pace_captured__`, so the data can be read back later via `page.evaluate`.
 */
async function installPaceCapture(page: Page): Promise<void> {
  // The callback is executed inside the page, so it may only touch page globals.
  await page.addInitScript(() => {
    const captured: unknown[] = [];
    (window as typeof window & PaceGlobals).__pace_captured__ = captured;

    const queue = new Proxy<unknown[]>([], {
      get(target, prop, receiver) {
        if (prop === 'push') {
          return (...items: unknown[]) => {
            // Recording is best effort: it must never break the page's own push.
            try {
              captured.push(...items);
            } catch {
              /* ignore */
            }
            return Array.prototype.push.apply(target, items);
          };
        }
        return Reflect.get(target, prop, receiver);
      },
      // No `set` trap: the default behaviour already forwards writes to the target.
    });

    // The queue may be reached through either `self` or `window`.
    (self as typeof self & PaceGlobals).__pace_f = queue;
    (window as typeof window & PaceGlobals).__pace_f = queue;
  });
}

/**
 * Reads the aweme detail object out of the chunks recorded in `window.__pace_captured__`.
 *
 * A chunk qualifies when its element `[1]` is a string containing `"awemeId":`; the text
 * from the first `{` onwards (with `"]\n"` sequences removed) is parsed as JSON and
 * `aweme.detail` is taken from it. The result gets `author` aliased from `authorInfo`
 * and is wrapped with `createCamelCompatibleProxy`.
 *
 * @returns the wrapped aweme, or `undefined` when no recorded chunk yields one.
 */
async function readAwemeFromPageMemory(page: Page) {
  const detail = (await page.evaluate(() => {
    const captured = (window as typeof window & PaceGlobals).__pace_captured__ ?? [];
    for (const entry of captured) {
      const chunk: unknown =
        typeof entry === 'object' && entry !== null ? (entry as Record<number, unknown>)[1] : undefined;
      if (typeof chunk !== 'string' || !chunk.includes('"awemeId":')) continue;

      const jsonStart = chunk.indexOf('{');
      if (jsonStart < 0) continue;

      try {
        const parsed = JSON.parse(chunk.slice(jsonStart).replaceAll(']\n', ''));
        const found = parsed?.aweme?.detail;
        if (found && typeof found === 'object') return found;
      } catch {
        // Not parseable; keep looking at later chunks.
      }
    }
    return null;
  })) as DouyinImageAweme | null;

  if (!detail) return undefined;

  // Expose `authorInfo` under `author` too; an existing `author` is kept when
  // `authorInfo` is absent.
  const authorInfo: unknown = Reflect.get(detail, 'authorInfo');
  if (authorInfo != null) Reflect.set(detail, 'author', authorInfo);

  return createCamelCompatibleProxy(detail);
}

/**
 * Parses the captured comment response, falling back to an empty comment payload
 * when no response was captured or its body could not be parsed.
 */
async function readComments(commentRes: Response | null | undefined) {
  const parsed = commentRes ? await safeJson<DouyinCommentResponse>(commentRes) : null;
  return parsed ?? { comments: [], total: 0, status_code: 0 };
}

/**
 * Returns the last non-empty path segment of `pageUrl`, ignoring any query string,
 * fragment or trailing slash (e.g. `https://host/note/123/?a=b` gives `123`).
 */
function lastPathSegment(pageUrl: string): string | undefined {
  let pathname = pageUrl;
  try {
    pathname = new URL(pageUrl).pathname;
  } catch {
    // Not an absolute URL: fall back to splitting the raw string.
  }
  return pathname.split('/').filter(Boolean).at(-1);
}

/** Runs one cleanup step and logs (instead of throwing) if it fails, so later steps still run. */
async function runCleanupStep(label: string, step: () => Promise<unknown>): Promise<void> {
  try {
    await step();
  } catch (e: unknown) {
    console.warn(`Cleanup step "${label}" failed:`, e instanceof Error ? e.message : e);
  }
}

/**
 * Opens `url` in a persistent headless Chromium context, works out whether the post is
 * a video or an image post from the first API response observed, uploads the media and
 * persists the result.
 *
 * - `detail` response first: treated as a video. The best play address is downloaded and
 *   uploaded, a first-frame cover is uploaded when extraction succeeds, then `saveToDB`.
 * - `post` response first: treated as an image post. The aweme matching the id in the
 *   page URL is taken from the list, or from page memory when the list lacks it.
 * - neither: the aweme is read from page memory and handled as an image post.
 *
 * @param url page URL to open.
 * @returns the persisted record fields spread next to `type` (`"image"` or `"video"`).
 * @throws Error when no aweme can be found, or when the post/detail response is empty.
 */
export async function scrapeDouyin(url: string) {
  console.log("Launch chromium");

  // launchPersistentContext starts its own browser; no separate chromium.launch() is needed.
  const context = await chromium.launchPersistentContext('chrome-profile/douyin', { headless: true });

  try {
    const page = await context.newPage();
    await installPaceCapture(page);

    // Register the listeners before navigating so that no response is missed.
    const firstTypePromise = waitForFirstResponse(context, [
      { key: 'detail', test: (r: Response) => r.url().includes(DETAIL_PATH) && r.status() === 200 },
      { key: 'post', test: (r: Response) => r.url().includes(POST_PATH) && r.status() === 200 },
    ], 20_000); // one overall 20s timeout rather than one per matcher

    // Comments are optional: use them if they show up within 8s, otherwise carry on.
    const commentPromise = waitForResponseWithTimeout(
      context,
      (r: Response) => r.url().includes(COMMENT_PATH) && r.status() === 200,
      8_000
    ).catch(() => null);

    // Awaiting everything together means a failed navigation cannot leave one of the
    // listener promises rejecting without a handler.
    const [firstType, commentRes] = await Promise.all([
      firstTypePromise,
      commentPromise,
      page.goto(url, { waitUntil: 'domcontentloaded', timeout: 60_000 }),
    ]);

    if (!firstType) {
      console.warn('无法判定作品类型(未捕获详情或图文接口)');

      const aweme = await readAwemeFromPageMemory(page);
      if (!aweme) throw new Error('页面内存数据中未找到作品详情');

      const comments = await readComments(commentRes);
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      return { type: "image", ...saved };
    }

    if (firstType.key === 'post') {
      // Image post.
      const postJson = await safeJson<DouyinPostListResponse>(firstType.response);
      if (!postJson?.aweme_list?.length) throw new Error('图文作品响应为空');

      const target_aweme_id = lastPathSegment(page.url());
      // NOTE(review): the list is reinterpreted as image awemes without validation,
      // exactly as before — confirm the two shapes are compatible.
      const awemeList = postJson.aweme_list as unknown as DouyinImageAweme[];
      const listed = awemeList.find((pt: DouyinImageAweme) => pt.aweme_id === target_aweme_id);

      if (!listed) {
        console.warn(`图文作品响应中未找到对应作品,look for aweme_id=${target_aweme_id}, have ${postJson.aweme_list.map(pt => pt.aweme_id).join(', ')}`);
      }

      // Fall back to page memory, normalised the same way as in the no-response path.
      const aweme = listed ?? (await readAwemeFromPageMemory(page));
      if (!aweme) throw new Error('页面内存数据中未找到作品详情');

      const comments = await readComments(commentRes);
      const uploads = await handleImagePost(context, aweme);
      const saved = await saveImagePostToDB(context, aweme, comments, uploads);
      return { type: "image", ...saved };
    }

    if (firstType.key === 'detail') {
      // Video post.
      const detail = await safeJson<DouyinVideoDetailResponse>(firstType.response);
      if (!detail) throw new Error('视频作品响应为空');

      const comments = await readComments(commentRes);

      // Pick the play address chosen by pickBestPlayAddr and use its first URL.
      const bestPlayAddr = pickBestPlayAddr(detail.aweme_detail?.video?.bit_rate);
      const bestVUrl = bestPlayAddr?.url_list?.[0];
      console.log('Best video URL:', bestVUrl);

      // Download the video and upload it to MinIO to obtain a link.
      let uploadedUrl: string | undefined;
      let coverUrl: string | undefined;

      if (bestVUrl && detail.aweme_detail) {
        const { buffer, contentType, ext } = await downloadBinary(context, bestVUrl);
        const awemeId = detail.aweme_detail.aweme_id;

        const fileName = generateUniqueFileName(`${awemeId}.${ext}`, 'douyin/videos');
        uploadedUrl = await uploadFile(buffer, fileName, { 'Content-Type': contentType });
        console.log('Uploaded to MinIO:', uploadedUrl);

        // The cover is optional: a failed frame extraction must not fail the scrape.
        try {
          const cover = await extractFirstFrame(buffer);
          if (cover) {
            const coverName = generateUniqueFileName(`${awemeId}.jpg`, 'douyin/covers');
            coverUrl = await uploadFile(cover.buffer, coverName, { 'Content-Type': cover.contentType });
            console.log('Cover uploaded to MinIO:', coverUrl);
          }
        } catch (e: unknown) {
          console.warn('Extract first frame failed, skip cover:', e instanceof Error ? e.message : e);
        }
      }

      const saved = await saveToDB(context, detail, comments, uploadedUrl, bestPlayAddr?.width, bestPlayAddr?.height, coverUrl);
      return { type: "video", ...saved };
    }

    throw new Error('无法判定作品类型(未命中详情或图文接口)');
  } finally {
    // Each step is isolated so one failing close neither skips the next nor masks
    // the error (or result) of the scrape itself.
    await runCleanupStep('browser context', () => context.close());
    // NOTE(review): this disconnects the imported prisma client after every scrape —
    // confirm no other caller relies on it staying connected.
    await runCleanupStep('prisma', () => prisma.$disconnect());
  }
}