import fs from 'node:fs/promises';
import path from 'node:path';
import { chromium, type Browser, type BrowserContext, type Page } from 'playwright';
import { AppDataSource, PlatformAccount, Work } from '../models/index.js';
import { logger } from '../utils/logger.js';
import { WorkDayStatisticsService } from './WorkDayStatisticsService.js';
import type { ProxyConfig } from '@media-manager/shared';
import { AccountService } from './AccountService.js';
import { getPythonServiceBaseUrl } from './PythonServiceConfigService.js';

/** Cookie shape handed to Playwright's `BrowserContext.addCookies`. */
type PlaywrightCookie = {
  name: string;
  value: string;
  domain?: string;
  path?: string;
  url?: string;
  expires?: number;
  httpOnly?: boolean;
  secure?: boolean;
  sameSite?: 'Lax' | 'None' | 'Strict';
};

/** Values sent as the `type` query/body parameter of articleListStatistic. */
type BjhListType = 'small_video_v2' | 'video' | 'news';

type ArticleListStatisticItem = {
  article_id?: string;
  nid?: string;
  id?: string;
  title?: string;
  type?: string;
  view_count?: number;
  comment_count?: number;
  likes_count?: number;
  collect_count?: number;
  share_count?: number;
  rec_count?: number;
};

type ArticleListStatisticResponse = {
  errno?: number;
  errmsg?: string;
  data?: {
    count?: string | number;
    list?: ArticleListStatisticItem[];
  };
};

type TrendItem = {
  event_day?: string; // YYYYMMDD
  view_count?: string | number;
  disp_pv?: string | number;
  likes_count?: string | number;
  comment_count?: string | number;
  collect_count?: string | number;
  share_count?: string | number;
  cover_ctr?: string | number;
  completion_ratio?: string | number;
  avg_duration?: string | number;
  view_duration?: string | number;
  fans_add_cnt?: string | number;
};

type GetTrendDataResponse = {
  errno?: number;
  errmsg?: string;
  data?: {
    basic_list?: TrendItem[];
  };
};

/** Envelope fields read from the Python service's JSON responses. */
type PythonEnvelope = {
  errno?: unknown;
  errmsg?: unknown;
  error?: unknown;
  data?: unknown;
};

const BJH_ORIGIN = 'https://baijiahao.baidu.com';
const BJH_ANALYSIS_PAGE = 'https://baijiahao.baidu.com/builder/rc/analysiscontent/single';
const CHROMIUM_LAUNCH_ARGS = [
  '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage',
  '--disable-gpu',
  '--window-size=1920,1080',
];
const HOUR_MS = 60 * 60 * 1000;
const DAY_MS = 24 * HOUR_MS;
/** China Standard Time is a fixed UTC+8 offset (no DST). */
const CHINA_UTC_OFFSET_MS = 8 * HOUR_MS;
const PYTHON_REQUEST_TIMEOUT_MS = 30_000;
const LIST_PAGE_SIZE = 10;
/** Hard stop for pagination so a misbehaving API cannot loop forever. */
const MAX_LIST_PAGES = 200;

/** Creates directory `p` (and any missing parents). */
function ensureDir(p: string) {
  return fs.mkdir(p, { recursive: true });
}

/**
 * Parses an account's stored cookie blob into Playwright cookies.
 *
 * Two formats are accepted:
 *  1. JSON: either an array of cookie objects or an object with a `cookies` array.
 *  2. A header-style string: `a=b; c=d`.
 *
 * If the input looks like JSON but fails to parse, it falls through to format 2.
 *
 * @param cookieData raw cookie string stored on the account, or null
 * @returns parsed cookies; an empty array when nothing usable was found
 */
function parseCookiesFromAccount(cookieData: string | null): PlaywrightCookie[] {
  if (!cookieData) return [];
  const raw = cookieData.trim();
  if (!raw) return [];

  // 1) JSON array (or `{ cookies: [...] }`)
  if (raw.startsWith('[') || raw.startsWith('{')) {
    try {
      const parsed = JSON.parse(raw);
      const arr = Array.isArray(parsed) ? parsed : (parsed?.cookies ? parsed.cookies : []);
      if (!Array.isArray(arr)) return [];

      const result: PlaywrightCookie[] = [];
      for (const entry of arr as unknown[]) {
        const c = (entry ?? {}) as Record<string, unknown>;
        const name = String(c.name ?? '').trim();
        const value = String(c.value ?? '').trim();
        if (!name) continue;

        const sameSiteRaw = c.sameSite;
        const cookie: PlaywrightCookie = {
          name,
          value,
          expires: typeof c.expires === 'number' ? c.expires : undefined,
          httpOnly: typeof c.httpOnly === 'boolean' ? c.httpOnly : undefined,
          secure: typeof c.secure === 'boolean' ? c.secure : undefined,
          sameSite:
            sameSiteRaw === 'Lax' || sameSiteRaw === 'None' || sameSiteRaw === 'Strict'
              ? sameSiteRaw
              : undefined,
        };

        // Playwright requires EITHER `url` OR a `domain`/`path` pair and rejects
        // cookies that carry both `url` and `path`, so `path` is only set
        // together with `domain`.
        if (c.domain) {
          cookie.domain = String(c.domain);
          cookie.path = c.path ? String(c.path) : '/';
        } else {
          cookie.url = BJH_ORIGIN;
        }
        result.push(cookie);
      }
      return result;
    } catch {
      // Not valid JSON: fall through to the "a=b; c=d" parser.
    }
  }

  // 2) "a=b; c=d"
  const cookies: PlaywrightCookie[] = [];
  for (const pair of raw.split(';')) {
    const trimmed = pair.trim();
    if (!trimmed) continue;
    const idx = trimmed.indexOf('=');
    if (idx <= 0) continue;
    const name = trimmed.slice(0, idx).trim();
    const value = trimmed.slice(idx + 1).trim();
    if (!name) continue;
    cookies.push({ name, value, url: BJH_ORIGIN });
  }
  return cookies;
}

/**
 * Launches a headless Chromium, routed through `proxy` when it is enabled.
 *
 * @returns the browser plus `shouldClose`, which is always `true` here
 *          (every call launches a dedicated browser).
 */
async function createBrowserForAccount(
  proxy: ProxyConfig | null
): Promise<{ browser: Browser; shouldClose: boolean }> {
  const browser = await chromium.launch({
    headless: true,
    args: [...CHROMIUM_LAUNCH_ARGS],
    ...(proxy?.enabled
      ? {
          proxy: {
            server: `${proxy.type}://${proxy.host}:${proxy.port}`,
            username: proxy.username,
            password: proxy.password,
          },
        }
      : {}),
  });
  return { browser, shouldClose: true };
}

/**
 * Heuristic check for a JWT-shaped string: at least 60 characters and exactly
 * three dot-separated base64url segments of more than 10 characters each.
 */
function isJwtLike(v: unknown): v is string {
  if (!v || typeof v !== 'string') return false;
  const s = v.trim();
  if (s.length < 60) return false;
  const parts = s.split('.');
  if (parts.length !== 3) return false;
  return parts.every((p) => /^[A-Za-z0-9_-]+$/.test(p) && p.length > 10);
}

/**
 * Tries to find a JWT-like token on the given page.
 *
 * Lookup order: localStorage, sessionStorage, `<meta name="token|bjh-token">`,
 * a few well-known global state objects, and finally a regex scan of the HTML.
 *
 * @returns the token, or an empty string when none was found
 */
async function extractTokenFromPage(page: Page): Promise<string> {
  const token = await page
    .evaluate(() => {
      // This callback runs inside the browser, so it cannot reference the
      // module-level isJwtLike and needs its own copy of the check.
      const isJwtLikeInner = (v: any) => {
        if (!v || typeof v !== 'string') return false;
        const s = v.trim();
        if (s.length < 60) return false;
        const parts = s.split('.');
        if (parts.length !== 3) return false;
        return parts.every((p) => /^[A-Za-z0-9_-]+$/.test(p) && p.length > 10);
      };

      const pickFromStorage = (storage: Storage) => {
        try {
          const keys = Object.keys(storage || {});
          for (const k of keys) {
            const v = storage.getItem(k);
            if (isJwtLikeInner(v)) return v;
          }
        } catch {
          // ignore (storage access may be denied)
        }
        return '';
      };

      let t = pickFromStorage(window.localStorage);
      if (t) return t;
      t = pickFromStorage(window.sessionStorage);
      if (t) return t;

      const meta = document.querySelector('meta[name="token"], meta[name="bjh-token"]');
      const metaToken = meta && meta.getAttribute('content');
      if (isJwtLikeInner(metaToken)) return metaToken;

      const candidates = [
        ((window as any).__INITIAL_STATE__ && (window as any).__INITIAL_STATE__.token) || '',
        ((window as any).__PRELOADED_STATE__ && (window as any).__PRELOADED_STATE__.token) || '',
        ((window as any).__NUXT__ &&
          (window as any).__NUXT__.state &&
          (window as any).__NUXT__.state.token) ||
          '',
      ];
      for (const c of candidates) {
        if (isJwtLikeInner(c)) return c;
      }
      return '';
    })
    .catch(() => '');

  if (token && isJwtLike(token)) return token;

  // HTML fallback: test every three-segment candidate, not only the first one,
  // because an earlier non-token match must not hide a later real token.
  const html = await page.content().catch(() => '');
  const pattern = /([A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,})/g;
  for (const m of html.matchAll(pattern)) {
    if (isJwtLike(m[1])) return m[1];
  }
  return '';
}

/** Formats `date` as `YYYYMMDD` using the host's local-time calendar fields. */
function toYmd(date: Date): string {
  const yyyy = date.getFullYear();
  const mm = String(date.getMonth() + 1).padStart(2, '0');
  const dd = String(date.getDate()).padStart(2, '0');
  return `${yyyy}${mm}${dd}`;
}

/**
 * Returns the China (UTC+8) calendar day, `offsetDays` away from `now`, as `YYYYMMDD`.
 *
 * The instant is shifted by +8h and then read through the UTC getters, so the
 * result does not depend on the host's own timezone.
 */
function chinaYmdWithOffset(now: Date, offsetDays: number): string {
  const shifted = new Date(now.getTime() + CHINA_UTC_OFFSET_MS + offsetDays * DAY_MS);
  return toYmd(
    new Date(shifted.getUTCFullYear(), shifted.getUTCMonth(), shifted.getUTCDate())
  );
}

/**
 * Parses a compact `YYYYMMDD` string into a local-midnight Date.
 *
 * @returns the date, or `null` when the string is malformed or names a day
 *          that does not exist (e.g. month 13, February 30)
 */
function parseYyyyMmDdCompactToDate(day: string): Date | null {
  const s = String(day || '').trim();
  const m = s.match(/^(\d{4})(\d{2})(\d{2})$/);
  if (!m) return null;
  const year = Number(m[1]);
  const monthIndex = Number(m[2]) - 1;
  const dayOfMonth = Number(m[3]);
  const d = new Date(year, monthIndex, dayOfMonth);
  // `new Date` silently rolls impossible dates over into the next month/year;
  // reject those instead of attributing statistics to the wrong day.
  if (
    d.getFullYear() !== year ||
    d.getMonth() !== monthIndex ||
    d.getDate() !== dayOfMonth
  ) {
    return null;
  }
  d.setHours(0, 0, 0, 0);
  return d;
}

/**
 * Coerces a value to an integer (floored). Thousands separators (`,`) are
 * stripped; null/undefined/blank/non-numeric input yields 0.
 */
function toInt(v: unknown): number {
  if (v === null || v === undefined) return 0;
  if (typeof v === 'number' && Number.isFinite(v)) return Math.floor(v);
  const s = String(v).trim();
  if (!s) return 0;
  const n = Number(s.replace(/,/g, ''));
  return Number.isFinite(n) ? Math.floor(n) : 0;
}

/** Stringifies a value, mapping null/undefined/blank to `'0'`. */
function _toStr(v: unknown): string {
  if (v === null || v === undefined) return '0';
  const s = String(v).trim();
  return s || '0';
}

/**
 * Formats a rate for storage: non-zero values get a `%` suffix, zero or empty
 * values become `'0'`.
 *
 * - Input that already contains `%` is returned unchanged (if its numeric part is non-zero).
 * - Values in `(0, 1]` are treated as fractions and multiplied by 100 (two decimals).
 * - Other values are rounded to at most two decimals and suffixed with `%`.
 */
function formatRateWithPercent(v: unknown): string {
  if (v === null || v === undefined) return '0';
  const s = String(v).trim();
  if (!s) return '0';
  // Detect '%' before numeric parsing: Number('12.5%') is NaN, which would
  // otherwise turn an already-formatted rate into '0'.
  const hasPercent = s.includes('%');
  const n = Number(s.replace(/[,%]/g, ''));
  if (!Number.isFinite(n) || n === 0) return '0';
  if (hasPercent) return s;
  if (n > 0 && n <= 1) return `${(n * 100).toFixed(2)}%`;
  return `${Number(n.toFixed(2))}%`;
}

/** Formats a watch duration with exactly two decimals; non-numeric input yields `'0'`. */
function formatDurationTwoDecimals(v: unknown): string {
  if (v === null || v === undefined) return '0';
  const n = Number(String(v).trim().replace(/,/g, ''));
  if (!Number.isFinite(n)) return '0';
  return n.toFixed(2);
}

/**
 * Imports per-work daily statistics for Baijiahao accounts.
 *
 * For each account it pages through articleListStatistic (via the Python
 * service), writes the list totals into the `yesterday*` columns of the
 * matching works, then pulls each work's trend data and saves the daily rows
 * through `WorkDayStatisticsService`.
 */
export class BaijiahaoWorkDailyStatisticsImportService {
  private accountRepository = AppDataSource.getRepository(PlatformAccount);
  private workRepository = AppDataSource.getRepository(Work);
  private workDayStatisticsService = new WorkDayStatisticsService();
  private accountService = new AccountService();
  private stateDir = path.resolve(process.cwd(), 'tmp', 'baijiahao-storage-state');

  /** Runs the import for every Baijiahao account. */
  static async runDailyImport(): Promise<void> {
    const svc = new BaijiahaoWorkDailyStatisticsImportService();
    await svc.runDailyImportForAllBaijiahaoAccounts();
  }

  /**
   * Runs the import for a single account.
   *
   * @throws Error when no Baijiahao account with `accountId` exists
   */
  static async runDailyImportForAccount(accountId: number): Promise<void> {
    const svc = new BaijiahaoWorkDailyStatisticsImportService();
    const account = await svc.accountRepository.findOne({
      where: { id: accountId, platform: 'baijiahao' as any },
    });
    if (!account) throw new Error(`未找到百家号账号 id=${accountId}`);
    await svc.importAccountWorkDaily(account);
  }

  /**
   * Imports all Baijiahao accounts sequentially. A failure in one account is
   * logged and does not stop the remaining accounts.
   */
  async runDailyImportForAllBaijiahaoAccounts(): Promise<void> {
    await ensureDir(this.stateDir);
    const accounts = await this.accountRepository.find({
      where: { platform: 'baijiahao' as any },
    });
    logger.info(`[BJ WorkDaily] Start. total_accounts=${accounts.length}`);
    for (const account of accounts) {
      try {
        await this.importAccountWorkDaily(account);
      } catch (e) {
        logger.error(
          `[BJ WorkDaily] Account failed. accountId=${account.id} name=${account.accountName || ''}`,
          e
        );
      }
    }
    logger.info('[BJ WorkDaily] Done.');
  }

  /** Path of the persisted Playwright storage state for an account. */
  private getStatePath(accountId: number) {
    return path.join(this.stateDir, `${accountId}.json`);
  }

  /**
   * Opens a browser context for the account, visits the analysis page and
   * tries to extract a token from it.
   *
   * A previously saved storage state is reused when present; otherwise the
   * given cookies are injected. When a token is found the storage state is
   * (best-effort) saved again. On success the caller owns `browser`/`context`;
   * on failure the browser is closed before the error is rethrown.
   */
  private async _createContext(
    account: PlatformAccount,
    cookies: PlaywrightCookie[]
  ): Promise<{ context: BrowserContext; browser: Browser; shouldClose: boolean; token: string }> {
    const { browser, shouldClose } = await createBrowserForAccount(account.proxyConfig);
    try {
      const statePath = this.getStatePath(account.id);
      let hasState = false;
      try {
        await fs.access(statePath);
        hasState = true;
      } catch {
        hasState = false;
      }

      const context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        locale: 'zh-CN',
        timezoneId: 'Asia/Shanghai',
        userAgent:
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36 Edg/144.0.0.0',
        ...(hasState ? { storageState: statePath } : {}),
      });
      context.setDefaultTimeout(60_000);

      if (!hasState) {
        await context.addCookies(cookies as any);
      }

      const page = await context.newPage();
      let token = '';
      try {
        await page.goto(BJH_ANALYSIS_PAGE, { waitUntil: 'domcontentloaded' });
        // Give client-side scripts a moment to populate storage before reading it.
        await page.waitForTimeout(1500);
        token = await extractTokenFromPage(page);

        if (token) {
          try {
            await ensureDir(this.stateDir);
            await context.storageState({ path: statePath });
          } catch {
            // Persisting the state is an optimisation only; ignore failures.
          }
        }
      } finally {
        await page.close().catch(() => undefined);
      }

      return { context, browser, shouldClose, token };
    } catch (e) {
      // Nobody else holds a reference to the browser yet, so close it here
      // rather than leaking a Chromium process.
      await browser.close().catch(() => undefined);
      throw e;
    }
  }

  /** Headers sent with direct Baijiahao API calls; `token` is added only when non-empty. */
  private buildCommonHeaders(token: string): Record<string, string> {
    const headers: Record<string, string> = {
      accept: 'application/json, text/plain, */*',
      'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
      referer: BJH_ANALYSIS_PAGE,
    };
    if (token) headers.token = token;
    return headers;
  }

  /**
   * Fetches one page of articleListStatistic directly through the browser
   * context's request API.
   *
   * @throws Error when the response body is not JSON
   */
  private async _fetchArticleListStatisticPage(
    context: BrowserContext,
    token: string,
    params: {
      startDay: string; // YYYYMMDD
      endDay: string; // YYYYMMDD
      type: BjhListType;
      num: number;
      count: number;
    }
  ): Promise<ArticleListStatisticResponse> {
    const { startDay, endDay, type, num, count } = params;
    const url = `https://baijiahao.baidu.com/author/eco/statistics/articleListStatistic?start_day=${startDay}&end_day=${endDay}&type=${type}&num=${num}&count=${count}`;
    const res = await (context as any).request.get(url, {
      headers: this.buildCommonHeaders(token),
    });
    const json = (await res.json().catch(() => null)) as ArticleListStatisticResponse | null;
    if (!json) throw new Error(`articleListStatistic json parse failed (http=${res.status()})`);
    return json;
  }

  /**
   * Fetches gettrenddata for one work directly through the browser context's
   * request API.
   *
   * @throws Error when the response body is not JSON
   */
  private async _fetchTrendData(
    context: BrowserContext,
    token: string,
    nid: string
  ): Promise<GetTrendDataResponse> {
    const url = `https://baijiahao.baidu.com/author/eco/statistic/gettrenddata?nid=${encodeURIComponent(
      nid
    )}&trend_type=all&data_type=addition`;
    const res = await (context as any).request.get(url, {
      headers: this.buildCommonHeaders(token),
    });
    const json = (await res.json().catch(() => null)) as GetTrendDataResponse | null;
    if (!json) throw new Error(`gettrenddata json parse failed (http=${res.status()})`);
    return json;
  }

  /** True for the errno values this service treats as "not logged in" (110 and 20040001). */
  private isNotLoggedInErrno(errno: unknown): boolean {
    const n = typeof errno === 'number' ? errno : Number(errno);
    // 110: not logged in; 20040001: "current user is not logged in"
    return n === 110 || n === 20040001;
  }

  /** True when `e` carries the `BJH_NOT_LOGGED_IN` code or a "not logged in" message. */
  private isNotLoggedInError(e: unknown): boolean {
    const err = e as any;
    if (!err) return false;
    if (err.code === 'BJH_NOT_LOGGED_IN') return true;
    const msg = String(err.message || '').toLowerCase();
    return msg.includes('未登录') || msg.includes('not logged in');
  }

  /** Builds the error thrown when `api` reports a not-logged-in errno. */
  private notLoggedInError(api: string, errno: unknown): Error {
    return Object.assign(new Error(`${api} errno=${errno} 未登录/会话失效`), {
      code: 'BJH_NOT_LOGGED_IN',
    });
  }

  /**
   * POSTs `payload` plus the account's raw cookie string to
   * `<python base>/baijiahao/<endpoint>` and normalises the response envelope.
   *
   * The request is aborted after 30 seconds.
   *
   * @throws Error when the account has no cookie, the response is not JSON,
   *         or the HTTP status is not OK
   */
  private async postToPythonService<TData>(
    account: PlatformAccount,
    endpoint: 'article_stats' | 'trend_data',
    payload: Record<string, unknown>
  ): Promise<{ errno: number; errmsg?: string; data?: TData }> {
    const base = (await getPythonServiceBaseUrl()).replace(/\/$/, '');
    const url = `${base}/baijiahao/${endpoint}`;

    const cookie = String(account.cookieData || '').trim();
    if (!cookie) {
      throw new Error(`百家号账号 cookie 为空,无法调用 Python ${endpoint}`);
    }

    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), PYTHON_REQUEST_TIMEOUT_MS);
    try {
      const res = await fetch(url, {
        method: 'POST',
        signal: controller.signal,
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({ cookie, ...payload }),
      });

      const text = await res.text();
      let parsed: PythonEnvelope | null;
      try {
        parsed = text ? JSON.parse(text) : {};
      } catch {
        throw new Error(`Python ${endpoint} 返回非 JSON 响应: http=${res.status}`);
      }

      if (!res.ok) {
        const msg =
          String(parsed?.errmsg || parsed?.error || '').trim() || `HTTP ${res.status}`;
        throw new Error(`Python ${endpoint} 调用失败: ${msg}`);
      }

      const rawErrno = parsed?.errno;
      const errno = typeof rawErrno === 'number' ? rawErrno : Number(rawErrno ?? 0);
      const errmsg = String(parsed?.errmsg || parsed?.error || '').trim() || undefined;
      return { errno, errmsg, data: parsed?.data as TData | undefined };
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Calls Baijiahao's articleListStatistic through the Python service, which
   * receives the account's cookie string and performs the request.
   */
  private async fetchArticleListStatisticViaPython(
    account: PlatformAccount,
    params: {
      startDay: string; // YYYYMMDD
      endDay: string; // YYYYMMDD
      type: BjhListType;
      num: number;
      count: number;
    }
  ): Promise<ArticleListStatisticResponse> {
    return this.postToPythonService<ArticleListStatisticResponse['data']>(
      account,
      'article_stats',
      {
        start_day: params.startDay,
        end_day: params.endDay,
        type: params.type,
        num: params.num,
        count: params.count,
      }
    );
  }

  /**
   * Calls Baijiahao's gettrenddata through the Python service, which receives
   * the account's cookie string and performs the request.
   */
  private async fetchTrendDataViaPython(
    account: PlatformAccount,
    nid: string
  ): Promise<GetTrendDataResponse> {
    return this.postToPythonService<GetTrendDataResponse['data']>(account, 'trend_data', {
      nid,
    });
  }

  /**
   * Imports the last 30 China-time days of statistics for one account.
   *
   * Skips accounts without parsable cookies or without Baijiahao works. When a
   * not-logged-in error occurs on the first attempt, the account is refreshed
   * and the import retried once (see `recoverFromExpiredLogin`).
   *
   * @param account account to import
   * @param isRetry true when this call is the single retry after a refresh
   * @throws any error other than a first-attempt not-logged-in error
   */
  private async importAccountWorkDaily(account: PlatformAccount, isRetry = false): Promise<void> {
    const cookies = parseCookiesFromAccount(account.cookieData);
    if (!cookies.length) {
      logger.warn(
        `[BJ WorkDaily] accountId=${account.id} cookieData 为空或无法解析,跳过`
      );
      return;
    }

    const works = await this.workRepository.find({
      where: { accountId: account.id, platform: 'baijiahao' as any },
      select: ['id', 'platformVideoId'],
    });
    if (!works.length) {
      logger.info(
        `[BJ WorkDaily] accountId=${account.id} 没有 baijiahao 作品,跳过`
      );
      return;
    }

    // platformVideoId -> work id
    const idMap = new Map<string, Work['id']>();
    for (const w of works) {
      const key = String(w.platformVideoId || '').trim();
      if (key) idMap.set(key, w.id);
    }

    try {
      await this.importRecentDays(account, idMap);
    } catch (e) {
      if (!isRetry && this.isNotLoggedInError(e)) {
        await this.recoverFromExpiredLogin(account);
        return;
      }
      throw e;
    }
  }

  /**
   * Pages through every list type for the 30-day window ending yesterday
   * (China time) and stores the results for works present in `idMap`.
   */
  private async importRecentDays(
    account: PlatformAccount,
    idMap: Map<string, Work['id']>
  ): Promise<void> {
    // Window: yesterday (China time) is end_day, 29 days earlier is start_day.
    const now = new Date();
    const endDay = chinaYmdWithOffset(now, -1);
    const startDay = chinaYmdWithOffset(now, -30);

    const types: BjhListType[] = ['small_video_v2', 'video', 'news'];
    let worksUpdated = 0;
    let wdsInserted = 0;
    let wdsUpdated = 0;

    for (const type of types) {
      for (let num = 1; num <= MAX_LIST_PAGES; num += 1) {
        const body = await this.fetchArticleListStatisticViaPython(account, {
          startDay,
          endDay,
          type,
          num,
          count: LIST_PAGE_SIZE,
        });

        if (this.isNotLoggedInErrno(body.errno)) {
          throw this.notLoggedInError('articleListStatistic', body.errno);
        }
        if (body.errno !== 0) {
          throw new Error(
            `articleListStatistic errno=${body.errno} errmsg=${body.errmsg || ''}`
          );
        }

        const list = body.data?.list || [];
        const total = toInt(body.data?.count);
        if (!list.length) break;

        // Keep only list items that map to a known work.
        const matched: { item: ArticleListStatisticItem; articleId: string; workId: Work['id'] }[] = [];
        for (const item of list) {
          const articleId = String(item.article_id || '').trim();
          if (!articleId) continue;
          const workId = idMap.get(articleId);
          if (!workId) continue;
          matched.push({ item, articleId, workId });
        }

        // 1) Write the list totals into works.yesterday_* first.
        for (const { item, workId } of matched) {
          const patch: Partial<Work> = {
            yesterdayPlayCount: toInt(item.view_count),
            yesterdayCommentCount: toInt(item.comment_count),
            yesterdayLikeCount: toInt(item.likes_count),
            yesterdayCollectCount: toInt(item.collect_count),
            yesterdayShareCount: toInt(item.share_count),
            // Baijiahao list rec_count -> recommend count
            yesterdayRecommendCount: toInt(item.rec_count),
          };
          const r = await this.workRepository.update(workId, patch as any);
          if (r.affected && r.affected > 0) worksUpdated += r.affected;
        }

        // 2) Then fetch each work's trend and write basic_list into work_day_statistics.
        for (const { articleId, workId } of matched) {
          const trend = await this.fetchTrendDataViaPython(account, articleId);
          if (this.isNotLoggedInErrno(trend.errno)) {
            throw this.notLoggedInError('gettrenddata', trend.errno);
          }
          if (trend.errno !== 0) {
            // A single work's trend failure is not fatal; skip it.
            logger.warn(
              `[BJ WorkDaily] gettrenddata errno=${trend.errno} nid=${articleId} errmsg=${trend.errmsg || ''}`
            );
            continue;
          }

          const basic = trend.data?.basic_list || [];
          for (const day of basic) {
            const d = parseYyyyMmDdCompactToDate(String(day.event_day || ''));
            if (!d) continue;
            const save = await this.workDayStatisticsService.saveStatisticsForDate(workId, d, {
              playCount: toInt(day.view_count),
              likeCount: toInt(day.likes_count),
              commentCount: toInt(day.comment_count),
              collectCount: toInt(day.collect_count),
              shareCount: toInt(day.share_count),
              // basic_list has no recommend-count field; map it to recommendCount if one appears.
              fansIncrease: toInt(day.fans_add_cnt),
              coverClickRate: formatRateWithPercent(day.cover_ctr),
              completionRate: formatRateWithPercent(day.completion_ratio),
              avgWatchDuration: formatDurationTwoDecimals(day.avg_duration),
              totalWatchDuration: formatDurationTwoDecimals(day.view_duration),
            });
            wdsInserted += save.inserted;
            wdsUpdated += save.updated;
          }
        }

        // Stop once the reported total has been covered; when no total is
        // reported, keep paging until an empty page or the page cap.
        if (total > 0 && num * LIST_PAGE_SIZE >= total) break;
      }
    }

    logger.info(
      `[BJ WorkDaily] accountId=${account.id} done. worksUpdated=${worksUpdated} wdsInserted=${wdsInserted} wdsUpdated=${wdsUpdated} range=${startDay}-${endDay}`
    );
  }

  /**
   * Handles a not-logged-in failure: refreshes the account and retries the
   * import once.
   *
   * The account is marked `expired` when the refresh reports that a re-login
   * is needed, or when anything in the refresh/retry sequence throws. This
   * method never throws for those cases; the failure is logged instead.
   */
  private async recoverFromExpiredLogin(account: PlatformAccount): Promise<void> {
    logger.info(
      `[BJ WorkDaily] Login expired detected for account ${account.id}, attempting to refresh account...`
    );
    try {
      const refreshResult = await this.accountService.refreshAccount(
        account.userId,
        account.id
      );
      if (refreshResult.needReLogin) {
        logger.warn(
          `[BJ WorkDaily] Account ${account.id} refresh finished but still need re-login, mark as expired.`
        );
        await this.accountRepository.update(account.id, {
          status: 'expired' as any,
        });
        return;
      }

      // Reload so the retry sees whatever the refresh persisted.
      const refreshed = await this.accountRepository.findOne({
        where: { id: account.id },
      });
      if (!refreshed) {
        throw new Error('账号刷新后未找到');
      }

      logger.info(
        `[BJ WorkDaily] Account ${account.id} refresh success, retry work daily import once...`
      );
      await this.importAccountWorkDaily(refreshed, true);
    } catch (refreshError) {
      logger.error(
        `[BJ WorkDaily] Account ${account.id} refresh failed:`,
        refreshError
      );
      await this.accountRepository.update(account.id, {
        status: 'expired' as any,
      });
    }
  }
}