// BaijiahaoContentOverviewImportService.ts
  1. import fs from 'node:fs/promises';
  2. import path from 'node:path';
  3. import { chromium, type Browser, type Page, type BrowserContext } from 'playwright';
  4. import * as XLSXNS from 'xlsx';
  5. import { AppDataSource, PlatformAccount } from '../models/index.js';
  6. import { BrowserManager } from '../automation/browser.js';
  7. import { logger } from '../utils/logger.js';
  8. import { UserDayStatisticsService } from './UserDayStatisticsService.js';
  9. import { AccountService } from './AccountService.js';
  10. import { getPythonServiceBaseUrl } from './PythonServiceConfigService.js';
  11. import type { ProxyConfig } from '@media-manager/shared';
  12. import { WS_EVENTS } from '@media-manager/shared';
  13. import { wsManager } from '../websocket/index.js';
// Under ESM the xlsx exports may hang off `default`; fall back to the namespace object itself otherwise.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const XLSX: any = (XLSXNS as any).default ?? (XLSXNS as any);
  17. type PlaywrightCookie = {
  18. name: string;
  19. value: string;
  20. domain?: string;
  21. path?: string;
  22. url?: string;
  23. expires?: number;
  24. httpOnly?: boolean;
  25. secure?: boolean;
  26. sameSite?: 'Lax' | 'None' | 'Strict';
  27. };
  28. function ensureDir(p: string) {
  29. return fs.mkdir(p, { recursive: true });
  30. }
  31. function normalizeDateText(input: unknown): Date | null {
  32. if (!input) return null;
  33. const s = String(input).trim();
  34. if (!s) return null;
  35. // 20260115 / 2026-01-15 / 2026/01/15
  36. const mCompact = s.match(/^(\d{4})(\d{2})(\d{2})$/);
  37. if (mCompact) {
  38. const yyyy = Number(mCompact[1]);
  39. const mm = Number(mCompact[2]);
  40. const dd = Number(mCompact[3]);
  41. if (!yyyy || !mm || !dd) return null;
  42. const d = new Date(yyyy, mm - 1, dd);
  43. d.setHours(0, 0, 0, 0);
  44. return d;
  45. }
  46. const m1 = s.match(/(\d{4})\D(\d{1,2})\D(\d{1,2})/);
  47. if (m1) {
  48. const yyyy = Number(m1[1]);
  49. const mm = Number(m1[2]);
  50. const dd = Number(m1[3]);
  51. if (!yyyy || !mm || !dd) return null;
  52. const d = new Date(yyyy, mm - 1, dd);
  53. d.setHours(0, 0, 0, 0);
  54. return d;
  55. }
  56. return null;
  57. }
  58. function parseChineseNumberLike(input: unknown): number | null {
  59. if (input === null || input === undefined) return null;
  60. const s = String(input).trim();
  61. if (!s) return null;
  62. const plain = s.replace(/,/g, '');
  63. const wan = plain.match(/^(\d+(\.\d+)?)\s*万$/);
  64. if (wan) return Math.round(Number(wan[1]) * 10000);
  65. const yi = plain.match(/^(\d+(\.\d+)?)\s*亿$/);
  66. if (yi) return Math.round(Number(yi[1]) * 100000000);
  67. const n = Number(plain.replace(/[^\d.-]/g, ''));
  68. if (Number.isFinite(n)) return Math.round(n);
  69. return null;
  70. }
  71. function parseCookiesFromAccount(cookieData: string | null): PlaywrightCookie[] {
  72. if (!cookieData) return [];
  73. const raw = cookieData.trim();
  74. if (!raw) return [];
  75. // 1) JSON array
  76. if (raw.startsWith('[') || raw.startsWith('{')) {
  77. try {
  78. const parsed = JSON.parse(raw);
  79. const arr = Array.isArray(parsed) ? parsed : (parsed?.cookies ? parsed.cookies : []);
  80. if (!Array.isArray(arr)) return [];
  81. return arr
  82. .map((c: any) => {
  83. const name = String(c?.name ?? '').trim();
  84. const value = String(c?.value ?? '').trim();
  85. if (!name) return null;
  86. const domain = c?.domain ? String(c.domain) : undefined;
  87. const pathVal = c?.path ? String(c.path) : '/';
  88. const url = !domain ? 'https://baijiahao.baidu.com' : undefined;
  89. const sameSiteRaw = c?.sameSite;
  90. const sameSite =
  91. sameSiteRaw === 'Lax' || sameSiteRaw === 'None' || sameSiteRaw === 'Strict'
  92. ? sameSiteRaw
  93. : undefined;
  94. return {
  95. name,
  96. value,
  97. domain,
  98. path: pathVal,
  99. url,
  100. expires: typeof c?.expires === 'number' ? c.expires : undefined,
  101. httpOnly: typeof c?.httpOnly === 'boolean' ? c.httpOnly : undefined,
  102. secure: typeof c?.secure === 'boolean' ? c.secure : undefined,
  103. sameSite,
  104. } satisfies PlaywrightCookie;
  105. })
  106. .filter(Boolean) as PlaywrightCookie[];
  107. } catch {
  108. // fallthrough
  109. }
  110. }
  111. // 2) "a=b; c=d"
  112. const pairs = raw.split(';').map((p) => p.trim()).filter(Boolean);
  113. const cookies: PlaywrightCookie[] = [];
  114. for (const p of pairs) {
  115. const idx = p.indexOf('=');
  116. if (idx <= 0) continue;
  117. const name = p.slice(0, idx).trim();
  118. const value = p.slice(idx + 1).trim();
  119. if (!name) continue;
  120. cookies.push({ name, value, url: 'https://baijiahao.baidu.com' });
  121. }
  122. return cookies;
  123. }
  124. async function createBrowserForAccount(proxy: ProxyConfig | null): Promise<{ browser: Browser; shouldClose: boolean }> {
  125. // 静默同步:默认一律 headless,不弹窗
  126. // 只有在“引导登录/验证”时(BJ_STORAGE_STATE_BOOTSTRAP=1 且 BJ_IMPORT_HEADLESS=0)才允许 headful
  127. const allowHeadfulForBootstrap =
  128. process.env.BJ_STORAGE_STATE_BOOTSTRAP === '1' && process.env.BJ_IMPORT_HEADLESS === '0';
  129. const headless = !allowHeadfulForBootstrap;
  130. if (proxy?.enabled) {
  131. const server = `${proxy.type}://${proxy.host}:${proxy.port}`;
  132. const browser = await chromium.launch({
  133. headless,
  134. proxy: {
  135. server,
  136. username: proxy.username,
  137. password: proxy.password,
  138. },
  139. args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--window-size=1920,1080'],
  140. });
  141. return { browser, shouldClose: true };
  142. }
  143. const browser = await BrowserManager.getBrowser({ headless });
  144. return { browser, shouldClose: false };
  145. }
  146. function parseBaijiahaoExcel(
  147. filePath: string
  148. ): Map<string, { recordDate: Date } & Record<string, any>> {
  149. const wb = XLSX.readFile(filePath);
  150. const result = new Map<string, { recordDate: Date } & Record<string, any>>();
  151. logger.info(
  152. `[BJ Import] Excel loaded. file=${path.basename(filePath)} sheets=${wb.SheetNames.join(' | ')}`
  153. );
  154. for (const sheetName of wb.SheetNames) {
  155. const sheet = wb.Sheets[sheetName];
  156. const rows = XLSX.utils.sheet_to_json<Record<string, any>>(sheet, { defval: '' });
  157. if (!rows.length) {
  158. logger.warn(`[BJ Import] Sheet empty. name=${sheetName}`);
  159. continue;
  160. }
  161. const keys = Object.keys(rows[0] || {});
  162. logger.info(
  163. `[BJ Import] Sheet parsed. name=${sheetName} rows=${rows.length} keys=${keys.join(',')}`
  164. );
  165. // 百家号 Excel 为 GBK 编码,列名在 node 环境下会变成乱码(但列顺序稳定),所以这里按“列位置”做映射:
  166. // 0: 日期(形如 20260115)
  167. // 1: 阅读量
  168. // 2: 点击率
  169. // 3: 互动率
  170. // 4: 评论量
  171. // 5: 评论率(%)
  172. // 6: 点赞量
  173. // 7: 点赞率(%)
  174. // 8: 收藏量
  175. // 9: 收藏率(%)
  176. // 10: 分享量
  177. // 11: 分享率(%)
  178. // 12: 作品涨粉量
  179. // 13: 作品涨粉率
  180. // 14: 作品脱粉量
  181. // ... 其余列暂不入库
  182. for (const row of rows) {
  183. const cols = Object.keys(row || {});
  184. if (!cols.length) continue;
  185. const dateVal = (row as any)[cols[0]];
  186. const d = normalizeDateText(dateVal);
  187. if (!d) continue;
  188. const key = `${d.getFullYear()}-${String(d.getMonth() + 1).padStart(2, '0')}-${String(
  189. d.getDate()
  190. ).padStart(2, '0')}`;
  191. if (!result.has(key)) result.set(key, { recordDate: d });
  192. const obj = result.get(key)!;
  193. const safeGet = (idx: number): any =>
  194. idx >= 0 && idx < cols.length ? (row as any)[cols[idx]] : undefined;
  195. // 阅读量 → playCount
  196. const readCount = parseChineseNumberLike(safeGet(1));
  197. if (typeof readCount === 'number') (obj as any).playCount = readCount;
  198. // 点赞量 → likeCount
  199. const likeCount = parseChineseNumberLike(safeGet(6));
  200. if (typeof likeCount === 'number') (obj as any).likeCount = likeCount;
  201. // 评论量 → commentCount
  202. const commentCount = parseChineseNumberLike(safeGet(4));
  203. if (typeof commentCount === 'number') (obj as any).commentCount = commentCount;
  204. // 收藏量 → collectCount
  205. const collectCount = parseChineseNumberLike(safeGet(8));
  206. if (typeof collectCount === 'number') (obj as any).collectCount = collectCount;
  207. // 分享量 → shareCount
  208. const shareCount = parseChineseNumberLike(safeGet(10));
  209. if (typeof shareCount === 'number') (obj as any).shareCount = shareCount;
  210. // 点击率 → coverClickRate(不为 0 时加 %)
  211. const clickRateRaw = safeGet(2);
  212. const coverClickRate = formatRateWithPercent(clickRateRaw);
  213. if (coverClickRate !== '0') (obj as any).coverClickRate = coverClickRate;
  214. // fans_increase 只看作品涨粉量(不再扣除作品脱粉量)
  215. const inc = parseChineseNumberLike(safeGet(12));
  216. if (typeof inc === 'number') {
  217. (obj as any).fansIncrease = inc;
  218. }
  219. }
  220. }
  221. return result;
  222. }
  223. /** 比率:不为 0 时加上 %,为 0 或空返回 '0' */
  224. function formatRateWithPercent(v: unknown): string {
  225. if (v === null || v === undefined) return '0';
  226. const s = String(v).trim();
  227. if (!s) return '0';
  228. const n = Number(s.replace(/,/g, ''));
  229. if (!Number.isFinite(n) || n === 0) return '0';
  230. if (s.includes('%')) return s;
  231. if (n > 0 && n <= 1) return `${(n * 100).toFixed(2)}%`;
  232. return `${Number(n.toFixed(2))}%`;
  233. }
  234. function formatPercentString(input: unknown): string | null {
  235. if (input === null || input === undefined) return null;
  236. const s = String(input).trim();
  237. if (!s) return null;
  238. if (s.includes('%')) return s;
  239. const n = Number(s);
  240. if (!Number.isFinite(n)) return null;
  241. // 0.0423 -> 4.23%
  242. if (n >= 0 && n <= 1) return `${(n * 100).toFixed(2)}%`;
  243. // 4.23 -> 4.23%
  244. return `${n.toFixed(2)}%`;
  245. }
  246. function findArrayWithDateLikeField(root: any): { arr: any[]; dateKey: string } | null {
  247. const seen = new Set<any>();
  248. const queue: any[] = [root];
  249. const isDateLike = (v: any) => {
  250. if (v === null || v === undefined) return false;
  251. if (typeof v === 'number') return String(v).match(/^\d{8}$/);
  252. const s = String(v).trim();
  253. return /^\d{8}$/.test(s) || /^\d{4}[-/]\d{1,2}[-/]\d{1,2}$/.test(s);
  254. };
  255. const dateKeyCandidates = ['event_day', 'day', 'date', 'stat_day', 'statDay', 'dt', 'time', 'the_day'];
  256. const candidates: Array<{ arr: any[]; dateKey: string }> = [];
  257. while (queue.length) {
  258. const cur = queue.shift();
  259. if (!cur || typeof cur !== 'object') continue;
  260. if (seen.has(cur)) continue;
  261. seen.add(cur);
  262. if (Array.isArray(cur)) {
  263. // 数组元素为对象且含日期字段
  264. for (const item of cur) {
  265. if (!item || typeof item !== 'object') continue;
  266. const keys = Object.keys(item);
  267. for (const dk of dateKeyCandidates) {
  268. if (keys.includes(dk) && isDateLike((item as any)[dk])) {
  269. candidates.push({ arr: cur, dateKey: dk });
  270. break;
  271. }
  272. }
  273. // 兜底:任意字段像日期
  274. for (const k of keys) {
  275. if (isDateLike((item as any)[k])) {
  276. candidates.push({ arr: cur, dateKey: k });
  277. break;
  278. }
  279. }
  280. }
  281. } else {
  282. for (const v of Object.values(cur)) {
  283. if (v && typeof v === 'object') queue.push(v);
  284. }
  285. }
  286. }
  287. if (!candidates.length) return null;
  288. candidates.sort((a, b) => (b.arr?.length ?? 0) - (a.arr?.length ?? 0));
  289. return candidates[0]!;
  290. }
  291. function parseBaijiahaoAppStatisticV3(json: any): Map<string, { recordDate: Date } & Record<string, any>> {
  292. const result = new Map<string, { recordDate: Date } & Record<string, any>>();
  293. const found = findArrayWithDateLikeField(json);
  294. if (!found) return result;
  295. const { arr, dateKey } = found;
  296. const pickNumber = (obj: any, keys: string[]): number | null => {
  297. for (const k of keys) {
  298. if (obj?.[k] === undefined || obj?.[k] === null) continue;
  299. const n = parseChineseNumberLike(obj[k]);
  300. if (typeof n === 'number') return n;
  301. }
  302. return null;
  303. };
  304. const pickString = (obj: any, keys: string[]): string | null => {
  305. for (const k of keys) {
  306. if (obj?.[k] === undefined || obj?.[k] === null) continue;
  307. const s = String(obj[k]).trim();
  308. if (s) return s;
  309. }
  310. return null;
  311. };
  312. for (const item of arr) {
  313. if (!item || typeof item !== 'object') continue;
  314. const d = normalizeDateText(item[dateKey]);
  315. if (!d) continue;
  316. const key = `${d.getFullYear()}-${String(d.getMonth() + 1).padStart(2, '0')}-${String(d.getDate()).padStart(2, '0')}`;
  317. if (!result.has(key)) result.set(key, { recordDate: d });
  318. const obj = result.get(key)!;
  319. // 阅读量 → playCount(百家号 appStatisticV3 使用 view_count)
  320. const play = pickNumber(item, ['view_count', 'read_cnt', 'readCount', 'read', 'pv', 'view_cnt', 'viewCount', 'views']);
  321. if (typeof play === 'number') (obj as any).playCount = play;
  322. // 点赞量 → likeCount(百家号 API 使用 likes_count)
  323. const like = pickNumber(item, ['likes_count', 'like_cnt', 'praise_cnt', 'praise', 'likeCount', 'likes']);
  324. if (typeof like === 'number') (obj as any).likeCount = like;
  325. // 评论量 → commentCount(百家号 API 使用 comment_count)
  326. const comment = pickNumber(item, ['comment_count', 'comment_cnt', 'commentCount', 'comments']);
  327. if (typeof comment === 'number') (obj as any).commentCount = comment;
  328. // 收藏量 → collectCount(百家号 API 字段为 collect_count)
  329. const collect = pickNumber(item, ['collect_count', 'collect_cnt', 'favorite_cnt', 'fav_cnt', 'collectCount', 'favorites']);
  330. if (typeof collect === 'number') (obj as any).collectCount = collect;
  331. // 分享量 → shareCount(百家号 API 使用 share_count)
  332. const share = pickNumber(item, ['share_count', 'share_cnt', 'shareCount', 'shares']);
  333. if (typeof share === 'number') (obj as any).shareCount = share;
  334. // 点击率 → coverClickRate
  335. const clickRateRaw =
  336. pickString(item, ['click_rate', 'ctr', 'clickRate']) ??
  337. (typeof pickNumber(item, ['click_rate', 'ctr', 'clickRate']) === 'number'
  338. ? String(pickNumber(item, ['click_rate', 'ctr', 'clickRate']))
  339. : null);
  340. const clickRate = formatPercentString(clickRateRaw);
  341. if (clickRate) (obj as any).coverClickRate = clickRate;
  342. // 作品涨粉量 → fansIncrease(百家号 API 使用 fans_increase / fans_add_cnt)
  343. const fansInc = pickNumber(item, ['fans_increase', 'fans_add_cnt', 'works_fans_inc', 'worksFansInc', 'content_fans_inc', 'fans_inc', 'fansIncrease']);
  344. if (typeof fansInc === 'number') (obj as any).fansIncrease = fansInc;
  345. }
  346. return result;
  347. }
  348. export class BaijiahaoContentOverviewImportService {
  349. private accountRepository = AppDataSource.getRepository(PlatformAccount);
  350. private userDayStatisticsService = new UserDayStatisticsService();
  351. private downloadDir = path.resolve(process.cwd(), 'tmp', 'baijiahao-content-overview');
  352. private stateDir = path.resolve(process.cwd(), 'tmp', 'baijiahao-storage-state');
  /**
   * Builds the storage-state file path for an account:
   * `<stateDir>/<accountId>.json`.
   *
   * @param accountId Id of the platform account.
   */
  private getStatePath(accountId: number) {
    const fileName = `${accountId}.json`;
    return path.join(this.stateDir, fileName);
  }
  /**
   * Returns the path of the account's saved Playwright storage state, creating
   * it through an interactive bootstrap when allowed.
   *
   * - If the state file already exists, its path is returned immediately.
   * - Otherwise a bootstrap runs only when `BJ_IMPORT_HEADLESS=0` and
   *   `BJ_STORAGE_STATE_BOOTSTRAP=1`: a browser opens the Baijiahao analysis
   *   page with the account cookies and waits up to 5 minutes for the
   *   dashboard text to appear, giving the operator time to finish
   *   login/verification by hand. The resulting state is then written to disk.
   * - If the wait times out and the page is still on a login/passport URL,
   *   nothing is saved and `null` is returned.
   *
   * @param account Account whose state is needed.
   * @param cookies Cookies seeded into the bootstrap context.
   * @returns Path to the state file, or `null` when no state exists and the
   *          bootstrap is disabled or did not reach a logged-in page.
   */
  private async ensureStorageState(
    account: PlatformAccount,
    cookies: PlaywrightCookie[]
  ): Promise<string | null> {
    const statePath = this.getStatePath(account.id);
    try {
      await fs.access(statePath);
      return statePath;
    } catch {
      // No saved state yet.
    }

    // Bootstrapping needs a human in front of a visible browser, so it is opt-in:
    // BJ_IMPORT_HEADLESS=0 and BJ_STORAGE_STATE_BOOTSTRAP=1.
    const bootstrapEnabled =
      process.env.BJ_IMPORT_HEADLESS === '0' && process.env.BJ_STORAGE_STATE_BOOTSTRAP === '1';
    if (!bootstrapEnabled) {
      return null;
    }

    await ensureDir(this.stateDir);
    logger.warn(
      `[BJ Import] No storageState for accountId=${account.id}. Bootstrapping... 请在弹出的浏览器中完成登录/验证。`
    );

    const { browser, shouldClose } = await createBrowserForAccount(account.proxyConfig);
    let context: BrowserContext | null = null;
    try {
      context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        locale: 'zh-CN',
        timezoneId: 'Asia/Shanghai',
      });
      await context.addCookies(cookies as any);
      const page = await context.newPage();
      await page.goto('https://baijiahao.baidu.com/builder/rc/analysiscontent', {
        waitUntil: 'domcontentloaded',
      });

      // Wait up to 5 minutes for the dashboard. Playwright's signature is
      // waitForFunction(fn, arg, options): the options object must be the
      // THIRD argument, otherwise it is sent to the page as `arg` and the
      // default timeout applies instead.
      let dashboardVisible = true;
      try {
        await page.waitForFunction(
          () => {
            const t = document.body?.innerText || '';
            return t.includes('数据中心') || t.includes('内容分析') || t.includes('基础数据');
          },
          undefined,
          { timeout: 5 * 60_000 }
        );
      } catch {
        dashboardVisible = false;
      }

      if (!dashboardVisible) {
        // Timed out. Saving is still attempted (best effort) unless the page
        // is clearly a login page: a logged-out state file would be reused on
        // every later run and would also bypass the cookie fallback.
        const currentUrl = page.url();
        if (currentUrl.includes('passport') || currentUrl.includes('login')) {
          logger.warn(
            `[BJ Import] storageState bootstrap timed out on login page; nothing saved. accountId=${account.id}`
          );
          return null;
        }
      }

      await context.storageState({ path: statePath });
      logger.info(`[BJ Import] storageState saved: ${statePath}`);
      return statePath;
    } finally {
      // Always release the context: the browser may be a shared instance that stays open.
      if (context) await context.close().catch(() => undefined);
      if (shouldClose) await browser.close().catch(() => undefined);
    }
  }
  /**
   * Calls the Python service endpoint `/baijiahao/app_statistic_v3` with the
   * account's stored cookie (the same login material used to open the
   * Baijiahao backend).
   *
   * The request is a JSON POST of `{ cookie, start_day, end_day }` and is
   * aborted after 30 seconds.
   *
   * @param account Account whose `cookieData` is forwarded.
   * @param startDay First day of the range, sent as `start_day`.
   * @param endDay Last day of the range, sent as `end_day`.
   * @returns The parsed JSON body (`{}` when the body is empty).
   * @throws Error when the cookie is empty, the response status is not OK, or
   *         an OK response carries a body that is not valid JSON.
   */
  private async fetchAppStatisticV3ViaPython(
    account: PlatformAccount,
    startDay: string,
    endDay: string
  ): Promise<Record<string, unknown>> {
    const base = (await getPythonServiceBaseUrl()).replace(/\/$/, '');
    const url = `${base}/baijiahao/app_statistic_v3`;
    const cookie = String(account.cookieData || '').trim();
    if (!cookie) throw new Error('百家号账号 cookie 为空,无法调用 Python app_statistic_v3');

    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 30_000);
    try {
      const res = await fetch(url, {
        method: 'POST',
        signal: controller.signal,
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ cookie, start_day: startDay, end_day: endDay }),
      });
      const text = await res.text();

      // Parse defensively: an error response may carry HTML or plain text, and
      // a raw SyntaxError would hide the HTTP status that explains the failure.
      let data: Record<string, unknown> = {};
      let bodyIsInvalidJson = false;
      if (text) {
        try {
          data = JSON.parse(text) as Record<string, unknown>;
        } catch {
          bodyIsInvalidJson = true;
        }
      }

      if (!res.ok) {
        const msg = String(data?.errmsg || data?.error || '').trim() || `HTTP ${res.status}`;
        throw new Error(`Python app_statistic_v3 调用失败: ${msg}`);
      }
      if (bodyIsInvalidJson) {
        throw new Error(`Python app_statistic_v3 调用失败: 响应不是合法的 JSON (HTTP ${res.status})`);
      }
      return data;
    } finally {
      clearTimeout(timeoutId);
    }
  }
  /**
   * Calls the Python service endpoint `/baijiahao/fans_basic_info` with the
   * account's stored cookie (the same login material used to open the
   * Baijiahao backend).
   *
   * The request is a JSON POST of `{ cookie, start, end }` and is aborted
   * after 30 seconds.
   *
   * @param account Account whose `cookieData` is forwarded.
   * @param start First day of the range, sent as `start`.
   * @param end Last day of the range, sent as `end`.
   * @returns The parsed JSON body (`{}` when the body is empty).
   * @throws Error when the cookie is empty, the response status is not OK, or
   *         an OK response carries a body that is not valid JSON.
   */
  private async fetchFansBasicInfoViaPython(
    account: PlatformAccount,
    start: string,
    end: string
  ): Promise<Record<string, unknown>> {
    const base = (await getPythonServiceBaseUrl()).replace(/\/$/, '');
    const url = `${base}/baijiahao/fans_basic_info`;
    const cookie = String(account.cookieData || '').trim();
    if (!cookie) throw new Error('百家号账号 cookie 为空,无法调用 Python fans_basic_info');

    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), 30_000);
    try {
      const res = await fetch(url, {
        method: 'POST',
        signal: controller.signal,
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ cookie, start, end }),
      });
      const text = await res.text();

      // Parse defensively: an error response may carry HTML or plain text, and
      // a raw SyntaxError would hide the HTTP status that explains the failure.
      let data: Record<string, unknown> = {};
      let bodyIsInvalidJson = false;
      if (text) {
        try {
          data = JSON.parse(text) as Record<string, unknown>;
        } catch {
          bodyIsInvalidJson = true;
        }
      }

      if (!res.ok) {
        const msg = String(data?.errmsg || data?.error || '').trim() || `HTTP ${res.status}`;
        throw new Error(`Python fans_basic_info 调用失败: ${msg}`);
      }
      if (bodyIsInvalidJson) {
        throw new Error(`Python fans_basic_info 调用失败: 响应不是合法的 JSON (HTTP ${res.status})`);
      }
      return data;
    } finally {
      clearTimeout(timeoutId);
    }
  }
  /**
   * Single entry point, per the original author's note shared by the scheduled
   * job and the add-account flow: imports "content analysis / basic data /
   * last 30 days" plus the fans data (getFansBasicInfo).
   *
   * Creates a fresh service instance and runs the import for every Baijiahao
   * account.
   */
  static async runDailyImport(): Promise<void> {
    const service = new BaijiahaoContentOverviewImportService();
    return service.runDailyImportForAllBaijiahaoAccounts();
  }
  /**
   * Imports the last-30-days "data center / content analysis / basic data"
   * figures into user_day_statistics for every Baijiahao account.
   *
   * Accounts are processed one after another. A failure for one account is
   * logged and does not stop the remaining accounts.
   */
  async runDailyImportForAllBaijiahaoAccounts(): Promise<void> {
    await ensureDir(this.downloadDir);

    const baijiahaoAccounts = await this.accountRepository.find({
      where: { platform: 'baijiahao' as any },
    });
    logger.info(`[BJ Import] Start. total_accounts=${baijiahaoAccounts.length}`);

    // Per-account wrapper: isolates failures so the loop always continues.
    const importOne = async (account: PlatformAccount): Promise<void> => {
      try {
        await this.importAccountLast30Days(account);
      } catch (e) {
        logger.error(
          `[BJ Import] Account failed. accountId=${account.id} name=${account.accountName || ''}`,
          e
        );
      }
    };

    for (const account of baijiahaoAccounts) {
      await importOne(account);
    }

    logger.info('[BJ Import] Done.');
  }
  495. /**
  496. * 单账号:优先 Python+Node(登录与打开后台一致,使用账号已存 Cookie);失败则刷新重试一次,再失败则浏览器兜底
  497. */
  498. async importAccountLast30Days(account: PlatformAccount, isRetry = false): Promise<void> {
  499. const cookies = parseCookiesFromAccount(account.cookieData);
  500. if (!cookies.length) throw new Error('cookieData 为空或无法解析');
  501. const end = new Date();
  502. end.setHours(0, 0, 0, 0);
  503. end.setDate(end.getDate() - 1);
  504. const start = new Date(end);
  505. start.setDate(start.getDate() - 29);
  506. const fmt = (d: Date) =>
  507. `${d.getFullYear()}${String(d.getMonth() + 1).padStart(2, '0')}${String(d.getDate()).padStart(2, '0')}`;
  508. const start_day = fmt(start);
  509. const end_day = fmt(end);
  510. const chinaTz = 'Asia/Shanghai';
  511. const toChinaYMD = (date: Date): { y: number; m: number; d: number } => {
  512. const formatter = new Intl.DateTimeFormat('en-CA', {
  513. timeZone: chinaTz,
  514. year: 'numeric',
  515. month: '2-digit',
  516. day: '2-digit',
  517. });
  518. const parts = formatter.formatToParts(date);
  519. const get = (type: string) => parts.find((p) => p.type === type)?.value ?? '0';
  520. return {
  521. y: parseInt(get('year'), 10),
  522. m: parseInt(get('month'), 10),
  523. d: parseInt(get('day'), 10),
  524. };
  525. };
  526. const now = new Date();
  527. const today = toChinaYMD(now);
  528. const yesterdayDate = new Date(
  529. Date.UTC(today.y, today.m - 1, today.d, 0, 0, 0, 0) - 24 * 60 * 60 * 1000
  530. );
  531. const startDate = new Date(yesterdayDate.getTime() - 29 * 24 * 60 * 60 * 1000);
  532. const endYMD = toChinaYMD(yesterdayDate);
  533. const startYMD = toChinaYMD(startDate);
  534. const pad = (n: number) => String(n).padStart(2, '0');
  535. const startStr = `${startYMD.y}${pad(startYMD.m)}${pad(startYMD.d)}`;
  536. const endStr = `${endYMD.y}${pad(endYMD.m)}${pad(endYMD.d)}`;
  537. // 优先 Python(登录与打开后台一致:仅用账号已存 Cookie,不启浏览器)
  538. try {
  539. const data = await this.fetchAppStatisticV3ViaPython(account, start_day, end_day);
  540. const errno = typeof data?.errno === 'number' ? data.errno : Number(data?.errno ?? -1);
  541. if (errno !== 0) throw new Error(data?.errmsg ? String(data.errmsg) : 'appStatisticV3 errno !== 0');
  542. const perDay = parseBaijiahaoAppStatisticV3(data);
  543. if (perDay.size === 0) throw new Error('appStatisticV3 解析后无数据');
  544. let inserted = 0;
  545. let updated = 0;
  546. for (const v of perDay.values()) {
  547. const { recordDate, ...patch } = v;
  548. const r = await this.userDayStatisticsService.saveStatisticsForDate(account.id, recordDate, patch);
  549. inserted += r.inserted;
  550. updated += r.updated;
  551. }
  552. logger.info(
  553. `[BJ Import] basic-data (via Python). accountId=${account.id} days=${perDay.size} inserted=${inserted} updated=${updated}`
  554. );
  555. try {
  556. const fansBody = await this.fetchFansBasicInfoViaPython(account, startStr, endStr);
  557. const fansErrno = (fansBody as any).errno;
  558. if (fansErrno === 0 || fansErrno === undefined) {
  559. const list = this.parseGetFansBasicInfoResponse(fansBody as Record<string, unknown>);
  560. let fansUpdated = 0;
  561. for (const { recordDate, fansCount, fansIncrease } of list) {
  562. const r = await this.userDayStatisticsService.saveStatisticsForDate(
  563. account.id,
  564. recordDate,
  565. { fansCount, fansIncrease }
  566. );
  567. fansUpdated += r.inserted + r.updated;
  568. }
  569. logger.info(`[BJ Import] Fans data (via Python). accountId=${account.id} days=${list.length} updated=${fansUpdated}`);
  570. }
  571. } catch (e) {
  572. logger.warn(`[BJ Import] Fans via Python failed (non-fatal). accountId=${account.id}`, e instanceof Error ? e.message : e);
  573. }
  574. return;
  575. } catch (pythonError) {
  576. logger.warn(
  577. `[BJ Import] Python path failed, fallback to browser. accountId=${account.id}`,
  578. pythonError instanceof Error ? pythonError.message : pythonError
  579. );
  580. }
  581. if (!isRetry) {
  582. try {
  583. const accountService = new AccountService();
  584. const refreshResult = await accountService.refreshAccount(account.userId, account.id);
  585. if (!refreshResult.needReLogin) {
  586. const refreshedAccount = await this.accountRepository.findOne({ where: { id: account.id } });
  587. if (refreshedAccount) {
  588. logger.info(`[BJ Import] Account ${account.id} refreshed, retrying import...`);
  589. return await this.importAccountLast30Days(refreshedAccount, true);
  590. }
  591. }
  592. } catch (refreshError) {
  593. logger.error(`[BJ Import] Account ${account.id} refresh failed:`, refreshError);
  594. }
  595. }
  596. // 浏览器兜底:原有逻辑不变
  597. const { browser, shouldClose } = await createBrowserForAccount(account.proxyConfig);
  598. try {
  599. const statePath = await this.ensureStorageState(account, cookies);
  600. const context = await browser.newContext({
  601. acceptDownloads: true,
  602. viewport: { width: 1920, height: 1080 },
  603. locale: 'zh-CN',
  604. timezoneId: 'Asia/Shanghai',
  605. userAgent:
  606. 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  607. ...(statePath ? { storageState: statePath } : {}),
  608. });
  609. context.setDefaultTimeout(60_000);
  610. if (!statePath) await context.addCookies(cookies as any);
  611. const page = await context.newPage();
  612. await page.goto('https://baijiahao.baidu.com/builder/rc/analysiscontent', { waitUntil: 'domcontentloaded' });
  613. await page.waitForTimeout(1500);
  614. if (page.url().includes('passport') || page.url().includes('login')) {
  615. if (!isRetry) {
  616. logger.info(`[BJ Import] Login expired for account ${account.id}, attempting refresh...`);
  617. await context.close();
  618. if (shouldClose) await browser.close();
  619. try {
  620. const accountService = new AccountService();
  621. const refreshResult = await accountService.refreshAccount(account.userId, account.id);
  622. if (refreshResult.needReLogin) {
  623. logger.warn(`[BJ Import] Account ${account.id} refresh failed, still needs re-login`);
  624. throw new Error('未登录/需要重新登录(跳转到登录页)');
  625. }
  626. const refreshedAccount = await this.accountRepository.findOne({ where: { id: account.id } });
  627. if (!refreshedAccount) throw new Error('账号刷新后未找到');
  628. return await this.importAccountLast30Days(refreshedAccount, true);
  629. } catch (refreshError) {
  630. logger.error(`[BJ Import] Account ${account.id} refresh failed:`, refreshError);
  631. throw new Error('未登录/需要重新登录(跳转到登录页)');
  632. }
  633. }
  634. throw new Error('未登录/需要重新登录(跳转到登录页)');
  635. }
  636. const bodyText = (await page.textContent('body').catch(() => '')) || '';
  637. if (bodyText.includes('暂无数据') || bodyText.includes('无权访问')) {
  638. await this.accountRepository.update(account.id, { status: 'expired' as any });
  639. wsManager.sendToUser(account.userId, WS_EVENTS.ACCOUNT_UPDATED, {
  640. account: { id: account.id, status: 'expired', platform: 'baijiahao' },
  641. });
  642. wsManager.sendToUser(account.userId, WS_EVENTS.SYSTEM_MESSAGE, {
  643. level: 'warning',
  644. message: `百家号账号「${account.accountName || account.accountId || account.id}」暂无数据看板访问权限,请到百家号后台检查数据权限。`,
  645. platform: 'baijiahao',
  646. accountId: account.id,
  647. });
  648. throw new Error('百家号数据看板暂无访问权限/暂无数据,已标记 expired 并通知用户');
  649. }
  650. await page.getByText('数据中心', { exact: false }).first().click().catch(() => undefined);
  651. await page.getByText('内容分析', { exact: false }).first().click().catch(() => undefined);
  652. await page.getByText('基础数据', { exact: false }).first().click().catch(() => undefined);
  653. try {
  654. const trigger = page.getByText(/近\d+天?/, { exact: false }).first();
  655. if ((await trigger.count()) > 0) await trigger.click().catch(() => undefined);
  656. const thirtyDay =
  657. (await page.getByText('近30天', { exact: true }).first().count()) > 0
  658. ? page.getByText('近30天', { exact: true }).first()
  659. : page.getByText('近30日', { exact: false }).first();
  660. await thirtyDay.click().catch(() => undefined);
  661. await page.waitForTimeout(5000);
  662. } catch (e) {
  663. logger.warn(`[BJ Import] Unable to switch to 近30天. accountId=${account.id}`, e);
  664. }
  665. let perDay = new Map<string, { recordDate: Date } & Record<string, any>>();
  666. let inserted = 0;
  667. let updated = 0;
  668. const tryFetchApi = async () => {
  669. const apiUrl = `https://baijiahao.baidu.com/author/eco/statistics/appStatisticV3?type=all&start_day=${start_day}&end_day=${end_day}&stat=0&special_filter_days=30`;
  670. const res = await (context as any).request.get(apiUrl, {
  671. headers: { Referer: 'https://baijiahao.baidu.com/builder/rc/analysiscontent' },
  672. });
  673. if (!res.ok()) throw new Error(`appStatisticV3 http ${res.status()}`);
  674. const json = await res.json().catch(() => null);
  675. if (!json) throw new Error('appStatisticV3 json parse failed');
  676. if (process.env.BJ_IMPORT_DEBUG === '1') {
  677. const debugPath = path.join(this.downloadDir, `appStatisticV3_response_${account.id}_${Date.now()}.json`);
  678. await ensureDir(this.downloadDir);
  679. await fs.writeFile(debugPath, JSON.stringify(json, null, 2), 'utf-8');
  680. logger.info(`[BJ Import] DEBUG: appStatisticV3 原始响应已写入 ${debugPath}`);
  681. }
  682. return parseBaijiahaoAppStatisticV3(json);
  683. };
  684. try {
  685. perDay = await tryFetchApi();
  686. } catch (e) {
  687. logger.warn(`[BJ Import] appStatisticV3 failed, fallback to Excel. accountId=${account.id}`, e);
  688. }
  689. let filePath: string | null = null;
  690. if (perDay.size === 0) {
  691. const [download] = await Promise.all([
  692. page.waitForEvent('download', { timeout: 60_000 }),
  693. page.getByText('导出数据', { exact: true }).first().click(),
  694. ]);
  695. filePath = path.join(this.downloadDir, `${account.id}_${Date.now()}_${download.suggestedFilename()}`);
  696. await download.saveAs(filePath);
  697. perDay = parseBaijiahaoExcel(filePath);
  698. } else if (perDay.size < 20) {
  699. const [download] = await Promise.all([
  700. page.waitForEvent('download', { timeout: 60_000 }),
  701. page.getByText('导出数据', { exact: true }).first().click(),
  702. ]);
  703. filePath = path.join(this.downloadDir, `${account.id}_${Date.now()}_${download.suggestedFilename()}`);
  704. await download.saveAs(filePath);
  705. const excelMap = parseBaijiahaoExcel(filePath);
  706. for (const [k, v] of excelMap.entries()) {
  707. if (!perDay.has(k)) perDay.set(k, v);
  708. }
  709. }
  710. try {
  711. for (const v of perDay.values()) {
  712. const { recordDate, ...patch } = v;
  713. const r = await this.userDayStatisticsService.saveStatisticsForDate(account.id, recordDate, patch);
  714. inserted += r.inserted;
  715. updated += r.updated;
  716. }
  717. logger.info(`[BJ Import] basic-data (browser). accountId=${account.id} days=${perDay.size} inserted=${inserted} updated=${updated}`);
  718. } finally {
  719. if (filePath && process.env.KEEP_BJ_XLSX !== 'true') await fs.unlink(filePath).catch(() => undefined);
  720. }
  721. try {
  722. await this.importFansDataByApi(context, account);
  723. } catch (e) {
  724. logger.warn(`[BJ Import] Fans import failed (non-fatal). accountId=${account.id}`, e instanceof Error ? e.message : e);
  725. }
  726. await context.close();
  727. } finally {
  728. if (shouldClose) await browser.close().catch(() => undefined);
  729. }
  730. }
  /**
   * Imports follower ("fans") statistics by requesting the getFansBasicInfo endpoint directly,
   * without opening a page.
   *
   * The requested window is 30 calendar days ending on yesterday, where "yesterday" is evaluated
   * in the Asia/Shanghai time zone so that a server running in another zone still asks for the
   * full range. Every returned day is written through
   * `userDayStatisticsService.saveStatisticsForDate`
   * (sum_fans_count → fansCount, new_fans_count → fansIncrease).
   *
   * A failed request, a non-2xx status, an unparseable body, a non-zero `errno` and an empty
   * list are all logged and end the import without throwing. Errors raised while saving are
   * not caught here.
   *
   * @param context Browser context whose `request` client performs the HTTP call.
   * @param account Account the statistics are stored for.
   */
  private async importFansDataByApi(context: BrowserContext, account: PlatformAccount): Promise<void> {
    const chinaTz = 'Asia/Shanghai';
    const dayMs = 24 * 60 * 60 * 1000;

    // Breaks a timestamp into the calendar year/month/day it has in the China time zone.
    const toChinaYMD = (date: Date): { y: number; m: number; d: number } => {
      const formatter = new Intl.DateTimeFormat('en-CA', { timeZone: chinaTz, year: 'numeric', month: '2-digit', day: '2-digit' });
      const parts = formatter.formatToParts(date);
      const get = (type: string) => parts.find((p) => p.type === type)?.value ?? '0';
      return { y: parseInt(get('year'), 10), m: parseInt(get('month'), 10), d: parseInt(get('day'), 10) };
    };

    const now = new Date();
    const today = toChinaYMD(now);
    // UTC midnight of today's China calendar date, moved back one day: the end of the range.
    const yesterdayDate = new Date(Date.UTC(today.y, today.m - 1, today.d, 0, 0, 0, 0) - dayMs);
    // 29 days before the end date gives a 30-day inclusive window.
    const startDate = new Date(yesterdayDate.getTime() - 29 * dayMs);
    const endYMD = toChinaYMD(yesterdayDate);
    const startYMD = toChinaYMD(startDate);
    const pad = (n: number) => String(n).padStart(2, '0');
    const startStr = `${startYMD.y}${pad(startYMD.m)}${pad(startYMD.d)}`;
    const endStr = `${endYMD.y}${pad(endYMD.m)}${pad(endYMD.d)}`;
    const apiUrl = `https://baijiahao.baidu.com/author/eco/statistics/getFansBasicInfo?start=${startStr}&end=${endStr}&fans_type=new%2Csum&sort=asc&is_page=0&show_type=chart`;
    logger.info(`[BJ Import] getFansBasicInfo range (China). accountId=${account.id} start=${startStr} end=${endStr}`);

    let body: Record<string, unknown> | null = null;
    try {
      // NOTE(review): the `as any` cast mirrors the other `context.request` call in this file;
      // presumably the installed typings do not expose `request` — confirm before removing.
      const res = await (context as any).request.get(apiUrl, {
        headers: { Referer: 'https://baijiahao.baidu.com/builder/rc/analysisfans/basedata' },
      });
      if (!res.ok()) {
        // Report the HTTP status instead of letting it surface as an "invalid JSON" warning.
        logger.warn(`[BJ Import] getFansBasicInfo http ${res.status()}, skip. accountId=${account.id}`);
        return;
      }
      body = await res.json().catch(() => null);
    } catch (e) {
      logger.warn(`[BJ Import] getFansBasicInfo request failed. accountId=${account.id}`, e);
      return;
    }

    if (!body || typeof body !== 'object') {
      logger.warn(`[BJ Import] getFansBasicInfo response not valid JSON, skip. accountId=${account.id}`);
      return;
    }

    // A missing errno is tolerated; any value other than 0 aborts the import.
    const errno = (body as any).errno;
    if (errno !== 0 && errno !== undefined) {
      logger.warn(`[BJ Import] getFansBasicInfo errno=${errno}, skip. accountId=${account.id}`);
      return;
    }

    const list = this.parseGetFansBasicInfoResponse(body);
    if (!list.length) {
      logger.info(`[BJ Import] No fans data from getFansBasicInfo. accountId=${account.id}`);
      return;
    }

    const firstDay = list[0]?.recordDate;
    const lastDay = list[list.length - 1]?.recordDate;
    // Log-only formatting; uses the process-local calendar fields of the Date.
    const fmtDay = (d: Date | undefined) =>
      d ? `${d.getFullYear()}-${String(d.getMonth() + 1).padStart(2, '0')}-${String(d.getDate()).padStart(2, '0')}` : '';
    logger.info(`[BJ Import] getFansBasicInfo response. accountId=${account.id} count=${list.length} first=${fmtDay(firstDay)} last=${fmtDay(lastDay)}`);

    // Counts every row the service reports as inserted or updated.
    let updated = 0;
    for (const { recordDate, fansCount, fansIncrease } of list) {
      const r = await this.userDayStatisticsService.saveStatisticsForDate(account.id, recordDate, {
        fansCount,
        fansIncrease,
      });
      updated += r.inserted + r.updated;
    }
    logger.info(`[BJ Import] Fans data imported. accountId=${account.id} days=${list.length} updated=${updated}`);
  }
  /**
   * Extracts per-day follower figures from a getFansBasicInfo response body.
   *
   * Reads `body.data.list` and maps `sum_fans_count` → `fansCount` and `new_fans_count` →
   * `fansIncrease` for each usable entry. An entry is dropped when it is not an object, when
   * `day` is missing or is not exactly eight digits, or when `normalizeDateText` returns a
   * falsy value. Counts that are absent, empty, "--" or otherwise non-numeric become 0;
   * numeric values are rounded and clamped to be non-negative.
   *
   * @param body Parsed JSON body of the response.
   * @returns Rows in response order; empty when `data` is not an object or `data.list` is not an array.
   */
  private parseGetFansBasicInfoResponse(
    body: Record<string, unknown>
  ): Array<{ recordDate: Date; fansCount: number; fansIncrease: number }> {
    // Coerces a raw count into a non-negative integer, falling back to 0.
    const asCount = (raw: unknown): number => {
      if (raw === null || raw === undefined) {
        return 0;
      }
      if (typeof raw === 'number' && Number.isFinite(raw)) {
        return Math.max(0, Math.round(raw));
      }
      const text = String(raw).trim();
      if (text === '' || text === '--') {
        return 0;
      }
      // Thousands separators are stripped before parsing.
      const parsed = Number(text.replace(/,/g, ''));
      if (!Number.isFinite(parsed)) {
        return 0;
      }
      return Math.max(0, Math.round(parsed));
    };

    const rows: Array<{ recordDate: Date; fansCount: number; fansIncrease: number }> = [];

    const payload = body.data as Record<string, unknown> | undefined;
    if (!payload || typeof payload !== 'object') {
      return rows;
    }
    const entries = payload.list as unknown[] | undefined;
    if (!Array.isArray(entries)) {
      return rows;
    }

    for (const entry of entries) {
      if (!entry || typeof entry !== 'object') {
        continue;
      }
      const record = entry as Record<string, unknown>;

      const rawDay = record.day;
      if (rawDay == null) {
        continue;
      }
      // Only compact yyyymmdd-style values are accepted.
      const dayText = String(rawDay).trim();
      if (!/^\d{8}$/.test(dayText)) {
        continue;
      }
      const recordDate = normalizeDateText(dayText);
      if (!recordDate) {
        continue;
      }

      const rawTotal = record.sum_fans_count;
      const rawNew = record.new_fans_count;
      rows.push({
        recordDate,
        fansCount: asCount(rawTotal),
        fansIncrease: asCount(rawNew),
      });
    }

    return rows;
  }
  830. }