| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558 |
- import fs from 'node:fs/promises';
- import path from 'node:path';
- import { chromium, type Browser } from 'playwright';
- import * as XLSXNS from 'xlsx';
- import { AppDataSource, PlatformAccount } from '../models/index.js';
- import { BrowserManager } from '../automation/browser.js';
- import { logger } from '../utils/logger.js';
- import { UserDayStatisticsService } from './UserDayStatisticsService.js';
- import type { ProxyConfig } from '@media-manager/shared';
- import { WS_EVENTS } from '@media-manager/shared';
- import { wsManager } from '../websocket/index.js';
// Under ESM the xlsx API may be exposed on `default`; fall back to the namespace object itself.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const XLSX: any = ((ns: any) => ns.default ?? ns)(XLSXNS);
/**
 * Cookie record produced by the cookie parser in this file and passed to
 * `context.addCookies`. Only `name` and `value` are mandatory.
 */
type PlaywrightCookie = {
  /** Cookie name. */
  name: string;
  /** Cookie value. */
  value: string;
  /** Target URL; the parser fills this in when no `domain` is known. */
  url?: string;
  /** Cookie domain, when the stored cookie data provides one. */
  domain?: string;
  /** Cookie path. */
  path?: string;
  /** Expiry as a number, copied verbatim from the stored cookie data. */
  expires?: number;
  secure?: boolean;
  httpOnly?: boolean;
  sameSite?: 'Strict' | 'Lax' | 'None';
};
/**
 * Creates directory `p`, including any missing parent directories.
 *
 * @param p Directory path to create.
 * @returns The promise returned by `fs.mkdir` in recursive mode.
 */
function ensureDir(p: string) {
  const options = { recursive: true } as const;
  return fs.mkdir(p, options);
}
/**
 * Converts a date-like cell value into a `Date` at local midnight.
 *
 * Accepted inputs:
 * - a valid `Date` instance (copied; its time part is zeroed),
 * - text containing `yyyy<sep>m<sep>d`, e.g. `2026/1/27` or `2026-01-27`,
 * - compact `yyyymmdd` text, e.g. `20260127`.
 *
 * @param input Raw cell value.
 * @returns The normalized date, or `null` when the input is empty, is not
 *   recognised, or names an impossible calendar date (month 13, Feb 30, day 0).
 */
function normalizeDateText(input: unknown): Date | null {
  if (!input) return null;

  if (input instanceof Date && !Number.isNaN(input.getTime())) {
    const copy = new Date(input);
    copy.setHours(0, 0, 0, 0);
    return copy;
  }

  const text = String(input).trim();
  if (!text) return null;

  // Separated form first (2026/1/27, 2026-01-27), then the compact form (20260127).
  const match = text.match(/(\d{4})\D(\d{1,2})\D(\d{1,2})/) ?? text.match(/^(\d{4})(\d{2})(\d{2})$/);
  if (!match) return null;

  const yyyy = Number(match[1]);
  const mm = Number(match[2]);
  const dd = Number(match[3]);
  if (!yyyy || !mm || !dd) return null;

  const date = new Date(yyyy, mm - 1, dd);
  date.setHours(0, 0, 0, 0);

  // `new Date(y, m, d)` silently rolls out-of-range parts over (Feb 30 -> Mar 2).
  // Reject such input instead of attributing the row to a different day.
  if (date.getFullYear() !== yyyy || date.getMonth() !== mm - 1 || date.getDate() !== dd) {
    return null;
  }
  return date;
}
/**
 * Parses a numeric cell that may use thousands separators or the Chinese
 * magnitude suffixes `万` (x10,000) and `亿` (x100,000,000).
 *
 * Examples: `"1,234"` -> 1234, `"3.5万"` -> 35000, `"-1.5万"` -> -15000.
 *
 * @param input Raw cell value (string, number, or anything stringifiable).
 * @returns The value rounded to an integer, or `null` when the input is
 *   null/undefined/blank, contains no digits at all, or is not a finite number.
 */
function parseChineseNumberLike(input: unknown): number | null {
  if (input === null || input === undefined) return null;
  const text = String(input).trim();
  if (!text) return null;

  const plain = text.replace(/,/g, '');

  // Optional sign so that negative values keep their magnitude suffix.
  const withUnit = plain.match(/^([+-]?\d+(?:\.\d+)?)\s*([万亿])$/);
  if (withUnit) {
    const factor = withUnit[2] === '万' ? 10_000 : 100_000_000;
    return Math.round(Number(withUnit[1]) * factor);
  }

  const numericChars = plain.replace(/[^\d.-]/g, '');
  // Without this guard, digit-free text strips down to '' and Number('') is 0,
  // so placeholder text would be recorded as a genuine zero.
  if (!/\d/.test(numericChars)) return null;

  const value = Number(numericChars);
  return Number.isFinite(value) ? Math.round(value) : null;
}
/**
 * Turns an account's stored cookie data into cookie records for
 * `context.addCookies`.
 *
 * Two storage formats are understood:
 * 1. JSON: either an array of cookie objects or an object with a `cookies` array.
 * 2. A header-style string: `"a=b; c=d"`.
 *
 * A cookie is addressed either by `url` or by `domain` + `path`, never both:
 * entries without a domain get the Channels platform URL and no `path`.
 * NOTE(review): this follows Playwright's `addCookies` contract (either `url`
 * or a `domain`/`path` pair) — confirm against the Playwright version in use.
 *
 * @param cookieData Raw stored value; may be `null`.
 * @returns Parsed cookies; empty when the input is blank or unusable.
 */
function parseCookiesFromAccount(cookieData: string | null): PlaywrightCookie[] {
  const fallbackUrl = 'https://channels.weixin.qq.com';

  const raw = (cookieData ?? '').trim();
  if (!raw) return [];

  // 1) JSON array / object
  if (raw.startsWith('[') || raw.startsWith('{')) {
    try {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const parsed: any = JSON.parse(raw);
      const list: unknown = Array.isArray(parsed) ? parsed : (parsed?.cookies ?? []);
      if (!Array.isArray(list)) return [];

      const fromJson: PlaywrightCookie[] = [];
      for (const c of list) {
        const name = String(c?.name ?? '').trim();
        if (!name) continue;
        const value = String(c?.value ?? '').trim();

        const domain = c?.domain ? String(c.domain) : undefined;
        // `path` only accompanies `domain`; a url-addressed cookie must not carry one.
        const pathVal = domain ? (c?.path ? String(c.path) : '/') : undefined;
        const url = domain ? undefined : fallbackUrl;

        const sameSiteRaw = c?.sameSite;
        const sameSite =
          sameSiteRaw === 'Lax' || sameSiteRaw === 'None' || sameSiteRaw === 'Strict'
            ? sameSiteRaw
            : undefined;

        fromJson.push({
          name,
          value,
          domain,
          path: pathVal,
          url,
          expires: typeof c?.expires === 'number' ? c.expires : undefined,
          httpOnly: typeof c?.httpOnly === 'boolean' ? c.httpOnly : undefined,
          secure: typeof c?.secure === 'boolean' ? c.secure : undefined,
          sameSite,
        } satisfies PlaywrightCookie);
      }
      return fromJson;
    } catch {
      // Not valid JSON after all: fall through to the "a=b; c=d" parser.
    }
  }

  // 2) "a=b; c=d"
  const fromHeader: PlaywrightCookie[] = [];
  for (const pair of raw.split(';')) {
    const entry = pair.trim();
    const eq = entry.indexOf('=');
    if (eq <= 0) continue; // no '=' or empty name
    const name = entry.slice(0, eq).trim();
    if (!name) continue;
    fromHeader.push({ name, value: entry.slice(eq + 1).trim(), url: fallbackUrl });
  }
  return fromHeader;
}
/**
 * Obtains a browser for one account's import run.
 *
 * Without an enabled proxy the browser comes from `BrowserManager` and must not
 * be closed by the caller (`shouldClose: false`). With an enabled proxy a
 * dedicated Chromium instance is launched through that proxy and the caller
 * owns it (`shouldClose: true`).
 *
 * @param proxy The account's proxy configuration, or `null`.
 * @returns The browser plus a flag telling the caller whether to close it.
 */
async function createBrowserForAccount(proxy: ProxyConfig | null): Promise<{ browser: Browser; shouldClose: boolean }> {
  // Headless by default. The Channels platform often forces a login redirect /
  // risk control under headless, so WX_IMPORT_HEADLESS=0 forces a headed browser.
  const headless = process.env.WX_IMPORT_HEADLESS !== '0';

  if (!proxy?.enabled) {
    const shared = await BrowserManager.getBrowser({ headless });
    return { browser: shared, shouldClose: false };
  }

  const launchArgs = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--window-size=1920,1080',
  ];
  const dedicated = await chromium.launch({
    headless,
    proxy: {
      server: `${proxy.type}://${proxy.host}:${proxy.port}`,
      username: proxy.username,
      password: proxy.password,
    },
    args: launchArgs,
  });
  return { browser: dedicated, shouldClose: true };
}
/** On-page labels of the data-center sections this importer can name. */
type WxSection = '视频数据' | '关注者数据' | '图文数据';
/**
 * Splits a single CSV line into trimmed fields.
 *
 * Minimal parser: a double quote toggles "quoted" mode (commas inside quotes
 * are literal), and a doubled quote inside a quoted field yields one literal
 * quote. The quote characters themselves are not part of the output.
 *
 * @param line One line of CSV text (no line terminator).
 * @returns The fields in order; always at least one element.
 */
function parseCsvLine(line: string): string[] {
  const fields: string[] = [];
  let field = '';
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line.charAt(pos);
    pos += 1;

    if (ch === '"') {
      if (quoted && line.charAt(pos) === '"') {
        // Escaped quote ("") inside a quoted field.
        field += '"';
        pos += 1;
      } else {
        quoted = !quoted;
      }
    } else if (ch === ',' && !quoted) {
      fields.push(field);
      field = '';
    } else {
      field += ch;
    }
  }

  fields.push(field);
  return fields.map((f) => f.trim());
}
/** One day's merged statistics: the date plus whichever metric fields were found. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
type WxDayRecord = { recordDate: Date } & Record<string, any>;

/** Output field -> header names that may carry it, in lookup priority order. */
const WX_METRIC_COLUMNS: ReadonlyArray<{ field: string; headers: readonly string[] }> = [
  { field: 'playCount', headers: ['播放', '播放量', '曝光量', '阅读/播放量', '阅读量'] },
  { field: 'likeCount', headers: ['喜欢', '点赞', '点赞量'] },
  { field: 'commentCount', headers: ['评论', '评论量'] },
  { field: 'shareCount', headers: ['分享', '分享量'] },
  { field: 'fansIncrease', headers: ['净增关注', '新增关注'] },
  { field: 'fansCount', headers: ['关注者总数', '关注者总量', '粉丝总数', '粉丝总量'] },
];

/** Finds the column for any of `names`: exact header match first, then substring match; -1 if none. */
function findWxColumn(header: readonly string[], names: readonly string[]): number {
  for (const name of names) {
    const idx = header.indexOf(name);
    if (idx >= 0) return idx;
  }
  for (const name of names) {
    const idx = header.findIndex((h) => h.includes(name));
    if (idx >= 0) return idx;
  }
  return -1;
}

/** Merges data rows into `result`, keyed by `yyyy-mm-dd`; rows without a parseable date are skipped. */
function mergeWxRows(result: Map<string, WxDayRecord>, header: readonly string[], rows: Iterable<unknown>): void {
  const dateCol = findWxColumn(header, ['时间', '日期']);
  if (dateCol < 0) return;

  // Resolve each metric's column once per table instead of once per row.
  const metricCols = WX_METRIC_COLUMNS
    .map(({ field, headers }) => ({ field, col: findWxColumn(header, headers) }))
    .filter((m) => m.col >= 0);

  for (const row of rows) {
    if (!Array.isArray(row)) continue;
    const date = normalizeDateText(row[dateCol]);
    if (!date) continue;

    const key = `${date.getFullYear()}-${String(date.getMonth() + 1).padStart(2, '0')}-${String(date.getDate()).padStart(2, '0')}`;
    let record = result.get(key);
    if (!record) {
      record = { recordDate: date };
      result.set(key, record);
    }

    for (const { field, col } of metricCols) {
      // Short rows yield `undefined` here, which parses to null and is skipped.
      const n = parseChineseNumberLike(row[col]);
      if (n !== null) record[field] = n;
    }
  }
}

/**
 * Parses a table exported from the WeChat Channels data center into per-day
 * statistics.
 *
 * `.csv` files are read as UTF-8 text (a leading BOM is stripped); any other
 * extension is handed to the xlsx parser and every sheet is scanned. In both
 * cases the header row is the first row containing a `时间` / `日期` column, and
 * rows below it are merged by calendar day. Rows from different sheets that
 * share a day are merged into the same record.
 *
 * @param filePath Path of the downloaded export.
 * @returns Map from `yyyy-mm-dd` to that day's record; empty when no header row is found.
 * @throws Whatever `fs.readFile` or the xlsx parser throws for unreadable input.
 */
async function parseWeixinVideoFile(filePath: string): Promise<Map<string, { recordDate: Date } & Record<string, any>>> {
  const result = new Map<string, WxDayRecord>();
  const fileLabel = path.basename(filePath);

  if (path.extname(filePath).toLowerCase() === '.csv') {
    const text = await fs.readFile(filePath, 'utf8');
    const lines = text.replace(/^\uFEFF/, '').split(/\r?\n/).filter((l) => l.trim().length > 0);
    logger.info(`[WX Import] CSV loaded. file=${fileLabel} lines=${lines.length}`);

    // Locate the header line (contains "时间" or "日期").
    const headerLineIdx = lines.findIndex((l) => l.includes('"时间"') || l.includes('"日期"') || l.startsWith('时间,') || l.startsWith('日期,'));
    if (headerLineIdx < 0) return result;

    const toCells = (line: string) => parseCsvLine(line).map((c) => c.replace(/^"|"$/g, '').trim());
    const header = toCells(lines[headerLineIdx]!);
    logger.info(`[WX Import] Header detected. headerRow=${headerLineIdx + 1} headers=${header.join('|')}`);

    mergeWxRows(result, header, lines.slice(headerLineIdx + 1).map(toCells));
    return result;
  }

  // xlsx/xls: parse from a buffer. Unlike `XLSX.readFile`, this does not block
  // the event loop on disk I/O and does not depend on the library having been
  // given an `fs` binding (required by the xlsx ESM build).
  const wb = XLSX.read(await fs.readFile(filePath), { type: 'buffer' });
  logger.info(`[WX Import] Excel loaded. file=${fileLabel} sheets=${wb.SheetNames.join(' | ')}`);

  for (const sheetName of wb.SheetNames) {
    const sheet = wb.Sheets[sheetName];
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const rows: any[][] = XLSX.utils.sheet_to_json(sheet, { header: 1, defval: '' });
    if (!rows.length) continue;

    const headerIdx = rows.findIndex(
      (r) => Array.isArray(r) && r.some((c) => ['时间', '日期'].includes(String(c).trim()))
    );
    if (headerIdx < 0) continue;

    const header = rows[headerIdx]!.map((c) => String(c).trim());
    logger.info(`[WX Import] Header detected. sheet=${sheetName} headerRow=${headerIdx + 1} headers=${header.join('|')}`);

    mergeWxRows(result, header, rows.slice(headerIdx + 1));
  }
  return result;
}
/**
 * Imports the last 30 days of WeChat Channels ("weixin_video") data-center
 * statistics for platform accounts.
 *
 * For each account it opens the Channels platform in a Playwright browser,
 * downloads the per-section tables, parses them and stores one record per day
 * through `UserDayStatisticsService`.
 *
 * Environment switches read by this class:
 * - `WX_IMPORT_HEADLESS=0` together with `WX_STORAGE_STATE_BOOTSTRAP=1` enables
 *   the interactive storage-state bootstrap.
 * - `KEEP_WX_XLSX=true` keeps downloaded files instead of deleting them.
 */
export class WeixinVideoDataCenterImportService {
  /** Labels tried, in order, for the "last 30 days" range switch. */
  private static readonly LAST_30_DAYS_LABELS: readonly string[] = ['近30天', '近30日', '近30'];

  /** Exact CSS selector of the "last 30 days" radio per section; label lookup is the fallback. */
  private static readonly RANGE_SELECTORS: Partial<Record<WxSection, string>> = {
    '关注者数据':
      '#container-wrap > div.container-center > div > div > div.follower-growth-wrap > div:nth-child(4) > div > div > div.card-body > div.filter-wrap > div > div.filter-content > div > div > div.weui-desktop-radio-group.radio-group > label:nth-child(2)',
    '视频数据':
      '#container-wrap > div.container-center > div > div > div > div.post-total-wrap > div.post-statistic-common > div:nth-child(3) > div > div > div.card-body > div.filter-wrap > div:nth-child(2) > div.filter-content > div > div > div.weui-desktop-radio-group.radio-group > label:nth-child(2)',
  };

  private accountRepository = AppDataSource.getRepository(PlatformAccount);
  private userDayStatisticsService = new UserDayStatisticsService();

  // Works whether the monorepo is started from its root or from the server directory.
  private baseDir =
    path.basename(process.cwd()).toLowerCase() === 'server'
      ? process.cwd()
      : path.resolve(process.cwd(), 'server');

  private downloadDir = path.resolve(this.baseDir, 'tmp', 'weixin-video-data-center');
  private stateDir = path.resolve(this.baseDir, 'tmp', 'weixin-video-storage-state');

  /** Path of the persisted Playwright storage state for one account. */
  private getStatePath(accountId: number) {
    return path.join(this.stateDir, `${accountId}.json`);
  }

  /**
   * Returns the account's storage-state file, creating it interactively if allowed.
   *
   * An existing file is returned as-is. Otherwise, unless both
   * `WX_IMPORT_HEADLESS=0` and `WX_STORAGE_STATE_BOOTSTRAP=1` are set, `null`
   * is returned. In bootstrap mode a browser is opened with the account's
   * cookies and the method waits up to five minutes for data-center text to
   * appear before saving the state.
   *
   * @param account Account whose state is needed.
   * @param cookies Cookies to seed the bootstrap context with.
   * @returns Path of the state file, or `null` when none exists and bootstrap is off.
   */
  private async ensureStorageState(account: PlatformAccount, cookies: PlaywrightCookie[]): Promise<string | null> {
    const statePath = this.getStatePath(account.id);
    try {
      await fs.access(statePath);
      return statePath;
    } catch {
      // No saved state yet.
    }

    const bootstrapEnabled =
      process.env.WX_IMPORT_HEADLESS === '0' && process.env.WX_STORAGE_STATE_BOOTSTRAP === '1';
    if (!bootstrapEnabled) return null;

    await ensureDir(this.stateDir);
    logger.warn(`[WX Import] No storageState for accountId=${account.id}. Bootstrapping... 请在弹出的浏览器中完成登录/验证。`);

    const { browser, shouldClose } = await createBrowserForAccount(account.proxyConfig);
    try {
      const context = await browser.newContext({
        viewport: { width: 1920, height: 1080 },
        locale: 'zh-CN',
        timezoneId: 'Asia/Shanghai',
      });
      try {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        await context.addCookies(cookies as any);
        const page = await context.newPage();
        await page.goto('https://channels.weixin.qq.com/platform', { waitUntil: 'domcontentloaded' });

        const marker = await page
          .waitForFunction(() => {
            const t = document.body?.innerText || '';
            return t.includes('数据中心') || t.includes('关注者数据') || t.includes('视频数据');
          }, { timeout: 5 * 60_000 })
          .catch(() => undefined);
        if (!marker) {
          // Best effort: the state is still saved, but make the timeout visible.
          logger.warn(`[WX Import] Login marker not detected before timeout; saving storageState anyway. accountId=${account.id}`);
        }

        await context.storageState({ path: statePath });
        logger.info(`[WX Import] storageState saved: ${statePath}`);
        return statePath;
      } finally {
        // Always release the context: the browser may be shared and outlive this call.
        await context.close().catch(() => undefined);
      }
    } finally {
      if (shouldClose) await browser.close().catch(() => undefined);
    }
  }

  /**
   * Runs the 30-day import for every account whose platform is `weixin_video`.
   * A failing account is logged and does not stop the remaining accounts.
   */
  async runDailyImportForAllWeixinVideoAccounts(): Promise<void> {
    await ensureDir(this.downloadDir);

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const accounts = await this.accountRepository.find({ where: { platform: 'weixin_video' as any } });
    logger.info(`[WX Import] Start. total_accounts=${accounts.length}`);

    for (const account of accounts) {
      try {
        await this.importAccountLast30Days(account);
      } catch (e) {
        logger.error(`[WX Import] Account failed. accountId=${account.id} name=${account.accountName || ''}`, e);
      }
    }

    logger.info('[WX Import] Done.');
  }

  /**
   * Downloads and stores the last 30 days of statistics for one account.
   *
   * @param account Account to import; its `cookieData` must be parseable.
   * @throws Error when the cookies are empty or unparseable, when the platform
   *   redirects to a login page, or when navigation or a download fails.
   */
  async importAccountLast30Days(account: PlatformAccount): Promise<void> {
    const cookies = parseCookiesFromAccount(account.cookieData);
    if (!cookies.length) throw new Error('cookieData 为空或无法解析');

    const { browser, shouldClose } = await createBrowserForAccount(account.proxyConfig);
    try {
      const statePath = await this.ensureStorageState(account, cookies);
      logger.info(
        `[WX Import] Context init. accountId=${account.id} storageState=${statePath ? statePath : 'none'}`
      );

      const context = await browser.newContext({
        acceptDownloads: true,
        viewport: { width: 1920, height: 1080 },
        locale: 'zh-CN',
        timezoneId: 'Asia/Shanghai',
        userAgent:
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        ...(statePath ? { storageState: statePath } : {}),
      });
      try {
        context.setDefaultTimeout(60_000);
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        if (!statePath) await context.addCookies(cookies as any);

        const page = await context.newPage();
        await page.goto('https://channels.weixin.qq.com/platform', { waitUntil: 'domcontentloaded' });
        await page.waitForTimeout(1500);

        if (page.url().includes('login') || page.url().includes('passport')) {
          throw new Error('未登录/需要重新登录(跳转到登录页)');
        }

        // Open the data center.
        await page.getByText('数据中心', { exact: false }).first().click();
        await page.waitForTimeout(800);

        // Only follower data and video data are collected for now; image-text data is skipped.
        const sections: WxSection[] = ['关注者数据', '视频数据'];
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const mergedDays = new Map<string, { recordDate: Date } & Record<string, any>>();

        // Clicks the first element matching any label: exact text first, then substring.
        // Returns whether a matching element was found (click errors are ignored).
        const tryClick = async (texts: readonly string[]) => {
          for (const exact of [true, false]) {
            for (const t of texts) {
              const loc = page.getByText(t, { exact }).first();
              if ((await loc.count().catch(() => 0)) > 0) {
                await loc.click().catch(() => undefined);
                return true;
              }
            }
          }
          return false;
        };

        // Switches the visible table to the 30-day range: known selector first, labels otherwise.
        const selectLast30Days = async (section: WxSection) => {
          const labels = WeixinVideoDataCenterImportService.LAST_30_DAYS_LABELS;
          try {
            const selector = WeixinVideoDataCenterImportService.RANGE_SELECTORS[section];
            if (selector) {
              const loc = page.locator(selector);
              if ((await loc.count().catch(() => 0)) > 0) {
                await loc.click().catch(() => undefined);
                return;
              }
            }
            await tryClick(labels);
          } catch {
            await tryClick(labels);
          }
        };

        const exportSection = async (section: WxSection) => {
          const ok = await tryClick([section]);
          if (!ok) {
            logger.warn(`[WX Import] Section not found, skip. accountId=${account.id} section=${section}`);
            return;
          }
          await page.waitForTimeout(1200);

          // Open the detail view (the page may label it "增长详情" or "数据详情").
          await tryClick(['增长详情', '数据详情']);
          await page.waitForTimeout(800);

          await selectLast30Days(section);
          await page.waitForTimeout(4000);

          // Download the table; the listener is registered before the click so the event is not missed.
          const [download] = await Promise.all([
            page.waitForEvent('download', { timeout: 60_000 }),
            tryClick(['下载表格', '下载', '导出数据']),
          ]);

          // basename(): the suggested name comes from the remote side and must not escape downloadDir.
          const filename = `${account.id}_${Date.now()}_${path.basename(download.suggestedFilename())}`;
          const filePath = path.join(this.downloadDir, filename);
          await download.saveAs(filePath);

          try {
            const perDay = await parseWeixinVideoFile(filePath);
            for (const [k, v] of perDay.entries()) {
              if (!mergedDays.has(k)) mergedDays.set(k, { recordDate: v.recordDate });
              Object.assign(mergedDays.get(k)!, v);
            }
            logger.info(`[WX Import] Section parsed. accountId=${account.id} section=${section} days=${perDay.size}`);
          } finally {
            if (process.env.KEEP_WX_XLSX === 'true') {
              logger.warn(`[WX Import] KEEP_WX_XLSX=true, keep file: ${filePath}`);
            } else {
              await fs.unlink(filePath).catch(() => undefined);
            }
          }
        };

        for (const s of sections) {
          await exportSection(s);
        }

        let inserted = 0;
        let updated = 0;
        for (const v of mergedDays.values()) {
          const { recordDate, ...patch } = v;
          const r = await this.userDayStatisticsService.saveStatisticsForDate(account.id, recordDate, patch);
          inserted += r.inserted;
          updated += r.updated;
        }

        logger.info(`[WX Import] Account imported. accountId=${account.id} days=${mergedDays.size} inserted=${inserted} updated=${updated}`);
      } finally {
        // Close the context on failure too; otherwise every failed run leaks one
        // context in a browser that is not closed when shouldClose is false.
        await context.close().catch(() => undefined);
      }
    } finally {
      if (shouldClose) await browser.close().catch(() => undefined);
    }
  }
}
|