|
|
@@ -325,20 +325,22 @@ class WeixinPublisher(BasePublisher):
|
|
|
print(f"[{self.platform_name}] 使用代理: {proxy.get('server')}", flush=True)
|
|
|
|
|
|
# 参考 matrix: 使用系统内的 Chrome 浏览器,避免 H264 编码错误
|
|
|
- # 如果没有安装 Chrome,则使用默认 Chromium
|
|
|
+ # 非 headless 时添加 slow_mo 便于观察点击操作
|
|
|
+ launch_opts = {"headless": self.headless}
|
|
|
+ if not self.headless:
|
|
|
+ launch_opts["slow_mo"] = 400 # 每个操作间隔 400ms,便于观看
|
|
|
+ print(f"[{self.platform_name}] 有头模式 + slow_mo=400ms,浏览器将可见", flush=True)
|
|
|
try:
|
|
|
- self.browser = await playwright.chromium.launch(
|
|
|
- headless=self.headless,
|
|
|
- channel="chrome", # 使用系统 Chrome
|
|
|
- proxy=proxy if proxy and proxy.get('server') else None
|
|
|
- )
|
|
|
- print(f"[{self.platform_name}] 使用系统 Chrome 浏览器")
|
|
|
+ launch_opts["channel"] = "chrome"
|
|
|
+ if proxy and proxy.get("server"):
|
|
|
+ launch_opts["proxy"] = proxy
|
|
|
+ self.browser = await playwright.chromium.launch(**launch_opts)
|
|
|
+ print(f"[{self.platform_name}] 使用系统 Chrome 浏览器", flush=True)
|
|
|
except Exception as e:
|
|
|
- print(f"[{self.platform_name}] Chrome 不可用,使用 Chromium: {e}")
|
|
|
- self.browser = await playwright.chromium.launch(
|
|
|
- headless=self.headless,
|
|
|
- proxy=proxy if proxy and proxy.get('server') else None
|
|
|
- )
|
|
|
+ print(f"[{self.platform_name}] Chrome 不可用,使用 Chromium: {e}", flush=True)
|
|
|
+ if "channel" in launch_opts:
|
|
|
+ del launch_opts["channel"]
|
|
|
+ self.browser = await playwright.chromium.launch(**launch_opts)
|
|
|
|
|
|
# 设置 HTTP Headers 防止重定向
|
|
|
headers = {
|
|
|
@@ -1209,6 +1211,590 @@ class WeixinPublisher(BasePublisher):
|
|
|
return WorksResult(success=True, platform=self.platform_name, works=works, total=total, has_more=has_more, next_page=next_page)
|
|
|
|
|
|
|
|
|
+ async def sync_work_daily_stats_via_browser(
|
|
|
+ self, cookies: str, work_id: int, platform_video_id: str
|
|
|
+ ) -> dict:
|
|
|
+ """
|
|
|
+ 通过浏览器自动化同步单个作品的每日数据到 work_day_statistics。
|
|
|
+ 流程:
|
|
|
+ 1. 打开 statistic/post 页,点击单篇视频 tab,点击近30天
|
|
|
+ 2. 监听 post_list 接口,根据 exportId 匹配 platform_video_id 得到 objectId
|
|
|
+ 3. 找到 data-row-key=objectId 的行,点击「查看」
|
|
|
+ 4. 进入详情页,点击数据详情的近30天,点击下载表格
|
|
|
+ 5. 解析 CSV 并返回 statistics 列表(供 Node 保存)
|
|
|
+ """
|
|
|
+ import csv
|
|
|
+ import tempfile
|
|
|
+ from pathlib import Path
|
|
|
+
|
|
|
+ result = {"success": False, "error": "", "statistics": [], "inserted": 0, "updated": 0}
|
|
|
+ post_list_data = {"list": []}
|
|
|
+
|
|
|
+ async def handle_response(response):
|
|
|
+ try:
|
|
|
+ if "statistic/post_list" in response.url and response.request.method == "POST":
|
|
|
+ try:
|
|
|
+ body = await response.json()
|
|
|
+ if body.get("errCode") == 0 and body.get("data"):
|
|
|
+ post_list_data["list"] = body.get("data", {}).get("list", [])
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+
|
|
|
+ try:
|
|
|
+ await self.init_browser()
|
|
|
+ cookie_list = self.parse_cookies(cookies)
|
|
|
+ await self.set_cookies(cookie_list)
|
|
|
+ if not self.page:
|
|
|
+ raise Exception("Page not initialized")
|
|
|
+
|
|
|
+ self.page.on("response", handle_response)
|
|
|
+
|
|
|
+ # 1. 打开数据分析-作品数据页
|
|
|
+ print(f"[{self.platform_name}] 打开数据分析页...", flush=True)
|
|
|
+ await self.page.goto("https://channels.weixin.qq.com/platform/statistic/post", timeout=30000)
|
|
|
+ if not self.headless:
|
|
|
+ print(f"[{self.platform_name}] 浏览器已打开,请将窗口置于前台观看操作(等待 5 秒)...", flush=True)
|
|
|
+ await asyncio.sleep(5)
|
|
|
+ else:
|
|
|
+ await asyncio.sleep(3)
|
|
|
+ if "login" in self.page.url:
|
|
|
+ raise Exception("Cookie 已过期,请重新登录")
|
|
|
+
|
|
|
+ # 2. 点击「单篇视频」tab
|
|
|
+ tab_sel = "div.weui-desktop-tab__navs ul li:nth-child(2) a"
|
|
|
+ try:
|
|
|
+ await self.page.wait_for_selector(tab_sel, timeout=8000)
|
|
|
+ await self.page.click(tab_sel)
|
|
|
+ except Exception:
|
|
|
+ tab_sel = "a:has-text('单篇视频')"
|
|
|
+ await self.page.click(tab_sel)
|
|
|
+ await asyncio.sleep(2)
|
|
|
+
|
|
|
+ # 3. 点击「近30天」(单篇视频页的日期范围筛选)
|
|
|
+ # 选择器优先级:精确匹配单篇视频区域内的日期范围 radio 组
|
|
|
+ radio_selectors = [
|
|
|
+ "div.post-single-wrap div.weui-desktop-radio-group.radio-group label:has-text('近30天')",
|
|
|
+ "div.post-single-wrap div.filter-wrap div.weui-desktop-radio-group label:nth-child(2)",
|
|
|
+ "div.post-single-wrap div.card-body div.filter-wrap div:nth-child(2) label:nth-child(2)",
|
|
|
+ "div.post-single-wrap label:has-text('近30天')",
|
|
|
+ "div.weui-desktop-radio-group label:has-text('近30天')",
|
|
|
+ "label:has-text('近30天')",
|
|
|
+ ]
|
|
|
+ clicked = False
|
|
|
+ for sel in radio_selectors:
|
|
|
+ try:
|
|
|
+ el = self.page.locator(sel).first
|
|
|
+ if await el.count() > 0:
|
|
|
+ await el.click()
|
|
|
+ clicked = True
|
|
|
+ print(f"[{self.platform_name}] 已点击近30天按钮 (selector: {sel[:50]}...)", flush=True)
|
|
|
+ break
|
|
|
+ except Exception as e:
|
|
|
+ continue
|
|
|
+ if not clicked:
|
|
|
+ print(f"[{self.platform_name}] 警告: 未找到近30天按钮,继续尝试...", flush=True)
|
|
|
+ await asyncio.sleep(3)
|
|
|
+
|
|
|
+ # 4. 从 post_list 响应中找 exportId -> objectId
|
|
|
+ export_id_to_object = {}
|
|
|
+ for item in post_list_data["list"]:
|
|
|
+ eid = (item.get("exportId") or "").strip()
|
|
|
+ oid = (item.get("objectId") or "").strip()
|
|
|
+ if eid and oid:
|
|
|
+ export_id_to_object[eid] = oid
|
|
|
+
|
|
|
+ object_id = export_id_to_object.get(platform_video_id) or export_id_to_object.get(
|
|
|
+ platform_video_id.strip()
|
|
|
+ )
|
|
|
+ if not object_id:
|
|
|
+ # 尝试宽松匹配(platform_video_id 可能带前缀)
|
|
|
+ for eid, oid in export_id_to_object.items():
|
|
|
+ if platform_video_id in eid or eid in platform_video_id:
|
|
|
+ object_id = oid
|
|
|
+ break
|
|
|
+ if not object_id:
|
|
|
+ result["error"] = f"未在 post_list 中匹配到 exportId={platform_video_id}"
|
|
|
+ print(f"[{self.platform_name}] {result['error']}", flush=True)
|
|
|
+ return result
|
|
|
+
|
|
|
+ # 5. 找到 data-row-key=objectId 的行,点击「查看」
|
|
|
+ view_btn = self.page.locator(f'tr[data-row-key="{object_id}"] a.detail-wrap, tr[data-row-key="{object_id}"] a:has-text("查看")')
|
|
|
+ try:
|
|
|
+ await view_btn.first.wait_for(timeout=5000)
|
|
|
+ await view_btn.first.click()
|
|
|
+ except Exception as e:
|
|
|
+ view_btn = self.page.locator(f'tr[data-row-key="{object_id}"] a')
|
|
|
+ if await view_btn.count() > 0:
|
|
|
+ await view_btn.first.click()
|
|
|
+ else:
|
|
|
+ raise Exception(f"未找到 objectId={object_id} 的查看按钮: {e}")
|
|
|
+ await asyncio.sleep(3)
|
|
|
+
|
|
|
+ # 6. 详情页:点击数据详情的「近30天」,再点击「下载表格」
|
|
|
+ detail_radio = "div.post-statistic-common div.filter-wrap label:nth-child(2)"
|
|
|
+ for sel in [detail_radio, "div.main-body label:has-text('近30天')"]:
|
|
|
+ try:
|
|
|
+ el = self.page.locator(sel).first
|
|
|
+ if await el.count() > 0:
|
|
|
+ await el.click()
|
|
|
+ break
|
|
|
+ except Exception:
|
|
|
+ continue
|
|
|
+ await asyncio.sleep(2)
|
|
|
+
|
|
|
+ # 保存到 server/tmp 目录
|
|
|
+ download_dir = Path(__file__).resolve().parent.parent.parent / "tmp"
|
|
|
+ download_dir.mkdir(parents=True, exist_ok=True)
|
|
|
+
|
|
|
+ async with self.page.expect_download(timeout=15000) as download_info:
|
|
|
+ download_btn = self.page.locator("div.post-statistic-common div.filter-extra a, a:has-text('下载表格')")
|
|
|
+ if await download_btn.count() == 0:
|
|
|
+ raise Exception("未找到「下载表格」按钮")
|
|
|
+ await download_btn.first.click()
|
|
|
+ download = await download_info.value
|
|
|
+ save_path = download_dir / f"work_{work_id}_{int(time.time())}.csv"
|
|
|
+ await download.save_as(save_path)
|
|
|
+
|
|
|
+ # 7. 解析 CSV -> statistics
|
|
|
+ stats_list = []
|
|
|
+ with open(save_path, "r", encoding="utf-8-sig", errors="replace") as f:
|
|
|
+ reader = csv.DictReader(f)
|
|
|
+ rows = list(reader)
|
|
|
+ for row in rows:
|
|
|
+ date_val = (
|
|
|
+ row.get("日期")
|
|
|
+ or row.get("date")
|
|
|
+ or row.get("时间")
|
|
|
+ or row.get("时间周期", "")
|
|
|
+ ).strip()
|
|
|
+ if not date_val:
|
|
|
+ continue
|
|
|
+ dt = None
|
|
|
+ norm = date_val[:10].replace("年", "-").replace("月", "-").replace("日", "-").replace("/", "-")
|
|
|
+ if len(norm) >= 8 and norm.count("-") >= 2:
|
|
|
+ parts = norm.split("-")
|
|
|
+ if len(parts) == 3:
|
|
|
+ try:
|
|
|
+ y, m, d = int(parts[0]), int(parts[1]), int(parts[2])
|
|
|
+ if 2000 <= y <= 2100 and 1 <= m <= 12 and 1 <= d <= 31:
|
|
|
+ dt = datetime(y, m, d)
|
|
|
+ except (ValueError, IndexError):
|
|
|
+ pass
|
|
|
+ if not dt:
|
|
|
+ for fmt in ["%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y"]:
|
|
|
+ try:
|
|
|
+ dt = datetime.strptime((date_val.split()[0] if date_val else "")[:10], fmt)
|
|
|
+ break
|
|
|
+ except (ValueError, IndexError):
|
|
|
+ dt = None
|
|
|
+ if not dt:
|
|
|
+ continue
|
|
|
+ rec_date = dt.strftime("%Y-%m-%d")
|
|
|
+ play = self._parse_count(row.get("播放", "") or row.get("播放量", "") or row.get("play_count", "0"))
|
|
|
+ like = self._parse_count(row.get("点赞", "") or row.get("like_count", "0"))
|
|
|
+ comment = self._parse_count(row.get("评论", "") or row.get("comment_count", "0"))
|
|
|
+ share = self._parse_count(row.get("分享", "") or row.get("share_count", "0"))
|
|
|
+ collect = self._parse_count(row.get("收藏", "") or row.get("collect_count", "0"))
|
|
|
+ comp_rate = (row.get("完播率", "") or row.get("completion_rate", "0")).strip().rstrip("%") or "0"
|
|
|
+ avg_dur = (row.get("平均播放时长", "") or row.get("avg_watch_duration", "0")).strip()
|
|
|
+ stats_list.append({
|
|
|
+ "work_id": work_id,
|
|
|
+ "record_date": rec_date,
|
|
|
+ "play_count": play,
|
|
|
+ "like_count": like,
|
|
|
+ "comment_count": comment,
|
|
|
+ "share_count": share,
|
|
|
+ "collect_count": collect,
|
|
|
+ "completion_rate": comp_rate,
|
|
|
+ "avg_watch_duration": avg_dur,
|
|
|
+ })
|
|
|
+ result["statistics"] = stats_list
|
|
|
+ result["success"] = True
|
|
|
+ try:
|
|
|
+ os.remove(save_path)
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+ except Exception as e:
|
|
|
+ import traceback
|
|
|
+ traceback.print_exc()
|
|
|
+ result["error"] = str(e)
|
|
|
+ finally:
|
|
|
+ try:
|
|
|
+ await self.close_browser()
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+ return result
|
|
|
+
|
|
|
    async def sync_account_works_daily_stats_via_browser(
        self,
        cookies: str,
        works: List[dict],
        save_fn=None,
        update_works_fn=None,
        headless: bool = True,
    ) -> dict:
        """Browser-only batch sync of daily stats for all of an account's known works.

        Flow:
          1. Open statistic/post -> click the "single video" tab -> click "last 30 days".
          2. (First pass) listen for the ``post_list`` API response and update the
             works table's ``yesterday_*`` fields via ``update_works_fn``.
          3. Use ``post_list`` to map exportId -> objectId.
          4. For each ``post_list`` entry:
             - skip if its exportId has no match among ``works``' platform_video_id;
             - otherwise click "查看" on the row keyed by objectId;
             - on the detail page (defaults to last 7 days) read the captured
               ``feed_aggreagate_data_by_tab_type`` response;
             - parse browse/like/comment/forward/fav/follow from the "全部" (all)
               tab, dating entries backwards from yesterday;
             - persist via ``save_fn`` into work_day_statistics;
             - go back to the list page and continue.

        Args:
            cookies: Cookie string of a logged-in Channels account.
            works: [{"work_id": int, "platform_video_id": str}, ...]
                (camelCase key variants are also accepted).
            save_fn: (stats_list) -> {inserted, updated}; caller-supplied hook
                that forwards to the Node batch-dates endpoint.
            update_works_fn: (updates) -> {updated}; caller-supplied hook that
                writes parsed post_list data to the works table (first pass only).
            headless: Controls the extra operator wait in headed mode.

        Returns:
            dict with success/error plus counters: total_processed, total_skipped,
            inserted, updated, works_updated.
        """
        # NOTE(review): Path appears unused in this method — candidate for removal.
        from pathlib import Path
        from datetime import timedelta

        result = {
            "success": True,
            "error": "",
            "total_processed": 0,
            "total_skipped": 0,
            "inserted": 0,
            "updated": 0,
            "works_updated": 0,
        }
        # platform_video_id (exportId) -> work_id
        export_id_to_work = {}
        for w in works:
            pvid = (w.get("platform_video_id") or w.get("platformVideoId") or "").strip()
            wid = w.get("work_id") or w.get("workId")
            if pvid and wid is not None:
                export_id_to_work[pvid] = int(wid)
                # Tolerate ids with/without a prefix (e.g. "export/xxx" vs "xxx").
                if "/" in pvid:
                    export_id_to_work[pvid.split("/")[-1]] = int(wid)

        post_list_data = {"list": []}
        feed_aggreagate_data = {"body": None}

        async def handle_response(response):
            # Response listener: captures the two analytics API payloads.
            # ("aggreagate" matches the platform's actual misspelled endpoint name.)
            try:
                url = response.url
                if "statistic/post_list" in url:
                    try:
                        body = await response.json()
                        if body.get("errCode") == 0 and body.get("data"):
                            post_list_data["list"] = body.get("data", {}).get("list", [])
                    except Exception:
                        pass
                elif "feed_aggreagate_data_by_tab_type" in url:
                    try:
                        body = await response.json()
                        if body.get("errCode") == 0 and body.get("data"):
                            feed_aggreagate_data["body"] = body
                    except Exception:
                        pass
            except Exception:
                pass

        try:
            await self.init_browser()
            cookie_list = self.parse_cookies(cookies)
            await self.set_cookies(cookie_list)
            if not self.page:
                raise Exception("Page not initialized")

            self.page.on("response", handle_response)

            # 1. Open the analytics page (数据分析 - 作品数据).
            print(f"[{self.platform_name}] 打开数据分析页...", flush=True)
            await self.page.goto("https://channels.weixin.qq.com/platform/statistic/post", timeout=30000)
            if not headless:
                # Headed mode: give the operator a moment to bring the window forward.
                print(f"[{self.platform_name}] 浏览器已打开,请将窗口置于前台观看操作(等待 5 秒)...", flush=True)
                await asyncio.sleep(5)
            else:
                await asyncio.sleep(3)
            if "login" in self.page.url:
                raise Exception("Cookie 已过期,请重新登录")

            # 2. Switch to the "single video" (单篇视频) tab.
            tab_sel = "div.weui-desktop-tab__navs ul li:nth-child(2) a"
            try:
                await self.page.wait_for_selector(tab_sel, timeout=8000)
                await self.page.click(tab_sel)
            except Exception:
                tab_sel = "a:has-text('单篇视频')"
                await self.page.click(tab_sel)
            await asyncio.sleep(2)

            # 3. Clear the captured list before clicking "last 30 days", so the
            # handler picks up the fresh post_list response (which carries fullPlayRate).
            post_list_data["list"] = []
            radio_selectors = [
                "div.post-single-wrap div.weui-desktop-radio-group.radio-group label:has-text('近30天')",
                "div.post-single-wrap div.filter-wrap div.weui-desktop-radio-group label:nth-child(2)",
                "div.post-single-wrap label:has-text('近30天')",
                "div.weui-desktop-radio-group label:has-text('近30天')",
                "label:has-text('近30天')",
            ]
            clicked = False
            for sel in radio_selectors:
                try:
                    el = self.page.locator(sel).first
                    if await el.count() > 0:
                        await el.click()
                        clicked = True
                        print(f"[{self.platform_name}] 已点击近30天 (selector: {sel[:40]}...)", flush=True)
                        break
                except Exception:
                    continue
            if not clicked:
                print(f"[{self.platform_name}] 警告: 未找到近30天按钮", flush=True)
            await asyncio.sleep(5)

            # 4. Read the captured post_list entries.
            items = post_list_data["list"]
            if not items:
                result["error"] = "未监听到 post_list 或列表为空"
                print(f"[{self.platform_name}] {result['error']}", flush=True)
                return result

            # 4.5 (First pass only) parse post_list data -> update works table (no CSV download).
            # Field mapping: readCount->plays, likeCount->likes, commentCount->comments,
            # forwardCount->shares, fullPlayRate->completion rate (0-1 fraction),
            # avgPlayTimeSec->average watch duration (seconds), exportId->work_id match.
            if update_works_fn and items:
                try:
                    updates = []
                    for it in items:
                        eid = (it.get("exportId") or "").strip()
                        if not eid:
                            continue
                        work_id = export_id_to_work.get(eid)
                        if work_id is None:
                            # Loose substring match for prefixed/unprefixed ids.
                            for k, v in export_id_to_work.items():
                                if eid in k or k in eid:
                                    work_id = v
                                    break
                        if work_id is None:
                            continue
                        read_count = int(it.get("readCount") or 0)
                        like_count = int(it.get("likeCount") or 0)
                        comment_count = int(it.get("commentCount") or 0)
                        forward_count = int(it.get("forwardCount") or 0)
                        follow_count = int(it.get("followCount") or 0)
                        full_play_rate = it.get("fullPlayRate")
                        if full_play_rate is not None:
                            # Fraction -> percentage string, e.g. 0.1234 -> "12.34%".
                            comp_rate = f"{float(full_play_rate) * 100:.2f}%"
                        else:
                            comp_rate = "0"
                        avg_sec = it.get("avgPlayTimeSec")
                        if avg_sec is not None:
                            avg_dur = f"{float(avg_sec):.2f}秒"
                        else:
                            avg_dur = "0"
                        updates.append({
                            "work_id": work_id,
                            "yesterday_play_count": read_count,
                            "yesterday_like_count": like_count,
                            "yesterday_comment_count": comment_count,
                            "yesterday_share_count": forward_count,
                            "yesterday_follow_count": follow_count,
                            "yesterday_completion_rate": comp_rate,
                            "yesterday_avg_watch_duration": avg_dur,
                        })
                    if updates:
                        try:
                            save_result = update_works_fn(updates)
                            result["works_updated"] = save_result.get("updated", 0)
                        except Exception as api_err:
                            # Best-effort: works-table update failure must not abort the sync.
                            import traceback
                            traceback.print_exc()
                except Exception as e:
                    import traceback
                    traceback.print_exc()
                    print(f"[{self.platform_name}] 解析 post_list 更新 works 失败: {e}", flush=True)

            # Helper: re-select single-video tab + "last 30 days" to restore the
            # list view (go_back lands on the "all videos" page).
            async def ensure_single_video_near30():
                tab_sel = "div.weui-desktop-tab__navs ul li:nth-child(2) a"
                try:
                    await self.page.wait_for_selector(tab_sel, timeout=8000)
                    await self.page.click(tab_sel)
                except Exception:
                    await self.page.click("a:has-text('单篇视频')")
                await asyncio.sleep(2)
                for sel in [
                    "div.post-single-wrap div.weui-desktop-radio-group.radio-group label:has-text('近30天')",
                    "div.post-single-wrap label:has-text('近30天')",
                    "div.weui-desktop-radio-group label:has-text('近30天')",
                    "label:has-text('近30天')",
                ]:
                    try:
                        el = self.page.locator(sel).first
                        if await el.count() > 0:
                            await el.click()
                            break
                    except Exception:
                        continue
                await asyncio.sleep(3)

            # 5. Walk every post_list entry, matching works by exportId.
            processed_export_ids = set()

            for idx, item in enumerate(items):
                eid = (item.get("exportId") or "").strip()
                oid = (item.get("objectId") or "").strip()
                if not oid:
                    continue

                # Skip already-processed entries (loop order should prevent
                # duplicates; this is a belt-and-braces guard).
                if eid in processed_export_ids:
                    print(f"[{self.platform_name}] 跳过 [{idx+1}] exportId={eid} (已处理)", flush=True)
                    continue

                # After go_back we land on the "all videos" page; re-select
                # single-video tab + "last 30 days".
                if idx > 0:
                    await ensure_single_video_near30()

                # Match work_id (exact first, then loose substring).
                work_id = export_id_to_work.get(eid)
                if work_id is None:
                    for k, v in export_id_to_work.items():
                        if eid in k or k in eid:
                            work_id = v
                            break
                if work_id is None:
                    result["total_skipped"] += 1
                    print(f"[{self.platform_name}] 跳过 [{idx+1}] exportId={eid} (库中无对应作品)", flush=True)
                    continue

                # Click "查看": Ant Design table tr[data-row-key] > td > div.slot-wrap > a.detail-wrap.
                # The action column may live inside ant-table-fixed-right, so try that first.
                view_selectors = [
                    f'div.ant-table-fixed-right tr[data-row-key="{oid}"] a.detail-wrap',
                    f'tr[data-row-key="{oid}"] a.detail-wrap',
                    f'tr[data-row-key="{oid}"] td a.detail-wrap',
                    f'tr[data-row-key="{oid}"] a:has-text("查看")',
                    f'tr[data-row-key="{oid}"] a',
                ]
                clicked = False
                for sel in view_selectors:
                    view_btn = self.page.locator(sel)
                    if await view_btn.count() > 0:
                        try:
                            await view_btn.first.wait_for(timeout=3000)
                            await view_btn.first.click()
                            clicked = True
                            print(f"[{self.platform_name}] 已点击查看 (selector: {sel[:40]}...)", flush=True)
                            break
                        except Exception as e:
                            continue
                if not clicked:
                    print(f"[{self.platform_name}] 未找到 objectId={oid} 的查看按钮", flush=True)
                    result["total_skipped"] += 1
                    continue
                await asyncio.sleep(3)

                # Detail page defaults to last-7-days and fires feed_aggreagate on load.
                # The body is deliberately NOT cleared here, to avoid wiping a response
                # that already arrived during navigation.
                # NOTE(review): because the body is never reset between iterations, a
                # stale response from the previous work could be attributed to this
                # work_id if this detail page fails to fire the request — verify.
                await asyncio.sleep(4)

                # Parse the "全部" (all) tab from the feed_aggreagate response.
                # Shape: data.dataByFanstype[].dataByTabtype[] with tabTypeName=="全部" or tabType==999.
                # Dates: count backwards N days ending yesterday; arrays run earliest -> latest.
                body = feed_aggreagate_data.get("body")
                if not body or not body.get("data"):
                    print(f"[{self.platform_name}] work_id={work_id} 未监听到 feed_aggreagate 有效响应", flush=True)
                    await self.page.go_back()
                    await asyncio.sleep(2)
                    continue

                tab_all = None
                for fan_item in body.get("data", {}).get("dataByFanstype", []):
                    for tab_item in fan_item.get("dataByTabtype", []):
                        if tab_item.get("tabTypeName") == "全部" or tab_item.get("tabType") == 999:
                            tab_all = tab_item.get("data")
                            break
                    if tab_all is not None:
                        break
                if not tab_all:
                    # Fallback payload location observed in some responses.
                    tab_all = body.get("data", {}).get("feedData", [{}])[0].get("totalData")
                if not tab_all:
                    print(f"[{self.platform_name}] work_id={work_id} 未找到「全部」数据", flush=True)
                    await self.page.go_back()
                    await asyncio.sleep(2)
                    continue

                browse = tab_all.get("browse", [])
                n = len(browse)
                if n == 0:
                    print(f"[{self.platform_name}] work_id={work_id} browse 为空", flush=True)
                    await self.page.go_back()
                    await asyncio.sleep(2)
                    continue

                # Dates: N days back from yesterday; index 0 is the earliest day.
                today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
                yesterday = today - timedelta(days=1)
                start_date = yesterday - timedelta(days=n - 1)

                like_arr = tab_all.get("like", [])
                comment_arr = tab_all.get("comment", [])
                forward_arr = tab_all.get("forward", [])
                fav_arr = tab_all.get("fav", [])
                follow_arr = tab_all.get("follow", [])

                stats_list = []
                for i in range(n):
                    rec_dt = start_date + timedelta(days=i)
                    rec_date = rec_dt.strftime("%Y-%m-%d")
                    play = self._parse_count(browse[i] if i < len(browse) else "0")
                    like = self._parse_count(like_arr[i] if i < len(like_arr) else "0")
                    comment = self._parse_count(comment_arr[i] if i < len(comment_arr) else "0")
                    share = self._parse_count(forward_arr[i] if i < len(forward_arr) else "0")
                    follow = self._parse_count(follow_arr[i] if i < len(follow_arr) else "0")
                    # fav[i] is not persisted; follow[i] goes into follow_count.
                    stats_list.append({
                        "work_id": work_id,
                        "record_date": rec_date,
                        "play_count": play,
                        "like_count": like,
                        "comment_count": comment,
                        "share_count": share,
                        "collect_count": 0,
                        "follow_count": follow,
                        "completion_rate": "0",
                        "avg_watch_duration": "0",
                    })
                print(f"[{self.platform_name}] work_id={work_id} 从 feed_aggreagate 解析得到 {len(stats_list)} 条日统计", flush=True)

                # Persist into work_day_statistics via save_fn (delegates to Node).
                if save_fn and stats_list:
                    try:
                        save_result = save_fn(stats_list)
                        result["inserted"] += save_result.get("inserted", 0)
                        result["updated"] += save_result.get("updated", 0)
                    except Exception as e:
                        print(f"[{self.platform_name}] work_id={work_id} 保存失败: {e}", flush=True)

                result["total_processed"] += 1
                processed_export_ids.add(eid)

                # Return to the list page for the next entry (lands on "all videos";
                # the next iteration re-selects single-video + last 30 days).
                await self.page.go_back()
                await asyncio.sleep(2)
            print(f"[{self.platform_name}] 批量同步完成: 处理 {result['total_processed']} 个作品, 跳过 {result['total_skipped']} 个", flush=True)
        except Exception as e:
            import traceback
            traceback.print_exc()
            result["success"] = False
            result["error"] = str(e)
        finally:
            try:
                await self.close_browser()
            except Exception:
                pass
        return result
|
|
|
+
|
|
|
async def get_comments(self, cookies: str, work_id: str, cursor: str = "") -> CommentsResult:
|
|
|
"""
|
|
|
获取视频号作品评论(完全参考 get_weixin_work_comments.py 的接口监听逻辑)
|