|
|
@@ -3018,7 +3018,269 @@ class BaijiahaoPublisher(BasePublisher):
|
|
|
has_more=has_more,
|
|
|
next_page=next_page
|
|
|
)
|
|
|
-
|
|
|
+
|
|
|
async def get_all_works(self, cookies: str) -> WorksResult:
    """
    Fetch the complete Baijiahao works list, paginating automatically.

    A single browser page is reused for every request instead of launching
    a new browser per page, which is slow and can trigger risk control.

    Args:
        cookies: Raw cookie string; parsed with ``self.parse_cookies``.

    Returns:
        WorksResult: ``success=True`` with every work collected (also when
        an error interrupts pagination after at least one work has been
        collected). ``success=False`` with ``error`` set when nothing could
        be fetched, including a failed request or API error on the very
        first page.
    """
    import re

    def _to_int(value) -> int:
        # Tolerant int conversion: a single malformed counter coming from
        # the API must not abort the whole synchronisation.
        try:
            return int(value or 0)
        except (TypeError, ValueError):
            return 0

    print(f"\n{'='*60}")
    print(f"[{self.platform_name}] 获取全部作品列表(自动分页)")
    print(f"{'='*60}")

    all_works: List[WorkItem] = []
    seen_ids = set()
    total = 0
    current_page = 1
    page_size = 20
    max_pages = 50  # hard cap on the number of pages requested

    try:
        cookie_list = self.parse_cookies(cookies)
        await self.init_browser()
        await self.set_cookies(cookie_list)

        if not self.page:
            raise Exception("Page not initialized")

        # Open the content-management page to establish the session and
        # make the token available for extraction.
        content_url = (
            "https://baijiahao.baidu.com/builder/rc/content"
            f"?currentPage={current_page}&pageSize={page_size}"
            "&search=&type=&collection=&startDate=&endDate="
        )
        await self.page.goto(content_url, wait_until="domcontentloaded", timeout=60000)
        await asyncio.sleep(3)

        # A redirect to a login page means the cookies are no longer valid.
        current_url = self.page.url
        if "passport.baidu.com" in current_url or "login" in current_url:
            raise Exception("Cookie 已过期,请重新登录百家号账号")

        # Look for a JWT-shaped token in web storage, <meta> tags and
        # well-known global state objects, in that order.
        token = await self.page.evaluate(
            """
            () => {
                const isJwtLike = (v) => {
                    if (!v || typeof v !== 'string') return false;
                    const s = v.trim();
                    if (s.length < 60) return false;
                    const parts = s.split('.');
                    if (parts.length !== 3) return false;
                    return parts.every(p => /^[A-Za-z0-9_-]+$/.test(p) && p.length > 10);
                };

                const pickFromStorage = (storage) => {
                    try {
                        const keys = Object.keys(storage || {});
                        for (const k of keys) {
                            const v = storage.getItem(k);
                            if (isJwtLike(v)) return v;
                        }
                    } catch {}
                    return "";
                };

                let t = pickFromStorage(window.localStorage);
                if (t) return t;
                t = pickFromStorage(window.sessionStorage);
                if (t) return t;

                const meta = document.querySelector('meta[name="token"], meta[name="bjh-token"]');
                const metaToken = meta && meta.getAttribute('content');
                if (isJwtLike(metaToken)) return metaToken;

                const candidates = [
                    (window.__INITIAL_STATE__ && window.__INITIAL_STATE__.token) || "",
                    (window.__PRELOADED_STATE__ && window.__PRELOADED_STATE__.token) || "",
                    (window.__NUXT__ && window.__NUXT__.state && window.__NUXT__.state.token) || "",
                ];
                for (const c of candidates) {
                    if (isJwtLike(c)) return c;
                }

                return "";
            }
            """
        )

        if not token:
            # Last resort: scan the raw HTML for a three-segment token.
            html = await self.page.content()
            m = re.search(r'([A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,})', html)
            if m:
                token = m.group(1)

        if not token:
            raise Exception("未能从页面提取 token(可能未登录或触发风控),请重新登录百家号账号后再试")

        print(f"[{self.platform_name}] ✓ Token 提取成功")

        def _pick_cover(item: dict) -> str:
            # Prefer the explicit cover fields, then the first entry of
            # "cover_images" (either a JSON string or an already-parsed list).
            cover = item.get("crosswise_cover") or item.get("vertical_cover") or ""
            if cover:
                return cover
            raw = item.get("cover_images") or ""
            try:
                parsed = json.loads(raw) if isinstance(raw, str) else raw
                if isinstance(parsed, list) and parsed:
                    first = parsed[0]
                    if isinstance(first, dict):
                        return first.get("src") or first.get("ori_src") or ""
                    if isinstance(first, str):
                        return first
            except (ValueError, TypeError, AttributeError):
                # Best effort: an unparsable cover simply yields no cover.
                pass
            return ""

        def _pick_duration(item: dict) -> int:
            # Try the flat duration fields first, then the nested
            # "displaytype_exinfo" payload.
            for k in ("rmb_duration", "duration", "long"):
                v = _to_int(item.get(k))
                if v > 0:
                    return v
            ex = item.get("displaytype_exinfo") or ""
            try:
                exj = json.loads(ex) if isinstance(ex, str) and ex else (ex if isinstance(ex, dict) else {})
                ugc = (exj.get("ugcvideo") or {}) if isinstance(exj, dict) else {}
                vi = ugc.get("video_info") or {}
                v = int(vi.get("durationInSecond") or ugc.get("long") or 0)
                return v if v > 0 else 0
            except (ValueError, TypeError, AttributeError):
                return 0

        def _pick_status(item: dict) -> str:
            # Map the platform status fields onto rejected / draft / published.
            qs = str(item.get("quality_status") or "").lower()
            st = str(item.get("status") or "").lower()
            if qs == "rejected" or "reject" in st:
                return "rejected"
            if st in ("draft", "unpublish", "unpublished"):
                return "draft"
            return "published"

        # Pagination loop
        for page_iter in range(max_pages):
            page_num = page_iter + 1  # the API's currentPage is 1-based
            api_url = (
                "https://baijiahao.baidu.com/pcui/article/lists"
                f"?currentPage={page_num}"
                f"&pageSize={page_size}"
                "&search=&type=&collection=&startDate=&endDate="
                "&clearBeforeFetch=false"
                "&dynamic=1"
            )

            # Issue the request from inside the page so that it carries the
            # browser session's cookies.
            resp = await self.page.evaluate(
                """
                async ({ url, token }) => {
                    const r = await fetch(url, {
                        method: 'GET',
                        credentials: 'include',
                        headers: {
                            'accept': 'application/json, text/plain, */*',
                            ...(token ? { token } : {}),
                        },
                    });
                    const text = await r.text();
                    return { ok: r.ok, status: r.status, text };
                }
                """,
                {"url": api_url, "token": token},
            )

            if not resp or not resp.get("ok"):
                http_status = resp.get("status") if isinstance(resp, dict) else "unknown"
                print(f"[{self.platform_name}] 第 {page_num} 页请求失败: HTTP {http_status}")
                if page_iter == 0:
                    # Nothing has been fetched yet: report a failure rather
                    # than a "successful" empty list.
                    raise Exception(f"百家号作品列表请求失败: HTTP {http_status}")
                break

            api_result = json.loads(resp.get("text") or "{}")
            if not isinstance(api_result, dict):
                # A non-object payload is handled as an API error below.
                api_result = {}
            errno = api_result.get("errno", -1)

            if errno != 0:
                errmsg = api_result.get("errmsg", "unknown error")
                print(f"[{self.platform_name}] 第 {page_num} 页接口错误: errno={errno}, errmsg={errmsg}")
                if errno in (110, 20040001):
                    raise Exception("百家号未登录或 Cookie/token 失效,请重新登录后再同步")
                if page_iter == 0:
                    # Same reasoning as above: do not hide a first-page
                    # API error behind an empty success result.
                    raise Exception(f"百家号作品列表接口错误: errno={errno}, errmsg={errmsg}")
                # Non-login error on a later page: stop and keep what we have.
                break

            data = api_result.get("data", {}) or {}
            items = data.get("list", []) or []
            page_info = data.get("page", {}) or {}

            if page_iter == 0:
                total = _to_int(page_info.get("totalCount"))
                print(f"[{self.platform_name}] 作品总数: {total}")

            new_count = 0
            for item in items:
                work_id = str(item.get("nid") or item.get("feed_id") or item.get("article_id") or item.get("id") or "")
                # Skip entries without an id and ids already collected.
                if not work_id or work_id in seen_ids:
                    continue
                seen_ids.add(work_id)
                new_count += 1
                all_works.append(
                    WorkItem(
                        work_id=work_id,
                        title=str(item.get("title") or ""),
                        cover_url=_pick_cover(item),
                        video_url=str(item.get("url") or ""),
                        duration=_pick_duration(item),
                        status=_pick_status(item),
                        publish_time=str(item.get("publish_time") or item.get("publish_at") or item.get("created_at") or ""),
                        play_count=_to_int(item.get("read_amount")),
                        like_count=_to_int(item.get("like_amount")),
                        comment_count=_to_int(item.get("comment_amount")),
                        share_count=_to_int(item.get("share_amount")),
                        collect_count=_to_int(item.get("collection_amount")),
                    )
                )

            total_page = _to_int(page_info.get("totalPage"))
            if total_page:
                has_more = page_num < total_page
            elif total:
                # totalPage missing: derive the remaining pages from totalCount.
                has_more = page_num * page_size < total
            else:
                # No paging metadata at all: a full page suggests more may follow.
                has_more = len(items) >= page_size

            print(f"[{self.platform_name}] 第 {page_num}/{total_page or '?'} 页: 获取 {new_count} 个新作品, 累计 {len(all_works)}")

            # An empty page or a page without new ids also ends pagination,
            # which guards against looping over repeated data.
            if not has_more or len(items) == 0 or new_count == 0:
                break

            # Short pause between pages to avoid triggering risk control.
            await asyncio.sleep(1)

        print(f"[{self.platform_name}] ✓ 自动分页完成,共获取 {len(all_works)} 个作品")

    except Exception as e:
        import traceback
        traceback.print_exc()
        # If some works were already collected, still report success with
        # the partial list.
        if all_works:
            print(f"[{self.platform_name}] 虽有异常但已获取 {len(all_works)} 个作品,正常返回")
            return WorksResult(
                success=True,
                platform=self.platform_name,
                works=all_works,
                total=total or len(all_works),
                has_more=False,
                next_page="",
            )
        return WorksResult(
            success=False,
            platform=self.platform_name,
            error=str(e),
            debug_info="baijiahao_get_all_works_failed"
        )

    return WorksResult(
        success=True,
        platform=self.platform_name,
        works=all_works,
        total=total or len(all_works),
        has_more=False,
        next_page="",
    )
|
|
|
+
|
|
|
async def get_article_stats(
|
|
|
self,
|
|
|
cookies: str,
|