@@ -8,6 +8,7 @@
 import asyncio
 import os
 import sys
+import time
 from pathlib import Path
 from typing import List
 from .base import (
@@ -86,8 +87,58 @@ class XiaohongshuPublisher(BasePublisher):
             raise Exception(f"Signing failed: {e}")

     def sign_sync(self, uri, data=None, a1="", web_session=""):
-        """Synchronous signing function, used by XhsClient."""
-        return asyncio.run(self.get_sign(uri, data, a1, web_session))
+        """
+        Synchronous signing function, used by XhsClient.
+
+        Note: the publish flow runs inside an asyncio event loop (started via asyncio.run),
+        so calling asyncio.run again here raises "asyncio.run() cannot be called from a running event loop".
+        We therefore use the synchronous sync_playwright implementation instead (following matrix/xhs_uploader).
+        """
+        try:
+            from playwright.sync_api import sync_playwright
+        except Exception as e:
+            raise Exception(f"playwright sync API is unavailable: {e}")
+
+        last_exc: Exception | None = None
+        for attempt in range(1, 6):
+            try:
+                with sync_playwright() as playwright:
+                    browser = playwright.chromium.launch(headless=True)
+                    context = browser.new_context()
+
+                    if STEALTH_JS_PATH.exists():
+                        context.add_init_script(path=str(STEALTH_JS_PATH))
+
+                    page = context.new_page()
+                    page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded", timeout=60000)
+
+                    if a1:
+                        context.add_cookies([
+                            {'name': 'a1', 'value': a1, 'domain': ".xiaohongshu.com", 'path': "/"}
+                        ])
+                        page.reload(wait_until="domcontentloaded")
+
+                    # Following matrix: wait after setting the cookie, or window._webmsxyw may not exist yet
+                    time.sleep(1.5)
+
+                    encrypt_params = page.evaluate(
+                        "([url, data]) => window._webmsxyw(url, data)",
+                        [uri, data]
+                    )
+
+                    context.close()
+                    browser.close()
+
+                    return {
+                        "x-s": encrypt_params["X-s"],
+                        "x-t": str(encrypt_params["X-t"])
+                    }
+            except Exception as e:
+                last_exc = e
+                # Back off slightly before retrying
+                time.sleep(0.4 * attempt)
+
+        raise Exception(f"Signing failed: {last_exc}")

     async def publish_via_api(self, cookies: str, params: PublishParams) -> PublishResult:
         """Publish a video via the API"""
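
The docstring above pins down the core constraint: asyncio.run() raises RuntimeError when the calling thread already has a running event loop, which is why the old one-liner broke once the publish flow itself ran under asyncio.run. An alternative that avoids launching a fresh Chromium per signature would be to run the existing async get_sign on a private loop in a worker thread. A minimal sketch of that idea follows; it is not what this diff implements, and it assumes get_sign touches no loop-bound state from the publish flow:

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def sign_sync_threaded(self, uri, data=None, a1="", web_session=""):
        # The worker thread has no running loop, so asyncio.run is legal there;
        # future.result() blocks until the coroutine completes.
        with ThreadPoolExecutor(max_workers=1) as pool:
            future = pool.submit(asyncio.run, self.get_sign(uri, data, a1, web_session))
            return future.result()

The diff instead isolates signing in its own sync_playwright browser, which also sidesteps sharing Playwright objects with the async flow. Either way the signer is handed over as a plain callable, roughly XhsClient(cookie, sign=publisher.sign_sync) per the xhs package convention.
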
@@ -596,29 +647,14 @@ class XiaohongshuPublisher(BasePublisher):
         # Remove the response listener
         self.page.remove_listener('response', handle_response)

-        # Process the captured data
-        import json
-        if captured_data:
-            print(f"[{self.platform_name}] Captured API data", flush=True)
-            data = captured_data.get('data', {})
-            notes = data.get('notes', [])
-            print(f"[{self.platform_name}] notes count: {len(notes)}", flush=True)
-
-            # Get the total from tags
-            tags = data.get('tags', [])
-            for tag in tags:
-                if tag.get('id') == 'special.note_time_desc':
-                    total = tag.get('notes_count', 0)
-                    break
-
-            has_more = data.get('page', -1) != -1
-
-            for note in notes:
+        # Process the captured data (with pagination, instead of only the first page)
+        def parse_notes(notes_list):
+            parsed = []
+            for note in notes_list:
                 note_id = note.get('id', '')
                 if not note_id:
                     continue

-                # Get the cover
                 cover_url = ''
                 images_list = note.get('images_list', [])
                 if images_list:
@@ -626,10 +662,8 @@ class XiaohongshuPublisher(BasePublisher):
                 if cover_url.startswith('http://'):
                     cover_url = cover_url.replace('http://', 'https://')

-                # Get the duration
                 duration = note.get('video_info', {}).get('duration', 0)

-                # Parse the status
                 status = 'published'
                 tab_status = note.get('tab_status', 1)
                 if tab_status == 0:
@@ -639,7 +673,7 @@ class XiaohongshuPublisher(BasePublisher):
                 elif tab_status == 3:
                     status = 'rejected'

-                works.append(WorkItem(
+                parsed.append(WorkItem(
                     work_id=note_id,
                     title=note.get('display_title', '') or 'Untitled',
                     cover_url=cover_url,
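
The next hunk replays parse_notes for every further page and stops as soon as a page contributes nothing new, rather than trusting the API's page field. The stopping rule in miniature (toy ids, illustrative only):

    # Accumulate pages until one is empty or contains only already-seen ids.
    seen, collected = set(), []
    for page_notes in ([{"id": "n1"}, {"id": "n2"}], [{"id": "n2"}, {"id": "n3"}], []):
        new = [n for n in page_notes if n["id"] not in seen]
        if not new:  # empty page or pure repeats -> treat as exhausted
            break
        seen.update(n["id"] for n in new)
        collected.extend(new)
    assert [n["id"] for n in collected] == ["n1", "n2", "n3"]
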
@@ -652,8 +686,83 @@ class XiaohongshuPublisher(BasePublisher):
                     share_count=note.get('shared_count', 0),
                     collect_count=note.get('collected_count', 0),
                 ))
+            return parsed
+
+        import json
+        if captured_data:
+            print(f"[{self.platform_name}] Captured API data", flush=True)
+            data = captured_data.get('data', {})
+            notes = data.get('notes', [])
+            print(f"[{self.platform_name}] notes count: {len(notes)}", flush=True)
+
+            # Get the total from tags
+            tags = data.get('tags', [])
+            for tag in tags:
+                if tag.get('id') == 'special.note_time_desc':
+                    total = tag.get('notes_count', 0)
+                    break
+
+            works.extend(parse_notes(notes))
+
+            # Fetch the remaining pages; don't trust data.page (it can falsely report -1), stop only when a page yields nothing new
+            max_pages = 30
+            page_num = 1  # page=0 was already captured above
+            seen_note_ids = {w.work_id for w in works}
+            has_more = True
+
+            while has_more and page_num < max_pages:
+                try:
+                    next_resp = await self.page.evaluate(
+                        """async (p) => {
+                            const res = await fetch(`https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${p}`, {
+                                method: 'GET',
+                                credentials: 'include',
+                                headers: { 'Accept': 'application/json' }
+                            });
+                            return await res.json();
+                        }""",
+                        page_num
+                    )
+                except Exception as fetch_err:
+                    print(f"[{self.platform_name}] Pagination request failed, page={page_num}: {fetch_err}", flush=True)
+                    break
+
+                if not next_resp:
+                    break
+
+                if not (next_resp.get('success') or next_resp.get('code') == 0):
+                    break
+
+                next_data = next_resp.get('data', {})
+                next_notes = next_data.get('notes', []) or []
+
+                if not next_notes:
+                    has_more = False
+                    break
+
+                parsed_next = parse_notes(next_notes)
+                new_items = [w for w in parsed_next if w.work_id and w.work_id not in seen_note_ids]
+                if not new_items:
+                    # Nothing new on this page; stop
+                    has_more = False
+                    break
+
+                for w in new_items:
+                    seen_note_ids.add(w.work_id)
+                works.extend(new_items)
+
+                # Update the total (if the first page did not provide it)
+                if not total and next_data.get('tags'):
+                    for tag in next_data.get('tags', []):
+                        if tag.get('id') == 'special.note_time_desc':
+                            total = tag.get('notes_count', 0)
+                            break
+
+                page_num += 1

-            print(f"[{self.platform_name}] Parsed {len(works)} works, total: {total}", flush=True)
+            # Pagination finished; has_more stays True only if the max_pages cap was hit first
+            if not has_more:
+                print(f"[{self.platform_name}] Fetched all pages, {len(works)} works in total", flush=True)
         else:
             print(f"[{self.platform_name}] Failed to capture API data", flush=True)
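
For reference, the response shape the pagination loop assumes, reconstructed from the fields the code reads (key names come from this diff; the values are illustrative):

    example_resp = {
        "success": True,          # the loop also accepts "code": 0
        "data": {
            "notes": [],          # one page of notes; empty when exhausted
            "tags": [
                # carries the overall count the code stores in `total`
                {"id": "special.note_time_desc", "notes_count": 123},
            ],
            "page": -1,           # present but unreliable, hence the dedup-based stop
        },
    }

Because the fetch runs inside page.evaluate with credentials: 'include', each pagination request rides on the logged-in creator session's cookies instead of going through the signer.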