xiaohongshu.py 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096
  1. # -*- coding: utf-8 -*-
  2. """
  3. 小红书视频发布器
  4. 参考: matrix/xhs_uploader/main.py
  5. 使用 xhs SDK API 方式发布,更稳定
  6. """
  7. import asyncio
  8. import os
  9. import sys
  10. import time
  11. from pathlib import Path
  12. from typing import List
  13. from .base import (
  14. BasePublisher, PublishParams, PublishResult,
  15. WorkItem, WorksResult, CommentItem, CommentsResult
  16. )
  # Directory of the companion "matrix" project, four levels above this file.
  # It is pushed to the front of sys.path so code living there can be imported.
  MATRIX_PATH = Path(__file__).parent.parent.parent.parent / "matrix"
  sys.path.insert(0, str(MATRIX_PATH))

  # The xhs SDK is optional. Start from the "not installed" state and only
  # flip the flag once the import has actually succeeded.
  XhsClient = None
  XHS_SDK_AVAILABLE = False
  try:
      from xhs import XhsClient
  except ImportError:
      print("[Warning] xhs 库未安装,请运行: pip install xhs")
  else:
      XHS_SDK_AVAILABLE = True

  # Init script injected into the signing browser context when present on disk.
  STEALTH_JS_PATH = MATRIX_PATH.joinpath("xhs-api", "js", "stealth.min.js")
  30. class XiaohongshuPublisher(BasePublisher):
  31. """
  32. 小红书视频发布器
  33. 优先使用 xhs SDK API 方式发布
  34. """
  35. platform_name = "xiaohongshu"
  36. login_url = "https://creator.xiaohongshu.com/"
  37. publish_url = "https://creator.xiaohongshu.com/publish/publish"
  38. cookie_domain = ".xiaohongshu.com"
  async def get_sign(self, uri: str, data=None, a1: str = "", web_session: str = ""):
      """Compute the Xiaohongshu request signature for ``uri`` in a headless browser.

      Opens https://www.xiaohongshu.com in headless Chromium (adding the
      stealth init script when it exists on disk), optionally installs the
      ``a1`` cookie, and calls the page's ``window._webmsxyw(url, data)``.

      Args:
          uri: Request path handed to the page's signing function.
          data: Request payload handed over alongside ``uri`` (may be ``None``).
          a1: Value for the ``a1`` cookie; no cookie is set when empty.
          web_session: Accepted for signature compatibility; not read here.

      Returns:
          ``{"x-s": ..., "x-t": ...}`` taken from the ``X-s`` / ``X-t`` fields
          of the signing result (``x-t`` is converted to ``str``).

      Raises:
          Exception: Wraps any failure. The traceback is printed first and the
              original error is chained as ``__cause__``.
      """
      from playwright.async_api import async_playwright

      try:
          async with async_playwright() as playwright:
              browser = await playwright.chromium.launch(headless=True)
              try:
                  browser_context = await browser.new_context()
                  if STEALTH_JS_PATH.exists():
                      await browser_context.add_init_script(path=str(STEALTH_JS_PATH))

                  page = await browser_context.new_page()
                  await page.goto("https://www.xiaohongshu.com")
                  await asyncio.sleep(1)
                  await page.reload()
                  await asyncio.sleep(1)

                  if a1:
                      await browser_context.add_cookies([
                          {'name': 'a1', 'value': a1, 'domain': ".xiaohongshu.com", 'path': "/"}
                      ])
                      # Reload so the page runs with the freshly installed cookie.
                      await page.reload()
                      await asyncio.sleep(0.5)

                  encrypt_params = await page.evaluate(
                      "([url, data]) => window._webmsxyw(url, data)",
                      [uri, data]
                  )
              finally:
                  # Always release the browser, including when navigation or
                  # evaluation fails; closing it also disposes its pages.
                  await browser.close()

              return {
                  "x-s": encrypt_params["X-s"],
                  "x-t": str(encrypt_params["X-t"])
              }
      except Exception as e:
          import traceback
          traceback.print_exc()
          raise Exception(f"签名失败: {e}") from e
  def sign_sync(self, uri, data=None, a1="", web_session=""):
      """Synchronous signer, suitable as the ``sign`` callback of ``XhsClient``.

      Playwright's sync API raises if it is started on a thread that already
      has a running asyncio event loop. When this method is called from such a
      thread, the signing work is therefore moved to a short-lived worker
      thread and this call blocks until it finishes. Otherwise the work runs
      directly on the calling thread.

      Args:
          uri: Request path handed to the page's signing function.
          data: Request payload handed over alongside ``uri`` (may be ``None``).
          a1: Value for the ``a1`` cookie; no cookie is set when empty.
          web_session: Accepted for signature compatibility; not read here.

      Returns:
          ``{"x-s": ..., "x-t": ...}`` taken from the signing result.

      Raises:
          Exception: If the Playwright sync API cannot be imported, or if every
              signing attempt fails (the last error is chained).
      """
      try:
          from playwright.sync_api import sync_playwright
      except Exception as e:
          raise Exception(f"缺少 playwright 同步接口支持: {e}") from e

      try:
          asyncio.get_running_loop()
      except RuntimeError:
          # No event loop is running on this thread: drive the sync API here.
          return self._sign_sync_with_retries(sync_playwright, uri, data, a1)

      # A loop is running on this thread; the sync API would refuse to start.
      from concurrent.futures import ThreadPoolExecutor
      with ThreadPoolExecutor(max_workers=1) as pool:
          future = pool.submit(self._sign_sync_with_retries, sync_playwright, uri, data, a1)
          return future.result()

  def _sign_sync_with_retries(self, sync_playwright, uri, data, a1, attempts=5):
      """Run the browser-based signing up to ``attempts`` times with a short back-off."""
      last_exc = None
      for attempt in range(1, attempts + 1):
          try:
              with sync_playwright() as playwright:
                  browser = playwright.chromium.launch(headless=True)
                  try:
                      context = browser.new_context()
                      if STEALTH_JS_PATH.exists():
                          context.add_init_script(path=str(STEALTH_JS_PATH))

                      page = context.new_page()
                      page.goto("https://www.xiaohongshu.com", wait_until="domcontentloaded", timeout=60000)

                      if a1:
                          context.add_cookies([
                              {'name': 'a1', 'value': a1, 'domain': ".xiaohongshu.com", 'path': "/"}
                          ])
                          page.reload(wait_until="domcontentloaded")

                      # Give the page time to settle; evaluating too early can
                      # find window._webmsxyw still undefined.
                      time.sleep(1.5)

                      encrypt_params = page.evaluate(
                          "([url, data]) => window._webmsxyw(url, data)",
                          [uri, data]
                      )
                  finally:
                      # Release the browser on failure as well as on success.
                      browser.close()

              # Built inside the try so a malformed result is retried too.
              return {
                  "x-s": encrypt_params["X-s"],
                  "x-t": str(encrypt_params["X-t"])
              }
          except Exception as e:
              last_exc = e
              # Light linear back-off; pointless after the final attempt.
              if attempt < attempts:
                  time.sleep(0.4 * attempt)

      raise Exception(f"签名失败: {last_exc}") from last_exc
  async def publish_via_api(self, cookies: str, params: PublishParams) -> PublishResult:
      """Publish a video note through the xhs SDK.

      The SDK call is blocking (it uploads the file and invokes the
      synchronous ``sign_sync`` callback), so it is executed on the event
      loop's default executor instead of on the loop thread.

      Args:
          cookies: Raw cookie payload. It is parsed with ``self.parse_cookies``
              and re-serialised; if parsing yields nothing the raw value is
              passed to the SDK unchanged.
          params: Publish parameters; reads ``video_path``, ``title``,
              ``description``, ``tags``, ``publish_date`` and ``cover_path``.

      Returns:
          A successful ``PublishResult`` carrying ``note_id`` and ``url`` from
          the SDK response (empty strings when absent).

      Raises:
          Exception: If the SDK is not installed, the SDK call fails, the SDK
              returns an empty result, or the response reports an error.
      """
      if not XHS_SDK_AVAILABLE:
          raise Exception("xhs SDK 未安装,请运行: pip install xhs")

      self.report_progress(10, "正在通过 API 发布...")
      print(f"[{self.platform_name}] 使用 XHS SDK API 发布...")
      print(f"[{self.platform_name}] 视频路径: {params.video_path}")
      print(f"[{self.platform_name}] 标题: {params.title}")

      # Normalise the cookie payload into the header-style string the SDK takes.
      cookie_list = self.parse_cookies(cookies)
      cookie_string = self.cookies_to_string(cookie_list) if cookie_list else cookies
      print(f"[{self.platform_name}] Cookie 长度: {len(cookie_string)}")

      self.report_progress(20, "正在上传视频...")

      xhs_client = XhsClient(cookie_string, sign=self.sign_sync)
      print(f"[{self.platform_name}] 开始调用 create_video_note...")

      post_time = params.publish_date.strftime("%Y-%m-%d %H:%M:%S") if params.publish_date else None
      # Only forward a cover that actually exists on disk.
      cover_path = params.cover_path if params.cover_path and os.path.exists(params.cover_path) else None

      def _create_note():
          return xhs_client.create_video_note(
              title=params.title,
              desc=params.description or params.title,
              topics=params.tags or [],
              post_time=post_time,
              video_path=params.video_path,
              cover_path=cover_path
          )

      try:
          # Worker thread: keeps the loop responsive and gives the sync
          # signing callback a thread without a running event loop.
          result = await asyncio.get_running_loop().run_in_executor(None, _create_note)
          print(f"[{self.platform_name}] SDK 返回结果: {result}")
      except Exception as e:
          import traceback
          traceback.print_exc()
          print(f"[{self.platform_name}] SDK 调用失败: {e}")
          raise Exception(f"XHS SDK 发布失败: {e}") from e

      if not result:
          raise Exception("XHS SDK 返回空结果")

      note_id = ""
      video_url = ""
      if isinstance(result, dict):
          # Any truthy code is by definition non-zero, i.e. an error code.
          if result.get("code"):
              raise Exception(f"发布失败: {result.get('msg', '未知错误')}")
          if result.get("success") is False:
              raise Exception(f"发布失败: {result.get('msg', result.get('error', '未知错误'))}")
          note_id = result.get("note_id", "")
          video_url = result.get("url", "")

      if not note_id:
          print(f"[{self.platform_name}] 警告: 未获取到 note_id,返回结果: {result}")

      self.report_progress(100, "发布成功")
      print(f"[{self.platform_name}] 发布成功! note_id={note_id}, url={video_url}")

      return PublishResult(
          success=True,
          platform=self.platform_name,
          video_id=note_id,
          video_url=video_url,
          message="发布成功"
      )
  async def publish(self, cookies: str, params: PublishParams) -> PublishResult:
      """Publish a video to Xiaohongshu, trying the SDK first and the browser second.

      After checking that the video file exists, the xhs SDK route is tried
      when the SDK is importable. Its result is returned when it succeeded, or
      when it carries an error that does not contain "请刷新". Any other
      outcome, including an exception from the SDK route, falls through to
      the Playwright route.

      Raises:
          Exception: If ``params.video_path`` does not exist.
      """
      tag = self.platform_name
      rule = '=' * 60
      print(f"\n{rule}")
      print(f"[{tag}] 开始发布视频")
      print(f"[{tag}] 视频路径: {params.video_path}")
      print(f"[{tag}] 标题: {params.title}")
      print(f"[{tag}] Headless: {self.headless}")
      print(f"[{tag}] XHS SDK 可用: {XHS_SDK_AVAILABLE}")
      print(f"{rule}")

      # Refuse to start without the source file.
      video_missing = not os.path.exists(params.video_path)
      if video_missing:
          raise Exception(f"视频文件不存在: {params.video_path}")
      print(f"[{tag}] 视频文件存在,大小: {os.path.getsize(params.video_path)} bytes")

      self.report_progress(5, "正在准备发布...")

      if XHS_SDK_AVAILABLE:
          try:
              print(f"[{self.platform_name}] 尝试使用 XHS SDK API 发布...")
              api_result = await self.publish_via_api(cookies, params)
              print(f"[{self.platform_name}] API 发布完成: success={api_result.success}")
              # Return on success, or on a concrete failure that is not a
              # "please refresh" style error.
              if api_result.success or (api_result.error and "请刷新" not in api_result.error):
                  return api_result
              print(f"[{self.platform_name}] API 方式未成功,尝试 Playwright...")
          except Exception as api_error:
              import traceback
              traceback.print_exc()
              print(f"[{self.platform_name}] API 发布失败: {api_error}")
              print(f"[{self.platform_name}] 尝试使用 Playwright 方式...")

      # Fallback route: drive the creator web page.
      print(f"[{self.platform_name}] 使用 Playwright 方式发布...")
      fallback_result = await self.publish_via_playwright(cookies, params)
      return fallback_result
  async def publish_via_playwright(self, cookies: str, params: PublishParams) -> PublishResult:
      """Publish a video by driving the creator-centre web page with Playwright.

      Steps, in order: start the browser and install cookies, open the publish
      page, stop early if redirected to login or a captcha is detected, upload
      the video, wait for the editor to appear, fill title / description /
      tags, click the publish button, then poll for a result.

      Args:
          cookies: Raw cookie payload, parsed with ``self.parse_cookies``.
          params: Publish parameters; reads ``video_path``, ``title``,
              ``description`` and ``tags``.

      Returns:
          A ``PublishResult``. Failures that need a human (login, captcha,
          upload not triggered, upload timeout, unconfirmed result) are
          returned with ``success=False`` plus a screenshot and page URL.

      Raises:
          Exception: If the page was not initialised, no clickable publish
              button was found, or the page shows an error element with text
              after clicking publish.

      NOTE(review): this method never calls ``close_browser``; presumably the
      caller owns browser shutdown. Confirm.
      """
      self.report_progress(10, "正在初始化浏览器...")
      print(f"[{self.platform_name}] Playwright 方式开始...")

      await self.init_browser()
      cookie_list = self.parse_cookies(cookies)
      print(f"[{self.platform_name}] 设置 {len(cookie_list)} 个 cookies")
      await self.set_cookies(cookie_list)

      if not self.page:
          raise Exception("Page not initialized")

      self.report_progress(15, "正在打开发布页面...")

      # Go straight to the video publish page.
      publish_url = "https://creator.xiaohongshu.com/publish/publish?source=official"
      print(f"[{self.platform_name}] 打开页面: {publish_url}")
      await self.page.goto(publish_url)
      await asyncio.sleep(3)

      current_url = self.page.url
      print(f"[{self.platform_name}] 当前 URL: {current_url}")

      # A redirect to a login/passport URL means the cookies are no longer valid.
      if "login" in current_url or "passport" in current_url:
          screenshot_base64 = await self.capture_screenshot()
          return PublishResult(
              success=False,
              platform=self.platform_name,
              error="登录已过期,请重新登录",
              screenshot_base64=screenshot_base64,
              page_url=current_url,
              status='need_captcha',
              need_captcha=True,
              captcha_type='login'
          )

      # Ask the AI-based detector whether a captcha is on screen.
      ai_captcha = await self.ai_check_captcha()
      if ai_captcha['has_captcha']:
          print(f"[{self.platform_name}] AI检测到验证码: {ai_captcha['captcha_type']}", flush=True)
          screenshot_base64 = await self.capture_screenshot()
          return PublishResult(
              success=False,
              platform=self.platform_name,
              error=f"检测到{ai_captcha['captcha_type']}验证码,需要使用有头浏览器完成验证",
              screenshot_base64=screenshot_base64,
              page_url=current_url,
              status='need_captcha',
              need_captcha=True,
              captcha_type=ai_captcha['captcha_type']
          )

      self.report_progress(20, "正在上传视频...")

      # Let the page finish rendering before looking for upload controls.
      await asyncio.sleep(2)

      upload_triggered = False

      # Strategy 1: set the file directly on a (possibly hidden) file input.
      print(f"[{self.platform_name}] 尝试方法1: 设置 file input")
      file_inputs = self.page.locator('input[type="file"]')
      input_count = await file_inputs.count()
      print(f"[{self.platform_name}] 找到 {input_count} 个 file input")

      for i in range(input_count):
          input_el = file_inputs.nth(i)
          accept = await input_el.get_attribute('accept') or ''
          print(f"[{self.platform_name}] Input {i} accept: {accept}")
          # Use the first input that accepts video, anything, or has no filter.
          if 'video' in accept or '*' in accept or not accept:
              await input_el.set_input_files(params.video_path)
              upload_triggered = True
              print(f"[{self.platform_name}] 视频文件已设置到 input {i}")
              break

      # Strategy 2: click the upload area and answer the native file chooser.
      if not upload_triggered:
          print(f"[{self.platform_name}] 尝试方法2: 点击上传区域")
          try:
              upload_area = self.page.locator('[class*="upload-wrapper"], [class*="upload-area"], .upload-input').first
              if await upload_area.count() > 0:
                  async with self.page.expect_file_chooser(timeout=5000) as fc_info:
                      await upload_area.click()
                  file_chooser = await fc_info.value
                  await file_chooser.set_files(params.video_path)
                  upload_triggered = True
                  print(f"[{self.platform_name}] 通过点击上传区域上传成功")
          except Exception as e:
              print(f"[{self.platform_name}] 方法2失败: {e}")

      if not upload_triggered:
          screenshot_base64 = await self.capture_screenshot()
          page_url = await self.get_page_url()
          return PublishResult(
              success=False,
              platform=self.platform_name,
              error="无法上传视频文件",
              screenshot_base64=screenshot_base64,
              page_url=page_url,
              status='need_action'
          )

      self.report_progress(40, "等待视频上传完成...")
      print(f"[{self.platform_name}] 等待视频上传和处理...")

      # Poll every 3 s, up to 60 times (about 3 minutes), for the editor UI.
      upload_complete = False
      for i in range(60):
          await asyncio.sleep(3)

          title_input_count = await self.page.locator('input[placeholder*="标题"], input[placeholder*="填写标题"]').count()
          editor_count = await self.page.locator('[class*="ql-editor"], [contenteditable="true"]').count()
          publish_btn_count = await self.page.locator('.publishBtn, button:has-text("发布")').count()

          print(f"[{self.platform_name}] 检测 {i+1}: 标题框={title_input_count}, 编辑器={editor_count}, 发布按钮={publish_btn_count}")

          # A title box, or an editor together with a publish button, is
          # taken as the sign that the upload has finished.
          if title_input_count > 0 or (editor_count > 0 and publish_btn_count > 0):
              upload_complete = True
              print(f"[{self.platform_name}] 视频上传完成!")
              break

      if not upload_complete:
          screenshot_base64 = await self.capture_screenshot()
          page_url = await self.get_page_url()
          return PublishResult(
              success=False,
              platform=self.platform_name,
              error="视频上传超时",
              screenshot_base64=screenshot_base64,
              page_url=page_url,
              status='need_action'
          )

      await asyncio.sleep(2)

      self.report_progress(60, "正在填写笔记信息...")
      print(f"[{self.platform_name}] 填写标题: {params.title[:20]}")

      # Title: the first matching selector wins; the text is cut to 20 chars.
      title_filled = False
      title_selectors = [
          'input[placeholder*="标题"]',
          'input[placeholder*="填写标题"]',
          '[class*="title"] input',
          '.c-input_inner',
      ]
      for selector in title_selectors:
          title_input = self.page.locator(selector).first
          if await title_input.count() > 0:
              await title_input.click()
              await title_input.fill('')  # clear any existing text first
              await title_input.fill(params.title[:20])
              title_filled = True
              print(f"[{self.platform_name}] 标题已填写,使用选择器: {selector}")
              break

      if not title_filled:
          print(f"[{self.platform_name}] 警告: 未找到标题输入框")

      # Description and tags are typed into the rich-text editor.
      if params.description or params.tags:
          desc_filled = False
          desc_selectors = [
              '[class*="ql-editor"]',
              '[class*="content-input"] [contenteditable="true"]',
              '[class*="editor"] [contenteditable="true"]',
              '.ql-editor',
          ]
          for selector in desc_selectors:
              desc_input = self.page.locator(selector).first
              if await desc_input.count() > 0:
                  await desc_input.click()
                  await asyncio.sleep(0.5)

                  if params.description:
                      await self.page.keyboard.type(params.description, delay=20)
                      print(f"[{self.platform_name}] 描述已填写")

                  if params.tags:
                      # Tags go on a new line; at most five are typed.
                      await self.page.keyboard.press("Enter")
                      for tag in params.tags[:5]:
                          await self.page.keyboard.type(f"#{tag}", delay=20)
                          await asyncio.sleep(0.3)
                          await self.page.keyboard.press("Space")
                      print(f"[{self.platform_name}] 标签已填写: {params.tags[:5]}")

                  desc_filled = True
                  break

          if not desc_filled:
              print(f"[{self.platform_name}] 警告: 未找到描述输入框")

      await asyncio.sleep(2)
      self.report_progress(80, "正在发布...")
      await asyncio.sleep(2)

      # Scroll to the bottom so the publish button is in view.
      await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
      await asyncio.sleep(1)

      print(f"[{self.platform_name}] 查找发布按钮...")

      publish_selectors = [
          'button.publishBtn',
          '.publishBtn',
          'button.d-button.red',
          'button:has-text("发布"):not(:has-text("定时发布"))',
          '[class*="publish"][class*="btn"]',
      ]

      publish_clicked = False
      for selector in publish_selectors:
          try:
              btn = self.page.locator(selector).first
              if await btn.count() > 0:
                  is_visible = await btn.is_visible()
                  is_enabled = await btn.is_enabled()
                  print(f"[{self.platform_name}] 按钮 {selector}: visible={is_visible}, enabled={is_enabled}")

                  if is_visible and is_enabled:
                      box = await btn.bounding_box()
                      if box:
                          print(f"[{self.platform_name}] 点击发布按钮: {selector}, 位置: ({box['x']}, {box['y']})")
                          # Click the button centre with the mouse rather than
                          # using locator.click().
                          await self.page.mouse.click(box['x'] + box['width']/2, box['y'] + box['height']/2)
                          publish_clicked = True
                          break
          except Exception as e:
              print(f"[{self.platform_name}] 选择器 {selector} 错误: {e}")

      if not publish_clicked:
          # Leave debugging evidence behind: a screenshot and a button dump.
          screenshot_path = f"debug_publish_failed_{self.platform_name}.png"
          await self.page.screenshot(path=screenshot_path, full_page=True)
          print(f"[{self.platform_name}] 未找到发布按钮,截图保存到: {screenshot_path}")

          buttons = await self.page.query_selector_all('button')
          print(f"[{self.platform_name}] 页面上共有 {len(buttons)} 个按钮")
          for i, btn in enumerate(buttons[:10]):
              text = await btn.text_content() or ''
              cls = await btn.get_attribute('class') or ''
              print(f" 按钮 {i}: text='{text.strip()[:30]}', class='{cls[:50]}'")

          raise Exception("未找到发布按钮")

      print(f"[{self.platform_name}] 已点击发布按钮,等待发布完成...")
      self.report_progress(90, "等待发布结果...")

      # Poll once per second, up to 20 times, for a URL change or a toast.
      publish_success = False
      for i in range(20):
          await asyncio.sleep(1)
          current_url = self.page.url

          # Success signal 1: navigation away to a success/content page.
          if "published=true" in current_url or "success" in current_url or "content" in current_url:
              publish_success = True
              print(f"[{self.platform_name}] 发布成功! 跳转到: {current_url}")
              break

          # Success signal 2: a visible success toast. Only ordinary errors
          # are ignored so task cancellation still propagates.
          try:
              success_msg = await self.page.locator('[class*="success"], .toast-success, [class*="Toast"]').first.is_visible()
          except Exception:
              success_msg = False
          if success_msg:
              publish_success = True
              print(f"[{self.platform_name}] 检测到成功提示!")
              break

          # Failure signal: an error-looking element that actually has text.
          # Problems while probing the page are not publish failures.
          error_text = None
          try:
              error_elements = self.page.locator('[class*="error"], .toast-error, [class*="fail"]')
              if await error_elements.count() > 0:
                  error_text = await error_elements.first.text_content()
          except Exception:
              error_text = None
          if error_text and error_text.strip():
              raise Exception(f"发布失败: {error_text.strip()}")

      # No clear success signal: hand a screenshot back for inspection.
      if not publish_success:
          final_url = self.page.url
          print(f"[{self.platform_name}] 发布结果不确定,当前 URL: {final_url}")

          screenshot_base64 = await self.capture_screenshot()
          print(f"[{self.platform_name}] 已获取截图供 AI 分析")

          # Still on the publish page: further action may be required.
          if "publish/publish" in final_url:
              return PublishResult(
                  success=False,
                  platform=self.platform_name,
                  error="发布结果待确认,请查看截图",
                  screenshot_base64=screenshot_base64,
                  page_url=final_url,
                  status='need_action'
              )

      self.report_progress(100, "发布完成")
      print(f"[{self.platform_name}] Playwright 方式发布完成!")

      screenshot_base64 = await self.capture_screenshot()
      page_url = await self.get_page_url()

      return PublishResult(
          success=True,
          platform=self.platform_name,
          message="发布完成",
          screenshot_base64=screenshot_base64,
          page_url=page_url,
          status='success'
      )
  async def get_works(self, cookies: str, page: int = 0, page_size: int = 20) -> WorksResult:
      """Fetch the account's notes from the creator note-manager page.

      The first page of data is captured by listening to the API response the
      note-manager page triggers on load. Further pages are then fetched from
      inside the page with ``fetch`` until a page returns no new notes, a
      request fails, or 30 pages have been read.

      Args:
          cookies: Raw cookie payload, parsed with ``self.parse_cookies``.
          page: Only logged; every available page is fetched regardless.
          page_size: Only logged; not sent to the API.

      Returns:
          ``WorksResult`` with ``success=True`` and the collected works (an
          empty list when no API response was captured), or ``success=False``
          with the error text if anything raised. The browser is closed in
          both cases.
      """
      print(f"\n{'='*60}", flush=True)
      print(f"[{self.platform_name}] 获取作品列表", flush=True)
      print(f"[{self.platform_name}] page={page}, page_size={page_size}", flush=True)
      print(f"{'='*60}", flush=True)

      works: List[WorkItem] = []
      total = 0
      has_more = False
      captured_data = {}

      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)
          print(f"[{self.platform_name}] 解析到 {len(cookie_list)} 个 cookies", flush=True)
          await self.set_cookies(cookie_list)

          if not self.page:
              raise Exception("Page not initialized")

          async def handle_response(response):
              """Keep the latest successful note-list API payload the page receives."""
              nonlocal captured_data
              url = response.url
              if 'creator/note/user/posted' in url or 'creator/note_list' in url:
                  try:
                      json_data = await response.json()
                      print(f"[{self.platform_name}] 捕获到 API 响应: {url[:80]}...", flush=True)
                      if json_data.get('success') or json_data.get('code') == 0:
                          captured_data = json_data
                          data_keys = list((json_data.get('data') or {}).keys())
                          print(f"[{self.platform_name}] API 响应成功,data keys: {data_keys}", flush=True)
                  except Exception as e:
                      print(f"[{self.platform_name}] 解析响应失败: {e}", flush=True)

          self.page.on('response', handle_response)
          print(f"[{self.platform_name}] 已注册 API 响应监听器", flush=True)

          # Loading the note manager makes the page issue the list request.
          print(f"[{self.platform_name}] 访问笔记管理页面...", flush=True)
          try:
              await self.page.goto("https://creator.xiaohongshu.com/new/note-manager", wait_until="domcontentloaded", timeout=30000)
          except Exception as nav_error:
              # A navigation timeout is tolerated; the response may have
              # arrived anyway.
              print(f"[{self.platform_name}] 导航超时,但继续尝试: {nav_error}", flush=True)

          # Give the API response time to arrive.
          await asyncio.sleep(5)

          current_url = self.page.url
          print(f"[{self.platform_name}] 当前页面: {current_url}", flush=True)
          if "login" in current_url:
              raise Exception("Cookie 已过期,请重新登录")

          if not captured_data:
              print(f"[{self.platform_name}] 等待 API 响应...", flush=True)
              await asyncio.sleep(5)

          self.page.remove_listener('response', handle_response)

          status_by_tab = {0: 'draft', 2: 'reviewing', 3: 'rejected'}

          def parse_notes(notes_list):
              """Convert raw note dicts to WorkItem objects, skipping notes without an id."""
              parsed = []
              for note in notes_list:
                  note_id = note.get('id', '')
                  if not note_id:
                      continue

                  # "or" fallbacks also cover keys present with a null value.
                  cover_url = ''
                  images_list = note.get('images_list') or []
                  if images_list:
                      cover_url = (images_list[0] or {}).get('url', '') or ''
                      if cover_url.startswith('http://'):
                          cover_url = cover_url.replace('http://', 'https://')

                  duration = (note.get('video_info') or {}).get('duration', 0)
                  # Any tab_status other than 0/2/3 is reported as published.
                  status = status_by_tab.get(note.get('tab_status', 1), 'published')

                  parsed.append(WorkItem(
                      work_id=note_id,
                      title=note.get('display_title', '') or '无标题',
                      cover_url=cover_url,
                      duration=duration,
                      status=status,
                      publish_time=note.get('time', ''),
                      play_count=note.get('view_count', 0),
                      like_count=note.get('likes', 0),
                      comment_count=note.get('comments_count', 0),
                      share_count=note.get('shared_count', 0),
                      collect_count=note.get('collected_count', 0),
                  ))
              return parsed

          def notes_total(payload):
              """Return notes_count of the 'special.note_time_desc' tag, or 0 if absent."""
              for tag in payload.get('tags') or []:
                  if tag.get('id') == 'special.note_time_desc':
                      return tag.get('notes_count', 0)
              return 0

          if captured_data:
              print(f"[{self.platform_name}] 成功捕获到 API 数据", flush=True)
              data = captured_data.get('data') or {}
              notes = data.get('notes') or []
              print(f"[{self.platform_name}] notes 数量: {len(notes)}", flush=True)

              total = notes_total(data)
              works.extend(parse_notes(notes))

              # Page through the rest. The loop stops on the first page with
              # no new notes instead of trusting a page marker in the payload.
              max_pages = 30
              page_num = 1  # page 0 was captured from the page load
              seen_note_ids = {w.work_id for w in works}
              has_more = True
              while has_more and page_num < max_pages:
                  try:
                      next_resp = await self.page.evaluate(
                          """async (p) => {
                              const res = await fetch(`https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${p}`, {
                                  method: 'GET',
                                  credentials: 'include',
                                  headers: { 'Accept': 'application/json' }
                              });
                              return await res.json();
                          }""",
                          page_num
                      )
                  except Exception as fetch_err:
                      print(f"[{self.platform_name}] 分页请求异常 page={page_num}: {fetch_err}", flush=True)
                      break

                  # On a failed request has_more stays True: more pages may
                  # exist but could not be read.
                  if not next_resp:
                      break
                  if not (next_resp.get('success') or next_resp.get('code') == 0):
                      break

                  next_data = next_resp.get('data') or {}
                  next_notes = next_data.get('notes') or []
                  if not next_notes:
                      has_more = False
                      break

                  parsed_next = parse_notes(next_notes)
                  new_items = [w for w in parsed_next if w.work_id and w.work_id not in seen_note_ids]
                  if not new_items:
                      # Only already-seen notes came back: nothing left.
                      has_more = False
                      break

                  seen_note_ids.update(w.work_id for w in new_items)
                  works.extend(new_items)

                  # Pick up the total here if the first page did not carry it.
                  if not total:
                      total = notes_total(next_data)

                  page_num += 1

              if not has_more:
                  print(f"[{self.platform_name}] 已抓取所有分页,共 {len(works)} 条", flush=True)
          else:
              print(f"[{self.platform_name}] 未能捕获到 API 数据", flush=True)

      except Exception as e:
          import traceback
          print(f"[{self.platform_name}] 发生异常: {e}", flush=True)
          traceback.print_exc()
          return WorksResult(
              success=False,
              platform=self.platform_name,
              error=str(e)
          )
      finally:
          # Always shut the browser down, on success and on failure.
          await self.close_browser()

      return WorksResult(
          success=True,
          platform=self.platform_name,
          works=works,
          total=total or len(works),
          has_more=has_more
      )
  async def get_comments(self, cookies: str, work_id: str, cursor: str = "") -> CommentsResult:
      """Collect comments by watching the creator-centre comment page's API traffic.

      The comment management page is opened and the latest successful comment
      API response that contains at least one comment is parsed into
      ``CommentItem`` objects (sub-comments become ``replies``). ``work_id``
      is stamped on every item and on the result; ``cursor`` is only logged.

      Returns:
          ``CommentsResult`` with ``success=True`` (empty when nothing was
          captured), with the next cursor stored in the instance ``__dict__``
          under ``cursor``; or ``success=False`` with the error text if
          anything raised. The browser is closed in both cases.
      """
      banner = '=' * 60
      print(f"\n{banner}")
      print(f"[{self.platform_name}] 获取作品评论")
      print(f"[{self.platform_name}] work_id={work_id}, cursor={cursor}")
      print(f"{banner}")

      comments: List[CommentItem] = []
      total = 0
      has_more = False
      next_cursor = ""
      captured_data = {}

      try:
          await self.init_browser()
          await self.set_cookies(self.parse_cookies(cookies))

          if not self.page:
              raise Exception("Page not initialized")

          async def handle_response(response):
              """Remember the newest successful comment payload that has comments."""
              nonlocal captured_data
              url = response.url
              is_comment_api = '/comment/' in url and ('page' in url or 'list' in url)
              if not is_comment_api:
                  return
              try:
                  payload = await response.json()
                  print(f"[{self.platform_name}] 捕获到评论 API: {url[:100]}...", flush=True)
                  if not (payload.get('success') or payload.get('code') == 0):
                      return
                  body = payload.get('data', {})
                  found = body.get('comments') or body.get('list') or []
                  if found:
                      captured_data = payload
                      print(f"[{self.platform_name}] 评论 API 响应成功,comments={len(found)}", flush=True)
                  else:
                      print(f"[{self.platform_name}] 评论 API 响应成功但无评论", flush=True)
              except Exception as e:
                  print(f"[{self.platform_name}] 解析评论响应失败: {e}", flush=True)

          self.page.on('response', handle_response)
          print(f"[{self.platform_name}] 已注册评论 API 响应监听器", flush=True)

          # Open the creator-centre comment management page.
          comment_url = "https://creator.xiaohongshu.com/creator/comment"
          print(f"[{self.platform_name}] 访问评论管理页面: {comment_url}", flush=True)
          await self.page.goto(comment_url, wait_until="domcontentloaded", timeout=30000)
          await asyncio.sleep(5)

          # A login URL here means the cookies have expired.
          landed_url = self.page.url
          print(f"[{self.platform_name}] 当前页面 URL: {landed_url}", flush=True)
          if "login" in landed_url:
              raise Exception("Cookie 已过期,请重新登录")

          if not captured_data:
              print(f"[{self.platform_name}] 等待评论 API 响应...", flush=True)
              # Scrolling may trigger the comment request.
              await self.page.evaluate('window.scrollBy(0, 500)')
              await asyncio.sleep(3)

              if not captured_data:
                  # One more, longer wait for a slow comment API.
                  print(f"[{self.platform_name}] 继续等待评论加载...", flush=True)
                  await asyncio.sleep(5)

          self.page.remove_listener('response', handle_response)

          def build_item(raw, author, **extra):
              """Build a CommentItem from a raw comment dict and its user_info dict."""
              return CommentItem(
                  comment_id=raw.get('id', ''),
                  work_id=work_id,
                  content=raw.get('content', ''),
                  author_id=author.get('user_id', ''),
                  author_name=author.get('nickname', ''),
                  author_avatar=author.get('image', ''),
                  like_count=raw.get('like_count', 0),
                  create_time=raw.get('create_time', ''),
                  **extra,
              )

          if not captured_data:
              print(f"[{self.platform_name}] 未捕获到评论 API 响应", flush=True)
          else:
              data = captured_data.get('data', {})
              comment_list = data.get('comments') or data.get('list') or []
              has_more = data.get('has_more', False)
              next_cursor = data.get('cursor', '')

              print(f"[{self.platform_name}] 解析评论: has_more={has_more}, comments={len(comment_list)}", flush=True)

              for comment in comment_list:
                  # Top-level comments without an id are dropped.
                  if not comment.get('id', ''):
                      continue

                  user_info = comment.get('user_info', {})
                  sub_comments = comment.get('sub_comments', []) or []
                  replies = [build_item(sub, sub.get('user_info', {})) for sub in sub_comments]

                  comments.append(build_item(
                      comment,
                      user_info,
                      reply_count=comment.get('sub_comment_count', 0),
                      replies=replies,
                  ))

              total = len(comments)
              print(f"[{self.platform_name}] 解析到 {total} 条评论", flush=True)

      except Exception as e:
          import traceback
          traceback.print_exc()
          return CommentsResult(
              success=False,
              platform=self.platform_name,
              work_id=work_id,
              error=str(e)
          )
      finally:
          await self.close_browser()

      result = CommentsResult(
          success=True,
          platform=self.platform_name,
          work_id=work_id,
          comments=comments,
          total=total,
          has_more=has_more
      )
      # The next cursor is attached outside the constructor.
      result.__dict__['cursor'] = next_cursor
      return result
  async def get_all_comments(self, cookies: str) -> dict:
      """Collect comments for all works via the creator comment-management page.

      Opens the creator back-office comment page, listens to the JSON API
      responses the page triggers (comment lists and note lists), scrolls to
      provoke further loading, then groups the captured comments by note.

      Args:
          cookies: Cookie string; parsed with ``self.parse_cookies`` and
              installed with ``self.set_cookies`` before navigation.

      Returns:
          On success::

              {'success': True, 'platform': ..., 'work_comments': [...],
               'total': <number of works, not comments>}

          where each ``work_comments`` entry is
          ``{'work_id', 'title', 'cover_url', 'comments': [...]}``.
          If anything inside the ``try`` raises (including a redirect to a
          login URL), the traceback is printed and the result is
          ``{'success': False, 'platform': ..., 'error': str(e),
          'work_comments': []}``. The browser is closed in both cases.
      """
      # Local import, done once here instead of inside the per-response handler.
      import re

      print(f"\n{'='*60}", flush=True)
      print(f"[{self.platform_name}] 获取所有作品评论", flush=True)
      print(f"{'='*60}", flush=True)

      note_id_pattern = re.compile(r'note_id=([^&]+)')

      def cover_url_of(cover) -> str:
          """Return the cover URL from a dict-shaped or plain-string cover."""
          if isinstance(cover, dict):
              return cover.get('url', '') or cover.get('url_default', '') or ''
          if isinstance(cover, str):
              return cover
          return ''

      all_work_comments = []
      captured_comments = []
      captured_notes = {}  # note_id -> note_info

      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)
          await self.set_cookies(cookie_list)

          if not self.page:
              raise Exception("Page not initialized")

          # Response listener: collects comment-list and note-list payloads.
          async def handle_response(response):
              url = response.url
              is_comment_api = '/comment/' in url and ('page' in url or 'list' in url)
              is_note_api = '/note/' in url and (
                  'list' in url or 'posted' in url or 'manager' in url
              )
              if not (is_comment_api or is_note_api):
                  return

              try:
                  # Parse the body once even when the URL matches both filters.
                  json_data = await response.json()
                  if is_comment_api:
                      print(f"[{self.platform_name}] 捕获到评论 API: {url[:100]}...", flush=True)

                  if not (json_data.get('success') or json_data.get('code') == 0):
                      return

                  # ``or {}`` / ``or []``: a key present with a JSON null must
                  # not reach ``.get`` / ``len`` as None.
                  data = json_data.get('data') or {}

                  if is_comment_api:
                      comments = data.get('comments') or data.get('list') or []
                      # The note id is taken from the request's query string.
                      note_id_match = note_id_pattern.search(url)
                      note_id = note_id_match.group(1) if note_id_match else ''

                      if comments:
                          for comment in comments:
                              # Tag the comment with its note id if it has none.
                              if note_id and 'note_id' not in comment:
                                  comment['note_id'] = note_id
                              captured_comments.append(comment)
                          print(f"[{self.platform_name}] 捕获到 {len(comments)} 条评论 (note_id={note_id}),总计: {len(captured_comments)}", flush=True)

                  if is_note_api:
                      notes = data.get('notes') or data.get('list') or []
                      print(f"[{self.platform_name}] 捕获到笔记列表 API: {len(notes)} 个笔记", flush=True)
                      for note in notes:
                          note_id = note.get('note_id', '') or note.get('id', '')
                          if note_id:
                              captured_notes[note_id] = {
                                  'title': note.get('title', '') or note.get('display_title', ''),
                                  'cover': cover_url_of(note.get('cover', {})),
                              }
              except Exception as e:
                  # Best effort: one unparsable response must not stop the capture.
                  print(f"[{self.platform_name}] 解析响应失败: {e}", flush=True)

          self.page.on('response', handle_response)
          print(f"[{self.platform_name}] 已注册 API 响应监听器", flush=True)

          # Open the comment-management page.
          print(f"[{self.platform_name}] 访问评论管理页面...", flush=True)
          await self.page.goto("https://creator.xiaohongshu.com/creator/comment", wait_until="domcontentloaded", timeout=30000)
          await asyncio.sleep(5)

          # A URL containing "login" is treated as an expired session.
          current_url = self.page.url
          if "login" in current_url:
              raise Exception("Cookie 已过期,请重新登录")

          print(f"[{self.platform_name}] 页面加载完成,当前捕获: {len(captured_comments)} 条评论, {len(captured_notes)} 个笔记", flush=True)

          # Scroll to trigger loading of more comments.
          for _ in range(5):
              await self.page.evaluate('window.scrollBy(0, 500)')
              await asyncio.sleep(1)

          await asyncio.sleep(3)

          # Stop listening before post-processing.
          self.page.remove_listener('response', handle_response)

          print(f"[{self.platform_name}] 最终捕获: {len(captured_comments)} 条评论, {len(captured_notes)} 个笔记", flush=True)

          # Group comments by work.
          work_comments_map = {}  # note_id -> work entry
          for comment in captured_comments:
              # Note info may be embedded in the comment under either key.
              note_info = comment.get('note_info') or comment.get('note') or {}
              note_id = comment.get('note_id', '') or note_info.get('note_id', '') or note_info.get('id', '')
              if not note_id:
                  continue

              if note_id not in work_comments_map:
                  # Fall back to the note-list capture for title and cover.
                  saved_note = captured_notes.get(note_id, {})
                  work_comments_map[note_id] = {
                      'work_id': note_id,
                      'title': note_info.get('title', '') or note_info.get('display_title', '') or saved_note.get('title', ''),
                      'cover_url': cover_url_of(note_info.get('cover', {})) or saved_note.get('cover', ''),
                      'comments': []
                  }

              cid = comment.get('id', '') or comment.get('comment_id', '')
              if not cid:
                  continue

              user_info = comment.get('user_info') or comment.get('user') or {}
              work_comments_map[note_id]['comments'].append({
                  'comment_id': cid,
                  'author_id': user_info.get('user_id', '') or user_info.get('id', ''),
                  'author_name': user_info.get('nickname', '') or user_info.get('name', ''),
                  'author_avatar': user_info.get('image', '') or user_info.get('avatar', ''),
                  'content': comment.get('content', ''),
                  'like_count': comment.get('like_count', 0),
                  'create_time': comment.get('create_time', ''),
              })

          all_work_comments = list(work_comments_map.values())
          total_comments = sum(len(w['comments']) for w in all_work_comments)
          print(f"[{self.platform_name}] 获取到 {len(all_work_comments)} 个作品的 {total_comments} 条评论", flush=True)

      except Exception as e:
          import traceback
          traceback.print_exc()
          return {
              'success': False,
              'platform': self.platform_name,
              'error': str(e),
              'work_comments': []
          }
      finally:
          await self.close_browser()

      return {
          'success': True,
          'platform': self.platform_name,
          'work_comments': all_work_comments,
          'total': len(all_work_comments)
      }