xiaohongshu.py 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987
  1. # -*- coding: utf-8 -*-
  2. """
  3. 小红书视频发布器
  4. 参考: matrix/xhs_uploader/main.py
  5. 使用 xhs SDK API 方式发布,更稳定
  6. """
  7. import asyncio
  8. import os
  9. import sys
  10. from pathlib import Path
  11. from typing import List
  12. from .base import (
  13. BasePublisher, PublishParams, PublishResult,
  14. WorkItem, WorksResult, CommentItem, CommentsResult
  15. )
  # Put the "matrix" directory (four levels above this file) on sys.path so
  # that code living there, such as the signing scripts, can be imported.
  MATRIX_PATH = Path(__file__).parent.parent.parent.parent / "matrix"
  sys.path.insert(0, str(MATRIX_PATH))

  # Try to import the xhs SDK. When it is missing, a warning is printed,
  # XhsClient is set to None and XHS_SDK_AVAILABLE to False so the rest of
  # the module can check availability at runtime.
  try:
      from xhs import XhsClient
      XHS_SDK_AVAILABLE = True
  except ImportError:
      print("[Warning] xhs 库未安装,请运行: pip install xhs")
      XhsClient = None
      XHS_SDK_AVAILABLE = False

  # Path of the stealth.min.js script that get_sign() installs as a browser
  # init script before computing a signature.
  STEALTH_JS_PATH = MATRIX_PATH / "xhs-api" / "js" / "stealth.min.js"
  29. class XiaohongshuPublisher(BasePublisher):
  30. """
  31. 小红书视频发布器
  32. 优先使用 xhs SDK API 方式发布
  33. """
  34. platform_name = "xiaohongshu"
  35. login_url = "https://creator.xiaohongshu.com/"
  36. publish_url = "https://creator.xiaohongshu.com/publish/publish"
  37. cookie_domain = ".xiaohongshu.com"
  async def get_sign(self, uri: str, data=None, a1: str = "", web_session: str = ""):
      """Compute the request signature for a Xiaohongshu API call.

      Launches a headless Chromium through Playwright, opens
      https://www.xiaohongshu.com and evaluates the page's
      ``window._webmsxyw(url, data)`` function to obtain the signature.

      Args:
          uri: API path to sign.
          data: Request payload handed to the signing function (may be None).
          a1: Value of the ``a1`` cookie. When non-empty it is installed for
              ``.xiaohongshu.com`` and the page is reloaded before signing.
          web_session: Accepted for signature compatibility with the SDK's
              sign callback; not used by this implementation.

      Returns:
          dict with the ``x-s`` and ``x-t`` header values (``x-t`` as str).

      Raises:
          Exception: on any failure, with message ``签名失败: ...`` and the
              original exception chained as ``__cause__``.
      """
      from playwright.async_api import async_playwright

      try:
          async with async_playwright() as playwright:
              browser = await playwright.chromium.launch(headless=True)
              try:
                  browser_context = await browser.new_context()
                  # The stealth script is optional: only install it if present.
                  if STEALTH_JS_PATH.exists():
                      await browser_context.add_init_script(path=str(STEALTH_JS_PATH))
                  page = await browser_context.new_page()
                  await page.goto("https://www.xiaohongshu.com")
                  await asyncio.sleep(1)
                  await page.reload()
                  await asyncio.sleep(1)
                  if a1:
                      await browser_context.add_cookies([
                          {'name': 'a1', 'value': a1, 'domain': ".xiaohongshu.com", 'path': "/"}
                      ])
                      # Reload so the page picks up the freshly set cookie.
                      await page.reload()
                      await asyncio.sleep(0.5)
                  encrypt_params = await page.evaluate(
                      "([url, data]) => window._webmsxyw(url, data)",
                      [uri, data]
                  )
              finally:
                  # Always release the browser, even when navigation or the
                  # signing call fails; closing it also closes its contexts.
                  await browser.close()
              return {
                  "x-s": encrypt_params["X-s"],
                  "x-t": str(encrypt_params["X-t"])
              }
      except Exception as e:
          import traceback
          traceback.print_exc()
          # Chain the cause so callers can still inspect the root error.
          raise Exception(f"签名失败: {e}") from e
  def sign_sync(self, uri, data=None, a1="", web_session=""):
      """Synchronous wrapper around get_sign(), used as the XhsClient sign callback.

      ``asyncio.run`` cannot be used while an event loop is already running
      in the current thread, which is exactly the situation when the
      synchronous SDK is invoked from a coroutine. In that case the
      coroutine is executed on a fresh event loop in a worker thread and
      this call blocks until it finishes.

      Args:
          uri: API path to sign.
          data: Request payload to sign (may be None).
          a1: Value of the ``a1`` cookie.
          web_session: Forwarded unchanged to get_sign().

      Returns:
          Whatever get_sign() returns.

      Raises:
          Exception: propagated from get_sign().
      """
      try:
          asyncio.get_running_loop()
      except RuntimeError:
          # No loop is running in this thread: asyncio.run is safe.
          return asyncio.run(self.get_sign(uri, data, a1, web_session))

      # A loop is already running here; run the coroutine on its own loop
      # in a separate thread instead of failing with RuntimeError.
      from concurrent.futures import ThreadPoolExecutor

      with ThreadPoolExecutor(max_workers=1) as executor:
          future = executor.submit(
              asyncio.run, self.get_sign(uri, data, a1, web_session)
          )
          return future.result()
  async def publish_via_api(self, cookies: str, params: PublishParams) -> PublishResult:
      """Publish a video through the xhs SDK API.

      The SDK call is synchronous (it uploads the file and invokes the
      synchronous sign callback), so it is executed in the default executor
      to keep the event loop responsive while it runs.

      Args:
          cookies: Raw cookie data; parsed with parse_cookies() and turned
              into a cookie header string, falling back to the raw value
              when parsing yields nothing.
          params: Publish parameters (title, description, tags, video and
              cover paths, optional scheduled publish date).

      Returns:
          PublishResult with success=True, carrying the ``note_id`` and
          ``url`` values found in the SDK response (empty strings if absent).

      Raises:
          Exception: when the SDK is not installed, the SDK call fails, the
              SDK returns an empty result, or the response signals an error.
      """
      if not XHS_SDK_AVAILABLE:
          raise Exception("xhs SDK 未安装,请运行: pip install xhs")

      self.report_progress(10, "正在通过 API 发布...")
      print(f"[{self.platform_name}] 使用 XHS SDK API 发布...")
      print(f"[{self.platform_name}] 视频路径: {params.video_path}")
      print(f"[{self.platform_name}] 标题: {params.title}")

      # Convert the cookies into the header-string form the SDK expects.
      cookie_list = self.parse_cookies(cookies)
      cookie_string = self.cookies_to_string(cookie_list) if cookie_list else cookies
      print(f"[{self.platform_name}] Cookie 长度: {len(cookie_string)}")

      self.report_progress(20, "正在上传视频...")

      xhs_client = XhsClient(cookie_string, sign=self.sign_sync)
      print(f"[{self.platform_name}] 开始调用 create_video_note...")

      # Resolve the optional arguments up front, on the event-loop thread.
      post_time = params.publish_date.strftime("%Y-%m-%d %H:%M:%S") if params.publish_date else None
      cover_path = params.cover_path if params.cover_path and os.path.exists(params.cover_path) else None

      def _create_note():
          return xhs_client.create_video_note(
              title=params.title,
              desc=params.description or params.title,
              topics=params.tags or [],
              post_time=post_time,
              video_path=params.video_path,
              cover_path=cover_path
          )

      try:
          # Run the blocking SDK call off the event loop. This also lets
          # sign_sync() use asyncio.run(), since no loop runs in that thread.
          result = await asyncio.get_running_loop().run_in_executor(None, _create_note)
          print(f"[{self.platform_name}] SDK 返回结果: {result}")
      except Exception as e:
          import traceback
          traceback.print_exc()
          print(f"[{self.platform_name}] SDK 调用失败: {e}")
          raise Exception(f"XHS SDK 发布失败: {e}") from e

      # Validate the response.
      if not result:
          raise Exception("XHS SDK 返回空结果")

      note_id = ""
      video_url = ""
      if isinstance(result, dict):
          # A truthy "code" means a non-zero error code.
          if result.get("code"):
              raise Exception(f"发布失败: {result.get('msg', '未知错误')}")
          # "success" present but falsy means the API reported a failure.
          success_flag = result.get("success")
          if success_flag is not None and not success_flag:
              raise Exception(f"发布失败: {result.get('msg', result.get('error', '未知错误'))}")
          note_id = result.get("note_id", "")
          video_url = result.get("url", "")

      if not note_id:
          print(f"[{self.platform_name}] 警告: 未获取到 note_id,返回结果: {result}")

      self.report_progress(100, "发布成功")
      print(f"[{self.platform_name}] 发布成功! note_id={note_id}, url={video_url}")

      return PublishResult(
          success=True,
          platform=self.platform_name,
          video_id=note_id,
          video_url=video_url,
          message="发布成功"
      )
  async def publish(self, cookies: str, params: PublishParams) -> PublishResult:
      """Publish a video to Xiaohongshu, trying the SDK API before the browser.

      When the xhs SDK is available, publish_via_api() is attempted first.
      Its result is returned if it succeeded, or if it failed with an error
      message that does not contain "请刷新". In every other case, including
      any exception raised by the API path, publish_via_playwright() is used.

      Args:
          cookies: Raw cookie data for the account.
          params: Publish parameters; ``params.video_path`` must exist.

      Returns:
          The PublishResult of whichever path produced the final outcome.

      Raises:
          Exception: when the video file does not exist, or whatever
              publish_via_playwright() raises.
      """
      tag = f"[{self.platform_name}]"
      rule = '=' * 60

      print("\n" + rule)
      print(f"{tag} 开始发布视频")
      print(f"{tag} 视频路径: {params.video_path}")
      print(f"{tag} 标题: {params.title}")
      print(f"{tag} Headless: {self.headless}")
      print(f"{tag} XHS SDK 可用: {XHS_SDK_AVAILABLE}")
      print(rule)

      # Refuse to start without the video file.
      video_path = params.video_path
      if not os.path.exists(video_path):
          raise Exception(f"视频文件不存在: {video_path}")
      print(f"{tag} 视频文件存在,大小: {os.path.getsize(video_path)} bytes")

      self.report_progress(5, "正在准备发布...")

      # Preferred path: the SDK API.
      if XHS_SDK_AVAILABLE:
          try:
              print(f"{tag} 尝试使用 XHS SDK API 发布...")
              api_result = await self.publish_via_api(cookies, params)
              print(f"{tag} API 发布完成: success={api_result.success}")

              # Return on success, or on a concrete failure that is not a
              # "please refresh" style error.
              if api_result.success or (
                  api_result.error and "请刷新" not in api_result.error
              ):
                  return api_result

              print(f"{tag} API 方式未成功,尝试 Playwright...")
          except Exception as api_error:
              import traceback
              traceback.print_exc()
              print(f"{tag} API 发布失败: {api_error}")
              print(f"{tag} 尝试使用 Playwright 方式...")

      # Fallback path: drive the web page.
      print(f"{tag} 使用 Playwright 方式发布...")
      return await self.publish_via_playwright(cookies, params)
  async def publish_via_playwright(self, cookies: str, params: PublishParams) -> PublishResult:
      """Publish a video by driving the creator web page with Playwright.

      Flow: initialise the browser and cookies, open the publish page, stop
      early on a login redirect or a detected captcha, upload the video
      file, wait for the editor to appear, fill in title, description and
      tags, click the publish button, then poll for a success signal.

      Args:
          cookies: Raw cookie data for the account.
          params: Publish parameters (video path, title, description, tags).

      Returns:
          PublishResult. ``success=True`` with status 'success' when
          publishing appears to have completed. ``success=False`` with a
          screenshot and status 'need_captcha' (login expired / captcha) or
          'need_action' (upload could not start, upload timed out, or the
          outcome could not be confirmed while still on the publish page).

      Raises:
          Exception: when the page is not initialised, no clickable publish
              button is found, or the page shows an error message after the
              publish button was clicked.
      """
      self.report_progress(10, "正在初始化浏览器...")
      print(f"[{self.platform_name}] Playwright 方式开始...")

      await self.init_browser()

      cookie_list = self.parse_cookies(cookies)
      print(f"[{self.platform_name}] 设置 {len(cookie_list)} 个 cookies")
      await self.set_cookies(cookie_list)

      if not self.page:
          raise Exception("Page not initialized")

      self.report_progress(15, "正在打开发布页面...")

      # Go straight to the video publish page.
      publish_url = "https://creator.xiaohongshu.com/publish/publish?source=official"
      print(f"[{self.platform_name}] 打开页面: {publish_url}")
      await self.page.goto(publish_url)
      await asyncio.sleep(3)

      current_url = self.page.url
      print(f"[{self.platform_name}] 当前 URL: {current_url}")

      # A redirect to a login/passport URL means the session has expired.
      if "login" in current_url or "passport" in current_url:
          screenshot_base64 = await self.capture_screenshot()
          return PublishResult(
              success=False,
              platform=self.platform_name,
              error="登录已过期,请重新登录",
              screenshot_base64=screenshot_base64,
              page_url=current_url,
              status='need_captcha',
              need_captcha=True,
              captcha_type='login'
          )

      # Ask the AI helper whether a captcha is showing.
      ai_captcha = await self.ai_check_captcha()
      if ai_captcha['has_captcha']:
          print(f"[{self.platform_name}] AI检测到验证码: {ai_captcha['captcha_type']}", flush=True)
          screenshot_base64 = await self.capture_screenshot()
          return PublishResult(
              success=False,
              platform=self.platform_name,
              error=f"检测到{ai_captcha['captcha_type']}验证码,需要使用有头浏览器完成验证",
              screenshot_base64=screenshot_base64,
              page_url=current_url,
              status='need_captcha',
              need_captcha=True,
              captcha_type=ai_captcha['captcha_type']
          )

      self.report_progress(20, "正在上传视频...")

      # Give the page time to finish loading.
      await asyncio.sleep(2)

      # ---- Upload the video ----
      upload_triggered = False

      # Method 1: set the file directly on a (possibly hidden) file input.
      print(f"[{self.platform_name}] 尝试方法1: 设置 file input")
      file_inputs = self.page.locator('input[type="file"]')
      input_count = await file_inputs.count()
      print(f"[{self.platform_name}] 找到 {input_count} 个 file input")

      if input_count > 0:
          # Pick the first input that accepts video (or anything).
          for i in range(input_count):
              input_el = file_inputs.nth(i)
              accept = await input_el.get_attribute('accept') or ''
              print(f"[{self.platform_name}] Input {i} accept: {accept}")
              if 'video' in accept or '*' in accept or not accept:
                  await input_el.set_input_files(params.video_path)
                  upload_triggered = True
                  print(f"[{self.platform_name}] 视频文件已设置到 input {i}")
                  break

      # Method 2: click the upload area and answer the file chooser.
      if not upload_triggered:
          print(f"[{self.platform_name}] 尝试方法2: 点击上传区域")
          try:
              upload_area = self.page.locator('[class*="upload-wrapper"], [class*="upload-area"], .upload-input').first
              if await upload_area.count() > 0:
                  async with self.page.expect_file_chooser(timeout=5000) as fc_info:
                      await upload_area.click()
                  file_chooser = await fc_info.value
                  await file_chooser.set_files(params.video_path)
                  upload_triggered = True
                  print(f"[{self.platform_name}] 通过点击上传区域上传成功")
          except Exception as e:
              print(f"[{self.platform_name}] 方法2失败: {e}")

      if not upload_triggered:
          screenshot_base64 = await self.capture_screenshot()
          page_url = await self.get_page_url()
          return PublishResult(
              success=False,
              platform=self.platform_name,
              error="无法上传视频文件",
              screenshot_base64=screenshot_base64,
              page_url=page_url,
              status='need_action'
          )

      self.report_progress(40, "等待视频上传完成...")
      print(f"[{self.platform_name}] 等待视频上传和处理...")

      # Wait for the upload to finish by watching for editor elements.
      upload_complete = False
      for i in range(60):  # 60 polls x 3 s = up to 3 minutes
          await asyncio.sleep(3)

          # Title input (appears once the upload is done).
          title_input_count = await self.page.locator('input[placeholder*="标题"], input[placeholder*="填写标题"]').count()
          # Rich-text editor area.
          editor_count = await self.page.locator('[class*="ql-editor"], [contenteditable="true"]').count()
          # Publish button.
          publish_btn_count = await self.page.locator('.publishBtn, button:has-text("发布")').count()

          print(f"[{self.platform_name}] 检测 {i+1}: 标题框={title_input_count}, 编辑器={editor_count}, 发布按钮={publish_btn_count}")

          if title_input_count > 0 or (editor_count > 0 and publish_btn_count > 0):
              upload_complete = True
              print(f"[{self.platform_name}] 视频上传完成!")
              break

      if not upload_complete:
          screenshot_base64 = await self.capture_screenshot()
          page_url = await self.get_page_url()
          return PublishResult(
              success=False,
              platform=self.platform_name,
              error="视频上传超时",
              screenshot_base64=screenshot_base64,
              page_url=page_url,
              status='need_action'
          )

      await asyncio.sleep(2)

      self.report_progress(60, "正在填写笔记信息...")
      print(f"[{self.platform_name}] 填写标题: {params.title[:20]}")

      # ---- Fill in the title (only the first 20 characters are used) ----
      title_filled = False
      title_selectors = [
          'input[placeholder*="标题"]',
          'input[placeholder*="填写标题"]',
          '[class*="title"] input',
          '.c-input_inner',
      ]
      for selector in title_selectors:
          title_input = self.page.locator(selector).first
          if await title_input.count() > 0:
              await title_input.click()
              await title_input.fill('')  # clear first
              await title_input.fill(params.title[:20])
              title_filled = True
              print(f"[{self.platform_name}] 标题已填写,使用选择器: {selector}")
              break

      if not title_filled:
          print(f"[{self.platform_name}] 警告: 未找到标题输入框")

      # ---- Fill in the description and tags ----
      if params.description or params.tags:
          desc_filled = False
          desc_selectors = [
              '[class*="ql-editor"]',
              '[class*="content-input"] [contenteditable="true"]',
              '[class*="editor"] [contenteditable="true"]',
              '.ql-editor',
          ]
          for selector in desc_selectors:
              desc_input = self.page.locator(selector).first
              if await desc_input.count() > 0:
                  await desc_input.click()
                  await asyncio.sleep(0.5)

                  if params.description:
                      await self.page.keyboard.type(params.description, delay=20)
                      print(f"[{self.platform_name}] 描述已填写")

                  if params.tags:
                      # Tags go on a new line, typed as "#tag" plus a space.
                      await self.page.keyboard.press("Enter")
                      for tag in params.tags[:5]:  # at most 5 tags
                          await self.page.keyboard.type(f"#{tag}", delay=20)
                          await asyncio.sleep(0.3)
                          await self.page.keyboard.press("Space")
                      print(f"[{self.platform_name}] 标签已填写: {params.tags[:5]}")

                  desc_filled = True
                  break

          if not desc_filled:
              print(f"[{self.platform_name}] 警告: 未找到描述输入框")

      await asyncio.sleep(2)
      self.report_progress(80, "正在发布...")

      await asyncio.sleep(2)

      # Scroll to the bottom so the publish button is in view.
      await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
      await asyncio.sleep(1)

      print(f"[{self.platform_name}] 查找发布按钮...")

      # ---- Click publish ----
      publish_selectors = [
          'button.publishBtn',
          '.publishBtn',
          'button.d-button.red',
          'button:has-text("发布"):not(:has-text("定时发布"))',
          '[class*="publish"][class*="btn"]',
      ]

      publish_clicked = False
      for selector in publish_selectors:
          try:
              btn = self.page.locator(selector).first
              if await btn.count() > 0:
                  is_visible = await btn.is_visible()
                  is_enabled = await btn.is_enabled()
                  print(f"[{self.platform_name}] 按钮 {selector}: visible={is_visible}, enabled={is_enabled}")

                  if is_visible and is_enabled:
                      box = await btn.bounding_box()
                      if box:
                          print(f"[{self.platform_name}] 点击发布按钮: {selector}, 位置: ({box['x']}, {box['y']})")
                          # Click the centre of the button with the mouse.
                          await self.page.mouse.click(box['x'] + box['width']/2, box['y'] + box['height']/2)
                          publish_clicked = True
                          break
          except Exception as e:
              print(f"[{self.platform_name}] 选择器 {selector} 错误: {e}")

      if not publish_clicked:
          # Save a screenshot for debugging.
          screenshot_path = f"debug_publish_failed_{self.platform_name}.png"
          await self.page.screenshot(path=screenshot_path, full_page=True)
          print(f"[{self.platform_name}] 未找到发布按钮,截图保存到: {screenshot_path}")

          # Dump the first buttons on the page for debugging.
          buttons = await self.page.query_selector_all('button')
          print(f"[{self.platform_name}] 页面上共有 {len(buttons)} 个按钮")
          for i, btn in enumerate(buttons[:10]):
              text = await btn.text_content() or ''
              cls = await btn.get_attribute('class') or ''
              print(f" 按钮 {i}: text='{text.strip()[:30]}', class='{cls[:50]}'")

          raise Exception("未找到发布按钮")

      print(f"[{self.platform_name}] 已点击发布按钮,等待发布完成...")
      self.report_progress(90, "等待发布结果...")

      # ---- Wait for the outcome (URL change or on-page message) ----
      publish_success = False
      for _ in range(20):  # up to about 20 seconds
          await asyncio.sleep(1)
          current_url = self.page.url

          # Redirect to a success / content-management URL.
          if "published=true" in current_url or "success" in current_url or "content" in current_url:
              publish_success = True
              print(f"[{self.platform_name}] 发布成功! 跳转到: {current_url}")
              break

          # Success message on the page. This probe is best-effort, so
          # failures are ignored, but only ordinary exceptions: a bare
          # ``except`` would also swallow task cancellation.
          # NOTE(review): '[class*="Toast"]' may match non-success toasts
          # too; confirm against the real page.
          try:
              success_msg = await self.page.locator('[class*="success"], .toast-success, [class*="Toast"]').first.is_visible()
          except Exception:
              success_msg = False
          if success_msg:
              publish_success = True
              print(f"[{self.platform_name}] 检测到成功提示!")
              break

          # Error message on the page. Reading it is best-effort; a
          # non-empty message aborts publishing.
          error_text = None
          try:
              error_elements = self.page.locator('[class*="error"], .toast-error, [class*="fail"]')
              if await error_elements.count() > 0:
                  error_text = await error_elements.first.text_content()
          except Exception:
              error_text = None
          if error_text and error_text.strip():
              raise Exception(f"发布失败: {error_text.strip()}")

      # No explicit success signal: hand a screenshot back for analysis.
      if not publish_success:
          final_url = self.page.url
          print(f"[{self.platform_name}] 发布结果不确定,当前 URL: {final_url}")
          screenshot_base64 = await self.capture_screenshot()
          print(f"[{self.platform_name}] 已获取截图供 AI 分析")

          # Still on the publish page: further action may be required.
          if "publish/publish" in final_url:
              return PublishResult(
                  success=False,
                  platform=self.platform_name,
                  error="发布结果待确认,请查看截图",
                  screenshot_base64=screenshot_base64,
                  page_url=final_url,
                  status='need_action'
              )

      self.report_progress(100, "发布完成")
      print(f"[{self.platform_name}] Playwright 方式发布完成!")

      screenshot_base64 = await self.capture_screenshot()
      page_url = await self.get_page_url()

      return PublishResult(
          success=True,
          platform=self.platform_name,
          message="发布完成",
          screenshot_base64=screenshot_base64,
          page_url=page_url,
          status='success'
      )
  async def get_works(self, cookies: str, page: int = 0, page_size: int = 20) -> WorksResult:
      """Fetch the account's note list by capturing the page's own API response.

      Opens the creator note-manager page with the given cookies and listens
      for responses whose URL contains ``creator/note/user/posted`` or
      ``creator/note_list``. The last successful response is parsed into
      WorkItem objects. The browser is always closed before returning.

      Args:
          cookies: Raw cookie data for the account.
          page: Requested page index. It is only logged here; the data
              returned is whatever the page loads on its own.
          page_size: Requested page size. Only logged, as above.

      Returns:
          WorksResult. ``success=True`` with the parsed works (possibly
          empty when no API response was captured), or ``success=False``
          with ``error`` set when anything raised, including an expired
          login.
      """
      print(f"\n{'='*60}", flush=True)
      print(f"[{self.platform_name}] 获取作品列表", flush=True)
      print(f"[{self.platform_name}] page={page}, page_size={page_size}", flush=True)
      print(f"{'='*60}", flush=True)

      works: List[WorkItem] = []
      total = 0
      has_more = False
      captured_data = {}
      # tab_status value -> work status; anything else counts as published.
      status_by_tab = {0: 'draft', 2: 'reviewing', 3: 'rejected'}

      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)

          # Log how many cookies were parsed, for debugging.
          print(f"[{self.platform_name}] 解析到 {len(cookie_list)} 个 cookies", flush=True)

          await self.set_cookies(cookie_list)

          if not self.page:
              raise Exception("Page not initialized")

          # Response listener: captures the API calls the page makes itself.
          async def handle_response(response):
              nonlocal captured_data
              url = response.url
              # Note-list API endpoints.
              if 'creator/note/user/posted' in url or 'creator/note_list' in url:
                  try:
                      json_data = await response.json()
                      print(f"[{self.platform_name}] 捕获到 API 响应: {url[:80]}...", flush=True)
                      if json_data.get('success') or json_data.get('code') == 0:
                          captured_data = json_data
                          # "data" may be null in the payload; treat as empty.
                          print(f"[{self.platform_name}] API 响应成功,data keys: {list((json_data.get('data') or {}).keys())}", flush=True)
                  except Exception as e:
                      print(f"[{self.platform_name}] 解析响应失败: {e}", flush=True)

          self.page.on('response', handle_response)
          print(f"[{self.platform_name}] 已注册 API 响应监听器", flush=True)

          # Open the note-manager page; it triggers the API requests.
          print(f"[{self.platform_name}] 访问笔记管理页面...", flush=True)

          try:
              await self.page.goto("https://creator.xiaohongshu.com/new/note-manager", wait_until="domcontentloaded", timeout=30000)
          except Exception as nav_error:
              # A navigation timeout is tolerated; responses may still arrive.
              print(f"[{self.platform_name}] 导航超时,但继续尝试: {nav_error}", flush=True)

          # Give the API response time to be captured.
          await asyncio.sleep(5)

          # A redirect to a login URL means the cookies have expired.
          current_url = self.page.url
          print(f"[{self.platform_name}] 当前页面: {current_url}", flush=True)

          if "login" in current_url:
              raise Exception("Cookie 已过期,请重新登录")

          # Nothing captured yet: wait a little longer.
          if not captured_data:
              print(f"[{self.platform_name}] 等待 API 响应...", flush=True)
              await asyncio.sleep(5)

          self.page.remove_listener('response', handle_response)

          # Parse whatever was captured.
          if captured_data:
              print(f"[{self.platform_name}] 成功捕获到 API 数据", flush=True)
              # Every nested field may be missing or null; fall back to an
              # empty container instead of failing the whole listing.
              data = captured_data.get('data') or {}
              notes = data.get('notes') or []
              print(f"[{self.platform_name}] notes 数量: {len(notes)}", flush=True)

              # The total count is carried by a specific tag entry.
              for tag in data.get('tags') or []:
                  if tag.get('id') == 'special.note_time_desc':
                      total = tag.get('notes_count', 0)
                      break

              # A "page" value other than -1 is treated as "more available".
              has_more = data.get('page', -1) != -1

              for note in notes:
                  note_id = note.get('id', '')
                  if not note_id:
                      continue

                  # Cover: first image, upgraded to https.
                  cover_url = ''
                  images_list = note.get('images_list') or []
                  if images_list:
                      cover_url = images_list[0].get('url') or ''
                      if cover_url.startswith('http://'):
                          cover_url = cover_url.replace('http://', 'https://')

                  # Duration; video_info may be absent or null.
                  duration = (note.get('video_info') or {}).get('duration', 0)

                  status = status_by_tab.get(note.get('tab_status', 1), 'published')

                  works.append(WorkItem(
                      work_id=note_id,
                      title=note.get('display_title', '') or '无标题',
                      cover_url=cover_url,
                      duration=duration,
                      status=status,
                      publish_time=note.get('time', ''),
                      play_count=note.get('view_count', 0),
                      like_count=note.get('likes', 0),
                      comment_count=note.get('comments_count', 0),
                      share_count=note.get('shared_count', 0),
                      collect_count=note.get('collected_count', 0),
                  ))

              print(f"[{self.platform_name}] 解析到 {len(works)} 个作品,总计: {total}", flush=True)
          else:
              print(f"[{self.platform_name}] 未能捕获到 API 数据", flush=True)

      except Exception as e:
          import traceback
          print(f"[{self.platform_name}] 发生异常: {e}", flush=True)
          traceback.print_exc()
          return WorksResult(
              success=False,
              platform=self.platform_name,
              error=str(e)
          )
      finally:
          # Always close the browser.
          await self.close_browser()

      return WorksResult(
          success=True,
          platform=self.platform_name,
          works=works,
          total=total or len(works),
          has_more=has_more
      )
  async def get_comments(self, cookies: str, work_id: str, cursor: str = "") -> CommentsResult:
      """Fetch comments through the creator back-office comment page.

      Opens the comment management page with the given cookies and listens
      for responses whose URL contains ``/comment/`` together with ``page``
      or ``list``. The last successful, non-empty response is converted
      into CommentItem objects, each labelled with ``work_id``.

      Args:
          cookies: Raw cookie data for the account.
          work_id: Work identifier stored on every returned comment and on
              the result. It is not used to select what the page loads.
          cursor: Only logged here.

      Returns:
          CommentsResult. ``success=True`` with the parsed comments and a
          ``cursor`` entry added to the result's ``__dict__``, or
          ``success=False`` with ``error`` set when anything raised. The
          browser is closed in both cases.
      """
      rule = '=' * 60
      print("\n" + rule)
      print(f"[{self.platform_name}] 获取作品评论")
      print(f"[{self.platform_name}] work_id={work_id}, cursor={cursor}")
      print(rule)

      comments: List[CommentItem] = []
      total = 0
      has_more = False
      next_cursor = ""
      captured_data = {}

      def _as_item(raw, **extra):
          # Map one raw comment dict onto a CommentItem for this work.
          author = raw.get('user_info', {})
          return CommentItem(
              comment_id=raw.get('id', ''),
              work_id=work_id,
              content=raw.get('content', ''),
              author_id=author.get('user_id', ''),
              author_name=author.get('nickname', ''),
              author_avatar=author.get('image', ''),
              like_count=raw.get('like_count', 0),
              create_time=raw.get('create_time', ''),
              **extra
          )

      try:
          await self.init_browser()
          await self.set_cookies(self.parse_cookies(cookies))

          if not self.page:
              raise Exception("Page not initialized")

          # Listener for the comment API (creator back office and normal pages).
          async def handle_response(response):
              nonlocal captured_data
              api_url = response.url
              if '/comment/' not in api_url:
                  return
              if 'page' not in api_url and 'list' not in api_url:
                  return
              try:
                  payload = await response.json()
                  print(f"[{self.platform_name}] 捕获到评论 API: {api_url[:100]}...", flush=True)

                  if not (payload.get('success') or payload.get('code') == 0):
                      return
                  body = payload.get('data', {})
                  found = body.get('comments') or body.get('list') or []
                  if found:
                      captured_data = payload
                      print(f"[{self.platform_name}] 评论 API 响应成功,comments={len(found)}", flush=True)
                  else:
                      print(f"[{self.platform_name}] 评论 API 响应成功但无评论", flush=True)
              except Exception as e:
                  print(f"[{self.platform_name}] 解析评论响应失败: {e}", flush=True)

          self.page.on('response', handle_response)
          print(f"[{self.platform_name}] 已注册评论 API 响应监听器", flush=True)

          # Open the comment management page of the creator back office.
          comment_url = "https://creator.xiaohongshu.com/creator/comment"
          print(f"[{self.platform_name}] 访问评论管理页面: {comment_url}", flush=True)
          await self.page.goto(comment_url, wait_until="domcontentloaded", timeout=30000)
          await asyncio.sleep(5)

          # A redirect to a login URL means the cookies have expired.
          landed_url = self.page.url
          print(f"[{self.platform_name}] 当前页面 URL: {landed_url}", flush=True)
          if "login" in landed_url:
              raise Exception("Cookie 已过期,请重新登录")

          # Nothing captured yet: scroll to trigger loading, then wait.
          if not captured_data:
              print(f"[{self.platform_name}] 等待评论 API 响应...", flush=True)
              await self.page.evaluate('window.scrollBy(0, 500)')
              await asyncio.sleep(3)

              if not captured_data:
                  # The comment API may simply be slow; wait once more.
                  print(f"[{self.platform_name}] 继续等待评论加载...", flush=True)
                  await asyncio.sleep(5)

          self.page.remove_listener('response', handle_response)

          if not captured_data:
              print(f"[{self.platform_name}] 未捕获到评论 API 响应", flush=True)
          else:
              body = captured_data.get('data', {})
              raw_comments = body.get('comments') or body.get('list') or []
              has_more = body.get('has_more', False)
              next_cursor = body.get('cursor', '')

              print(f"[{self.platform_name}] 解析评论: has_more={has_more}, comments={len(raw_comments)}", flush=True)

              for raw in raw_comments:
                  # Entries without an id are skipped.
                  if not raw.get('id', ''):
                      continue

                  # Sub-comments become the replies of the parent item.
                  children = raw.get('sub_comments', []) or []
                  replies = [_as_item(child) for child in children]

                  comments.append(_as_item(
                      raw,
                      reply_count=raw.get('sub_comment_count', 0),
                      replies=replies,
                  ))

              total = len(comments)
              print(f"[{self.platform_name}] 解析到 {total} 条评论", flush=True)

      except Exception as e:
          import traceback
          traceback.print_exc()
          return CommentsResult(
              success=False,
              platform=self.platform_name,
              work_id=work_id,
              error=str(e)
          )
      finally:
          await self.close_browser()

      result = CommentsResult(
          success=True,
          platform=self.platform_name,
          work_id=work_id,
          comments=comments,
          total=total,
          has_more=has_more
      )
      # The cursor is attached directly to the instance dict.
      result.__dict__['cursor'] = next_cursor
      return result
  690. async def get_all_comments(self, cookies: str) -> dict:
  691. """获取所有作品的评论 - 通过评论管理页面"""
  692. print(f"\n{'='*60}")
  693. print(f"[{self.platform_name}] 获取所有作品评论")
  694. print(f"{'='*60}")
  695. all_work_comments = []
  696. captured_comments = []
  697. captured_notes = {} # note_id -> note_info
  698. try:
  699. await self.init_browser()
  700. cookie_list = self.parse_cookies(cookies)
  701. await self.set_cookies(cookie_list)
  702. if not self.page:
  703. raise Exception("Page not initialized")
  704. # 设置 API 响应监听器
  705. async def handle_response(response):
  706. nonlocal captured_comments, captured_notes
  707. url = response.url
  708. try:
  709. # 监听评论列表 API - 多种格式
  710. if '/comment/' in url and ('page' in url or 'list' in url):
  711. json_data = await response.json()
  712. print(f"[{self.platform_name}] 捕获到评论 API: {url[:100]}...", flush=True)
  713. if json_data.get('success') or json_data.get('code') == 0:
  714. data = json_data.get('data', {})
  715. comments = data.get('comments', []) or data.get('list', [])
  716. # 从 URL 中提取 note_id
  717. import re
  718. note_id_match = re.search(r'note_id=([^&]+)', url)
  719. note_id = note_id_match.group(1) if note_id_match else ''
  720. if comments:
  721. for comment in comments:
  722. # 添加 note_id 到评论中
  723. if note_id and 'note_id' not in comment:
  724. comment['note_id'] = note_id
  725. captured_comments.append(comment)
  726. print(f"[{self.platform_name}] 捕获到 {len(comments)} 条评论 (note_id={note_id}),总计: {len(captured_comments)}", flush=True)
  727. # 监听笔记列表 API
  728. if '/note/' in url and ('list' in url or 'posted' in url or 'manager' in url):
  729. json_data = await response.json()
  730. if json_data.get('success') or json_data.get('code') == 0:
  731. data = json_data.get('data', {})
  732. notes = data.get('notes', []) or data.get('list', [])
  733. print(f"[{self.platform_name}] 捕获到笔记列表 API: {len(notes)} 个笔记", flush=True)
  734. for note in notes:
  735. note_id = note.get('note_id', '') or note.get('id', '')
  736. if note_id:
  737. cover_url = ''
  738. cover = note.get('cover', {})
  739. if isinstance(cover, dict):
  740. cover_url = cover.get('url', '') or cover.get('url_default', '')
  741. elif isinstance(cover, str):
  742. cover_url = cover
  743. captured_notes[note_id] = {
  744. 'title': note.get('title', '') or note.get('display_title', ''),
  745. 'cover': cover_url,
  746. }
  747. except Exception as e:
  748. print(f"[{self.platform_name}] 解析响应失败: {e}", flush=True)
  749. self.page.on('response', handle_response)
  750. print(f"[{self.platform_name}] 已注册 API 响应监听器", flush=True)
  751. # 访问评论管理页面
  752. print(f"[{self.platform_name}] 访问评论管理页面...", flush=True)
  753. await self.page.goto("https://creator.xiaohongshu.com/creator/comment", wait_until="domcontentloaded", timeout=30000)
  754. await asyncio.sleep(5)
  755. # 检查登录状态
  756. current_url = self.page.url
  757. if "login" in current_url:
  758. raise Exception("Cookie 已过期,请重新登录")
  759. print(f"[{self.platform_name}] 页面加载完成,当前捕获: {len(captured_comments)} 条评论, {len(captured_notes)} 个笔记", flush=True)
  760. # 滚动加载更多评论
  761. for i in range(5):
  762. await self.page.evaluate('window.scrollBy(0, 500)')
  763. await asyncio.sleep(1)
  764. await asyncio.sleep(3)
  765. # 移除监听器
  766. self.page.remove_listener('response', handle_response)
  767. print(f"[{self.platform_name}] 最终捕获: {len(captured_comments)} 条评论, {len(captured_notes)} 个笔记", flush=True)
  768. # 按作品分组评论
  769. work_comments_map = {} # note_id -> work_comments
  770. for comment in captured_comments:
  771. # 获取笔记信息
  772. note_info = comment.get('note_info', {}) or comment.get('note', {})
  773. note_id = comment.get('note_id', '') or note_info.get('note_id', '') or note_info.get('id', '')
  774. if not note_id:
  775. continue
  776. if note_id not in work_comments_map:
  777. saved_note = captured_notes.get(note_id, {})
  778. cover_url = ''
  779. cover = note_info.get('cover', {})
  780. if isinstance(cover, dict):
  781. cover_url = cover.get('url', '') or cover.get('url_default', '')
  782. elif isinstance(cover, str):
  783. cover_url = cover
  784. if not cover_url:
  785. cover_url = saved_note.get('cover', '')
  786. work_comments_map[note_id] = {
  787. 'work_id': note_id,
  788. 'title': note_info.get('title', '') or note_info.get('display_title', '') or saved_note.get('title', ''),
  789. 'cover_url': cover_url,
  790. 'comments': []
  791. }
  792. cid = comment.get('id', '') or comment.get('comment_id', '')
  793. if not cid:
  794. continue
  795. user_info = comment.get('user_info', {}) or comment.get('user', {})
  796. work_comments_map[note_id]['comments'].append({
  797. 'comment_id': cid,
  798. 'author_id': user_info.get('user_id', '') or user_info.get('id', ''),
  799. 'author_name': user_info.get('nickname', '') or user_info.get('name', ''),
  800. 'author_avatar': user_info.get('image', '') or user_info.get('avatar', ''),
  801. 'content': comment.get('content', ''),
  802. 'like_count': comment.get('like_count', 0),
  803. 'create_time': comment.get('create_time', ''),
  804. })
  805. all_work_comments = list(work_comments_map.values())
  806. total_comments = sum(len(w['comments']) for w in all_work_comments)
  807. print(f"[{self.platform_name}] 获取到 {len(all_work_comments)} 个作品的 {total_comments} 条评论", flush=True)
  808. except Exception as e:
  809. import traceback
  810. traceback.print_exc()
  811. return {
  812. 'success': False,
  813. 'platform': self.platform_name,
  814. 'error': str(e),
  815. 'work_comments': []
  816. }
  817. finally:
  818. await self.close_browser()
  819. return {
  820. 'success': True,
  821. 'platform': self.platform_name,
  822. 'work_comments': all_work_comments,
  823. 'total': len(all_work_comments)
  824. }