xiaohongshu.py 116 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776
  1. # -*- coding: utf-8 -*-
  2. """
  3. 小红书视频发布器
  4. 参考: matrix/xhs_uploader/main.py
  5. 使用 xhs SDK API 方式发布,更稳定
  6. """
  7. import asyncio
  8. import os
  9. import sys
  10. import time
  11. import concurrent.futures
  12. from pathlib import Path
  13. from typing import List
  14. from .base import (
  15. BasePublisher,
  16. PublishParams,
  17. PublishResult,
  18. WorkItem,
  19. WorksResult,
  20. CommentItem,
  21. CommentsResult,
  22. )
  23. from playwright.async_api import async_playwright
  # Module-level cookie slot; starts out empty.
  stored_cookies = None

  # Location of the sibling "matrix" project (four directories above this
  # file). It is pushed to the front of sys.path so the signing helpers that
  # live there can be imported.
  MATRIX_PATH = Path(__file__).parent.parent.parent.parent.joinpath("matrix")
  sys.path.insert(0, str(MATRIX_PATH))

  # The xhs SDK is optional: when it is missing, XhsClient is None and
  # XHS_SDK_AVAILABLE is False so callers can fall back to another path.
  try:
      from xhs import XhsClient
  except ImportError:
      print("[Warning] xhs 库未安装,请运行: pip install xhs")
      XhsClient = None
      XHS_SDK_AVAILABLE = False
  else:
      XHS_SDK_AVAILABLE = True

  # Stealth init script injected into the signing browser context.
  STEALTH_JS_PATH = MATRIX_PATH.joinpath("xhs-api", "js", "stealth.min.js")

  # Single worker thread used to run the async signer from synchronous code.
  _xhs_sign_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
  39. class XiaohongshuPublisher(BasePublisher):
  40. """
  41. 小红书视频发布器
  42. 优先使用 xhs SDK API 方式发布
  43. """
  44. platform_name = "xiaohongshu"
  45. login_url = "https://creator.xiaohongshu.com/"
  46. publish_url = "https://creator.xiaohongshu.com/publish/publish"
  47. cookie_domain = ".xiaohongshu.com"
  async def get_sign(self, uri: str, data=None, a1: str = "", web_session: str = ""):
      """Compute the ``x-s`` / ``x-t`` signature for a Xiaohongshu API request.

      A headless Chromium instance is launched, ``https://www.xiaohongshu.com``
      is loaded, and the page's own ``window._webmsxyw`` function is called
      with ``uri`` and ``data`` to produce the signature.

      Args:
          uri: API path to sign.
          data: Request payload forwarded to the in-page signer; may be None.
          a1: Value of the ``a1`` cookie. When non-empty it is installed for
              ``.xiaohongshu.com`` and the page is reloaded before signing.
          web_session: Accepted so the signature matches the sign callback;
              this method does not read it.

      Returns:
          A dict with the keys ``"x-s"`` and ``"x-t"`` (``x-t`` as a string).

      Raises:
          Exception: Any failure is printed with its traceback and re-raised
              as ``Exception("签名失败: ...")`` chained to the original error.
      """
      from playwright.async_api import async_playwright

      try:
          async with async_playwright() as playwright:
              browser = await playwright.chromium.launch(headless=True)
              try:
                  browser_context = await browser.new_context()
                  if STEALTH_JS_PATH.exists():
                      await browser_context.add_init_script(
                          path=str(STEALTH_JS_PATH)
                      )
                  page = await browser_context.new_page()
                  await page.goto("https://www.xiaohongshu.com")
                  await asyncio.sleep(1)
                  await page.reload()
                  await asyncio.sleep(1)
                  if a1:
                      await browser_context.add_cookies(
                          [
                              {
                                  "name": "a1",
                                  "value": a1,
                                  "domain": ".xiaohongshu.com",
                                  "path": "/",
                              }
                          ]
                      )
                      # Reload so the page picks up the freshly set cookie,
                      # then give it a moment before calling the signer.
                      await page.reload()
                      await asyncio.sleep(0.5)
                  encrypt_params = await page.evaluate(
                      "([url, data]) => window._webmsxyw(url, data)", [uri, data]
                  )
                  await browser_context.close()
              finally:
                  # Close the browser on failure paths too, not only on success.
                  await browser.close()
              return {
                  "x-s": encrypt_params["X-s"],
                  "x-t": str(encrypt_params["X-t"]),
              }
      except Exception as e:
          import traceback

          traceback.print_exc()
          raise Exception(f"签名失败: {e}") from e
  def sign_sync(self, uri, data=None, a1="", web_session=""):
      """Synchronous signing callback for ``XhsClient``.

      ``XhsClient`` calls its ``sign`` callback synchronously, but the
      signature is produced by the coroutine ``get_sign``. When the calling
      thread already runs an event loop, ``asyncio.run`` cannot be used there,
      so the coroutine is run via ``asyncio.run`` on the dedicated signing
      executor thread and this call blocks on its result. Without a running
      loop the coroutine is run directly with ``asyncio.run``.

      Args:
          uri: API path to sign.
          data: Request payload to sign; may be None.
          a1: Value of the ``a1`` cookie.
          web_session: Forwarded unchanged to ``get_sign``.

      Returns:
          Whatever ``get_sign`` returns.

      Raises:
          concurrent.futures.TimeoutError: The worker thread did not finish
              within 120 seconds.
          Exception: Any error raised by ``get_sign`` is propagated unchanged.
      """

      def run_async_sign():
          return asyncio.run(
              self.get_sign(uri, data=data, a1=a1, web_session=web_session)
          )

      # Guard ONLY the loop probe. If submit()/result() were inside this try,
      # a RuntimeError raised by the worker (e.g. NotImplementedError, which
      # subclasses RuntimeError) would be mistaken for "no running loop" and
      # trigger asyncio.run() on a thread that already has a loop, hiding the
      # real failure behind a second, unrelated RuntimeError.
      try:
          asyncio.get_running_loop()
      except RuntimeError:
          return run_async_sign()

      future = _xhs_sign_executor.submit(run_async_sign)
      return future.result(timeout=120)
  async def publish_via_api(
      self, cookies: str, params: PublishParams
  ) -> PublishResult:
      """Publish a video note through the xhs SDK (``XhsClient.create_video_note``).

      Steps:
        1. Convert ``cookies`` to a cookie string via ``parse_cookies`` /
           ``cookies_to_string`` (the raw value is used if parsing yields
           nothing).
        2. Call the SDK once with that string.
        3. If the SDK error text signals missing login info ("无登录信息" or
           code -100), open a browser, try to obtain fresh cookies and retry
           the SDK call once.
        4. Validate the SDK response and build the result.

      Args:
          cookies: Raw cookie payload understood by ``self.parse_cookies``.
          params: Publish parameters. Reads ``title``, ``description``,
              ``tags``, ``publish_date``, ``video_path`` and ``cover_path``.

      Returns:
          ``PublishResult(success=True, ...)`` carrying ``note_id`` / ``url``
          from the SDK response, or ``PublishResult(success=False, ...)`` with
          ``status="need_captcha"`` and ``captcha_type="login"`` when the
          login could not be refreshed.

      Raises:
          Exception: The SDK is not installed, the SDK call failed (chained
              to the original error), the SDK returned an empty result, or
              the response reports a failure.
      """
      if not XHS_SDK_AVAILABLE:
          raise Exception("xhs SDK 未安装,请运行: pip install xhs")

      self.report_progress(10, "正在通过 API 发布...")
      print(f"[{self.platform_name}] 使用 XHS SDK API 发布...")
      print(f"[{self.platform_name}] 视频路径: {params.video_path}")
      print(f"[{self.platform_name}] 标题: {params.title}")

      # Convert the cookie payload into the string form the SDK expects.
      cookie_list = self.parse_cookies(cookies)
      cookie_string = self.cookies_to_string(cookie_list) if cookie_list else cookies
      print(f"[{self.platform_name}] Cookie 长度: {len(cookie_string)}")
      self.report_progress(20, "正在上传视频...")

      # The return annotation is quoted so it is never evaluated at function
      # definition time; an unquoted ``str | None`` fails before Python 3.10.
      async def ensure_valid_cookie_for_sdk() -> "str | None":
          """Open the creator home page and return refreshed cookies, or None."""
          await self.init_browser()
          cookie_list_for_browser = self.parse_cookies(cookie_string)
          await self.set_cookies(cookie_list_for_browser)
          if not self.page or not self.context:
              return None
          await self.page.goto(
              "https://creator.xiaohongshu.com/new/home",
              wait_until="domcontentloaded",
              timeout=60000,
          )
          await asyncio.sleep(2)
          current_url = (self.page.url or "").lower()
          if "login" in current_url or "passport" in current_url:
              # Nobody can log in by hand in headless mode.
              if self.headless:
                  return None
              # Poll for up to 180 s while the user logs in manually.
              waited = 0
              while waited < 180:
                  current_url = (self.page.url or "").lower()
                  if (
                      "login" not in current_url
                      and "passport" not in current_url
                      and "creator.xiaohongshu.com" in current_url
                  ):
                      break
                  await asyncio.sleep(2)
                  waited += 2
          # Re-read the URL: still on a login page means the refresh failed.
          current_url = (self.page.url or "").lower()
          if "login" in current_url or "passport" in current_url:
              return None
          cookies_after = await self.context.cookies()
          try:
              await self.sync_cookies_to_node(cookies_after)
          except Exception as sync_err:
              # Best effort only: a failed sync must not abort publishing,
              # but it should be visible in the log.
              print(
                  f"[{self.platform_name}] 同步 cookies 失败,已忽略: {sync_err}",
                  flush=True,
              )
          refreshed_cookie_str = self.cookies_to_string(cookies_after)
          return refreshed_cookie_str or None

      def call_create_video_note(sdk_cookie_str: str):
          """Invoke the SDK's ``create_video_note`` with the given cookie string."""
          xhs_client = XhsClient(sdk_cookie_str, sign=self.sign_sync)
          return xhs_client.create_video_note(
              title=params.title,
              desc=params.description or params.title,
              topics=params.tags or [],
              post_time=params.publish_date.strftime("%Y-%m-%d %H:%M:%S")
              if params.publish_date
              else None,
              video_path=params.video_path,
              cover_path=params.cover_path
              if params.cover_path and os.path.exists(params.cover_path)
              else None,
          )

      print(f"[{self.platform_name}] 开始调用 create_video_note...")
      try:
          result = call_create_video_note(cookie_string)
          print(f"[{self.platform_name}] SDK 返回结果: {result}")
      except Exception as e:
          err_text = str(e)
          if (
              "无登录信息" in err_text
              or '"code": -100' in err_text
              or "'code': -100" in err_text
          ):
              # Login info rejected by the SDK: refresh cookies, retry once.
              self.report_progress(15, "登录信息失效,尝试刷新登录信息...")
              refreshed = await ensure_valid_cookie_for_sdk()
              if not refreshed:
                  screenshot_base64 = await self.capture_screenshot()
                  page_url = (
                      await self.get_page_url()
                      if hasattr(self, "get_page_url")
                      else (self.page.url if self.page else "")
                  )
                  return PublishResult(
                      success=False,
                      platform=self.platform_name,
                      error="登录已过期,请使用有头浏览器重新登录",
                      screenshot_base64=screenshot_base64,
                      page_url=page_url,
                      status="need_captcha",
                      need_captcha=True,
                      captcha_type="login",
                  )
              try:
                  result = call_create_video_note(refreshed)
                  print(f"[{self.platform_name}] SDK 重试返回结果: {result}")
              except Exception as e2:
                  import traceback

                  traceback.print_exc()
                  raise Exception(f"XHS SDK 发布失败: {e2}") from e2
          else:
              import traceback

              traceback.print_exc()
              print(f"[{self.platform_name}] SDK 调用失败: {e}")
              raise Exception(f"XHS SDK 发布失败: {e}") from e

      # Validate the SDK response.
      if not result:
          raise Exception("XHS SDK 返回空结果")

      result_is_dict = isinstance(result, dict)
      if result_is_dict:
          if result.get("code") and result.get("code") != 0:
              raise Exception(f"发布失败: {result.get('msg', '未知错误')}")
          # Equality (not identity) is kept on purpose so a falsy-but-equal
          # value such as 0 is still treated as a failure.
          if result.get("success") == False:  # noqa: E712
              raise Exception(
                  f"发布失败: {result.get('msg', result.get('error', '未知错误'))}"
              )

      note_id = result.get("note_id", "") if result_is_dict else ""
      video_url = result.get("url", "") if result_is_dict else ""
      if not note_id:
          print(f"[{self.platform_name}] 警告: 未获取到 note_id,返回结果: {result}")

      self.report_progress(100, "发布成功")
      print(f"[{self.platform_name}] 发布成功! note_id={note_id}, url={video_url}")
      return PublishResult(
          success=True,
          platform=self.platform_name,
          video_id=note_id,
          video_url=video_url,
          message="发布成功",
      )
  async def publish(self, cookies: str, params: PublishParams) -> PublishResult:
      """Publish a video to Xiaohongshu, preferring the SDK and falling back to Playwright.

      Order of attempts:
        * If a proxy with a ``server`` entry is configured, go straight to
          the Playwright flow.
        * Otherwise, when the xhs SDK is available, try ``publish_via_api``.
          Its result is returned when it succeeded, or when it failed with an
          error message that does not contain "请刷新".
        * In every other case (including an exception from the API path) the
          Playwright flow is used.

      Args:
          cookies: Raw cookie payload passed on to the chosen publish flow.
          params: Publish parameters; ``video_path`` must point to an
              existing file.

      Returns:
          The ``PublishResult`` produced by the flow that handled the request.

      Raises:
          Exception: The video file does not exist.
      """
      tag = f"[{self.platform_name}]"
      rule = "=" * 60

      print(f"\n{rule}")
      print(f"{tag} 开始发布视频")
      print(f"{tag} 视频路径: {params.video_path}")
      print(f"{tag} 标题: {params.title}")
      print(f"{tag} Headless: {self.headless}")
      print(f"{tag} XHS SDK 可用: {XHS_SDK_AVAILABLE}")
      print(f"{rule}")

      # Refuse to continue without the video file.
      if not os.path.exists(params.video_path):
          raise Exception(f"视频文件不存在: {params.video_path}")
      video_size = os.path.getsize(params.video_path)
      print(f"{tag} 视频文件存在,大小: {video_size} bytes")

      self.report_progress(5, "正在准备发布...")

      # A configured proxy skips the SDK path and goes straight to Playwright.
      proxy_settings = getattr(self, "proxy_config", None)
      if isinstance(proxy_settings, dict) and proxy_settings.get("server"):
          print(
              f"{tag} 检测到代理配置,跳过 SDK 方式,使用 Playwright 走代理发布",
              flush=True,
          )
          return await self.publish_via_playwright(cookies, params)

      # Preferred path: the xhs SDK.
      if XHS_SDK_AVAILABLE:
          try:
              print(f"{tag} 尝试使用 XHS SDK API 发布...")
              api_result = await self.publish_via_api(cookies, params)
              print(f"{tag} API 发布完成: success={api_result.success}")
              # Return on success, and also on a failure that carries a
              # concrete error other than a "please refresh" hint.
              if api_result.success or (
                  api_result.error and "请刷新" not in api_result.error
              ):
                  return api_result
              print(f"{tag} API 方式未成功,尝试 Playwright...")
          except Exception as api_error:
              reason = str(api_error)
              login_lost = "登录已过期" in reason or "无登录信息" in reason
              if not login_lost:
                  import traceback

                  traceback.print_exc()
                  print(f"{tag} API 发布失败: {api_error}")
                  print(f"{tag} 尝试使用 Playwright 方式...")
              else:
                  print(
                      f"{tag} API 登录失效,切换到 Playwright 方式...",
                      flush=True,
                  )

      # Fallback path: drive the web UI with Playwright.
      print(f"{tag} 使用 Playwright 方式发布...")
      return await self.publish_via_playwright(cookies, params)
  287. async def publish_via_playwright(
  288. self, cookies: str, params: PublishParams
  289. ) -> PublishResult:
  290. """通过 Playwright 发布视频"""
  291. self.report_progress(10, "正在初始化浏览器...")
  292. print(f"[{self.platform_name}] Playwright 方式开始...")
  293. await self.init_browser()
  294. cookie_list = self.parse_cookies(cookies)
  295. print(f"[{self.platform_name}] 设置 {len(cookie_list)} 个 cookies")
  296. await self.set_cookies(cookie_list)
  297. if not self.page:
  298. raise Exception("Page not initialized")
  299. self.report_progress(15, "正在打开发布页面...")
  300. # 直接访问视频发布页面
  301. publish_url = "https://creator.xiaohongshu.com/publish/publish?source=official"
  302. print(f"[{self.platform_name}] 打开页面: {publish_url}")
  303. await self.page.goto(publish_url)
  304. await asyncio.sleep(3)
  305. current_url = self.page.url
  306. print(f"[{self.platform_name}] 当前 URL: {current_url}")
  307. async def wait_for_manual_login(timeout_seconds: int = 300) -> bool:
  308. if not self.page:
  309. return False
  310. self.report_progress(12, "检测到需要登录,请在浏览器窗口完成登录...")
  311. try:
  312. await self.page.bring_to_front()
  313. except:
  314. pass
  315. waited = 0
  316. while waited < timeout_seconds:
  317. try:
  318. url = self.page.url
  319. if (
  320. "login" not in url
  321. and "passport" not in url
  322. and "creator.xiaohongshu.com" in url
  323. ):
  324. return True
  325. await asyncio.sleep(2)
  326. waited += 2
  327. except:
  328. await asyncio.sleep(2)
  329. waited += 2
  330. return False
  331. async def wait_for_manual_captcha(timeout_seconds: int = 180) -> bool:
  332. waited = 0
  333. while waited < timeout_seconds:
  334. try:
  335. ai_captcha = await self.ai_check_captcha()
  336. if not ai_captcha.get("has_captcha"):
  337. return True
  338. except:
  339. pass
  340. await asyncio.sleep(3)
  341. waited += 3
  342. return False
  343. # 检查登录状态
  344. if "login" in current_url or "passport" in current_url:
  345. if not self.headless:
  346. logged_in = await wait_for_manual_login()
  347. if logged_in:
  348. try:
  349. if self.context:
  350. cookies_after = await self.context.cookies()
  351. await self.sync_cookies_to_node(cookies_after)
  352. except:
  353. pass
  354. await self.page.goto(publish_url)
  355. await asyncio.sleep(3)
  356. current_url = self.page.url
  357. else:
  358. screenshot_base64 = await self.capture_screenshot()
  359. return PublishResult(
  360. success=False,
  361. platform=self.platform_name,
  362. error="需要登录:请在浏览器窗口完成登录后重试",
  363. screenshot_base64=screenshot_base64,
  364. page_url=current_url,
  365. status="need_captcha",
  366. need_captcha=True,
  367. captcha_type="login",
  368. )
  369. else:
  370. screenshot_base64 = await self.capture_screenshot()
  371. return PublishResult(
  372. success=False,
  373. platform=self.platform_name,
  374. error="登录已过期,请重新登录",
  375. screenshot_base64=screenshot_base64,
  376. page_url=current_url,
  377. status="need_captcha",
  378. need_captcha=True,
  379. captcha_type="login",
  380. )
  381. # 使用 AI 检查验证码
  382. ai_captcha = await self.ai_check_captcha()
  383. if ai_captcha["has_captcha"]:
  384. print(
  385. f"[{self.platform_name}] AI检测到验证码: {ai_captcha['captcha_type']}",
  386. flush=True,
  387. )
  388. if not self.headless:
  389. solved = await wait_for_manual_captcha()
  390. if solved:
  391. try:
  392. if self.context:
  393. cookies_after = await self.context.cookies()
  394. await self.sync_cookies_to_node(cookies_after)
  395. except:
  396. pass
  397. else:
  398. screenshot_base64 = await self.capture_screenshot()
  399. return PublishResult(
  400. success=False,
  401. platform=self.platform_name,
  402. error=f"需要验证码:请在浏览器窗口完成验证后重试",
  403. screenshot_base64=screenshot_base64,
  404. page_url=current_url,
  405. status="need_captcha",
  406. need_captcha=True,
  407. captcha_type=ai_captcha["captcha_type"],
  408. )
  409. else:
  410. screenshot_base64 = await self.capture_screenshot()
  411. return PublishResult(
  412. success=False,
  413. platform=self.platform_name,
  414. error=f"检测到{ai_captcha['captcha_type']}验证码,需要使用有头浏览器完成验证",
  415. screenshot_base64=screenshot_base64,
  416. page_url=current_url,
  417. status="need_captcha",
  418. need_captcha=True,
  419. captcha_type=ai_captcha["captcha_type"],
  420. )
  421. self.report_progress(20, "正在上传视频...")
  422. # 等待页面加载
  423. await asyncio.sleep(2)
  424. # 上传视频
  425. upload_triggered = False
  426. # 方法1: 直接设置隐藏的 file input
  427. print(f"[{self.platform_name}] 尝试方法1: 设置 file input")
  428. file_inputs = self.page.locator('input[type="file"]')
  429. input_count = await file_inputs.count()
  430. print(f"[{self.platform_name}] 找到 {input_count} 个 file input")
  431. if input_count > 0:
  432. # 找到接受视频的 input
  433. for i in range(input_count):
  434. input_el = file_inputs.nth(i)
  435. accept = await input_el.get_attribute("accept") or ""
  436. print(f"[{self.platform_name}] Input {i} accept: {accept}")
  437. if "video" in accept or "*" in accept or not accept:
  438. await input_el.set_input_files(params.video_path)
  439. upload_triggered = True
  440. print(f"[{self.platform_name}] 视频文件已设置到 input {i}")
  441. break
  442. # 方法2: 点击上传区域触发文件选择器
  443. if not upload_triggered:
  444. print(f"[{self.platform_name}] 尝试方法2: 点击上传区域")
  445. try:
  446. upload_area = self.page.locator(
  447. '[class*="upload-wrapper"], [class*="upload-area"], .upload-input'
  448. ).first
  449. if await upload_area.count() > 0:
  450. async with self.page.expect_file_chooser(timeout=5000) as fc_info:
  451. await upload_area.click()
  452. file_chooser = await fc_info.value
  453. await file_chooser.set_files(params.video_path)
  454. upload_triggered = True
  455. print(f"[{self.platform_name}] 通过点击上传区域上传成功")
  456. except Exception as e:
  457. print(f"[{self.platform_name}] 方法2失败: {e}")
  458. if not upload_triggered:
  459. screenshot_base64 = await self.capture_screenshot()
  460. page_url = await self.get_page_url()
  461. return PublishResult(
  462. success=False,
  463. platform=self.platform_name,
  464. error="无法上传视频文件",
  465. screenshot_base64=screenshot_base64,
  466. page_url=page_url,
  467. status="need_action",
  468. )
  469. self.report_progress(40, "等待视频上传完成...")
  470. print(f"[{self.platform_name}] 等待视频上传和处理...")
  471. # 等待上传完成(检测页面变化)
  472. upload_complete = False
  473. for i in range(60): # 最多等待3分钟
  474. await asyncio.sleep(3)
  475. # 检查是否有标题输入框(上传完成后出现)
  476. title_input_count = await self.page.locator(
  477. 'input[placeholder*="标题"], input[placeholder*="填写标题"]'
  478. ).count()
  479. # 或者检查编辑器区域
  480. editor_count = await self.page.locator(
  481. '[class*="ql-editor"], [contenteditable="true"]'
  482. ).count()
  483. # 检查发布按钮是否可见
  484. publish_btn_count = await self.page.locator(
  485. '.publishBtn, button:has-text("发布")'
  486. ).count()
  487. print(
  488. f"[{self.platform_name}] 检测 {i + 1}: 标题框={title_input_count}, 编辑器={editor_count}, 发布按钮={publish_btn_count}"
  489. )
  490. if title_input_count > 0 or (editor_count > 0 and publish_btn_count > 0):
  491. upload_complete = True
  492. print(f"[{self.platform_name}] 视频上传完成!")
  493. break
  494. if not upload_complete:
  495. screenshot_base64 = await self.capture_screenshot()
  496. page_url = await self.get_page_url()
  497. return PublishResult(
  498. success=False,
  499. platform=self.platform_name,
  500. error="视频上传超时",
  501. screenshot_base64=screenshot_base64,
  502. page_url=page_url,
  503. status="need_action",
  504. )
  505. await asyncio.sleep(2)
  506. self.report_progress(60, "正在填写笔记信息...")
  507. print(f"[{self.platform_name}] 填写标题: {params.title[:20]}")
  508. # 填写标题
  509. title_filled = False
  510. title_selectors = [
  511. 'input[placeholder*="标题"]',
  512. 'input[placeholder*="填写标题"]',
  513. '[class*="title"] input',
  514. ".c-input_inner",
  515. ]
  516. for selector in title_selectors:
  517. title_input = self.page.locator(selector).first
  518. if await title_input.count() > 0:
  519. await title_input.click()
  520. await title_input.fill("") # 先清空
  521. await title_input.fill(params.title[:20])
  522. title_filled = True
  523. print(f"[{self.platform_name}] 标题已填写,使用选择器: {selector}")
  524. break
  525. if not title_filled:
  526. print(f"[{self.platform_name}] 警告: 未找到标题输入框")
  527. # 填写描述和标签
  528. if params.description or params.tags:
  529. desc_filled = False
  530. desc_selectors = [
  531. '[class*="ql-editor"]',
  532. '[class*="content-input"] [contenteditable="true"]',
  533. '[class*="editor"] [contenteditable="true"]',
  534. ".ql-editor",
  535. ]
  536. for selector in desc_selectors:
  537. desc_input = self.page.locator(selector).first
  538. if await desc_input.count() > 0:
  539. await desc_input.click()
  540. await asyncio.sleep(0.5)
  541. if params.description:
  542. await self.page.keyboard.type(params.description, delay=20)
  543. print(f"[{self.platform_name}] 描述已填写")
  544. if params.tags:
  545. # 添加标签
  546. await self.page.keyboard.press("Enter")
  547. for tag in params.tags[:5]: # 最多5个标签
  548. await self.page.keyboard.type(f"#{tag}", delay=20)
  549. await asyncio.sleep(0.3)
  550. await self.page.keyboard.press("Space")
  551. print(f"[{self.platform_name}] 标签已填写: {params.tags[:5]}")
  552. desc_filled = True
  553. break
  554. if not desc_filled:
  555. print(f"[{self.platform_name}] 警告: 未找到描述输入框")
  556. await asyncio.sleep(2)
  557. self.report_progress(80, "正在发布...")
  558. await asyncio.sleep(2)
  559. # 滚动到页面底部确保发布按钮可见
  560. await self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
  561. await asyncio.sleep(1)
  562. print(f"[{self.platform_name}] 查找发布按钮...")
  563. # 点击发布
  564. publish_selectors = [
  565. "button.publishBtn",
  566. ".publishBtn",
  567. "button.d-button.red",
  568. 'button:has-text("发布"):not(:has-text("定时发布"))',
  569. '[class*="publish"][class*="btn"]',
  570. ]
  571. publish_clicked = False
  572. for selector in publish_selectors:
  573. try:
  574. btn = self.page.locator(selector).first
  575. if await btn.count() > 0:
  576. is_visible = await btn.is_visible()
  577. is_enabled = await btn.is_enabled()
  578. print(
  579. f"[{self.platform_name}] 按钮 {selector}: visible={is_visible}, enabled={is_enabled}"
  580. )
  581. if is_visible and is_enabled:
  582. box = await btn.bounding_box()
  583. if box:
  584. print(
  585. f"[{self.platform_name}] 点击发布按钮: {selector}, 位置: ({box['x']}, {box['y']})"
  586. )
  587. # 使用真实鼠标点击
  588. await self.page.mouse.click(
  589. box["x"] + box["width"] / 2,
  590. box["y"] + box["height"] / 2,
  591. )
  592. publish_clicked = True
  593. break
  594. except Exception as e:
  595. print(f"[{self.platform_name}] 选择器 {selector} 错误: {e}")
  596. if not publish_clicked:
  597. try:
  598. suggest = await self.ai_suggest_playwright_selector(
  599. "点击小红书发布按钮"
  600. )
  601. if suggest.get("has_selector") and suggest.get("selector"):
  602. sel = suggest.get("selector")
  603. btn = self.page.locator(sel).first
  604. if (
  605. await btn.count() > 0
  606. and await btn.is_visible()
  607. and await btn.is_enabled()
  608. ):
  609. try:
  610. await btn.click()
  611. except:
  612. box = await btn.bounding_box()
  613. if box:
  614. await self.page.mouse.click(
  615. box["x"] + box["width"] / 2,
  616. box["y"] + box["height"] / 2,
  617. )
  618. publish_clicked = True
  619. except Exception as e:
  620. print(f"[{self.platform_name}] AI 点击发布按钮失败: {e}", flush=True)
  621. if not publish_clicked:
  622. # 保存截图用于调试
  623. screenshot_dir = os.path.join(
  624. os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
  625. "screenshots",
  626. )
  627. os.makedirs(screenshot_dir, exist_ok=True)
  628. screenshot_path = os.path.join(
  629. screenshot_dir, f"debug_publish_failed_{self.platform_name}.png"
  630. )
  631. await self.page.screenshot(path=screenshot_path, full_page=True)
  632. print(
  633. f"[{self.platform_name}] 未找到发布按钮,截图保存到: {screenshot_path}"
  634. )
  635. # 打印页面 HTML 结构用于调试
  636. buttons = await self.page.query_selector_all("button")
  637. print(f"[{self.platform_name}] 页面上共有 {len(buttons)} 个按钮")
  638. for i, btn in enumerate(buttons[:10]):
  639. text = await btn.text_content() or ""
  640. cls = await btn.get_attribute("class") or ""
  641. print(f" 按钮 {i}: text='{text.strip()[:30]}', class='{cls[:50]}'")
  642. raise Exception("未找到发布按钮")
  643. print(f"[{self.platform_name}] 已点击发布按钮,等待发布完成...")
  644. self.report_progress(90, "等待发布结果...")
  645. # 等待发布完成(检测 URL 变化或成功提示)
  646. publish_success = False
  647. refresh_retry = 0
  648. for i in range(20): # 最多等待 20 秒
  649. await asyncio.sleep(1)
  650. current_url = self.page.url
  651. # 检查是否跳转到发布成功页面或内容管理页面
  652. if (
  653. "published=true" in current_url
  654. or "success" in current_url
  655. or "content" in current_url
  656. ):
  657. publish_success = True
  658. print(f"[{self.platform_name}] 发布成功! 跳转到: {current_url}")
  659. break
  660. # 检查是否有成功提示
  661. try:
  662. success_msg = await self.page.locator(
  663. '[class*="success"], .toast-success, [class*="Toast"]'
  664. ).first.is_visible()
  665. if success_msg:
  666. publish_success = True
  667. print(f"[{self.platform_name}] 检测到成功提示!")
  668. break
  669. except:
  670. pass
  671. # 检查是否有错误提示
  672. try:
  673. error_elements = self.page.locator(
  674. '[class*="error"], .toast-error, [class*="fail"]'
  675. )
  676. if await error_elements.count() > 0:
  677. first_error = error_elements.first
  678. if await first_error.is_visible():
  679. error_text = (await first_error.text_content()) or ""
  680. error_text = error_text.strip()
  681. if error_text:
  682. if "请刷新" in error_text and refresh_retry < 3:
  683. refresh_retry += 1
  684. print(
  685. f"[{self.platform_name}] 检测到临时错误: {error_text},尝试刷新并重试发布({refresh_retry}/3)",
  686. flush=True,
  687. )
  688. try:
  689. await self.page.reload(
  690. wait_until="domcontentloaded"
  691. )
  692. except Exception:
  693. pass
  694. await asyncio.sleep(2)
  695. await self.page.evaluate(
  696. "window.scrollTo(0, document.body.scrollHeight)"
  697. )
  698. await asyncio.sleep(1)
  699. republish_clicked = False
  700. for selector in publish_selectors:
  701. try:
  702. btn = self.page.locator(selector).first
  703. if (
  704. await btn.count() > 0
  705. and await btn.is_visible()
  706. and await btn.is_enabled()
  707. ):
  708. try:
  709. await btn.click()
  710. except:
  711. box = await btn.bounding_box()
  712. if box:
  713. await self.page.mouse.click(
  714. box["x"] + box["width"] / 2,
  715. box["y"] + box["height"] / 2,
  716. )
  717. republish_clicked = True
  718. break
  719. except:
  720. continue
  721. continue
  722. screenshot_base64 = await self.capture_screenshot()
  723. page_url = await self.get_page_url()
  724. return PublishResult(
  725. success=False,
  726. platform=self.platform_name,
  727. error=f"发布失败: {error_text}",
  728. screenshot_base64=screenshot_base64,
  729. page_url=page_url,
  730. status="failed",
  731. )
  732. except Exception as e:
  733. if "发布失败" in str(e):
  734. raise
  735. # 如果没有明确的成功标志,返回截图供 AI 分析
  736. if not publish_success:
  737. final_url = self.page.url
  738. print(f"[{self.platform_name}] 发布结果不确定,当前 URL: {final_url}")
  739. screenshot_base64 = await self.capture_screenshot()
  740. print(f"[{self.platform_name}] 已获取截图供 AI 分析")
  741. # 如果 URL 还是发布页面,可能需要继续操作
  742. if "publish/publish" in final_url:
  743. return PublishResult(
  744. success=False,
  745. platform=self.platform_name,
  746. error="发布结果待确认,请查看截图",
  747. screenshot_base64=screenshot_base64,
  748. page_url=final_url,
  749. status="need_action",
  750. )
  751. self.report_progress(100, "发布完成")
  752. print(f"[{self.platform_name}] Playwright 方式发布完成!")
  753. screenshot_base64 = await self.capture_screenshot()
  754. page_url = await self.get_page_url()
  755. return PublishResult(
  756. success=True,
  757. platform=self.platform_name,
  758. message="发布完成",
  759. screenshot_base64=screenshot_base64,
  760. page_url=page_url,
  761. status="success",
  762. )
  async def get_account_info(self, cookies: str) -> dict:
      """Fetch basic profile data for the account identified by ``cookies``.

      Opens the creator home page with the given cookies and listens for the
      ``api/galaxy/creator/home/personal_info`` response, from which the
      account id, name, avatar and fan count are read. If nothing has been
      captured after ten one-second polls, the page is reloaded once and
      polled again.

      Args:
          cookies: Raw cookie string, parsed with ``self.parse_cookies``.

      Returns:
          ``{"success": True, ...profile fields}`` when the API response was
          captured, otherwise ``{"success": False, "error": <message>}``.
          The browser is closed in either case.
      """
      banner = "=" * 60
      print(f"\n{banner}")
      print(f"[{self.platform_name}] 获取账号信息")
      print(f"{banner}")

      # Filled in by the response listener below; empty means "not captured".
      profile = {}

      async def on_response(response):
          # Only the personal-info endpoint is of interest.
          if "api/galaxy/creator/home/personal_info" not in response.url:
              return
          try:
              payload = await response.json()
              print(f"[{self.platform_name}] 捕获个人信息 API", flush=True)
              if not payload.get("success") and payload.get("code") != 0:
                  return
              info = payload.get("data", {})
              red_num = info.get("red_num", "")
              fresh = {
                  "account_id": f"xhs_{red_num}",
                  "account_name": info.get("name", ""),
                  "avatar_url": info.get("avatar", ""),
                  "fans_count": info.get("fans_count", 0),
                  # The exact number of works is not available from this
                  # endpoint; it has to come from the works list instead.
                  "works_count": 0,
              }
              profile.update(fresh)
          except Exception as exc:
              print(
                  f"[{self.platform_name}] 解析个人信息失败: {exc}", flush=True
              )

      async def wait_for_profile():
          # Poll once per second, at most ten times, until data shows up.
          for _ in range(10):
              if profile:
                  return
              await asyncio.sleep(1)

      try:
          await self.init_browser()
          await self.set_cookies(self.parse_cookies(cookies))
          if not self.page:
              raise Exception("Page not initialized")

          # Listen for the personal-info API before navigating.
          self.page.on("response", on_response)

          # Visit the creator home page, which triggers the API request.
          print(f"[{self.platform_name}] 访问创作者首页...", flush=True)
          await self.page.goto(
              "https://creator.xiaohongshu.com/new/home",
              wait_until="domcontentloaded",
          )

          # Wait for the API response; reload once if it never arrives.
          await wait_for_profile()
          if not profile:
              print(
                  f"[{self.platform_name}] 未捕获到个人信息,尝试刷新...", flush=True
              )
              await self.page.reload()
              await wait_for_profile()

          if not profile:
              raise Exception("无法获取账号信息")

          # works_count stays 0 here; get_works is expected to update it.
          return {"success": True, **profile}
      except Exception as exc:
          import traceback

          traceback.print_exc()
          return {"success": False, "error": str(exc)}
      finally:
          await self.close_browser()
  async def get_works(
      self, cookies: str, page: int = 0, page_size: int = 20
  ) -> WorksResult:
      """Fetch one page of the account's notes via the creator note-list API.

      Loads the note-manager page with the supplied cookies, waits for the
      page's ``window._webmsxyw`` signing function, and then calls
      ``/api/galaxy/v2/creator/note/user/posted`` from inside the page so the
      request carries the session cookies and signature headers.

      Args:
          cookies: Raw cookie string, parsed with ``self.parse_cookies``.
          page: Page index sent to the API as the ``page`` query parameter.
          page_size: Only logged; the request carries no page-size parameter
              and the paging arithmetic assumes 20 notes per page.

      Returns:
          A ``WorksResult``. On success it holds the parsed works, the total
          (declared by the API, otherwise estimated from the page index),
          ``has_more`` and ``next_page``. Any exception is caught and
          reported as ``success=False`` with the exception text in ``error``.
          The browser is closed in either case.
      """
      print(f"\n{'=' * 60}", flush=True)
      print(f"[{self.platform_name}] 获取作品列表", flush=True)
      print(f"[{self.platform_name}] page={page}, page_size={page_size}", flush=True)
      print(f"{'=' * 60}", flush=True)

      works: List[WorkItem] = []
      total = 0
      has_more = False
      next_page = ""
      api_page_size = 20
      max_sign_check_attempts = 10
      max_fetch_attempts = 3

      # True when the page has installed its request-signing function.
      sign_check_js = """() => {
          return typeof window !== 'undefined' && typeof window._webmsxyw === 'function';
      }"""

      # Runs inside the page: signs the request if possible, performs the
      # fetch, and returns the JSON body plus a `_debug` section.
      fetch_notes_js = """async (pageNum) => {
          try {
              // 使用正确的 API 端点:/api/galaxy/v2/creator/note/user/posted
              const url = `/api/galaxy/v2/creator/note/user/posted?tab=0&page=${pageNum}`;
              const headers = {
                  'Accept': 'application/json, text/plain, */*',
                  'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
                  'Referer': 'https://creator.xiaohongshu.com/new/note-manager',
                  'Sec-Fetch-Dest': 'empty',
                  'Sec-Fetch-Mode': 'cors',
                  'Sec-Fetch-Site': 'same-origin'
              };
              // 尝试获取签名
              let signResult = { hasSign: false, x_s: '', x_t: '', x_s_common: '', error: '' };
              if (typeof window !== 'undefined' && typeof window._webmsxyw === 'function') {
                  try {
                      const sign = window._webmsxyw(url, '');
                      headers['x-s'] = sign['X-s'];
                      headers['x-t'] = String(sign['X-t']);
                      // 检查是否有 x-s-common
                      if (sign['X-s-common']) {
                          headers['x-s-common'] = sign['X-s-common'];
                      }
                      signResult = {
                          hasSign: true,
                          x_s: sign['X-s'] ? sign['X-s'].substring(0, 50) + '...' : '',
                          x_t: String(sign['X-t']),
                          x_s_common: sign['X-s-common'] ? sign['X-s-common'].substring(0, 50) + '...' : '',
                          error: ''
                      };
                      console.log('签名生成成功:', signResult);
                  } catch (e) {
                      signResult.error = e.toString();
                      console.error('签名生成失败:', e);
                  }
              } else {
                  signResult.error = '_webmsxyw function not found';
                  console.error('签名函数不存在');
              }
              const res = await fetch(url, {
                  method: 'GET',
                  credentials: 'include',
                  headers
              });
              const responseData = await res.json();
              return {
                  ...responseData,
                  _debug: {
                      signResult: signResult,
                      status: res.status,
                      statusText: res.statusText
                  }
              };
          } catch (e) {
              return { success: false, error: e.toString() };
          }
      }"""

      async def fetch_notes_page(p):
          """Call the note-list API for page ``p`` from inside the browser page."""
          # Re-check the signing function before every call; give the page a
          # short grace period if it is (still) missing, then try anyway.
          if not await self.page.evaluate(sign_check_js):
              print(
                  f"[{self.platform_name}] ⚠️ 签名函数 _webmsxyw 不可用,等待...",
                  flush=True,
              )
              await asyncio.sleep(2)
          return await self.page.evaluate(fetch_notes_js, p)

      def is_success(resp):
          """Return True for a dict response flagged successful with data."""
          return (
              isinstance(resp, dict)
              and bool(resp.get("success") or resp.get("code") == 0)
              and bool(resp.get("data"))
          )

      def first_count(source, keys):
          """Return the first truthy value among ``keys`` as an int, else 0."""
          for key in keys:
              value = source.get(key)
              if value:
                  return int(value)
          return 0

      def declared_total(data):
          """Work out the total note count the API declares, or 0 if none."""
          count_keys = ("notes_count", "notesCount", "count")
          tags = data.get("tags", []) or []
          declared = 0
          if tags:
              # Prefer the "special.note_time_desc" tag; fall back to the
              # largest count over all tags.
              for tag in tags:
                  if tag.get("id") == "special.note_time_desc":
                      declared = first_count(tag, count_keys)
                      break
              if not declared:
                  declared = max(
                      (first_count(t, count_keys) for t in tags), default=0
                  )
          if not declared:
              declared = first_count(data, ("total", "total_count", "totalCount"))
          page_info = data.get("page", {})
          if not declared and isinstance(page_info, dict):
              declared = first_count(page_info, ("total", "totalCount"))
          return declared

      def parse_notes(notes_list):
          """Convert raw note dicts into ``WorkItem`` objects, skipping notes without an id."""
          status_by_tab = {0: "draft", 2: "reviewing", 3: "rejected"}
          parsed = []
          for note in notes_list:
              note_id = note.get("id", "")
              if not note_id:
                  continue
              # `or` guards against keys that are present but null.
              images_list = note.get("images_list") or []
              cover_url = (images_list[0].get("url") or "") if images_list else ""
              if cover_url.startswith("http://"):
                  cover_url = cover_url.replace("http://", "https://")
              duration = (note.get("video_info") or {}).get("duration", 0)
              status = status_by_tab.get(note.get("tab_status", 1), "published")
              parsed.append(
                  WorkItem(
                      work_id=note_id,
                      title=note.get("display_title", "") or "无标题",
                      cover_url=cover_url,
                      video_url=f"https://www.xiaohongshu.com/explore/{note_id}",
                      duration=duration,
                      status=status,
                      publish_time=note.get("time", ""),
                      play_count=note.get("view_count", 0),
                      like_count=note.get("likes", 0),
                      comment_count=note.get("comments_count", 0),
                      share_count=note.get("shared_count", 0),
                      collect_count=note.get("collected_count", 0),
                  )
              )
          return parsed

      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)
          # Log the cookie count for debugging.
          print(
              f"[{self.platform_name}] 解析到 {len(cookie_list)} 个 cookies",
              flush=True,
          )
          await self.set_cookies(cookie_list)
          if not self.page:
              raise Exception("Page not initialized")

          # Open the note-manager page; a navigation timeout is tolerated
          # because the page may already be usable.
          print(f"[{self.platform_name}] 访问笔记管理页面...", flush=True)
          try:
              await self.page.goto(
                  "https://creator.xiaohongshu.com/new/note-manager",
                  wait_until="domcontentloaded",
                  timeout=30000,
              )
          except Exception as nav_error:
              print(
                  f"[{self.platform_name}] 导航超时,但继续尝试: {nav_error}",
                  flush=True,
              )

          # A redirect to a login URL means the cookies are no longer valid.
          current_url = self.page.url
          print(f"[{self.platform_name}] 当前页面: {current_url}", flush=True)
          if "login" in current_url:
              raise Exception("Cookie 已过期,请重新登录")

          # Give the page time to finish loading and install the signer.
          print(
              f"[{self.platform_name}] 等待页面完全加载和签名函数初始化...",
              flush=True,
          )
          await asyncio.sleep(3)

          for attempt in range(1, max_sign_check_attempts + 1):
              if await self.page.evaluate(sign_check_js):
                  print(
                      f"[{self.platform_name}] ✓ 签名函数 _webmsxyw 已可用",
                      flush=True,
                  )
                  break
              print(
                  f"[{self.platform_name}] ⏳ 等待签名函数... ({attempt}/{max_sign_check_attempts})",
                  flush=True,
              )
              await asyncio.sleep(1)
          else:
              # Never became available: carry on, the API call may fail.
              print(
                  f"[{self.platform_name}] ⚠️ 警告: 签名函数 _webmsxyw 在 {max_sign_check_attempts} 次检查后仍不可用",
                  flush=True,
              )
              print(
                  f"[{self.platform_name}] 继续尝试,但 API 调用可能会失败",
                  flush=True,
              )

          resp = None
          for attempt in range(1, max_fetch_attempts + 1):
              resp = await fetch_notes_page(page)
              # Log and strip the debug section so it does not leak further.
              if isinstance(resp, dict) and resp.get("_debug"):
                  debug_info = resp.pop("_debug")
                  sign_result = debug_info.get("signResult", {})
                  print(
                      f"[{self.platform_name}] 🔍 调试信息: 签名可用: {sign_result.get('hasSign', False)}, X-S: {sign_result.get('x_s', '')}, X-T: {sign_result.get('x_t', '')}, X-S-Common: {sign_result.get('x_s_common', '')}, 签名错误: {sign_result.get('error', '')}, HTTP 状态: {debug_info.get('status', 'N/A')}",
                      flush=True,
                  )
              if is_success(resp):
                  break
              print(
                  f"[{self.platform_name}] 拉取作品列表失败,重试 {attempt}/3: {str(resp)[:200]}",
                  flush=True,
              )
              # Back off before the next attempt; no point after the last one.
              if attempt < max_fetch_attempts:
                  await asyncio.sleep(1.2 * attempt)

          if not is_success(resp):
              if isinstance(resp, dict):
                  # Print every error detail the response carries.
                  for label, key in (
                      ("错误消息", "msg"),
                      ("错误消息", "message"),
                      ("错误", "error"),
                  ):
                      if resp.get(key):
                          print(
                              f"[{self.platform_name}] {label}: {resp.get(key)}",
                              flush=True,
                          )
                  # Use whichever field actually describes the failure.
                  error_msg = (
                      resp.get("msg")
                      or resp.get("message")
                      or resp.get("error")
                      or str(resp)[:200]
                  )
              else:
                  error_msg = str(resp)
              raise Exception(f"无法获取作品列表数据: {error_msg}")

          data = resp.get("data", {}) or {}
          notes = data.get("notes", []) or []
          print(
              f"[{self.platform_name}] 第 {page} 页 notes 数量: {len(notes)}",
              flush=True,
          )

          total = declared_total(data)

          next_page = data.get("page", "")
          if next_page == page:
              next_page = page + 1

          works.extend(parse_notes(notes))

          if total:
              has_more = (page * api_page_size + len(notes)) < total
              # The API did not provide a usable cursor: advance by one page.
              if has_more and (
                  next_page in (-1, "", None) or str(next_page) == "-1"
              ):
                  next_page = page + 1
          elif notes:
              # No declared total: probe the following page to see whether
              # any notes remain.
              # NOTE(review): next_page is taken from the probe response's
              # "page" field — confirm this is the cursor callers expect.
              next_resp = await fetch_notes_page(page + 1)
              next_data = (
                  (next_resp.get("data") or {})
                  if isinstance(next_resp, dict)
                  else {}
              )
              next_notes = next_data.get("notes", []) or []
              has_more = len(next_notes) > 0
              next_page = next_data.get("page", next_page)
          else:
              has_more = False
      except Exception as e:
          import traceback

          print(f"[{self.platform_name}] 发生异常: {e}", flush=True)
          traceback.print_exc()
          return WorksResult(success=False, platform=self.platform_name, error=str(e))
      finally:
          # Always close the browser, on success and on failure.
          await self.close_browser()

      return WorksResult(
          success=True,
          platform=self.platform_name,
          works=works,
          total=total or (page * api_page_size + len(works)),
          has_more=has_more,
          next_page=next_page,
      )
  1152. async def get_all_works(self, cookies: str) -> WorksResult:
  1153. """获取小红书全部作品(单次请求内自动翻页抓全量,避免 Node 侧分页不一致)"""
  1154. print(f"\n{'=' * 60}", flush=True)
  1155. print(f"[{self.platform_name}] 获取全部作品(auto paging)", flush=True)
  1156. print(f"{'=' * 60}", flush=True)
  1157. works: List[WorkItem] = []
  1158. total = 0
  1159. seen_ids = set()
  1160. cursor: object = 0
  1161. max_iters = 800
  1162. api_page_size = 20
  1163. try:
  1164. await self.init_browser()
  1165. cookie_list = self.parse_cookies(cookies)
  1166. print(
  1167. f"[{self.platform_name}] 解析到 {len(cookie_list)} 个 cookies",
  1168. flush=True,
  1169. )
  1170. await self.set_cookies(cookie_list)
  1171. if not self.page:
  1172. raise Exception("Page not initialized")
  1173. print(f"[{self.platform_name}] 访问笔记管理页面...", flush=True)
  1174. try:
  1175. await self.page.goto(
  1176. "https://creator.xiaohongshu.com/new/note-manager",
  1177. wait_until="domcontentloaded",
  1178. timeout=60000,
  1179. )
  1180. print(f"[{self.platform_name}] 页面加载成功", flush=True)
  1181. except Exception as nav_error:
  1182. print(
  1183. f"[{self.platform_name}] 导航超时,但继续尝试: {nav_error}",
  1184. flush=True,
  1185. )
  1186. # 即使超时也检查当前页面状态
  1187. try:
  1188. await asyncio.sleep(2)
  1189. current_url = self.page.url
  1190. print(
  1191. f"[{self.platform_name}] 超时后当前页面: {current_url}",
  1192. flush=True,
  1193. )
  1194. except Exception as e:
  1195. print(f"[{self.platform_name}] 检查页面状态时出错: {e}", flush=True)
  1196. current_url = self.page.url
  1197. print(f"[{self.platform_name}] 当前页面: {current_url}", flush=True)
  1198. if "login" in current_url:
  1199. raise Exception("Cookie 已过期,请重新登录")
  1200. # 等待页面完全加载,确保签名函数可用
  1201. print(
  1202. f"[{self.platform_name}] 等待页面完全加载和签名函数初始化...",
  1203. flush=True,
  1204. )
  1205. await asyncio.sleep(3)
  1206. # 检查签名函数是否可用
  1207. sign_check_attempts = 0
  1208. max_sign_check_attempts = 10
  1209. while sign_check_attempts < max_sign_check_attempts:
  1210. sign_available = await self.page.evaluate("""() => {
  1211. return typeof window !== 'undefined' && typeof window._webmsxyw === 'function';
  1212. }""")
  1213. if sign_available:
  1214. print(
  1215. f"[{self.platform_name}] ✓ 签名函数 _webmsxyw 已可用",
  1216. flush=True,
  1217. )
  1218. break
  1219. sign_check_attempts += 1
  1220. print(
  1221. f"[{self.platform_name}] ⏳ 等待签名函数... ({sign_check_attempts}/{max_sign_check_attempts})",
  1222. flush=True,
  1223. )
  1224. await asyncio.sleep(1)
  1225. if sign_check_attempts >= max_sign_check_attempts:
  1226. print(
  1227. f"[{self.platform_name}] ⚠️ 警告: 签名函数 _webmsxyw 在 {max_sign_check_attempts} 次检查后仍不可用",
  1228. flush=True,
  1229. )
  1230. print(
  1231. f"[{self.platform_name}] 继续尝试,但 API 调用可能会失败",
  1232. flush=True,
  1233. )
  1234. async def fetch_notes_page(p):
  1235. # 再次检查签名函数(每次调用前都检查)
  1236. sign_available = await self.page.evaluate("""() => {
  1237. return typeof window !== 'undefined' && typeof window._webmsxyw === 'function';
  1238. }""")
  1239. if not sign_available:
  1240. print(
  1241. f"[{self.platform_name}] ⚠️ 签名函数 _webmsxyw 不可用,等待...",
  1242. flush=True,
  1243. )
  1244. await asyncio.sleep(2)
  1245. return await self.page.evaluate(
  1246. """async (pageNum) => {
  1247. try {
  1248. // 使用正确的 API 端点:/api/galaxy/v2/creator/note/user/posted
  1249. const url = `/api/galaxy/v2/creator/note/user/posted?tab=0&page=${pageNum}`;
  1250. const headers = {
  1251. 'Accept': 'application/json, text/plain, */*',
  1252. 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
  1253. 'Referer': 'https://creator.xiaohongshu.com/new/note-manager',
  1254. 'Sec-Fetch-Dest': 'empty',
  1255. 'Sec-Fetch-Mode': 'cors',
  1256. 'Sec-Fetch-Site': 'same-origin'
  1257. };
  1258. // 尝试获取签名
  1259. let signResult = { hasSign: false, x_s: '', x_t: '', x_s_common: '', error: '' };
  1260. if (typeof window !== 'undefined' && typeof window._webmsxyw === 'function') {
  1261. try {
  1262. const sign = window._webmsxyw(url, '');
  1263. headers['x-s'] = sign['X-s'];
  1264. headers['x-t'] = String(sign['X-t']);
  1265. // 检查是否有 x-s-common
  1266. if (sign['X-s-common']) {
  1267. headers['x-s-common'] = sign['X-s-common'];
  1268. }
  1269. signResult = {
  1270. hasSign: true,
  1271. x_s: sign['X-s'] ? sign['X-s'].substring(0, 50) + '...' : '',
  1272. x_t: String(sign['X-t']),
  1273. x_s_common: sign['X-s-common'] ? sign['X-s-common'].substring(0, 50) + '...' : '',
  1274. error: ''
  1275. };
  1276. console.log('签名生成成功:', signResult);
  1277. } catch (e) {
  1278. signResult.error = e.toString();
  1279. console.error('签名生成失败:', e);
  1280. }
  1281. } else {
  1282. signResult.error = '_webmsxyw function not found';
  1283. console.error('签名函数不存在');
  1284. }
  1285. const res = await fetch(url, {
  1286. method: 'GET',
  1287. credentials: 'include',
  1288. headers
  1289. });
  1290. const responseData = await res.json();
  1291. return {
  1292. ...responseData,
  1293. _debug: {
  1294. signResult: signResult,
  1295. status: res.status,
  1296. statusText: res.statusText
  1297. }
  1298. };
  1299. } catch (e) {
  1300. return { success: false, error: e.toString() };
  1301. }
  1302. }""",
  1303. p,
  1304. )
  1305. def parse_notes(notes_list):
  1306. parsed = []
  1307. for note in notes_list:
  1308. note_id = note.get("id", "")
  1309. if not note_id:
  1310. continue
  1311. cover_url = ""
  1312. images_list = note.get("images_list", [])
  1313. if images_list:
  1314. cover_url = images_list[0].get("url", "")
  1315. if cover_url.startswith("http://"):
  1316. cover_url = cover_url.replace("http://", "https://")
  1317. duration = note.get("video_info", {}).get("duration", 0)
  1318. status = "published"
  1319. tab_status = note.get("tab_status", 1)
  1320. if tab_status == 0:
  1321. status = "draft"
  1322. elif tab_status == 2:
  1323. status = "reviewing"
  1324. elif tab_status == 3:
  1325. status = "rejected"
  1326. video_url = (
  1327. f"https://www.xiaohongshu.com/explore/{note_id}"
  1328. if note_id
  1329. else ""
  1330. )
  1331. parsed.append(
  1332. WorkItem(
  1333. work_id=note_id,
  1334. title=note.get("display_title", "") or "无标题",
  1335. cover_url=cover_url,
  1336. video_url=video_url,
  1337. duration=duration,
  1338. status=status,
  1339. publish_time=note.get("time", ""),
  1340. play_count=note.get("view_count", 0),
  1341. like_count=note.get("likes", 0),
  1342. comment_count=note.get("comments_count", 0),
  1343. share_count=note.get("shared_count", 0),
  1344. collect_count=note.get("collected_count", 0),
  1345. )
  1346. )
  1347. return parsed
  1348. async def collect_by_scrolling() -> WorksResult:
  1349. print(
  1350. f"[{self.platform_name}] 直连接口被拒绝,切换为滚动页面 + 监听 API 响应模式",
  1351. flush=True,
  1352. )
  1353. captured: List[WorkItem] = []
  1354. captured_total = 0
  1355. captured_seen = set()
  1356. lock = asyncio.Lock()
  1357. async def handle_response(response):
  1358. nonlocal captured_total
  1359. url = response.url
  1360. if (
  1361. "creator.xiaohongshu.com" not in url
  1362. and "edith.xiaohongshu.com" not in url
  1363. ) or "creator/note/user/posted" not in url:
  1364. return
  1365. try:
  1366. json_data = await response.json()
  1367. except Exception:
  1368. return
  1369. if not isinstance(json_data, dict):
  1370. return
  1371. if not (
  1372. json_data.get("success") or json_data.get("code") == 0
  1373. ) or not json_data.get("data"):
  1374. return
  1375. data = json_data.get("data", {}) or {}
  1376. notes = data.get("notes", []) or []
  1377. tags = data.get("tags", []) or []
  1378. declared = 0
  1379. if tags:
  1380. preferred = 0
  1381. for tag in tags:
  1382. if tag.get("id") == "special.note_time_desc":
  1383. preferred = (
  1384. tag.get("notes_count", 0)
  1385. or tag.get("notesCount", 0)
  1386. or tag.get("count", 0)
  1387. or 0
  1388. )
  1389. break
  1390. if preferred:
  1391. declared = int(preferred)
  1392. else:
  1393. declared = max(
  1394. [
  1395. int(
  1396. t.get("notes_count", 0)
  1397. or t.get("notesCount", 0)
  1398. or t.get("count", 0)
  1399. or 0
  1400. )
  1401. for t in tags
  1402. ]
  1403. + [0]
  1404. )
  1405. if not declared:
  1406. declared = int(
  1407. data.get("total", 0)
  1408. or data.get("total_count", 0)
  1409. or data.get("totalCount", 0)
  1410. or 0
  1411. )
  1412. if not declared and isinstance(data.get("page", {}), dict):
  1413. declared = int(
  1414. data.get("page", {}).get("total", 0)
  1415. or data.get("page", {}).get("totalCount", 0)
  1416. or 0
  1417. )
  1418. async with lock:
  1419. if declared:
  1420. captured_total = max(captured_total, declared)
  1421. parsed = parse_notes(notes)
  1422. new_count = 0
  1423. for w in parsed:
  1424. if w.work_id and w.work_id not in captured_seen:
  1425. captured_seen.add(w.work_id)
  1426. captured.append(w)
  1427. new_count += 1
  1428. if new_count > 0:
  1429. print(
  1430. f"[{self.platform_name}] 捕获 notes 响应: notes={len(notes)}, new={new_count}, total_now={len(captured)}, declared_total={captured_total}",
  1431. flush=True,
  1432. )
  1433. self.page.on("response", handle_response)
  1434. try:
  1435. try:
  1436. # 使用更宽松的等待条件,避免超时
  1437. await self.page.goto(
  1438. "https://creator.xiaohongshu.com/new/note-manager",
  1439. wait_until="domcontentloaded",
  1440. timeout=90000,
  1441. )
  1442. print(f"[{self.platform_name}] 页面加载成功", flush=True)
  1443. except Exception as nav_error:
  1444. print(
  1445. f"[{self.platform_name}] 导航异常(继续):{nav_error}",
  1446. flush=True,
  1447. )
  1448. # 即使超时也继续尝试,可能页面已经部分加载
  1449. try:
  1450. await asyncio.sleep(3)
  1451. current_url = self.page.url
  1452. print(
  1453. f"[{self.platform_name}] 超时后当前页面: {current_url}",
  1454. flush=True,
  1455. )
  1456. if "login" in current_url:
  1457. raise Exception("Cookie 已过期,请重新登录")
  1458. except Exception as e:
  1459. if "Cookie" in str(e):
  1460. raise
  1461. print(
  1462. f"[{self.platform_name}] 检查页面状态时出错: {e}",
  1463. flush=True,
  1464. )
  1465. await asyncio.sleep(2.0)
  1466. idle_rounds = 0
  1467. last_count = 0
  1468. last_height = 0
  1469. for _ in range(1, 400):
  1470. scroll_state = await self.page.evaluate(
  1471. """() => {
  1472. const isScrollable = (el) => {
  1473. if (!el) return false;
  1474. const style = window.getComputedStyle(el);
  1475. const oy = style.overflowY;
  1476. return (oy === 'auto' || oy === 'scroll') && (el.scrollHeight - el.clientHeight > 200);
  1477. };
  1478. const pickBest = () => {
  1479. const nodes = Array.from(document.querySelectorAll('*'));
  1480. let best = document.scrollingElement || document.documentElement || document.body;
  1481. let bestScroll = (best.scrollHeight || 0) - (best.clientHeight || 0);
  1482. for (const el of nodes) {
  1483. if (!isScrollable(el)) continue;
  1484. const diff = el.scrollHeight - el.clientHeight;
  1485. if (diff > bestScroll) {
  1486. best = el;
  1487. bestScroll = diff;
  1488. }
  1489. }
  1490. return best;
  1491. };
  1492. const el = pickBest();
  1493. const beforeTop = el.scrollTop || 0;
  1494. const beforeHeight = el.scrollHeight || 0;
  1495. el.scrollTo(0, beforeHeight);
  1496. return {
  1497. beforeTop,
  1498. afterTop: el.scrollTop || 0,
  1499. height: el.scrollHeight || 0,
  1500. client: el.clientHeight || 0,
  1501. };
  1502. }"""
  1503. )
  1504. await asyncio.sleep(1.2)
  1505. async with lock:
  1506. count_now = len(captured)
  1507. total_now = captured_total
  1508. if total_now and count_now >= total_now:
  1509. break
  1510. height_now = (
  1511. int(scroll_state.get("height", 0) or 0)
  1512. if isinstance(scroll_state, dict)
  1513. else 0
  1514. )
  1515. if count_now == last_count and height_now == last_height:
  1516. idle_rounds += 1
  1517. else:
  1518. idle_rounds = 0
  1519. last_count = count_now
  1520. last_height = height_now
  1521. if idle_rounds >= 6:
  1522. break
  1523. async with lock:
  1524. final_works = list(captured)
  1525. final_total = captured_total or len(final_works)
  1526. return WorksResult(
  1527. success=True,
  1528. platform=self.platform_name,
  1529. works=final_works,
  1530. total=final_total,
  1531. has_more=False,
  1532. next_page=-1,
  1533. )
  1534. finally:
  1535. try:
  1536. self.page.remove_listener("response", handle_response)
  1537. except Exception:
  1538. pass
  1539. # 添加请求监听,捕获请求头信息
  1540. captured_requests = []
  1541. async def handle_request(request):
  1542. url = request.url
  1543. if (
  1544. "creator.xiaohongshu.com" in url or "edith.xiaohongshu.com" in url
  1545. ) and "creator/note/user/posted" in url:
  1546. headers = request.headers
  1547. captured_requests.append(
  1548. {
  1549. "url": url,
  1550. "method": request.method,
  1551. "headers": dict(headers),
  1552. "timestamp": asyncio.get_event_loop().time(),
  1553. }
  1554. )
  1555. # 打印关键头部信息
  1556. x_s = headers.get("x-s", "")
  1557. x_t = headers.get("x-t", "")
  1558. x_s_common = headers.get("x-s-common", "")
  1559. print(f"[{self.platform_name}] 📡 API 请求: {url}", flush=True)
  1560. print(
  1561. f"[{self.platform_name}] Method: {request.method}",
  1562. flush=True,
  1563. )
  1564. print(
  1565. f"[{self.platform_name}] X-S: {x_s[:50] if x_s else '(none)'}...",
  1566. flush=True,
  1567. )
  1568. print(f"[{self.platform_name}] X-T: {x_t}", flush=True)
  1569. print(
  1570. f"[{self.platform_name}] X-S-Common: {x_s_common[:50] if x_s_common else '(none)'}...",
  1571. flush=True,
  1572. )
  1573. print(
  1574. f"[{self.platform_name}] Cookie: {headers.get('cookie', '')[:100]}...",
  1575. flush=True,
  1576. )
  1577. self.page.on("request", handle_request)
  1578. iters = 0
  1579. page_count = 0 # 统计实际获取到的页数
  1580. print(
  1581. f"[{self.platform_name}] ========== 开始自动分页获取作品 ==========",
  1582. flush=True,
  1583. )
  1584. print(
  1585. f"[{self.platform_name}] 最大迭代次数: {max_iters}, 每页大小: {api_page_size}",
  1586. flush=True,
  1587. )
  1588. while iters < max_iters:
  1589. iters += 1
  1590. print(
  1591. f"\n[{self.platform_name}] ---------- 第 {iters} 次请求 (cursor={cursor}) ----------",
  1592. flush=True,
  1593. )
  1594. resp = await fetch_notes_page(cursor)
  1595. # 打印调试信息
  1596. if resp and isinstance(resp, dict) and resp.get("_debug"):
  1597. debug_info = resp.get("_debug", {})
  1598. sign_result = debug_info.get("signResult", {})
  1599. print(f"[{self.platform_name}] 🔍 调试信息:", flush=True)
  1600. print(
  1601. f"[{self.platform_name}] 签名可用: {sign_result.get('hasSign', False)}",
  1602. flush=True,
  1603. )
  1604. if sign_result.get("x_s"):
  1605. print(
  1606. f"[{self.platform_name}] X-S: {sign_result.get('x_s', '')}",
  1607. flush=True,
  1608. )
  1609. if sign_result.get("x_t"):
  1610. print(
  1611. f"[{self.platform_name}] X-T: {sign_result.get('x_t', '')}",
  1612. flush=True,
  1613. )
  1614. if sign_result.get("error"):
  1615. print(
  1616. f"[{self.platform_name}] 签名错误: {sign_result.get('error', '')}",
  1617. flush=True,
  1618. )
  1619. print(
  1620. f"[{self.platform_name}] HTTP 状态: {debug_info.get('status', 'N/A')} {debug_info.get('statusText', '')}",
  1621. flush=True,
  1622. )
  1623. # 移除调试信息,避免影响后续处理
  1624. resp.pop("_debug", None)
  1625. if not resp or not isinstance(resp, dict):
  1626. print(
  1627. f"[{self.platform_name}] ❌ 第 {iters} 次拉取无响应,cursor={cursor}",
  1628. flush=True,
  1629. )
  1630. print(
  1631. f"[{self.platform_name}] 响应类型: {type(resp)}, 响应内容: {str(resp)[:500]}",
  1632. flush=True,
  1633. )
  1634. break
  1635. if not (resp.get("success") or resp.get("code") == 0) or not resp.get(
  1636. "data"
  1637. ):
  1638. error_msg = str(resp)[:500]
  1639. print(
  1640. f"[{self.platform_name}] ❌ 拉取失败 cursor={cursor}",
  1641. flush=True,
  1642. )
  1643. print(f"[{self.platform_name}] 响应详情: {error_msg}", flush=True)
  1644. print(
  1645. f"[{self.platform_name}] success={resp.get('success')}, code={resp.get('code')}, has_data={bool(resp.get('data'))}",
  1646. flush=True,
  1647. )
  1648. # 打印详细的错误信息
  1649. if resp.get("msg"):
  1650. print(
  1651. f"[{self.platform_name}] 错误消息: {resp.get('msg')}",
  1652. flush=True,
  1653. )
  1654. if resp.get("message"):
  1655. print(
  1656. f"[{self.platform_name}] 错误消息: {resp.get('message')}",
  1657. flush=True,
  1658. )
  1659. if resp.get("error"):
  1660. print(
  1661. f"[{self.platform_name}] 错误: {resp.get('error')}",
  1662. flush=True,
  1663. )
  1664. # 打印调试信息
  1665. if resp.get("_debug"):
  1666. debug_info = resp.get("_debug", {})
  1667. print(
  1668. f"[{self.platform_name}] HTTP 状态: {debug_info.get('status', 'N/A')} {debug_info.get('statusText', '')}",
  1669. flush=True,
  1670. )
  1671. sign_result = debug_info.get("signResult", {})
  1672. if sign_result.get("error"):
  1673. print(
  1674. f"[{self.platform_name}] 签名错误: {sign_result.get('error')}",
  1675. flush=True,
  1676. )
  1677. if iters == 1:
  1678. print(
  1679. f"[{self.platform_name}] 第一次请求失败,切换到滚动模式",
  1680. flush=True,
  1681. )
  1682. return await collect_by_scrolling()
  1683. break
  1684. data = resp.get("data", {}) or {}
  1685. notes = data.get("notes", []) or []
  1686. if not notes:
  1687. print(
  1688. f"[{self.platform_name}] ⚠️ cursor={cursor} 无作品,停止分页",
  1689. flush=True,
  1690. )
  1691. break
  1692. # 统计页数
  1693. page_count += 1
  1694. print(
  1695. f"[{self.platform_name}] ✅ 第 {page_count} 页获取成功,本页作品数: {len(notes)}",
  1696. flush=True,
  1697. )
  1698. tags = data.get("tags", []) or []
  1699. if tags:
  1700. preferred = 0
  1701. for tag in tags:
  1702. if tag.get("id") == "special.note_time_desc":
  1703. preferred = (
  1704. tag.get("notes_count", 0)
  1705. or tag.get("notesCount", 0)
  1706. or tag.get("count", 0)
  1707. or 0
  1708. )
  1709. break
  1710. if preferred:
  1711. total = max(total, int(preferred))
  1712. print(
  1713. f"[{self.platform_name}] 📊 从 tags 获取总数: {total} (preferred)",
  1714. flush=True,
  1715. )
  1716. else:
  1717. tag_total = max(
  1718. [
  1719. int(
  1720. t.get("notes_count", 0)
  1721. or t.get("notesCount", 0)
  1722. or t.get("count", 0)
  1723. or 0
  1724. )
  1725. for t in tags
  1726. ]
  1727. + [0]
  1728. )
  1729. total = max(total, tag_total)
  1730. if tag_total > 0:
  1731. print(
  1732. f"[{self.platform_name}] 📊 从 tags 获取总数: {total}",
  1733. flush=True,
  1734. )
  1735. if not total:
  1736. t2 = int(
  1737. data.get("total", 0)
  1738. or data.get("total_count", 0)
  1739. or data.get("totalCount", 0)
  1740. or 0
  1741. )
  1742. if not t2 and isinstance(data.get("page", {}), dict):
  1743. t2 = int(
  1744. data.get("page", {}).get("total", 0)
  1745. or data.get("page", {}).get("totalCount", 0)
  1746. or 0
  1747. )
  1748. total = max(total, t2)
  1749. if t2 > 0:
  1750. print(
  1751. f"[{self.platform_name}] 📊 从 data.total 获取总数: {total}",
  1752. flush=True,
  1753. )
  1754. parsed = parse_notes(notes)
  1755. new_items = []
  1756. for w in parsed:
  1757. if w.work_id and w.work_id not in seen_ids:
  1758. seen_ids.add(w.work_id)
  1759. new_items.append(w)
  1760. works.extend(new_items)
  1761. print(
  1762. f"[{self.platform_name}] 📈 累计统计: 本页新作品={len(new_items)}, 累计作品数={len(works)}, 声明总数={total}",
  1763. flush=True,
  1764. )
  1765. if total and len(works) >= total:
  1766. print(
  1767. f"[{self.platform_name}] ✅ 已获取全部作品 (累计={len(works)} >= 总数={total}),停止分页",
  1768. flush=True,
  1769. )
  1770. break
  1771. if len(new_items) == 0:
  1772. print(
  1773. f"[{self.platform_name}] ⚠️ 本页无新作品,停止分页", flush=True
  1774. )
  1775. break
  1776. next_page = data.get("page", "")
  1777. old_cursor = cursor
  1778. if next_page == cursor:
  1779. next_page = ""
  1780. if next_page == -1 or str(next_page) == "-1":
  1781. next_page = ""
  1782. if next_page is None or next_page == "":
  1783. if isinstance(cursor, int):
  1784. cursor = cursor + 1
  1785. else:
  1786. cursor = len(works) // api_page_size
  1787. print(
  1788. f"[{self.platform_name}] 🔄 下一页 cursor: {old_cursor} -> {cursor} (自动递增)",
  1789. flush=True,
  1790. )
  1791. else:
  1792. cursor = next_page
  1793. print(
  1794. f"[{self.platform_name}] 🔄 下一页 cursor: {old_cursor} -> {cursor} (API返回)",
  1795. flush=True,
  1796. )
  1797. await asyncio.sleep(0.5)
  1798. # 移除请求监听器
  1799. try:
  1800. self.page.remove_listener("request", handle_request)
  1801. except Exception:
  1802. pass
  1803. print(
  1804. f"\n[{self.platform_name}] ========== 分页完成 ==========", flush=True
  1805. )
  1806. print(
  1807. f"[{self.platform_name}] 📊 分页统计: 总请求次数={iters}, 成功获取页数={page_count}, 累计作品数={len(works)}, 声明总数={total}",
  1808. flush=True,
  1809. )
  1810. if captured_requests:
  1811. print(
  1812. f"[{self.platform_name}] 📡 捕获到 {len(captured_requests)} 个 API 请求",
  1813. flush=True,
  1814. )
  1815. for i, req in enumerate(captured_requests[:3], 1): # 只显示前3个
  1816. print(
  1817. f"[{self.platform_name}] 请求 {i}: {req['method']} {req['url']}",
  1818. flush=True,
  1819. )
  1820. if "x-s" in req["headers"]:
  1821. print(
  1822. f"[{self.platform_name}] X-S: {req['headers']['x-s'][:50]}...",
  1823. flush=True,
  1824. )
  1825. if "x-t" in req["headers"]:
  1826. print(
  1827. f"[{self.platform_name}] X-T: {req['headers']['x-t']}",
  1828. flush=True,
  1829. )
  1830. print(
  1831. f"[{self.platform_name}] ========================================\n",
  1832. flush=True,
  1833. )
  1834. except Exception as e:
  1835. import traceback
  1836. error_trace = traceback.format_exc()
  1837. print(f"[{self.platform_name}] 发生异常: {e}", flush=True)
  1838. traceback.print_exc()
  1839. return WorksResult(
  1840. success=False,
  1841. platform=self.platform_name,
  1842. error=str(e),
  1843. debug_info=f"异常详情: {error_trace[:500]}",
  1844. )
  1845. finally:
  1846. await self.close_browser()
  1847. debug_info = f"总请求次数={iters}, 成功获取页数={page_count}, 累计作品数={len(works)}, 声明总数={total}"
  1848. if len(works) == 0:
  1849. debug_info += " | 警告: 没有获取到任何作品,可能原因: Cookie失效、API调用失败、或账号无作品"
  1850. return WorksResult(
  1851. success=True,
  1852. platform=self.platform_name,
  1853. works=works,
  1854. total=total or len(works),
  1855. has_more=False,
  1856. next_page=-1,
  1857. debug_info=debug_info,
  1858. )
  async def get_comments(
      self, cookies: str, work_id: str, cursor: str = ""
  ) -> CommentsResult:
      """
      Collect the comments of one note by driving the account's public profile page.

      Flow: launch Chromium, restore saved cookies, open the home page, wait for
      a visible login dialog to disappear, open the profile of the user id found
      in ``cookies``, scroll until the page height stops growing, then click the
      cover whose note id equals ``work_id`` and read the comment-page API
      response captured after the click.

      Args:
          cookies: Raw cookie data; parsed with ``self.parse_cookies`` and used
              here only to look up the ``x-user-id-creator.xiaohongshu.com`` value.
          work_id: Id of the note whose comments are wanted.
          cursor: Accepted for interface compatibility; not used by this method.

      Returns:
          A ``CommentsResult`` whose ``comments`` is a flat list of top-level
          comments followed by their inline sub-comments. ``has_more`` is
          always ``False``. Any failure is printed and yields an empty result.
      """
      global stored_cookies

      all_comments: List[CommentItem] = []
      has_more = False
      playwright = None
      browser = None

      def build_item(raw, parent_id, reply_count):
          # Shared by top-level comments and sub-comments; null fields coming
          # from the API fall back to empty values instead of raising.
          author = raw.get("user_info", {}) or {}
          return CommentItem(
              comment_id=raw["id"],
              parent_comment_id=parent_id,
              work_id=work_id,
              content=raw["content"],
              author_id=author.get("user_id", ""),
              author_name=author.get("nickname", ""),
              author_avatar=author.get("image", ""),
              like_count=int(raw.get("like_count", 0) or 0),
              reply_count=reply_count,
              create_time=self._timestamp_to_readable(raw.get("create_time", 0)),
          )

      try:
          # --- Step 1: start the browser and restore cookies ---
          cookie_list = self.parse_cookies(cookies)
          playwright = await async_playwright().start()
          browser = await playwright.chromium.launch(headless=False)
          context = await browser.new_context(
              viewport={"width": 1400, "height": 900},
              user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
          )
          # NOTE(review): cookies are read from "cookies.json" here but written
          # to "xiaohongshu_cookies.json" below - confirm which name is intended.
          # NOTE(review): ``cookie_list`` is never added to the context; only
          # the file/global cookies are - confirm that this is intentional.
          if os.path.exists("cookies.json"):
              with open("cookies.json", "r") as f:
                  stored_cookies = json.load(f)
          # Read the module-level cache through globals() so that a missing
          # definition means "no cached cookies" rather than a NameError.
          cached_cookies = globals().get("stored_cookies")
          if cached_cookies:
              await context.add_cookies(cached_cookies)
          page = await context.new_page()

          # --- Step 2: open the Xiaohongshu home page ---
          await page.goto(
              "https://www.xiaohongshu.com", wait_until="domcontentloaded"
          )
          await asyncio.sleep(1.5)

          # --- Step 3: if a login dialog is showing, wait for it to close ---
          try:
              if await page.is_visible(".login-container", timeout=3000):
                  await page.wait_for_selector(
                      ".login-container", state="hidden", timeout=120000
                  )
                  stored_cookies = await context.cookies()
                  with open("xiaohongshu_cookies.json", "w") as f:
                      json.dump(stored_cookies, f)
          except Exception as exc:
              # Best effort: a timeout here must not abort the run.
              print(
                  f"[{self.platform_name}] 登录弹窗处理失败,继续执行: {exc}",
                  flush=True,
              )

          # --- Extract the user id from the supplied cookies ---
          user_id = next(
              (
                  c.get("value")
                  for c in cookie_list
                  if c.get("name") == "x-user-id-creator.xiaohongshu.com"
              ),
              None,
          )
          if not user_id:
              raise ValueError("无法从 Cookie 中提取 user_id")

          # --- Step 4: open the user's profile page ---
          profile_url = f"https://www.xiaohongshu.com/user/profile/{user_id}"
          await page.goto(profile_url, wait_until="domcontentloaded")
          await asyncio.sleep(2)

          # --- Wait for the note feed to render ---
          try:
              await page.wait_for_selector(
                  "#userPostedFeeds .note-item", timeout=20000
              )
          except Exception as exc:
              # `except Exception` (not a bare except) so that task
              # cancellation is not converted into this error.
              raise Exception(
                  "笔记区域未加载,请检查账号是否公开或 Cookie 是否有效"
              ) from exc

          # --- Step 5: scroll until the page height stops changing ---
          last_height = None
          while True:
              await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
              await asyncio.sleep(2)
              new_height = await page.evaluate("document.body.scrollHeight")
              if new_height == last_height:
                  break
              last_height = new_height

          # --- Collect every cover image in the feed ---
          note_imgs = await page.query_selector_all(
              "#userPostedFeeds .note-item .cover img"
          )
          print(f"共找到 {len(note_imgs)} 张封面图")

          # --- Step 6 & 7: click the matching cover and parse its comments ---
          for img in note_imgs:
              try:
                  # Derive the note id from the card's /explore/ link and skip
                  # every card that is not the requested work.
                  note_id = await img.evaluate("""el => {
                      const item = el.closest('.note-item');
                      if (!item) return null;
                      const link = item.querySelector('a[href^="/explore/"]');
                      return link ? link.href.split('/').pop() : null;
                  }""")
                  if note_id != work_id:
                      print(
                          f"note_id {note_id} 与目标 work_id {work_id} 不匹配,跳出循环"
                      )
                      continue

                  await img.scroll_into_view_if_needed()
                  await asyncio.sleep(0.5)

                  comment_resp = None

                  def handle_response(response):
                      nonlocal comment_resp
                      if (
                          "edith.xiaohongshu.com/api/sns/web/v2/comment/page"
                          in response.url
                      ):
                          comment_resp = response

                  page.on("response", handle_response)
                  try:
                      await img.click()
                      await asyncio.sleep(1.5)
                  finally:
                      # Always detach, even when the click itself fails.
                      page.remove_listener("response", handle_response)

                  if not comment_resp:
                      await page.keyboard.press("Escape")
                      continue
                  json_data = await comment_resp.json()
                  if not (json_data.get("success") or json_data.get("code") == 0):
                      await page.keyboard.press("Escape")
                      continue

                  data = json_data.get("data", {}) or {}
                  for main_cmt in data.get("comments", []) or []:
                      # Top-level comment
                      all_comments.append(
                          build_item(
                              main_cmt, None, main_cmt.get("sub_comment_count", 0)
                          )
                      )
                      # Sub-comments delivered inline with their parent
                      for sub_cmt in main_cmt.get("sub_comments", []) or []:
                          all_comments.append(
                              build_item(sub_cmt, main_cmt["id"], 0)
                          )

                  # Close the note dialog
                  await page.keyboard.press("Escape")
                  await asyncio.sleep(1)
              except Exception as exc:
                  print(
                      f"[{self.platform_name}] 处理笔记评论失败: {exc}",
                      flush=True,
                  )
                  # Still try to close the dialog so later cards stay clickable.
                  try:
                      await page.keyboard.press("Escape")
                      await asyncio.sleep(0.5)
                  except Exception:
                      pass

          # --- Build the result ---
          return CommentsResult(
              success=True,
              platform=self.platform_name,
              work_id=work_id,
              comments=all_comments,
              total=len(all_comments),
              has_more=has_more,
          )
      except Exception:
          import traceback

          traceback.print_exc()
          # NOTE(review): the failure path reports success=True with no
          # comments; kept as-is because callers may rely on it - confirm.
          return CommentsResult(
              success=True, platform=self.platform_name, work_id=work_id, total=0
          )
      finally:
          # Close the browser and also stop the Playwright driver that was
          # started manually above.
          try:
              if browser:
                  await browser.close()
          finally:
              if playwright:
                  await playwright.stop()
  def _timestamp_to_readable(self, ts_ms: int) -> str:
      """Format a millisecond epoch timestamp as a local-time ``YYYY-MM-DD HH:MM:SS`` string.

      Returns an empty string when ``ts_ms`` is falsy or cannot be converted.
      """
      from datetime import datetime

      formatted = ""
      if ts_ms:
          try:
              moment = datetime.fromtimestamp(ts_ms / 1000)
              formatted = moment.strftime("%Y-%m-%d %H:%M:%S")
          except Exception:
              # Any conversion failure falls back to the empty string.
              formatted = ""
      return formatted
  async def get_all_comments(self, cookies: str) -> dict:
      """
      Collect comments for all works through the creator-center comment page.

      Opens the comment management page with the given cookies, records every
      comment-list and note-list API response seen while the page loads and is
      scrolled, then groups the captured comments by note id.

      Args:
          cookies: Raw cookie data accepted by ``self.parse_cookies``.

      Returns:
          On success ``{"success": True, "platform", "work_comments", "total"}``
          where ``work_comments`` is a list of
          ``{"work_id", "title", "cover_url", "comments"}`` dicts and ``total``
          is the number of works in that list. On failure
          ``{"success": False, "platform", "error", "work_comments": []}``.
      """
      import re

      print(f"\n{'=' * 60}")
      print(f"[{self.platform_name}] 获取所有作品评论")
      print(f"{'=' * 60}")
      all_work_comments = []
      captured_comments = []
      captured_notes = {}  # note_id -> note_info

      def cover_url_of(cover):
          # A cover may be a dict carrying url/url_default or a plain URL string.
          if isinstance(cover, dict):
              return cover.get("url", "") or cover.get("url_default", "")
          if isinstance(cover, str):
              return cover
          return ""

      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)
          await self.set_cookies(cookie_list)
          if not self.page:
              raise Exception("Page not initialized")

          # Response listener: harvests comment and note payloads as they arrive.
          async def handle_response(response):
              url = response.url
              try:
                  # Comment list API - several URL shapes
                  if "/comment/" in url and ("page" in url or "list" in url):
                      json_data = await response.json()
                      print(
                          f"[{self.platform_name}] 捕获到评论 API: {url[:100]}...",
                          flush=True,
                      )
                      if json_data.get("success") or json_data.get("code") == 0:
                          data = json_data.get("data", {}) or {}
                          comments = data.get("comments", []) or data.get("list", [])
                          # Take the note id from the request URL when present
                          note_id_match = re.search(r"note_id=([^&]+)", url)
                          note_id = note_id_match.group(1) if note_id_match else ""
                          if comments:
                              for comment in comments:
                                  # Tag each comment with the note it belongs to
                                  if note_id and "note_id" not in comment:
                                      comment["note_id"] = note_id
                                  captured_comments.append(comment)
                              print(
                                  f"[{self.platform_name}] 捕获到 {len(comments)} 条评论 (note_id={note_id}),总计: {len(captured_comments)}",
                                  flush=True,
                              )
                  # Note list API
                  if "/note/" in url and (
                      "list" in url or "posted" in url or "manager" in url
                  ):
                      json_data = await response.json()
                      if json_data.get("success") or json_data.get("code") == 0:
                          data = json_data.get("data", {}) or {}
                          notes = (
                              data.get("notes", []) or data.get("list", []) or []
                          )
                          print(
                              f"[{self.platform_name}] 捕获到笔记列表 API: {len(notes)} 个笔记",
                              flush=True,
                          )
                          for note in notes:
                              note_id = note.get("note_id", "") or note.get("id", "")
                              if note_id:
                                  captured_notes[note_id] = {
                                      "title": note.get("title", "")
                                      or note.get("display_title", ""),
                                      "cover": cover_url_of(note.get("cover", {})),
                                  }
              except Exception as e:
                  print(f"[{self.platform_name}] 解析响应失败: {e}", flush=True)

          self.page.on("response", handle_response)
          print(f"[{self.platform_name}] 已注册 API 响应监听器", flush=True)

          # Open the comment management page
          print(f"[{self.platform_name}] 访问评论管理页面...", flush=True)
          await self.page.goto(
              "https://creator.xiaohongshu.com/creator/comment",
              wait_until="domcontentloaded",
              timeout=30000,
          )
          await asyncio.sleep(5)

          # A URL containing "login" is treated as an expired session
          current_url = self.page.url
          if "login" in current_url:
              raise Exception("Cookie 已过期,请重新登录")
          print(
              f"[{self.platform_name}] 页面加载完成,当前捕获: {len(captured_comments)} 条评论, {len(captured_notes)} 个笔记",
              flush=True,
          )

          # Scroll to trigger loading of more comments
          for _ in range(5):
              await self.page.evaluate("window.scrollBy(0, 500)")
              await asyncio.sleep(1)
          await asyncio.sleep(3)

          # Detach the listener
          self.page.remove_listener("response", handle_response)
          print(
              f"[{self.platform_name}] 最终捕获: {len(captured_comments)} 条评论, {len(captured_notes)} 个笔记",
              flush=True,
          )

          # Group comments by note
          work_comments_map = {}  # note_id -> work entry
          for comment in captured_comments:
              # `or {}` keeps a null note_info/note field from raising
              # AttributeError and discarding everything captured so far.
              note_info = (
                  comment.get("note_info", {}) or comment.get("note", {}) or {}
              )
              note_id = (
                  comment.get("note_id", "")
                  or note_info.get("note_id", "")
                  or note_info.get("id", "")
              )
              if not note_id:
                  continue
              if note_id not in work_comments_map:
                  saved_note = captured_notes.get(note_id, {})
                  cover_url = cover_url_of(note_info.get("cover", {}))
                  if not cover_url:
                      cover_url = saved_note.get("cover", "")
                  work_comments_map[note_id] = {
                      "work_id": note_id,
                      "title": note_info.get("title", "")
                      or note_info.get("display_title", "")
                      or saved_note.get("title", ""),
                      "cover_url": cover_url,
                      "comments": [],
                  }
              cid = comment.get("id", "") or comment.get("comment_id", "")
              if not cid:
                  continue
              user_info = (
                  comment.get("user_info", {}) or comment.get("user", {}) or {}
              )
              work_comments_map[note_id]["comments"].append(
                  {
                      "comment_id": cid,
                      "author_id": user_info.get("user_id", "")
                      or user_info.get("id", ""),
                      "author_name": user_info.get("nickname", "")
                      or user_info.get("name", ""),
                      "author_avatar": user_info.get("image", "")
                      or user_info.get("avatar", ""),
                      "content": comment.get("content", ""),
                      "like_count": comment.get("like_count", 0),
                      "create_time": comment.get("create_time", ""),
                  }
              )
          all_work_comments = list(work_comments_map.values())
          total_comments = sum(len(w["comments"]) for w in all_work_comments)
          print(
              f"[{self.platform_name}] 获取到 {len(all_work_comments)} 个作品的 {total_comments} 条评论",
              flush=True,
          )
      except Exception as e:
          import traceback

          traceback.print_exc()
          return {
              "success": False,
              "platform": self.platform_name,
              "error": str(e),
              "work_comments": [],
          }
      finally:
          await self.close_browser()
      return {
          "success": True,
          "platform": self.platform_name,
          "work_comments": all_work_comments,
          "total": len(all_work_comments),
      }
  async def get_note_base(self, cookies: str, note_id: str) -> dict:
      """
      Call the creator-center "note data - note/base" endpoint, used for the daily work-data sync.

      Uses the account's stored cookies and sends the request directly with a
      matching Referer; no browser is started.

      Args:
          cookies: Raw cookie data accepted by ``self.parse_cookies``.
          note_id: Id of the note to query; surrounding whitespace is ignored.

      Returns:
          The decoded JSON object on HTTP 200. Otherwise a dict of the form
          ``{"data": None, "code": ..., "msg": ...}``: ``code`` is ``-1`` for a
          missing id or a 200 response whose JSON is not an object, and the HTTP
          status when the body is not JSON or the status is not 200.
      """
      note_id = (note_id or "").strip()
      if not note_id:
          return {"data": None, "code": -1, "msg": "missing note_id"}

      # Imported only once a request will actually be made, so the cheap guard
      # above does not depend on aiohttp.
      import aiohttp
      from urllib.parse import quote

      # Percent-encode the id so characters such as "&" or "#" cannot alter
      # the query string or the Referer.
      encoded_id = quote(note_id, safe="")

      cookie_list = self.parse_cookies(cookies)
      cookie_dict = {
          c.get("name") or "": c.get("value") or ""
          for c in cookie_list
          if c.get("name")
      }
      api_headers = {
          "Accept": "application/json, text/plain, */*",
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
          "Referer": f"https://creator.xiaohongshu.com/statistics/note-detail?noteId={encoded_id}",
          "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
          "Accept-Encoding": "gzip, deflate, br",
          "Connection": "keep-alive",
      }
      api_url = f"https://creator.xiaohongshu.com/api/galaxy/creator/datacenter/note/base?note_id={encoded_id}"
      async with aiohttp.ClientSession(cookies=cookie_dict) as session:
          async with session.get(
              api_url,
              headers=api_headers,
              timeout=aiohttp.ClientTimeout(total=30),
          ) as resp:
              status = resp.status
              try:
                  data = await resp.json()
              except Exception:
                  # Body was not decodable as JSON; log a preview of it.
                  text = await resp.text()
                  print(f"[{self.platform_name}] note/base non-JSON: {text[:500]}")
                  return {"data": None, "code": status, "msg": "invalid response"}
              if status != 200:
                  return {
                      "data": None,
                      "code": status,
                      "msg": data.get("msg") if isinstance(data, dict) else "request failed",
                  }
              return (
                  data
                  if isinstance(data, dict)
                  else {"data": None, "code": -1, "msg": "invalid response"}
              )
  async def get_account_base(self, cookies: str) -> dict:
      """
      Call the creator-center "account overview - account/base" endpoint, used for the daily user-data sync.

      Uses the account's stored cookies and requests the API directly with a
      matching Referer; no browser is started.

      Returns the decoded JSON object on HTTP 200, otherwise a
      ``{"data": None, "code": ..., "msg": ...}`` dict describing the failure.
      """
      import aiohttp

      # Keep only cookies that have a name; a missing value becomes "".
      jar = {}
      for entry in self.parse_cookies(cookies):
          cookie_name = entry.get("name")
          if cookie_name:
              jar[cookie_name] = entry.get("value") or ""

      request_headers = {
          "Accept": "application/json, text/plain, */*",
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
          "Referer": "https://creator.xiaohongshu.com/statistics/account/v2",
          "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
          "Accept-Encoding": "gzip, deflate, br",
          "Connection": "keep-alive",
      }
      endpoint = "https://creator.xiaohongshu.com/api/galaxy/v2/creator/datacenter/account/base"
      deadline = aiohttp.ClientTimeout(total=30)

      async with aiohttp.ClientSession(cookies=jar) as http:
          async with http.get(
              endpoint, headers=request_headers, timeout=deadline
          ) as reply:
              http_code = reply.status
              try:
                  payload = await reply.json()
              except Exception:
                  # Not JSON: log a preview of the raw body.
                  raw_body = await reply.text()
                  print(f"[{self.platform_name}] account/base non-JSON: {raw_body[:500]}")
                  return {"data": None, "code": http_code, "msg": "invalid response"}

              payload_is_object = isinstance(payload, dict)
              if http_code != 200:
                  failure_msg = (
                      payload.get("msg") if payload_is_object else "request failed"
                  )
                  return {"data": None, "code": http_code, "msg": failure_msg}
              if payload_is_object:
                  return payload
              return {"data": None, "code": -1, "msg": "invalid response"}
  async def get_fans_overall_new(self, cookies: str) -> dict:
      """
      Call the creator-center "fans data - overall_new" endpoint, used for the fan trend in the daily user data.

      Uses the account's stored cookies and requests the API directly with a
      matching Referer; no browser is started.

      Returns the decoded JSON object on HTTP 200, otherwise a
      ``{"data": None, "code": ..., "msg": ...}`` dict describing the failure.
      """
      import aiohttp

      # Keep only cookies that have a name; a missing value becomes "".
      jar = {}
      for entry in self.parse_cookies(cookies):
          cookie_name = entry.get("name")
          if cookie_name:
              jar[cookie_name] = entry.get("value") or ""

      request_headers = {
          "Accept": "application/json, text/plain, */*",
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
          "Referer": "https://creator.xiaohongshu.com/statistics/fans-data",
          "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
          "Accept-Encoding": "gzip, deflate, br",
          "Connection": "keep-alive",
      }
      endpoint = "https://creator.xiaohongshu.com/api/galaxy/creator/data/fans/overall_new"
      deadline = aiohttp.ClientTimeout(total=30)

      async with aiohttp.ClientSession(cookies=jar) as http:
          async with http.get(
              endpoint, headers=request_headers, timeout=deadline
          ) as reply:
              http_code = reply.status
              try:
                  payload = await reply.json()
              except Exception:
                  # Not JSON: log a preview of the raw body.
                  raw_body = await reply.text()
                  print(
                      f"[{self.platform_name}] fans/overall_new non-JSON: {raw_body[:500]}"
                  )
                  return {"data": None, "code": http_code, "msg": "invalid response"}

              payload_is_object = isinstance(payload, dict)
              if http_code != 200:
                  failure_msg = (
                      payload.get("msg") if payload_is_object else "request failed"
                  )
                  return {"data": None, "code": http_code, "msg": failure_msg}
              if payload_is_object:
                  return payload
              return {"data": None, "code": -1, "msg": "invalid response"}
  async def get_account_overview(self, cookies: str) -> dict:
      """
      Fetch account/base and fans/overall_new together for the daily user-data sync.

      Uses the stored cookies without visiting any page first: both APIs are
      requested concurrently over one session, each with its own Referer.

      Returns: { "account_base": {...}, "fans_overall_new": {...} }
      """
      import aiohttp

      # Keep only cookies that have a name; a missing value becomes "".
      jar = {}
      for entry in self.parse_cookies(cookies):
          cookie_name = entry.get("name")
          if cookie_name:
              jar[cookie_name] = entry.get("value") or ""

      def headers_for(referer):
          # Same header set and order for both calls; only the Referer differs.
          return {
              "Accept": "application/json, text/plain, */*",
              "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
              "Referer": referer,
              "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
              "Accept-Encoding": "gzip, deflate, br",
              "Connection": "keep-alive",
          }

      account_headers = headers_for(
          "https://creator.xiaohongshu.com/statistics/account/v2"
      )
      fans_headers = headers_for(
          "https://creator.xiaohongshu.com/statistics/fans-data"
      )
      account_endpoint = "https://creator.xiaohongshu.com/api/galaxy/v2/creator/datacenter/account/base"
      fans_endpoint = "https://creator.xiaohongshu.com/api/galaxy/creator/data/fans/overall_new"

      async def fetch_json(http, endpoint, request_headers, label):
          # One GET whose outcome is normalised to the API's JSON object or a
          # {"data": None, "code", "msg"} failure dict.
          async with http.get(
              endpoint,
              headers=request_headers,
              timeout=aiohttp.ClientTimeout(total=30),
          ) as reply:
              http_code = reply.status
              try:
                  payload = await reply.json()
              except Exception:
                  raw_body = await reply.text()
                  print(f"[{self.platform_name}] {label} non-JSON: {raw_body[:500]}")
                  return {"data": None, "code": http_code, "msg": "invalid response"}

              payload_is_object = isinstance(payload, dict)
              if http_code != 200:
                  failure_msg = (
                      payload.get("msg") if payload_is_object else "request failed"
                  )
                  return {"data": None, "code": http_code, "msg": failure_msg}
              if payload_is_object:
                  return payload
              return {"data": None, "code": -1, "msg": "invalid response"}

      async with aiohttp.ClientSession(cookies=jar) as http:
          account_outcome, fans_outcome = await asyncio.gather(
              fetch_json(http, account_endpoint, account_headers, "account/base"),
              fetch_json(http, fans_endpoint, fans_headers, "fans/overall_new"),
          )
      return {
          "account_base": account_outcome,
          "fans_overall_new": fans_outcome,
      }