baijiahao.py 190 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940
  1. # -*- coding: utf-8 -*-
  2. """
  3. 百家号视频发布器
  4. """
  5. import asyncio
  6. import json
  7. from typing import List
  8. from datetime import datetime
  9. from .base import (
  10. BasePublisher, PublishParams, PublishResult,
  11. WorkItem, WorksResult, CommentItem, CommentsResult
  12. )
  13. class BaijiahaoPublisher(BasePublisher):
  14. """
  15. 百家号视频发布器
  16. 使用 Playwright 自动化操作百家号创作者中心
  17. """
  18. platform_name = "baijiahao"
  19. login_url = "https://baijiahao.baidu.com/"
  20. publish_url = "https://baijiahao.baidu.com/builder/rc/edit?type=video"
  21. cookie_domain = ".baidu.com"
  22. # 登录检测配置
  23. login_check_url = "https://baijiahao.baidu.com/builder/rc/home"
  24. login_indicators = ["passport.baidu.com", "/login", "wappass.baidu.com"]
  25. login_selectors = ['text="登录"', 'text="请登录"', '[class*="login-btn"]']
  async def get_account_info(self, cookies: str) -> dict:
      """
      Fetch Baijiahao account information through direct HTTP API calls.

      No browser is used. The flow is:
        0. Visit the creator-center home page to establish the session context.
        1. Call the ``appinfo`` API; on errno 10001402 revisit the home page and
           retry once, continuing with the retry result when it succeeds.
        2. Best effort: fetch the fan count (failures leave it at 0).
        3. Best effort: fetch the work count (failures leave it at 0).

      Args:
          cookies: Raw cookie data, handed to ``self.parse_cookies``; each parsed
              entry is expected to expose ``name`` and ``value`` keys.

      Returns:
          On success a dict with ``success=True`` plus ``account_id``,
          ``account_name``, ``avatar_url``, ``fans_count`` and ``works_count``.
          On failure a dict with ``success=False`` and ``error``;
          ``need_login=True`` is included when the ``appinfo`` API reports an
          error or returns no user data. Unexpected exceptions are caught and
          reported as ``{"success": False, "error": str(exc)}``.
      """
      import aiohttp

      home_url = 'https://baijiahao.baidu.com/builder/rc/home'
      appinfo_url = 'https://baijiahao.baidu.com/builder/app/appinfo'
      html_accept = (
          'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,'
          'image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
      )
      user_agent = (
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
      )
      sec_ch_ua = '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"'

      print(f"\n{'='*60}")
      print(f"[{self.platform_name}] 获取账号信息 (使用 API)")
      print(f"{'='*60}")

      try:
          # Turn the parsed cookie list into a name -> value mapping for the session.
          cookie_list = self.parse_cookies(cookies)
          cookie_dict = {c['name']: c['value'] for c in cookie_list}

          # Important: the home page must be visited first to establish the session.
          print(f"[{self.platform_name}] 第一步:访问主页建立会话...")

          # Headers for the page navigation. The Cookie header is intentionally
          # absent: cookies are managed by the aiohttp session.
          session_headers = {
              'Accept': html_accept,
              'User-Agent': user_agent,
              'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
              'Accept-Encoding': 'gzip, deflate, br',
              'Connection': 'keep-alive',
              'Upgrade-Insecure-Requests': '1',
              'Sec-Fetch-Dest': 'document',
              'Sec-Fetch-Mode': 'navigate',
              'Sec-Fetch-Site': 'none',
              'Sec-Fetch-User': '?1',
              'sec-ch-ua': sec_ch_ua,
              'sec-ch-ua-mobile': '?0',
              'sec-ch-ua-platform': '"Windows"'
          }

          # Headers for the JSON API calls (same-origin XHR style).
          headers = {
              'Accept': 'application/json, text/plain, */*',
              'User-Agent': user_agent,
              'Referer': 'https://baijiahao.baidu.com/builder/rc/home',
              'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
              'Accept-Encoding': 'gzip, deflate, br',
              'Connection': 'keep-alive',
              'Sec-Fetch-Dest': 'empty',
              'Sec-Fetch-Mode': 'cors',
              'Sec-Fetch-Site': 'same-origin',
              'sec-ch-ua': sec_ch_ua,
              'sec-ch-ua-mobile': '?0',
              'sec-ch-ua-platform': '"Windows"'
          }

          request_timeout = aiohttp.ClientTimeout(total=30)

          # Seed the session with the cookies so aiohttp tracks cookie updates itself.
          async with aiohttp.ClientSession(cookies=cookie_dict) as session:
              # Step 0: visit the home page to establish the session context.
              print(f"[{self.platform_name}] [0/4] 访问主页建立会话上下文...")
              async with session.get(
                  home_url,
                  headers=session_headers,
                  timeout=request_timeout
              ) as home_response:
                  home_status = home_response.status
                  print(f"[{self.platform_name}] 主页访问状态: {home_status}")
                  # New cookies are only logged here; no manual processing is done.
                  if 'Set-Cookie' in home_response.headers:
                      print(f"[{self.platform_name}] 获取到新的会话Cookie")

              # Short pause so the session is established before the API call.
              await asyncio.sleep(1)

              # Step 1: basic account information.
              print(f"[{self.platform_name}] [1/4] 调用 appinfo API...")
              async with session.get(
                  appinfo_url,
                  headers=headers,
                  timeout=request_timeout
              ) as response:
                  appinfo_result = await response.json()

              print(f"[{self.platform_name}] appinfo API 完整响应: {json.dumps(appinfo_result, ensure_ascii=False)[:500]}")
              print(f"[{self.platform_name}] appinfo API 响应: errno={appinfo_result.get('errno')}")

              # Login / error check.
              if appinfo_result.get('errno') != 0:
                  error_msg = appinfo_result.get('errmsg', '未知错误')
                  errno = appinfo_result.get('errno')
                  print(f"[{self.platform_name}] API 返回错误: errno={errno}, msg={error_msg}")

                  # errno 110: not logged in.
                  if errno == 110:
                      return {
                          "success": False,
                          "error": "Cookie 已失效,需要重新登录",
                          "need_login": True
                      }

                  # Any other error except 10001402 is final.
                  if errno != 10001402:
                      return {
                          "success": False,
                          "error": error_msg,
                          "need_login": True
                      }

                  # errno 10001402: distributed-auth problem. Revisit the home page
                  # and retry the API once.
                  print(f"[{self.platform_name}] 检测到分散认证问题,尝试重新访问主页...")
                  await asyncio.sleep(2)

                  async with session.get(
                      home_url,
                      headers=session_headers,
                      timeout=request_timeout
                  ) as retry_home_response:
                      print(f"[{self.platform_name}] 重新访问主页状态: {retry_home_response.status}")

                  await asyncio.sleep(1)

                  async with session.get(
                      appinfo_url,
                      headers=headers,
                      timeout=request_timeout
                  ) as retry_response:
                      retry_result = await retry_response.json()

                  if retry_result.get('errno') != 0:
                      print(f"[{self.platform_name}] 重试仍然失败")
                      return {
                          "success": False,
                          "error": f"分散认证问题: {error_msg}",
                          "need_login": True
                      }

                  # The retry succeeded: keep going with its payload instead of
                  # falling through to a failure result.
                  print(f"[{self.platform_name}] 分散认证问题已解决")
                  appinfo_result = retry_result

              # Extract the user record.
              user_data = appinfo_result.get('data', {}).get('user', {})
              if not user_data:
                  return {
                      "success": False,
                      "error": "无法获取用户信息",
                      "need_login": True
                  }

              # Account status is only logged when unexpected; it does not abort.
              # Valid statuses: audit (under review), pass, normal, newbie.
              status = user_data.get('status', '')
              valid_statuses = ['audit', 'pass', 'normal', 'newbie']
              if status not in valid_statuses:
                  print(f"[{self.platform_name}] 账号状态异常: {status}")

              # Basic profile fields, with fallbacks when a field is missing/empty.
              account_name = user_data.get('name') or user_data.get('uname') or '百家号账号'
              app_id = user_data.get('app_id') or user_data.get('id', 0)
              account_id = str(app_id) if app_id else f"baijiahao_{int(datetime.now().timestamp() * 1000)}"

              # Protocol-relative avatar URLs are promoted to https.
              avatar_url = user_data.get('avatar') or user_data.get('avatar_unify', '')
              if avatar_url and avatar_url.startswith('//'):
                  avatar_url = 'https:' + avatar_url

              print(f"[{self.platform_name}] 账号名称: {account_name}, ID: {account_id}")

              # Step 2: fan count (non-critical; a failure does not affect the result).
              fans_count = 0
              try:
                  print(f"[{self.platform_name}] [2/3] 调用 growth/get_info API 获取粉丝数...")
                  async with session.get(
                      'https://baijiahao.baidu.com/cms-ui/rights/growth/get_info',
                      headers=headers,
                      timeout=aiohttp.ClientTimeout(total=10)
                  ) as response:
                      growth_result = await response.json()

                  if growth_result.get('errno') == 0:
                      growth_data = growth_result.get('data', {})
                      fans_count = int(growth_data.get('fans_num', 0))
                      print(f"[{self.platform_name}] 粉丝数: {fans_count}")
                  else:
                      print(f"[{self.platform_name}] 获取粉丝数失败: {growth_result.get('errmsg')}")
              except Exception as e:
                  print(f"[{self.platform_name}] 获取粉丝数异常(非关键): {e}")

              # Step 3: work count (same API and parameters as the Node side).
              works_count = 0
              try:
                  print(f"[{self.platform_name}] [3/3] 调用 article/lists API 获取作品数...")
                  list_url = 'https://baijiahao.baidu.com/pcui/article/lists?currentPage=1&pageSize=20&search=&type=&collection=&startDate=&endDate=&clearBeforeFetch=false&dynamic=0'
                  async with session.get(
                      list_url,
                      headers={
                          'accept': '*/*',
                          'user-agent': 'PostmanRuntime/7.51.0',
                          # cookie is managed by the session
                          'referer': 'https://baijiahao.baidu.com/builder/rc/content',
                          'connection': 'keep-alive',
                          'accept-encoding': 'gzip, deflate, br',
                      },
                      timeout=request_timeout
                  ) as response:
                      response_text = await response.text()

                  print(f"[{self.platform_name}] ========== Works API Response ==========")
                  print(f"[{self.platform_name}] Full response: {response_text[:1000]}...")  # first 1000 chars only
                  print(f"[{self.platform_name}] =========================================")

                  works_result = json.loads(response_text)

                  # Distributed-auth problem (errno=10001402): retry once.
                  if works_result.get('errno') == 10001402:
                      print(f"[{self.platform_name}] 分散认证问题 (errno=10001402),3秒后重试...")
                      await asyncio.sleep(3)

                      # The retry uses a fuller, navigation-like header set.
                      retry_headers = headers.copy()
                      retry_headers.update({
                          'Accept': html_accept,
                          'Cache-Control': 'max-age=0',
                          'Upgrade-Insecure-Requests': '1',
                      })

                      async with session.get(
                          list_url,
                          headers=retry_headers,
                          timeout=request_timeout
                      ) as retry_response:
                          retry_text = await retry_response.text()

                      print(f"[{self.platform_name}] ========== Works API Retry Response ==========")
                      print(f"[{self.platform_name}] Full retry response: {retry_text[:1000]}...")
                      print(f"[{self.platform_name}] ===============================================")

                      works_result = json.loads(retry_text)
                      if works_result.get('errno') == 10001402:
                          print(f"[{self.platform_name}] 重试仍然失败,返回已获取的账号信息")
                          works_result = None

                  if works_result and works_result.get('errno') == 0:
                      works_data = works_result.get('data', {})
                      # Prefer data.page.totalCount; fall back to data.total (old format).
                      page_info = works_data.get('page', {})
                      works_count = int(page_info.get('totalCount', works_data.get('total', 0)))
                      print(f"[{self.platform_name}] 作品数: {works_count} (from page.totalCount: {page_info.get('totalCount')}, from total: {works_data.get('total')})")
                  else:
                      errno = works_result.get('errno') if works_result else 'unknown'
                      errmsg = works_result.get('errmsg', 'unknown error') if works_result else 'no response'
                      print(f"[{self.platform_name}] 获取作品数失败: errno={errno}, errmsg={errmsg}")
              except Exception as e:
                  import traceback
                  print(f"[{self.platform_name}] 获取作品数异常(非关键): {e}")
                  traceback.print_exc()

              # Assemble the result.
              account_info = {
                  "success": True,
                  "account_id": account_id,
                  "account_name": account_name,
                  "avatar_url": avatar_url,
                  "fans_count": fans_count,
                  "works_count": works_count,
              }
              print(f"[{self.platform_name}] ✓ 获取成功: {account_name} (粉丝: {fans_count}, 作品: {works_count})")
              return account_info

      except Exception as e:
          # Top-level boundary: report any unexpected failure as a result dict.
          import traceback
          traceback.print_exc()
          return {
              "success": False,
              "error": str(e)
          }
  async def check_captcha(self) -> dict:
      """
      Check whether the current page shows a captcha or a login prompt.

      Returns:
          A dict ``{'need_captcha': bool, 'captcha_type': str}``.
          ``captcha_type`` is ``'image'`` when a captcha selector matched,
          ``'login'`` when a login-prompt selector matched, and ``''`` when
          nothing matched or there is no page.
      """
      if not self.page:
          return {'need_captcha': False, 'captcha_type': ''}

      # Each probe: (selectors to try in order, log label, reported captcha_type).
      # Captcha selectors are checked before the login-dialog selectors.
      probes = (
          (
              (
                  'text="请输入验证码"',
                  'text="滑动验证"',
                  '[class*="captcha"]',
                  '[class*="verify"]',
              ),
              "检测到验证码",
              'image',
          ),
          (
              (
                  'text="请登录"',
                  'text="登录后继续"',
                  '[class*="login-dialog"]',
              ),
              "检测到需要登录",
              'login',
          ),
      )

      try:
          for selectors, label, captcha_type in probes:
              for selector in selectors:
                  try:
                      matched = await self.page.locator(selector).count() > 0
                  except Exception:
                      # Best effort: a selector that cannot be evaluated is skipped.
                      # Only Exception is caught so that task cancellation
                      # (asyncio.CancelledError) still propagates.
                      continue
                  if matched:
                      print(f"[{self.platform_name}] {label}: {selector}")
                      return {'need_captcha': True, 'captcha_type': captcha_type}
      except Exception as e:
          print(f"[{self.platform_name}] 验证码检测异常: {e}")

      return {'need_captcha': False, 'captcha_type': ''}
  302. async def _check_login_resolved(self) -> bool:
  303. """检查用户是否已完成登录(页面不再停留在登录页)"""
  304. try:
  305. current_url = self.page.url
  306. for indicator in self.login_indicators:
  307. if indicator in current_url:
  308. return False
  309. # 检查传统登录弹窗
  310. login_selectors = [
  311. 'text="请登录"',
  312. 'text="登录后继续"',
  313. '[class*="login-dialog"]',
  314. ]
  315. for selector in login_selectors:
  316. try:
  317. loc = self.page.locator(selector).first
  318. if await loc.count() > 0 and await loc.is_visible():
  319. return False
  320. except:
  321. pass
  322. return True
  323. except:
  324. return False
  325. async def _check_captcha_resolved(self) -> bool:
  326. """检查用户是否已完成验证码验证"""
  327. try:
  328. # 先用传统方式检查(速度快,无需 API 调用)
  329. captcha_result = await self.check_captcha()
  330. if not captcha_result['need_captcha']:
  331. # 传统方式未检测到验证码,再用 AI 确认
  332. ai_captcha = await self.ai_check_captcha()
  333. if not ai_captcha['has_captcha']:
  334. return True
  335. return False
  336. except:
  337. return False
  338. async def _wait_for_user_resolve(
  339. self,
  340. check_fn,
  341. timeout: int = 300,
  342. poll_interval: int = 5,
  343. prompt: str = "",
  344. ) -> bool:
  345. """
  346. 有头浏览器模式下,等待用户手动解决验证码/登录问题。
  347. Args:
  348. check_fn: 异步函数,返回 True 表示问题已解决
  349. timeout: 超时时间(秒),默认 5 分钟
  350. poll_interval: 轮询间隔(秒)
  351. prompt: 展示给用户的提示信息
  352. Returns:
  353. True 表示用户已成功解决,False 表示超时
  354. """
  355. import time
  356. if prompt:
  357. print(f"[{self.platform_name}] {prompt}", flush=True)
  358. start_time = time.time()
  359. attempt = 0
  360. while time.time() - start_time < timeout:
  361. attempt += 1
  362. elapsed = int(time.time() - start_time)
  363. remaining = timeout - elapsed
  364. print(
  365. f"[{self.platform_name}] 等待用户操作... ({elapsed}s/{timeout}s, 剩余 {remaining}s)",
  366. flush=True,
  367. )
  368. self.report_progress(
  369. 12,
  370. f"等待用户操作中... (已等待 {elapsed}s)",
  371. )
  372. resolved = await check_fn()
  373. if resolved:
  374. print(f"[{self.platform_name}] 用户操作完成(第 {attempt} 次检测)", flush=True)
  375. await asyncio.sleep(2) # 额外等待页面稳定
  376. return True
  377. await asyncio.sleep(poll_interval)
  378. print(
  379. f"[{self.platform_name}] 等待用户操作超时 ({timeout}s)",
  380. flush=True,
  381. )
  382. return False
  383. async def _ai_analyze_upload_state(self, screenshot_base64: str = None) -> dict:
  384. """
  385. 使用 AI 识别当前上传状态,返回:
  386. {
  387. status: completed|uploading|failed|unknown,
  388. progress: int|None,
  389. confidence: int,
  390. reason: str,
  391. should_enter_publish_form: bool
  392. }
  393. """
  394. import os
  395. import ast
  396. import re
  397. import requests
  398. result = {
  399. "status": "unknown",
  400. "progress": None,
  401. "confidence": 0,
  402. "reason": "",
  403. "should_enter_publish_form": False,
  404. }
  405. try:
  406. if not screenshot_base64:
  407. screenshot_base64 = await self.capture_screenshot()
  408. if not screenshot_base64:
  409. result["reason"] = "no-screenshot"
  410. return result
  411. ai_api_key = os.environ.get('DASHSCOPE_API_KEY', '')
  412. ai_base_url = os.environ.get('DASHSCOPE_BASE_URL', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
  413. ai_vision_model = os.environ.get('AI_VISION_MODEL', 'qwen-vl-plus')
  414. if not ai_api_key:
  415. result["reason"] = "no-ai-key"
  416. return result
  417. prompt = """请分析这张“百家号视频发布页”截图,判断视频上传状态。
  418. 请只返回 JSON:
  419. {
  420. "status": "completed|uploading|failed|unknown",
  421. "progress": 0-100 或 null,
  422. "confidence": 0-100,
  423. "reason": "一句话证据",
  424. "should_enter_publish_form": true/false
  425. }
  426. 判定规则:
  427. 1) status=completed:
  428. - 出现“上传完成/处理完成/可发布/可填写标题描述/发布按钮可用”等信号
  429. - 或者明显已进入可填写发布信息的阶段
  430. 2) status=uploading:
  431. - 出现“上传中/处理中/转码中/xx%/请稍候”等
  432. 3) status=failed:
  433. - 出现“上传失败/处理失败/格式不支持/文件异常”等明确失败文案
  434. 4) should_enter_publish_form=true:
  435. - 画面显示“去发布/下一步/继续/完成编辑”等入口,且看起来应点击进入正式发布表单
  436. """
  437. headers = {
  438. 'Authorization': f'Bearer {ai_api_key}',
  439. 'Content-Type': 'application/json'
  440. }
  441. payload = {
  442. "model": ai_vision_model,
  443. "messages": [
  444. {
  445. "role": "user",
  446. "content": [
  447. {
  448. "type": "image_url",
  449. "image_url": {
  450. "url": f"data:image/jpeg;base64,{screenshot_base64}"
  451. }
  452. },
  453. {
  454. "type": "text",
  455. "text": prompt
  456. }
  457. ]
  458. }
  459. ],
  460. "max_tokens": 400
  461. }
  462. response = requests.post(
  463. f"{ai_base_url}/chat/completions",
  464. headers=headers,
  465. json=payload,
  466. timeout=30
  467. )
  468. if response.status_code != 200:
  469. result["reason"] = f"ai-http-{response.status_code}"
  470. return result
  471. response_json = response.json()
  472. ai_response = response_json.get('choices', [{}])[0].get('message', {}).get('content', '')
  473. json_match = re.search(r'```json\\s*([\\s\\S]*?)\\s*```', ai_response)
  474. if json_match:
  475. json_str = json_match.group(1)
  476. else:
  477. json_match = re.search(r'\\{[\\s\\S]*\\}', ai_response)
  478. json_str = json_match.group(0) if json_match else '{}'
  479. try:
  480. data = json.loads(json_str)
  481. except Exception:
  482. try:
  483. data = ast.literal_eval(json_str) if json_str and json_str != '{}' else {}
  484. if not isinstance(data, dict):
  485. data = {}
  486. except Exception:
  487. data = {}
  488. # 兼容中文 key / 非标准结构
  489. status_hint = str(
  490. data.get("status")
  491. or data.get("状态")
  492. or ""
  493. ).strip()
  494. status_raw = status_hint.lower()
  495. if (
  496. status_raw in ["complete", "completed", "success", "done", "finished", "ready"]
  497. or any(k in status_hint for k in ["完成", "成功", "可发布", "已上传"])
  498. ):
  499. status = "completed"
  500. elif (
  501. status_raw in ["uploading", "processing", "in_progress", "progress", "running"]
  502. or any(k in status_hint for k in ["上传中", "处理中", "转码", "进行中", "上传"])
  503. ):
  504. status = "uploading"
  505. elif (
  506. status_raw in ["failed", "error", "fail"]
  507. or any(k in status_hint for k in ["失败", "错误", "异常"])
  508. ):
  509. status = "failed"
  510. else:
  511. status = "unknown"
  512. progress = data.get("progress", data.get("进度", None))
  513. parsed_progress = None
  514. try:
  515. if progress is not None and str(progress).strip() != "":
  516. parsed_progress = max(0, min(100, int(float(progress))))
  517. except Exception:
  518. parsed_progress = None
  519. if parsed_progress is None:
  520. try:
  521. p_match = re.search(r'(\d{1,3})\s*%', ai_response or '')
  522. if p_match:
  523. parsed_progress = max(0, min(100, int(p_match.group(1))))
  524. except Exception:
  525. parsed_progress = None
  526. confidence = 0
  527. try:
  528. confidence = max(0, min(100, int(float(data.get("confidence", data.get("置信度", 0)) or 0))))
  529. except Exception:
  530. confidence = 0
  531. reason = str(data.get("reason", data.get("原因", "")) or "").strip()
  532. should_enter_raw = data.get(
  533. "should_enter_publish_form",
  534. data.get("是否进入发布表单", False)
  535. )
  536. if isinstance(should_enter_raw, bool):
  537. should_enter = should_enter_raw
  538. else:
  539. should_enter_text = str(should_enter_raw or "").strip().lower()
  540. should_enter = should_enter_text in ["true", "1", "yes", "y", "是"]
  541. # 当 AI 响应不是严格 JSON 时,按全文关键词推断
  542. response_text = str(ai_response or "")
  543. response_lower = response_text.lower()
  544. if status == "unknown":
  545. if any(k in response_text for k in ["上传完成", "处理完成", "上传成功", "可发布", "已完成"]):
  546. status = "completed"
  547. elif any(k in response_text for k in ["上传失败", "处理失败", "格式不支持", "文件异常", "失败"]):
  548. status = "failed"
  549. elif any(k in response_text for k in ["上传中", "处理中", "转码中", "请稍候"]) or re.search(r'(\d{1,3})\s*%', response_text):
  550. status = "uploading"
  551. if not should_enter and any(k in response_text for k in ["去发布", "下一步", "继续", "完成编辑"]):
  552. should_enter = True
  553. if not reason and response_text:
  554. reason = response_text.replace("\n", " ").strip()[:120]
  555. if confidence <= 0 and status != "unknown":
  556. confidence = 60
  557. # 二次语义修正
  558. if status == "uploading" and parsed_progress is not None and parsed_progress >= 100:
  559. status = "completed"
  560. should_enter = True
  561. # AI 有时会把 99/100 仍写成 uploading,这里做语义修正
  562. if status == "uploading" and parsed_progress is not None and parsed_progress >= 99 and confidence >= 60:
  563. status = "completed"
  564. should_enter = True
  565. return {
  566. "status": status,
  567. "progress": parsed_progress,
  568. "confidence": confidence,
  569. "reason": reason,
  570. "should_enter_publish_form": should_enter,
  571. }
  572. except Exception as e:
  573. result["reason"] = f"ai-exception:{e}"
  574. return result
  575. async def _extract_bjh_token(self) -> str:
  576. """从页面上下文提取百家号接口 token。"""
  577. if not self.page:
  578. return ""
  579. try:
  580. token = await self.page.evaluate(
  581. """
  582. () => {
  583. const isJwtLike = (v) => {
  584. if (!v || typeof v !== 'string') return false;
  585. const s = v.trim();
  586. if (s.length < 60) return false;
  587. const parts = s.split('.');
  588. if (parts.length !== 3) return false;
  589. return parts.every(p => /^[A-Za-z0-9_-]+$/.test(p) && p.length > 10);
  590. };
  591. const pickFromStorage = (storage) => {
  592. try {
  593. const keys = Object.keys(storage || {});
  594. for (const k of keys) {
  595. const v = storage.getItem(k);
  596. if (isJwtLike(v)) return v;
  597. }
  598. } catch {}
  599. return "";
  600. };
  601. let t = pickFromStorage(window.localStorage);
  602. if (t) return t;
  603. t = pickFromStorage(window.sessionStorage);
  604. if (t) return t;
  605. const meta = document.querySelector('meta[name="token"], meta[name="bjh-token"]');
  606. const metaToken = meta && meta.getAttribute('content');
  607. if (isJwtLike(metaToken)) return metaToken;
  608. const candidates = [
  609. (window.__INITIAL_STATE__ && window.__INITIAL_STATE__.token) || "",
  610. (window.__PRELOADED_STATE__ && window.__PRELOADED_STATE__.token) || "",
  611. (window.__NUXT__ && window.__NUXT__.state && window.__NUXT__.state.token) || "",
  612. ];
  613. for (const c of candidates) {
  614. if (isJwtLike(c)) return c;
  615. }
  616. return "";
  617. }
  618. """
  619. )
  620. if token:
  621. return str(token)
  622. except Exception:
  623. pass
  624. try:
  625. import re
  626. html = await self.page.content()
  627. m = re.search(r'([A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,})', html)
  628. if m:
  629. return m.group(1)
  630. except Exception:
  631. pass
  632. return ""
  633. async def _verify_publish_from_content_page(self, expected_title: str, page_size: int = 20) -> bool:
  634. """
  635. 到内容管理页调用列表接口,按标题二次确认是否已发布。
  636. """
  637. if not self.page:
  638. return False
  639. try:
  640. content_url = (
  641. "https://baijiahao.baidu.com/builder/rc/content"
  642. f"?currentPage=1&pageSize={int(page_size)}"
  643. "&search=&type=&collection=&startDate=&endDate="
  644. )
  645. await self.page.goto(content_url, wait_until="domcontentloaded", timeout=60000)
  646. await asyncio.sleep(2)
  647. token = await self._extract_bjh_token()
  648. expected = (expected_title or "").strip()
  649. if not expected:
  650. return False
  651. fetch_result = await self.page.evaluate(
  652. """
  653. async ({ token, pageSize }) => {
  654. const url =
  655. "https://baijiahao.baidu.com/pcui/article/lists" +
  656. "?currentPage=1" +
  657. `&pageSize=${pageSize}` +
  658. "&search=&type=&collection=&startDate=&endDate=" +
  659. "&clearBeforeFetch=false&dynamic=1";
  660. const r = await fetch(url, {
  661. method: "GET",
  662. credentials: "include",
  663. headers: {
  664. "accept": "application/json, text/plain, */*",
  665. ...(token ? { token } : {}),
  666. },
  667. });
  668. const text = await r.text();
  669. return { ok: r.ok, status: r.status, text };
  670. }
  671. """,
  672. {"token": token, "pageSize": int(page_size)}
  673. )
  674. if not fetch_result or not fetch_result.get("ok"):
  675. status = fetch_result.get("status") if isinstance(fetch_result, dict) else "unknown"
  676. print(f"[{self.platform_name}] 内容页校验接口失败: HTTP {status}")
  677. return False
  678. data = json.loads(fetch_result.get("text") or "{}")
  679. if data.get("errno") != 0:
  680. print(f"[{self.platform_name}] 内容页校验接口错误: errno={data.get('errno')}, msg={data.get('errmsg')}")
  681. return False
  682. items = ((data.get("data") or {}).get("list") or [])
  683. if not isinstance(items, list) or not items:
  684. print(f"[{self.platform_name}] 内容页校验:当前列表为空")
  685. return False
  686. # 标题匹配采用“全量相等 + 前缀包含”双策略,兼容平台侧自动截断。
  687. expected_variants = {expected}
  688. if len(expected) > 12:
  689. expected_variants.add(expected[:12])
  690. if len(expected) > 20:
  691. expected_variants.add(expected[:20])
  692. for item in items:
  693. title = str(item.get("title") or "").strip()
  694. if not title:
  695. continue
  696. for needle in expected_variants:
  697. if needle and (title == needle or needle in title):
  698. print(f"[{self.platform_name}] 内容页校验命中标题: {title}")
  699. return True
  700. print(f"[{self.platform_name}] 内容页校验未命中标题,expected={expected}")
  701. return False
  702. except Exception as e:
  703. print(f"[{self.platform_name}] 内容页校验异常: {e}")
  704. return False
  705. async def publish(self, cookies: str, params: PublishParams) -> PublishResult:
  706. """发布视频到百家号"""
  707. import os
  708. import re
  709. import shutil
  710. print(f"\n{'='*60}")
  711. print(f"[{self.platform_name}] 开始发布视频")
  712. print(f"[{self.platform_name}] 视频路径: {params.video_path}")
  713. print(f"[{self.platform_name}] 标题: {params.title}")
  714. print(f"[{self.platform_name}] 描述: {(params.description or '')[:120]}")
  715. print(f"[{self.platform_name}] Headless: {self.headless}")
  716. print(f"{'='*60}")
  717. self.report_progress(5, "正在初始化浏览器...")
  718. # 初始化浏览器
  719. await self.init_browser()
  720. print(f"[{self.platform_name}] 浏览器初始化完成")
  721. # 解析并设置 cookies
  722. cookie_list = self.parse_cookies(cookies)
  723. print(f"[{self.platform_name}] 解析到 {len(cookie_list)} 个 cookies")
  724. await self.set_cookies(cookie_list)
  725. if not self.page:
  726. raise Exception("Page not initialized")
  727. # 检查视频文件
  728. if not os.path.exists(params.video_path):
  729. raise Exception(f"视频文件不存在: {params.video_path}")
  730. print(f"[{self.platform_name}] 视频文件存在,大小: {os.path.getsize(params.video_path)} bytes")
  731. # 关键兜底:百家号在标题框不可编辑时会将“文件名主干”作为默认标题。
  732. # 因此上传前为视频创建“标题别名文件”(优先硬链接,失败再复制),确保默认标题可控。
  733. upload_video_path = params.video_path
  734. try:
  735. raw_title = (params.title or "").strip()
  736. if raw_title:
  737. safe_title = re.sub(r'[<>:"/\\\\|?*\\x00-\\x1F]', '', raw_title)
  738. safe_title = re.sub(r'\\s+', ' ', safe_title).strip().rstrip('.')
  739. if not safe_title:
  740. safe_title = "video"
  741. safe_title = safe_title[:30]
  742. src_ext = os.path.splitext(params.video_path)[1] or ".mp4"
  743. alias_dir = os.path.join(os.path.dirname(params.video_path), "_bjh_upload_alias")
  744. os.makedirs(alias_dir, exist_ok=True)
  745. # 轻量清理:删除 24h 前的旧别名文件,避免长期累积
  746. try:
  747. now_ts = datetime.now().timestamp()
  748. for fn in os.listdir(alias_dir):
  749. full = os.path.join(alias_dir, fn)
  750. if not os.path.isfile(full):
  751. continue
  752. if now_ts - os.path.getmtime(full) > 24 * 3600:
  753. try:
  754. os.remove(full)
  755. except Exception:
  756. pass
  757. except Exception:
  758. pass
  759. alias_name = f"{safe_title}{src_ext}"
  760. alias_path = os.path.join(alias_dir, alias_name)
  761. if os.path.abspath(alias_path) != os.path.abspath(params.video_path):
  762. if os.path.exists(alias_path):
  763. try:
  764. os.remove(alias_path)
  765. except Exception:
  766. pass
  767. try:
  768. os.link(params.video_path, alias_path)
  769. upload_video_path = alias_path
  770. print(f"[{self.platform_name}] 上传别名已创建(硬链接): {upload_video_path}")
  771. except Exception:
  772. shutil.copy2(params.video_path, alias_path)
  773. upload_video_path = alias_path
  774. print(f"[{self.platform_name}] 上传别名已创建(复制): {upload_video_path}")
  775. except Exception as e:
  776. upload_video_path = params.video_path
  777. print(f"[{self.platform_name}] 创建上传别名失败,回退原文件: {e}")
  778. self.report_progress(10, "正在打开上传页面...")
  779. # 访问视频发布页面(使用新视频发布界面)
  780. video_publish_url = "https://baijiahao.baidu.com/builder/rc/edit?type=videoV2&is_from_cms=1"
  781. await self.page.goto(video_publish_url, wait_until="domcontentloaded", timeout=60000)
  782. await asyncio.sleep(3)
  783. # 检查是否跳转到登录页
  784. current_url = self.page.url
  785. print(f"[{self.platform_name}] 当前页面: {current_url}")
  786. for indicator in self.login_indicators:
  787. if indicator in current_url:
  788. if not self.headless:
  789. # 有头浏览器模式:等待用户手动完成登录
  790. print(f"[{self.platform_name}] 有头模式检测到登录跳转,等待用户手动登录...")
  791. self.report_progress(12, "检测到需要登录,请在浏览器中手动完成登录...")
  792. login_resolved = await self._wait_for_user_resolve(
  793. check_fn=self._check_login_resolved,
  794. timeout=300,
  795. prompt="请在打开的浏览器中完成登录",
  796. )
  797. if login_resolved:
  798. current_url = self.page.url
  799. print(f"[{self.platform_name}] 用户已完成登录,当前页面: {current_url}")
  800. break # 登录已解决,继续发布流程
  801. else:
  802. screenshot_base64 = await self.capture_screenshot()
  803. return PublishResult(
  804. success=False,
  805. platform=self.platform_name,
  806. error="等待用户登录超时(5分钟),请重试",
  807. need_captcha=True,
  808. captcha_type='login',
  809. screenshot_base64=screenshot_base64,
  810. page_url=current_url,
  811. status='need_captcha'
  812. )
  813. else:
  814. screenshot_base64 = await self.capture_screenshot()
  815. return PublishResult(
  816. success=False,
  817. platform=self.platform_name,
  818. error="Cookie 已过期,需要重新登录",
  819. need_captcha=True,
  820. captcha_type='login',
  821. screenshot_base64=screenshot_base64,
  822. page_url=current_url,
  823. status='need_captcha'
  824. )
  825. # 检查验证码(有头模式下等待用户手动解决,无头模式下直接返回)
  826. captcha_detected = False
  827. captcha_type = ''
  828. # 使用 AI 检查验证码
  829. ai_captcha = await self.ai_check_captcha()
  830. if ai_captcha['has_captcha']:
  831. captcha_detected = True
  832. captcha_type = ai_captcha['captcha_type']
  833. print(f"[{self.platform_name}] AI检测到验证码: {captcha_type}", flush=True)
  834. # AI 未检测到时再用传统方式检查
  835. if not captcha_detected:
  836. captcha_result = await self.check_captcha()
  837. if captcha_result['need_captcha']:
  838. captcha_detected = True
  839. captcha_type = captcha_result['captcha_type']
  840. if captcha_detected:
  841. if not self.headless:
  842. # 有头浏览器模式:等待用户手动完成验证码
  843. print(f"[{self.platform_name}] 有头模式检测到验证码({captcha_type}),等待用户手动解决...")
  844. self.report_progress(12, f"检测到验证码,请在浏览器中手动完成验证...")
  845. captcha_resolved = await self._wait_for_user_resolve(
  846. check_fn=self._check_captcha_resolved,
  847. timeout=300,
  848. prompt="请在打开的浏览器中完成验证码验证",
  849. )
  850. if not captcha_resolved:
  851. screenshot_base64 = await self.capture_screenshot()
  852. return PublishResult(
  853. success=False,
  854. platform=self.platform_name,
  855. error=f"等待用户完成验证码超时(5分钟),请重试",
  856. need_captcha=True,
  857. captcha_type=captcha_type,
  858. screenshot_base64=screenshot_base64,
  859. page_url=current_url,
  860. status='need_captcha'
  861. )
  862. print(f"[{self.platform_name}] 用户已完成验证码验证,继续发布流程")
  863. else:
  864. screenshot_base64 = await self.capture_screenshot()
  865. return PublishResult(
  866. success=False,
  867. platform=self.platform_name,
  868. error=f"检测到{captcha_type}验证码,需要使用有头浏览器完成验证",
  869. need_captcha=True,
  870. captcha_type=captcha_type,
  871. screenshot_base64=screenshot_base64,
  872. page_url=current_url,
  873. status='need_captcha'
  874. )
  875. self.report_progress(15, "正在选择视频文件...")
  876. # 等待页面加载完成
  877. await asyncio.sleep(2)
  878. # 关闭可能的弹窗(有头模式下使用更保守的选择器,避免关闭用户需要交互的验证码弹窗)
  879. try:
  880. if self.headless:
  881. close_buttons = [
  882. 'button:has-text("我知道了")',
  883. 'button:has-text("知道了")',
  884. '[class*="close"]',
  885. '[class*="modal-close"]',
  886. ]
  887. else:
  888. # 有头模式:只关闭明确的提示性弹窗,不触碰可能含验证码的对话框
  889. close_buttons = [
  890. 'button:has-text("我知道了")',
  891. 'button:has-text("知道了")',
  892. ]
  893. for btn_selector in close_buttons:
  894. try:
  895. btn = self.page.locator(btn_selector).first
  896. if await btn.count() > 0 and await btn.is_visible():
  897. await btn.click()
  898. await asyncio.sleep(0.5)
  899. except:
  900. pass
  901. except:
  902. pass
  903. # 上传视频 - 尝试多种方式(遍历所有 frame,因为百家号上传 UI 在 iframe 中)
  904. upload_triggered = False
  905. # 方法1: 直接通过 file input 上传(遍历所有 frame)
  906. for frame in self.page.frames:
  907. if upload_triggered:
  908. break
  909. frame_url = frame.url or "about:blank"
  910. print(f"[{self.platform_name}] 搜索 file input, frame: {frame_url}")
  911. try:
  912. file_inputs = await frame.query_selector_all('input[type="file"]')
  913. print(f"[{self.platform_name}] frame {frame_url} 中找到 {len(file_inputs)} 个文件输入")
  914. for file_input in file_inputs:
  915. try:
  916. accept = await file_input.get_attribute('accept') or ''
  917. # 优先选择接受视频的 file input
  918. if accept and 'video' not in accept and '*' not in accept:
  919. continue
  920. await file_input.set_input_files(upload_video_path)
  921. upload_triggered = True
  922. print(f"[{self.platform_name}] 通过 file input 上传成功 (frame: {frame_url})")
  923. break
  924. except Exception as e:
  925. print(f"[{self.platform_name}] file input 上传失败: {e}")
  926. except Exception as e:
  927. print(f"[{self.platform_name}] frame {frame_url} 查找 file input 失败: {e}")
  928. # 方法2: 点击上传区域(遍历所有 frame)
  929. if not upload_triggered:
  930. upload_selectors = [
  931. 'div[class*="upload-box"]',
  932. 'div[class*="drag-upload"]',
  933. 'div[class*="uploader"]',
  934. 'div:has-text("点击上传")',
  935. 'div:has-text("选择文件")',
  936. 'div:has-text("拖动入此区域")',
  937. '[class*="upload-area"]',
  938. '[class*="upload-zone"]',
  939. '[class*="upload-wrapper"]',
  940. '[class*="upload-btn"]',
  941. '[class*="upload-video"]',
  942. '[class*="video-upload"]',
  943. '[class*="drag"]',
  944. '[class*="drop"]',
  945. ]
  946. for frame in self.page.frames:
  947. if upload_triggered:
  948. break
  949. frame_url = frame.url or "about:blank"
  950. for selector in upload_selectors:
  951. if upload_triggered:
  952. break
  953. try:
  954. upload_area = frame.locator(selector).first
  955. if await upload_area.count() > 0 and await upload_area.is_visible():
  956. print(f"[{self.platform_name}] 尝试点击上传区域: {selector} (frame: {frame_url})")
  957. async with self.page.expect_file_chooser(timeout=10000) as fc_info:
  958. await upload_area.click()
  959. file_chooser = await fc_info.value
  960. await file_chooser.set_files(upload_video_path)
  961. upload_triggered = True
  962. print(f"[{self.platform_name}] 通过点击上传区域成功 (frame: {frame_url})")
  963. break
  964. except Exception as e:
  965. print(f"[{self.platform_name}] 选择器 {selector} 失败 (frame: {frame_url}): {e}")
  966. if not upload_triggered:
  967. screenshot_base64 = await self.capture_screenshot()
  968. return PublishResult(
  969. success=False,
  970. platform=self.platform_name,
  971. error="未找到上传入口",
  972. screenshot_base64=screenshot_base64,
  973. page_url=await self.get_page_url(),
  974. status='failed'
  975. )
  976. self.report_progress(20, "等待视频上传...")
  977. # 等待视频上传完成(百家号大文件+处理可能较慢)
  978. upload_timeout = 900
  979. start_time = asyncio.get_event_loop().time()
  980. last_heartbeat_time = start_time
  981. last_signal_time = start_time
  982. last_stall_log_time = start_time
  983. last_ai_upload_check_time = start_time - 60
  984. ai_upload_check_interval = 20
  985. ai_upload_poll_count = 0
  986. ai_upload_unknown_streak = 0
  987. last_pct = -1
  988. forced_continue_after = 180 # 无进度信号时,3 分钟后执行兜底继续
  989. processing_since = None
  990. processing_selector_hit = ""
  991. processing_stale_continue_after = 300 # 处理态持续 5 分钟仍无明确变化,执行兜底继续
  992. has_progress_signal = False
  993. progress_signal_lost_continue_after = 90 # 已看到进度后,若信号中断 90s,直接进入下一步
  994. hard_cutover_signal_gap_after = 120 # 已出现过进度后,信号中断超过该值则硬切下一阶段
  995. hard_cutover_elapsed_after = 210 # 上传总耗时超过该值时,硬切下一阶段
  996. async def _attempt_enter_publish_form_from_upload(stage: str) -> bool:
  997. enter_selectors = [
  998. 'button:has-text("去发布")',
  999. '[role="button"]:has-text("去发布")',
  1000. 'button:has-text("发布视频")',
  1001. '[role="button"]:has-text("发布视频")',
  1002. 'button:has-text("下一步")',
  1003. '[role="button"]:has-text("下一步")',
  1004. 'button:has-text("继续")',
  1005. '[role="button"]:has-text("继续")',
  1006. 'button:has-text("完成编辑")',
  1007. '[role="button"]:has-text("完成编辑")',
  1008. '[class*="next"] button',
  1009. '[class*="step"] button',
  1010. ]
  1011. blocked_exact = {"发布", "定时发布", "立即发布", "取消", "返回", "关闭"}
  1012. blocked_contains = ["定时发布", "立即发布", "取消", "返回", "关闭", "删除", "重传", "重新上传", "清空"]
  1013. for selector in enter_selectors:
  1014. try:
  1015. btns = self.page.locator(selector)
  1016. count = await btns.count()
  1017. for idx in range(min(count, 6)):
  1018. btn = btns.nth(idx)
  1019. if not await btn.is_visible():
  1020. continue
  1021. text = (await btn.text_content() or "").strip()
  1022. compact = re.sub(r"\s+", "", text)
  1023. if compact in blocked_exact or any(w in compact for w in blocked_contains):
  1024. continue
  1025. disabled_attr = await btn.get_attribute('disabled')
  1026. aria_disabled = (await btn.get_attribute('aria-disabled') or '').lower()
  1027. if disabled_attr is not None or aria_disabled == 'true':
  1028. continue
  1029. try:
  1030. await btn.scroll_into_view_if_needed(timeout=1200)
  1031. except Exception:
  1032. pass
  1033. try:
  1034. await btn.click(timeout=2500)
  1035. except Exception:
  1036. await btn.click(force=True, timeout=2500)
  1037. print(f"[{self.platform_name}] 上传阶段尝试切换到发布表单: stage={stage}, selector={selector}, text={compact or text}, idx={idx}")
  1038. await asyncio.sleep(1)
  1039. return True
  1040. except Exception:
  1041. pass
  1042. # 深层 DOM 兜底(含 shadowRoot),应对常规选择器无法命中
  1043. try:
  1044. deep_clicked = await self.page.evaluate(
  1045. """
  1046. () => {
  1047. const wanted = ['去发布', '发布视频', '下一步', '继续', '完成编辑'];
  1048. const blockedExact = new Set(['发布', '定时发布', '立即发布', '取消', '返回', '关闭']);
  1049. const blockedContains = ['定时发布', '立即发布', '取消', '返回', '关闭', '删除', '重传', '重新上传', '清空'];
  1050. const roots = [document];
  1051. const visited = new Set();
  1052. const allNodes = [];
  1053. while (roots.length) {
  1054. const root = roots.pop();
  1055. if (!root || visited.has(root)) continue;
  1056. visited.add(root);
  1057. const nodes = root.querySelectorAll('*');
  1058. for (const n of nodes) {
  1059. allNodes.push(n);
  1060. if (n && n.shadowRoot) roots.push(n.shadowRoot);
  1061. }
  1062. }
  1063. const isVisible = (el) => {
  1064. try {
  1065. const style = window.getComputedStyle(el);
  1066. if (style.display === 'none' || style.visibility === 'hidden' || style.pointerEvents === 'none') return false;
  1067. const rect = el.getBoundingClientRect();
  1068. return !!rect && rect.width > 8 && rect.height > 8;
  1069. } catch {
  1070. return false;
  1071. }
  1072. };
  1073. for (const el of allNodes) {
  1074. const text = String(el.innerText || el.textContent || '').replace(/\\s+/g, '').trim();
  1075. if (!text) continue;
  1076. if (blockedExact.has(text)) continue;
  1077. if (blockedContains.some(x => text.includes(x))) continue;
  1078. if (!wanted.some(x => text.includes(x))) continue;
  1079. if (!isVisible(el)) continue;
  1080. const tag = String(el.tagName || '').toLowerCase();
  1081. const role = String(el.getAttribute && el.getAttribute('role') || '').toLowerCase();
  1082. const cls = String(el.className || '').toLowerCase();
  1083. const clickable = tag === 'button' || tag === 'a' || role === 'button' || /btn|button|next|step/.test(cls);
  1084. if (!clickable) continue;
  1085. try {
  1086. el.click();
  1087. return { ok: true, text };
  1088. } catch {}
  1089. }
  1090. return { ok: false, text: '' };
  1091. }
  1092. """
  1093. )
  1094. if deep_clicked and deep_clicked.get("ok"):
  1095. clicked_text = str(deep_clicked.get("text") or "").strip()
  1096. print(f"[{self.platform_name}] 上传阶段深层DOM切换发布表单成功: stage={stage}, text={clicked_text}")
  1097. await asyncio.sleep(1.2)
  1098. return True
  1099. except Exception:
  1100. pass
  1101. return False
  1102. while asyncio.get_event_loop().time() - start_time < upload_timeout:
  1103. now = asyncio.get_event_loop().time()
  1104. elapsed = int(now - start_time)
  1105. status_parts = []
  1106. # 检查上传进度
  1107. pct = None
  1108. try:
  1109. progress_nodes = self.page.locator('[class*="progress"], [class*="percent"], div:has-text("%"), span:has-text("%")')
  1110. node_count = await progress_nodes.count()
  1111. for idx in range(min(node_count, 6)):
  1112. text = await progress_nodes.nth(idx).text_content()
  1113. if not text:
  1114. continue
  1115. match = re.search(r'(\d{1,3})\s*%', text)
  1116. if match:
  1117. pct = max(0, min(100, int(match.group(1))))
  1118. break
  1119. except Exception:
  1120. pass
  1121. if pct is not None:
  1122. status_parts.append(f"progress={pct}%")
  1123. last_signal_time = now
  1124. has_progress_signal = True
  1125. if pct != last_pct:
  1126. self.report_progress(20 + min(35, int(pct * 0.35)), f"视频上传中 {pct}%...")
  1127. last_pct = pct
  1128. if pct >= 100:
  1129. print(f"[{self.platform_name}] 上传完成(进度达到 100%)")
  1130. break
  1131. # 明确的上传完成提示
  1132. upload_done = False
  1133. upload_done_selectors = [
  1134. 'div:has-text("上传完成")',
  1135. 'div:has-text("处理完成")',
  1136. 'div:has-text("上传成功")',
  1137. 'span:has-text("上传完成")',
  1138. '[class*="upload-success"]',
  1139. ]
  1140. try:
  1141. for selector in upload_done_selectors:
  1142. loc = self.page.locator(selector).first
  1143. if await loc.count() > 0 and await loc.is_visible():
  1144. upload_done = True
  1145. print(f"[{self.platform_name}] 检测到上传完成提示: {selector}")
  1146. break
  1147. except Exception:
  1148. pass
  1149. if upload_done:
  1150. last_signal_time = now
  1151. break
  1152. # 检查处理态
  1153. is_processing = False
  1154. processing_selectors = [
  1155. 'div:has-text("上传中")',
  1156. 'span:has-text("上传中")',
  1157. 'div:has-text("处理中")',
  1158. 'span:has-text("处理中")',
  1159. 'div:has-text("转码中")',
  1160. 'span:has-text("转码中")',
  1161. 'div:has-text("请稍候")',
  1162. 'span:has-text("请稍候")',
  1163. 'div:has-text("正在上传")',
  1164. 'div:has-text("正在处理")',
  1165. 'text="上传中"',
  1166. 'text="处理中"',
  1167. ]
  1168. try:
  1169. for selector in processing_selectors:
  1170. loc = self.page.locator(selector).first
  1171. if await loc.count() > 0 and await loc.is_visible():
  1172. is_processing = True
  1173. processing_selector_hit = selector
  1174. break
  1175. except Exception:
  1176. pass
  1177. if is_processing:
  1178. if processing_since is None:
  1179. processing_since = now
  1180. processing_elapsed = int(now - processing_since)
  1181. status_parts.append(f"processing={processing_elapsed}s")
  1182. if processing_selector_hit:
  1183. status_parts.append(f"by={processing_selector_hit}")
  1184. # 处理态短时间内视为有效信号;超过阈值后不再持续刷新 signal_gap,避免卡死
  1185. if processing_elapsed <= 180:
  1186. last_signal_time = now
  1187. else:
  1188. processing_since = None
  1189. processing_selector_hit = ""
  1190. # 检查是否出现标题输入框(部分页面会在上传阶段就显示,需结合时间/处理态判断)
  1191. title_input_visible = False
  1192. try:
  1193. title_input = self.page.locator('input[placeholder*="标题"], textarea[placeholder*="标题"], [class*="title-input"] input').first
  1194. title_input_visible = await title_input.count() > 0 and await title_input.is_visible()
  1195. except Exception:
  1196. title_input_visible = False
  1197. if title_input_visible and (
  1198. (not is_processing and elapsed >= 45) or
  1199. (processing_since is not None and (now - processing_since) >= 180) or
  1200. elapsed >= 360
  1201. ):
  1202. print(f"[{self.platform_name}] 检测到可编辑标题,继续后续步骤")
  1203. break
  1204. # 检查是否有错误提示
  1205. error_text = ''
  1206. try:
  1207. error_nodes = self.page.locator('[class*="error"], [class*="fail"], div:has-text("上传失败"), div:has-text("处理失败")')
  1208. err_count = await error_nodes.count()
  1209. for idx in range(min(err_count, 6)):
  1210. txt = (await error_nodes.nth(idx).text_content() or '').strip()
  1211. if txt and any(k in txt for k in ['失败', '错误', '异常', '中断']):
  1212. error_text = txt
  1213. break
  1214. except Exception:
  1215. error_text = ''
  1216. if error_text:
  1217. screenshot_base64 = await self.capture_screenshot()
  1218. return PublishResult(
  1219. success=False,
  1220. platform=self.platform_name,
  1221. error=f"上传失败: {error_text}",
  1222. screenshot_base64=screenshot_base64,
  1223. page_url=await self.get_page_url(),
  1224. status='failed'
  1225. )
  1226. # AI 上传状态判定(节流),用于弥补 DOM/文案信号缺失
  1227. should_run_ai_upload_check = (now - last_ai_upload_check_time) >= ai_upload_check_interval
  1228. if should_run_ai_upload_check:
  1229. ai_upload_poll_count += 1
  1230. ai_upload_state = await self._ai_analyze_upload_state()
  1231. last_ai_upload_check_time = now
  1232. ai_status = str(ai_upload_state.get("status") or "unknown").strip().lower()
  1233. ai_progress = ai_upload_state.get("progress")
  1234. ai_confidence = int(ai_upload_state.get("confidence") or 0)
  1235. ai_reason = str(ai_upload_state.get("reason") or "").strip()
  1236. ai_should_enter_form = bool(ai_upload_state.get("should_enter_publish_form"))
  1237. print(
  1238. f"[{self.platform_name}] AI上传轮询#{ai_upload_poll_count}: elapsed={elapsed}s, "
  1239. f"status={ai_status}, progress={ai_progress}, confidence={ai_confidence}, "
  1240. f"enter_form={ai_should_enter_form}, reason={ai_reason or '-'}"
  1241. )
  1242. if ai_status == "unknown":
  1243. ai_upload_unknown_streak += 1
  1244. else:
  1245. ai_upload_unknown_streak = 0
  1246. if ai_status == "failed":
  1247. screenshot_base64 = await self.capture_screenshot()
  1248. return PublishResult(
  1249. success=False,
  1250. platform=self.platform_name,
  1251. error=f"上传失败(AI判定): {ai_reason or '检测到上传失败信号'}",
  1252. screenshot_base64=screenshot_base64,
  1253. page_url=await self.get_page_url(),
  1254. status='failed'
  1255. )
  1256. if ai_status == "completed":
  1257. if ai_should_enter_form:
  1258. await _attempt_enter_publish_form_from_upload("ai-completed")
  1259. print(f"[{self.platform_name}] AI判定上传已完成,进入下一阶段")
  1260. last_signal_time = now
  1261. break
  1262. if ai_status == "uploading":
  1263. has_progress_signal = True
  1264. last_signal_time = now
  1265. if isinstance(ai_progress, (int, float)):
  1266. ai_pct = max(0, min(100, int(ai_progress)))
  1267. status_parts.append(f"ai-progress={ai_pct}%")
  1268. if ai_pct != last_pct and ai_pct > 0:
  1269. self.report_progress(20 + min(35, int(ai_pct * 0.35)), f"视频上传中 {ai_pct}%...")
  1270. last_pct = ai_pct
  1271. if ai_pct >= 99 and ai_confidence >= 60:
  1272. if ai_should_enter_form:
  1273. await _attempt_enter_publish_form_from_upload("ai-upload-99")
  1274. print(f"[{self.platform_name}] AI判定上传接近完成,进入下一阶段")
  1275. break
  1276. else:
  1277. status_parts.append("ai=uploading")
  1278. if ai_should_enter_form and elapsed >= 60:
  1279. await _attempt_enter_publish_form_from_upload("ai-uploading-enter-form")
  1280. elif ai_status == "unknown" and ai_should_enter_form and elapsed >= 60:
  1281. await _attempt_enter_publish_form_from_upload("ai-unknown-enter-form")
  1282. elif ai_status == "unknown" and ai_upload_unknown_streak >= 3 and elapsed >= 90:
  1283. await _attempt_enter_publish_form_from_upload("ai-unknown-streak")
  1284. # 心跳日志,便于定位“卡住”
  1285. if now - last_heartbeat_time >= 15:
  1286. signal_gap = int(now - last_signal_time)
  1287. extra = ", ".join(status_parts) if status_parts else "no-visible-signal"
  1288. print(f"[{self.platform_name}] 上传等待中: elapsed={elapsed}s, signal_gap={signal_gap}s, {extra}")
  1289. last_heartbeat_time = now
  1290. # 已经出现过进度后,如果进度信号中断较久,进入下一步兜底
  1291. dynamic_signal_lost_after = progress_signal_lost_continue_after
  1292. if last_pct >= 95:
  1293. # 95%+ 阶段可能有短暂静默,适度放宽
  1294. dynamic_signal_lost_after = max(progress_signal_lost_continue_after, 150)
  1295. elif last_pct >= 80:
  1296. # 中后段进度(80%+)可能进入转码/校验静默期,但不应无限等待
  1297. dynamic_signal_lost_after = max(progress_signal_lost_continue_after, 150)
  1298. elif last_pct >= 60:
  1299. dynamic_signal_lost_after = max(progress_signal_lost_continue_after, 120)
  1300. if has_progress_signal and (now - last_signal_time) >= dynamic_signal_lost_after:
  1301. signal_gap = int(now - last_signal_time)
  1302. if last_pct >= 95 or title_input_visible or elapsed >= max(780, upload_timeout - 60):
  1303. print(f"[{self.platform_name}] 上传进度信号中断过久({signal_gap}s>={dynamic_signal_lost_after}s),继续后续步骤(兜底)")
  1304. break
  1305. if (last_pct >= 70 and signal_gap >= hard_cutover_signal_gap_after) or elapsed >= hard_cutover_elapsed_after:
  1306. await _attempt_enter_publish_form_from_upload("hard-cutover-signal")
  1307. print(f"[{self.platform_name}] 上传长时间无新信号,执行硬切换到标题阶段: elapsed={elapsed}s, signal_gap={signal_gap}s, last_pct={last_pct}")
  1308. break
  1309. if now - last_stall_log_time >= 30:
  1310. print(f"[{self.platform_name}] 上传信号中断({signal_gap}s)但进度不足/标题未就绪,继续等待上传完成...")
  1311. last_stall_log_time = now
  1312. # 额外硬切策略:出现过中后段进度但长时间无新增信号时,不再继续卡住
  1313. if has_progress_signal and last_pct >= 70 and (now - last_signal_time) >= hard_cutover_signal_gap_after:
  1314. signal_gap = int(now - last_signal_time)
  1315. await _attempt_enter_publish_form_from_upload("hard-cutover-progress")
  1316. print(f"[{self.platform_name}] 中后段上传信号停滞,强制切换到标题阶段: elapsed={elapsed}s, signal_gap={signal_gap}s, last_pct={last_pct}")
  1317. break
  1318. # 从未出现可见进度信号时,不再长时间卡在 20%
  1319. if (not has_progress_signal) and elapsed >= forced_continue_after and (now - last_signal_time) >= 120:
  1320. if title_input_visible or elapsed >= max(600, upload_timeout - 90):
  1321. print(f"[{self.platform_name}] 上传阶段长时间无可见进度信号,继续后续步骤(兜底)")
  1322. break
  1323. if elapsed >= 480:
  1324. await _attempt_enter_publish_form_from_upload("hard-cutover-no-signal")
  1325. print(f"[{self.platform_name}] 上传持续无可见信号,执行硬切换到标题阶段: elapsed={elapsed}s")
  1326. break
  1327. if now - last_stall_log_time >= 30:
  1328. print(f"[{self.platform_name}] 上传暂无可见信号且标题未就绪,继续等待...")
  1329. last_stall_log_time = now
  1330. # 处理态持续过久时兜底继续,避免固定 DOM 文案导致无限等待
  1331. if processing_since is not None and (now - processing_since) >= processing_stale_continue_after:
  1332. if last_pct >= 95 or title_input_visible or elapsed >= max(780, upload_timeout - 60):
  1333. print(f"[{self.platform_name}] 上传阶段处理态持续过久,继续后续步骤(兜底)")
  1334. break
  1335. if elapsed >= hard_cutover_elapsed_after:
  1336. await _attempt_enter_publish_form_from_upload("hard-cutover-processing")
  1337. print(f"[{self.platform_name}] 处理态持续过久且总耗时较长,执行硬切换到标题阶段: elapsed={elapsed}s")
  1338. break
  1339. if now - last_stall_log_time >= 30:
  1340. print(f"[{self.platform_name}] 处理态持续较久但标题未就绪,继续等待上传收尾...")
  1341. last_stall_log_time = now
  1342. await asyncio.sleep(3)
  1343. self.report_progress(60, "正在填写标题...")
  1344. await asyncio.sleep(2)
  1345. # 填写标题(严格校验写入结果,避免填错输入框)
  1346. desired_title = (params.title or "").strip()[:30] # 百家号标题限制 30 字
  1347. video_stem = os.path.splitext(os.path.basename(params.video_path or ""))[0].strip().lower()
  1348. def _normalize_title_for_match(value: str) -> str:
  1349. v = re.sub(r"\s+", "", str(value or "")).strip().lower()
  1350. v = re.sub(r"[`~!@#$%^&*()_+=\[\]{}\\|;:'\",.<>/?,。!?;:、()【】《》\-\u3000]", "", v)
  1351. return v
  1352. def _looks_like_non_title_value(value: str) -> bool:
  1353. raw = str(value or "").strip()
  1354. if not raw:
  1355. return True
  1356. compact = raw.lower()
  1357. # 典型 UUID(平台内部资源ID/文件名)
  1358. if re.fullmatch(r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5]?[0-9a-f]{3}-[89ab]?[0-9a-f]{3}-[0-9a-f]{12}", compact):
  1359. return True
  1360. # 纯英文数字/连接符且较长,通常是资源ID而不是标题
  1361. if len(compact) >= 24 and re.fullmatch(r"[a-z0-9_-]+", compact):
  1362. return True
  1363. # 与视频文件名主干一致时,视为误填
  1364. if video_stem and compact == video_stem:
  1365. return True
  1366. # 文件路径或带扩展名文本,视为误填
  1367. if "\\" in raw or "/" in raw:
  1368. return True
  1369. if re.search(r"\.(mp4|mov|avi|mkv|wmv|flv|m4v)$", compact):
  1370. return True
  1371. return False
  def _title_matches_expected(current_value: str) -> bool:
      """Decide whether text read back from the page is the title we wrote.

      ``desired_title`` is read from the enclosing scope. Both strings are
      compared after ``_normalize_title_for_match``. A value matches when:

      * the normalized strings are equal; or
      * the value is not flagged by ``_looks_like_non_title_value`` and
        - the expected title (>= 4 chars) is contained in the value, or
        - the value (>= 4 chars) is contained in the expected title, or
        - the first 8 chars of the expected title (>= 4 available) occur
          in the value.

      Args:
          current_value: Text read from a candidate field; falsy values are
              treated as empty.

      Returns:
          ``True`` if the value is accepted as the desired title.
      """
      if not desired_title:
          return False
      current = str(current_value or "").strip()
      if not current:
          return False

      expected_norm = _normalize_title_for_match(desired_title)
      current_norm = _normalize_title_for_match(current)
      if not expected_norm or not current_norm:
          return False

      # Exact match is checked before the "non-title" heuristic: a desired
      # title that itself contains "/" or "\", ends in ".mp4", or equals the
      # file stem would otherwise be rejected forever even when the field
      # holds exactly what was requested.
      if expected_norm == current_norm:
          return True
      if _looks_like_non_title_value(current):
          return False

      min_overlap = 4
      if len(expected_norm) >= min_overlap:
          if expected_norm in current_norm:
              return True
          # The read-back value must also be long enough; otherwise a stray
          # 1-3 character value that happens to occur in the title would be
          # accepted as a verified title.
          if len(current_norm) >= min_overlap and current_norm in expected_norm:
              return True

      # Tolerate a truncated or decorated title as long as its head survives.
      prefix = expected_norm[:8]
      return len(prefix) >= min_overlap and prefix in current_norm
  1392. title_filled = False
  1393. title_verified_value = ""
  1394. title_failure_reason = ""
  1395. title_selectors = [
  1396. 'input[placeholder*="标题"]',
  1397. 'textarea[placeholder*="标题"]',
  1398. 'input[aria-label*="标题"]',
  1399. 'textarea[aria-label*="标题"]',
  1400. 'input[data-placeholder*="标题"]',
  1401. 'textarea[data-placeholder*="标题"]',
  1402. 'input[name*="title"]',
  1403. 'textarea[name*="title"]',
  1404. 'input[id*="title"]',
  1405. 'textarea[id*="title"]',
  1406. '[class*="title-input"] input',
  1407. '[class*="title"] textarea',
  1408. '[class*="title"] input',
  1409. '[class*="headline"] input',
  1410. '[class*="headline"] textarea',
  1411. '[class*="name"] input',
  1412. '[contenteditable="true"][placeholder*="标题"]',
  1413. '[contenteditable="true"][aria-label*="标题"]',
  1414. '[contenteditable="plaintext-only"][placeholder*="标题"]',
  1415. '[data-placeholder*="标题"][contenteditable="true"]',
  1416. '[class*="title"] [contenteditable="true"]',
  1417. '[role="textbox"][aria-label*="标题"]',
  1418. '[role="textbox"][placeholder*="标题"]',
  1419. ]
  async def _has_editable_title_input() -> bool:
      """Report whether any frame currently exposes an editable text field.

      Two passes are made over ``self.page.frames``:

      1. Locator pass - for each entry of ``title_selectors`` (enclosing
         scope), the first 10 matches are inspected; a match counts when it
         is visible, is not one of the non-text input types, and is not
         disabled (a failing disabled-check is treated as "enabled").
      2. Deep DOM pass - a script walks the document and every shadow root
         reachable through ``element.shadowRoot`` and accepts any
         input/textarea/contenteditable="true"/role="textbox" element that
         is not disabled or read-only, is not hidden by display/visibility,
         and is at least 8x8 px. This pass does not look for title hints.

      Errors raised while probing a selector or a frame are swallowed and
      the probe moves on.

      Returns:
          ``True`` as soon as either pass finds a field, otherwise ``False``.
      """
      non_text_types = ['file', 'hidden', 'checkbox', 'radio', 'button', 'submit']
      deep_probe_js = """
          () => {
              const roots = [document];
              const visited = new Set();
              while (roots.length) {
                  const root = roots.pop();
                  if (!root || visited.has(root)) continue;
                  visited.add(root);
                  const nodes = root.querySelectorAll('*');
                  for (const n of nodes) {
                      if (n && n.shadowRoot) roots.push(n.shadowRoot);
                      const tag = String(n.tagName || '').toLowerCase();
                      if (!['input', 'textarea'].includes(tag) && String(n.getAttribute && n.getAttribute('contenteditable') || '').toLowerCase() !== 'true' && String(n.getAttribute && n.getAttribute('role') || '').toLowerCase() !== 'textbox') {
                          continue;
                      }
                      const type = String(n.getAttribute && n.getAttribute('type') || '').toLowerCase();
                      if (tag === 'input' && ['file', 'hidden', 'checkbox', 'radio', 'button', 'submit'].includes(type)) continue;
                      if (n.disabled || n.readOnly) continue;
                      const style = window.getComputedStyle(n);
                      if (style.display === 'none' || style.visibility === 'hidden') continue;
                      const rect = n.getBoundingClientRect();
                      if (!rect || rect.width < 8 || rect.height < 8) continue;
                      return true;
                  }
              }
              return false;
          }
          """

      async def _usable(candidate) -> bool:
          # Visible, text-capable and (as far as can be told) enabled.
          if not await candidate.is_visible():
              return False
          kind = (await candidate.get_attribute('type') or '').strip().lower()
          if kind in non_text_types:
              return False
          try:
              return not await candidate.is_disabled()
          except Exception:
              # The disabled state could not be read; assume editable.
              return True

      # Pass 1: selector-based lookup in every frame.
      for frame in self.page.frames:
          for selector in title_selectors:
              try:
                  matches = frame.locator(selector)
                  total = await matches.count()
                  for position in range(min(total, 10)):
                      if await _usable(matches.nth(position)):
                          return True
              except Exception:
                  pass

      # Pass 2: deep DOM scan (including shadow roots) in every frame.
      for frame in self.page.frames:
          try:
              if await frame.evaluate(deep_probe_js):
                  return True
          except Exception:
              pass
      return False
  async def _try_enter_publish_form(stage: str) -> bool:
      """Click "go to publish / next step" style buttons to reach the form.

      First, every frame of ``self.page`` is searched with a fixed list of
      button selectors (up to 6 matches per selector). Visible, enabled
      buttons whose text is not on the block lists are clicked (a normal
      click, then a forced click if that raises). After each click the
      function waits 1.2 s and asks ``_has_editable_title_input()``.

      If that does not succeed, a script run through ``self.page.evaluate``
      walks the DOM (including shadow roots reachable via
      ``element.shadowRoot``) and clicks the first visible, clickable-looking
      element whose text contains one of the wanted labels and none of the
      blocked ones; the same 1.2 s wait and check follow.

      Exceptions are swallowed: a failure inside one selector moves on to
      the next selector, and a failure in the deep pass ends the attempt.
      Buttons may have been clicked even when ``False`` is returned.

      Args:
          stage: Free-form label that only appears in the log output.

      Returns:
          ``True`` if an editable input was detected after a click,
          otherwise ``False``.
      """
      action_selectors = [
          'button:has-text("去发布")',
          '[role="button"]:has-text("去发布")',
          'button:has-text("发布视频")',
          '[role="button"]:has-text("发布视频")',
          'button:has-text("下一步")',
          '[role="button"]:has-text("下一步")',
          'button:has-text("继续")',
          '[role="button"]:has-text("继续")',
          'button:has-text("完成编辑")',
          '[role="button"]:has-text("完成编辑")',
          '[class*="next"] button',
          '[class*="step"] button',
      ]
      blocked_exact = {"发布", "定时发布", "立即发布", "取消", "返回", "关闭"}
      blocked_contains = ["定时发布", "立即发布", "取消", "返回", "关闭", "删除", "重传", "重新上传", "清空"]
      deep_click_js = """
          () => {
              const wanted = ['去发布', '发布视频', '下一步', '继续', '完成编辑'];
              const blockedExact = new Set(['发布', '定时发布', '立即发布', '取消', '返回', '关闭']);
              const blockedContains = ['定时发布', '立即发布', '取消', '返回', '关闭', '删除', '重传', '重新上传', '清空'];
              const roots = [document];
              const visited = new Set();
              const allNodes = [];
              while (roots.length) {
                  const root = roots.pop();
                  if (!root || visited.has(root)) continue;
                  visited.add(root);
                  const nodes = root.querySelectorAll('*');
                  for (const n of nodes) {
                      allNodes.push(n);
                      if (n && n.shadowRoot) roots.push(n.shadowRoot);
                  }
              }
              const isVisible = (el) => {
                  try {
                      const style = window.getComputedStyle(el);
                      if (style.display === 'none' || style.visibility === 'hidden' || style.pointerEvents === 'none') return false;
                      const rect = el.getBoundingClientRect();
                      return !!rect && rect.width > 8 && rect.height > 8;
                  } catch {
                      return false;
                  }
              };
              for (const el of allNodes) {
                  const text = String(el.innerText || el.textContent || '').replace(/\\s+/g, '').trim();
                  if (!text) continue;
                  if (blockedExact.has(text)) continue;
                  if (blockedContains.some(x => text.includes(x))) continue;
                  if (!wanted.some(x => text.includes(x))) continue;
                  if (!isVisible(el)) continue;
                  const tag = String(el.tagName || '').toLowerCase();
                  const role = String(el.getAttribute && el.getAttribute('role') || '').toLowerCase();
                  const cls = String(el.className || '').toLowerCase();
                  const clickable = tag === 'button' || tag === 'a' || role === 'button' || /btn|button|next|step/.test(cls);
                  if (!clickable) continue;
                  try {
                      el.click();
                      return { ok: true, text };
                  } catch {}
              }
              return { ok: false, text: '' };
          }
          """

      def _is_blocked(label: str) -> bool:
          # Buttons that would publish, cancel, go back or discard the upload.
          if label in blocked_exact:
              return True
          return any(token in label for token in blocked_contains)

      async def _press(button) -> None:
          # Best-effort scroll, then a normal click with a forced fallback.
          try:
              await button.scroll_into_view_if_needed(timeout=1500)
          except Exception:
              pass
          try:
              await button.click(timeout=3000)
          except Exception:
              await button.click(force=True, timeout=3000)

      # Selector-driven pass over every frame.
      for frame in self.page.frames:
          frame_url = frame.url or "about:blank"
          for selector in action_selectors:
              try:
                  buttons = frame.locator(selector)
                  found = await buttons.count()
                  for position in range(min(found, 6)):
                      button = buttons.nth(position)
                      if not await button.is_visible():
                          continue
                      label = (await button.text_content() or "").strip()
                      squeezed = re.sub(r"\s+", "", label)
                      if _is_blocked(squeezed):
                          continue
                      has_disabled_attr = await button.get_attribute('disabled') is not None
                      aria_state = (await button.get_attribute('aria-disabled') or '').lower()
                      if has_disabled_attr or aria_state == 'true':
                          continue
                      await _press(button)
                      print(f"[{self.platform_name}] 尝试进入发布表单: stage={stage}, frame={frame_url}, selector={selector}, text={squeezed or label}, idx={position}")
                      await asyncio.sleep(1.2)
                      if await _has_editable_title_input():
                          print(f"[{self.platform_name}] 已进入可编辑发布表单: stage={stage}")
                          return True
              except Exception:
                  pass

      # Deep DOM fallback (including shadow roots).
      try:
          outcome = await self.page.evaluate(deep_click_js)
          if outcome and outcome.get("ok"):
              print(f"[{self.platform_name}] 深层DOM进入发布表单成功: stage={stage}, text={str(outcome.get('text') or '').strip()}")
              await asyncio.sleep(1.2)
              if await _has_editable_title_input():
                  print(f"[{self.platform_name}] 已进入可编辑发布表单(深层DOM): stage={stage}")
                  return True
      except Exception:
          pass
      return False
  1590. # 先等待可编辑标题框出现,避免上传兜底后立即进入导致误命中 file input
  1591. await _try_enter_publish_form("pre-title")
  1592. title_ready = False
  1593. title_wait_deadline = asyncio.get_event_loop().time() + 180
  1594. last_title_wait_log = 0.0
  1595. last_enter_publish_try = 0.0
  1596. while asyncio.get_event_loop().time() < title_wait_deadline and not title_ready:
  1597. try:
  1598. if await _has_editable_title_input():
  1599. title_ready = True
  1600. break
  1601. except Exception:
  1602. pass
  1603. for frame in self.page.frames:
  1604. if title_ready:
  1605. break
  1606. for selector in title_selectors:
  1607. if title_ready:
  1608. break
  1609. try:
  1610. title_nodes = frame.locator(selector)
  1611. node_count = await title_nodes.count()
  1612. for idx in range(min(node_count, 8)):
  1613. node = title_nodes.nth(idx)
  1614. if not await node.is_visible():
  1615. continue
  1616. node_type = (await node.get_attribute('type') or '').strip().lower()
  1617. if node_type in ['file', 'hidden', 'checkbox', 'radio', 'button', 'submit']:
  1618. continue
  1619. try:
  1620. if await node.is_disabled():
  1621. continue
  1622. except Exception:
  1623. pass
  1624. title_ready = True
  1625. break
  1626. except Exception:
  1627. pass
  1628. if title_ready:
  1629. break
  1630. now_wait = asyncio.get_event_loop().time()
  1631. if now_wait - last_title_wait_log >= 10:
  1632. print(f"[{self.platform_name}] 等待可编辑标题输入框... frames={len(self.page.frames)}")
  1633. last_title_wait_log = now_wait
  1634. if now_wait - last_enter_publish_try >= 15:
  1635. await _try_enter_publish_form("title-wait")
  1636. last_enter_publish_try = now_wait
  1637. await asyncio.sleep(2)
  1638. if not title_ready:
  1639. title_failure_reason = "title-not-ready"
  1640. print(f"[{self.platform_name}] 未检测到明确标题输入框,进入兜底识别模式")
  1641. for frame in self.page.frames:
  1642. if title_filled:
  1643. break
  1644. frame_url = frame.url or "about:blank"
  1645. for selector in title_selectors:
  1646. if title_filled:
  1647. break
  1648. try:
  1649. title_nodes = frame.locator(selector)
  1650. node_count = await title_nodes.count()
  1651. for idx in range(min(node_count, 8)):
  1652. node = title_nodes.nth(idx)
  1653. if not await node.is_visible():
  1654. continue
  1655. node_type = (await node.get_attribute('type') or '').strip().lower()
  1656. if node_type in ['file', 'hidden', 'checkbox', 'radio', 'button', 'submit']:
  1657. continue
  1658. try:
  1659. if await node.is_disabled():
  1660. continue
  1661. except Exception:
  1662. pass
  1663. node_tag = ""
  1664. try:
  1665. node_tag = ((await node.evaluate("el => (el.tagName || '').toLowerCase()")) or "").strip()
  1666. except Exception:
  1667. node_tag = ""
  1668. contenteditable_attr = (await node.get_attribute('contenteditable') or '').strip().lower()
  1669. role_attr = (await node.get_attribute('role') or '').strip().lower()
  1670. is_text_input = node_tag in ['input', 'textarea']
  1671. is_editable_block = contenteditable_attr == 'true' or role_attr == 'textbox'
  1672. try:
  1673. await node.click(timeout=2000)
  1674. except Exception:
  1675. pass
  1676. if is_text_input:
  1677. try:
  1678. await node.fill(desired_title, timeout=5000)
  1679. except Exception:
  1680. try:
  1681. await self.page.keyboard.press("Control+KeyA")
  1682. await self.page.keyboard.press("Backspace")
  1683. await self.page.keyboard.type(desired_title)
  1684. except Exception:
  1685. continue
  1686. elif is_editable_block:
  1687. try:
  1688. await self.page.keyboard.press("Control+KeyA")
  1689. await self.page.keyboard.press("Backspace")
  1690. await self.page.keyboard.type(desired_title)
  1691. except Exception:
  1692. try:
  1693. await node.evaluate(
  1694. """
  1695. (el, title) => {
  1696. el.focus();
  1697. el.textContent = title;
  1698. el.dispatchEvent(new Event('input', { bubbles: true }));
  1699. el.dispatchEvent(new Event('change', { bubbles: true }));
  1700. }
  1701. """,
  1702. desired_title
  1703. )
  1704. except Exception:
  1705. continue
  1706. else:
  1707. continue
  1708. await asyncio.sleep(0.2)
  1709. current_value = ""
  1710. if is_text_input:
  1711. try:
  1712. current_value = (await node.input_value() or "").strip()
  1713. except Exception:
  1714. current_value = ""
  1715. else:
  1716. try:
  1717. current_value = ((await node.evaluate("el => (el.innerText || el.textContent || '')")) or "").strip()
  1718. except Exception:
  1719. current_value = ""
  1720. if _title_matches_expected(current_value):
  1721. title_filled = True
  1722. title_verified_value = current_value
  1723. print(f"[{self.platform_name}] 标题填写成功: frame={frame_url}, selector={selector}, idx={idx}, value={current_value}")
  1724. break
  1725. elif current_value:
  1726. title_failure_reason = "candidate-mismatch"
  1727. # 对同一节点再做一次 JS 强制赋值,处理键盘输入未生效的情况
  1728. forced_value = ""
  1729. try:
  1730. forced_value = (
  1731. (await node.evaluate(
  1732. """
  1733. (el, title) => {
  1734. const tag = String(el.tagName || '').toLowerCase();
  1735. const type = String((el.getAttribute('type') || '')).toLowerCase();
  1736. if (tag === 'input' && ['file', 'hidden', 'checkbox', 'radio', 'button', 'submit'].includes(type)) return '';
  1737. const ce = String(el.getAttribute('contenteditable') || '').toLowerCase();
  1738. const role = String(el.getAttribute('role') || '').toLowerCase();
  1739. const isTextInput = tag === 'input' || tag === 'textarea';
  1740. const isEditableBlock = ce === 'true' || role === 'textbox';
  1741. const emit = () => {
  1742. el.dispatchEvent(new Event('input', { bubbles: true }));
  1743. el.dispatchEvent(new Event('change', { bubbles: true }));
  1744. };
  1745. try { el.focus(); } catch {}
  1746. if (isTextInput) {
  1747. try {
  1748. const proto = tag === 'textarea' ? window.HTMLTextAreaElement.prototype : window.HTMLInputElement.prototype;
  1749. const setter = Object.getOwnPropertyDescriptor(proto, 'value')?.set;
  1750. if (setter) {
  1751. setter.call(el, '');
  1752. emit();
  1753. setter.call(el, title);
  1754. emit();
  1755. } else {
  1756. el.value = '';
  1757. emit();
  1758. el.value = title;
  1759. emit();
  1760. }
  1761. } catch {
  1762. el.value = title;
  1763. emit();
  1764. }
  1765. return String(el.value || '').trim();
  1766. }
  1767. if (isEditableBlock) {
  1768. el.textContent = '';
  1769. emit();
  1770. el.textContent = title;
  1771. emit();
  1772. return String(el.innerText || el.textContent || '').trim();
  1773. }
  1774. return '';
  1775. }
  1776. """,
  1777. desired_title
  1778. )) or ""
  1779. ).strip()
  1780. except Exception:
  1781. forced_value = ""
  1782. if _title_matches_expected(forced_value):
  1783. title_filled = True
  1784. title_verified_value = forced_value
  1785. print(f"[{self.platform_name}] 标题强制写入成功: frame={frame_url}, selector={selector}, idx={idx}, value={forced_value}")
  1786. break
  1787. print(f"[{self.platform_name}] 标题候选值不匹配,已忽略: frame={frame_url}, selector={selector}, idx={idx}, value={current_value}")
  1788. except Exception as e:
  1789. print(f"[{self.platform_name}] 标题选择器失败: frame={frame_url}, selector={selector}, err={e}")
  1790. # 深层 DOM 兜底(含 shadowRoot)
  1791. if not title_filled and desired_title:
  1792. for frame in self.page.frames:
  1793. if title_filled:
  1794. break
  1795. frame_url = frame.url or "about:blank"
  1796. try:
  1797. deep_result = await frame.evaluate(
  1798. """
  1799. (title) => {
  1800. const roots = [document];
  1801. const visited = new Set();
  1802. const candidates = [];
  1803. while (roots.length) {
  1804. const root = roots.pop();
  1805. if (!root || visited.has(root)) continue;
  1806. visited.add(root);
  1807. const nodes = root.querySelectorAll('*');
  1808. for (const n of nodes) {
  1809. if (n && n.shadowRoot) roots.push(n.shadowRoot);
  1810. const tag = String(n.tagName || '').toLowerCase();
  1811. const type = String(n.getAttribute && n.getAttribute('type') || '').toLowerCase();
  1812. const ce = String(n.getAttribute && n.getAttribute('contenteditable') || '').toLowerCase();
  1813. const role = String(n.getAttribute && n.getAttribute('role') || '').toLowerCase();
  1814. const isTextInput = tag === 'input' || tag === 'textarea';
  1815. const isEditableBlock = ce === 'true' || role === 'textbox';
  1816. if (!isTextInput && !isEditableBlock) continue;
  1817. if (tag === 'input' && ['file', 'hidden', 'checkbox', 'radio', 'button', 'submit'].includes(type)) continue;
  1818. if (n.disabled || n.readOnly) continue;
  1819. const style = window.getComputedStyle(n);
  1820. if (style.display === 'none' || style.visibility === 'hidden') continue;
  1821. const rect = n.getBoundingClientRect();
  1822. if (!rect || rect.width < 8 || rect.height < 8) continue;
  1823. const ph = String(n.getAttribute && n.getAttribute('placeholder') || '');
  1824. const aria = String(n.getAttribute && n.getAttribute('aria-label') || '');
  1825. const name = String(n.getAttribute && n.getAttribute('name') || '');
  1826. const id = String(n.getAttribute && n.getAttribute('id') || '');
  1827. const cls = String(n.className || '');
  1828. const maxLen = parseInt(String(n.getAttribute && n.getAttribute('maxlength') || '0'), 10) || 0;
  1829. const container = n.closest && n.closest('label, [class*="form"], [class*="item"], [class*="field"], [class*="title"]');
  1830. const ctx = String((container && container.innerText) || '').slice(0, 80);
  1831. let score = 0;
  1832. if (/标题|title/i.test(ph)) score += 7;
  1833. if (/标题|title/i.test(aria)) score += 6;
  1834. if (/标题|title/i.test(name)) score += 5;
  1835. if (/标题|title/i.test(id)) score += 5;
  1836. if (/title|标题/i.test(cls)) score += 4;
  1837. if (/标题|title/i.test(ctx)) score += 5;
  1838. if (maxLen > 0 && maxLen <= 40) score += 3;
  1839. if (isTextInput) score += 2;
  1840. if (isEditableBlock) score += 1;
  1841. candidates.push({ n, score, isTextInput, isEditableBlock });
  1842. }
  1843. }
  1844. candidates.sort((a, b) => b.score - a.score);
  1845. if (!candidates.length) return { ok: false, value: '', reason: 'no-candidate' };
  1846. const emit = (el) => {
  1847. el.dispatchEvent(new Event('input', { bubbles: true }));
  1848. el.dispatchEvent(new Event('change', { bubbles: true }));
  1849. };
  1850. let lastError = '';
  1851. for (const item of candidates.slice(0, 12)) {
  1852. const el = item.n;
  1853. try {
  1854. el.focus();
  1855. if (item.isTextInput) {
  1856. const tag = String(el.tagName || '').toLowerCase();
  1857. const proto = tag === 'textarea' ? window.HTMLTextAreaElement.prototype : window.HTMLInputElement.prototype;
  1858. const setter = Object.getOwnPropertyDescriptor(proto, 'value')?.set;
  1859. if (setter) {
  1860. setter.call(el, '');
  1861. emit(el);
  1862. setter.call(el, title);
  1863. emit(el);
  1864. } else {
  1865. el.value = '';
  1866. emit(el);
  1867. el.value = title;
  1868. emit(el);
  1869. }
  1870. const v = String(el.value || '').trim();
  1871. if (v) return { ok: true, value: v, score: item.score };
  1872. } else if (item.isEditableBlock) {
  1873. el.textContent = '';
  1874. emit(el);
  1875. el.textContent = title;
  1876. emit(el);
  1877. const v = String(el.innerText || el.textContent || '').trim();
  1878. if (v) return { ok: true, value: v, score: item.score };
  1879. }
  1880. } catch (e) {
  1881. lastError = String(e || '');
  1882. }
  1883. }
  1884. return { ok: false, value: '', reason: lastError || 'set-value-failed' };
  1885. }
  1886. """,
  1887. desired_title
  1888. )
  1889. if deep_result and deep_result.get('ok'):
  1890. deep_written = str(deep_result.get('value') or '').strip()
  1891. if _title_matches_expected(deep_written):
  1892. title_filled = True
  1893. title_verified_value = deep_written
  1894. print(f"[{self.platform_name}] 标题深层DOM填写成功: frame={frame_url}, value={deep_written}")
  1895. break
  1896. elif deep_written:
  1897. title_failure_reason = "deep-dom-mismatch"
  1898. print(f"[{self.platform_name}] 标题深层DOM命中但值不匹配: frame={frame_url}, value={deep_written}")
  1899. except Exception:
  1900. pass
  1901. # JS 兜底写入标题
  1902. if not title_filled and desired_title:
  1903. fallback_reason = ""
  1904. for frame in self.page.frames:
  1905. if title_filled:
  1906. break
  1907. frame_url = frame.url or "about:blank"
  1908. try:
  1909. fallback = await frame.evaluate(
  1910. """
  1911. (title) => {
  1912. const nodes = Array.from(document.querySelectorAll(
  1913. 'input:not([type="file"]):not([type="hidden"]), textarea, [contenteditable="true"], [role="textbox"]'
  1914. ));
  1915. const scored = nodes
  1916. .map((el) => {
  1917. const tag = String(el.tagName || '').toLowerCase();
  1918. const type = String((el.getAttribute('type') || '')).toLowerCase();
  1919. if (tag === 'input' && ['file', 'hidden', 'checkbox', 'radio', 'button', 'submit'].includes(type)) return null;
  1920. if (el.disabled || el.readOnly) return null;
  1921. const style = window.getComputedStyle(el);
  1922. if (style.display === 'none' || style.visibility === 'hidden') return null;
  1923. const rect = el.getBoundingClientRect();
  1924. if (!rect || rect.width < 8 || rect.height < 8) return null;
  1925. const ph = String(el.getAttribute('placeholder') || '');
  1926. const aria = String(el.getAttribute('aria-label') || '');
  1927. const name = String(el.getAttribute('name') || '');
  1928. const id = String(el.getAttribute('id') || '');
  1929. const cls = String(el.className || '');
  1930. const ce = String(el.getAttribute('contenteditable') || '').toLowerCase();
  1931. const role = String(el.getAttribute('role') || '').toLowerCase();
  1932. const maxLen = parseInt(String(el.getAttribute('maxlength') || '0'), 10) || 0;
  1933. const container = el.closest('label, [class*="form"], [class*="item"], [class*="field"], [class*="title"]');
  1934. const ctx = String((container && container.innerText) || '').slice(0, 80);
  1935. let score = 0;
  1936. if (ph.includes('标题')) score += 6;
  1937. if (aria.includes('标题')) score += 5;
  1938. if (/title|标题/i.test(name)) score += 4;
  1939. if (/title|标题/i.test(id)) score += 4;
  1940. if (/title|标题/i.test(cls)) score += 3;
  1941. if (/标题|title/i.test(ctx)) score += 4;
  1942. if (maxLen > 0 && maxLen <= 40) score += 3;
  1943. if (tag === 'input' || tag === 'textarea') score += 1;
  1944. if (ce === 'true' || role === 'textbox') score += 2;
  1945. return { el, score, maxLen };
  1946. })
  1947. .filter(x => x && x.score > 0)
  1948. .sort((a, b) => b.score - a.score);
  1949. // 没有明显标题线索时,回退到短输入框(常见标题长度限制)
  1950. const candidates = scored.length
  1951. ? scored
  1952. : nodes
  1953. .map((el) => {
  1954. const tag = String(el.tagName || '').toLowerCase();
  1955. const type = String((el.getAttribute('type') || '')).toLowerCase();
  1956. if (tag === 'input' && ['file', 'hidden', 'checkbox', 'radio', 'button', 'submit'].includes(type)) return null;
  1957. if (el.disabled || el.readOnly) return null;
  1958. const style = window.getComputedStyle(el);
  1959. if (style.display === 'none' || style.visibility === 'hidden') return null;
  1960. const rect = el.getBoundingClientRect();
  1961. if (!rect || rect.width < 8 || rect.height < 8) return null;
  1962. const maxLen = parseInt(String(el.getAttribute('maxlength') || '0'), 10) || 0;
  1963. const score = (maxLen > 0 && maxLen <= 40 ? 3 : 0) + (tag === 'input' || tag === 'textarea' ? 1 : 0);
  1964. return score > 0 ? { el, score, maxLen } : null;
  1965. })
  1966. .filter(Boolean)
  1967. .sort((a, b) => b.score - a.score);
  1968. if (!candidates.length) return { ok: false, value: '', reason: 'no-scored-input' };
  1969. let lastError = '';
  1970. for (const item of candidates.slice(0, 10)) {
  1971. const target = item.el;
  1972. const tag = String(target.tagName || '').toLowerCase();
  1973. const ce = String(target.getAttribute('contenteditable') || '').toLowerCase();
  1974. const role = String(target.getAttribute('role') || '').toLowerCase();
  1975. const isTextInput = tag === 'input' || tag === 'textarea';
  1976. const isEditableBlock = ce === 'true' || role === 'textbox';
  1977. try {
  1978. target.focus();
  1979. if (isTextInput) {
  1980. target.value = '';
  1981. target.dispatchEvent(new Event('input', { bubbles: true }));
  1982. target.value = title;
  1983. target.dispatchEvent(new Event('input', { bubbles: true }));
  1984. target.dispatchEvent(new Event('change', { bubbles: true }));
  1985. const v = String(target.value || '').trim();
  1986. if (v) return { ok: true, value: v, score: item.score || 0 };
  1987. } else if (isEditableBlock) {
  1988. target.textContent = '';
  1989. target.dispatchEvent(new Event('input', { bubbles: true }));
  1990. target.textContent = title;
  1991. target.dispatchEvent(new Event('input', { bubbles: true }));
  1992. target.dispatchEvent(new Event('change', { bubbles: true }));
  1993. const v = String(target.innerText || target.textContent || '').trim();
  1994. if (v) return { ok: true, value: v, score: item.score || 0 };
  1995. }
  1996. } catch (e) {
  1997. lastError = String(e || '');
  1998. }
  1999. }
  2000. return { ok: false, value: '', reason: lastError || 'set-value-failed' };
  2001. }
  2002. """,
  2003. desired_title
  2004. )
  2005. if fallback and fallback.get('ok'):
  2006. written = str(fallback.get('value') or '').strip()
  2007. if _title_matches_expected(written):
  2008. title_filled = True
  2009. title_verified_value = written
  2010. print(f"[{self.platform_name}] 标题 JS 兜底填写成功: frame={frame_url}, value={written}")
  2011. break
  2012. elif written:
  2013. fallback_reason = f"fallback-value-not-match:{written}"
  2014. title_failure_reason = fallback_reason
  2015. print(f"[{self.platform_name}] 标题 JS 兜底命中疑似错误字段,已忽略: frame={frame_url}, value={written}")
  2016. elif fallback:
  2017. fallback_reason = str(fallback.get('reason') or '')
  2018. if fallback_reason:
  2019. title_failure_reason = fallback_reason
  2020. except Exception as e:
  2021. fallback_reason = str(e)
  2022. if fallback_reason:
  2023. title_failure_reason = fallback_reason
  2024. if not title_filled:
  2025. print(f"[{self.platform_name}] 标题 JS 兜底未命中: reason={fallback_reason or 'unknown'}")
  2026. # 强化重试:标题框可能在上传收尾阶段延迟可编辑,循环尝试写入一段时间
  2027. if not title_filled and desired_title:
  2028. print(f"[{self.platform_name}] 标题常规填写未命中,进入强化重试...")
  2029. # 百家号在上传 80%+ 后可能经历较长静默处理期,给更长窗口等待标题输入框真正可编辑
  2030. strong_retry_deadline = asyncio.get_event_loop().time() + 240
  2031. strong_retry_round = 0
  2032. last_retry_log = 0.0
  2033. while asyncio.get_event_loop().time() < strong_retry_deadline and not title_filled:
  2034. strong_retry_round += 1
  2035. retry_reason = ""
  2036. if strong_retry_round == 1 or strong_retry_round % 5 == 0:
  2037. await _try_enter_publish_form(f"title-retry-{strong_retry_round}")
  2038. for frame in self.page.frames:
  2039. if title_filled:
  2040. break
  2041. frame_url = frame.url or "about:blank"
  2042. try:
  2043. retry_result = await frame.evaluate(
  2044. """
  2045. (title) => {
  2046. const nodes = Array.from(document.querySelectorAll(
  2047. 'input:not([type="file"]):not([type="hidden"]), textarea, [contenteditable="true"], [role="textbox"]'
  2048. ));
  2049. const candidates = nodes
  2050. .map((el) => {
  2051. const tag = String(el.tagName || '').toLowerCase();
  2052. const type = String((el.getAttribute('type') || '')).toLowerCase();
  2053. if (tag === 'input' && ['file', 'hidden', 'checkbox', 'radio', 'button', 'submit'].includes(type)) return null;
  2054. if (el.disabled || el.readOnly) return null;
  2055. const style = window.getComputedStyle(el);
  2056. if (style.display === 'none' || style.visibility === 'hidden') return null;
  2057. const rect = el.getBoundingClientRect();
  2058. if (!rect || rect.width < 8 || rect.height < 8) return null;
  2059. const ph = String(el.getAttribute('placeholder') || '');
  2060. const aria = String(el.getAttribute('aria-label') || '');
  2061. const name = String(el.getAttribute('name') || '');
  2062. const id = String(el.getAttribute('id') || '');
  2063. const cls = String(el.className || '');
  2064. const ce = String(el.getAttribute('contenteditable') || '').toLowerCase();
  2065. const role = String(el.getAttribute('role') || '').toLowerCase();
  2066. const maxLen = parseInt(String(el.getAttribute('maxlength') || '0'), 10) || 0;
  2067. const container = el.closest('label, [class*="form"], [class*="item"], [class*="field"], [class*="title"]');
  2068. const ctx = String((container && container.innerText) || '').slice(0, 80);
  2069. let score = 0;
  2070. if (/标题|title/i.test(ph)) score += 7;
  2071. if (/标题|title/i.test(aria)) score += 6;
  2072. if (/标题|title/i.test(name)) score += 5;
  2073. if (/标题|title/i.test(id)) score += 5;
  2074. if (/title|标题/i.test(cls)) score += 4;
  2075. if (/标题|title/i.test(ctx)) score += 5;
  2076. if (maxLen > 0 && maxLen <= 40) score += 3;
  2077. if (tag === 'input' || tag === 'textarea') score += 2;
  2078. if (ce === 'true' || role === 'textbox') score += 1;
  2079. return { el, score };
  2080. })
  2081. .filter(Boolean)
  2082. .sort((a, b) => b.score - a.score);
  2083. if (!candidates.length) {
  2084. return { ok: false, value: '', score: -1, reason: 'no-candidate' };
  2085. }
  2086. let lastError = '';
  2087. for (const item of candidates.slice(0, 12)) {
  2088. const target = item.el;
  2089. const tag = String(target.tagName || '').toLowerCase();
  2090. const ce = String(target.getAttribute('contenteditable') || '').toLowerCase();
  2091. const role = String(target.getAttribute('role') || '').toLowerCase();
  2092. const isTextInput = tag === 'input' || tag === 'textarea';
  2093. const isEditableBlock = ce === 'true' || role === 'textbox';
  2094. const emit = () => {
  2095. target.dispatchEvent(new Event('input', { bubbles: true }));
  2096. target.dispatchEvent(new Event('change', { bubbles: true }));
  2097. };
  2098. try {
  2099. target.focus();
  2100. if (isTextInput) {
  2101. try {
  2102. const proto = tag === 'textarea' ? window.HTMLTextAreaElement.prototype : window.HTMLInputElement.prototype;
  2103. const setter = Object.getOwnPropertyDescriptor(proto, 'value')?.set;
  2104. if (setter) {
  2105. setter.call(target, '');
  2106. emit();
  2107. setter.call(target, title);
  2108. emit();
  2109. } else {
  2110. target.value = '';
  2111. emit();
  2112. target.value = title;
  2113. emit();
  2114. }
  2115. } catch {
  2116. target.value = title;
  2117. emit();
  2118. }
  2119. const v = String(target.value || '').trim();
  2120. if (v) return { ok: true, value: v, score: item.score || 0, reason: '' };
  2121. } else if (isEditableBlock) {
  2122. target.textContent = '';
  2123. emit();
  2124. target.textContent = title;
  2125. emit();
  2126. const v = String(target.innerText || target.textContent || '').trim();
  2127. if (v) return { ok: true, value: v, score: item.score || 0, reason: '' };
  2128. }
  2129. } catch (e) {
  2130. lastError = String(e || '');
  2131. }
  2132. }
  2133. return { ok: false, value: '', score: -1, reason: lastError || 'set-value-failed' };
  2134. }
  2135. """,
  2136. desired_title
  2137. )
  2138. if retry_result and retry_result.get('ok'):
  2139. written = str(retry_result.get('value') or '').strip()
  2140. score = int(retry_result.get('score') or 0)
  2141. # 强化重试仍要求“像标题”且可匹配,避免误写到其他文本框
  2142. if score >= 3 and _title_matches_expected(written):
  2143. title_filled = True
  2144. title_verified_value = written
  2145. print(f"[{self.platform_name}] 标题强化重试成功: round={strong_retry_round}, frame={frame_url}, score={score}, value={written}")
  2146. break
  2147. elif written:
  2148. retry_reason = f"value-not-match:{written},score={score}"
  2149. elif retry_result:
  2150. retry_reason = str(retry_result.get('reason') or '')
  2151. except Exception as e:
  2152. retry_reason = str(e)
  2153. if title_filled:
  2154. break
  2155. now_retry = asyncio.get_event_loop().time()
  2156. if retry_reason in ("no-candidate", "no-scored-input"):
  2157. has_title_input = await _has_editable_title_input()
  2158. if not has_title_input:
  2159. retry_reason = "no-candidate-and-form-not-ready"
  2160. if now_retry - last_retry_log >= 10:
  2161. print(f"[{self.platform_name}] 标题强化重试中: round={strong_retry_round}, reason={retry_reason or 'pending'}")
  2162. last_retry_log = now_retry
  2163. if retry_reason:
  2164. title_failure_reason = retry_reason
  2165. await asyncio.sleep(3)
  2166. # AI 兜底:页面结构变化时,通过视觉识别返回可用 selector
  2167. if not title_filled and desired_title:
  2168. print(f"[{self.platform_name}] 标题强化重试仍未命中,尝试 AI selector 兜底...")
  2169. try:
  2170. ai_goal = "找到页面中用于填写视频标题的输入框或可编辑区域,返回一个可直接输入标题的 Playwright selector"
  2171. ai_selector = await self.ai_suggest_playwright_selector(ai_goal)
  2172. if ai_selector.get("has_selector"):
  2173. selector = str(ai_selector.get("selector") or "").strip()
  2174. confidence = int(ai_selector.get("confidence") or 0)
  2175. print(f"[{self.platform_name}] AI 标题 selector: {selector}, confidence={confidence}")
  2176. for frame in self.page.frames:
  2177. if title_filled:
  2178. break
  2179. frame_url = frame.url or "about:blank"
  2180. try:
  2181. loc = frame.locator(selector).first
  2182. if await loc.count() <= 0 or not await loc.is_visible():
  2183. continue
  2184. try:
  2185. await loc.click(timeout=2500)
  2186. except Exception:
  2187. pass
  2188. node_tag = ""
  2189. try:
  2190. node_tag = ((await loc.evaluate("el => (el.tagName || '').toLowerCase()")) or "").strip()
  2191. except Exception:
  2192. node_tag = ""
  2193. is_text_input = node_tag in ["input", "textarea"]
  2194. if is_text_input:
  2195. try:
  2196. await loc.fill(desired_title, timeout=5000)
  2197. except Exception:
  2198. await self.page.keyboard.press("Control+KeyA")
  2199. await self.page.keyboard.press("Backspace")
  2200. await self.page.keyboard.type(desired_title)
  2201. else:
  2202. try:
  2203. await self.page.keyboard.press("Control+KeyA")
  2204. await self.page.keyboard.press("Backspace")
  2205. await self.page.keyboard.type(desired_title)
  2206. except Exception:
  2207. await loc.evaluate(
  2208. """
  2209. (el, title) => {
  2210. el.focus();
  2211. const tag = String(el.tagName || '').toLowerCase();
  2212. if (tag === 'input' || tag === 'textarea') {
  2213. el.value = title;
  2214. } else {
  2215. el.textContent = title;
  2216. }
  2217. el.dispatchEvent(new Event('input', { bubbles: true }));
  2218. el.dispatchEvent(new Event('change', { bubbles: true }));
  2219. }
  2220. """,
  2221. desired_title
  2222. )
  2223. await asyncio.sleep(0.3)
  2224. current_value = ""
  2225. try:
  2226. if is_text_input:
  2227. current_value = (await loc.input_value() or "").strip()
  2228. else:
  2229. current_value = ((await loc.evaluate("el => (el.innerText || el.textContent || '')")) or "").strip()
  2230. except Exception:
  2231. current_value = ""
  2232. if _title_matches_expected(current_value):
  2233. title_filled = True
  2234. title_verified_value = current_value
  2235. print(f"[{self.platform_name}] AI selector 标题填写成功: frame={frame_url}, value={current_value}")
  2236. break
  2237. else:
  2238. print(f"[{self.platform_name}] AI selector 命中但值不匹配: frame={frame_url}, value={current_value}")
  2239. except Exception as e:
  2240. print(f"[{self.platform_name}] AI selector 执行失败: frame={frame_url}, err={e}")
  2241. else:
  2242. print(f"[{self.platform_name}] AI 未返回可用标题 selector: {ai_selector.get('notes') or 'no-notes'}")
  2243. title_failure_reason = "ai-no-selector"
  2244. except Exception as e:
  2245. print(f"[{self.platform_name}] AI 标题兜底异常: {e}")
  2246. title_failure_reason = f"ai-exception:{e}"
  2247. if not title_filled:
  2248. # 某些版本页面在上传后长期不暴露可编辑标题框;不中断流程,尝试继续发布。
  2249. if any(k in (title_failure_reason or "") for k in ["no-candidate", "form-not-ready", "title-not-ready", "ai-no-selector"]):
  2250. print(f"[{self.platform_name}] 标题输入框未就绪({title_failure_reason}),继续后续发布流程(使用页面现有标题)")
  2251. else:
  2252. screenshot_base64 = await self.capture_screenshot()
  2253. return PublishResult(
  2254. success=False,
  2255. platform=self.platform_name,
  2256. error=f"标题填写失败,已终止发布: {title_failure_reason or 'unknown'}",
  2257. screenshot_base64=screenshot_base64,
  2258. page_url=await self.get_page_url(),
  2259. status='failed'
  2260. )
  2261. # 填写描述
  2262. if params.description:
  2263. self.report_progress(65, "正在填写描述...")
  2264. try:
  2265. desc_selectors = [
  2266. 'textarea[placeholder*="描述"]',
  2267. 'textarea[placeholder*="简介"]',
  2268. '[class*="desc"] textarea',
  2269. '[class*="description"] textarea',
  2270. ]
  2271. for selector in desc_selectors:
  2272. try:
  2273. desc_input = self.page.locator(selector).first
  2274. if await desc_input.count() > 0 and await desc_input.is_visible():
  2275. await desc_input.click()
  2276. await self.page.keyboard.type(params.description[:200])
  2277. print(f"[{self.platform_name}] 描述填写成功")
  2278. break
  2279. except:
  2280. pass
  2281. except Exception as e:
  2282. print(f"[{self.platform_name}] 描述填写失败: {e}")
  2283. self.report_progress(70, "正在发布...")
  2284. await asyncio.sleep(1.5)
  2285. # 点击发布按钮(等待按钮可点击,避免上传完成后直接误判失败)
  2286. publish_selectors = [
  2287. 'button:has-text("立即发布")',
  2288. '[role="button"]:has-text("立即发布")',
  2289. 'button:has-text("确认发布")',
  2290. '[role="button"]:has-text("确认发布")',
  2291. 'button:has-text("发布")',
  2292. '[role="button"]:has-text("发布")',
  2293. 'button:has-text("发表")',
  2294. 'button:has-text("提交")',
  2295. '[class*="publish"] button',
  2296. '[class*="submit"] button',
  2297. ]
  2298. publish_blocked_keywords = [
  2299. "定时发布",
  2300. "预约发布",
  2301. "存草稿",
  2302. "草稿",
  2303. "取消",
  2304. "返回",
  2305. "预览",
  2306. ]
  2307. publish_processing_indicators = [
  2308. 'div:has-text("发布中")',
  2309. 'div:has-text("提交中")',
  2310. 'span:has-text("发布中")',
  2311. 'span:has-text("提交中")',
  2312. 'div:has-text("正在上传")',
  2313. 'div:has-text("正在处理")',
  2314. 'span:has-text("正在上传")',
  2315. 'span:has-text("正在处理")',
  2316. 'div:has-text("请稍候")',
  2317. 'span:has-text("请稍候")',
  2318. 'div:has-text("审核中")',
  2319. 'span:has-text("审核中")',
  2320. ]
  2321. def _compact_btn_text(text: str) -> str:
  2322. return re.sub(r"\s+", "", str(text or "")).strip()
  2323. def _score_publish_button(btn_text_compact: str, prefer_confirm: bool = False) -> int:
  2324. if not btn_text_compact:
  2325. return -1
  2326. if any(k in btn_text_compact for k in publish_blocked_keywords):
  2327. return -1
  2328. if "发布中" in btn_text_compact or "提交中" in btn_text_compact:
  2329. return -1
  2330. score = -1
  2331. if "立即发布" in btn_text_compact:
  2332. score = 130
  2333. elif btn_text_compact == "确认发布":
  2334. score = 125
  2335. elif "确认发布" in btn_text_compact:
  2336. score = 120
  2337. elif btn_text_compact == "发布":
  2338. score = 115
  2339. elif "发布" in btn_text_compact:
  2340. score = 100
  2341. elif "发表" in btn_text_compact:
  2342. score = 80
  2343. elif "提交" in btn_text_compact:
  2344. score = 70
  2345. if score < 0:
  2346. return -1
  2347. if prefer_confirm and ("确认发布" in btn_text_compact or "立即发布" in btn_text_compact):
  2348. score += 20
  2349. return score
  2350. async def _collect_publish_candidates(prefer_confirm: bool = False):
  2351. candidates = []
  2352. found_visible_button = False
  2353. found_disabled_button = False
  2354. for frame in self.page.frames:
  2355. frame_url = frame.url or "about:blank"
  2356. for selector in publish_selectors:
  2357. try:
  2358. btns = frame.locator(selector)
  2359. btn_count = await btns.count()
  2360. for idx in range(min(btn_count, 6)):
  2361. btn = btns.nth(idx)
  2362. if not await btn.is_visible():
  2363. continue
  2364. found_visible_button = True
  2365. btn_text = (await btn.text_content() or "").strip()
  2366. btn_text_compact = _compact_btn_text(btn_text)
  2367. disabled_attr = await btn.get_attribute('disabled')
  2368. aria_disabled = (await btn.get_attribute('aria-disabled') or '').lower()
  2369. cls = (await btn.get_attribute('class') or '').lower()
  2370. is_disabled = bool(disabled_attr) or aria_disabled == 'true' or 'disabled' in cls
  2371. if is_disabled:
  2372. found_disabled_button = True
  2373. continue
  2374. score = _score_publish_button(btn_text_compact, prefer_confirm=prefer_confirm)
  2375. if score < 0:
  2376. continue
  2377. candidates.append({
  2378. "btn": btn,
  2379. "frame_url": frame_url,
  2380. "selector": selector,
  2381. "idx": idx,
  2382. "text": btn_text,
  2383. "score": score,
  2384. })
  2385. except Exception:
  2386. pass
  2387. candidates.sort(key=lambda x: x.get("score", 0), reverse=True)
  2388. return candidates, found_visible_button, found_disabled_button
  2389. async def _click_publish_candidate(candidate: dict):
  2390. btn = candidate.get("btn")
  2391. if not btn:
  2392. return False, "candidate-empty"
  2393. frame_url = str(candidate.get("frame_url") or "about:blank")
  2394. selector = str(candidate.get("selector") or "")
  2395. idx = int(candidate.get("idx") or 0)
  2396. btn_text = str(candidate.get("text") or "").strip()
  2397. before_url = self.page.url
  2398. try:
  2399. try:
  2400. await btn.scroll_into_view_if_needed(timeout=1500)
  2401. except Exception:
  2402. pass
  2403. try:
  2404. await btn.click(timeout=4000)
  2405. except Exception:
  2406. await btn.click(force=True, timeout=4000)
  2407. await asyncio.sleep(0.6)
  2408. after_url = self.page.url
  2409. state_flags = []
  2410. if after_url != before_url:
  2411. state_flags.append("url-changed")
  2412. try:
  2413. post_text = _compact_btn_text(await btn.text_content() or "")
  2414. if any(k in post_text for k in ["发布中", "提交中", "处理中"]):
  2415. state_flags.append("btn-processing")
  2416. except Exception:
  2417. pass
  2418. try:
  2419. for indicator in publish_processing_indicators:
  2420. loc = self.page.locator(indicator).first
  2421. if await loc.count() > 0 and await loc.is_visible():
  2422. state_flags.append("processing-indicator")
  2423. break
  2424. except Exception:
  2425. pass
  2426. state_desc = ",".join(state_flags) if state_flags else "no-immediate-signal"
  2427. print(f"[{self.platform_name}] 点击发布按钮成功: frame={frame_url}, selector={selector}, idx={idx}, text={btn_text}, state={state_desc}")
  2428. return True, ""
  2429. except Exception as e:
  2430. return False, str(e)
  2431. publish_clicked = False
  2432. publish_click_error = ""
  2433. publish_clicked_text = ""
  2434. click_deadline = asyncio.get_event_loop().time() + 180
  2435. last_publish_log = 0.0
  2436. while asyncio.get_event_loop().time() < click_deadline and not publish_clicked:
  2437. candidates, found_visible_button, found_disabled_button = await _collect_publish_candidates(prefer_confirm=False)
  2438. if candidates:
  2439. for candidate in candidates[:6]:
  2440. ok, err = await _click_publish_candidate(candidate)
  2441. if ok:
  2442. publish_clicked = True
  2443. publish_clicked_text = str(candidate.get("text") or "").strip()
  2444. break
  2445. if err:
  2446. publish_click_error = err
  2447. if publish_clicked:
  2448. break
  2449. now_click = asyncio.get_event_loop().time()
  2450. if now_click - last_publish_log >= 10:
  2451. if found_visible_button and found_disabled_button:
  2452. print(f"[{self.platform_name}] 发布按钮可见但不可点击,等待可用...")
  2453. elif found_visible_button:
  2454. print(f"[{self.platform_name}] 发布按钮可见,但点击失败,继续重试...")
  2455. else:
  2456. print(f"[{self.platform_name}] 尚未找到可见发布按钮,继续等待...")
  2457. last_publish_log = now_click
  2458. await asyncio.sleep(2)
  2459. # 某些页面会二次弹出“确认发布/立即发布”,补一次优先确认点击
  2460. if publish_clicked:
  2461. initial_text = _compact_btn_text(publish_clicked_text)
  2462. if initial_text and initial_text != "立即发布":
  2463. await asyncio.sleep(1)
  2464. confirm_candidates, _, _ = await _collect_publish_candidates(prefer_confirm=True)
  2465. for candidate in confirm_candidates[:4]:
  2466. candidate_text = _compact_btn_text(str(candidate.get("text") or ""))
  2467. if candidate_text == initial_text and ("确认发布" not in candidate_text and "立即发布" not in candidate_text):
  2468. continue
  2469. ok, err = await _click_publish_candidate(candidate)
  2470. if ok:
  2471. print(f"[{self.platform_name}] 检测到二次确认发布流程,已补点确认按钮: {candidate_text}")
  2472. break
  2473. if err:
  2474. publish_click_error = err
  2475. if not publish_clicked:
  2476. screenshot_base64 = await self.capture_screenshot()
  2477. return PublishResult(
  2478. success=False,
  2479. platform=self.platform_name,
  2480. error=f"发布按钮未找到或不可点击(可能仍在处理/必填项未通过)。title={title_verified_value or desired_title}; err={publish_click_error or 'none'}",
  2481. screenshot_base64=screenshot_base64,
  2482. page_url=await self.get_page_url(),
  2483. status='failed'
  2484. )
  2485. self.report_progress(80, "等待发布完成...")
  2486. # 记录点击发布前的 URL
  2487. publish_page_url = self.page.url
  2488. print(f"[{self.platform_name}] 发布前 URL: {publish_page_url}")
  2489. # 等待发布完成(百家号审核/处理链路可能较慢,默认等待 15 分钟)
  2490. publish_timeout = 900
  2491. start_time = asyncio.get_event_loop().time()
  2492. last_url = publish_page_url
  2493. republish_click_count = 0
  2494. republish_attempt_count = 0
  2495. last_republish_attempt_time = 0.0
  2496. republish_attempt_interval = 45 # 失败后至少间隔 45s 再尝试,避免刷屏和误操作
  2497. max_republish_attempts = 2
  2498. while asyncio.get_event_loop().time() - start_time < publish_timeout:
  2499. await asyncio.sleep(3)
  2500. current_url = self.page.url
  2501. # 检测 URL 是否发生变化
  2502. if current_url != last_url:
  2503. print(f"[{self.platform_name}] URL 变化: {last_url} -> {current_url}")
  2504. last_url = current_url
  2505. # 检查是否跳转到内容管理页面(真正的成功标志)
  2506. # 百家号发布成功后会跳转到 /builder/rc/content 页面
  2507. if '/builder/rc/content' in current_url and 'edit' not in current_url:
  2508. self.report_progress(100, "发布成功!")
  2509. print(f"[{self.platform_name}] 发布成功,已跳转到内容管理页: {current_url}")
  2510. screenshot_base64 = await self.capture_screenshot()
  2511. return PublishResult(
  2512. success=True,
  2513. platform=self.platform_name,
  2514. message="发布成功",
  2515. screenshot_base64=screenshot_base64,
  2516. page_url=current_url,
  2517. status='success'
  2518. )
  2519. # 检查是否有明确的成功提示弹窗
  2520. try:
  2521. # 百家号发布成功会显示"发布成功"弹窗
  2522. success_modal = self.page.locator('div:has-text("发布成功"), div:has-text("提交成功"), div:has-text("视频发布成功")').first
  2523. if await success_modal.count() > 0 and await success_modal.is_visible():
  2524. self.report_progress(100, "发布成功!")
  2525. print(f"[{self.platform_name}] 检测到发布成功弹窗")
  2526. screenshot_base64 = await self.capture_screenshot()
  2527. # 等待一下看是否会跳转
  2528. await asyncio.sleep(3)
  2529. return PublishResult(
  2530. success=True,
  2531. platform=self.platform_name,
  2532. message="发布成功",
  2533. screenshot_base64=screenshot_base64,
  2534. page_url=self.page.url,
  2535. status='success'
  2536. )
  2537. except Exception as e:
  2538. print(f"[{self.platform_name}] 检测成功提示异常: {e}")
  2539. # 检查是否有错误提示
  2540. try:
  2541. error_selectors = [
  2542. 'div.error-tip',
  2543. 'div[class*="error-msg"]',
  2544. 'span[class*="error"]',
  2545. 'div:has-text("发布失败")',
  2546. 'div:has-text("提交失败")',
  2547. ]
  2548. for error_selector in error_selectors:
  2549. error_el = self.page.locator(error_selector).first
  2550. if await error_el.count() > 0 and await error_el.is_visible():
  2551. error_text = await error_el.text_content()
  2552. if error_text and error_text.strip():
  2553. print(f"[{self.platform_name}] 检测到错误: {error_text}")
  2554. screenshot_base64 = await self.capture_screenshot()
  2555. return PublishResult(
  2556. success=False,
  2557. platform=self.platform_name,
  2558. error=f"发布失败: {error_text.strip()}",
  2559. screenshot_base64=screenshot_base64,
  2560. page_url=current_url,
  2561. status='failed'
  2562. )
  2563. except Exception as e:
  2564. print(f"[{self.platform_name}] 检测错误提示异常: {e}")
  2565. # 检查验证码
  2566. captcha_result = await self.check_captcha()
  2567. if captcha_result['need_captcha']:
  2568. screenshot_base64 = await self.capture_screenshot()
  2569. return PublishResult(
  2570. success=False,
  2571. platform=self.platform_name,
  2572. error=f"发布过程中需要{captcha_result['captcha_type']}验证码",
  2573. need_captcha=True,
  2574. captcha_type=captcha_result['captcha_type'],
  2575. screenshot_base64=screenshot_base64,
  2576. page_url=current_url,
  2577. status='need_captcha'
  2578. )
  2579. # 检查发布按钮状态(如果还在编辑页面)
  2580. if 'edit' in current_url:
  2581. try:
  2582. is_processing = False
  2583. for indicator in publish_processing_indicators:
  2584. loc = self.page.locator(indicator).first
  2585. if await loc.count() > 0 and await loc.is_visible():
  2586. is_processing = True
  2587. print(f"[{self.platform_name}] 正在处理中...")
  2588. break
  2589. if not is_processing:
  2590. # 如果不是在处理中,按节流策略尝试重新点击发布按钮
  2591. now_loop = asyncio.get_event_loop().time()
  2592. elapsed = now_loop - start_time
  2593. if (
  2594. elapsed > 60
  2595. and republish_attempt_count < max_republish_attempts
  2596. and (now_loop - last_republish_attempt_time) >= republish_attempt_interval
  2597. ):
  2598. last_republish_attempt_time = now_loop
  2599. republish_attempt_count += 1
  2600. print(f"[{self.platform_name}] 发布状态未变化,执行第 {republish_attempt_count}/{max_republish_attempts} 次补点发布...")
  2601. republish_done = False
  2602. republish_candidates, _, _ = await _collect_publish_candidates(prefer_confirm=True)
  2603. for candidate in republish_candidates[:6]:
  2604. ok, err = await _click_publish_candidate(candidate)
  2605. if ok:
  2606. republish_done = True
  2607. republish_click_count += 1
  2608. candidate_text = _compact_btn_text(str(candidate.get("text") or ""))
  2609. print(f"[{self.platform_name}] 重新点击发布按钮成功: text={candidate_text}, count={republish_click_count}")
  2610. break
  2611. if err:
  2612. publish_click_error = err
  2613. if not republish_done:
  2614. print(f"[{self.platform_name}] 本轮未找到可用的立即发布按钮,继续等待状态变化")
  2615. except Exception as e:
  2616. print(f"[{self.platform_name}] 检查处理状态异常: {e}")
  2617. # 超时,获取截图分析最终状态
  2618. print(f"[{self.platform_name}] 发布超时,最终 URL: {self.page.url}")
  2619. screenshot_base64 = await self.capture_screenshot()
  2620. # 最后一次检查是否在内容管理页
  2621. final_url = self.page.url
  2622. if '/builder/rc/content' in final_url and 'edit' not in final_url:
  2623. return PublishResult(
  2624. success=True,
  2625. platform=self.platform_name,
  2626. message="发布成功(延迟确认)",
  2627. screenshot_base64=screenshot_base64,
  2628. page_url=final_url,
  2629. status='success'
  2630. )
  2631. # 超时后兜底:跳转内容管理页按标题校验,避免“已发布但未跳转”误判失败
  2632. print(f"[{self.platform_name}] 超时后执行内容页二次校验,title={params.title}")
  2633. verify_deadline = asyncio.get_event_loop().time() + 120 # 最多再校验 2 分钟
  2634. while asyncio.get_event_loop().time() < verify_deadline:
  2635. if await self._verify_publish_from_content_page(params.title, page_size=20):
  2636. screenshot_base64 = await self.capture_screenshot()
  2637. return PublishResult(
  2638. success=True,
  2639. platform=self.platform_name,
  2640. message="发布成功(内容页校验)",
  2641. screenshot_base64=screenshot_base64,
  2642. page_url=self.page.url,
  2643. status='success'
  2644. )
  2645. await asyncio.sleep(8)
  2646. return PublishResult(
  2647. success=False,
  2648. platform=self.platform_name,
  2649. error="发布超时,请手动检查发布状态",
  2650. screenshot_base64=screenshot_base64,
  2651. page_url=final_url,
  2652. status='need_action'
  2653. )
  2654. async def get_works(self, cookies: str, page: int = 0, page_size: int = 20) -> WorksResult:
  2655. """
  2656. 获取百家号作品列表
  2657. 优先使用内容管理页的接口(pcui/article/lists)。
  2658. 说明:
  2659. - 该接口通常需要自定义请求头 token(JWT),仅靠 Cookie 可能会返回“未登录”
  2660. - 这里使用 Playwright 打开内容页,从 localStorage/sessionStorage/页面脚本中自动提取 token,
  2661. 再在页面上下文中发起 fetch(携带 cookie + token),以提高成功率
  2662. """
  2663. import re
  2664. print(f"\n{'='*60}")
  2665. print(f"[{self.platform_name}] 获取作品列表 (使用 API)")
  2666. print(f"[{self.platform_name}] page={page}, page_size={page_size}")
  2667. print(f"{'='*60}")
  2668. works: List[WorkItem] = []
  2669. total = 0
  2670. has_more = False
  2671. next_page = ""
  2672. try:
  2673. # 解析并设置 cookies(Playwright)
  2674. cookie_list = self.parse_cookies(cookies)
  2675. await self.init_browser()
  2676. await self.set_cookies(cookie_list)
  2677. if not self.page:
  2678. raise Exception("Page not initialized")
  2679. # 先打开内容管理页,确保本页 Referer/会话就绪
  2680. # Node 侧传 page=0,1,...;接口 currentPage 为 1,2,...
  2681. current_page = int(page) + 1
  2682. page_size = int(page_size)
  2683. content_url = (
  2684. "https://baijiahao.baidu.com/builder/rc/content"
  2685. f"?currentPage={current_page}&pageSize={page_size}"
  2686. "&search=&type=&collection=&startDate=&endDate="
  2687. )
  2688. await self.page.goto(content_url, wait_until="domcontentloaded", timeout=60000)
  2689. await asyncio.sleep(2)
  2690. # 1) 提取 token(JWT)
  2691. token = await self.page.evaluate(
  2692. """
  2693. () => {
  2694. const isJwtLike = (v) => {
  2695. if (!v || typeof v !== 'string') return false;
  2696. const s = v.trim();
  2697. if (s.length < 60) return false;
  2698. const parts = s.split('.');
  2699. if (parts.length !== 3) return false;
  2700. return parts.every(p => /^[A-Za-z0-9_-]+$/.test(p) && p.length > 10);
  2701. };
  2702. const pickFromStorage = (storage) => {
  2703. try {
  2704. const keys = Object.keys(storage || {});
  2705. for (const k of keys) {
  2706. const v = storage.getItem(k);
  2707. if (isJwtLike(v)) return v;
  2708. }
  2709. } catch {}
  2710. return "";
  2711. };
  2712. // localStorage / sessionStorage
  2713. let t = pickFromStorage(window.localStorage);
  2714. if (t) return t;
  2715. t = pickFromStorage(window.sessionStorage);
  2716. if (t) return t;
  2717. // meta 标签
  2718. const meta = document.querySelector('meta[name="token"], meta[name="bjh-token"]');
  2719. const metaToken = meta && meta.getAttribute('content');
  2720. if (isJwtLike(metaToken)) return metaToken;
  2721. // 简单从全局变量里找
  2722. const candidates = [
  2723. (window.__INITIAL_STATE__ && window.__INITIAL_STATE__.token) || "",
  2724. (window.__PRELOADED_STATE__ && window.__PRELOADED_STATE__.token) || "",
  2725. (window.__NUXT__ && window.__NUXT__.state && window.__NUXT__.state.token) || "",
  2726. ];
  2727. for (const c of candidates) {
  2728. if (isJwtLike(c)) return c;
  2729. }
  2730. return "";
  2731. }
  2732. """
  2733. )
  2734. # 2) 若仍未取到 token,再从页面 HTML 兜底提取
  2735. if not token:
  2736. html = await self.page.content()
  2737. m = re.search(r'([A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,})', html)
  2738. if m:
  2739. token = m.group(1)
  2740. if not token:
  2741. raise Exception("未能从页面提取 token(可能未登录或触发风控),请重新登录百家号账号后再试")
  2742. # 3) 调用接口(在页面上下文 fetch,自动携带 cookie)
  2743. api_url = (
  2744. "https://baijiahao.baidu.com/pcui/article/lists"
  2745. f"?currentPage={current_page}"
  2746. f"&pageSize={page_size}"
  2747. "&search=&type=&collection=&startDate=&endDate="
  2748. "&clearBeforeFetch=false"
  2749. "&dynamic=1"
  2750. )
  2751. resp = await self.page.evaluate(
  2752. """
  2753. async ({ url, token }) => {
  2754. const r = await fetch(url, {
  2755. method: 'GET',
  2756. credentials: 'include',
  2757. headers: {
  2758. 'accept': 'application/json, text/plain, */*',
  2759. ...(token ? { token } : {}),
  2760. },
  2761. });
  2762. const text = await r.text();
  2763. return { ok: r.ok, status: r.status, text };
  2764. }
  2765. """,
  2766. {"url": api_url, "token": token},
  2767. )
  2768. if not resp or not resp.get("ok"):
  2769. status = resp.get("status") if isinstance(resp, dict) else "unknown"
  2770. raise Exception(f"百家号接口请求失败: HTTP {status}")
  2771. api_result = json.loads(resp.get("text") or "{}")
  2772. print(f"[{self.platform_name}] pcui/article/lists 响应: errno={api_result.get('errno')}, errmsg={api_result.get('errmsg')}")
  2773. if api_result.get("errno") != 0:
  2774. errno = api_result.get("errno")
  2775. errmsg = api_result.get("errmsg", "unknown error")
  2776. # 20040001 常见为“未登录”
  2777. if errno in (110, 20040001):
  2778. raise Exception("百家号未登录或 Cookie/token 失效,请重新登录后再同步")
  2779. raise Exception(f"百家号接口错误: errno={errno}, errmsg={errmsg}")
  2780. data = api_result.get("data", {}) or {}
  2781. items = data.get("list", []) or []
  2782. page_info = data.get("page", {}) or {}
  2783. total = int(page_info.get("totalCount", 0) or 0)
  2784. total_page = int(page_info.get("totalPage", 0) or 0)
  2785. cur_page = int(page_info.get("currentPage", current_page) or current_page)
  2786. has_more = bool(total_page and cur_page < total_page)
  2787. next_page = cur_page + 1 if has_more else ""
  2788. print(f"[{self.platform_name}] 获取到 {len(items)} 个作品,总数: {total}, currentPage={cur_page}, totalPage={total_page}")
  2789. def _pick_cover(item: dict) -> str:
  2790. cover = item.get("crosswise_cover") or item.get("vertical_cover") or ""
  2791. if cover:
  2792. return cover
  2793. raw = item.get("cover_images") or ""
  2794. try:
  2795. # cover_images 可能是 JSON 字符串
  2796. parsed = json.loads(raw) if isinstance(raw, str) else raw
  2797. if isinstance(parsed, list) and parsed:
  2798. first = parsed[0]
  2799. if isinstance(first, dict):
  2800. return first.get("src") or first.get("ori_src") or ""
  2801. if isinstance(first, str):
  2802. return first
  2803. except Exception:
  2804. pass
  2805. return ""
  2806. def _pick_duration(item: dict) -> int:
  2807. for k in ("rmb_duration", "duration", "long"):
  2808. try:
  2809. v = int(item.get(k) or 0)
  2810. if v > 0:
  2811. return v
  2812. except Exception:
  2813. pass
  2814. # displaytype_exinfo 里可能有 ugcvideo.video_info.durationInSecond
  2815. ex = item.get("displaytype_exinfo") or ""
  2816. try:
  2817. exj = json.loads(ex) if isinstance(ex, str) and ex else (ex if isinstance(ex, dict) else {})
  2818. ugc = (exj.get("ugcvideo") or {}) if isinstance(exj, dict) else {}
  2819. vi = ugc.get("video_info") or {}
  2820. v = int(vi.get("durationInSecond") or ugc.get("long") or 0)
  2821. return v if v > 0 else 0
  2822. except Exception:
  2823. return 0
  2824. def _pick_status(item: dict) -> str:
  2825. qs = str(item.get("quality_status") or "").lower()
  2826. st = str(item.get("status") or "").lower()
  2827. if qs == "rejected" or "reject" in st:
  2828. return "rejected"
  2829. if st in ("draft", "unpublish", "unpublished"):
  2830. return "draft"
  2831. # 百家号常见 publish
  2832. return "published"
  2833. for item in items:
  2834. # 优先使用 nid(builder 预览链接使用这个)
  2835. work_id = str(item.get("nid") or item.get("feed_id") or item.get("article_id") or item.get("id") or "")
  2836. if not work_id:
  2837. continue
  2838. works.append(
  2839. WorkItem(
  2840. work_id=work_id,
  2841. title=str(item.get("title") or ""),
  2842. cover_url=_pick_cover(item),
  2843. video_url=str(item.get("url") or ""),
  2844. duration=_pick_duration(item),
  2845. status=_pick_status(item),
  2846. publish_time=str(item.get("publish_time") or item.get("publish_at") or item.get("created_at") or ""),
  2847. play_count=int(item.get("read_amount") or 0),
  2848. like_count=int(item.get("like_amount") or 0),
  2849. comment_count=int(item.get("comment_amount") or 0),
  2850. share_count=int(item.get("share_amount") or 0),
  2851. collect_count=int(item.get("collection_amount") or 0),
  2852. )
  2853. )
  2854. print(f"[{self.platform_name}] ✓ 成功解析 {len(works)} 个作品")
  2855. except Exception as e:
  2856. import traceback
  2857. traceback.print_exc()
  2858. return WorksResult(
  2859. success=False,
  2860. platform=self.platform_name,
  2861. error=str(e),
  2862. debug_info="baijiahao_get_works_failed"
  2863. )
  2864. return WorksResult(
  2865. success=True,
  2866. platform=self.platform_name,
  2867. works=works,
  2868. total=total,
  2869. has_more=has_more,
  2870. next_page=next_page
  2871. )
  async def get_all_works(self, cookies: str) -> WorksResult:
      """
      Fetch the full Baijiahao works list, following pagination automatically.

      One browser page is initialised up front and reused for every list
      request, instead of starting a new browser per page (the original
      author's stated reason: performance and risk-control triggering).

      Args:
          cookies: Cookie payload in whatever format ``self.parse_cookies``
              accepts.

      Returns:
          WorksResult: ``success=True`` with every work collected. If an
          exception interrupts the run after at least one work has been
          collected, the partial list is still returned as a success.
          ``success=False`` (with ``error`` set) is returned only when
          nothing was collected.
      """
      import re

      print(f"\n{'='*60}")
      print(f"[{self.platform_name}] 获取全部作品列表(自动分页)")
      print(f"{'='*60}")

      all_works: List[WorkItem] = []
      seen_ids = set()
      total = 0
      current_page = 1
      page_size = 20
      max_pages = 50  # hard cap on the number of pages requested

      def _to_int(value) -> int:
          """Convert an API counter to int; missing or malformed values become 0."""
          try:
              return int(value or 0)
          except (TypeError, ValueError):
              return 0

      def _pick_cover(item: dict) -> str:
          """Return the first usable cover URL of a list entry, or ""."""
          cover = item.get("crosswise_cover") or item.get("vertical_cover") or ""
          if cover:
              return cover
          raw = item.get("cover_images") or ""
          try:
              # cover_images may be a JSON-encoded string or an already decoded value
              parsed = json.loads(raw) if isinstance(raw, str) else raw
          except (TypeError, ValueError):
              return ""
          if isinstance(parsed, list) and parsed:
              first = parsed[0]
              if isinstance(first, dict):
                  return first.get("src") or first.get("ori_src") or ""
              if isinstance(first, str):
                  return first
          return ""

      def _pick_duration(item: dict) -> int:
          """Return the first positive duration found on a list entry, else 0."""
          for key in ("rmb_duration", "duration", "long"):
              seconds = _to_int(item.get(key))
              if seconds > 0:
                  return seconds
          # Fall back to displaytype_exinfo -> ugcvideo -> video_info.durationInSecond
          extra = item.get("displaytype_exinfo") or ""
          try:
              if isinstance(extra, str) and extra:
                  extra = json.loads(extra)
              if not isinstance(extra, dict):
                  return 0
              ugc = extra.get("ugcvideo") or {}
              info = ugc.get("video_info") or {}
              seconds = int(info.get("durationInSecond") or ugc.get("long") or 0)
          except (TypeError, ValueError, AttributeError):
              return 0
          return seconds if seconds > 0 else 0

      def _pick_status(item: dict) -> str:
          """Map the entry's status fields to "rejected", "draft" or "published"."""
          qs = str(item.get("quality_status") or "").lower()
          st = str(item.get("status") or "").lower()
          if qs == "rejected" or "reject" in st:
              return "rejected"
          if st in ("draft", "unpublish", "unpublished"):
              return "draft"
          return "published"

      try:
          cookie_list = self.parse_cookies(cookies)
          await self.init_browser()
          await self.set_cookies(cookie_list)
          if not self.page:
              raise Exception("Page not initialized")

          # Open the content-management page to establish the session and
          # to have a document from which the token can be extracted.
          content_url = (
              "https://baijiahao.baidu.com/builder/rc/content"
              f"?currentPage={current_page}&pageSize={page_size}"
              "&search=&type=&collection=&startDate=&endDate="
          )
          await self.page.goto(content_url, wait_until="domcontentloaded", timeout=60000)
          await asyncio.sleep(3)

          # A redirect to the passport/login page means the cookies are no longer valid.
          current_url = self.page.url
          if "passport.baidu.com" in current_url or "login" in current_url:
              raise Exception("Cookie 已过期,请重新登录百家号账号")

          # Look for a JWT-shaped token in web storage, <meta> tags and global state objects.
          token = await self.page.evaluate(
              """
              () => {
                const isJwtLike = (v) => {
                  if (!v || typeof v !== 'string') return false;
                  const s = v.trim();
                  if (s.length < 60) return false;
                  const parts = s.split('.');
                  if (parts.length !== 3) return false;
                  return parts.every(p => /^[A-Za-z0-9_-]+$/.test(p) && p.length > 10);
                };
                const pickFromStorage = (storage) => {
                  try {
                    const keys = Object.keys(storage || {});
                    for (const k of keys) {
                      const v = storage.getItem(k);
                      if (isJwtLike(v)) return v;
                    }
                  } catch {}
                  return "";
                };
                let t = pickFromStorage(window.localStorage);
                if (t) return t;
                t = pickFromStorage(window.sessionStorage);
                if (t) return t;
                const meta = document.querySelector('meta[name="token"], meta[name="bjh-token"]');
                const metaToken = meta && meta.getAttribute('content');
                if (isJwtLike(metaToken)) return metaToken;
                const candidates = [
                  (window.__INITIAL_STATE__ && window.__INITIAL_STATE__.token) || "",
                  (window.__PRELOADED_STATE__ && window.__PRELOADED_STATE__.token) || "",
                  (window.__NUXT__ && window.__NUXT__.state && window.__NUXT__.state.token) || "",
                ];
                for (const c of candidates) {
                  if (isJwtLike(c)) return c;
                }
                return "";
              }
              """
          )
          if not token:
              # Last resort: scan the raw HTML for the first three-segment dotted string.
              html = await self.page.content()
              m = re.search(r'([A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,})', html)
              if m:
                  token = m.group(1)
          if not token:
              raise Exception("未能从页面提取 token(可能未登录或触发风控),请重新登录百家号账号后再试")
          print(f"[{self.platform_name}] ✓ Token 提取成功")

          # Pagination loop
          for page_iter in range(max_pages):
              page_num = page_iter + 1  # currentPage is 1-based
              api_url = (
                  "https://baijiahao.baidu.com/pcui/article/lists"
                  f"?currentPage={page_num}"
                  f"&pageSize={page_size}"
                  "&search=&type=&collection=&startDate=&endDate="
                  "&clearBeforeFetch=false"
                  "&dynamic=1"
              )
              # Issue the request from inside the page so it carries the browser's cookies.
              resp = await self.page.evaluate(
                  """
                  async ({ url, token }) => {
                    const r = await fetch(url, {
                      method: 'GET',
                      credentials: 'include',
                      headers: {
                        'accept': 'application/json, text/plain, */*',
                        ...(token ? { token } : {}),
                      },
                    });
                    const text = await r.text();
                    return { ok: r.ok, status: r.status, text };
                  }
                  """,
                  {"url": api_url, "token": token},
              )
              if not resp or not resp.get("ok"):
                  http_status = resp.get('status') if isinstance(resp, dict) else 'unknown'
                  print(f"[{self.platform_name}] 第 {page_num} 页请求失败: HTTP {http_status}")
                  break

              api_result = json.loads(resp.get("text") or "{}")
              errno = api_result.get("errno", -1)
              if errno != 0:
                  errmsg = api_result.get("errmsg", "unknown error")
                  print(f"[{self.platform_name}] 第 {page_num} 页接口错误: errno={errno}, errmsg={errmsg}")
                  if errno in (110, 20040001):
                      raise Exception("百家号未登录或 Cookie/token 失效,请重新登录后再同步")
                  # Any other API error simply ends pagination.
                  break

              data = api_result.get("data", {}) or {}
              items = data.get("list", []) or []
              page_info = data.get("page", {}) or {}
              if page_iter == 0:
                  total = _to_int(page_info.get("totalCount"))
                  print(f"[{self.platform_name}] 作品总数: {total}")

              new_count = 0
              for item in items:
                  if not isinstance(item, dict):
                      # A malformed entry must not abort the whole sync.
                      continue
                  work_id = str(item.get("nid") or item.get("feed_id") or item.get("article_id") or item.get("id") or "")
                  if not work_id or work_id in seen_ids:
                      continue
                  seen_ids.add(work_id)
                  new_count += 1
                  all_works.append(
                      WorkItem(
                          work_id=work_id,
                          title=str(item.get("title") or ""),
                          cover_url=_pick_cover(item),
                          video_url=str(item.get("url") or ""),
                          duration=_pick_duration(item),
                          status=_pick_status(item),
                          publish_time=str(item.get("publish_time") or item.get("publish_at") or item.get("created_at") or ""),
                          play_count=_to_int(item.get("read_amount")),
                          like_count=_to_int(item.get("like_amount")),
                          comment_count=_to_int(item.get("comment_amount")),
                          share_count=_to_int(item.get("share_amount")),
                          collect_count=_to_int(item.get("collection_amount")),
                      )
                  )

              total_page = _to_int(page_info.get("totalPage"))
              has_more = bool(total_page and page_num < total_page)
              print(f"[{self.platform_name}] 第 {page_num}/{total_page or '?'} 页: 获取 {new_count} 个新作品, 累计 {len(all_works)}")
              if not has_more or len(items) == 0 or new_count == 0:
                  break
              # Short pause between pages to avoid tripping risk control.
              await asyncio.sleep(1)

          print(f"[{self.platform_name}] ✓ 自动分页完成,共获取 {len(all_works)} 个作品")
      except Exception as e:
          import traceback
          traceback.print_exc()
          if not all_works:
              return WorksResult(
                  success=False,
                  platform=self.platform_name,
                  error=str(e),
                  debug_info="baijiahao_get_all_works_failed"
              )
          # Works collected before the failure are still returned as a success.
          print(f"[{self.platform_name}] 虽有异常但已获取 {len(all_works)} 个作品,正常返回")

      return WorksResult(
          success=True,
          platform=self.platform_name,
          works=all_works,
          total=total or len(all_works),
          has_more=False,
          next_page="",
      )
  async def get_article_stats(
      self,
      cookies: str,
      start_day: str,
      end_day: str,
      stat_type: str,
      num: int,
      count: int,
  ) -> dict:
      """
      Call Baijiahao's /author/eco/statistics/articleListStatistic endpoint.

      The request is made with aiohttp and the account cookies only; no
      browser and no page token are involved.

      Args:
          cookies: Cookie payload accepted by ``self.parse_cookies``.
          start_day: Sent as the ``start_day`` query parameter.
          end_day: Sent as the ``end_day`` query parameter.
          stat_type: Sent as the ``type`` query parameter.
          num: Sent as the ``num`` query parameter.
          count: Sent as the ``count`` query parameter.

      Returns:
          dict with keys ``success`` (HTTP 200 and errno == 0), ``status``,
          ``errno``, ``errmsg`` and ``data`` (the response's ``data`` field,
          or None when the body is not a JSON object).

      Raises:
          Exception: Whatever aiohttp raises for the main request, including
              the JSON decoding error when the body is not JSON (the body is
              logged first, then the error is re-raised).
      """
      import aiohttp
      from urllib.parse import urlencode

      print(f"[{self.platform_name}] get_article_stats: {start_day}-{end_day}, type={stat_type}, num={num}, count={count}")

      # Turn the stored cookies into a name -> value mapping for aiohttp.
      cookie_list = self.parse_cookies(cookies)
      cookie_dict = {c['name']: c['value'] for c in cookie_list}

      session_headers = {
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
      }
      headers = {
          'Accept': 'application/json, text/plain, */*',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Referer': 'https://baijiahao.baidu.com/builder/rc/analysiscontent/single',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
      }

      async with aiohttp.ClientSession(cookies=cookie_dict) as session:
          # 0) Warm up by visiting the "single" analysis page first. The
          #    response is opened as a context manager so its connection is
          #    released instead of being left for the garbage collector.
          try:
              async with session.get(
                  'https://baijiahao.baidu.com/builder/rc/analysiscontent/single',
                  headers=session_headers,
                  timeout=aiohttp.ClientTimeout(total=20),
              ):
                  pass
          except Exception as e:
              print(f"[{self.platform_name}] warmup single page failed (non-fatal): {e}")

          # 1) Call articleListStatistic; urlencode escapes every value.
          query = urlencode({
              "start_day": start_day,
              "end_day": end_day,
              "type": stat_type,
              "num": num,
              "count": count,
          })
          api_url = (
              "https://baijiahao.baidu.com/author/eco/statistics/articleListStatistic"
              f"?{query}"
          )
          async with session.get(
              api_url,
              headers=headers,
              timeout=aiohttp.ClientTimeout(total=30),
          ) as resp:
              status = resp.status
              try:
                  data = await resp.json()
              except Exception:
                  text = await resp.text()
                  print(f"[{self.platform_name}] articleListStatistic non-JSON response: {text[:1000]}")
                  raise

          # A JSON body that is not an object (list, string, null) carries no errno/data.
          payload = data if isinstance(data, dict) else {}
          errno = payload.get('errno')
          errmsg = payload.get('errmsg')
          print(f"[{self.platform_name}] articleListStatistic: http={status}, errno={errno}, msg={errmsg}")
          return {
              "success": status == 200 and errno == 0,
              "status": status,
              "errno": errno,
              "errmsg": errmsg,
              "data": payload.get('data'),
          }
  async def get_trend_data(
      self,
      cookies: str,
      nid: str,
  ) -> dict:
      """
      Call Baijiahao's /author/eco/statistic/gettrenddata endpoint for one work.

      The request always uses ``trend_type=all`` and ``data_type=addition``.
      The original notes describe the payload as per-day statistics
      (``basic_list``); that shape is not verified by this method.

      Args:
          cookies: Cookie payload accepted by ``self.parse_cookies``.
          nid: Work identifier, sent as the ``nid`` query parameter.

      Returns:
          dict with keys ``success`` (HTTP 200 and errno == 0), ``status``,
          ``errno``, ``errmsg`` and ``data`` (the response's ``data`` field,
          or None when the body is not a JSON object).

      Raises:
          Exception: Whatever aiohttp raises for the main request, including
              the JSON decoding error when the body is not JSON (the body is
              logged first, then the error is re-raised).
      """
      import aiohttp
      from urllib.parse import urlencode

      print(f"[{self.platform_name}] get_trend_data: nid={nid}")

      cookie_list = self.parse_cookies(cookies)
      cookie_dict = {c['name']: c['value'] for c in cookie_list}

      session_headers = {
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
      }
      headers = {
          'Accept': 'application/json, text/plain, */*',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Referer': 'https://baijiahao.baidu.com/builder/rc/analysiscontent/single',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
      }

      async with aiohttp.ClientSession(cookies=cookie_dict) as session:
          # 0) Warm-up request; opened as a context manager so the response
          #    (and its connection) is released rather than leaked.
          try:
              async with session.get(
                  'https://baijiahao.baidu.com/builder/rc/analysiscontent/single',
                  headers=session_headers,
                  timeout=aiohttp.ClientTimeout(total=20),
              ):
                  pass
          except Exception as e:
              print(f"[{self.platform_name}] warmup single page (trend) failed (non-fatal): {e}")

          # urlencode escapes nid, so unusual characters cannot break the URL.
          query = urlencode({
              "nid": nid,
              "trend_type": "all",
              "data_type": "addition",
          })
          api_url = (
              "https://baijiahao.baidu.com/author/eco/statistic/gettrenddata"
              f"?{query}"
          )
          async with session.get(
              api_url,
              headers=headers,
              timeout=aiohttp.ClientTimeout(total=30),
          ) as resp:
              status = resp.status
              try:
                  data = await resp.json()
              except Exception:
                  text = await resp.text()
                  print(f"[{self.platform_name}] gettrenddata non-JSON response: {text[:1000]}")
                  raise

          # A JSON body that is not an object (list, string, null) carries no errno/data.
          payload = data if isinstance(data, dict) else {}
          errno = payload.get('errno')
          errmsg = payload.get('errmsg')
          print(f"[{self.platform_name}] gettrenddata: http={status}, errno={errno}, msg={errmsg}")
          return {
              "success": status == 200 and errno == 0,
              "status": status,
              "errno": errno,
              "errmsg": errmsg,
              "data": payload.get('data'),
          }
  async def get_app_statistic_v3(
      self,
      cookies: str,
      start_day: str,
      end_day: str,
  ) -> dict:
      """
      Call Baijiahao's /author/eco/statistics/appStatisticV3 endpoint.

      The request is sent with ``type=all``, ``stat=0`` and
      ``special_filter_days=30`` using the stored account cookies; no
      browser is started. Per the original notes it provides account-level
      base data for the user daily-data sync.

      Args:
          cookies: Cookie payload accepted by ``self.parse_cookies``.
          start_day: Sent as the ``start_day`` query parameter.
          end_day: Sent as the ``end_day`` query parameter.

      Returns:
          The decoded JSON object unchanged, or
          ``{"errno": -1, "errmsg": "invalid response", "data": None}`` when
          the body decodes to something other than a JSON object.

      Raises:
          Exception: Whatever aiohttp raises for the main request, including
              the JSON decoding error when the body is not JSON (the body is
              logged first, then the error is re-raised).
      """
      import aiohttp
      from urllib.parse import urlencode

      print(f"[{self.platform_name}] get_app_statistic_v3: {start_day}-{end_day}")

      cookie_list = self.parse_cookies(cookies)
      cookie_dict = {c['name']: c['value'] for c in cookie_list}

      session_headers = {
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
      }
      headers = {
          'Accept': 'application/json, text/plain, */*',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Referer': 'https://baijiahao.baidu.com/builder/rc/analysiscontent',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
      }

      async with aiohttp.ClientSession(cookies=cookie_dict) as session:
          # Warm-up: visit the analysis page first, as the back office does.
          # The response is opened as a context manager so its connection is
          # released instead of being left for the garbage collector.
          try:
              async with session.get(
                  'https://baijiahao.baidu.com/builder/rc/analysiscontent',
                  headers=session_headers,
                  timeout=aiohttp.ClientTimeout(total=20),
              ):
                  pass
          except Exception as e:
              print(f"[{self.platform_name}] warmup analysiscontent failed (non-fatal): {e}")

          # urlencode escapes the caller-supplied dates.
          query = urlencode({
              "type": "all",
              "start_day": start_day,
              "end_day": end_day,
              "stat": 0,
              "special_filter_days": 30,
          })
          api_url = (
              "https://baijiahao.baidu.com/author/eco/statistics/appStatisticV3"
              f"?{query}"
          )
          async with session.get(
              api_url,
              headers=headers,
              timeout=aiohttp.ClientTimeout(total=30),
          ) as resp:
              status = resp.status
              try:
                  data = await resp.json()
              except Exception:
                  text = await resp.text()
                  print(f"[{self.platform_name}] appStatisticV3 non-JSON: {text[:1000]}")
                  raise

          is_object = isinstance(data, dict)
          errno = data.get('errno') if is_object else None
          errmsg = data.get('errmsg') if is_object else None
          print(f"[{self.platform_name}] appStatisticV3: http={status}, errno={errno}, msg={errmsg}")
          return data if is_object else {"errno": -1, "errmsg": "invalid response", "data": None}
  async def get_fans_basic_info(
      self,
      cookies: str,
      start: str,
      end: str,
  ) -> dict:
      """
      Call Baijiahao's /author/eco/statistics/getFansBasicInfo endpoint.

      The request is sent with ``fans_type=new,sum``, ``sort=asc``,
      ``is_page=0`` and ``show_type=chart`` using the stored account
      cookies. Per the original notes it provides follower data for the
      user daily-data sync.

      Args:
          cookies: Cookie payload accepted by ``self.parse_cookies``.
          start: Sent as the ``start`` query parameter.
          end: Sent as the ``end`` query parameter.

      Returns:
          The decoded JSON object unchanged, or
          ``{"errno": -1, "errmsg": "invalid response", "data": None}`` when
          the body decodes to something other than a JSON object.

      Raises:
          Exception: Whatever aiohttp raises for the main request, including
              the JSON decoding error when the body is not JSON (the body is
              logged first, then the error is re-raised).
      """
      import aiohttp
      from urllib.parse import urlencode

      print(f"[{self.platform_name}] get_fans_basic_info: {start}-{end}")

      cookie_list = self.parse_cookies(cookies)
      cookie_dict = {c['name']: c['value'] for c in cookie_list}

      session_headers = {
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
      }
      headers = {
          'Accept': 'application/json, text/plain, */*',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Referer': 'https://baijiahao.baidu.com/builder/rc/analysisfans/basedata',
          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
      }

      async with aiohttp.ClientSession(cookies=cookie_dict) as session:
          # Warm-up request; opened as a context manager so the response
          # (and its connection) is released rather than leaked.
          try:
              async with session.get(
                  'https://baijiahao.baidu.com/builder/rc/analysisfans/basedata',
                  headers=session_headers,
                  timeout=aiohttp.ClientTimeout(total=20),
              ):
                  pass
          except Exception as e:
              print(f"[{self.platform_name}] warmup analysisfans/basedata failed (non-fatal): {e}")

          # urlencode escapes the caller-supplied dates and renders the
          # comma in fans_type as %2C, exactly as the endpoint was called before.
          query = urlencode({
              "start": start,
              "end": end,
              "fans_type": "new,sum",
              "sort": "asc",
              "is_page": 0,
              "show_type": "chart",
          })
          api_url = (
              "https://baijiahao.baidu.com/author/eco/statistics/getFansBasicInfo"
              f"?{query}"
          )
          async with session.get(
              api_url,
              headers=headers,
              timeout=aiohttp.ClientTimeout(total=30),
          ) as resp:
              status = resp.status
              try:
                  data = await resp.json()
              except Exception:
                  text = await resp.text()
                  print(f"[{self.platform_name}] getFansBasicInfo non-JSON: {text[:1000]}")
                  raise

          is_object = isinstance(data, dict)
          errno = data.get('errno') if is_object else None
          errmsg = data.get('errmsg') if is_object else None
          print(f"[{self.platform_name}] getFansBasicInfo: http={status}, errno={errno}, msg={errmsg}")
          return data if is_object else {"errno": -1, "errmsg": "invalid response", "data": None}
  async def check_login_status(self, cookies: str) -> dict:
      """
      Check whether the Baijiahao cookies still represent a logged-in session.

      This override only logs a line and then defers to the parent class's
      ``check_login_status``. According to the original notes, that inherited
      check opens the back-office page in a browser and decides validity
      from a redirect to the login page or a login / risk-control prompt,
      the same way the other platforms do (not visible in this method).

      Args:
          cookies: Cookie payload forwarded unchanged to the parent check.

      Returns:
          Whatever dict the parent implementation returns.
      """
      print(f"[{self.platform_name}] 检查登录状态 (使用通用浏览器逻辑)")
      # Delegate entirely to the shared implementation.
      login_state = await super().check_login_status(cookies)
      return login_state
  async def get_comments(self, cookies: str, work_id: str, cursor: str = "") -> CommentsResult:
      """
      Scrape comments from Baijiahao's "all comments" management page.

      The page only offers a single stream of all comments, laid out as
      div.client_pages_newComment_comment_all_list > div.cheetah-ui-pro-scroll-view >
      div.list-container > div.client_pages_newComment_comment_all_listItem.
      No filtering by work is performed: every comment found on the page is
      returned and tagged with the requested ``work_id``. The title of the
      work each comment belongs to is kept on ``work_title``.

      Args:
          cookies: Cookie payload accepted by ``self.parse_cookies``.
          work_id: Identifier stamped onto every returned comment.
          cursor: Accepted for interface compatibility; not used here.

      Returns:
          CommentsResult: ``success=True`` with the scraped comments and
          ``has_more=False``, or ``success=False`` with ``error`` set when
          the page cannot be loaded or any other exception occurs.
      """
      banner = '=' * 60
      print(f"\n{banner}")
      print(f"[{self.platform_name}] 获取作品评论")
      print(f"[{self.platform_name}] work_id={work_id}")
      print(f"{banner}")

      comments: List[CommentItem] = []

      try:
          # Start the browser and install the cookies.
          await self.init_browser()
          await self.set_cookies(self.parse_cookies(cookies))
          if not self.page:
              raise Exception("Page not initialized")
          page = self.page

          # Open the comment-management page (full comment list).
          comment_url = "https://baijiahao.baidu.com/builder/rc/commentmanage/comment/all"
          print(f"[{self.platform_name}] 正在打开评论页面: {comment_url}")
          await page.goto(comment_url, timeout=30000)
          await asyncio.sleep(3)

          # A redirect to a login page means the cookies have expired.
          landed_url = page.url
          if "login" in landed_url or "passport.baidu.com" in landed_url:
              raise Exception("Cookie 已过期,请重新登录")

          # Wait for the comment list container to appear.
          container_selector = ".client_pages_newComment_comment_all_list .cheetah-ui-pro-scroll-view .list-container"
          print(f"[{self.platform_name}] 等待评论列表 DOM 加载...")
          try:
              await page.wait_for_selector(container_selector, timeout=15000)
          except Exception as e:
              print(f"[{self.platform_name}] 等待评论列表失败: {e}")
              return CommentsResult(
                  success=False,
                  platform=self.platform_name,
                  work_id=work_id,
                  error=f"无法加载评论列表: {e}",
              )

          # Five best-effort rounds of "scroll to the bottom, then click load-more".
          try:
              for _ in range(5):
                  # Scroll the comment list (or the window when the list is missing).
                  await page.evaluate(
                      """
                      () => {
                        const container = document.querySelector('.client_pages_newComment_comment_all_list .cheetah-ui-pro-scroll-view .list-container');
                        if (container) {
                          container.scrollTop = container.scrollHeight;
                        } else {
                          window.scrollBy(0, 600);
                        }
                      }
                      """
                  )
                  await asyncio.sleep(1)
                  # Click the "load more" control when it exists.
                  await page.evaluate(
                      """
                      () => {
                        const more = document.querySelector('.client_pages_newComment_components_loadMore, .client_pages_newComment_comment_all_list .client_pages_newComment_components_loadMore');
                        if (more) {
                          (more.querySelector('.more-arrow') || more).click();
                        }
                      }
                      """
                  )
                  await asyncio.sleep(1.2)
          except Exception as e:
              print(f"[{self.platform_name}] 下拉/加载更多异常(非致命): {e}")

          # Read the comment records out of the DOM.
          # NOTE(review): comment_id is generated from Date.now() and
          # Math.random(), so the same comment gets a different id on every
          # scrape - confirm that downstream de-duplication does not rely on it.
          print(f"[{self.platform_name}] 从 DOM 提取评论数据...")
          raw_comments = await page.evaluate(
              """
              () => {
                const result = [];
                const listRoot = document.querySelector('.client_pages_newComment_comment_all_list .cheetah-ui-pro-scroll-view .list-container');
                if (!listRoot) {
                  return result;
                }
                const items = listRoot.querySelectorAll('.client_pages_newComment_comment_all_listItem');
                items.forEach((item) => {
                  try {
                    const avatarImg = item.querySelector('.comment-card-avatar-wrapper-card-avatar img');
                    const nameEl = item.querySelector('.content-wrapper .user-container .name');
                    const titleContentEl = item.querySelector('.title-wrapper .title-content');
                    const contentEl = item.querySelector('.content-wrapper .content .content-w-highlight, .content-wrapper .content');
                    const infoWrapper = item.querySelector('.content-wrapper .info-wrapper');
                    const timeEl = infoWrapper ? infoWrapper.querySelector('span:nth-child(1)') : null;
                    const replyInfoEl = infoWrapper ? infoWrapper.querySelector('span:nth-child(2)') : null;
                    const workTitle = titleContentEl ? titleContentEl.textContent.trim() : '';
                    const authorName = nameEl ? nameEl.textContent.trim() : '';
                    const avatar = avatarImg ? avatarImg.src : '';
                    const content = contentEl ? contentEl.textContent.trim() : '';
                    const timeText = timeEl ? timeEl.textContent.trim() : '';
                    const replyText = replyInfoEl ? replyInfoEl.textContent.trim() : '';
                    const replyMatch = replyText.match(/(\\d+)/);
                    const replyCount = replyMatch ? parseInt(replyMatch[1], 10) : 0;
                    result.push({
                      comment_id: `bjh_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
                      parent_comment_id: "",
                      work_title: workTitle,
                      content,
                      username: authorName,
                      nickname: authorName,
                      avatar,
                      like_count: 0,
                      reply_count: replyCount,
                      create_time: timeText,
                      is_author: false,
                      create_time_unix: Date.now() / 1000,
                    });
                  } catch (e) {
                    // 单条解析失败忽略
                  }
                });
                return result;
              }
              """
          )
          if not isinstance(raw_comments, list):
              raw_comments = []
          print(f"[{self.platform_name}] 共抓取到原始评论条目: {len(raw_comments)}")

          # TODO: once a reliable work_id -> title mapping exists, filter by
          # work_id here. For now everything is returned to avoid losing data.
          for entry in raw_comments:
              record = CommentItem(
                  comment_id=entry.get("comment_id", ""),
                  parent_comment_id=entry.get("parent_comment_id", ""),
                  work_id=work_id,
                  content=entry.get("content", ""),
                  author_id=entry.get("username", ""),
                  author_name=entry.get("nickname", ""),
                  author_avatar=entry.get("avatar", ""),
                  like_count=entry.get("like_count", 0),
                  reply_count=entry.get("reply_count", 0),
                  create_time=entry.get("create_time", ""),
              )
              # These fields are assigned after construction, not passed to the constructor.
              record.is_author = entry.get("is_author", False)
              record.create_time_unix = entry.get("create_time_unix", 0)
              record.work_title = entry.get("work_title", "")
              comments.append(record)

          print(f"[{self.platform_name}] ✅ 最终整理出评论条数: {len(comments)}")
      except Exception as e:
          import traceback
          traceback.print_exc()
          return CommentsResult(
              success=False,
              platform=self.platform_name,
              work_id=work_id,
              error=str(e),
          )

      # Whether more comments exist cannot be determined from the page, so
      # has_more is always reported as False.
      return CommentsResult(
          success=True,
          platform=self.platform_name,
          work_id=work_id,
          comments=comments,
          total=len(comments),
          has_more=False,
      )