weixin.py 118 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880
  1. # -*- coding: utf-8 -*-
  2. """
  3. 微信视频号发布器
  4. 参考: matrix/tencent_uploader/main.py
  5. """
  6. import asyncio
  7. import json
  8. import os
  9. from datetime import datetime
  10. from typing import List
  11. from .base import (
  12. BasePublisher,
  13. PublishParams,
  14. PublishResult,
  15. WorkItem,
  16. WorksResult,
  17. CommentItem,
  18. CommentsResult,
  19. )
  20. import os
  21. import time
  # Allow the selector for the "upload video" entry point to be set manually via
  # an environment variable, so it can be adjusted quickly when the page
  # structure changes. Empty string (the default) means "not configured".
  WEIXIN_UPLOAD_SELECTOR = os.environ.get("WEIXIN_UPLOAD_SELECTOR", "").strip()
  def format_short_title(origin_title: str) -> str:
      """Build a short title from a full title.

      Alphanumeric characters and a small set of allowed symbols are kept,
      every "," becomes a single space, and all other characters are dropped.
      The result is truncated to 16 characters, or right-padded with spaces
      up to 6 characters when it is shorter than that.
      """
      allowed_special_chars = "《》:+?%°"
      kept = []
      for ch in origin_title:
          if ch.isalnum() or ch in allowed_special_chars:
              kept.append(ch)
          elif ch == ",":
              kept.append(" ")
          # Anything else is discarded.
      short_title = "".join(kept)
      if len(short_title) > 16:
          return short_title[:16]
      # ljust only pads when the title is shorter than 6 characters.
      return short_title.ljust(6)
  45. class WeixinPublisher(BasePublisher):
  46. """
  47. 微信视频号发布器
  48. 使用 Playwright 自动化操作视频号创作者中心
  49. 注意: 需要使用 Chrome 浏览器,否则可能出现 H264 编码错误
  50. """
  51. platform_name = "weixin"
  52. login_url = "https://channels.weixin.qq.com/platform"
  53. publish_url = "https://channels.weixin.qq.com/platform/post/create"
  54. cookie_domain = ".weixin.qq.com"
  55. def _parse_count(self, count_str: str) -> int:
  56. """解析数字(支持带'万'的格式)"""
  57. try:
  58. count_str = count_str.strip()
  59. if "万" in count_str:
  60. return int(float(count_str.replace("万", "")) * 10000)
  61. return int(count_str)
  62. except:
  63. return 0
  async def ai_find_upload_selector(
      self, frame_html: str, frame_name: str = "main"
  ) -> str:
      """Ask the AI text model for the CSS selector of the video-upload entry.

      Intended as a last resort after the regular DOM selectors have failed,
      so the AI quota is not consumed on every run. The request goes to the
      OpenAI-compatible ``/chat/completions`` endpoint configured through the
      ``DASHSCOPE_API_KEY``, ``DASHSCOPE_BASE_URL`` and ``AI_TEXT_MODEL``
      environment variables.

      Args:
          frame_html: HTML of the frame to analyse; only the first 20000
              characters are sent.
          frame_name: Label of the frame, used in the prompt and in log lines.

      Returns:
          A CSS selector usable with ``frame.locator(selector)``, or "" when
          the HTML is empty, no API key is configured, the API call fails, or
          the answer contains no usable selector.
      """
      import json
      import os
      import re

      if not frame_html:
          return ""
      # Truncate so the prompt stays within the model's token budget.
      max_len = 20000
      frame_html = frame_html[:max_len]

      ai_api_key = os.environ.get("DASHSCOPE_API_KEY", "")
      ai_base_url = os.environ.get(
          "DASHSCOPE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"
      )
      ai_text_model = os.environ.get("AI_TEXT_MODEL", "qwen-plus")
      if not ai_api_key:
          print(f"[{self.platform_name}] AI上传入口识别: 未配置 AI API Key,跳过")
          return ""

      # Imported only once a request is really going to be sent, so the cheap
      # early exits above do not depend on the HTTP client being installed.
      import requests

      # The prompt is assembled line by line; the leading "" reproduces the
      # newline that follows the opening quotes of the original literal.
      prompt = "\n".join(
          [
              "",
              "你是熟悉微信视频号后台的前端工程师,现在需要在一段 HTML 中找到“上传视频文件”的入口。",
              "页面说明:",
              "- 平台:微信视频号(channels.weixin.qq.com)",
              "- 目标:用于上传视频文件的按钮或 input(一般会触发文件选择框)",
              "- 你会收到某个 frame 的完整 HTML 片段(不包含截图)。",
              "请你根据下面的 HTML,推断最适合用于上传视频文件的元素,并输出一个可以被 Playwright 使用的 CSS 选择器。",
              "要求:",
              "1. 只考虑“上传/选择视频文件”的入口,不要返回“发布/发表/下一步”等按钮;",
              "2. 选择器需要尽量稳定,不要使用自动生成的随机类名(例如带很多随机字母/数字的类名可以用前缀匹配);",
              "3. 选择器必须是 CSS 选择器(不要返回 XPath);",
              "4. 如果确实找不到合理的上传入口,返回 selector 为空字符串。",
              "请以 JSON 格式输出,严格遵守以下结构(不要添加任何解释文字):",
              "```json",
              "{",
              "\"selector\": \"CSS 选择器字符串,比如:input[type='file'] 或 div.upload-content input[type='file']\"",
              "}",
              "```",
              f'下面是 frame="{frame_name}" 的 HTML:',
              "```html",
              frame_html,
              "```",
          ]
      )
      payload = {
          "model": ai_text_model,
          "messages": [
              {
                  "role": "user",
                  "content": prompt,
              }
          ],
          "max_tokens": 600,
      }
      headers = {
          "Authorization": f"Bearer {ai_api_key}",
          "Content-Type": "application/json",
      }
      try:
          print(
              f"[{self.platform_name}] AI上传入口识别: 正在分析 frame={frame_name} HTML..."
          )
          # requests is synchronous; run it in the default executor so the
          # event loop is not frozen for up to 40 seconds.
          loop = asyncio.get_running_loop()
          resp = await loop.run_in_executor(
              None,
              lambda: requests.post(
                  f"{ai_base_url}/chat/completions",
                  headers=headers,
                  json=payload,
                  timeout=40,
              ),
          )
          if resp.status_code != 200:
              print(
                  f"[{self.platform_name}] AI上传入口识别: API 返回错误 {resp.status_code}"
              )
              return ""
          data = resp.json()
          choices = data.get("choices") or [{}]
          content = choices[0].get("message", {}).get("content", "") or ""

          # Prefer a fenced ```json block; otherwise take the outermost {...}.
          # Single backslashes are required here: inside a raw string "\\s"
          # is a literal backslash followed by "s", which never matched.
          fenced = re.search(r"```json\s*([\s\S]*?)\s*```", content)
          if fenced:
              json_str = fenced.group(1)
          else:
              bare = re.search(r"\{[\s\S]*\}", content)
              json_str = bare.group(0) if bare else "{}"
          try:
              result = json.loads(json_str)
          except ValueError:
              result = {}
          if not isinstance(result, dict):
              result = {}
          selector = result.get("selector")
          selector = selector.strip() if isinstance(selector, str) else ""
          print(f"[{self.platform_name}] AI上传入口识别结果: selector='{selector}'")
          return selector
      except Exception as e:
          # Best effort: any network/format problem simply means "no selector".
          print(f"[{self.platform_name}] AI上传入口识别异常: {e}")
          return ""
  async def ai_pick_selector_from_candidates(
      self, candidates: list, goal: str, frame_name: str = "main"
  ) -> str:
      """Let the AI text model pick the candidate element that best fits ``goal``.

      Useful when the raw HTML does not reveal the upload entry, or when the
      page is rendered dynamically. The request goes to the OpenAI-compatible
      ``/chat/completions`` endpoint configured through the
      ``DASHSCOPE_API_KEY``, ``DASHSCOPE_BASE_URL`` and ``AI_TEXT_MODEL``
      environment variables.

      Args:
          candidates: JSON-serialisable descriptions of candidate elements
              (CSS selector plus text/attributes). Only the first 120 are sent.
          goal: Human-readable description of the element being looked for.
          frame_name: Label of the frame, used in the prompt and in log lines.

      Returns:
          The chosen CSS selector, or "" when there are no candidates, no API
          key is configured, the API call fails, or nothing suitable is found.
      """
      import json
      import os
      import re

      if not candidates:
          return ""
      ai_api_key = os.environ.get("DASHSCOPE_API_KEY", "")
      ai_base_url = os.environ.get(
          "DASHSCOPE_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"
      )
      ai_text_model = os.environ.get("AI_TEXT_MODEL", "qwen-plus")
      if not ai_api_key:
          print(f"[{self.platform_name}] AI候选选择器: 未配置 AI API Key,跳过")
          return ""

      # Imported only once a request is really going to be sent, so the cheap
      # early exits above do not depend on the HTTP client being installed.
      import requests

      # Keep the prompt bounded: at most the first 120 candidates.
      candidates = candidates[:120]

      # The prompt is assembled line by line; the leading "" reproduces the
      # newline that follows the opening quotes of the original literal.
      prompt = "\n".join(
          [
              "",
              f"你是自动化发布工程师。现在要在微信视频号(channels.weixin.qq.com)发布页面里找到“{goal}”相关的入口元素。",
              "我会给你一组候选元素,每个候选都包含:",
              "- css: 可直接用于 Playwright 的 CSS 选择器",
              "- tag / type / role / ariaLabel / text / id / className(部分字段可能为空)",
              "你的任务:",
              f"- 从候选中选出最可能用于“{goal}”的元素,返回它的 css 选择器;",
              "- 如果没有任何候选符合,返回空字符串。",
              "注意:",
              "- 如果 goal 是“上传视频入口”,优先选择 input[type=file] 或看起来会触发选择文件/上传的区域;",
              "- 不要选择“发布/发表/下一步”等按钮(除非 goal 明确是发布按钮)。",
              "请严格按 JSON 输出(不要解释):",
              "```json",
              '{ "selector": "..." }',
              "```",
              f"候选列表(frame={frame_name}):",
              "```json",
              json.dumps(candidates, ensure_ascii=False),
              "```",
          ]
      )
      payload = {
          "model": ai_text_model,
          "messages": [{"role": "user", "content": prompt}],
          "max_tokens": 400,
      }
      headers = {
          "Authorization": f"Bearer {ai_api_key}",
          "Content-Type": "application/json",
      }
      try:
          print(
              f"[{self.platform_name}] AI候选选择器: 正在分析 frame={frame_name}, goal={goal} ..."
          )
          # requests is synchronous; run it in the default executor so the
          # event loop is not frozen for up to 40 seconds.
          loop = asyncio.get_running_loop()
          resp = await loop.run_in_executor(
              None,
              lambda: requests.post(
                  f"{ai_base_url}/chat/completions",
                  headers=headers,
                  json=payload,
                  timeout=40,
              ),
          )
          if resp.status_code != 200:
              print(
                  f"[{self.platform_name}] AI候选选择器: API 返回错误 {resp.status_code}"
              )
              return ""
          data = resp.json()
          choices = data.get("choices") or [{}]
          content = choices[0].get("message", {}).get("content", "") or ""

          # Prefer a fenced ```json block; otherwise take the outermost {...}.
          # Single backslashes are required here: inside a raw string "\\s"
          # is a literal backslash followed by "s", which never matched.
          fenced = re.search(r"```json\s*([\s\S]*?)\s*```", content)
          if fenced:
              json_str = fenced.group(1)
          else:
              bare = re.search(r"\{[\s\S]*\}", content)
              json_str = bare.group(0) if bare else "{}"
          try:
              result = json.loads(json_str)
          except ValueError:
              result = {}
          if not isinstance(result, dict):
              result = {}
          selector = result.get("selector")
          selector = selector.strip() if isinstance(selector, str) else ""
          print(f"[{self.platform_name}] AI候选选择器结果: selector='{selector}'")
          return selector
      except Exception as e:
          # Best effort: any network/format problem simply means "no selector".
          print(f"[{self.platform_name}] AI候选选择器异常: {e}")
          return ""
  250. async def _extract_relevant_html_snippets(self, html: str) -> str:
  251. """
  252. 从 HTML 中抽取与上传相关的片段,减少 token,提升 AI 命中率。
  253. - 优先抓取包含 upload/上传/file/input 等关键词的窗口片段
  254. - 若未命中关键词,返回“开头 + 结尾”的拼接
  255. """
  256. import re
  257. if not html:
  258. return ""
  259. patterns = [
  260. r"upload",
  261. r"uploader",
  262. r"file",
  263. r"type\\s*=\\s*['\\\"]file['\\\"]",
  264. r"input",
  265. r"drag",
  266. r"drop",
  267. r"选择",
  268. r"上传",
  269. r"添加",
  270. r"视频",
  271. ]
  272. regex = re.compile("|".join(patterns), re.IGNORECASE)
  273. snippets = []
  274. for m in regex.finditer(html):
  275. start = max(0, m.start() - 350)
  276. end = min(len(html), m.end() + 350)
  277. snippets.append(html[start:end])
  278. if len(snippets) >= 18:
  279. break
  280. if snippets:
  281. # 去重(粗略)
  282. unique = []
  283. seen = set()
  284. for s in snippets:
  285. key = hash(s)
  286. if key not in seen:
  287. seen.add(key)
  288. unique.append(s)
  289. return "\n\n<!-- SNIPPET -->\n\n".join(unique)[:20000]
  290. # fallback: head + tail
  291. head = html[:9000]
  292. tail = html[-9000:] if len(html) > 9000 else ""
  293. return (head + "\n\n<!-- TAIL -->\n\n" + tail)[:20000]
  async def init_browser(self, storage_state: str = None):
      """Start Playwright, launch a browser and open a fresh page.

      The system Chrome (``channel="chrome"``) is tried first to avoid H264
      decoding errors; if that launch fails, the bundled Chromium is used
      with otherwise identical options. In headed mode ``slow_mo`` is set so
      the clicks can be followed by eye. ``self.proxy_config`` is passed to
      the launch when it is a dict with a "server" entry.

      Args:
          storage_state: Accepted for interface compatibility; not used here.

      Returns:
          The newly created page (also stored on ``self.page``; the browser
          and context are stored on ``self.browser`` / ``self.context``).
      """
      from playwright.async_api import async_playwright

      pw = await async_playwright().start()

      proxy = None
      if isinstance(getattr(self, "proxy_config", None), dict):
          proxy = self.proxy_config
      use_proxy = bool(proxy and proxy.get("server"))
      if use_proxy:
          print(f"[{self.platform_name}] 使用代理: {proxy.get('server')}", flush=True)

      launch_opts = {"headless": self.headless}
      if not self.headless:
          # Pause 400 ms between operations so a human can watch them.
          launch_opts["slow_mo"] = 400
          print(
              f"[{self.platform_name}] 有头模式 + slow_mo=400ms,浏览器将可见",
              flush=True,
          )

      launch_opts["channel"] = "chrome"
      if use_proxy:
          launch_opts["proxy"] = proxy
      try:
          self.browser = await pw.chromium.launch(**launch_opts)
          print(f"[{self.platform_name}] 使用系统 Chrome 浏览器", flush=True)
      except Exception as e:
          print(
              f"[{self.platform_name}] Chrome 不可用,使用 Chromium: {e}", flush=True
          )
          # Retry with the bundled Chromium: same options minus the channel.
          launch_opts.pop("channel", None)
          self.browser = await pw.chromium.launch(**launch_opts)

      # Extra HTTP headers, sent with every request, to prevent redirects.
      extra_headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
          "Referer": "https://channels.weixin.qq.com/platform/post/list",
      }
      self.context = await self.browser.new_context(
          extra_http_headers=extra_headers,
          ignore_https_errors=True,
          viewport={"width": 1920, "height": 1080},
          user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      )
      self.page = await self.context.new_page()
      return self.page
  async def set_schedule_time(self, publish_date: datetime):
      """Fill in the scheduled-publish date and hour on the publish page.

      Does nothing when no page is open. Only the hour of ``publish_date`` is
      typed into the time field; the date picker is advanced by one month
      when the month it shows differs from the target month.
      """
      page = self.page
      if not page:
          return
      print(f"[{self.platform_name}] 设置定时发布...")

      # Switch to the "scheduled" option (second matching label).
      await page.locator("label").filter(has_text="定时").nth(1).click()

      # Open the date picker.
      await page.click('input[placeholder="请选择发表时间"]')

      # Compare the month shown by the picker with the zero-padded target.
      wanted_month = f"{publish_date.month:02d}月"
      shown_month = await page.inner_text(
          'span.weui-desktop-picker__panel__label:has-text("月")'
      )
      if shown_month != wanted_month:
          await page.click("button.weui-desktop-btn__icon__right")

      # Click the first enabled cell whose text equals the target day.
      wanted_day = str(publish_date.day)
      day_cells = await page.query_selector_all("table.weui-desktop-picker__table a")
      for cell in day_cells:
          css_classes = await cell.evaluate("el => el.className")
          if "weui-desktop-picker__disabled" in css_classes:
              continue
          label = await cell.inner_text()
          if label.strip() == wanted_day:
              await cell.click()
              break

      # Replace the content of the time field with the target hour.
      await page.click('input[placeholder="请选择时间"]')
      await page.keyboard.press("Control+KeyA")
      await page.keyboard.type(str(publish_date.hour))

      # Click elsewhere to confirm the value.
      await page.locator("div.input-editor").click()
  async def handle_upload_error(self, video_path: str):
      """Recover from a failed upload by removing the video and re-uploading.

      Does nothing when no page is open. A full-page screenshot is saved
      first (best effort) to help diagnose proxy or video-format problems;
      then the failed video is deleted, the deletion is confirmed, and
      ``video_path`` is set on the file input again.
      """
      page = self.page
      if not page:
          return
      print(f"[{self.platform_name}] 视频出错了,重新上传中...")

      # Screenshot for later troubleshooting; a failure here must not stop
      # the retry itself.
      try:
          shot_path = f"weixin_upload_error_{int(time.time() * 1000)}.png"
          await page.screenshot(path=shot_path, full_page=True)
          print(f"[{self.platform_name}] 上传错误截图已保存: {shot_path}", flush=True)
      except Exception as e:
          print(f"[{self.platform_name}] 保存上传错误截图失败: {e}", flush=True)

      # Delete the broken video and confirm in the dialog.
      delete_tag = page.locator(
          'div.media-status-content div.tag-inner:has-text("删除")'
      )
      await delete_tag.click()
      await page.get_by_role("button", name="删除", exact=True).click()

      # Upload again.
      await page.locator('input[type="file"]').set_input_files(video_path)
  async def add_title_tags(self, params: PublishParams):
      """Type the title and the hashtag topics into the description editor.

      Does nothing when no page is open. The title is typed first; when tags
      are present, a newline follows and each tag is typed as ``#tag``
      followed by a space.

      Args:
          params: Publish parameters; ``title`` and ``tags`` are read. A
              missing/None ``tags`` is treated as "no tags".
      """
      if not self.page:
          return
      # Normalise once so the loop and the final log line agree; calling
      # len() on a None tags value used to raise TypeError after typing.
      tags = params.tags or []

      await self.page.locator("div.input-editor").click()
      await self.page.keyboard.type(params.title)
      if tags:
          await self.page.keyboard.press("Enter")
          for tag in tags:
              await self.page.keyboard.type("#" + tag)
              await self.page.keyboard.press("Space")
      print(f"[{self.platform_name}] 成功添加标题和 {len(tags)} 个话题")
  async def add_short_title(self):
      """Locate the short-title input on the publish page.

      Currently a placeholder: the input is looked up, but nothing is written
      to it. Does nothing when no page is open. Lookup failures are logged
      and otherwise ignored (best effort).
      """
      if not self.page:
          return
      try:
          short_title_input = (
              self.page.get_by_text("短标题", exact=True)
              .locator("..")
              .locator("xpath=following-sibling::div")
              .locator('span input[type="text"]')
          )
          if await short_title_input.count():
              # Intended to reuse the existing content as the short title;
              # not implemented yet.
              pass
      except Exception as e:
          # `except Exception` instead of a bare except, so task cancellation
          # and KeyboardInterrupt still propagate.
          print(f"[{self.platform_name}] 短标题处理失败: {e}")
  async def upload_cover(self, cover_path: str):
      """Replace the video cover with the image at ``cover_path``.

      Does nothing when no page is open, when ``cover_path`` is empty, or
      when the file does not exist. The upload is skipped when the "change
      cover" button is disabled. Any error is logged and swallowed so a
      cover problem does not abort the publish flow.
      """
      if not (self.page and cover_path and os.path.exists(cover_path)):
          return

      page = self.page
      change_cover_btn = 'div.finder-tag-wrap.btn:has-text("更换封面")'
      cover_slot = "div.single-cover-uploader-wrap > div.wrap"
      delete_icon = ".del-wrap > .svg-icon"
      try:
          await asyncio.sleep(2)
          btn_classes = await page.locator(change_cover_btn).get_attribute("class")
          if "disabled" in btn_classes:
              # Cover editing is not available (yet); nothing to do.
              return

          await page.locator(change_cover_btn).click()
          await page.locator(cover_slot).hover()

          # Remove the current cover, if any.
          if await page.locator(delete_icon).count():
              await page.locator(delete_icon).click()

          # Clicking the slot opens the native file chooser.
          slot = page.locator(cover_slot)
          async with page.expect_file_chooser() as chooser_info:
              await slot.click()
          chooser = await chooser_info.value
          await chooser.set_files(cover_path)

          # Two confirmation dialogs follow the file selection.
          await asyncio.sleep(2)
          await page.get_by_role("button", name="确定").click()
          await asyncio.sleep(1)
          await page.get_by_role("button", name="确认").click()
          print(f"[{self.platform_name}] 封面上传成功")
      except Exception as e:
          print(f"[{self.platform_name}] 封面上传失败: {e}")
  async def check_captcha(self) -> dict:
      """Check the current page for a captcha or a login prompt.

      Probes a fixed list of selectors: captcha markers first, then login
      dialog markers. A selector that cannot be evaluated is skipped.

      Returns:
          ``{"need_captcha": bool, "captcha_type": str}`` where
          ``captcha_type`` is "image" for a captcha hit, "login" for a login
          prompt, and "" when nothing was detected or no page is open.
      """
      if not self.page:
          return {"need_captcha": False, "captcha_type": ""}

      # (captcha_type, log label, selectors), probed in this order.
      probes = (
          (
              "image",
              "检测到验证码",
              (
                  'text="请输入验证码"',
                  'text="滑动验证"',
                  '[class*="captcha"]',
                  '[class*="verify"]',
              ),
          ),
          (
              "login",
              "检测到需要登录",
              (
                  'text="请登录"',
                  'text="扫码登录"',
                  '[class*="login-dialog"]',
              ),
          ),
      )
      try:
          for captcha_type, label, selectors in probes:
              for selector in selectors:
                  try:
                      found = await self.page.locator(selector).count() > 0
                  except Exception:
                      # `except Exception` rather than a bare except: a
                      # failing probe is skipped, but task cancellation and
                      # KeyboardInterrupt still propagate.
                      continue
                  if found:
                      print(f"[{self.platform_name}] {label}: {selector}")
                      return {"need_captcha": True, "captcha_type": captcha_type}
      except Exception as e:
          print(f"[{self.platform_name}] 验证码检测异常: {e}")
      return {"need_captcha": False, "captcha_type": ""}
  493. async def publish(self, cookies: str, params: PublishParams) -> PublishResult:
  494. """发布视频到视频号"""
  495. print(f"\n{'=' * 60}")
  496. print(f"[{self.platform_name}] 开始发布视频")
  497. print(f"[{self.platform_name}] 视频路径: {params.video_path}")
  498. print(f"[{self.platform_name}] 标题: {params.title}")
  499. print(f"[{self.platform_name}] Headless: {self.headless}")
  500. print(f"{'=' * 60}")
  501. self.report_progress(5, "正在初始化浏览器...")
  502. # 初始化浏览器(使用 Chrome)
  503. await self.init_browser()
  504. print(f"[{self.platform_name}] 浏览器初始化完成")
  505. # 解析并设置 cookies
  506. cookie_list = self.parse_cookies(cookies)
  507. print(cookie_list)
  508. print(f"[{self.platform_name}] 解析到 {len(cookie_list)} 个 cookies")
  509. await self.set_cookies(cookie_list)
  510. if not self.page:
  511. raise Exception("Page not initialized")
  512. # 检查视频文件
  513. if not os.path.exists(params.video_path):
  514. raise Exception(f"视频文件不存在: {params.video_path}")
  515. print(
  516. f"[{self.platform_name}] 视频文件存在,大小: {os.path.getsize(params.video_path)} bytes"
  517. )
  518. self.report_progress(10, "正在打开上传页面...")
  519. # 访问上传页面 - 使用 domcontentloaded 替代 networkidle,避免代理慢速导致超时
  520. await self.page.goto(
  521. self.publish_url, wait_until="domcontentloaded", timeout=90000
  522. )
  523. # 等待页面关键元素加载
  524. try:
  525. await self.page.wait_for_load_state("load", timeout=30000)
  526. except Exception:
  527. pass
  528. await asyncio.sleep(3)
  529. # 检查是否跳转到登录页
  530. current_url = self.page.url
  531. print(f"[{self.platform_name}] 当前页面: {current_url}")
  532. if "login" in current_url:
  533. screenshot_base64 = await self.capture_screenshot()
  534. return PublishResult(
  535. success=False,
  536. platform=self.platform_name,
  537. error="Cookie 已过期,需要重新登录",
  538. need_captcha=True,
  539. captcha_type="login",
  540. screenshot_base64=screenshot_base64,
  541. page_url=current_url,
  542. status="need_captcha",
  543. )
  544. # 使用 AI 检查验证码
  545. ai_captcha = await self.ai_check_captcha()
  546. if ai_captcha["has_captcha"]:
  547. print(
  548. f"[{self.platform_name}] AI检测到验证码: {ai_captcha['captcha_type']}",
  549. flush=True,
  550. )
  551. screenshot_base64 = await self.capture_screenshot()
  552. return PublishResult(
  553. success=False,
  554. platform=self.platform_name,
  555. error=f"检测到{ai_captcha['captcha_type']}验证码,需要使用有头浏览器完成验证",
  556. need_captcha=True,
  557. captcha_type=ai_captcha["captcha_type"],
  558. screenshot_base64=screenshot_base64,
  559. page_url=current_url,
  560. status="need_captcha",
  561. )
  562. # 传统方式检查验证码
  563. captcha_result = await self.check_captcha()
  564. if captcha_result["need_captcha"]:
  565. screenshot_base64 = await self.capture_screenshot()
  566. return PublishResult(
  567. success=False,
  568. platform=self.platform_name,
  569. error=f"需要{captcha_result['captcha_type']}验证码,请使用有头浏览器完成验证",
  570. need_captcha=True,
  571. captcha_type=captcha_result["captcha_type"],
  572. screenshot_base64=screenshot_base64,
  573. page_url=current_url,
  574. status="need_captcha",
  575. )
  576. self.report_progress(15, "正在选择视频文件...")
  577. # 上传视频
  578. # 说明:视频号发布页在不同账号/地区/灰度下 DOM 结构差异较大,且上传组件可能在 iframe 中。
  579. # 因此这里按 matrix 的思路“点击触发 file chooser”,同时增加“遍历全部 frame + 精确挑选 video input”的兜底。
  580. upload_success = False
  581. if not self.page:
  582. raise Exception("Page not initialized")
  583. # 等待页面把上传区域渲染出来(避免过早判断)
  584. try:
  585. await self.page.wait_for_selector(
  586. "div.upload-content, input[type='file'], iframe", timeout=20000
  587. )
  588. except Exception:
  589. pass
  590. async def _try_set_files_in_frame(frame, frame_name: str) -> bool:
  591. """在指定 frame 中尝试触发上传"""
  592. nonlocal upload_success
  593. if upload_success:
  594. return True
  595. # 方法0:如果用户通过环境变量显式配置了选择器,优先尝试这个
  596. if WEIXIN_UPLOAD_SELECTOR:
  597. try:
  598. el = frame.locator(WEIXIN_UPLOAD_SELECTOR).first
  599. if await el.count() > 0 and await el.is_visible():
  600. print(
  601. f"[{self.platform_name}] [{frame_name}] 使用环境变量 WEIXIN_UPLOAD_SELECTOR: {WEIXIN_UPLOAD_SELECTOR}"
  602. )
  603. try:
  604. async with self.page.expect_file_chooser(
  605. timeout=5000
  606. ) as fc_info:
  607. await el.click()
  608. chooser = await fc_info.value
  609. await chooser.set_files(params.video_path)
  610. upload_success = True
  611. print(
  612. f"[{self.platform_name}] [{frame_name}] 通过环境变量选择器上传成功"
  613. )
  614. return True
  615. except Exception as e:
  616. print(
  617. f"[{self.platform_name}] [{frame_name}] 环境变量选择器点击失败,尝试直接 set_input_files: {e}"
  618. )
  619. try:
  620. await el.set_input_files(params.video_path)
  621. upload_success = True
  622. print(
  623. f"[{self.platform_name}] [{frame_name}] 环境变量选择器 set_input_files 成功"
  624. )
  625. return True
  626. except Exception as e2:
  627. print(
  628. f"[{self.platform_name}] [{frame_name}] 环境变量选择器 set_input_files 仍失败: {e2}"
  629. )
  630. except Exception as e:
  631. print(
  632. f"[{self.platform_name}] [{frame_name}] 使用环境变量选择器定位元素失败: {e}"
  633. )
  634. # 先尝试点击上传区域触发 chooser(最贴近 matrix)
  635. click_selectors = [
  636. "div.upload-content",
  637. "div[class*='upload-content']",
  638. "div[class*='upload']",
  639. "div.add-wrap",
  640. "[class*='uploader']",
  641. "text=点击上传",
  642. "text=上传视频",
  643. "text=选择视频",
  644. ]
  645. for selector in click_selectors:
  646. try:
  647. el = frame.locator(selector).first
  648. if await el.count() > 0 and await el.is_visible():
  649. print(
  650. f"[{self.platform_name}] [{frame_name}] 找到可点击上传区域: {selector}"
  651. )
  652. try:
  653. async with self.page.expect_file_chooser(
  654. timeout=5000
  655. ) as fc_info:
  656. await el.click()
  657. chooser = await fc_info.value
  658. await chooser.set_files(params.video_path)
  659. upload_success = True
  660. print(
  661. f"[{self.platform_name}] [{frame_name}] 通过 file chooser 上传成功"
  662. )
  663. return True
  664. except Exception as e:
  665. print(
  666. f"[{self.platform_name}] [{frame_name}] 点击触发 chooser 失败: {e}"
  667. )
  668. except Exception:
  669. pass
  670. # 再尝试直接设置 input[type=file](iframe/隐藏 input 常见)
  671. try:
  672. inputs = frame.locator("input[type='file']")
  673. cnt = await inputs.count()
  674. if cnt > 0:
  675. best_idx = 0
  676. best_score = -1
  677. for i in range(cnt):
  678. try:
  679. inp = inputs.nth(i)
  680. accept = (await inp.get_attribute("accept")) or ""
  681. multiple = (await inp.get_attribute("multiple")) or ""
  682. score = 0
  683. if "video" in accept:
  684. score += 10
  685. if "mp4" in accept:
  686. score += 3
  687. if multiple:
  688. score += 1
  689. if score > best_score:
  690. best_score = score
  691. best_idx = i
  692. except Exception:
  693. continue
  694. target = inputs.nth(best_idx)
  695. print(
  696. f"[{self.platform_name}] [{frame_name}] 尝试对 input[{best_idx}] set_input_files (score={best_score})"
  697. )
  698. await target.set_input_files(params.video_path)
  699. upload_success = True
  700. print(
  701. f"[{self.platform_name}] [{frame_name}] 通过 file input 上传成功"
  702. )
  703. return True
  704. except Exception as e:
  705. print(f"[{self.platform_name}] [{frame_name}] file input 上传失败: {e}")
  706. # 不直接返回,让后面的 AI 兜底有机会执行
  707. # 方法4: 兜底使用 AI 分析 HTML,猜测上传入口
  708. try:
  709. frame_url = getattr(frame, "url", "")
  710. html_full = await frame.content()
  711. html_for_ai = await self._extract_relevant_html_snippets(html_full)
  712. print(
  713. f"[{self.platform_name}] [{frame_name}] frame_url={frame_url}, html_len={len(html_full)}, html_for_ai_len={len(html_for_ai)}"
  714. )
  715. ai_selector = await self.ai_find_upload_selector(
  716. html_for_ai, frame_name=frame_name
  717. )
  718. if ai_selector:
  719. try:
  720. el = frame.locator(ai_selector).first
  721. if await el.count() > 0:
  722. print(
  723. f"[{self.platform_name}] [{frame_name}] 使用 AI 选择器点击上传入口: {ai_selector}"
  724. )
  725. try:
  726. async with self.page.expect_file_chooser(
  727. timeout=5000
  728. ) as fc_info:
  729. await el.click()
  730. chooser = await fc_info.value
  731. await chooser.set_files(params.video_path)
  732. upload_success = True
  733. print(
  734. f"[{self.platform_name}] [{frame_name}] 通过 AI 选择器上传成功"
  735. )
  736. return True
  737. except Exception as e:
  738. print(
  739. f"[{self.platform_name}] [{frame_name}] AI 选择器点击失败,改为直接 set_input_files: {e}"
  740. )
  741. try:
  742. await el.set_input_files(params.video_path)
  743. upload_success = True
  744. print(
  745. f"[{self.platform_name}] [{frame_name}] AI 选择器直接 set_input_files 成功"
  746. )
  747. return True
  748. except Exception as e2:
  749. print(
  750. f"[{self.platform_name}] [{frame_name}] AI 选择器 set_input_files 仍失败: {e2}"
  751. )
  752. except Exception as e:
  753. print(
  754. f"[{self.platform_name}] [{frame_name}] 使用 AI 选择器定位元素失败: {e}"
  755. )
  756. else:
  757. # 如果 AI 无法从 HTML 推断,退一步:构造候选元素列表交给 AI 选择
  758. try:
  759. candidates = await frame.evaluate("""
  760. () => {
  761. function cssEscape(s) {
  762. try { return CSS.escape(s); } catch (e) { return s.replace(/[^a-zA-Z0-9_-]/g, '\\\\$&'); }
  763. }
  764. function buildSelector(el) {
  765. if (!el || el.nodeType !== 1) return '';
  766. if (el.id) return `#${cssEscape(el.id)}`;
  767. let parts = [];
  768. let cur = el;
  769. for (let depth = 0; cur && cur.nodeType === 1 && depth < 5; depth++) {
  770. let part = cur.tagName.toLowerCase();
  771. const role = cur.getAttribute('role');
  772. const type = cur.getAttribute('type');
  773. if (type) part += `[type="${type}"]`;
  774. if (role) part += `[role="${role}"]`;
  775. const cls = (cur.className || '').toString().trim().split(/\\s+/).filter(Boolean);
  776. if (cls.length) part += '.' + cls.slice(0, 2).map(cssEscape).join('.');
  777. // nth-of-type
  778. let idx = 1;
  779. let sib = cur;
  780. while (sib && (sib = sib.previousElementSibling)) {
  781. if (sib.tagName === cur.tagName) idx++;
  782. }
  783. part += `:nth-of-type(${idx})`;
  784. parts.unshift(part);
  785. cur = cur.parentElement;
  786. }
  787. return parts.join(' > ');
  788. }
  789. const nodes = Array.from(document.querySelectorAll('input, button, a, div, span'))
  790. .filter(el => {
  791. const tag = el.tagName.toLowerCase();
  792. const type = (el.getAttribute('type') || '').toLowerCase();
  793. const role = (el.getAttribute('role') || '').toLowerCase();
  794. const aria = (el.getAttribute('aria-label') || '').toLowerCase();
  795. const txt = (el.innerText || '').trim().slice(0, 60);
  796. const cls = (el.className || '').toString().toLowerCase();
  797. const isFile = tag === 'input' && type === 'file';
  798. const looksClickable =
  799. tag === 'button' || tag === 'a' || role === 'button' || el.onclick ||
  800. cls.includes('upload') || cls.includes('uploader') || cls.includes('drag') ||
  801. aria.includes('上传') || aria.includes('选择') || aria.includes('添加') ||
  802. txt.includes('上传') || txt.includes('选择') || txt.includes('添加') || txt.includes('点击上传');
  803. if (!isFile && !looksClickable) return false;
  804. const r = el.getBoundingClientRect();
  805. const visible = r.width > 5 && r.height > 5;
  806. return visible;
  807. });
  808. const limited = nodes.slice(0, 120).map(el => ({
  809. css: buildSelector(el),
  810. tag: el.tagName.toLowerCase(),
  811. type: el.getAttribute('type') || '',
  812. role: el.getAttribute('role') || '',
  813. ariaLabel: el.getAttribute('aria-label') || '',
  814. text: (el.innerText || '').trim().slice(0, 80),
  815. id: el.id || '',
  816. className: (el.className || '').toString().slice(0, 120),
  817. accept: el.getAttribute('accept') || '',
  818. }));
  819. return limited;
  820. }
  821. """)
  822. ai_selector2 = await self.ai_pick_selector_from_candidates(
  823. candidates=candidates,
  824. goal="上传视频入口",
  825. frame_name=frame_name,
  826. )
  827. if ai_selector2:
  828. el2 = frame.locator(ai_selector2).first
  829. if await el2.count() > 0:
  830. print(
  831. f"[{self.platform_name}] [{frame_name}] 使用 AI 候选选择器点击上传入口: {ai_selector2}"
  832. )
  833. try:
  834. async with self.page.expect_file_chooser(
  835. timeout=5000
  836. ) as fc_info:
  837. await el2.click()
  838. chooser2 = await fc_info.value
  839. await chooser2.set_files(params.video_path)
  840. upload_success = True
  841. print(
  842. f"[{self.platform_name}] [{frame_name}] 通过 AI 候选选择器上传成功"
  843. )
  844. return True
  845. except Exception as e:
  846. print(
  847. f"[{self.platform_name}] [{frame_name}] AI 候选选择器点击失败,尝试 set_input_files: {e}"
  848. )
  849. try:
  850. await el2.set_input_files(params.video_path)
  851. upload_success = True
  852. print(
  853. f"[{self.platform_name}] [{frame_name}] AI 候选选择器 set_input_files 成功"
  854. )
  855. return True
  856. except Exception as e2:
  857. print(
  858. f"[{self.platform_name}] [{frame_name}] AI 候选选择器 set_input_files 仍失败: {e2}"
  859. )
  860. except Exception as e:
  861. print(
  862. f"[{self.platform_name}] [{frame_name}] 构造候选并交给 AI 失败: {e}"
  863. )
  864. except Exception as e:
  865. print(
  866. f"[{self.platform_name}] [{frame_name}] AI 上传入口识别整体失败: {e}"
  867. )
  868. return False
  869. # 先尝试主 frame
  870. try:
  871. await _try_set_files_in_frame(self.page.main_frame, "main")
  872. except Exception as e:
  873. print(f"[{self.platform_name}] main frame 上传尝试异常: {e}")
  874. # 再遍历所有子 frame
  875. if not upload_success:
  876. try:
  877. frames = self.page.frames
  878. print(f"[{self.platform_name}] 发现 frames: {len(frames)}")
  879. for idx, fr in enumerate(frames):
  880. if upload_success:
  881. break
  882. # main_frame 已尝试过
  883. if fr == self.page.main_frame:
  884. continue
  885. name = fr.name or f"frame-{idx}"
  886. await _try_set_files_in_frame(fr, name)
  887. except Exception as e:
  888. print(f"[{self.platform_name}] 遍历 frames 异常: {e}")
  889. if not upload_success:
  890. screenshot_base64 = await self.capture_screenshot()
  891. return PublishResult(
  892. success=False,
  893. platform=self.platform_name,
  894. error="未找到上传入口(可能在 iframe 中或页面结构已变更)",
  895. screenshot_base64=screenshot_base64,
  896. page_url=await self.get_page_url(),
  897. status="failed",
  898. )
  899. self.report_progress(20, "正在填充标题和话题...")
  900. # 添加标题和话题
  901. await self.add_title_tags(params)
  902. self.report_progress(30, "等待视频上传完成...")
  903. # 等待上传完成(最多约 6 分钟),期间如多次出错会自动尝试重新上传
  904. upload_completed = False
  905. for _ in range(120):
  906. try:
  907. button_info = await self.page.get_by_role(
  908. "button", name="发表"
  909. ).get_attribute("class")
  910. if "weui-desktop-btn_disabled" not in button_info:
  911. print(f"[{self.platform_name}] 视频上传完毕")
  912. # 上传封面
  913. self.report_progress(50, "正在上传封面...")
  914. await self.upload_cover(params.cover_path)
  915. upload_completed = True
  916. break
  917. else:
  918. # 检查上传错误
  919. if await self.page.locator("div.status-msg.error").count():
  920. if await self.page.locator(
  921. 'div.media-status-content div.tag-inner:has-text("删除")'
  922. ).count():
  923. await self.handle_upload_error(params.video_path)
  924. await asyncio.sleep(3)
  925. except:
  926. await asyncio.sleep(3)
  927. # 如果一直没有等到“发表”按钮可用,认为上传失败,直接返回失败结果并附带截图
  928. if not upload_completed:
  929. screenshot_base64 = await self.capture_screenshot()
  930. page_url = await self.get_page_url()
  931. return PublishResult(
  932. success=False,
  933. platform=self.platform_name,
  934. error="视频上传失败,请查看截图",
  935. screenshot_base64=screenshot_base64,
  936. page_url=page_url,
  937. status="failed",
  938. )
  939. self.report_progress(60, "处理视频设置...")
  940. # 添加短标题
  941. try:
  942. short_title_el = (
  943. self.page.get_by_text("短标题", exact=True)
  944. .locator("..")
  945. .locator("xpath=following-sibling::div")
  946. .locator('span input[type="text"]')
  947. )
  948. if await short_title_el.count():
  949. short_title = format_short_title(params.title)
  950. await short_title_el.fill(short_title)
  951. except:
  952. pass
  953. # 定时发布
  954. if params.publish_date:
  955. self.report_progress(70, "设置定时发布...")
  956. await self.set_schedule_time(params.publish_date)
  957. self.report_progress(80, "正在发布...")
  958. # 点击发布 - 参考 matrix
  959. for i in range(30):
  960. try:
  961. # 参考 matrix: div.form-btns button:has-text("发表")
  962. publish_btn = self.page.locator('div.form-btns button:has-text("发表")')
  963. if await publish_btn.count():
  964. print(f"[{self.platform_name}] 点击发布按钮...")
  965. await publish_btn.click()
  966. # 等待跳转到作品列表页面 - 参考 matrix
  967. await self.page.wait_for_url(
  968. "https://channels.weixin.qq.com/platform/post/list", timeout=10000
  969. )
  970. self.report_progress(100, "发布成功")
  971. print(f"[{self.platform_name}] 视频发布成功!")
  972. screenshot_base64 = await self.capture_screenshot()
  973. return PublishResult(
  974. success=True,
  975. platform=self.platform_name,
  976. message="发布成功",
  977. screenshot_base64=screenshot_base64,
  978. page_url=self.page.url,
  979. status="success",
  980. )
  981. except Exception as e:
  982. current_url = self.page.url
  983. if "https://channels.weixin.qq.com/platform/post/list" in current_url:
  984. self.report_progress(100, "发布成功")
  985. print(f"[{self.platform_name}] 视频发布成功!")
  986. screenshot_base64 = await self.capture_screenshot()
  987. return PublishResult(
  988. success=True,
  989. platform=self.platform_name,
  990. message="发布成功",
  991. screenshot_base64=screenshot_base64,
  992. page_url=current_url,
  993. status="success",
  994. )
  995. else:
  996. print(
  997. f"[{self.platform_name}] 视频正在发布中... {i + 1}/30, URL: {current_url}"
  998. )
  999. await asyncio.sleep(1)
  1000. # 发布超时
  1001. screenshot_base64 = await self.capture_screenshot()
  1002. page_url = await self.get_page_url()
  1003. return PublishResult(
  1004. success=False,
  1005. platform=self.platform_name,
  1006. error="发布超时,请检查发布状态",
  1007. screenshot_base64=screenshot_base64,
  1008. page_url=page_url,
  1009. status="need_action",
  1010. )
  async def _get_works_fallback_dom(self, page_size: int) -> tuple:
      """Scrape the works list from the current page's DOM when the API is unusable.

      Used as a fallback (e.g. new accounts or a different entry page) when the
      ``post_list`` request fails or returns nothing.

      Args:
          page_size: Maximum number of feed items to read from the page.

      Returns:
          ``(works, total, has_more, next_page)``. ``total`` is the number of
          works that were parsed, ``has_more`` is True when the page shows more
          items than ``page_size``, and ``next_page`` is always ``""``. Errors
          are logged and never raised; whatever was collected is returned.
      """
      import hashlib

      def _digest(text: str) -> str:
          # Built-in hash() on str is salted per process, so it cannot produce
          # IDs that stay the same between runs; use a real digest instead.
          return hashlib.sha256(text.encode("utf-8")).hexdigest()[:12]

      async def _attr(locator, name: str) -> str:
          """Attribute ``name`` of ``locator``, or "" when nothing matches."""
          if await locator.count() > 0:
              return await locator.get_attribute(name) or ""
          return ""

      async def _text(locator) -> str:
          """Stripped text of ``locator``, or "" when nothing matches."""
          if await locator.count() > 0:
              return (await locator.text_content() or "").strip()
          return ""

      # Icon selector inside a data item -> which counter it represents.
      stat_icons = (
          ("span.weui-icon-outlined-eyes-on", "play"),
          ("span.weui-icon-outlined-like", "like"),
          ("span.weui-icon-outlined-comment", "comment"),
          ("use[xlink\\:href='#icon-share']", "share"),
          ("use[xlink\\:href='#icon-thumb']", "collect"),
      )

      works: List[WorkItem] = []
      total = 0
      has_more = False
      try:
          # Wait until any plausible feed container has rendered.
          for selector in (
              "div.post-feed-item",
              "[class*='post-feed']",
              "[class*='feed-item']",
              "div[class*='post']",
          ):
              try:
                  await self.page.wait_for_selector(selector, timeout=8000)
                  break
              except Exception:
                  continue

          post_items = self.page.locator("div.post-feed-item")
          item_count = await post_items.count()
          if item_count == 0:
              post_items = self.page.locator("[class*='post-feed']")
              item_count = await post_items.count()

          for i in range(min(item_count, page_size)):
              try:
                  item = post_items.nth(i)

                  cover_url = await _attr(
                      item.locator("div.media img.thumb").first, "src"
                  )
                  if not cover_url:
                      cover_url = await _attr(item.locator("img").first, "src")
                  title = await _text(item.locator("div.post-title").first)
                  publish_time = await _text(
                      item.locator("div.post-time span").first
                  )

                  counts = dict.fromkeys(
                      ("play", "like", "comment", "share", "collect"), 0
                  )
                  data_items = item.locator("div.post-data div.data-item")
                  for j in range(await data_items.count()):
                      data_item = data_items.nth(j)
                      # Check existence first: text_content() on a missing
                      # span would wait for the locator timeout and then fail
                      # the whole work item.
                      count_text = (
                          await _text(data_item.locator("span.count").first) or "0"
                      )
                      for icon_selector, field in stat_icons:
                          if await data_item.locator(icon_selector).count() > 0:
                              counts[field] = self._parse_count(count_text)
                              break

                  # No real ID is available in the DOM; derive a reproducible one.
                  work_id = f"weixin_{i}_{_digest(title)}_{_digest(publish_time)}"
                  works.append(
                      WorkItem(
                          work_id=work_id,
                          title=title or "无标题",
                          cover_url=cover_url,
                          duration=0,
                          status="published",
                          publish_time=publish_time,
                          play_count=counts["play"],
                          like_count=counts["like"],
                          comment_count=counts["comment"],
                          share_count=counts["share"],
                          collect_count=counts["collect"],
                      )
                  )
              except Exception as e:
                  print(
                      f"[{self.platform_name}] DOM 解析作品 {i} 失败: {e}", flush=True
                  )
                  continue

          total = len(works)
          has_more = item_count > page_size
          print(f"[{self.platform_name}] DOM 回退获取 {len(works)} 条", flush=True)
      except Exception as e:
          print(f"[{self.platform_name}] DOM 回退失败: {e}", flush=True)
      return (works, total, has_more, "")
  async def get_works(
      self, cookies: str, page: int = 0, page_size: int = 20
  ) -> WorksResult:
      """Fetch one page of the account's works through the ``post_list`` API.

      The request is issued with ``fetch`` from inside the logged-in page so the
      browser session's cookies are sent. On the first page, if the API errors
      out or returns an empty list, the works are scraped from the page DOM
      instead.

      Args:
          cookies: Cookie string for the account, parsed by ``parse_cookies``.
          page: Zero-based page index, or the ``lastBuff`` cursor string that a
              previous call returned as ``next_page``.
          page_size: Number of works requested per page.

      Returns:
          A ``WorksResult``. ``success`` is False when the cookie has expired,
          the API fails with no usable DOM fallback, or any exception occurs.
      """
      import hashlib

      # Paging: the first page is currentPage=1 with rawKeyBuff=null; later
      # pages either increment currentPage or pass the cursor as rawKeyBuff.
      if page is None or page == "" or (isinstance(page, int) and page == 0):
          current_page, raw_key_buff = 1, None
      elif isinstance(page, int):
          current_page, raw_key_buff = page + 1, None
      else:
          current_page, raw_key_buff = 1, str(page)
      ts_ms = str(int(time.time() * 1000))

      print(f"\n{'=' * 60}")
      print(
          f"[{self.platform_name}] 获取作品列表 currentPage={current_page}, pageSize={page_size}, rawKeyBuff={raw_key_buff[:40] if raw_key_buff else 'null'}..."
      )
      print(f"{'=' * 60}")

      works: List[WorkItem] = []
      total = 0
      has_more = False
      next_page = ""

      async def _dom_fallback():
          """Build a successful result from the page DOM, or None if it is empty."""
          (
              fb_works,
              fb_total,
              fb_has_more,
              fb_next_page,
          ) = await self._get_works_fallback_dom(page_size)
          if not fb_works:
              return None
          return WorksResult(
              success=True,
              platform=self.platform_name,
              works=fb_works,
              total=fb_total,
              has_more=fb_has_more,
              next_page=fb_next_page,
          )

      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)
          await self.set_cookies(cookie_list)
          if not self.page:
              raise Exception("Page not initialized")

          await self.page.goto(
              "https://channels.weixin.qq.com/platform/post/list", timeout=30000
          )
          await asyncio.sleep(3)
          current_url = self.page.url
          if "login" in current_url:
              raise Exception("Cookie 已过期,请重新登录")

          api_url = "https://channels.weixin.qq.com/micro/content/cgi-bin/mmfinderassistant-bin/post/post_list"
          req_body = {
              "pageSize": page_size,
              "currentPage": current_page,
              "userpageType": 11,
              "stickyOrder": True,
              "timestamp": ts_ms,
              "_log_finder_uin": "",
              "_log_finder_id": "",
              "rawKeyBuff": raw_key_buff,
              "pluginSessionId": None,
              "scene": 7,
              "reqScene": 7,
          }
          body_str = json.dumps(req_body)
          response = await self.page.evaluate(
              """
              async ([url, bodyStr]) => {
                try {
                  const resp = await fetch(url, {
                    method: 'POST',
                    credentials: 'include',
                    headers: {
                      'Content-Type': 'application/json',
                      'Accept': '*/*',
                      'Referer': 'https://channels.weixin.qq.com/platform/post/list'
                    },
                    body: bodyStr
                  });
                  return await resp.json();
                } catch (e) {
                  return { error: e.toString() };
                }
              }
              """,
              [api_url, body_str],
          )

          is_first_page = current_page == 1 and raw_key_buff is None

          # Transport-level failure reported by the in-page fetch wrapper.
          if response.get("error"):
              print(
                  f"[{self.platform_name}] API 请求失败: {response.get('error')}",
                  flush=True,
              )
              if is_first_page:
                  rescued = await _dom_fallback()
                  if rescued is not None:
                      return rescued
              return WorksResult(
                  success=False,
                  platform=self.platform_name,
                  error=response.get("error", "API 请求失败"),
              )

          # Application-level failure reported by the API itself.
          err_code = response.get("errCode", -1)
          if err_code != 0:
              err_msg = response.get("errMsg", "unknown")
              print(
                  f"[{self.platform_name}] API errCode={err_code}, errMsg={err_msg}, 完整响应(前800字): {json.dumps(response, ensure_ascii=False)[:800]}",
                  flush=True,
              )
              if is_first_page:
                  rescued = await _dom_fallback()
                  if rescued is not None:
                      return rescued
              return WorksResult(
                  success=False,
                  platform=self.platform_name,
                  error=f"errCode={err_code}, errMsg={err_msg}",
              )

          data = response.get("data") or {}
          raw_list = data.get("list") or []
          total = int(data.get("totalCount") or 0)
          has_more = bool(data.get("continueFlag", False))
          next_page = (data.get("lastBuff") or "").strip()
          print(
              f"[{self.platform_name}] API 响应: list_len={len(raw_list)}, totalCount={total}, continueFlag={has_more}, lastBuff={next_page[:50] if next_page else ''}..."
          )

          # An empty first page may just mean the API is not serving this
          # account; try the DOM before reporting "no works".
          if is_first_page and len(raw_list) == 0:
              rescued = await _dom_fallback()
              if rescued is not None:
                  return rescued

          for item in raw_list:
              try:
                  desc = item.get("desc") or {}
                  # works.platform_video_id is stored as the exportId returned
                  # by post_list (e.g. "export/xxx").
                  work_id = str(
                      item.get("exportId")
                      or item.get("objectId")
                      or item.get("id")
                      or ""
                  ).strip()
                  if not work_id:
                      # Built-in hash() is salted per process, so derive the
                      # synthetic ID from a real digest; ``desc`` is already
                      # None-safe here, so a null desc no longer drops the item.
                      seed = f"{item.get('createTime', 0)}|{desc.get('description') or ''}"
                      work_id = f"weixin_{hashlib.sha256(seed.encode('utf-8')).hexdigest()[:16]}"

                  title = (desc.get("description") or "").strip() or "无标题"

                  cover_url = ""
                  duration = 0
                  media_list = desc.get("media") or []
                  if media_list and isinstance(media_list[0], dict):
                      first_media = media_list[0]
                      cover_url = (
                          first_media.get("coverUrl")
                          or first_media.get("thumbUrl")
                          or ""
                      ).strip()
                      duration = int(first_media.get("videoPlayLen") or 0)

                  create_ts = item.get("createTime") or 0
                  if isinstance(create_ts, (int, float)) and create_ts:
                      publish_time = datetime.fromtimestamp(create_ts).strftime(
                          "%Y-%m-%d %H:%M:%S"
                      )
                  else:
                      publish_time = str(create_ts) if create_ts else ""

                  # In this API likeCount means "recommend" and favCount means "like".
                  works.append(
                      WorkItem(
                          work_id=work_id,
                          title=title,
                          cover_url=cover_url,
                          duration=duration,
                          status="published",
                          publish_time=publish_time,
                          play_count=int(item.get("readCount") or 0),
                          like_count=int(item.get("favCount") or 0),
                          comment_count=int(item.get("commentCount") or 0),
                          share_count=int(item.get("forwardCount") or 0),
                          collect_count=0,
                      )
                  )
              except Exception as e:
                  print(f"[{self.platform_name}] 解析作品项失败: {e}", flush=True)
                  continue

          if total == 0 and works:
              total = len(works)
          print(
              f"[{self.platform_name}] 本页获取 {len(works)} 条,totalCount={total}, next_page={bool(next_page)}"
          )
      except Exception as e:
          import traceback

          traceback.print_exc()
          return WorksResult(success=False, platform=self.platform_name, error=str(e))

      return WorksResult(
          success=True,
          platform=self.platform_name,
          works=works,
          total=total,
          has_more=has_more,
          next_page=next_page,
      )
  async def sync_work_daily_stats_via_browser(
      self, cookies: str, work_id: int, platform_video_id: str
  ) -> dict:
      """Collect one work's daily statistics by driving the browser.

      Flow:
        1. Open the statistic/post page, switch to the single-video tab and
           select the last-30-days range.
        2. Capture the ``post_list`` response and map ``exportId`` to
           ``objectId`` to find the row for ``platform_video_id``.
        3. Click the row's "view" link (``data-row-key=objectId``).
        4. On the detail page select last 30 days and download the table.
        5. Parse the CSV into a list of per-day records.

      Args:
          cookies: Cookie string for the account, parsed by ``parse_cookies``.
          work_id: Local work ID, copied into every returned record.
          platform_video_id: The work's ``exportId`` on the platform.

      Returns:
          A dict with ``success``, ``error``, ``statistics`` (list of per-day
          dicts), ``inserted`` and ``updated`` (both left at 0 here). Errors
          are reported through ``error`` rather than raised. The browser is
          closed and the downloaded file removed before returning.
      """
      import csv
      from pathlib import Path

      result = {
          "success": False,
          "error": "",
          "statistics": [],
          "inserted": 0,
          "updated": 0,
      }
      post_list_data = {"list": []}
      save_path = None  # set once the CSV has a destination; removed in finally

      async def handle_response(response):
          """Remember the list from the most recent successful post_list response."""
          try:
              if (
                  "statistic/post_list" in response.url
                  and response.request.method == "POST"
              ):
                  try:
                      body = await response.json()
                      if body.get("errCode") == 0 and body.get("data"):
                          post_list_data["list"] = body.get("data", {}).get(
                              "list", []
                          )
                  except Exception:
                      pass
          except Exception:
              pass

      def _cell(row, keys, default=""):
          """Value of the first usable column among ``keys``.

          Earlier keys win when non-empty; the last key falls back to
          ``default`` when the column is absent. csv.DictReader fills short
          rows with None, which is mapped to ``default`` so callers can
          safely call str methods on the result.
          """
          *preferred, last = keys
          for key in preferred:
              value = row.get(key)
              if value:
                  return value
          value = row.get(last, default)
          return default if value is None else value

      def _parse_record_date(raw):
          """Parse a date cell into a datetime, or None if unrecognised."""
          head = raw[:10]
          for mark in ("年", "月", "日", "/"):
              head = head.replace(mark, "-")
          if len(head) >= 8 and head.count("-") >= 2:
              # Ignore empty pieces so "2024年1月5日" -> "2024-1-5-" still
              # yields exactly year, month and day.
              pieces = [p for p in head.split("-") if p.strip()]
              if len(pieces) == 3:
                  try:
                      y, m, d = int(pieces[0]), int(pieces[1]), int(pieces[2])
                      if 2000 <= y <= 2100 and 1 <= m <= 12 and 1 <= d <= 31:
                          return datetime(y, m, d)
                  except (ValueError, IndexError):
                      pass
          token = (raw.split()[0] if raw else "")[:10]
          for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m/%d/%Y"):
              try:
                  return datetime.strptime(token, fmt)
              except (ValueError, IndexError):
                  continue
          return None

      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)
          await self.set_cookies(cookie_list)
          if not self.page:
              raise Exception("Page not initialized")
          self.page.on("response", handle_response)

          # 1. Open the statistics page for posts.
          print(f"[{self.platform_name}] 打开数据分析页...", flush=True)
          await self.page.goto(
              "https://channels.weixin.qq.com/platform/statistic/post", timeout=30000
          )
          if not self.headless:
              print(
                  f"[{self.platform_name}] 浏览器已打开,请将窗口置于前台观看操作(等待 5 秒)...",
                  flush=True,
              )
              await asyncio.sleep(5)
          else:
              await asyncio.sleep(3)
          if "login" in self.page.url:
              raise Exception("Cookie 已过期,请重新登录")

          # 2. Switch to the single-video tab.
          tab_sel = "div.weui-desktop-tab__navs ul li:nth-child(2) a"
          try:
              await self.page.wait_for_selector(tab_sel, timeout=8000)
              await self.page.click(tab_sel)
          except Exception:
              tab_sel = "a:has-text('单篇视频')"
              await self.page.click(tab_sel)
          await asyncio.sleep(2)

          # 3. Select the last-30-days range. Selectors go from the most
          #    specific (radio group inside the single-video area) to the
          #    most generic.
          radio_selectors = [
              "div.post-single-wrap div.weui-desktop-radio-group.radio-group label:has-text('近30天')",
              "div.post-single-wrap div.filter-wrap div.weui-desktop-radio-group label:nth-child(2)",
              "div.post-single-wrap div.card-body div.filter-wrap div:nth-child(2) label:nth-child(2)",
              "div.post-single-wrap label:has-text('近30天')",
              "div.weui-desktop-radio-group label:has-text('近30天')",
              "label:has-text('近30天')",
          ]
          clicked = False
          for sel in radio_selectors:
              try:
                  el = self.page.locator(sel).first
                  if await el.count() > 0:
                      await el.click()
                      clicked = True
                      print(
                          f"[{self.platform_name}] 已点击近30天按钮 (selector: {sel[:50]}...)",
                          flush=True,
                      )
                      break
              except Exception:
                  continue
          if not clicked:
              print(
                  f"[{self.platform_name}] 警告: 未找到近30天按钮,继续尝试...",
                  flush=True,
              )
          await asyncio.sleep(3)

          # 4. Resolve exportId -> objectId from the captured post_list data.
          export_id_to_object = {}
          for item in post_list_data["list"]:
              eid = (item.get("exportId") or "").strip()
              oid = (item.get("objectId") or "").strip()
              if eid and oid:
                  export_id_to_object[eid] = oid

          wanted = (platform_video_id or "").strip()
          object_id = export_id_to_object.get(
              platform_video_id
          ) or export_id_to_object.get(wanted)
          if not object_id and wanted:
              # Loose match: platform_video_id may carry a prefix. Skipped for
              # an empty ID, which would otherwise match every exportId.
              for eid, oid in export_id_to_object.items():
                  if wanted in eid or eid in wanted:
                      object_id = oid
                      break
          if not object_id:
              result["error"] = (
                  f"未在 post_list 中匹配到 exportId={platform_video_id}"
              )
              print(f"[{self.platform_name}] {result['error']}", flush=True)
              return result

          # 5. Click "view" on the row whose data-row-key is the objectId.
          view_btn = self.page.locator(
              f'tr[data-row-key="{object_id}"] a.detail-wrap, tr[data-row-key="{object_id}"] a:has-text("查看")'
          )
          try:
              await view_btn.first.wait_for(timeout=5000)
              await view_btn.first.click()
          except Exception as e:
              view_btn = self.page.locator(f'tr[data-row-key="{object_id}"] a')
              if await view_btn.count() > 0:
                  await view_btn.first.click()
              else:
                  raise Exception(f"未找到 objectId={object_id} 的查看按钮: {e}")
          await asyncio.sleep(3)

          # 6. Detail page: select last 30 days, then download the table.
          detail_radio = (
              "div.post-statistic-common div.filter-wrap label:nth-child(2)"
          )
          for sel in [detail_radio, "div.main-body label:has-text('近30天')"]:
              try:
                  el = self.page.locator(sel).first
                  if await el.count() > 0:
                      await el.click()
                      break
              except Exception:
                  continue
          await asyncio.sleep(2)

          # Downloads are written to the tmp directory three levels above this file.
          download_dir = Path(__file__).resolve().parent.parent.parent / "tmp"
          download_dir.mkdir(parents=True, exist_ok=True)
          async with self.page.expect_download(timeout=15000) as download_info:
              download_btn = self.page.locator(
                  "div.post-statistic-common div.filter-extra a, a:has-text('下载表格')"
              )
              if await download_btn.count() == 0:
                  raise Exception("未找到「下载表格」按钮")
              await download_btn.first.click()
          download = await download_info.value
          save_path = download_dir / f"work_{work_id}_{int(time.time())}.csv"
          await download.save_as(save_path)

          # 7. Parse the CSV into per-day statistics.
          with open(
              save_path, "r", encoding="utf-8-sig", errors="replace", newline=""
          ) as f:
              rows = list(csv.DictReader(f))

          stats_list = []
          for row in rows:
              date_val = _cell(row, ("日期", "date", "时间", "时间周期")).strip()
              if not date_val:
                  continue
              dt = _parse_record_date(date_val)
              if not dt:
                  continue
              comp_rate = (
                  _cell(row, ("完播率", "completion_rate"), "0").strip().rstrip("%")
                  or "0"
              )
              avg_dur = _cell(
                  row, ("平均播放时长", "avg_watch_duration"), "0"
              ).strip()
              stats_list.append(
                  {
                      "work_id": work_id,
                      "record_date": dt.strftime("%Y-%m-%d"),
                      "play_count": self._parse_count(
                          _cell(row, ("播放", "播放量", "play_count"), "0")
                      ),
                      "like_count": self._parse_count(
                          _cell(row, ("点赞", "like_count"), "0")
                      ),
                      "comment_count": self._parse_count(
                          _cell(row, ("评论", "comment_count"), "0")
                      ),
                      "share_count": self._parse_count(
                          _cell(row, ("分享", "share_count"), "0")
                      ),
                      "collect_count": self._parse_count(
                          _cell(row, ("收藏", "collect_count"), "0")
                      ),
                      "completion_rate": comp_rate,
                      "avg_watch_duration": avg_dur,
                  }
              )
          result["statistics"] = stats_list
          result["success"] = True
      except Exception as e:
          import traceback

          traceback.print_exc()
          result["error"] = str(e)
      finally:
          # Remove the downloaded file on failure as well as on success.
          if save_path is not None:
              try:
                  save_path.unlink()
              except OSError:
                  pass
          try:
              await self.close_browser()
          except Exception:
              pass
      return result
  async def sync_account_works_daily_stats_via_browser(
      self,
      cookies: str,
      works: List[dict],
      save_fn=None,
      update_works_fn=None,
      headless: bool = True,
  ) -> dict:
      """
      Batch-sync per-day statistics of an account's known works using only the browser.

      Flow:
        1. Open statistic/post, click the "single video" tab, then the
           "last 30 days" radio.
        2. Capture the ``statistic/post_list`` response. When ``update_works_fn``
           is given, its rows are converted to ``yesterday_*`` updates and passed
           to that callback once.
        3. For every post_list row whose ``exportId`` matches one of ``works``:
           open the row's detail view, read the captured
           ``feed_aggreagate_data_by_tab_type`` response, take the "all" tab
           (``tabTypeName == "全部"`` or ``tabType == 999``), and turn its
           per-day arrays into records dated backwards from yesterday
           (index 0 is the earliest day). Records are handed to ``save_fn``.
        4. Go back to the list and continue with the next row.

      Args:
          cookies: Cookie string, parsed by ``self.parse_cookies``.
          works: Dicts carrying ``work_id``/``workId`` and
              ``platform_video_id``/``platformVideoId``.
          save_fn: Optional callable ``(stats_list) -> {"inserted", "updated"}``.
          update_works_fn: Optional callable ``(updates) -> {"updated"}``.
          headless: Only affects the initial wait (5 s with a hint printed when
              False, 3 s otherwise).

      Returns:
          A dict with ``success``, ``error``, ``total_processed``,
          ``total_skipped``, ``inserted``, ``updated`` and ``works_updated``.
          An unexpected exception sets ``success`` to False. An empty post_list
          only sets ``error`` and leaves ``success`` True.
      """
      import traceback
      from datetime import timedelta

      result = {
          "success": True,
          "error": "",
          "total_processed": 0,
          "total_skipped": 0,
          "inserted": 0,
          "updated": 0,
          "works_updated": 0,
      }

      # platform_video_id (exportId) -> work_id
      export_id_to_work = {}
      for w in works:
          pvid = (
              w.get("platform_video_id") or w.get("platformVideoId") or ""
          ).strip()
          wid = w.get("work_id") or w.get("workId")
          if pvid and wid is not None:
              export_id_to_work[pvid] = int(wid)
              # Also index the id without a path-like prefix ("export/xxx" vs "xxx").
              # A trailing "/" would produce an empty key, which must not be stored.
              suffix = pvid.split("/")[-1]
              if "/" in pvid and suffix:
                  export_id_to_work[suffix] = int(wid)

      def match_work_id(export_id):
          """Resolve an exportId to a work_id: exact match first, then substring match."""
          if not export_id:
              # "" is a substring of every key and would pick an arbitrary work.
              return None
          exact = export_id_to_work.get(export_id)
          if exact is not None:
              return exact
          for known_id, known_work in export_id_to_work.items():
              if known_id and (export_id in known_id or known_id in export_id):
                  return known_work
          return None

      # Mutable holders filled by the response listener below.
      post_list_data = {"list": []}
      feed_aggreagate_data = {"body": None}

      async def handle_response(response):
          # Best-effort listener: a non-JSON or malformed response is ignored.
          try:
              url = response.url
              if "statistic/post_list" in url:
                  body = await response.json()
                  if body.get("errCode") == 0 and body.get("data"):
                      post_list_data["list"] = body.get("data", {}).get("list", [])
              elif "feed_aggreagate_data_by_tab_type" in url:
                  body = await response.json()
                  if body.get("errCode") == 0 and body.get("data"):
                      feed_aggreagate_data["body"] = body
          except Exception:
              pass

      async def click_single_video_tab():
          """Click the "single video" tab, falling back to a text selector."""
          tab_sel = "div.weui-desktop-tab__navs ul li:nth-child(2) a"
          try:
              await self.page.wait_for_selector(tab_sel, timeout=8000)
              await self.page.click(tab_sel)
          except Exception:
              await self.page.click("a:has-text('单篇视频')")
          await asyncio.sleep(2)

      async def ensure_single_video_near30():
          """Restore the list view (single video + last 30 days) after go_back."""
          await click_single_video_tab()
          for sel in [
              "div.post-single-wrap div.weui-desktop-radio-group.radio-group label:has-text('近30天')",
              "div.post-single-wrap label:has-text('近30天')",
              "div.weui-desktop-radio-group label:has-text('近30天')",
              "label:has-text('近30天')",
          ]:
              try:
                  el = self.page.locator(sel).first
                  if await el.count() > 0:
                      await el.click()
                      break
              except Exception:
                  continue
          await asyncio.sleep(3)

      async def back_to_list():
          """Leave the detail view and give the list page time to settle."""
          await self.page.go_back()
          await asyncio.sleep(2)

      def value_at(series, i):
          """Parse series[i], treating a missing position as "0"."""
          return self._parse_count(series[i] if i < len(series) else "0")

      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)
          await self.set_cookies(cookie_list)
          if not self.page:
              raise Exception("Page not initialized")
          self.page.on("response", handle_response)

          # 1. Open the statistics page.
          print(f"[{self.platform_name}] 打开数据分析页...", flush=True)
          await self.page.goto(
              "https://channels.weixin.qq.com/platform/statistic/post", timeout=30000
          )
          if not headless:
              print(
                  f"[{self.platform_name}] 浏览器已打开,请将窗口置于前台观看操作(等待 5 秒)...",
                  flush=True,
              )
              await asyncio.sleep(5)
          else:
              await asyncio.sleep(3)
          if "login" in self.page.url:
              raise Exception("Cookie 已过期,请重新登录")

          # 2. Switch to the "single video" tab.
          await click_single_video_tab()

          # 3. Clear the captured list, click "last 30 days", then wait for the
          #    listener to capture the refreshed post_list.
          post_list_data["list"] = []
          radio_selectors = [
              "div.post-single-wrap div.weui-desktop-radio-group.radio-group label:has-text('近30天')",
              "div.post-single-wrap div.filter-wrap div.weui-desktop-radio-group label:nth-child(2)",
              "div.post-single-wrap label:has-text('近30天')",
              "div.weui-desktop-radio-group label:has-text('近30天')",
              "label:has-text('近30天')",
          ]
          clicked = False
          for sel in radio_selectors:
              try:
                  el = self.page.locator(sel).first
                  if await el.count() > 0:
                      await el.click()
                      clicked = True
                      print(
                          f"[{self.platform_name}] 已点击近30天 (selector: {sel[:40]}...)",
                          flush=True,
                      )
                      break
              except Exception:
                  continue
          if not clicked:
              print(f"[{self.platform_name}] 警告: 未找到近30天按钮", flush=True)
          await asyncio.sleep(5)

          # 4. Take the list captured from post_list.
          items = post_list_data["list"]
          if not items:
              result["error"] = "未监听到 post_list 或列表为空"
              print(f"[{self.platform_name}] {result['error']}", flush=True)
              return result

          # 4.5 Convert post_list rows into yesterday_* updates for the works table.
          #     In this payload likeCount is "recommend" and favCount is "like";
          #     fullPlayRate is multiplied by 100 and rendered as a percentage.
          if update_works_fn:
              try:
                  updates = []
                  for it in items:
                      work_id = match_work_id((it.get("exportId") or "").strip())
                      if work_id is None:
                          continue
                      full_play_rate = it.get("fullPlayRate")
                      if full_play_rate is not None:
                          comp_rate = f"{float(full_play_rate) * 100:.2f}%"
                      else:
                          comp_rate = "0"
                      avg_sec = it.get("avgPlayTimeSec")
                      if avg_sec is not None:
                          avg_dur = f"{float(avg_sec):.2f}秒"
                      else:
                          avg_dur = "0"
                      updates.append(
                          {
                              "work_id": work_id,
                              "yesterday_play_count": int(it.get("readCount") or 0),
                              "yesterday_like_count": int(it.get("favCount") or 0),
                              "yesterday_recommend_count": int(it.get("likeCount") or 0),
                              "yesterday_comment_count": int(it.get("commentCount") or 0),
                              "yesterday_share_count": int(it.get("forwardCount") or 0),
                              "yesterday_follow_count": int(it.get("followCount") or 0),
                              "yesterday_completion_rate": comp_rate,
                              "yesterday_avg_watch_duration": avg_dur,
                          }
                      )
                  if updates:
                      try:
                          save_result = update_works_fn(updates)
                          result["works_updated"] = save_result.get("updated", 0)
                      except Exception:
                          # A failing callback must not stop the daily sync below.
                          traceback.print_exc()
              except Exception as e:
                  traceback.print_exc()
                  print(
                      f"[{self.platform_name}] 解析 post_list 更新 works 失败: {e}",
                      flush=True,
                  )

          # 5. Walk the rows and sync every one that maps to a known work.
          processed_export_ids = set()
          for idx, item in enumerate(items):
              eid = (item.get("exportId") or "").strip()
              oid = (item.get("objectId") or "").strip()
              if not oid:
                  continue
              # Safety net against duplicate rows in the list.
              if eid in processed_export_ids:
                  print(
                      f"[{self.platform_name}] 跳过 [{idx + 1}] exportId={eid} (已处理)",
                      flush=True,
                  )
                  continue

              # Resolve the work before touching the page so unmatched rows
              # do not pay for a tab/radio re-click.
              work_id = match_work_id(eid)
              if work_id is None:
                  result["total_skipped"] += 1
                  print(
                      f"[{self.platform_name}] 跳过 [{idx + 1}] exportId={eid} (库中无对应作品)",
                      flush=True,
                  )
                  continue

              # go_back lands on the "all videos" view, so the list has to be restored.
              if idx > 0:
                  await ensure_single_video_near30()

              # Drop the previous work's response BEFORE opening this detail
              # view; otherwise a missing response would silently reuse stale
              # data under the wrong work_id.
              feed_aggreagate_data["body"] = None

              # Click the row's "view" link; the action column may live in the
              # fixed-right table, so that selector is tried first.
              view_selectors = [
                  f'div.ant-table-fixed-right tr[data-row-key="{oid}"] a.detail-wrap',
                  f'tr[data-row-key="{oid}"] a.detail-wrap',
                  f'tr[data-row-key="{oid}"] td a.detail-wrap',
                  f'tr[data-row-key="{oid}"] a:has-text("查看")',
                  f'tr[data-row-key="{oid}"] a',
              ]
              clicked = False
              for sel in view_selectors:
                  view_btn = self.page.locator(sel)
                  if await view_btn.count() > 0:
                      try:
                          await view_btn.first.wait_for(timeout=3000)
                          await view_btn.first.click()
                          clicked = True
                          print(
                              f"[{self.platform_name}] 已点击查看 (selector: {sel[:40]}...)",
                              flush=True,
                          )
                          break
                      except Exception:
                          continue
              if not clicked:
                  print(
                      f"[{self.platform_name}] 未找到 objectId={oid} 的查看按钮",
                      flush=True,
                  )
                  result["total_skipped"] += 1
                  continue

              # The detail view requests feed_aggreagate on load; wait for the
              # listener to capture it (the body is not cleared after the click).
              await asyncio.sleep(3)
              await asyncio.sleep(4)

              body = feed_aggreagate_data.get("body")
              if not body or not body.get("data"):
                  print(
                      f"[{self.platform_name}] work_id={work_id} 未监听到 feed_aggreagate 有效响应",
                      flush=True,
                  )
                  await back_to_list()
                  continue

              # Locate the "all" tab: data.dataByFanstype[].dataByTabtype[].
              tab_all = None
              for fan_item in body.get("data", {}).get("dataByFanstype", []):
                  for tab_item in fan_item.get("dataByTabtype", []):
                      if (
                          tab_item.get("tabTypeName") == "全部"
                          or tab_item.get("tabType") == 999
                      ):
                          tab_all = tab_item.get("data")
                          break
                  if tab_all is not None:
                      break
              if not tab_all:
                  # Fallback: feedData[0].totalData. An empty feedData list
                  # must not raise IndexError and abort the whole batch.
                  feed_data = body.get("data", {}).get("feedData") or [{}]
                  tab_all = (feed_data[0] or {}).get("totalData")
              if not tab_all:
                  print(
                      f"[{self.platform_name}] work_id={work_id} 未找到「全部」数据",
                      flush=True,
                  )
                  await back_to_list()
                  continue

              browse = tab_all.get("browse") or []
              n = len(browse)
              if n == 0:
                  print(
                      f"[{self.platform_name}] work_id={work_id} browse 为空",
                      flush=True,
                  )
                  await back_to_list()
                  continue

              # Dates run from (yesterday - (n - 1)) up to yesterday; index 0 is the earliest.
              today = datetime.now().replace(
                  hour=0, minute=0, second=0, microsecond=0
              )
              yesterday = today - timedelta(days=1)
              start_date = yesterday - timedelta(days=n - 1)

              # In this payload "like" is recommend and "fav" is like.
              like_arr = tab_all.get("like") or []
              comment_arr = tab_all.get("comment") or []
              forward_arr = tab_all.get("forward") or []
              fav_arr = tab_all.get("fav") or []
              follow_arr = tab_all.get("follow") or []

              stats_list = []
              for i in range(n):
                  rec_date = (start_date + timedelta(days=i)).strftime("%Y-%m-%d")
                  stats_list.append(
                      {
                          "work_id": work_id,
                          "record_date": rec_date,
                          "play_count": value_at(browse, i),
                          "like_count": value_at(fav_arr, i),
                          "recommend_count": value_at(like_arr, i),
                          "comment_count": value_at(comment_arr, i),
                          "share_count": value_at(forward_arr, i),
                          "collect_count": 0,
                          "follow_count": value_at(follow_arr, i),
                          "completion_rate": "0",
                          "avg_watch_duration": "0",
                      }
                  )
              print(
                  f"[{self.platform_name}] work_id={work_id} 从 feed_aggreagate 解析得到 {len(stats_list)} 条日统计",
                  flush=True,
              )

              # Persist through the caller-supplied callback.
              if save_fn and stats_list:
                  try:
                      save_result = save_fn(stats_list)
                      result["inserted"] += save_result.get("inserted", 0)
                      result["updated"] += save_result.get("updated", 0)
                  except Exception as e:
                      print(
                          f"[{self.platform_name}] work_id={work_id} 保存失败: {e}",
                          flush=True,
                      )
              result["total_processed"] += 1
              processed_export_ids.add(eid)

              # Return to the list; the next matched row restores the tab/radio state.
              await back_to_list()

          print(
              f"[{self.platform_name}] 批量同步完成: 处理 {result['total_processed']} 个作品, 跳过 {result['total_skipped']} 个",
              flush=True,
          )
      except Exception as e:
          traceback.print_exc()
          result["success"] = False
          result["error"] = str(e)
      finally:
          # Closing is best-effort; a failure here must not hide the result.
          try:
              await self.close_browser()
          except Exception:
              pass
      return result
  async def get_comments(
      self, cookies: str, work_id: str, cursor: str = ""
  ) -> CommentsResult:
      """
      Fetch the comments of one work from the comment management page.

      The method opens the comment page, captures the ``/post/post_list``
      response to learn the order of the works, finds the feed entry that
      corresponds to ``work_id``, clicks it and captures the resulting
      ``/comment/comment_list`` response. Nested (second-level) comments are
      flattened by ``self._extract_comments`` with their ``parent_comment_id``.

      Args:
          cookies: Cookie string, parsed by ``self.parse_cookies``.
          work_id: Identifier compared against each post's ``objectId`` and
              ``objectNonce``. An exact match wins; otherwise the first post
              whose non-empty id is contained in ``work_id`` is used.
          cursor: Accepted for interface compatibility; not used here.

      Returns:
          A ``CommentsResult``. ``success`` is False on any interface or
          browser error. When no post matches, the result is successful with
          an empty comment list.
      """
      print(f"\n{'=' * 60}")
      print(f"[{self.platform_name}] 获取作品评论")
      print(f"[{self.platform_name}] work_id={work_id}")
      print(f"{'=' * 60}")

      comments: List[CommentItem] = []
      total = 0
      has_more = False

      def failure(message):
          """Build the failed result shared by every error path."""
          return CommentsResult(
              success=False,
              platform=self.platform_name,
              work_id=work_id,
              error=message,
          )

      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)
          await self.set_cookies(cookie_list)
          if not self.page:
              raise Exception("Page not initialized")

          # Open the comment management page.
          print(f"[{self.platform_name}] 正在打开评论页面...")
          await self.page.goto(
              "https://channels.weixin.qq.com/platform/interaction/comment",
              timeout=30000,
          )
          await asyncio.sleep(2)

          # A redirect to a login URL means the cookie is no longer valid.
          current_url = self.page.url
          if "login" in current_url:
              raise Exception("Cookie 已过期,请重新登录")

          # === Step 1: capture post_list to get the list of works ===
          posts = []
          try:
              async with self.page.expect_response(
                  lambda res: "/post/post_list" in res.url, timeout=20000
              ) as post_resp_info:
                  await self.page.wait_for_selector(
                      ".scroll-list .comment-feed-wrap", timeout=15000
                  )
              post_resp = await post_resp_info.value
              post_data = await post_resp.json()
              if post_data.get("errCode") == 0:
                  posts = post_data.get("data", {}).get("list", [])
                  print(f"[{self.platform_name}] ✅ 获取 {len(posts)} 个作品")
              else:
                  err_msg = post_data.get("errMsg", "未知错误")
                  print(f"[{self.platform_name}] ❌ post_list 业务错误: {err_msg}")
                  return failure(f"post_list 业务错误: {err_msg}")
          except Exception as e:
              print(f"[{self.platform_name}] ❌ 获取 post_list 失败: {e}")
              return failure(f"获取 post_list 失败: {e}")

          # === Step 2: find the target work among the rendered feed entries ===
          # DOM entries and post_list rows are paired by position.
          feed_wraps = await self.page.query_selector_all(
              ".scroll-list .comment-feed-wrap"
          )
          target_index = -1
          fuzzy_index = -1
          for i in range(min(len(feed_wraps), len(posts))):
              post = posts[i]
              nonce = str(post.get("objectNonce") or "")
              primary = str(post.get("objectId") or "") or nonce
              # Empty ids are dropped: "" is contained in every work_id and
              # would make every post look like a match.
              candidates = [c for c in (primary, nonce) if c]
              if work_id in candidates:
                  target_index = i
                  break
              if fuzzy_index < 0 and any(c in work_id for c in candidates):
                  fuzzy_index = i
          if target_index < 0:
              target_index = fuzzy_index

          if target_index < 0:
              print(f"[{self.platform_name}] ❌ 未找到 work_id={work_id} 对应的作品")
              return CommentsResult(
                  success=True,
                  platform=self.platform_name,
                  work_id=work_id,
                  comments=[],
                  total=0,
                  has_more=False,
              )

          target_feed = feed_wraps[target_index]
          target_post = posts[target_index]
          target_desc = target_post.get("desc") or {}
          found_title = target_desc.get("description", "无标题")
          print(f"[{self.platform_name}] ✅ 找到目标作品: {found_title}")

          # Work info attached to every extracted comment.
          object_nonce = target_post.get("objectNonce", f"nonce_{target_index}")
          work_title = target_desc.get("description", f"作品{target_index + 1}")
          work_info = {"work_id": object_nonce, "work_title": work_title}

          # === Step 3: click the work to trigger comment_list ===
          content_wrap = (
              await target_feed.query_selector(".feed-content") or target_feed
          )
          try:
              async with self.page.expect_response(
                  lambda res: "/comment/comment_list" in res.url, timeout=15000
              ) as comment_resp_info:
                  await content_wrap.click()
                  await asyncio.sleep(0.8)
              comment_resp = await comment_resp_info.value
              comment_data = await comment_resp.json()
              if comment_data.get("errCode") != 0:
                  err_msg = comment_data.get("errMsg", "未知错误")
                  print(f"[{self.platform_name}] ❌ 评论接口错误: {err_msg}")
                  return failure(f"评论接口错误: {err_msg}")

              payload = comment_data.get("data", {})
              raw_comments = payload.get("comment", [])
              total = payload.get("totalCount", len(raw_comments))
              print(
                  f"[{self.platform_name}] 📊 原始评论数: {len(raw_comments)}, 总数: {total}"
              )

              # === Step 4: flatten first- and second-level comments ===
              extracted = self._extract_comments(
                  raw_comments, parent_id="", work_info=work_info
              )

              # === Step 5: convert to CommentItem objects ===
              for c in extracted:
                  comment_item = CommentItem(
                      comment_id=c.get("comment_id") or "",
                      parent_comment_id=c.get("parent_comment_id", ""),
                      work_id=work_id,
                      content=c.get("content", ""),
                      author_id=c.get("username", ""),  # username serves as author_id
                      author_name=c.get("nickname", ""),
                      author_avatar=c.get("avatar", ""),
                      like_count=c.get("like_count", 0),
                      reply_count=0,
                      create_time=c.get("create_time", ""),
                  )
                  # Extra attributes set on the item after construction.
                  comment_item.is_author = c.get("is_author", False)
                  comment_item.create_time_unix = c.get("create_time_unix", 0)
                  comment_item.work_title = c.get("work_title", "")
                  print(comment_item)
                  comments.append(comment_item)

                  author_tag = " 👤(作者)" if c.get("is_author") else ""
                  parent_tag = (
                      f" [回复: {c.get('parent_comment_id', '')}]"
                      if c.get("parent_comment_id")
                      else ""
                  )
                  # `or ""` guards a null content value before slicing.
                  preview = (c.get("content") or "")[:30]
                  print(
                      f"[{self.platform_name}] - [{c.get('nickname', '')}] {preview}... "
                      f"({c.get('create_time', '')}){author_tag}{parent_tag}"
                  )

              # Prefer the interface's continueFlag; otherwise compare counts.
              has_more = (
                  payload.get("continueFlag", False)
                  or len(extracted) < total
              )
              print(
                  f"[{self.platform_name}] ✅ 共提取 {len(comments)} 条评论(含子评论)"
              )
          except Exception as e:
              print(f"[{self.platform_name}] ❌ 获取评论失败: {e}")
              import traceback
              traceback.print_exc()
              return failure(f"获取评论失败: {e}")
      except Exception as e:
          import traceback
          traceback.print_exc()
          return failure(str(e))

      return CommentsResult(
          success=True,
          platform=self.platform_name,
          work_id=work_id,
          comments=comments,
          total=total,
          has_more=has_more,
      )
  2225. def _extract_comments(
  2226. self, comment_list: list, parent_id: str = "", work_info: dict = None
  2227. ) -> list:
  2228. """
  2229. 递归提取一级和二级评论(完全参考 get_weixin_work_comments.py 的 extract_comments 函数)
  2230. Args:
  2231. comment_list: 评论列表(原始接口数据)
  2232. parent_id: 父评论ID(一级评论为空字符串"",二级评论为父级评论ID)
  2233. work_info: 作品信息字典
  2234. Returns:
  2235. list: 扁平化的评论列表,包含一级和二级评论
  2236. """
  2237. result = []
  2238. # 获取当前用户 username(用于判断是否为作者)
  2239. # 优先从环境变量获取,也可通过其他方式配置
  2240. my_username = getattr(self, "my_username", "") or os.environ.get(
  2241. "WEIXIN_MY_USERNAME", ""
  2242. )
  2243. for cmt in comment_list:
  2244. # 处理时间戳
  2245. create_ts = int(cmt.get("commentCreatetime", 0) or 0)
  2246. readable_time = (
  2247. datetime.fromtimestamp(create_ts).strftime("%Y-%m-%d %H:%M:%S")
  2248. if create_ts > 0
  2249. else ""
  2250. )
  2251. # 判断是否作者(如果配置了 my_username)
  2252. username = cmt.get("username", "") or ""
  2253. is_author = (my_username != "") and (username == my_username)
  2254. # 构建评论条目 - 完全参考 get_weixin_work_comments.py 的字段
  2255. entry = {
  2256. "work_id": work_info.get("work_id", "") if work_info else "",
  2257. "work_title": work_info.get("work_title", "") if work_info else "",
  2258. "comment_id": cmt.get("commentId"),
  2259. "parent_comment_id": parent_id, # 关键:一级评论为空字符串"",二级评论为父评论ID
  2260. "username": username,
  2261. "nickname": cmt.get("commentNickname", ""),
  2262. "avatar": cmt.get("commentHeadurl", ""),
  2263. "content": cmt.get("commentContent", ""),
  2264. "create_time_unix": create_ts,
  2265. "create_time": readable_time,
  2266. "is_author": is_author,
  2267. "like_count": cmt.get("commentLikeCount", 0) or 0,
  2268. }
  2269. result.append(entry)
  2270. # 递归处理二级评论(levelTwoComment)
  2271. # 关键:二级评论的 parent_id 应该是当前这条评论的 comment_id
  2272. level_two = cmt.get("levelTwoComment", []) or []
  2273. if level_two and isinstance(level_two, list) and len(level_two) > 0:
  2274. # 当前评论的 ID 作为其子评论的 parent_id
  2275. current_comment_id = cmt.get("commentId", "")
  2276. result.extend(
  2277. self._extract_comments(
  2278. level_two, parent_id=current_comment_id, work_info=work_info
  2279. )
  2280. )
  2281. return result
  async def auto_reply_private_messages(self, cookies: str) -> dict:
      """
      Open the private-message page and reply to pending conversations.

      Both tabs ("打招呼消息" then "私信") are processed through
      ``self._process_tab_sessions``.

      Args:
          cookies: Cookie string, parsed by ``self.parse_cookies``.

      Returns:
          On success: ``{"success": True, "platform", "replied_count",
          "message"}``. On any error: ``{"success": False, "platform",
          "error"}``.
      """
      print(f"\n{'=' * 60}")
      print(f"[{self.platform_name}] 开始自动回复私信")
      print(f"{'=' * 60}")
      try:
          await self.init_browser()
          cookie_list = self.parse_cookies(cookies)
          await self.set_cookies(cookie_list)
          if not self.page:
              raise Exception("Page not initialized")

          # Open the private-message page.
          await self.page.goto(
              "https://channels.weixin.qq.com/platform/private_msg", timeout=30000
          )
          await asyncio.sleep(3)

          # A redirect to a login URL means the cookie is no longer valid.
          current_url = self.page.url
          print(f"[{self.platform_name}] 当前 URL: {current_url}")
          if "login" in current_url:
              raise Exception("Cookie 已过期,请重新登录")

          # Wait for the page using the primary selector, then a fallback.
          # `except Exception` (not a bare except) keeps task cancellation
          # and KeyboardInterrupt propagating.
          try:
              await self.page.wait_for_selector(
                  ".private-msg-list-header", timeout=15000
              )
          except Exception:
              try:
                  await self.page.wait_for_selector(
                      ".weui-desktop-tab__navs__inner", timeout=10000
                  )
                  print(f"[{self.platform_name}] 使用备用选择器加载成功")
              except Exception:
                  import time

                  # Wall-clock timestamp so screenshots from separate runs
                  # do not share a name.
                  screenshot_path = f"weixin_private_msg_{int(time.time())}.png"
                  try:
                      await self.page.screenshot(path=screenshot_path)
                      print(
                          f"[{self.platform_name}] 页面加载失败,截图: {screenshot_path}"
                      )
                  except Exception:
                      # The screenshot is only a debugging aid; its failure
                      # must not replace the timeout error raised below.
                      pass
                  raise Exception(f"私信页面加载超时,当前 URL: {current_url}")
          print(f"[{self.platform_name}] 私信页面加载完成")

          # Process both tabs and add up the replies.
          total_replied = 0
          for tab_name in ["打招呼消息", "私信"]:
              total_replied += await self._process_tab_sessions(tab_name)

          print(f"[{self.platform_name}] 自动回复完成,共回复 {total_replied} 条消息")
          return {
              "success": True,
              "platform": self.platform_name,
              "replied_count": total_replied,
              "message": f"成功回复 {total_replied} 条私信",
          }
      except Exception as e:
          import traceback
          traceback.print_exc()
          return {"success": False, "platform": self.platform_name, "error": str(e)}
  async def _process_tab_sessions(self, tab_name: str) -> int:
      """
      Reply to every conversation listed under one tab of the message page.

      A conversation gets a reply when its extracted history is empty or its
      last message was not sent by the account owner. The reply text comes
      from ``self._generate_reply_with_ai`` and falls back to
      ``self._generate_reply`` when that returns nothing.

      Args:
          tab_name: "私信" (first tab) or "打招呼消息" (second tab). Any other
              value returns 0.

      Returns:
          Number of replies sent. Errors are printed, never raised.
      """
      print(f"\n🔄 正在处理「{tab_name}」中的所有会话...")
      if not self.page:
          return 0

      # Position of each supported tab inside the tab bar.
      tab_positions = {"私信": 0, "打招呼消息": 1}
      replied_count = 0
      try:
          if tab_name not in tab_positions:
              return 0
          tab_link = (
              self.page.locator(".weui-desktop-tab__navs__inner li")
              .nth(tab_positions[tab_name])
              .locator("a")
          )
          if not await tab_link.is_visible():
              print(f" ❌ 「{tab_name}」tab 不可见")
              return 0
          await tab_link.click()
          print(f" ➤ 已点击「{tab_name}」tab")

          # Wait until either a session or the empty placeholder is rendered.
          # `except Exception` keeps task cancellation propagating.
          try:
              await self.page.wait_for_function(
                  """
                  () => {
                  const hasSession = document.querySelectorAll('.session-wrap').length > 0;
                  const hasEmpty = !!document.querySelector('.empty-text');
                  return hasSession || hasEmpty;
                  }
                  """,
                  timeout=8000,
              )
              print(" ✅ 会话列表区域已加载")
          except Exception:
              print(" ⚠️ 等待会话列表超时,继续尝试读取...")

          session_count = await self.page.locator(".session-wrap").count()
          print(f" 💬 共找到 {session_count} 个会话")
          if session_count == 0:
              return 0

          for idx in range(session_count):
              try:
                  # Re-query on every pass: the list may have changed.
                  current_sessions = self.page.locator(".session-wrap")
                  if idx >= await current_sessions.count():
                      break
                  session = current_sessions.nth(idx)
                  user_name = await session.locator(".name").inner_text()
                  last_preview = await session.locator(".feed-info").inner_text()
                  print(
                      f"\n ➤ [{idx + 1}/{session_count}] 正在处理: {user_name} | 最后消息: {last_preview}"
                  )
                  await session.click()
                  await asyncio.sleep(2)

                  history = await self._extract_chat_history()
                  if history and history[-1]["is_author"]:
                      print(" ➤ 最后一条是我发的,跳过回复")
                      continue

                  reply_text = await self._generate_reply_with_ai(history)
                  if not reply_text:
                      # Covers both "" and None from the AI path.
                      reply_text = self._generate_reply(history)
                  if not reply_text:
                      print(" ➤ 无需回复")
                      continue

                  print(f" 📝 回复内容: {reply_text}")
                  try:
                      textarea = self.page.locator(".edit_area").first
                      send_btn = self.page.locator('button:has-text("发送")').first
                      if await textarea.is_visible() and await send_btn.is_visible():
                          await textarea.fill(reply_text)
                          await asyncio.sleep(0.5)
                          await send_btn.click()
                          print(" ✅ 已发送")
                          replied_count += 1
                          await asyncio.sleep(1.5)
                      else:
                          print(" ❌ 输入框或发送按钮不可见")
                  except Exception as e:
                      print(f" ❌ 发送失败: {e}")
              except Exception as e:
                  # One broken conversation must not stop the others.
                  print(f" ❌ 处理会话 {idx + 1} 时出错: {e}")
                  continue
      except Exception as e:
          print(f"❌ 处理「{tab_name}」失败: {e}")
      return replied_count
  async def _extract_chat_history(self) -> list:
      """
      Read the messages of the currently opened conversation.

      A message wrapper containing ``.content-right`` is treated as sent by
      the account owner; one containing ``.content-left`` as sent by the
      other user. Wrappers with neither marker, or with no text, are skipped.

      Returns:
          A list of dicts with ``author``, ``content``, ``is_author`` and
          ``avatar``, in page order. Returns ``[]`` when there is no page.
      """
      if not self.page:
          return []

      history = []
      message_wrappers = self.page.locator(
          ".session-content-wrapper > div:not(.footer) > .text-wrapper"
      )
      count = await message_wrappers.count()
      for i in range(count):
          try:
              wrapper = message_wrappers.nth(i)

              # Direction decides who wrote the message.
              is_right = await wrapper.locator(".content-right").count() > 0
              is_left = await wrapper.locator(".content-left").count() > 0
              if not (is_left or is_right):
                  continue

              # all_inner_texts() returns one string per matching element,
              # so a wrapper with several text nodes keeps all of them
              # rather than failing the single-element requirement.
              parts = await wrapper.locator("pre.message-plain").all_inner_texts()
              content = "\n".join(parts).strip()
              if not content:
                  continue

              avatar_img = wrapper.locator(".avatar").first
              avatar_src = ""
              if await avatar_img.count() > 0:
                  avatar_src = await avatar_img.get_attribute("src") or ""

              if is_left:
                  author_name = "用户"
                  # `.first` matches how the avatar is read above and avoids
                  # an error when the selector matches more than one node.
                  name_el = wrapper.locator(".profile .name").first
                  if await name_el.count() > 0:
                      author_name = await name_el.inner_text()
              else:
                  author_name = "我"

              history.append(
                  {
                      "author": author_name,
                      "content": content,
                      # Right-aligned messages are the account owner's.
                      "is_author": is_right,
                      "avatar": avatar_src,
                  }
              )
          except Exception as e:
              # A message that cannot be parsed is reported and skipped.
              print(f" ⚠️ 解析第 {i + 1} 条消息失败: {e}")
              continue
      return history
  2498. async def _generate_reply_with_ai(self, chat_history: list) -> str:
  2499. """使用 AI 生成智能回复"""
  2500. import requests
  2501. import json
  2502. try:
  2503. # 获取 AI 配置
  2504. ai_api_key = os.environ.get("DASHSCOPE_API_KEY", "")
  2505. ai_base_url = os.environ.get(
  2506. "DASHSCOPE_BASE_URL",
  2507. "https://dashscope.aliyuncs.com/compatible-mode/v1",
  2508. )
  2509. ai_model = os.environ.get("AI_MODEL", "qwen-plus")
  2510. if not ai_api_key:
  2511. print("⚠️ 未配置 AI API Key,使用规则回复")
  2512. return self._generate_reply(chat_history)
  2513. # 构建对话上下文
  2514. messages = [
  2515. {
  2516. "role": "system",
  2517. "content": "你是一个友好的微信视频号创作者助手,负责回复粉丝私信。请保持简洁、友好、专业的语气。回复长度不超过20字。",
  2518. }
  2519. ]
  2520. for msg in chat_history:
  2521. role = "assistant" if msg["is_author"] else "user"
  2522. messages.append({"role": role, "content": msg["content"]})
  2523. # 调用 AI API
  2524. headers = {
  2525. "Authorization": f"Bearer {ai_api_key}",
  2526. "Content-Type": "application/json",
  2527. }
  2528. payload = {
  2529. "model": ai_model,
  2530. "messages": messages,
  2531. "max_tokens": 150,
  2532. "temperature": 0.8,
  2533. }
  2534. print(" 🤖 正在调用 AI 生成回复...")
  2535. response = requests.post(
  2536. f"{ai_base_url}/chat/completions",
  2537. headers=headers,
  2538. json=payload,
  2539. timeout=30,
  2540. )
  2541. if response.status_code != 200:
  2542. print(f" ⚠️ AI API 返回错误 {response.status_code},使用规则回复")
  2543. return self._generate_reply(chat_history)
  2544. result = response.json()
  2545. ai_reply = (
  2546. result.get("choices", [{}])[0]
  2547. .get("message", {})
  2548. .get("content", "")
  2549. .strip()
  2550. )
  2551. if ai_reply:
  2552. print(f" ✅ AI 生成回复: {ai_reply}")
  2553. return ai_reply
  2554. else:
  2555. print(" ⚠️ AI 返回空内容,使用规则回复")
  2556. return self._generate_reply(chat_history)
  2557. except Exception as e:
  2558. print(f" ⚠️ AI 回复生成失败: {e},使用规则回复")
  2559. return self._generate_reply(chat_history)
  2560. def _generate_reply(self, chat_history: list) -> str:
  2561. """根据完整聊天历史生成回复(规则回复方式)"""
  2562. if not chat_history:
  2563. return "你好!感谢联系~"
  2564. # 检查最后一条是否是作者发的
  2565. if chat_history[-1]["is_author"]:
  2566. return "" # 不回复
  2567. # 找最后一条用户消息
  2568. last_user_msg = chat_history[-1]["content"]
  2569. # 简单规则回复
  2570. if "谢谢" in last_user_msg or "感谢" in last_user_msg:
  2571. return "不客气!欢迎常来交流~"
  2572. elif "你好" in last_user_msg or "在吗" in last_user_msg:
  2573. return "你好!请问有什么可以帮您的?"
  2574. elif "视频" in last_user_msg or "怎么拍" in last_user_msg:
  2575. return "视频是用手机拍摄的,注意光线和稳定哦!"
  2576. else:
  2577. return "收到!我会认真阅读您的留言~"