douyin.py 41 KB


  1. # -*- coding: utf-8 -*-
  2. """
  3. 抖音视频发布器
  4. 参考: matrix/douyin_uploader/main.py
  5. """
  6. import asyncio
  7. import os
  8. import json
  9. from datetime import datetime
  10. from typing import List
  11. from .base import (
  12. BasePublisher, PublishParams, PublishResult,
  13. WorkItem, WorksResult, CommentItem, CommentsResult
  14. )
class DouyinPublisher(BasePublisher):
    """
    Douyin video publisher.

    Drives the Douyin creator center through Playwright browser automation.
    """

    # Platform identifier; used as the log prefix and as the ``platform``
    # field of every result object built by this class.
    platform_name = "douyin"
    # Creator-center landing page.
    # NOTE(review): not referenced by the methods in this class; presumably
    # consumed by the base class for login flows - confirm.
    login_url = "https://creator.douyin.com/"
    # Video upload page of the creator center (same URL that publish() opens).
    publish_url = "https://creator.douyin.com/creator-micro/content/upload"
    # NOTE(review): presumably the domain applied when cookies are installed
    # by the base class - confirm against BasePublisher.set_cookies.
    cookie_domain = ".douyin.com"
  24. async def set_schedule_time(self, publish_date: datetime):
  25. """设置定时发布"""
  26. if not self.page:
  27. return
  28. # 选择定时发布
  29. label_element = self.page.locator("label.radio-d4zkru:has-text('定时发布')")
  30. await label_element.click()
  31. await asyncio.sleep(1)
  32. # 输入时间
  33. publish_date_str = publish_date.strftime("%Y-%m-%d %H:%M")
  34. await self.page.locator('.semi-input[placeholder="日期和时间"]').click()
  35. await self.page.keyboard.press("Control+KeyA")
  36. await self.page.keyboard.type(str(publish_date_str))
  37. await self.page.keyboard.press("Enter")
  38. await asyncio.sleep(1)
  39. async def handle_upload_error(self, video_path: str):
  40. """处理上传错误,重新上传"""
  41. if not self.page:
  42. return
  43. print(f"[{self.platform_name}] 视频出错了,重新上传中...")
  44. await self.page.locator('div.progress-div [class^="upload-btn-input"]').set_input_files(video_path)
  45. async def check_captcha(self) -> dict:
  46. """
  47. 检查页面是否需要验证码
  48. 返回: {'need_captcha': bool, 'captcha_type': str}
  49. """
  50. if not self.page:
  51. return {'need_captcha': False, 'captcha_type': ''}
  52. try:
  53. # 检查手机验证码弹窗
  54. phone_captcha_selectors = [
  55. 'text="请输入验证码"',
  56. 'text="输入手机验证码"',
  57. 'text="获取验证码"',
  58. 'text="手机号验证"',
  59. '[class*="captcha"][class*="phone"]',
  60. '[class*="verify"][class*="phone"]',
  61. '[class*="sms-code"]',
  62. 'input[placeholder*="验证码"]',
  63. ]
  64. for selector in phone_captcha_selectors:
  65. try:
  66. if await self.page.locator(selector).count() > 0:
  67. print(f"[{self.platform_name}] 检测到手机验证码: {selector}", flush=True)
  68. return {'need_captcha': True, 'captcha_type': 'phone'}
  69. except:
  70. pass
  71. # 检查滑块验证码
  72. slider_captcha_selectors = [
  73. '[class*="captcha"][class*="slider"]',
  74. '[class*="slide-verify"]',
  75. '[class*="drag-verify"]',
  76. 'text="按住滑块"',
  77. 'text="向右滑动"',
  78. 'text="拖动滑块"',
  79. ]
  80. for selector in slider_captcha_selectors:
  81. try:
  82. if await self.page.locator(selector).count() > 0:
  83. print(f"[{self.platform_name}] 检测到滑块验证码: {selector}", flush=True)
  84. return {'need_captcha': True, 'captcha_type': 'slider'}
  85. except:
  86. pass
  87. # 检查图片验证码
  88. image_captcha_selectors = [
  89. '[class*="captcha"][class*="image"]',
  90. '[class*="verify-image"]',
  91. 'text="点击图片"',
  92. 'text="选择正确的"',
  93. ]
  94. for selector in image_captcha_selectors:
  95. try:
  96. if await self.page.locator(selector).count() > 0:
  97. print(f"[{self.platform_name}] 检测到图片验证码: {selector}", flush=True)
  98. return {'need_captcha': True, 'captcha_type': 'image'}
  99. except:
  100. pass
  101. # 检查登录弹窗(Cookie 过期)
  102. login_selectors = [
  103. 'text="请先登录"',
  104. 'text="登录后继续"',
  105. '[class*="login-modal"]',
  106. '[class*="login-dialog"]',
  107. ]
  108. for selector in login_selectors:
  109. try:
  110. if await self.page.locator(selector).count() > 0:
  111. print(f"[{self.platform_name}] 检测到需要登录: {selector}", flush=True)
  112. return {'need_captcha': True, 'captcha_type': 'login'}
  113. except:
  114. pass
  115. except Exception as e:
  116. print(f"[{self.platform_name}] 验证码检测异常: {e}", flush=True)
  117. return {'need_captcha': False, 'captcha_type': ''}
  118. async def publish(self, cookies: str, params: PublishParams) -> PublishResult:
  119. """发布视频到抖音 - 参考 matrix/douyin_uploader/main.py"""
  120. print(f"\n{'='*60}")
  121. print(f"[{self.platform_name}] 开始发布视频")
  122. print(f"[{self.platform_name}] 视频路径: {params.video_path}")
  123. print(f"[{self.platform_name}] 标题: {params.title}")
  124. print(f"[{self.platform_name}] Headless: {self.headless}")
  125. print(f"{'='*60}")
  126. self.report_progress(5, "正在初始化浏览器...")
  127. # 初始化浏览器
  128. await self.init_browser()
  129. print(f"[{self.platform_name}] 浏览器初始化完成")
  130. # 解析并设置 cookies
  131. cookie_list = self.parse_cookies(cookies)
  132. print(f"[{self.platform_name}] 解析到 {len(cookie_list)} 个 cookies")
  133. await self.set_cookies(cookie_list)
  134. if not self.page:
  135. raise Exception("Page not initialized")
  136. # 检查视频文件
  137. if not os.path.exists(params.video_path):
  138. raise Exception(f"视频文件不存在: {params.video_path}")
  139. print(f"[{self.platform_name}] 视频文件存在,大小: {os.path.getsize(params.video_path)} bytes")
  140. self.report_progress(10, "正在打开上传页面...")
  141. # 访问上传页面 - 参考 matrix
  142. await self.page.goto("https://creator.douyin.com/creator-micro/content/upload")
  143. print(f"[{self.platform_name}] 等待页面加载...")
  144. try:
  145. await self.page.wait_for_url("https://creator.douyin.com/creator-micro/content/upload", timeout=30000)
  146. except:
  147. pass
  148. await asyncio.sleep(3)
  149. # 检查当前 URL 和页面状态
  150. current_url = self.page.url
  151. print(f"[{self.platform_name}] 当前 URL: {current_url}")
  152. # 检查是否在登录页面或需要登录
  153. if "login" in current_url or "passport" in current_url:
  154. screenshot_base64 = await self.capture_screenshot()
  155. return PublishResult(
  156. success=False,
  157. platform=self.platform_name,
  158. error="Cookie 已过期,需要重新登录",
  159. need_captcha=True,
  160. captcha_type='login',
  161. screenshot_base64=screenshot_base64,
  162. page_url=current_url,
  163. status='need_captcha'
  164. )
  165. # 使用 AI 检测验证码
  166. ai_captcha_result = await self.ai_check_captcha()
  167. if ai_captcha_result['has_captcha']:
  168. print(f"[{self.platform_name}] AI检测到验证码: {ai_captcha_result['captcha_type']}", flush=True)
  169. screenshot_base64 = await self.capture_screenshot()
  170. return PublishResult(
  171. success=False,
  172. platform=self.platform_name,
  173. error=f"检测到{ai_captcha_result['captcha_type']}验证码,需要使用有头浏览器完成验证",
  174. need_captcha=True,
  175. captcha_type=ai_captcha_result['captcha_type'],
  176. screenshot_base64=screenshot_base64,
  177. page_url=current_url,
  178. status='need_captcha'
  179. )
  180. # 传统方式检测验证码
  181. captcha_result = await self.check_captcha()
  182. if captcha_result['need_captcha']:
  183. print(f"[{self.platform_name}] 传统方式检测到验证码: {captcha_result['captcha_type']}", flush=True)
  184. screenshot_base64 = await self.capture_screenshot()
  185. return PublishResult(
  186. success=False,
  187. platform=self.platform_name,
  188. error=f"需要{captcha_result['captcha_type']}验证码,请使用有头浏览器完成验证",
  189. need_captcha=True,
  190. captcha_type=captcha_result['captcha_type'],
  191. screenshot_base64=screenshot_base64,
  192. page_url=current_url,
  193. status='need_captcha'
  194. )
  195. self.report_progress(15, "正在选择视频文件...")
  196. # 点击上传区域 - 参考 matrix: div.container-drag-info-Tl0RGH 或带 container-drag 的 div
  197. upload_selectors = [
  198. "div[class*='container-drag-info']",
  199. "div[class*='container-drag']",
  200. "div.upload-btn",
  201. "div[class*='upload']",
  202. ]
  203. upload_success = False
  204. for selector in upload_selectors:
  205. try:
  206. upload_div = self.page.locator(selector).first
  207. if await upload_div.count() > 0:
  208. print(f"[{self.platform_name}] 找到上传区域: {selector}")
  209. async with self.page.expect_file_chooser(timeout=10000) as fc_info:
  210. await upload_div.click()
  211. file_chooser = await fc_info.value
  212. await file_chooser.set_files(params.video_path)
  213. upload_success = True
  214. print(f"[{self.platform_name}] 视频文件已选择")
  215. break
  216. except Exception as e:
  217. print(f"[{self.platform_name}] 选择器 {selector} 失败: {e}")
  218. if not upload_success:
  219. screenshot_base64 = await self.capture_screenshot()
  220. return PublishResult(
  221. success=False,
  222. platform=self.platform_name,
  223. error="未找到上传入口",
  224. screenshot_base64=screenshot_base64,
  225. page_url=await self.get_page_url(),
  226. status='failed'
  227. )
  228. # 等待跳转到发布页面 - 参考 matrix
  229. self.report_progress(20, "等待进入发布页面...")
  230. for i in range(60):
  231. try:
  232. # matrix 等待的 URL: https://creator.douyin.com/creator-micro/content/post/video?enter_from=publish_page
  233. await self.page.wait_for_url(
  234. "https://creator.douyin.com/creator-micro/content/post/video*",
  235. timeout=2000
  236. )
  237. print(f"[{self.platform_name}] 已进入发布页面")
  238. break
  239. except:
  240. print(f"[{self.platform_name}] 等待进入发布页面... {i+1}/60")
  241. await asyncio.sleep(1)
  242. await asyncio.sleep(2)
  243. self.report_progress(30, "正在填充标题和话题...")
  244. # 填写标题 - 参考 matrix
  245. title_input = self.page.get_by_text('作品标题').locator("..").locator(
  246. "xpath=following-sibling::div[1]").locator("input")
  247. if await title_input.count():
  248. await title_input.fill(params.title[:30])
  249. print(f"[{self.platform_name}] 标题已填写")
  250. else:
  251. # 备用方式 - 参考 matrix
  252. title_container = self.page.locator(".notranslate")
  253. await title_container.click()
  254. await self.page.keyboard.press("Backspace")
  255. await self.page.keyboard.press("Control+KeyA")
  256. await self.page.keyboard.press("Delete")
  257. await self.page.keyboard.type(params.title)
  258. await self.page.keyboard.press("Enter")
  259. print(f"[{self.platform_name}] 标题已填写(备用方式)")
  260. # 添加话题标签 - 参考 matrix
  261. if params.tags:
  262. css_selector = ".zone-container"
  263. for index, tag in enumerate(params.tags, start=1):
  264. print(f"[{self.platform_name}] 正在添加第{index}个话题: #{tag}")
  265. await self.page.type(css_selector, "#" + tag)
  266. await self.page.press(css_selector, "Space")
  267. self.report_progress(40, "等待视频上传完成...")
  268. # 等待视频上传完成 - 参考 matrix: 检测"重新上传"按钮
  269. for i in range(120):
  270. try:
  271. count = await self.page.locator("div").filter(has_text="重新上传").count()
  272. if count > 0:
  273. print(f"[{self.platform_name}] 视频上传完毕")
  274. break
  275. else:
  276. print(f"[{self.platform_name}] 正在上传视频中... {i+1}/120")
  277. # 检查上传错误
  278. if await self.page.locator('div.progress-div > div:has-text("上传失败")').count():
  279. print(f"[{self.platform_name}] 发现上传出错了,重新上传...")
  280. await self.handle_upload_error(params.video_path)
  281. await asyncio.sleep(3)
  282. except:
  283. print(f"[{self.platform_name}] 正在上传视频中...")
  284. await asyncio.sleep(3)
  285. self.report_progress(60, "处理视频设置...")
  286. # 点击"我知道了"弹窗 - 参考 matrix
  287. known_count = await self.page.get_by_role("button", name="我知道了").count()
  288. if known_count > 0:
  289. await self.page.get_by_role("button", name="我知道了").nth(0).click()
  290. print(f"[{self.platform_name}] 关闭弹窗")
  291. await asyncio.sleep(5)
  292. # 设置位置 - 参考 matrix
  293. try:
  294. await self.page.locator('div.semi-select span:has-text("输入地理位置")').click()
  295. await asyncio.sleep(1)
  296. await self.page.keyboard.press("Backspace")
  297. await self.page.keyboard.press("Control+KeyA")
  298. await self.page.keyboard.press("Delete")
  299. await self.page.keyboard.type(params.location)
  300. await asyncio.sleep(1)
  301. await self.page.locator('div[role="listbox"] [role="option"]').first.click()
  302. print(f"[{self.platform_name}] 位置设置成功: {params.location}")
  303. except Exception as e:
  304. print(f"[{self.platform_name}] 设置位置失败: {e}")
  305. # 开启头条/西瓜同步 - 参考 matrix
  306. try:
  307. third_part_element = '[class^="info"] > [class^="first-part"] div div.semi-switch'
  308. if await self.page.locator(third_part_element).count():
  309. class_name = await self.page.eval_on_selector(
  310. third_part_element, 'div => div.className')
  311. if 'semi-switch-checked' not in class_name:
  312. await self.page.locator(third_part_element).locator(
  313. 'input.semi-switch-native-control').click()
  314. print(f"[{self.platform_name}] 已开启头条/西瓜同步")
  315. except:
  316. pass
  317. # 定时发布
  318. if params.publish_date:
  319. self.report_progress(70, "设置定时发布...")
  320. await self.set_schedule_time(params.publish_date)
  321. self.report_progress(80, "正在发布...")
  322. print(f"[{self.platform_name}] 查找发布按钮...")
  323. # 点击发布 - 参考 matrix
  324. for i in range(30):
  325. try:
  326. # 检查验证码(不要在每次循环都调 AI,太慢)
  327. if i % 5 == 0:
  328. ai_captcha = await self.ai_check_captcha()
  329. if ai_captcha['has_captcha']:
  330. print(f"[{self.platform_name}] AI检测到发布过程中需要验证码: {ai_captcha['captcha_type']}", flush=True)
  331. screenshot_base64 = await self.capture_screenshot()
  332. page_url = await self.get_page_url()
  333. return PublishResult(
  334. success=False,
  335. platform=self.platform_name,
  336. error=f"发布过程中需要{ai_captcha['captcha_type']}验证码,请使用有头浏览器完成验证",
  337. need_captcha=True,
  338. captcha_type=ai_captcha['captcha_type'],
  339. screenshot_base64=screenshot_base64,
  340. page_url=page_url,
  341. status='need_captcha'
  342. )
  343. publish_btn = self.page.get_by_role('button', name="发布", exact=True)
  344. btn_count = await publish_btn.count()
  345. if btn_count > 0:
  346. print(f"[{self.platform_name}] 点击发布按钮...")
  347. await publish_btn.click()
  348. # 等待跳转到内容管理页面 - 参考 matrix
  349. await self.page.wait_for_url(
  350. "https://creator.douyin.com/creator-micro/content/manage",
  351. timeout=5000
  352. )
  353. self.report_progress(100, "发布成功")
  354. print(f"[{self.platform_name}] 视频发布成功!")
  355. screenshot_base64 = await self.capture_screenshot()
  356. page_url = await self.get_page_url()
  357. return PublishResult(
  358. success=True,
  359. platform=self.platform_name,
  360. message="发布成功",
  361. screenshot_base64=screenshot_base64,
  362. page_url=page_url,
  363. status='success'
  364. )
  365. except Exception as e:
  366. current_url = self.page.url
  367. # 检查是否已经在管理页面
  368. if "https://creator.douyin.com/creator-micro/content/manage" in current_url:
  369. self.report_progress(100, "发布成功")
  370. print(f"[{self.platform_name}] 视频发布成功!")
  371. screenshot_base64 = await self.capture_screenshot()
  372. return PublishResult(
  373. success=True,
  374. platform=self.platform_name,
  375. message="发布成功",
  376. screenshot_base64=screenshot_base64,
  377. page_url=current_url,
  378. status='success'
  379. )
  380. else:
  381. print(f"[{self.platform_name}] 视频正在发布中... {i+1}/30, URL: {current_url}")
  382. await asyncio.sleep(1)
  383. # 发布超时
  384. print(f"[{self.platform_name}] 发布超时,获取截图...")
  385. screenshot_base64 = await self.capture_screenshot()
  386. page_url = await self.get_page_url()
  387. return PublishResult(
  388. success=False,
  389. platform=self.platform_name,
  390. error="发布超时,请检查发布状态",
  391. screenshot_base64=screenshot_base64,
  392. page_url=page_url,
  393. status='need_action'
  394. )
  395. async def get_works(self, cookies: str, page: int = 0, page_size: int = 20) -> WorksResult:
  396. """获取抖音作品列表"""
  397. print(f"\n{'='*60}")
  398. print(f"[{self.platform_name}] 获取作品列表")
  399. print(f"[{self.platform_name}] page={page}, page_size={page_size}")
  400. print(f"{'='*60}")
  401. works: List[WorkItem] = []
  402. total = 0
  403. has_more = False
  404. try:
  405. await self.init_browser()
  406. cookie_list = self.parse_cookies(cookies)
  407. await self.set_cookies(cookie_list)
  408. if not self.page:
  409. raise Exception("Page not initialized")
  410. # 访问创作者中心首页以触发登录验证
  411. await self.page.goto("https://creator.douyin.com/creator-micro/home")
  412. await asyncio.sleep(3)
  413. # 检查登录状态
  414. current_url = self.page.url
  415. if "login" in current_url or "passport" in current_url:
  416. raise Exception("Cookie 已过期,请重新登录")
  417. # 调用作品列表 API
  418. cursor = page * page_size
  419. # 移除 scene=star_atlas 和 aid=1128,使用更通用的参数
  420. api_url = f"https://creator.douyin.com/janus/douyin/creator/pc/work_list?status=0&device_platform=android&count={page_size}&max_cursor={cursor}&cookie_enabled=true&browser_language=zh-CN&browser_platform=Win32&browser_name=Mozilla&browser_online=true&timezone_name=Asia%2FShanghai"
  421. response = await self.page.evaluate(f'''
  422. async () => {{
  423. try {{
  424. const resp = await fetch("{api_url}", {{
  425. credentials: 'include',
  426. headers: {{ 'Accept': 'application/json' }}
  427. }});
  428. return await resp.json();
  429. }} catch (e) {{
  430. return {{ error: e.toString() }};
  431. }}
  432. }}
  433. ''')
  434. if response.get('error'):
  435. print(f"[{self.platform_name}] API 请求失败: {response.get('error')}", flush=True)
  436. print(f"[{self.platform_name}] API 响应: has_more={response.get('has_more')}, aweme_list={len(response.get('aweme_list', []))}")
  437. aweme_list = response.get('aweme_list', [])
  438. has_more = response.get('has_more', False)
  439. for aweme in aweme_list:
  440. aweme_id = str(aweme.get('aweme_id', ''))
  441. if not aweme_id:
  442. continue
  443. statistics = aweme.get('statistics', {})
  444. # 打印调试信息,确认字段存在
  445. # print(f"[{self.platform_name}] 作品 {aweme_id} 统计: {statistics}", flush=True)
  446. # 获取封面
  447. cover_url = ''
  448. if aweme.get('Cover', {}).get('url_list'):
  449. cover_url = aweme['Cover']['url_list'][0]
  450. elif aweme.get('video', {}).get('cover', {}).get('url_list'):
  451. cover_url = aweme['video']['cover']['url_list'][0]
  452. # 获取标题
  453. title = aweme.get('item_title', '') or aweme.get('desc', '').split('\n')[0][:50] or '无标题'
  454. # 获取时长(毫秒转秒)
  455. duration = aweme.get('video', {}).get('duration', 0) // 1000
  456. # 获取发布时间
  457. create_time = aweme.get('create_time', 0)
  458. publish_time = datetime.fromtimestamp(create_time).strftime('%Y-%m-%d %H:%M:%S') if create_time else ''
  459. works.append(WorkItem(
  460. work_id=aweme_id,
  461. title=title,
  462. cover_url=cover_url,
  463. duration=duration,
  464. status='published',
  465. publish_time=publish_time,
  466. play_count=int(statistics.get('play_count', 0)),
  467. like_count=int(statistics.get('digg_count', 0)),
  468. comment_count=int(statistics.get('comment_count', 0)),
  469. share_count=int(statistics.get('share_count', 0)),
  470. ))
  471. total = len(works)
  472. print(f"[{self.platform_name}] 获取到 {total} 个作品")
  473. except Exception as e:
  474. import traceback
  475. traceback.print_exc()
  476. return WorksResult(
  477. success=False,
  478. platform=self.platform_name,
  479. error=str(e)
  480. )
  481. return WorksResult(
  482. success=True,
  483. platform=self.platform_name,
  484. works=works,
  485. total=total,
  486. has_more=has_more
  487. )
  488. async def get_comments(self, cookies: str, work_id: str, cursor: str = "") -> CommentsResult:
  489. """获取抖音作品评论 - 通过访问视频详情页拦截评论 API"""
  490. print(f"\n{'='*60}")
  491. print(f"[{self.platform_name}] 获取作品评论")
  492. print(f"[{self.platform_name}] work_id={work_id}, cursor={cursor}")
  493. print(f"{'='*60}")
  494. comments: List[CommentItem] = []
  495. total = 0
  496. has_more = False
  497. next_cursor = ""
  498. captured_data = {}
  499. try:
  500. await self.init_browser()
  501. cookie_list = self.parse_cookies(cookies)
  502. await self.set_cookies(cookie_list)
  503. if not self.page:
  504. raise Exception("Page not initialized")
  505. # 设置 API 响应监听器
  506. async def handle_response(response):
  507. nonlocal captured_data
  508. url = response.url
  509. # 监听评论列表 API - 抖音视频页面使用的 API
  510. # /aweme/v1/web/comment/list/ 或 /comment/list/
  511. if '/comment/list' in url and ('aweme_id' in url or work_id in url):
  512. try:
  513. json_data = await response.json()
  514. print(f"[{self.platform_name}] 捕获到评论 API: {url[:100]}...", flush=True)
  515. # 检查响应是否成功
  516. if json_data.get('status_code') == 0 or json_data.get('comments'):
  517. captured_data = json_data
  518. comment_count = len(json_data.get('comments', []))
  519. print(f"[{self.platform_name}] 评论 API 响应成功: comments={comment_count}, has_more={json_data.get('has_more')}", flush=True)
  520. except Exception as e:
  521. print(f"[{self.platform_name}] 解析评论响应失败: {e}", flush=True)
  522. self.page.on('response', handle_response)
  523. print(f"[{self.platform_name}] 已注册评论 API 响应监听器", flush=True)
  524. # 访问视频详情页 - 这会自动触发评论 API 请求
  525. video_url = f"https://www.douyin.com/video/{work_id}"
  526. print(f"[{self.platform_name}] 访问视频详情页: {video_url}", flush=True)
  527. await self.page.goto(video_url, wait_until="domcontentloaded", timeout=30000)
  528. await asyncio.sleep(5)
  529. # 检查登录状态
  530. current_url = self.page.url
  531. if "login" in current_url or "passport" in current_url:
  532. raise Exception("Cookie 已过期,请重新登录")
  533. # 等待评论加载
  534. if not captured_data:
  535. print(f"[{self.platform_name}] 等待评论 API 响应...", flush=True)
  536. # 尝试滚动页面触发评论加载
  537. await self.page.evaluate('window.scrollBy(0, 300)')
  538. await asyncio.sleep(3)
  539. if not captured_data:
  540. # 再等待一会
  541. await asyncio.sleep(3)
  542. # 移除监听器
  543. self.page.remove_listener('response', handle_response)
  544. # 解析评论数据
  545. if captured_data:
  546. comment_list = captured_data.get('comments') or []
  547. has_more = captured_data.get('has_more', False) or captured_data.get('has_more', 0) == 1
  548. next_cursor = str(captured_data.get('cursor', ''))
  549. total = captured_data.get('total', 0) or len(comment_list)
  550. print(f"[{self.platform_name}] 解析评论: total={total}, has_more={has_more}, comments={len(comment_list)}", flush=True)
  551. for comment in comment_list:
  552. cid = str(comment.get('cid', ''))
  553. if not cid:
  554. continue
  555. user = comment.get('user', {})
  556. # 解析回复列表
  557. replies = []
  558. reply_list = comment.get('reply_comment', []) or []
  559. for reply in reply_list:
  560. reply_user = reply.get('user', {})
  561. replies.append(CommentItem(
  562. comment_id=str(reply.get('cid', '')),
  563. work_id=work_id,
  564. content=reply.get('text', ''),
  565. author_id=str(reply_user.get('uid', '')),
  566. author_name=reply_user.get('nickname', ''),
  567. author_avatar=reply_user.get('avatar_thumb', {}).get('url_list', [''])[0] if reply_user.get('avatar_thumb') else '',
  568. like_count=int(reply.get('digg_count', 0)),
  569. create_time=datetime.fromtimestamp(reply.get('create_time', 0)).strftime('%Y-%m-%d %H:%M:%S') if reply.get('create_time') else '',
  570. is_author=reply.get('is_author', False),
  571. ))
  572. comments.append(CommentItem(
  573. comment_id=cid,
  574. work_id=work_id,
  575. content=comment.get('text', ''),
  576. author_id=str(user.get('uid', '')),
  577. author_name=user.get('nickname', ''),
  578. author_avatar=user.get('avatar_thumb', {}).get('url_list', [''])[0] if user.get('avatar_thumb') else '',
  579. like_count=int(comment.get('digg_count', 0)),
  580. reply_count=int(comment.get('reply_comment_total', 0)),
  581. create_time=datetime.fromtimestamp(comment.get('create_time', 0)).strftime('%Y-%m-%d %H:%M:%S') if comment.get('create_time') else '',
  582. is_author=comment.get('is_author', False),
  583. replies=replies,
  584. ))
  585. print(f"[{self.platform_name}] 解析到 {len(comments)} 条评论", flush=True)
  586. else:
  587. print(f"[{self.platform_name}] 未捕获到评论 API 响应", flush=True)
  588. except Exception as e:
  589. import traceback
  590. traceback.print_exc()
  591. return CommentsResult(
  592. success=False,
  593. platform=self.platform_name,
  594. work_id=work_id,
  595. error=str(e)
  596. )
  597. finally:
  598. await self.close_browser()
  599. result = CommentsResult(
  600. success=True,
  601. platform=self.platform_name,
  602. work_id=work_id,
  603. comments=comments,
  604. total=total,
  605. has_more=has_more
  606. )
  607. result.__dict__['cursor'] = next_cursor
  608. return result
  609. async def get_all_comments(self, cookies: str) -> dict:
  610. """获取所有作品的评论 - 通过评论管理页面"""
  611. print(f"\n{'='*60}")
  612. print(f"[{self.platform_name}] 获取所有作品评论")
  613. print(f"{'='*60}")
  614. all_work_comments = []
  615. captured_comments = []
  616. captured_works = {} # work_id -> work_info
  617. try:
  618. await self.init_browser()
  619. cookie_list = self.parse_cookies(cookies)
  620. await self.set_cookies(cookie_list)
  621. if not self.page:
  622. raise Exception("Page not initialized")
  623. # 设置 API 响应监听器
  624. async def handle_response(response):
  625. nonlocal captured_comments, captured_works
  626. url = response.url
  627. try:
  628. # 监听评论列表 API - 多种格式
  629. # /comment/list/select/ 或 /comment/read 或 /creator/comment/list
  630. if '/comment/list' in url or '/comment/read' in url or 'comment_list' in url:
  631. json_data = await response.json()
  632. print(f"[{self.platform_name}] 捕获到评论 API: {url[:100]}...", flush=True)
  633. # 格式1: comments 字段
  634. comments = json_data.get('comments', [])
  635. # 格式2: comment_info_list 字段
  636. if not comments:
  637. comments = json_data.get('comment_info_list', [])
  638. if comments:
  639. # 从 URL 中提取 aweme_id
  640. import re
  641. aweme_id_match = re.search(r'aweme_id=(\d+)', url)
  642. aweme_id = aweme_id_match.group(1) if aweme_id_match else ''
  643. for comment in comments:
  644. # 添加 aweme_id 到评论中
  645. if aweme_id and 'aweme_id' not in comment:
  646. comment['aweme_id'] = aweme_id
  647. captured_comments.append(comment)
  648. print(f"[{self.platform_name}] 捕获到 {len(comments)} 条评论 (aweme_id={aweme_id}),总计: {len(captured_comments)}", flush=True)
  649. # 监听作品列表 API
  650. if '/work_list' in url or '/item/list' in url or '/creator/item' in url:
  651. json_data = await response.json()
  652. aweme_list = json_data.get('aweme_list', []) or json_data.get('item_info_list', []) or json_data.get('item_list', [])
  653. print(f"[{self.platform_name}] 捕获到作品列表 API: {len(aweme_list)} 个作品", flush=True)
  654. for aweme in aweme_list:
  655. aweme_id = str(aweme.get('aweme_id', '') or aweme.get('item_id', '') or aweme.get('item_id_plain', ''))
  656. if aweme_id:
  657. cover_url = ''
  658. if aweme.get('Cover', {}).get('url_list'):
  659. cover_url = aweme['Cover']['url_list'][0]
  660. elif aweme.get('video', {}).get('cover', {}).get('url_list'):
  661. cover_url = aweme['video']['cover']['url_list'][0]
  662. elif aweme.get('cover_image_url'):
  663. cover_url = aweme['cover_image_url']
  664. captured_works[aweme_id] = {
  665. 'title': aweme.get('item_title', '') or aweme.get('title', '') or aweme.get('desc', ''),
  666. 'cover': cover_url,
  667. 'comment_count': aweme.get('statistics', {}).get('comment_count', 0) or aweme.get('comment_count', 0),
  668. }
  669. except Exception as e:
  670. print(f"[{self.platform_name}] 解析响应失败: {e}", flush=True)
  671. self.page.on('response', handle_response)
  672. print(f"[{self.platform_name}] 已注册 API 响应监听器", flush=True)
  673. # 访问评论管理页面
  674. print(f"[{self.platform_name}] 访问评论管理页面...", flush=True)
  675. await self.page.goto("https://creator.douyin.com/creator-micro/interactive/comment", wait_until="domcontentloaded", timeout=30000)
  676. await asyncio.sleep(5)
  677. # 检查登录状态
  678. current_url = self.page.url
  679. if "login" in current_url or "passport" in current_url:
  680. raise Exception("Cookie 已过期,请重新登录")
  681. print(f"[{self.platform_name}] 页面加载完成,当前捕获: {len(captured_comments)} 条评论, {len(captured_works)} 个作品", flush=True)
  682. # 尝试点击"选择作品"来加载作品列表
  683. try:
  684. select_btn = await self.page.query_selector('text="选择作品"')
  685. if select_btn:
  686. print(f"[{self.platform_name}] 点击选择作品按钮...", flush=True)
  687. await select_btn.click()
  688. await asyncio.sleep(3)
  689. # 获取作品列表
  690. work_items = await self.page.query_selector_all('[class*="work-item"], [class*="video-item"], [class*="aweme-item"]')
  691. print(f"[{self.platform_name}] 找到 {len(work_items)} 个作品元素", flush=True)
  692. # 点击每个作品加载其评论
  693. for i, item in enumerate(work_items[:10]): # 最多处理10个作品
  694. try:
  695. await item.click()
  696. await asyncio.sleep(2)
  697. print(f"[{self.platform_name}] 已点击作品 {i+1}/{min(len(work_items), 10)}", flush=True)
  698. except:
  699. pass
  700. # 关闭选择作品弹窗
  701. close_btn = await self.page.query_selector('[class*="close"], [class*="cancel"]')
  702. if close_btn:
  703. await close_btn.click()
  704. await asyncio.sleep(1)
  705. except Exception as e:
  706. print(f"[{self.platform_name}] 选择作品操作失败: {e}", flush=True)
  707. # 滚动加载更多评论
  708. for i in range(5):
  709. await self.page.evaluate('window.scrollBy(0, 500)')
  710. await asyncio.sleep(1)
  711. await asyncio.sleep(3)
  712. # 移除监听器
  713. self.page.remove_listener('response', handle_response)
  714. print(f"[{self.platform_name}] 最终捕获: {len(captured_comments)} 条评论, {len(captured_works)} 个作品", flush=True)
  715. # 按作品分组评论
  716. work_comments_map = {} # work_id -> work_comments
  717. for comment in captured_comments:
  718. # 从评论中获取作品信息
  719. aweme = comment.get('aweme', {}) or comment.get('item', {})
  720. aweme_id = str(comment.get('aweme_id', '') or aweme.get('aweme_id', '') or aweme.get('item_id', ''))
  721. if not aweme_id:
  722. continue
  723. if aweme_id not in work_comments_map:
  724. work_info = captured_works.get(aweme_id, {})
  725. work_comments_map[aweme_id] = {
  726. 'work_id': aweme_id,
  727. 'title': aweme.get('title', '') or aweme.get('desc', '') or work_info.get('title', ''),
  728. 'cover_url': aweme.get('cover', {}).get('url_list', [''])[0] if aweme.get('cover') else work_info.get('cover', ''),
  729. 'comments': []
  730. }
  731. cid = str(comment.get('cid', ''))
  732. if not cid:
  733. continue
  734. user = comment.get('user', {})
  735. work_comments_map[aweme_id]['comments'].append({
  736. 'comment_id': cid,
  737. 'author_id': str(user.get('uid', '')),
  738. 'author_name': user.get('nickname', ''),
  739. 'author_avatar': user.get('avatar_thumb', {}).get('url_list', [''])[0] if user.get('avatar_thumb') else '',
  740. 'content': comment.get('text', ''),
  741. 'like_count': int(comment.get('digg_count', 0)),
  742. 'create_time': datetime.fromtimestamp(comment.get('create_time', 0)).strftime('%Y-%m-%d %H:%M:%S') if comment.get('create_time') else '',
  743. 'is_author': comment.get('is_author', False),
  744. })
  745. all_work_comments = list(work_comments_map.values())
  746. total_comments = sum(len(w['comments']) for w in all_work_comments)
  747. print(f"[{self.platform_name}] 获取到 {len(all_work_comments)} 个作品的 {total_comments} 条评论", flush=True)
  748. except Exception as e:
  749. import traceback
  750. traceback.print_exc()
  751. return {
  752. 'success': False,
  753. 'platform': self.platform_name,
  754. 'error': str(e),
  755. 'work_comments': []
  756. }
  757. finally:
  758. await self.close_browser()
  759. return {
  760. 'success': True,
  761. 'platform': self.platform_name,
  762. 'work_comments': all_work_comments,
  763. 'total': len(all_work_comments)
  764. }