Procházet zdrojové kódy

feat(xiaohongshu): 优先使用Python API获取小红书账号信息并优化作品抓取

- 对于小红书平台,优先通过Python API获取账号信息,因为Node端Playwright可能无法捕获相关API
- 增加获取账号信息的方法 `get_account_info`,通过监听API响应解析用户数据
- 优化作品列表抓取逻辑:将最大页数限制从30页提高到100页,添加请求错误处理和详细日志
- 在分页抓取时增加延迟避免请求过快,并改进分页停止条件
Ethanfly před 16 hodinami
rodič
revize
76851ea99f

binární
server/python/platforms/__pycache__/xiaohongshu.cpython-313.pyc


+ 99 - 8
server/python/platforms/xiaohongshu.py

@@ -579,6 +579,82 @@ class XiaohongshuPublisher(BasePublisher):
             status='success'
         )
     
+    async def get_account_info(self, cookies: str) -> dict:
+        """获取账号信息"""
+        print(f"\n{'='*60}")
+        print(f"[{self.platform_name}] 获取账号信息")
+        print(f"{'='*60}")
+        
+        captured_info = {}
+        
+        try:
+            await self.init_browser()
+            cookie_list = self.parse_cookies(cookies)
+            await self.set_cookies(cookie_list)
+            
+            if not self.page:
+                raise Exception("Page not initialized")
+            
+            # 监听个人信息 API
+            async def handle_response(response):
+                nonlocal captured_info
+                if 'api/galaxy/creator/home/personal_info' in response.url:
+                    try:
+                        json_data = await response.json()
+                        print(f"[{self.platform_name}] 捕获个人信息 API", flush=True)
+                        if json_data.get('success') or json_data.get('code') == 0:
+                            data = json_data.get('data', {})
+                            captured_info = {
+                                "account_id": f"xhs_{data.get('red_num', '')}",
+                                "account_name": data.get('name', ''),
+                                "avatar_url": data.get('avatar', ''),
+                                "fans_count": data.get('fans_count', 0),
+                                "works_count": 0  # 暂时无法直接获取准确的作品数,需要从作品列表获取
+                            }
+                    except Exception as e:
+                        print(f"[{self.platform_name}] 解析个人信息失败: {e}", flush=True)
+            
+            self.page.on('response', handle_response)
+            
+            # 访问首页
+            print(f"[{self.platform_name}] 访问创作者首页...", flush=True)
+            await self.page.goto("https://creator.xiaohongshu.com/new/home", wait_until="domcontentloaded")
+            
+            # 等待 API 响应
+            for _ in range(10):
+                if captured_info:
+                    break
+                await asyncio.sleep(1)
+            
+            if not captured_info:
+                print(f"[{self.platform_name}] 未捕获到个人信息,尝试刷新...", flush=True)
+                await self.page.reload()
+                for _ in range(10):
+                    if captured_info:
+                        break
+                    await asyncio.sleep(1)
+            
+            if not captured_info:
+                raise Exception("无法获取账号信息")
+            
+            # 尝试获取作品数(从首页或其他地方)
+            # 或者简单地返回已获取的信息,作品数由 get_works 更新
+            
+            return {
+                "success": True,
+                **captured_info
+            }
+            
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            return {
+                "success": False,
+                "error": str(e)
+            }
+        finally:
+            await self.close_browser()
+
     async def get_works(self, cookies: str, page: int = 0, page_size: int = 20) -> WorksResult:
         """获取小红书作品列表 - 通过监听页面网络响应获取数据"""
         print(f"\n{'='*60}", flush=True)
@@ -705,21 +781,26 @@ class XiaohongshuPublisher(BasePublisher):
                 works.extend(parse_notes(notes))
 
                 # 分页抓取剩余页面:不依赖 data.page(有些情况下会误报 -1),直到拿不到新数据为止
-                max_pages = 30
+                max_pages = 100  # 增加最大页数限制,确保能抓取更多作品
                 page_num = 1  # 已经拿了 page=0
                 seen_note_ids = set([w.work_id for w in works])
                 has_more = True
 
                 while has_more and page_num < max_pages:
+                    print(f"[{self.platform_name}] 正在抓取第 {page_num} 页...", flush=True)
                     try:
                         next_resp = await self.page.evaluate(
                             """async (p) => {
-                                const res = await fetch(`https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${p}`, {
-                                    method: 'GET',
-                                    credentials: 'include',
-                                    headers: { 'Accept': 'application/json' }
-                                });
-                                return await res.json();
+                                try {
+                                    const res = await fetch(`https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${p}`, {
+                                        method: 'GET',
+                                        credentials: 'include',
+                                        headers: { 'Accept': 'application/json' }
+                                    });
+                                    return await res.json();
+                                } catch (e) {
+                                    return { success: false, error: e.toString() };
+                                }
                             }""",
                             page_num
                         )
@@ -728,25 +809,31 @@ class XiaohongshuPublisher(BasePublisher):
                         break
 
                     if not next_resp:
+                        print(f"[{self.platform_name}] 第 {page_num} 页无响应", flush=True)
                         break
 
                     if not (next_resp.get('success') or next_resp.get('code') == 0):
+                        print(f"[{self.platform_name}] 第 {page_num} 页请求失败: {next_resp.get('msg')}", flush=True)
                         break
 
                     next_data = next_resp.get('data', {})
                     next_notes = next_data.get('notes', []) or []
 
                     if not next_notes:
+                        print(f"[{self.platform_name}] 第 {page_num} 页无作品,停止抓取", flush=True)
                         has_more = False
                         break
 
                     parsed_next = parse_notes(next_notes)
                     new_items = [w for w in parsed_next if w.work_id and w.work_id not in seen_note_ids]
+                    
                     if not new_items:
                         # 没有新数据,停止
+                        print(f"[{self.platform_name}] 第 {page_num} 页无新数据,停止抓取", flush=True)
                         has_more = False
                         break
-
+                    
+                    print(f"[{self.platform_name}] 第 {page_num} 页获取到 {len(new_items)} 条新数据", flush=True)
                     for w in new_items:
                         seen_note_ids.add(w.work_id)
                     works.extend(new_items)
@@ -759,10 +846,14 @@ class XiaohongshuPublisher(BasePublisher):
                                 break
 
                     page_num += 1
+                    # 增加一点延迟,避免请求过快
+                    await asyncio.sleep(1)
                 
                 # 分页完毕,has_more 表示是否还存在更多(以最后一页标记为准)
                 if not has_more:
                     print(f"[{self.platform_name}] 已抓取所有分页,共 {len(works)} 条", flush=True)
+                else:
+                    print(f"[{self.platform_name}] 达到最大页数限制 {max_pages},共 {len(works)} 条", flush=True)
             else:
                 print(f"[{self.platform_name}] 未能捕获到 API 数据", flush=True)
             

binární
server/python/weixin_private_msg_266653.png


+ 14 - 1
server/src/services/HeadlessBrowserService.ts

@@ -637,7 +637,20 @@ class HeadlessBrowserService {
             logger.info(`[Python API] Successfully fetched ${worksList.length} works for ${platform}`);
 
             try {
-              const accountInfo = await this.fetchAccountInfoWithPlaywright(platform, cookies);
+              let accountInfo: AccountInfo;
+
+              // 对于小红书,优先使用 Python API 获取账号信息(因为 Node 端 Playwright 可能无法捕获到 API)
+              if (platform === 'xiaohongshu') {
+                try {
+                  accountInfo = await this.fetchAccountInfoViaPython(platform, cookies);
+                } catch (e) {
+                  logger.warn(`[Python API] Failed to fetch account info for ${platform}, falling back to Playwright:`, e);
+                  accountInfo = await this.fetchAccountInfoWithPlaywright(platform, cookies);
+                }
+              } else {
+                accountInfo = await this.fetchAccountInfoWithPlaywright(platform, cookies);
+              }
+
               accountInfo.worksList = worksList;
               // 直接使用 Python API 获取的作品数量(最准确,排除了已删除/私密视频)
               accountInfo.worksCount = worksList.length;