Browse Source

feat(xiaohongshu): 实现作品同步自动分页与进度反馈

- 在 WorksResult 中添加 next_page 字段支持分页游标
- 为小红书平台实现 get_all_works 方法,支持自动翻页抓取全部作品
- 在 HeadlessBrowserService 中优化作品获取逻辑,支持分页进度回调
- 在 WorkService.syncWorks 中集成进度反馈和账号同步摘要
- 添加小红书作品数提取工具函数和自测脚本
- 在任务执行器中展示同步详情,包括平台、作品数和数据源信息
Ethanfly 20 hours ago
parent
commit
7afacd3ef3

BIN
server/python/__pycache__/app.cpython-313.pyc


+ 6 - 2
server/python/app.py

@@ -669,8 +669,9 @@ def get_works():
         cookie_str = data.get("cookie", "")
         page = data.get("page", 0)
         page_size = data.get("page_size", 20)
+        auto_paging = bool(data.get("auto_paging", False))
         
-        print(f"[Works] 收到请求: platform={platform}, page={page}, page_size={page_size}")
+        print(f"[Works] 收到请求: platform={platform}, page={page}, page_size={page_size}, auto_paging={auto_paging}")
         
         if not platform:
             return jsonify({"success": False, "error": "缺少 platform 参数"}), 400
@@ -687,7 +688,10 @@ def get_works():
         publisher = PublisherClass(headless=HEADLESS_MODE)
         
         # 执行获取作品
-        result = asyncio.run(publisher.run_get_works(cookie_str, page, page_size))
+        if platform == "xiaohongshu" and auto_paging and hasattr(publisher, "get_all_works"):
+            result = asyncio.run(publisher.get_all_works(cookie_str))
+        else:
+            result = asyncio.run(publisher.run_get_works(cookie_str, page, page_size))
         
         return jsonify(result.to_dict())
         

BIN
server/python/platforms/__pycache__/base.cpython-313.pyc


BIN
server/python/platforms/__pycache__/douyin.cpython-313.pyc


BIN
server/python/platforms/__pycache__/xiaohongshu.cpython-313.pyc


+ 3 - 1
server/python/platforms/base.py

@@ -118,6 +118,7 @@ class WorksResult:
     works: List[WorkItem] = field(default_factory=list)
     total: int = 0
     has_more: bool = False
+    next_page: Any = ""
     error: str = ""
     
     def to_dict(self) -> Dict[str, Any]:
@@ -127,6 +128,7 @@ class WorksResult:
             "works": [w.to_dict() for w in self.works],
             "total": self.total,
             "has_more": self.has_more,
+            "next_page": self.next_page,
             "error": self.error,
         }
 
@@ -765,4 +767,4 @@ class BasePublisher(ABC):
                 "error": str(e)
             }
         finally:
-            await self.close_browser()
+            await self.close_browser()

+ 409 - 119
server/python/platforms/xiaohongshu.py

@@ -656,7 +656,7 @@ class XiaohongshuPublisher(BasePublisher):
             await self.close_browser()
 
     async def get_works(self, cookies: str, page: int = 0, page_size: int = 20) -> WorksResult:
-        """获取小红书作品列表 - 通过监听页面网络响应获取数据"""
+        """获取小红书作品列表 - 通过直接调用创作者笔记列表 API 获取"""
         print(f"\n{'='*60}", flush=True)
         print(f"[{self.platform_name}] 获取作品列表", flush=True)
         print(f"[{self.platform_name}] page={page}, page_size={page_size}", flush=True)
@@ -665,7 +665,8 @@ class XiaohongshuPublisher(BasePublisher):
         works: List[WorkItem] = []
         total = 0
         has_more = False
-        captured_data = {}
+        next_page = ""
+        api_page_size = 20
         
         try:
             await self.init_browser()
@@ -678,26 +679,7 @@ class XiaohongshuPublisher(BasePublisher):
             
             if not self.page:
                 raise Exception("Page not initialized")
-            
-            # 定义响应监听器 - 捕获页面自动发起的 API 请求
-            async def handle_response(response):
-                nonlocal captured_data
-                url = response.url
-                # 监听作品列表 API
-                if 'creator/note/user/posted' in url or 'creator/note_list' in url:
-                    try:
-                        json_data = await response.json()
-                        print(f"[{self.platform_name}] 捕获到 API 响应: {url[:80]}...", flush=True)
-                        if json_data.get('success') or json_data.get('code') == 0:
-                            captured_data = json_data
-                            print(f"[{self.platform_name}] API 响应成功,data keys: {list(json_data.get('data', {}).keys())}", flush=True)
-                    except Exception as e:
-                        print(f"[{self.platform_name}] 解析响应失败: {e}", flush=True)
-            
-            # 注册响应监听器
-            self.page.on('response', handle_response)
-            print(f"[{self.platform_name}] 已注册 API 响应监听器", flush=True)
-            
+
             # 访问笔记管理页面 - 页面会自动发起 API 请求
             print(f"[{self.platform_name}] 访问笔记管理页面...", flush=True)
             
@@ -706,24 +688,40 @@ class XiaohongshuPublisher(BasePublisher):
             except Exception as nav_error:
                 print(f"[{self.platform_name}] 导航超时,但继续尝试: {nav_error}", flush=True)
             
-            # 等待 API 响应被捕获
-            await asyncio.sleep(5)
-            
             # 检查登录状态
             current_url = self.page.url
             print(f"[{self.platform_name}] 当前页面: {current_url}", flush=True)
             if "login" in current_url:
                 raise Exception("Cookie 已过期,请重新登录")
-            
-            # 如果还没有捕获到数据,等待更长时间
-            if not captured_data:
-                print(f"[{self.platform_name}] 等待 API 响应...", flush=True)
-                await asyncio.sleep(5)
-            
-            # 移除监听器
-            self.page.remove_listener('response', handle_response)
-            
-            # 处理捕获到的数据(增加分页抓取,避免仅第一页)
+
+            async def fetch_notes_page(p):
+                return await self.page.evaluate(
+                    """async (pageNum) => {
+                        try {
+                            const url = `https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${pageNum}`;
+                            const headers = { 'Accept': 'application/json' };
+                            if (typeof window !== 'undefined' && typeof window._webmsxyw === 'function') {
+                                try {
+                                    const sign = window._webmsxyw(url, '');
+                                    headers['x-s'] = sign['X-s'];
+                                    headers['x-t'] = String(sign['X-t']);
+                                } catch (e) {
+                                    // ignore sign errors and fallback
+                                }
+                            }
+                            const res = await fetch(url, {
+                                method: 'GET',
+                                credentials: 'include',
+                                headers
+                            });
+                            return await res.json();
+                        } catch (e) {
+                            return { success: false, error: e.toString() };
+                        }
+                    }""",
+                    p
+                )
+
             def parse_notes(notes_list):
                 parsed = []
                 for note in notes_list:
@@ -764,118 +762,410 @@ class XiaohongshuPublisher(BasePublisher):
                     ))
                 return parsed
 
-            import json
-            if captured_data:
-                print(f"[{self.platform_name}] 成功捕获到 API 数据", flush=True)
-                data = captured_data.get('data', {})
-                notes = data.get('notes', [])
-                print(f"[{self.platform_name}] notes 数量: {len(notes)}", flush=True)
-                
-                # 从 tags 获取总数
-                tags = data.get('tags', [])
+            resp = None
+            for attempt in range(1, 4):
+                resp = await fetch_notes_page(page)
+                if resp and (resp.get('success') or resp.get('code') == 0) and resp.get('data'):
+                    break
+                print(f"[{self.platform_name}] 拉取作品列表失败,重试 {attempt}/3: {str(resp)[:200]}", flush=True)
+                await asyncio.sleep(1.2 * attempt)
+
+            if not resp or not (resp.get('success') or resp.get('code') == 0) or not resp.get('data'):
+                raise Exception(f"无法获取作品列表数据: {resp.get('msg') if isinstance(resp, dict) else resp}")
+
+            data = resp.get('data', {}) or {}
+            notes = data.get('notes', []) or []
+            print(f"[{self.platform_name}] 第 {page} 页 notes 数量: {len(notes)}", flush=True)
+
+            tags = data.get('tags', []) or []
+            if tags:
+                preferred = 0
                 for tag in tags:
                     if tag.get('id') == 'special.note_time_desc':
-                        total = tag.get('notes_count', 0)
+                        preferred = tag.get('notes_count', 0) or tag.get('notesCount', 0) or tag.get('count', 0) or 0
                         break
-                
-                works.extend(parse_notes(notes))
+                if preferred:
+                    total = preferred
+                else:
+                    total = max([int(t.get('notes_count', 0) or t.get('notesCount', 0) or t.get('count', 0) or 0) for t in tags] + [0])
+            if not total:
+                total = int(data.get('total', 0) or data.get('total_count', 0) or data.get('totalCount', 0) or 0)
+                if not total and isinstance(data.get('page', {}), dict):
+                    total = int(data.get('page', {}).get('total', 0) or data.get('page', {}).get('totalCount', 0) or 0)
 
-                # 分页抓取剩余页面:不依赖 data.page(有些情况下会误报 -1),直到拿不到新数据为止
-                max_pages = 100  # 增加最大页数限制,确保能抓取更多作品
-                page_num = 1  # 已经拿了 page=0
-                seen_note_ids = set([w.work_id for w in works])
-                has_more = True
+            next_page = data.get('page', "")
+            if next_page == page:
+                next_page = page + 1
 
-                while has_more and page_num < max_pages:
-                    print(f"[{self.platform_name}] 正在抓取第 {page_num} 页...", flush=True)
-                    try:
-                        next_resp = await self.page.evaluate(
-                            """async (p) => {
+            works.extend(parse_notes(notes))
+
+            if total:
+                has_more = (page * api_page_size + len(notes)) < total
+                if has_more and (next_page == -1 or str(next_page) == "-1" or next_page == "" or next_page is None):
+                    next_page = page + 1
+            else:
+                if len(notes) == 0:
+                    has_more = False
+                else:
+                    next_resp = await fetch_notes_page(page + 1)
+                    next_data = (next_resp or {}).get('data', {}) if isinstance(next_resp, dict) else {}
+                    next_notes = next_data.get('notes', []) or []
+                    has_more = len(next_notes) > 0
+                    next_page = next_data.get('page', next_page)
+            
+        except Exception as e:
+            import traceback
+            print(f"[{self.platform_name}] 发生异常: {e}", flush=True)
+            traceback.print_exc()
+            return WorksResult(
+                success=False,
+                platform=self.platform_name,
+                error=str(e)
+            )
+        finally:
+            # 确保关闭浏览器
+            await self.close_browser()
+        
+        return WorksResult(
+            success=True,
+            platform=self.platform_name,
+            works=works,
+            total=total or (page * api_page_size + len(works)),
+            has_more=has_more,
+            next_page=next_page
+        )
+
+    async def get_all_works(self, cookies: str) -> WorksResult:
+        """获取小红书全部作品(单次请求内自动翻页抓全量,避免 Node 侧分页不一致)"""
+        print(f"\n{'='*60}", flush=True)
+        print(f"[{self.platform_name}] 获取全部作品(auto paging)", flush=True)
+        print(f"{'='*60}", flush=True)
+
+        works: List[WorkItem] = []
+        total = 0
+        seen_ids = set()
+        cursor: object = 0
+        max_iters = 800
+        api_page_size = 20
+
+        try:
+            await self.init_browser()
+            cookie_list = self.parse_cookies(cookies)
+            print(f"[{self.platform_name}] 解析到 {len(cookie_list)} 个 cookies", flush=True)
+            await self.set_cookies(cookie_list)
+            if not self.page:
+                raise Exception("Page not initialized")
+
+            print(f"[{self.platform_name}] 访问笔记管理页面...", flush=True)
+            try:
+                await self.page.goto("https://creator.xiaohongshu.com/new/note-manager", wait_until="domcontentloaded", timeout=30000)
+            except Exception as nav_error:
+                print(f"[{self.platform_name}] 导航超时,但继续尝试: {nav_error}", flush=True)
+
+            current_url = self.page.url
+            print(f"[{self.platform_name}] 当前页面: {current_url}", flush=True)
+            if "login" in current_url:
+                raise Exception("Cookie 已过期,请重新登录")
+
+            async def fetch_notes_page(p):
+                return await self.page.evaluate(
+                    """async (pageNum) => {
+                        try {
+                            const url = `https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${pageNum}`;
+                            const headers = { 'Accept': 'application/json' };
+                            if (typeof window !== 'undefined' && typeof window._webmsxyw === 'function') {
                                 try {
-                                    const res = await fetch(`https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${p}`, {
-                                        method: 'GET',
-                                        credentials: 'include',
-                                        headers: { 'Accept': 'application/json' }
-                                    });
-                                    return await res.json();
+                                    const sign = window._webmsxyw(url, '');
+                                    headers['x-s'] = sign['X-s'];
+                                    headers['x-t'] = String(sign['X-t']);
                                 } catch (e) {
-                                    return { success: false, error: e.toString() };
+                                    // ignore sign errors and fallback
                                 }
-                            }""",
-                            page_num
-                        )
-                    except Exception as fetch_err:
-                        print(f"[{self.platform_name}] 分页请求异常 page={page_num}: {fetch_err}", flush=True)
-                        break
+                            }
+                            const res = await fetch(url, {
+                                method: 'GET',
+                                credentials: 'include',
+                                headers
+                            });
+                            return await res.json();
+                        } catch (e) {
+                            return { success: false, error: e.toString() };
+                        }
+                    }""",
+                    p
+                )
 
-                    if not next_resp:
-                        print(f"[{self.platform_name}] 第 {page_num} 页无响应", flush=True)
-                        break
+            def parse_notes(notes_list):
+                parsed = []
+                for note in notes_list:
+                    note_id = note.get('id', '')
+                    if not note_id:
+                        continue
 
-                    if not (next_resp.get('success') or next_resp.get('code') == 0):
-                        print(f"[{self.platform_name}] 第 {page_num} 页请求失败: {next_resp.get('msg')}", flush=True)
-                        break
+                    cover_url = ''
+                    images_list = note.get('images_list', [])
+                    if images_list:
+                        cover_url = images_list[0].get('url', '')
+                        if cover_url.startswith('http://'):
+                            cover_url = cover_url.replace('http://', 'https://')
 
-                    next_data = next_resp.get('data', {})
-                    next_notes = next_data.get('notes', []) or []
+                    duration = note.get('video_info', {}).get('duration', 0)
 
-                    if not next_notes:
-                        print(f"[{self.platform_name}] 第 {page_num} 页无作品,停止抓取", flush=True)
-                        has_more = False
-                        break
+                    status = 'published'
+                    tab_status = note.get('tab_status', 1)
+                    if tab_status == 0:
+                        status = 'draft'
+                    elif tab_status == 2:
+                        status = 'reviewing'
+                    elif tab_status == 3:
+                        status = 'rejected'
 
-                    parsed_next = parse_notes(next_notes)
-                    new_items = [w for w in parsed_next if w.work_id and w.work_id not in seen_note_ids]
-                    
-                    if not new_items:
-                        # 没有新数据,停止
-                        print(f"[{self.platform_name}] 第 {page_num} 页无新数据,停止抓取", flush=True)
-                        has_more = False
-                        break
-                    
-                    print(f"[{self.platform_name}] 第 {page_num} 页获取到 {len(new_items)} 条新数据", flush=True)
-                    for w in new_items:
-                        seen_note_ids.add(w.work_id)
-                    works.extend(new_items)
+                    parsed.append(WorkItem(
+                        work_id=note_id,
+                        title=note.get('display_title', '') or '无标题',
+                        cover_url=cover_url,
+                        duration=duration,
+                        status=status,
+                        publish_time=note.get('time', ''),
+                        play_count=note.get('view_count', 0),
+                        like_count=note.get('likes', 0),
+                        comment_count=note.get('comments_count', 0),
+                        share_count=note.get('shared_count', 0),
+                        collect_count=note.get('collected_count', 0),
+                    ))
+                return parsed
+
+            async def collect_by_scrolling() -> WorksResult:
+                print(f"[{self.platform_name}] 直连接口被拒绝,切换为滚动页面 + 监听 API 响应模式", flush=True)
+                captured: List[WorkItem] = []
+                captured_total = 0
+                captured_seen = set()
+                lock = asyncio.Lock()
+
+                async def handle_response(response):
+                    nonlocal captured_total
+                    url = response.url
+                    if "edith.xiaohongshu.com" not in url or "creator/note/user/posted" not in url:
+                        return
+                    try:
+                        json_data = await response.json()
+                    except Exception:
+                        return
 
-                    # 更新总数(若第一页未拿到)
-                    if not total and next_data.get('tags'):
-                        for tag in next_data.get('tags', []):
-                            if tag.get('id') == 'special.note_time_desc':
-                                total = tag.get('notes_count', 0)
+                    if not isinstance(json_data, dict):
+                        return
+                    if not (json_data.get("success") or json_data.get("code") == 0) or not json_data.get("data"):
+                        return
+
+                    data = json_data.get("data", {}) or {}
+                    notes = data.get("notes", []) or []
+                    tags = data.get("tags", []) or []
+
+                    declared = 0
+                    if tags:
+                        preferred = 0
+                        for tag in tags:
+                            if tag.get("id") == "special.note_time_desc":
+                                preferred = tag.get("notes_count", 0) or tag.get("notesCount", 0) or tag.get("count", 0) or 0
                                 break
+                        if preferred:
+                            declared = int(preferred)
+                        else:
+                            declared = max([int(t.get("notes_count", 0) or t.get("notesCount", 0) or t.get("count", 0) or 0) for t in tags] + [0])
 
-                    page_num += 1
-                    # 增加一点延迟,避免请求过快
-                    await asyncio.sleep(1)
-                
-                # 分页完毕,has_more 表示是否还存在更多(以最后一页标记为准)
-                if not has_more:
-                    print(f"[{self.platform_name}] 已抓取所有分页,共 {len(works)} 条", flush=True)
+                    if not declared:
+                        declared = int(data.get("total", 0) or data.get("total_count", 0) or data.get("totalCount", 0) or 0)
+                        if not declared and isinstance(data.get("page", {}), dict):
+                            declared = int(data.get("page", {}).get("total", 0) or data.get("page", {}).get("totalCount", 0) or 0)
+
+                    async with lock:
+                        if declared:
+                            captured_total = max(captured_total, declared)
+
+                        parsed = parse_notes(notes)
+                        new_count = 0
+                        for w in parsed:
+                            if w.work_id and w.work_id not in captured_seen:
+                                captured_seen.add(w.work_id)
+                                captured.append(w)
+                                new_count += 1
+
+                        if new_count > 0:
+                            print(
+                                f"[{self.platform_name}] 捕获 notes 响应: notes={len(notes)}, new={new_count}, total_now={len(captured)}, declared_total={captured_total}",
+                                flush=True
+                            )
+
+                self.page.on("response", handle_response)
+
+                try:
+                    try:
+                        await self.page.goto("https://creator.xiaohongshu.com/new/note-manager", wait_until="networkidle", timeout=60000)
+                    except Exception as nav_error:
+                        print(f"[{self.platform_name}] 导航异常(继续):{nav_error}", flush=True)
+
+                    await asyncio.sleep(2.0)
+
+                    idle_rounds = 0
+                    last_count = 0
+                    last_height = 0
+
+                    for _ in range(1, 400):
+                        scroll_state = await self.page.evaluate(
+                            """() => {
+                                const isScrollable = (el) => {
+                                    if (!el) return false;
+                                    const style = window.getComputedStyle(el);
+                                    const oy = style.overflowY;
+                                    return (oy === 'auto' || oy === 'scroll') && (el.scrollHeight - el.clientHeight > 200);
+                                };
+                                const pickBest = () => {
+                                    const nodes = Array.from(document.querySelectorAll('*'));
+                                    let best = document.scrollingElement || document.documentElement || document.body;
+                                    let bestScroll = (best.scrollHeight || 0) - (best.clientHeight || 0);
+                                    for (const el of nodes) {
+                                        if (!isScrollable(el)) continue;
+                                        const diff = el.scrollHeight - el.clientHeight;
+                                        if (diff > bestScroll) {
+                                            best = el;
+                                            bestScroll = diff;
+                                        }
+                                    }
+                                    return best;
+                                };
+                                const el = pickBest();
+                                const beforeTop = el.scrollTop || 0;
+                                const beforeHeight = el.scrollHeight || 0;
+                                el.scrollTo(0, beforeHeight);
+                                return {
+                                    beforeTop,
+                                    afterTop: el.scrollTop || 0,
+                                    height: el.scrollHeight || 0,
+                                    client: el.clientHeight || 0,
+                                };
+                            }"""
+                        )
+
+                        await asyncio.sleep(1.2)
+
+                        async with lock:
+                            count_now = len(captured)
+                            total_now = captured_total
+
+                        if total_now and count_now >= total_now:
+                            break
+
+                        height_now = int(scroll_state.get("height", 0) or 0) if isinstance(scroll_state, dict) else 0
+                        if count_now == last_count and height_now == last_height:
+                            idle_rounds += 1
+                        else:
+                            idle_rounds = 0
+
+                        last_count = count_now
+                        last_height = height_now
+
+                        if idle_rounds >= 6:
+                            break
+
+                    async with lock:
+                        final_works = list(captured)
+                        final_total = captured_total or len(final_works)
+
+                    return WorksResult(
+                        success=True,
+                        platform=self.platform_name,
+                        works=final_works,
+                        total=final_total,
+                        has_more=False,
+                        next_page=-1
+                    )
+                finally:
+                    try:
+                        self.page.remove_listener("response", handle_response)
+                    except Exception:
+                        pass
+
+            iters = 0
+            while iters < max_iters:
+                iters += 1
+                resp = await fetch_notes_page(cursor)
+                if not resp or not isinstance(resp, dict):
+                    print(f"[{self.platform_name}] 第 {iters} 次拉取无响应,cursor={cursor}", flush=True)
+                    break
+                if not (resp.get('success') or resp.get('code') == 0) or not resp.get('data'):
+                    print(f"[{self.platform_name}] 拉取失败 cursor={cursor}: {str(resp)[:200]}", flush=True)
+                    if iters == 1:
+                        return await collect_by_scrolling()
+                    break
+
+                data = resp.get('data', {}) or {}
+                notes = data.get('notes', []) or []
+                if not notes:
+                    print(f"[{self.platform_name}] cursor={cursor} 无作品,停止", flush=True)
+                    break
+
+                tags = data.get('tags', []) or []
+                if tags:
+                    preferred = 0
+                    for tag in tags:
+                        if tag.get('id') == 'special.note_time_desc':
+                            preferred = tag.get('notes_count', 0) or tag.get('notesCount', 0) or tag.get('count', 0) or 0
+                            break
+                    if preferred:
+                        total = max(total, int(preferred))
+                    else:
+                        total = max(total, max([int(t.get('notes_count', 0) or t.get('notesCount', 0) or t.get('count', 0) or 0) for t in tags] + [0]))
+                if not total:
+                    t2 = int(data.get('total', 0) or data.get('total_count', 0) or data.get('totalCount', 0) or 0)
+                    if not t2 and isinstance(data.get('page', {}), dict):
+                        t2 = int(data.get('page', {}).get('total', 0) or data.get('page', {}).get('totalCount', 0) or 0)
+                    total = max(total, t2)
+
+                parsed = parse_notes(notes)
+                new_items = []
+                for w in parsed:
+                    if w.work_id and w.work_id not in seen_ids:
+                        seen_ids.add(w.work_id)
+                        new_items.append(w)
+                works.extend(new_items)
+
+                print(f"[{self.platform_name}] cursor={cursor} got={len(notes)}, new={len(new_items)}, total_now={len(works)}, declared_total={total}", flush=True)
+
+                if total and len(works) >= total:
+                    break
+                if len(new_items) == 0:
+                    break
+
+                next_page = data.get('page', "")
+                if next_page == cursor:
+                    next_page = ""
+                if next_page == -1 or str(next_page) == "-1":
+                    next_page = ""
+                if next_page is None or next_page == "":
+                    if isinstance(cursor, int):
+                        cursor = cursor + 1
+                    else:
+                        cursor = len(works) // api_page_size
                 else:
-                    print(f"[{self.platform_name}] 达到最大页数限制 {max_pages},共 {len(works)} 条", flush=True)
-            else:
-                print(f"[{self.platform_name}] 未能捕获到 API 数据", flush=True)
-            
+                    cursor = next_page
+
+                await asyncio.sleep(0.5)
+
         except Exception as e:
             import traceback
             print(f"[{self.platform_name}] 发生异常: {e}", flush=True)
             traceback.print_exc()
-            return WorksResult(
-                success=False,
-                platform=self.platform_name,
-                error=str(e)
-            )
+            return WorksResult(success=False, platform=self.platform_name, error=str(e))
         finally:
-            # 确保关闭浏览器
             await self.close_browser()
-        
+
         return WorksResult(
             success=True,
             platform=self.platform_name,
             works=works,
             total=total or len(works),
-            has_more=has_more
+            has_more=False,
+            next_page=-1
         )
     
     async def get_comments(self, cookies: str, work_id: str, cursor: str = "") -> CommentsResult:

+ 83 - 0
server/python/test_xiaohongshu_paging_logic.py

@@ -0,0 +1,83 @@
+import asyncio
+import sys
+from pathlib import Path
+
+CURRENT_DIR = Path(__file__).parent.resolve()
+if str(CURRENT_DIR) not in sys.path:
+    sys.path.insert(0, str(CURRENT_DIR))
+
+from platforms.xiaohongshu import XiaohongshuPublisher
+
+
+async def main():
+    publisher = XiaohongshuPublisher(headless=True)
+
+    async def fake_fetch_notes_page(p: int):
+        if str(p) == "0":
+            return {
+                "success": True,
+                "data": {
+                    "notes": [{"id": "1"}, {"id": "2"}],
+                    "tags": [{"id": "special.note_time_desc", "notes_count": 3}],
+                    "page": 1,
+                },
+            }
+        if str(p) == "1":
+            return {
+                "success": True,
+                "data": {
+                    "notes": [{"id": "3"}],
+                    "tags": [{"id": "special.note_time_desc", "notes_count": 3}],
+                    "page": -1,
+                },
+            }
+        return {"success": True, "data": {"notes": []}}
+
+    class DummyPage:
+        url = "https://creator.xiaohongshu.com/new/note-manager"
+
+        async def goto(self, *_args, **_kwargs):
+            return None
+
+        async def evaluate(self, _script, p):
+            return await fake_fetch_notes_page(p)
+
+    publisher.page = DummyPage()
+    publisher.context = object()
+    publisher.browser = object()
+
+    async def dummy_init_browser():
+        return None
+
+    async def dummy_set_cookies(_cookies):
+        return None
+
+    publisher.init_browser = dummy_init_browser  # type: ignore
+    publisher.set_cookies = dummy_set_cookies  # type: ignore
+    publisher.parse_cookies = lambda _c: []  # type: ignore
+    publisher.close_browser = lambda: asyncio.sleep(0)  # type: ignore
+
+    r0 = await publisher.get_works("[]", page=0, page_size=20)
+    assert r0.success
+    assert r0.total == 3
+    assert r0.has_more is True
+    assert r0.next_page == 1
+
+    r1 = await publisher.get_works("[]", page=1, page_size=20)
+    assert r1.success
+    assert r1.total == 3
+    assert r1.has_more is False
+    assert r1.next_page == -1
+
+    print("test_xiaohongshu_paging_logic passed")
+
+    r_all = await publisher.get_all_works("[]")
+    assert r_all.success
+    assert r_all.total == 3
+    assert len(r_all.works) == 3
+    assert r_all.has_more is False
+    print("test_xiaohongshu_auto_paging passed")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

+ 48 - 0
server/src/scripts/selftest-xhs.ts

@@ -0,0 +1,48 @@
+import assert from 'node:assert/strict';
+import { extractDeclaredNotesCountFromPostedResponse, extractNotesCountFromPostedResponse } from '../utils/xiaohongshu.js';
+
+const sample = {
+  success: true,
+  data: {
+    notes: new Array(12).fill(null).map((_, i) => ({ id: String(i + 1) })),
+    tags: [
+      { id: 'special.note_time_desc', notes_count: 368 },
+      { id: 'other', notes_count: 12 },
+    ],
+  },
+};
+
+assert.equal(extractNotesCountFromPostedResponse(sample), 368);
+assert.equal(extractDeclaredNotesCountFromPostedResponse(sample), 368);
+
+const sampleMaxTag = {
+  data: {
+    notes: [],
+    tags: [
+      { id: 'a', notes_count: 10 },
+      { id: 'b', notes_count: 20 },
+    ],
+  },
+};
+assert.equal(extractNotesCountFromPostedResponse(sampleMaxTag), 20);
+assert.equal(extractDeclaredNotesCountFromPostedResponse(sampleMaxTag), 20);
+
+const sampleTotalOnly = {
+  data: {
+    notes: [{ id: '1' }],
+    total_count: 99,
+  },
+};
+assert.equal(extractNotesCountFromPostedResponse(sampleTotalOnly), 99);
+assert.equal(extractDeclaredNotesCountFromPostedResponse(sampleTotalOnly), 99);
+
+const sampleFallback = {
+  data: {
+    notes: [{ id: '1' }, { id: '2' }],
+  },
+};
+assert.equal(extractNotesCountFromPostedResponse(sampleFallback), 2);
+assert.equal(extractDeclaredNotesCountFromPostedResponse(sampleFallback), 0);
+
+console.log('selftest-xhs passed');
+

+ 199 - 197
server/src/services/HeadlessBrowserService.ts

@@ -1,6 +1,7 @@
 /// <reference lib="dom" />
 import { chromium, type BrowserContext, type Page } from 'playwright';
 import { logger } from '../utils/logger.js';
+import { extractDeclaredNotesCountFromPostedResponse } from '../utils/xiaohongshu.js';
 import type { PlatformType } from '@media-manager/shared';
 
 // Python 服务配置
@@ -79,6 +80,9 @@ export interface AccountInfo {
   fansCount?: number;
   worksCount: number;
   worksList?: WorkItem[];
+  worksListComplete?: boolean;
+  source?: 'python' | 'playwright' | 'api';
+  pythonAvailable?: boolean;
 }
 
 export interface WorkItem {
@@ -512,19 +516,40 @@ class HeadlessBrowserService {
   /**
    * 通过 Python API 获取作品列表
    */
-  private async fetchWorksViaPython(platform: PlatformType, cookies: CookieData[]): Promise<WorkItem[]> {
+  private async fetchWorksViaPython(
+    platform: PlatformType,
+    cookies: CookieData[],
+    onPage?: (info: {
+      platform: PlatformType;
+      page: string;
+      pageSize: number;
+      fetched: number;
+      newCount: number;
+      totalSoFar: number;
+      declaredTotal?: number;
+      hasMore: boolean;
+      nextPage?: unknown;
+    }) => void
+  ): Promise<{
+    works: WorkItem[];
+    total?: number;
+  }> {
     logger.info(`[Python API] Fetching works for ${platform} (auto pagination)...`);
 
     const cookieString = JSON.stringify(cookies);
     const pythonPlatform = platform === 'weixin_video' ? 'weixin' : platform;
 
-    const pageSize = 50;
-    const maxPages = 30;
+    const pageSize = platform === 'xiaohongshu' ? 20 : 50;
+    let maxPages = 30;
     const allWorks: WorkItem[] = [];
     const seenIds = new Set<string>();
+    let declaredTotal: number | undefined;
 
-    for (let page = 0; page < maxPages; page++) {
-      logger.info(`[Python API] Fetching works page=${page}, page_size=${pageSize} for ${platform}`);
+    let cursor: string | number = 0;
+    const seenCursors = new Set<string>();
+    for (let pageIndex = 0; pageIndex < maxPages; pageIndex++) {
+      const pageParam = platform === 'xiaohongshu' ? cursor : pageIndex;
+      logger.info(`[Python API] Fetching works page=${String(pageParam)}, page_size=${pageSize} for ${platform}`);
 
       const response = await fetch(`${PYTHON_SERVICE_URL}/works`, {
         method: 'POST',
@@ -534,8 +559,9 @@ class HeadlessBrowserService {
         body: JSON.stringify({
           platform: pythonPlatform,
           cookie: cookieString,
-          page,
+          page: pageParam,
           page_size: pageSize,
+          auto_paging: platform === 'xiaohongshu' && pageIndex === 0,
         }),
       });
 
@@ -549,6 +575,13 @@ class HeadlessBrowserService {
         throw new Error(result.error || 'Failed to get works');
       }
 
+      if (typeof result.total === 'number' && result.total > 0) {
+        declaredTotal = declaredTotal ? Math.max(declaredTotal, result.total) : result.total;
+        if (pageIndex === 0) {
+          maxPages = Math.min(400, Math.ceil(result.total / pageSize) + 5);
+        }
+      }
+
       const pageWorks: WorkItem[] = (result.works || []).map((work: {
         work_id: string;
         title: string;
@@ -585,27 +618,69 @@ class HeadlessBrowserService {
       }
 
       logger.info(
-        `[Python API] Page ${page} fetched=${pageWorks.length}, new=${newCount}, total=${allWorks.length}, has_more=${!!result.has_more}`
+        `[Python API] Page ${String(pageParam)} fetched=${pageWorks.length}, new=${newCount}, total=${allWorks.length}, has_more=${!!result.has_more}, declared_total=${declaredTotal || 0}, next_page=${String(result.next_page ?? '')}`
       );
+      onPage?.({
+        platform,
+        page: String(pageParam),
+        pageSize,
+        fetched: pageWorks.length,
+        newCount,
+        totalSoFar: allWorks.length,
+        declaredTotal,
+        hasMore: !!result.has_more,
+        nextPage: result.next_page,
+      });
+
+      if (platform === 'xiaohongshu') {
+        const next = result.next_page;
+        const expectedMore = declaredTotal && declaredTotal > 0 ? allWorks.length < declaredTotal : !!result.has_more;
+
+        if (next !== undefined && next !== null && next !== '' && next !== -1 && next !== '-1') {
+          const key = String(next);
+          if (seenCursors.has(key)) break;
+          seenCursors.add(key);
+          cursor = next;
+        } else {
+          cursor = (typeof cursor === 'number' ? cursor + 1 : pageIndex + 1);
+        }
 
-      if (!result.has_more || pageWorks.length === 0 || newCount === 0) {
-        break;
+        if (!expectedMore || pageWorks.length === 0 || newCount === 0) break;
+      } else {
+        if (!result.has_more || pageWorks.length === 0 || newCount === 0) break;
       }
     }
 
     logger.info(`[Python API] Total works fetched for ${platform}: ${allWorks.length}`);
-    return allWorks;
+    return { works: allWorks, total: declaredTotal };
   }
 
   /**
    * 获取账号信息(优先使用 Python API,回退到无头浏览器)
    */
-  async fetchAccountInfo(platform: PlatformType, cookies: CookieData[]): Promise<AccountInfo> {
+  async fetchAccountInfo(
+    platform: PlatformType,
+    cookies: CookieData[],
+    options?: {
+      onWorksFetchProgress?: (info: {
+        platform: PlatformType;
+        page: string;
+        pageSize: number;
+        fetched: number;
+        newCount: number;
+        totalSoFar: number;
+        declaredTotal?: number;
+        hasMore: boolean;
+        nextPage?: unknown;
+      }) => void;
+    }
+  ): Promise<AccountInfo> {
     logger.info(`[fetchAccountInfo] Starting for platform: ${platform}`);
+    let pythonAvailable = false;
 
     // 百家号:优先走 Python 的 /account_info(包含粉丝数、作品数),避免 Node 直连分散认证问题
     if (platform === 'baijiahao') {
-      const pythonAvailable = await this.checkPythonServiceAvailable();
+      pythonAvailable = await this.checkPythonServiceAvailable();
       if (pythonAvailable) {
         logger.info(`[Python API] Service available, fetching account_info for baijiahao`);
         try {
@@ -618,7 +693,10 @@ class HeadlessBrowserService {
       }
 
       // Python 不可用或失败时,回退到 Node 直连 API(可能仍会遇到分散认证问题)
-      return this.fetchBaijiahaoAccountInfoDirectApi(cookies);
+      const info = await this.fetchBaijiahaoAccountInfoDirectApi(cookies);
+      info.source = 'api';
+      info.pythonAvailable = pythonAvailable;
+      return info;
     }
 
     // 对于支持的平台,尝试使用 Python API 获取作品列表和账号信息
@@ -626,11 +704,15 @@ class HeadlessBrowserService {
     const supportedPlatforms: PlatformType[] = ['douyin', 'xiaohongshu', 'kuaishou', 'weixin_video', 'baijiahao'];
 
     if (supportedPlatforms.includes(platform)) {
-      const pythonAvailable = await this.checkPythonServiceAvailable();
+      pythonAvailable = await this.checkPythonServiceAvailable();
       if (pythonAvailable) {
         logger.info(`[Python API] Service available, trying to fetch works for ${platform}`);
         try {
-          const worksList = await this.fetchWorksViaPython(platform, cookies);
+          const { works: worksList, total: worksTotal } = await this.fetchWorksViaPython(
+            platform,
+            cookies,
+            options?.onWorksFetchProgress
+          );
 
           // 如果成功获取到作品,使用 Playwright 获取账号基本信息
           if (worksList.length > 0) {
@@ -652,15 +734,21 @@ class HeadlessBrowserService {
               }
 
               accountInfo.worksList = worksList;
-              // 直接使用 Python API 获取的作品数量(最准确,排除了已删除/私密视频)
-              accountInfo.worksCount = worksList.length;
+              // 账号展示的作品数优先用 Python 返回的 total(更接近创作者中心“全部笔记”),否则回退到抓到的列表长度
+              accountInfo.worksCount = worksTotal && worksTotal > 0 ? worksTotal : worksList.length;
+              accountInfo.worksListComplete = worksTotal && worksTotal > 0 ? worksList.length >= worksTotal : undefined;
+              accountInfo.source = 'python';
+              accountInfo.pythonAvailable = true;
               logger.info(`[fetchAccountInfo] Using Python API works count for ${platform}: ${accountInfo.worksCount}`);
               return accountInfo;
             } catch (playwrightError) {
               logger.warn(`[Playwright] Failed to get account info for ${platform}:`, playwrightError);
               const accountInfo = this.getDefaultAccountInfo(platform);
               accountInfo.worksList = worksList;
-              accountInfo.worksCount = worksList.length;
+              accountInfo.worksCount = worksTotal && worksTotal > 0 ? worksTotal : worksList.length;
+              accountInfo.worksListComplete = worksTotal && worksTotal > 0 ? worksList.length >= worksTotal : undefined;
+              accountInfo.source = 'python';
+              accountInfo.pythonAvailable = true;
               return accountInfo;
             }
           }
@@ -677,7 +765,10 @@ class HeadlessBrowserService {
 
     // 使用 Playwright 获取账号信息
     logger.info(`[Playwright] Fetching account info for ${platform}`);
-    return this.fetchAccountInfoWithPlaywright(platform, cookies);
+    const info = await this.fetchAccountInfoWithPlaywright(platform, cookies);
+    info.source = 'playwright';
+    info.pythonAvailable = pythonAvailable;
+    return info;
   }
 
   /**
@@ -1758,6 +1849,7 @@ class HeadlessBrowserService {
 
       // 获取作品列表 - 通过监听 API 接口
       const worksList: WorkItem[] = [];
+      let worksListComplete: boolean | undefined;
       try {
         logger.info('[Xiaohongshu] Navigating to note manager page to fetch works...');
 
@@ -1777,13 +1869,70 @@ class HeadlessBrowserService {
           shareCount: number;
         }> = [];
 
-        let currentPage = 0;
-        let hasMorePages = true;
-        const maxPages = 20; // 最多获取20页,防止无限循环
+        let maxPages = 120;
 
         // 设置 API 响应监听器 - 在导航之前绑定
         let apiResponseReceived = false;
         let totalNotesCount = 0; // 从 tags 中获取的总作品数
+        let stoppedByMaxPages = false;
+        const seenNoteIds = new Set<string>();
+
+        const upsertNotesFromPayload = (payload: any) => {
+          if (!payload) return;
+          const declaredTotal = extractDeclaredNotesCountFromPostedResponse(payload);
+          if (declaredTotal > 0) {
+            totalNotesCount = Math.max(totalNotesCount, declaredTotal);
+          }
+
+          if (totalNotesCount > 0) {
+            const estimatedPages = Math.ceil(totalNotesCount / 20) + 5;
+            maxPages = Math.max(maxPages, Math.min(500, estimatedPages));
+          }
+
+          const notes = payload.notes || [];
+          for (const note of notes) {
+            const noteId = note.id || '';
+            if (!noteId || seenNoteIds.has(noteId)) continue;
+            seenNoteIds.add(noteId);
+
+            let coverUrl = note.images_list?.[0]?.url || '';
+            if (coverUrl.startsWith('http://')) {
+              coverUrl = coverUrl.replace('http://', 'https://');
+            }
+            const duration = note.video_info?.duration || 0;
+
+            allNotesData.push({
+              noteId,
+              title: note.display_title || '',
+              coverUrl,
+              status: note.tab_status || 1,
+              publishTime: note.time || '',
+              type: note.type || 'normal',
+              duration,
+              likeCount: note.likes || 0,
+              commentCount: note.comments_count || 0,
+              collectCount: note.collected_count || 0,
+              viewCount: note.view_count || 0,
+              shareCount: note.shared_count || 0,
+            });
+          }
+        };
+
+        const fetchNotesPage = async (pageNum: number) => {
+          return await page.evaluate(async (p) => {
+            const response = await fetch(
+              `https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${p}`,
+              {
+                method: 'GET',
+                credentials: 'include',
+                headers: {
+                  Accept: 'application/json',
+                },
+              }
+            );
+            return await response.json();
+          }, pageNum);
+        };
 
         const notesApiHandler = async (response: import('playwright').Response) => {
           const url = response.url();
@@ -1798,56 +1947,8 @@ class HeadlessBrowserService {
 
               if ((data?.success || data?.code === 0) && data?.data) {
                 apiResponseReceived = true;
-
-                // 从 tags 中获取总作品数
-                // tags 数组中 id="special.note_time_desc" 的项("所有笔记")包含总数
-                if (data.data.tags && Array.isArray(data.data.tags)) {
-                  const allNotesTag = data.data.tags.find((tag: { id?: string; notes_count?: number }) =>
-                    tag.id === 'special.note_time_desc'
-                  );
-                  if (allNotesTag?.notes_count !== undefined) {
-                    totalNotesCount = allNotesTag.notes_count;
-                    logger.info(`[Xiaohongshu API] Total notes count from tags: ${totalNotesCount}`);
-                  }
-                }
-
-                const notes = data.data.notes || [];
-                for (const note of notes) {
-                  // 根据 API 返回格式解析数据
-                  // images_list 是数组,第一个元素包含 url
-                  // 将 http:// 转换为 https:// 以确保图片能正常加载
-                  let coverUrl = note.images_list?.[0]?.url || '';
-                  if (coverUrl.startsWith('http://')) {
-                    coverUrl = coverUrl.replace('http://', 'https://');
-                  }
-                  const duration = note.video_info?.duration || 0;
-
-                  logger.info(`[Xiaohongshu API] Note: id=${note.id}, title="${note.display_title}", cover=${coverUrl ? coverUrl.slice(0, 60) + '...' : 'none'}`);
-
-                  allNotesData.push({
-                    noteId: note.id || '',
-                    title: note.display_title || '',
-                    coverUrl: coverUrl,
-                    status: note.tab_status || 1, // 1=已发布
-                    publishTime: note.time || '',
-                    type: note.type || 'normal', // video/normal
-                    duration: duration,
-                    likeCount: note.likes || 0,
-                    commentCount: note.comments_count || 0,
-                    collectCount: note.collected_count || 0,
-                    viewCount: note.view_count || 0,
-                    shareCount: note.shared_count || 0,
-                  });
-                }
-
-                // 检查是否还有更多页面
-                // page=-1 表示没有更多数据
-                if (data.data.page === -1 || notes.length === 0) {
-                  hasMorePages = false;
-                  logger.info(`[Xiaohongshu API] No more pages (page indicator: ${data.data.page})`);
-                }
+                upsertNotesFromPayload(data.data);
               } else {
-                hasMorePages = false;
               }
             }
           } catch (e) {
@@ -1882,134 +1983,37 @@ class HeadlessBrowserService {
           logger.info('[Xiaohongshu] No notes captured via listener, trying direct API call...');
 
           try {
-            // 直接在页面上下文中调用 API
-            const apiResponse = await page.evaluate(async () => {
-              const response = await fetch('https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=0', {
-                method: 'GET',
-                credentials: 'include',
-                headers: {
-                  'Accept': 'application/json',
-                },
-              });
-              return await response.json();
-            });
-
+            const apiResponse = await fetchNotesPage(0);
             logger.info(`[Xiaohongshu] Direct API call result: success=${apiResponse?.success}, code=${apiResponse?.code}`);
-
             if ((apiResponse?.success || apiResponse?.code === 0) && apiResponse?.data) {
-              // 从 tags 中获取总作品数
-              if (apiResponse.data.tags && Array.isArray(apiResponse.data.tags)) {
-                const allNotesTag = apiResponse.data.tags.find((tag: { id?: string; notes_count?: number }) =>
-                  tag.id === 'special.note_time_desc'
-                );
-                if (allNotesTag?.notes_count !== undefined) {
-                  totalNotesCount = allNotesTag.notes_count;
-                  logger.info(`[Xiaohongshu API Direct] Total notes count from tags: ${totalNotesCount}`);
-                }
-              }
-
-              const notes = apiResponse.data.notes || [];
-              for (const note of notes) {
-                // 将 http:// 转换为 https://
-                let coverUrl = note.images_list?.[0]?.url || '';
-                if (coverUrl.startsWith('http://')) {
-                  coverUrl = coverUrl.replace('http://', 'https://');
-                }
-                const duration = note.video_info?.duration || 0;
-
-                logger.info(`[Xiaohongshu API Direct] Note: id=${note.id}, cover=${coverUrl ? coverUrl.slice(0, 60) + '...' : 'none'}`);
-
-                allNotesData.push({
-                  noteId: note.id || '',
-                  title: note.display_title || '',
-                  coverUrl: coverUrl,
-                  status: note.tab_status || 1,
-                  publishTime: note.time || '',
-                  type: note.type || 'normal',
-                  duration: duration,
-                  likeCount: note.likes || 0,
-                  commentCount: note.comments_count || 0,
-                  collectCount: note.collected_count || 0,
-                  viewCount: note.view_count || 0,
-                  shareCount: note.shared_count || 0,
-                });
-              }
-
-              // 获取更多页面
-              let pageNum = 1;
-              let lastPage = apiResponse.data.page;
-              while (lastPage !== -1 && pageNum < maxPages) {
-                const nextResponse = await page.evaluate(async (p) => {
-                  const response = await fetch(`https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${p}`, {
-                    method: 'GET',
-                    credentials: 'include',
-                  });
-                  return await response.json();
-                }, pageNum);
-
-                if (nextResponse?.data?.notes?.length > 0) {
-                  for (const note of nextResponse.data.notes) {
-                    // 将 http:// 转换为 https://
-                    let coverUrl = note.images_list?.[0]?.url || '';
-                    if (coverUrl.startsWith('http://')) {
-                      coverUrl = coverUrl.replace('http://', 'https://');
-                    }
-                    const duration = note.video_info?.duration || 0;
-
-                    allNotesData.push({
-                      noteId: note.id || '',
-                      title: note.display_title || '',
-                      coverUrl: coverUrl,
-                      status: note.tab_status || 1,
-                      publishTime: note.time || '',
-                      type: note.type || 'normal',
-                      duration: duration,
-                      likeCount: note.likes || 0,
-                      commentCount: note.comments_count || 0,
-                      collectCount: note.collected_count || 0,
-                      viewCount: note.view_count || 0,
-                      shareCount: note.shared_count || 0,
-                    });
-                  }
-                  pageNum++;
-                  lastPage = nextResponse.data.page;
-                  if (lastPage === -1) break;
-                } else {
-                  break;
-                }
-              }
+              upsertNotesFromPayload(apiResponse.data);
             }
           } catch (apiError) {
             logger.warn('[Xiaohongshu] Direct API call failed:', apiError);
           }
         }
 
-        // 如果还是没有数据,尝试滚动加载
-        if (allNotesData.length === 0) {
-          logger.info('[Xiaohongshu] Still no notes, trying scroll to trigger API...');
-          await page.waitForTimeout(2000);
-        }
-
-        // 滚动加载更多页面(如果通过监听器获取的数据)
-        while (hasMorePages && currentPage < maxPages && allNotesData.length > 0 && apiResponseReceived) {
-          currentPage++;
-          const previousCount = allNotesData.length;
-
-          // 滚动到页面底部触发加载更多
-          await page.evaluate(() => {
-            window.scrollTo(0, document.body.scrollHeight);
-          });
-
-          // 等待新数据加载
-          await page.waitForTimeout(2000);
+        if (allNotesData.length > 0) {
+          let pageNum = 1;
+          while (pageNum < maxPages) {
+            if (totalNotesCount > 0 && seenNoteIds.size >= totalNotesCount) break;
+            let nextResponse: any;
+            try {
+              nextResponse = await fetchNotesPage(pageNum);
+            } catch (e) {
+              logger.warn(`[Xiaohongshu] Page fetch failed: page=${pageNum}`, e);
+              break;
+            }
 
-          // 如果没有新数据,退出循环
-          if (allNotesData.length === previousCount) {
-            logger.info(`[Xiaohongshu] No new notes loaded after scroll, stopping at page ${currentPage}`);
-            break;
+            if (!(nextResponse?.success || nextResponse?.code === 0) || !nextResponse?.data) break;
+            const before = seenNoteIds.size;
+            upsertNotesFromPayload(nextResponse.data);
+            const after = seenNoteIds.size;
+            if (after === before) break;
+            pageNum++;
+            await page.waitForTimeout(600);
           }
-
-          logger.info(`[Xiaohongshu] Page ${currentPage}: total ${allNotesData.length} notes`);
+          if (pageNum >= maxPages) stoppedByMaxPages = true;
         }
 
         // 移除监听器
@@ -2048,21 +2052,19 @@ class HeadlessBrowserService {
 
         logger.info(`[Xiaohongshu] Fetched ${worksList.length} works via API`);
 
-        // 更新作品数:直接使用获取到的 notes 数量(更准确)
-        // 只有当 notes 为空时才使用 tags 中的 notes_count
-        if (worksList.length > 0) {
-          worksCount = worksList.length;
-          logger.info(`[Xiaohongshu] Using actual notes count: ${worksCount}`);
-        } else if (totalNotesCount > 0) {
+        if (totalNotesCount > 0) {
+          worksListComplete = worksList.length >= totalNotesCount;
           worksCount = totalNotesCount;
-          logger.info(`[Xiaohongshu] Using notes count from tags: ${worksCount}`);
+        } else if (worksList.length > 0) {
+          worksListComplete = !stoppedByMaxPages;
+          worksCount = worksList.length;
         }
       } catch (worksError) {
         logger.warn('[Xiaohongshu] Failed to fetch works list:', worksError);
       }
 
       logger.info(`[Xiaohongshu] Final account info: id=${accountId}, name=${accountName}, fans=${fansCount}, works=${worksCount}`);
-      return { accountId, accountName, avatarUrl, fansCount, worksCount, worksList };
+      return { accountId, accountName, avatarUrl, fansCount, worksCount, worksList, worksListComplete };
     } catch (error) {
       logger.warn('[Xiaohongshu] Failed to fetch account info:', error);
     }

+ 161 - 68
server/src/services/WorkService.ts

@@ -88,13 +88,29 @@ export class WorkService {
   /**
    * 同步账号的作品
    */
-  async syncWorks(userId: number, accountId?: number): Promise<{ synced: number; accounts: number }> {
+  async syncWorks(
+    userId: number,
+    accountId?: number,
+    onProgress?: (progress: number, currentStep: string) => void
+  ): Promise<{
+    synced: number;
+    accounts: number;
+    accountSummaries: Array<{
+      accountId: number;
+      platform: PlatformType;
+      worksListLength: number;
+      worksCount: number;
+      source?: string;
+      pythonAvailable?: boolean;
+      syncedCount: number;
+    }>;
+  }> {
     logger.info(`[SyncWorks] Starting sync for userId: ${userId}, accountId: ${accountId || 'all'}`);
-    
+
     // 先查看所有账号(调试用)
     const allAccounts = await this.accountRepository.find({ where: { userId } });
     logger.info(`[SyncWorks] All accounts for user ${userId}: ${allAccounts.map(a => `id=${a.id},status=${a.status},platform=${a.platform}`).join('; ')}`);
-    
+
     // 同时查询 active 和 expired 状态的账号(expired 的账号 cookie 可能实际上还有效)
     const queryBuilder = this.accountRepository
       .createQueryBuilder('account')
@@ -107,20 +123,47 @@ export class WorkService {
 
     const accounts = await queryBuilder.getMany();
     logger.info(`[SyncWorks] Found ${accounts.length} accounts (active + expired)`);
-    
+
     let totalSynced = 0;
     let accountCount = 0;
-
-    for (const account of accounts) {
+    const accountSummaries: Array<{
+      accountId: number;
+      platform: PlatformType;
+      worksListLength: number;
+      worksCount: number;
+      source?: string;
+      pythonAvailable?: boolean;
+      syncedCount: number;
+    }> = [];
+
+    for (let i = 0; i < accounts.length; i++) {
+      const account = accounts[i];
       try {
         logger.info(`[SyncWorks] Syncing account ${account.id} (${account.platform}, status: ${account.status})`);
-        const synced = await this.syncAccountWorks(userId, account);
-        totalSynced += synced;
+        onProgress?.(
+          Math.min(95, 5 + Math.round((i / Math.max(1, accounts.length)) * 90)),
+          `同步账号 ${i + 1}/${accounts.length}: ${account.accountName || account.id} (${account.platform})`
+        );
+
+        const result = await this.syncAccountWorks(userId, account, (p, step) => {
+          const overall = 5 + Math.round(((i + Math.max(0, Math.min(1, p))) / Math.max(1, accounts.length)) * 90);
+          onProgress?.(Math.min(95, overall), step);
+        });
+        totalSynced += result.syncedCount;
         accountCount++;
-        logger.info(`[SyncWorks] Account ${account.id} synced ${synced} works`);
-        
+        accountSummaries.push({
+          accountId: account.id,
+          platform: account.platform as PlatformType,
+          worksListLength: result.worksListLength,
+          worksCount: result.worksCount,
+          source: result.source,
+          pythonAvailable: result.pythonAvailable,
+          syncedCount: result.syncedCount,
+        });
+        logger.info(`[SyncWorks] Account ${account.id} synced ${result.syncedCount} works`);
+
         // 如果同步成功且账号状态是 expired,则恢复为 active
-        if (synced > 0 && account.status === 'expired') {
+        if (result.syncedCount > 0 && account.status === 'expired') {
           await this.accountRepository.update(account.id, { status: 'active' });
           logger.info(`[SyncWorks] Account ${account.id} status restored to active`);
         }
@@ -129,19 +172,30 @@ export class WorkService {
       }
     }
 
+    onProgress?.(100, `同步完成:共同步 ${totalSynced} 条作品(${accountCount} 个账号)`);
     logger.info(`[SyncWorks] Complete: ${totalSynced} works synced from ${accountCount} accounts`);
-    return { synced: totalSynced, accounts: accountCount };
+    return { synced: totalSynced, accounts: accountCount, accountSummaries };
   }
 
   /**
    * 同步单个账号的作品
    */
-  private async syncAccountWorks(userId: number, account: PlatformAccount): Promise<number> {
+  private async syncAccountWorks(
+    userId: number,
+    account: PlatformAccount,
+    onProgress?: (progress: number, currentStep: string) => void
+  ): Promise<{
+    syncedCount: number;
+    worksListLength: number;
+    worksCount: number;
+    source?: string;
+    pythonAvailable?: boolean;
+  }> {
     logger.info(`[SyncAccountWorks] Starting for account ${account.id} (${account.platform})`);
-    
+
     if (!account.cookieData) {
       logger.warn(`Account ${account.id} has no cookie data`);
-      return 0;
+      return { syncedCount: 0, worksListLength: 0, worksCount: 0 };
     }
 
     // 解密 Cookie
@@ -167,21 +221,36 @@ export class WorkService {
       logger.info(`[SyncAccountWorks] Parsed ${cookieList.length} cookies from string format`);
       if (cookieList.length === 0) {
         logger.error(`Invalid cookie format for account ${account.id}`);
-        return 0;
+        return { syncedCount: 0, worksListLength: 0, worksCount: 0 };
       }
     }
 
     // 获取作品列表
     logger.info(`[SyncAccountWorks] Fetching account info from ${platform}...`);
-    const accountInfo = await headlessBrowserService.fetchAccountInfo(platform, cookieList);
+    onProgress?.(0.1, `获取作品列表中:${account.accountName || account.id} (${platform})`);
+    const accountInfo = await headlessBrowserService.fetchAccountInfo(platform, cookieList, {
+      onWorksFetchProgress: (info) => {
+        const declaredTotal = typeof info.declaredTotal === 'number' ? info.declaredTotal : 0;
+        const ratio = declaredTotal > 0 ? Math.min(1, info.totalSoFar / declaredTotal) : 0;
+        onProgress?.(
+          0.1 + ratio * 0.2,
+          `拉取作品中:${account.accountName || account.id} (${platform}) ${info.totalSoFar}/${declaredTotal || '?'}`
+        );
+      },
+    });
     logger.info(`[SyncAccountWorks] Got ${accountInfo.worksList?.length || 0} works from API`);
+    onProgress?.(
+      0.3,
+      `拉取完成:${account.accountName || account.id} (${platform}) python=${accountInfo.pythonAvailable ? 'ok' : 'off'} source=${accountInfo.source || 'unknown'} list=${accountInfo.worksList?.length || 0} total=${accountInfo.worksCount || 0}`
+    );
 
     let syncedCount = 0;
-    
+
     // 收集远程作品的 platformVideoId
     const remotePlatformVideoIds = new Set<string>();
 
     if (accountInfo.worksList && accountInfo.worksList.length > 0) {
+      const total = accountInfo.worksList.length;
       for (const workItem of accountInfo.worksList) {
         // 生成一个唯一的视频ID
         const platformVideoId = workItem.videoId || `${platform}_${workItem.title}_${workItem.publishTime}`.substring(0, 100);
@@ -225,30 +294,48 @@ export class WorkService {
           await this.workRepository.save(work);
         }
         syncedCount++;
+        if (syncedCount === 1 || syncedCount === total || syncedCount % 10 === 0) {
+          onProgress?.(0.3 + (syncedCount / total) * 0.65, `写入作品:${account.accountName || account.id} ${syncedCount}/${total}`);
+        }
       }
 
       logger.info(`Synced ${syncedCount} works for account ${account.id}`);
     }
 
     // 删除本地存在但远程已删除的作品
-    const localWorks = await this.workRepository.find({
-      where: { accountId: account.id },
-    });
-    
-    let deletedCount = 0;
-    for (const localWork of localWorks) {
-      if (!remotePlatformVideoIds.has(localWork.platformVideoId)) {
-        // 先删除关联的评论
-        await AppDataSource.getRepository(Comment).delete({ workId: localWork.id });
-        // 再删除作品
-        await this.workRepository.delete(localWork.id);
-        deletedCount++;
-        logger.info(`Deleted work ${localWork.id} (${localWork.title}) - no longer exists on platform`);
+    const remoteListLength = accountInfo.worksList?.length || 0;
+    const expectedRemoteCount = accountInfo.worksCount || 0;
+    const remoteComplete =
+      typeof accountInfo.worksListComplete === 'boolean'
+        ? accountInfo.worksListComplete
+        : expectedRemoteCount > 0
+          ? remoteListLength >= expectedRemoteCount
+          : remoteListLength > 0;
+
+    if (!remoteComplete) {
+      logger.warn(
+        `[SyncAccountWorks] Skipping local deletions for account ${account.id} because remote works list seems incomplete (remote=${remoteListLength}, expected=${expectedRemoteCount})`
+      );
+    } else if (remotePlatformVideoIds.size === 0) {
+      logger.warn(`[SyncAccountWorks] Skipping local deletions for account ${account.id} because no remote IDs were collected`);
+    } else {
+      const localWorks = await this.workRepository.find({
+        where: { accountId: account.id },
+      });
+
+      let deletedCount = 0;
+      for (const localWork of localWorks) {
+        if (!remotePlatformVideoIds.has(localWork.platformVideoId)) {
+          await AppDataSource.getRepository(Comment).delete({ workId: localWork.id });
+          await this.workRepository.delete(localWork.id);
+          deletedCount++;
+          logger.info(`Deleted work ${localWork.id} (${localWork.title}) - no longer exists on platform`);
+        }
+      }
+
+      if (deletedCount > 0) {
+        logger.info(`Deleted ${deletedCount} works that no longer exist on platform for account ${account.id}`);
       }
-    }
-    
-    if (deletedCount > 0) {
-      logger.info(`Deleted ${deletedCount} works that no longer exist on platform for account ${account.id}`);
     }
 
     // 保存每日统计数据
@@ -258,7 +345,13 @@ export class WorkService {
       logger.error(`[SyncAccountWorks] Failed to save day statistics for account ${account.id}:`, error);
     }
 
-    return syncedCount;
+    return {
+      syncedCount,
+      worksListLength: accountInfo.worksList?.length || 0,
+      worksCount: accountInfo.worksCount || 0,
+      source: accountInfo.source,
+      pythonAvailable: accountInfo.pythonAvailable,
+    };
   }
 
   /**
@@ -321,7 +414,7 @@ export class WorkService {
    */
   private parsePublishTime(timeStr: string): Date | null {
     if (!timeStr) return null;
-    
+
     // 尝试解析各种格式
     // 格式: "2025年12月19日 06:33"
     const match = timeStr.match(/(\d{4})年(\d{1,2})月(\d{1,2})日\s*(\d{1,2}):(\d{2})/);
@@ -329,13 +422,13 @@ export class WorkService {
       const [, year, month, day, hour, minute] = match;
       return new Date(parseInt(year), parseInt(month) - 1, parseInt(day), parseInt(hour), parseInt(minute));
     }
-    
+
     // 尝试直接解析
     const date = new Date(timeStr);
     if (!isNaN(date.getTime())) {
       return date;
     }
-    
+
     return null;
   }
 
@@ -346,17 +439,17 @@ export class WorkService {
     const work = await this.workRepository.findOne({
       where: { id: workId, userId },
     });
-    
+
     if (!work) {
       throw new AppError('作品不存在', HTTP_STATUS.NOT_FOUND, ERROR_CODES.NOT_FOUND);
     }
-    
+
     // 先删除关联的评论
     await AppDataSource.getRepository(Comment).delete({ workId });
-    
+
     // 删除作品
     await this.workRepository.delete(workId);
-    
+
     logger.info(`Deleted work ${workId} for user ${userId}`);
   }
 
@@ -365,7 +458,7 @@ export class WorkService {
    * @returns 包含 accountId 用于后续刷新作品列表
    */
   async deletePlatformWork(
-    userId: number, 
+    userId: number,
     workId: number,
     onCaptchaRequired?: (captchaInfo: { taskId: string }) => Promise<string>
   ): Promise<{ success: boolean; errorMessage?: string; accountId?: number }> {
@@ -373,19 +466,19 @@ export class WorkService {
       where: { id: workId, userId },
       relations: ['account'],
     });
-    
+
     if (!work) {
       throw new AppError('作品不存在', HTTP_STATUS.NOT_FOUND, ERROR_CODES.NOT_FOUND);
     }
-    
+
     const account = await this.accountRepository.findOne({
       where: { id: work.accountId },
     });
-    
+
     if (!account || !account.cookieData) {
       throw new AppError('账号不存在或未登录', HTTP_STATUS.BAD_REQUEST, ERROR_CODES.ACCOUNT_NOT_FOUND);
     }
-    
+
     // 解密 Cookie
     let decryptedCookies: string;
     try {
@@ -393,48 +486,48 @@ export class WorkService {
     } catch {
       decryptedCookies = account.cookieData;
     }
-    
+
     // 根据平台调用对应的删除方法
     if (account.platform === 'douyin') {
       const { DouyinAdapter } = await import('../automation/platforms/douyin.js');
       const adapter = new DouyinAdapter();
-      
+
       const result = await adapter.deleteWork(decryptedCookies, work.platformVideoId, onCaptchaRequired);
-      
+
       if (result.success) {
         // 更新作品状态为已删除
         await this.workRepository.update(workId, { status: 'deleted' });
         logger.info(`Platform work ${workId} deleted successfully`);
       }
-      
+
       return { ...result, accountId: account.id };
     }
-    
+
     if (account.platform === 'xiaohongshu') {
       const { XiaohongshuAdapter } = await import('../automation/platforms/xiaohongshu.js');
       const adapter = new XiaohongshuAdapter();
-      
+
       const result = await adapter.deleteWork(decryptedCookies, work.platformVideoId, onCaptchaRequired);
-      
+
       if (result.success) {
         // 更新作品状态为已删除
         await this.workRepository.update(workId, { status: 'deleted' });
         logger.info(`Platform work ${workId} (xiaohongshu) deleted successfully`);
       }
-      
+
       return { ...result, accountId: account.id };
     }
-    
+
     return { success: false, errorMessage: '暂不支持该平台删除功能' };
   }
 
   /**
    * 将 cookie 字符串解析为 cookie 列表
    */
-  private parseCookieString(cookieString: string, platform: PlatformType): { 
-    name: string; 
-    value: string; 
-    domain: string; 
+  private parseCookieString(cookieString: string, platform: PlatformType): {
+    name: string;
+    value: string;
+    domain: string;
     path: string;
   }[] {
     // 获取平台对应的域名
@@ -449,23 +542,23 @@ export class WorkService {
       qie: '.qq.com',
       dayuhao: '.alibaba.com',
     };
-    
+
     const domain = domainMap[platform] || `.${platform}.com`;
-    
+
     // 解析 "name=value; name2=value2" 格式的 cookie 字符串
     const cookies: { name: string; value: string; domain: string; path: string }[] = [];
-    
+
     const pairs = cookieString.split(';');
     for (const pair of pairs) {
       const trimmed = pair.trim();
       if (!trimmed) continue;
-      
+
       const eqIndex = trimmed.indexOf('=');
       if (eqIndex === -1) continue;
-      
+
       const name = trimmed.substring(0, eqIndex).trim();
       const value = trimmed.substring(eqIndex + 1).trim();
-      
+
       if (name && value) {
         cookies.push({
           name,
@@ -475,7 +568,7 @@ export class WorkService {
         });
       }
     }
-    
+
     return cookies;
   }
 

+ 5 - 2
server/src/services/login/XiaohongshuLoginService.ts

@@ -12,6 +12,7 @@
  */
 
 import { logger } from '../../utils/logger.js';
+import { extractNotesCountFromPostedResponse } from '../../utils/xiaohongshu.js';
 import { BaseLoginService, type ApiInterceptConfig } from './BaseLoginService.js';
 import type { AccountInfo, LoginSession } from './types.js';
 
@@ -51,8 +52,10 @@ export class XiaohongshuLoginService extends BaseLoginService {
         urlPattern: '/web_api/sns/v5/creator/note/user/posted',
         dataKey: 'noteList',
         handler: (data: any) => {
-          const notes = data.data?.notes || data.notes || [];
-          return { notes, count: notes.length };
+          const payload = data?.data || data || {};
+          const notes = payload?.notes || data?.notes || [];
+          const count = extractNotesCountFromPostedResponse(data);
+          return { notes, count };
         },
       },
     ];

+ 13 - 2
server/src/services/taskExecutors.ts

@@ -64,16 +64,27 @@ async function syncWorksExecutor(task: Task, updateProgress: ProgressUpdater): P
     throw new Error('缺少用户ID');
   }
 
-  const result = await workService.syncWorks(userId, task.accountId);
+  const result = await workService.syncWorks(userId, task.accountId, (progress, currentStep) => {
+    updateProgress({ progress, currentStep });
+  });
 
   updateProgress({ progress: 100, currentStep: '同步完成' });
 
+  const summaryText = (() => {
+    if (result.accountSummaries.length === 1) {
+      const s = result.accountSummaries[0];
+      return `(${s.platform} list=${s.worksListLength}/${s.worksCount} python=${s.pythonAvailable ? 'ok' : 'off'} source=${s.source || 'unknown'})`;
+    }
+    return '';
+  })();
+
   return {
     success: true,
-    message: `同步完成,共同步 ${result.synced} 个作品`,
+    message: `同步完成,共同步 ${result.synced} 个作品${summaryText}`,
     data: {
       syncedCount: result.synced,
       accountCount: result.accounts,
+      accountSummaries: result.accountSummaries,
     },
   };
 }

+ 66 - 0
server/src/utils/xiaohongshu.ts

@@ -0,0 +1,66 @@
+export function extractNotesCountFromPostedResponse(raw: any): number {
+  const payload = raw?.data || raw || {};
+  const tags = payload?.tags || raw?.tags || [];
+  let countFromTags = 0;
+
+  if (Array.isArray(tags)) {
+    const preferredTag = tags.find((t: any) => t?.id === 'special.note_time_desc');
+    const preferred = preferredTag?.notes_count ?? preferredTag?.notesCount ?? preferredTag?.count;
+    if (preferred !== undefined && preferred !== null) {
+      countFromTags = Number(preferred) || 0;
+    } else {
+      countFromTags = tags.reduce((max: number, t: any) => {
+        const n = Number(t?.notes_count ?? t?.notesCount ?? t?.count ?? 0);
+        return Number.isFinite(n) ? Math.max(max, n) : max;
+      }, 0);
+    }
+  }
+
+  const countFromPayload =
+    Number(
+      payload?.total ||
+      payload?.total_count ||
+      payload?.totalCount ||
+      payload?.notes_count ||
+      payload?.note_count ||
+      payload?.page?.total ||
+      payload?.page?.totalCount ||
+      0
+    ) || 0;
+
+  const notes = payload?.notes || raw?.notes || [];
+  return countFromTags || countFromPayload || (Array.isArray(notes) ? notes.length : 0);
+}
+
+export function extractDeclaredNotesCountFromPostedResponse(raw: any): number {
+  const payload = raw?.data || raw || {};
+  const tags = payload?.tags || raw?.tags || [];
+  let countFromTags = 0;
+
+  if (Array.isArray(tags)) {
+    const preferredTag = tags.find((t: any) => t?.id === 'special.note_time_desc');
+    const preferred = preferredTag?.notes_count ?? preferredTag?.notesCount ?? preferredTag?.count;
+    if (preferred !== undefined && preferred !== null) {
+      countFromTags = Number(preferred) || 0;
+    } else {
+      countFromTags = tags.reduce((max: number, t: any) => {
+        const n = Number(t?.notes_count ?? t?.notesCount ?? t?.count ?? 0);
+        return Number.isFinite(n) ? Math.max(max, n) : max;
+      }, 0);
+    }
+  }
+
+  const countFromPayload =
+    Number(
+      payload?.total ||
+      payload?.total_count ||
+      payload?.totalCount ||
+      payload?.notes_count ||
+      payload?.note_count ||
+      payload?.page?.total ||
+      payload?.page?.totalCount ||
+      0
+    ) || 0;
+
+  return countFromTags || countFromPayload;
+}