|
@@ -579,6 +579,82 @@ class XiaohongshuPublisher(BasePublisher):
|
|
|
status='success'
|
|
status='success'
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
|
|
async def get_account_info(self, cookies: str) -> dict:
    """Fetch the logged-in creator's profile by sniffing the home page API.

    Opens the creator home page with the supplied cookies and listens for
    the ``api/galaxy/creator/home/personal_info`` response. Its ``data``
    payload is flattened into the returned dict. If nothing is captured
    after ten one-second polls, the page is reloaded once and polled for
    another ten seconds.

    Args:
        cookies: Cookie string; passed to ``self.parse_cookies`` and the
            result to ``self.set_cookies``.

    Returns:
        On success: ``{"success": True, "account_id", "account_name",
        "avatar_url", "fans_count", "works_count"}`` where ``works_count``
        is always 0 (this API does not expose it).
        On failure: ``{"success": False, "error": <message>}``. Exceptions
        raised inside the ``try`` body are reported this way rather than
        propagated.
    """
    print(f"\n{'='*60}", flush=True)
    print(f"[{self.platform_name}] 获取账号信息", flush=True)
    print(f"{'='*60}", flush=True)

    captured_info = {}

    def _field(data, key, default):
        # A JSON null must behave like a missing key; otherwise it leaks
        # into the result (e.g. account_id "xhs_None").
        value = data.get(key)
        return default if value is None else value

    async def handle_response(response):
        """Record the profile when the personal-info API response arrives."""
        nonlocal captured_info
        if 'api/galaxy/creator/home/personal_info' not in response.url:
            return
        try:
            json_data = await response.json()
            print(f"[{self.platform_name}] 捕获个人信息 API", flush=True)
            if not (json_data.get('success') or json_data.get('code') == 0):
                return
            # `data` may be present but null; treat that like an absent key.
            data = json_data.get('data') or {}
            captured_info = {
                "account_id": f"xhs_{_field(data, 'red_num', '')}",
                "account_name": _field(data, 'name', ''),
                "avatar_url": _field(data, 'avatar', ''),
                "fans_count": _field(data, 'fans_count', 0),
                # An accurate works count is not available here; it has to
                # come from the works list (see get_works).
                "works_count": 0,
            }
        except Exception as e:
            # Best effort: a malformed response must not break the page load.
            print(f"[{self.platform_name}] 解析个人信息失败: {e}", flush=True)

    async def wait_for_capture(seconds: int = 10) -> bool:
        """Poll once per second until the profile is captured or time is up."""
        for _ in range(seconds):
            if captured_info:
                return True
            await asyncio.sleep(1)
        return bool(captured_info)

    try:
        await self.init_browser()
        cookie_list = self.parse_cookies(cookies)
        await self.set_cookies(cookie_list)

        if not self.page:
            raise RuntimeError("Page not initialized")

        # Register the listener before navigating so the API call made
        # during page load is not missed.
        self.page.on('response', handle_response)

        print(f"[{self.platform_name}] 访问创作者首页...", flush=True)
        await self.page.goto("https://creator.xiaohongshu.com/new/home", wait_until="domcontentloaded")

        if not await wait_for_capture():
            print(f"[{self.platform_name}] 未捕获到个人信息,尝试刷新...", flush=True)
            await self.page.reload()
            if not await wait_for_capture():
                raise RuntimeError("无法获取账号信息")

        # works_count stays 0 here; callers update it via get_works.
        return {
            "success": True,
            **captured_info,
        }

    except Exception as e:
        import traceback
        traceback.print_exc()
        return {
            "success": False,
            "error": str(e),
        }
    finally:
        await self.close_browser()
|
|
|
|
|
+
|
|
|
async def get_works(self, cookies: str, page: int = 0, page_size: int = 20) -> WorksResult:
|
|
async def get_works(self, cookies: str, page: int = 0, page_size: int = 20) -> WorksResult:
|
|
|
"""获取小红书作品列表 - 通过监听页面网络响应获取数据"""
|
|
"""获取小红书作品列表 - 通过监听页面网络响应获取数据"""
|
|
|
print(f"\n{'='*60}", flush=True)
|
|
print(f"\n{'='*60}", flush=True)
|
|
@@ -705,21 +781,26 @@ class XiaohongshuPublisher(BasePublisher):
|
|
|
works.extend(parse_notes(notes))
|
|
works.extend(parse_notes(notes))
|
|
|
|
|
|
|
|
# 分页抓取剩余页面:不依赖 data.page(有些情况下会误报 -1),直到拿不到新数据为止
|
|
# 分页抓取剩余页面:不依赖 data.page(有些情况下会误报 -1),直到拿不到新数据为止
|
|
|
- max_pages = 30
|
|
|
|
|
|
|
+ max_pages = 100 # 增加最大页数限制,确保能抓取更多作品
|
|
|
page_num = 1 # 已经拿了 page=0
|
|
page_num = 1 # 已经拿了 page=0
|
|
|
seen_note_ids = set([w.work_id for w in works])
|
|
seen_note_ids = set([w.work_id for w in works])
|
|
|
has_more = True
|
|
has_more = True
|
|
|
|
|
|
|
|
while has_more and page_num < max_pages:
|
|
while has_more and page_num < max_pages:
|
|
|
|
|
+ print(f"[{self.platform_name}] 正在抓取第 {page_num} 页...", flush=True)
|
|
|
try:
|
|
try:
|
|
|
next_resp = await self.page.evaluate(
|
|
next_resp = await self.page.evaluate(
|
|
|
"""async (p) => {
|
|
"""async (p) => {
|
|
|
- const res = await fetch(`https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${p}`, {
|
|
|
|
|
- method: 'GET',
|
|
|
|
|
- credentials: 'include',
|
|
|
|
|
- headers: { 'Accept': 'application/json' }
|
|
|
|
|
- });
|
|
|
|
|
- return await res.json();
|
|
|
|
|
|
|
+ try {
|
|
|
|
|
+ const res = await fetch(`https://edith.xiaohongshu.com/web_api/sns/v5/creator/note/user/posted?tab=0&page=${p}`, {
|
|
|
|
|
+ method: 'GET',
|
|
|
|
|
+ credentials: 'include',
|
|
|
|
|
+ headers: { 'Accept': 'application/json' }
|
|
|
|
|
+ });
|
|
|
|
|
+ return await res.json();
|
|
|
|
|
+ } catch (e) {
|
|
|
|
|
+ return { success: false, error: e.toString() };
|
|
|
|
|
+ }
|
|
|
}""",
|
|
}""",
|
|
|
page_num
|
|
page_num
|
|
|
)
|
|
)
|
|
@@ -728,25 +809,31 @@ class XiaohongshuPublisher(BasePublisher):
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
if not next_resp:
|
|
if not next_resp:
|
|
|
|
|
+ print(f"[{self.platform_name}] 第 {page_num} 页无响应", flush=True)
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
if not (next_resp.get('success') or next_resp.get('code') == 0):
|
|
if not (next_resp.get('success') or next_resp.get('code') == 0):
|
|
|
|
|
+ print(f"[{self.platform_name}] 第 {page_num} 页请求失败: {next_resp.get('msg')}", flush=True)
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
next_data = next_resp.get('data', {})
|
|
next_data = next_resp.get('data', {})
|
|
|
next_notes = next_data.get('notes', []) or []
|
|
next_notes = next_data.get('notes', []) or []
|
|
|
|
|
|
|
|
if not next_notes:
|
|
if not next_notes:
|
|
|
|
|
+ print(f"[{self.platform_name}] 第 {page_num} 页无作品,停止抓取", flush=True)
|
|
|
has_more = False
|
|
has_more = False
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
parsed_next = parse_notes(next_notes)
|
|
parsed_next = parse_notes(next_notes)
|
|
|
new_items = [w for w in parsed_next if w.work_id and w.work_id not in seen_note_ids]
|
|
new_items = [w for w in parsed_next if w.work_id and w.work_id not in seen_note_ids]
|
|
|
|
|
+
|
|
|
if not new_items:
|
|
if not new_items:
|
|
|
# 没有新数据,停止
|
|
# 没有新数据,停止
|
|
|
|
|
+ print(f"[{self.platform_name}] 第 {page_num} 页无新数据,停止抓取", flush=True)
|
|
|
has_more = False
|
|
has_more = False
|
|
|
break
|
|
break
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
|
|
+ print(f"[{self.platform_name}] 第 {page_num} 页获取到 {len(new_items)} 条新数据", flush=True)
|
|
|
for w in new_items:
|
|
for w in new_items:
|
|
|
seen_note_ids.add(w.work_id)
|
|
seen_note_ids.add(w.work_id)
|
|
|
works.extend(new_items)
|
|
works.extend(new_items)
|
|
@@ -759,10 +846,14 @@ class XiaohongshuPublisher(BasePublisher):
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
page_num += 1
|
|
page_num += 1
|
|
|
|
|
+ # 增加一点延迟,避免请求过快
|
|
|
|
|
+ await asyncio.sleep(1)
|
|
|
|
|
|
|
|
# 分页完毕,has_more 表示是否还存在更多(以最后一页标记为准)
|
|
# 分页完毕,has_more 表示是否还存在更多(以最后一页标记为准)
|
|
|
if not has_more:
|
|
if not has_more:
|
|
|
print(f"[{self.platform_name}] 已抓取所有分页,共 {len(works)} 条", flush=True)
|
|
print(f"[{self.platform_name}] 已抓取所有分页,共 {len(works)} 条", flush=True)
|
|
|
|
|
+ else:
|
|
|
|
|
+ print(f"[{self.platform_name}] 达到最大页数限制 {max_pages},共 {len(works)} 条", flush=True)
|
|
|
else:
|
|
else:
|
|
|
print(f"[{self.platform_name}] 未能捕获到 API 数据", flush=True)
|
|
print(f"[{self.platform_name}] 未能捕获到 API 数据", flush=True)
|
|
|
|
|
|