From 26a845581e9505ed77c207cb5290fa24690bd0ec Mon Sep 17 00:00:00 2001 From: chimeElm <105601471+chimeElm@users.noreply.github.com> Date: Sat, 7 Jun 2025 02:41:09 +0800 Subject: [PATCH] Update client.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复CRAWLER_MAX_NOTES_COUNT在爬取小红书作者帖子时失效的问题 --- media_platform/xhs/client.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index c1757f2..01a833e 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -498,7 +498,7 @@ class XiaoHongShuClient(AbstractApiClient): result = [] notes_has_more = True notes_cursor = "" - while notes_has_more: + while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT: notes_res = await self.get_notes_by_creator(user_id, notes_cursor) if not notes_res: utils.logger.error( @@ -518,10 +518,21 @@ class XiaoHongShuClient(AbstractApiClient): utils.logger.info( f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}" ) + + remaining = config.CRAWLER_MAX_NOTES_COUNT - len(result) + if remaining <= 0: + break + + notes_to_add = notes[:remaining] if callback: - await callback(notes) + await callback(notes_to_add) + + result.extend(notes_to_add) await asyncio.sleep(crawl_interval) - result.extend(notes) + + utils.logger.info( + f"[XiaoHongShuClient.get_all_notes_by_creator] Finished getting notes for user {user_id}, total: {len(result)}" + ) return result async def get_note_short_url(self, note_id: str) -> Dict: