All_platform_comments_restrict

This commit is contained in:
unknown
2024-10-23 16:32:02 +08:00
parent 19269c66fd
commit 7e53c4acfc
12 changed files with 44 additions and 19 deletions

View File

@@ -204,21 +204,23 @@ class BaiduTieBaClient(AbstractApiClient):
return self._page_extractor.extract_note_detail(page_content)
async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0,
callback: Optional[Callable] = None) -> List[TiebaComment]:
callback: Optional[Callable] = None,
max_count: int = 10,
) -> List[TiebaComment]:
"""
获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
Args:
note_detail: 帖子详情对象
crawl_interval: 爬取一次笔记的延迟单位(秒)
callback: 一次笔记爬取结束后
max_count: 一次帖子爬取的最大评论数量
Returns:
"""
uri = f"/p/{note_detail.note_id}"
result: List[TiebaComment] = []
current_page = 1
while note_detail.total_replay_page >= current_page:
while note_detail.total_replay_page >= current_page and len(result) < max_count:
params = {
"pn": current_page
}
@@ -227,6 +229,8 @@ class BaiduTieBaClient(AbstractApiClient):
note_id=note_detail.note_id)
if not comments:
break
if len(result) + len(comments) > max_count:
comments = comments[:max_count - len(result)]
if callback:
await callback(note_detail.note_id, comments)
result.extend(comments)

View File

@@ -226,7 +226,8 @@ class TieBaCrawler(AbstractCrawler):
await self.tieba_client.get_note_all_comments(
note_detail=note_detail,
crawl_interval=random.random(),
callback=tieba_store.batch_update_tieba_note_comments
callback=tieba_store.batch_update_tieba_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
)
async def get_creators_and_notes(self) -> None: