fix: 贴吧创作者bug修复

This commit is contained in:
Relakkes
2025-01-02 20:29:05 +08:00
parent 7ce1273386
commit fbbead814a
3 changed files with 55 additions and 6 deletions

View File

@@ -301,7 +301,7 @@ class BaiduTieBaClient(AbstractApiClient):
page_content = await self.get(uri, return_ori_content=True)
return self._page_extractor.extract_tieba_note_list(page_content)
async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator:
async def get_creator_info_by_url(self, creator_url: str) -> str:
"""
根据创作者ID获取创作者信息
Args:
@@ -311,7 +311,7 @@ class BaiduTieBaClient(AbstractApiClient):
"""
page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
return self._page_extractor.extract_creator_info(page_content)
return page_content
async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
"""
@@ -335,7 +335,10 @@ class BaiduTieBaClient(AbstractApiClient):
async def get_all_notes_by_creator_user_name(self,
user_name: str, crawl_interval: float = 1.0,
callback: Optional[Callable] = None,
max_note_count: int = 0) -> List[TiebaNote]:
max_note_count: int = 0,
creator_page_html_content: str = None,
) -> List[TiebaNote]:
"""
根据创作者用户名获取创作者所有帖子
Args:
@@ -343,11 +346,30 @@ class BaiduTieBaClient(AbstractApiClient):
crawl_interval: 爬取一次笔记的延迟单位(秒)
callback: 一次笔记爬取结束后的回调函数是一个awaitable类型的函数
max_note_count: 帖子最大获取数量如果为0则获取所有
creator_page_html_content: 创作者主页HTML内容
Returns:
"""
result = []
# 百度贴吧比较特殊一些前10个帖子是直接展示在主页上的要单独处理通过API获取不到
result: List[TiebaNote] = []
if creator_page_html_content:
thread_id_list = (
self._page_extractor.extract_tieba_thread_id_list_from_creator_page(
creator_page_html_content
)
)
utils.logger.info(
f"[BaiduTieBaClient.get_all_notes_by_creator] got user_name:{user_name} thread_id_list len : {len(thread_id_list)}"
)
note_detail_task = [
self.get_note_by_id(thread_id) for thread_id in thread_id_list
]
notes = await asyncio.gather(*note_detail_task)
if callback:
await callback(notes)
result.extend(notes)
notes_has_more = 1
page_number = 1
page_per_count = 20