From 55d8c7783f5dc8a558771dde347175292147f3c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?= Date: Fri, 26 Dec 2025 19:22:24 +0800 Subject: [PATCH] feat: weibo full text support --- config/weibo_config.py | 6 +++- media_platform/weibo/core.py | 65 +++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/config/weibo_config.py b/config/weibo_config.py index 2a0fe6a..e7d8ed5 100644 --- a/config/weibo_config.py +++ b/config/weibo_config.py @@ -31,6 +31,10 @@ WEIBO_SPECIFIED_ID_LIST = [ # 指定微博用户ID列表 WEIBO_CREATOR_ID_LIST = [ - "5533390220", + "5756404150", # ........................ ] + +# 是否开启微博爬取全文的功能,默认开启 +# 如果开启的话会增加被风控的概率,相当于一个关键词搜索请求会再遍历所有帖子的时候,再请求一次帖子详情 +ENABLE_WEIBO_FULL_TEXT = True diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index 08c19ab..b989e23 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -170,6 +170,8 @@ class WeiboCrawler(AbstractCrawler): search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type) note_id_list: List[str] = [] note_list = filter_search_result_card(search_res.get("cards")) + # 如果开启了全文获取功能,则批量获取帖子全文 + note_list = await self.batch_get_notes_full_text(note_list) for note_item in note_list: if note_item: mblog: Dict = note_item.get("mblog") @@ -313,12 +315,18 @@ class WeiboCrawler(AbstractCrawler): raise DataFetchError("Get creator info error") await weibo_store.save_creator(user_id, user_info=createor_info) + # 创建一个包装 callback,在保存数据前获取全文 + async def save_notes_with_full_text(note_list: List[Dict]): + # 如果开启了全文获取功能,先批量获取全文 + updated_note_list = await self.batch_get_notes_full_text(note_list) + await weibo_store.batch_update_weibo_notes(updated_note_list) + # Get all note information of the creator all_notes_list = await self.wb_client.get_all_notes_by_creator_id( creator_id=user_id, container_id=f"107603{user_id}", 
crawl_interval=0, - callback=weibo_store.batch_update_weibo_notes, + callback=save_notes_with_full_text, ) note_ids = [note_item.get("mblog", {}).get("id") for note_item in all_notes_list if note_item.get("mblog", {}).get("id")] @@ -406,6 +414,61 @@ class WeiboCrawler(AbstractCrawler): chromium = playwright.chromium return await self.launch_browser(chromium, playwright_proxy, user_agent, headless) + async def get_note_full_text(self, note_item: Dict) -> Dict: + """ + Fetch the full text of a post. + If the post content is truncated (isLongText=True), request the detail endpoint to get the complete content. + :param note_item: post data containing an mblog field + :return: the updated post data + """ + if not config.ENABLE_WEIBO_FULL_TEXT: + return note_item + + mblog = note_item.get("mblog", {}) + if not mblog: + return note_item + + # Check whether this is a long-text post + is_long_text = mblog.get("isLongText", False) + if not is_long_text: + return note_item + + note_id = mblog.get("id") + if not note_id: + return note_item + + try: + utils.logger.info(f"[WeiboCrawler.get_note_full_text] Fetching full text for note: {note_id}") + full_note = await self.wb_client.get_note_info_by_id(note_id) + if full_note and full_note.get("mblog"): + # Replace the original content with the full content + note_item["mblog"] = full_note["mblog"] + utils.logger.info(f"[WeiboCrawler.get_note_full_text] Successfully fetched full text for note: {note_id}") + + # Sleep after the request to avoid triggering risk control + await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC) + except DataFetchError as ex: + utils.logger.error(f"[WeiboCrawler.get_note_full_text] Failed to fetch full text for note {note_id}: {ex}") + except Exception as ex: + utils.logger.error(f"[WeiboCrawler.get_note_full_text] Unexpected error for note {note_id}: {ex}") + + return note_item + + async def batch_get_notes_full_text(self, note_list: List[Dict]) -> List[Dict]: + """ + Fetch the full text for a batch of posts. + :param note_list: list of posts + :return: the updated list of posts + """ + if not config.ENABLE_WEIBO_FULL_TEXT: + return note_list + + result = [] + for note_item in note_list: + updated_note = await self.get_note_full_text(note_item) + result.append(updated_note) + return result + 
async def close(self): """Close browser context""" # 如果使用CDP模式,需要特殊处理