feat: webo full context support

2026-05-08 03:27:36 +08:00 · 2025-12-26 19:22:24 +08:00
parent ff1b681311
commit 55d8c7783f
2 changed files with 69 additions and 2 deletions
--- a/config/weibo_config.py
+++ b/config/weibo_config.py
@@ -31,6 +31,10 @@ WEIBO_SPECIFIED_ID_LIST = [

 # 指定微博用户ID列表
 WEIBO_CREATOR_ID_LIST = [
-    "5533390220",
+    "5756404150",
    # ........................
 ]
+
+# 是否开启微博爬取全文的功能，默认开启
+# 如果开启的话会增加被风控的概率，相当于一个关键词搜索请求会再遍历所有帖子的时候，再请求一次帖子详情
+ENABLE_WEIBO_FULL_TEXT = True
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -170,6 +170,8 @@ class WeiboCrawler(AbstractCrawler):
                search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
                note_id_list: List[str] = []
                note_list = filter_search_result_card(search_res.get("cards"))
+                # 如果开启了全文获取功能，则批量获取帖子全文
+                note_list = await self.batch_get_notes_full_text(note_list)
                for note_item in note_list:
                    if note_item:
                        mblog: Dict = note_item.get("mblog")
@@ -313,12 +315,18 @@ class WeiboCrawler(AbstractCrawler):
                    raise DataFetchError("Get creator info error")
                await weibo_store.save_creator(user_id, user_info=createor_info)

+                # 创建一个包装 callback，在保存数据前获取全文
+                async def save_notes_with_full_text(note_list: List[Dict]):
+                    # 如果开启了全文获取功能，先批量获取全文
+                    updated_note_list = await self.batch_get_notes_full_text(note_list)
+                    await weibo_store.batch_update_weibo_notes(updated_note_list)
+
                # Get all note information of the creator
                all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
                    creator_id=user_id,
                    container_id=f"107603{user_id}",
                    crawl_interval=0,
-                    callback=weibo_store.batch_update_weibo_notes,
+                    callback=save_notes_with_full_text,
                )

                note_ids = [note_item.get("mblog", {}).get("id") for note_item in all_notes_list if note_item.get("mblog", {}).get("id")]
@@ -406,6 +414,61 @@ class WeiboCrawler(AbstractCrawler):
            chromium = playwright.chromium
            return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)

+    async def get_note_full_text(self, note_item: Dict) -> Dict:
+        """
+        获取帖子全文内容
+        如果帖子内容被截断（isLongText=True），则请求详情接口获取完整内容
+        :param note_item: 帖子数据，包含 mblog 字段
+        :return: 更新后的帖子数据
+        """
+        if not config.ENABLE_WEIBO_FULL_TEXT:
+            return note_item
+
+        mblog = note_item.get("mblog", {})
+        if not mblog:
+            return note_item
+
+        # 检查是否是长文本
+        is_long_text = mblog.get("isLongText", False)
+        if not is_long_text:
+            return note_item
+
+        note_id = mblog.get("id")
+        if not note_id:
+            return note_item
+
+        try:
+            utils.logger.info(f"[WeiboCrawler.get_note_full_text] Fetching full text for note: {note_id}")
+            full_note = await self.wb_client.get_note_info_by_id(note_id)
+            if full_note and full_note.get("mblog"):
+                # 用完整内容替换原始内容
+                note_item["mblog"] = full_note["mblog"]
+                utils.logger.info(f"[WeiboCrawler.get_note_full_text] Successfully fetched full text for note: {note_id}")
+
+            # 请求后休眠，避免风控
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+        except DataFetchError as ex:
+            utils.logger.error(f"[WeiboCrawler.get_note_full_text] Failed to fetch full text for note {note_id}: {ex}")
+        except Exception as ex:
+            utils.logger.error(f"[WeiboCrawler.get_note_full_text] Unexpected error for note {note_id}: {ex}")
+
+        return note_item
+
+    async def batch_get_notes_full_text(self, note_list: List[Dict]) -> List[Dict]:
+        """
+        批量获取帖子全文内容
+        :param note_list: 帖子列表
+        :return: 更新后的帖子列表
+        """
+        if not config.ENABLE_WEIBO_FULL_TEXT:
+            return note_list
+
+        result = []
+        for note_item in note_list:
+            updated_note = await self.get_note_full_text(note_item)
+            result.append(updated_note)
+        return result
+
    async def close(self):
        """Close browser context"""
        # 如果使用CDP模式，需要特殊处理