新增对微博博客内照片获取的支持，文件存放路径为 data/weibo/images

2026-06-06 01:47:26 +08:00 · 2024-04-09 17:21:52 +08:00
parent 5c409c6f0c
commit 16413c3074
6 changed files with 114 additions and 3 deletions
--- a/media_platform/weibo/client.py
+++ b/media_platform/weibo/client.py
@@ -35,6 +35,7 @@ class WeiboClient:
        self._host = "https://m.weibo.cn"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict
+        self._image_agent_host = "https://i1.wp.com/"

    async def request(self, method, url, **kwargs) -> Any:
        async with httpx.AsyncClient(proxies=self.proxies) as client:
@@ -181,3 +182,25 @@ class WeiboClient:
            else:
                utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
                return dict()
+
+    async def get_note_image(self, image_url: str) -> bytes:
+        """
+        Download one picture through the image proxy, always requesting the "large" variant.
+        :param image_url: absolute picture URL; its second "/"-separated segment is replaced by "large"
+        :return: raw response body, or None when the proxy does not answer with status 200
+        """
+        # Strip whatever scheme is present; the former fixed [8:] slice only fitted "https://".
+        parts = image_url.split("://", 1)[-1].split("/")
+        if len(parts) > 2:
+            parts[1] = "large"  # always fetch the high-resolution image
+        # The Weibo image host has hotlink protection, so the picture is requested
+        # through the proxy host by appending "<host>/<path>" to it.
+        proxied_path = "/".join(parts)
+        final_uri = f"{self._image_agent_host}{proxied_path}"
+        async with httpx.AsyncClient(proxies=self.proxies) as client:
+            response = await client.request("GET", final_uri, timeout=self.timeout)
+            # Check the numeric status: the reason phrase is free text chosen by the server.
+            if response.status_code != 200:
+                utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
+                return None
+            return response.content
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -121,8 +121,10 @@ class WeiboCrawler(AbstractCrawler):
                for note_item in note_list:
                    if note_item:
                        mblog: Dict = note_item.get("mblog")
-                        note_id_list.append(mblog.get("id"))
-                        await weibo_store.update_weibo_note(note_item)
+                        if mblog:
+                            note_id_list.append(mblog.get("id"))
+                            await weibo_store.update_weibo_note(note_item)
+                            await self.get_note_images(mblog)

                page += 1
                await self.batch_get_notes_comments(note_id_list)
@@ -200,6 +202,28 @@ class WeiboCrawler(AbstractCrawler):
            except Exception as e:
                utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")

+    async def get_note_images(self, mblog: Dict):
+        """
+        Download every picture listed under the note's "pics" key and pass it to the store.
+        :param mblog: note payload; each "pics" entry is read for its "url" and "pid"
+        :return:
+        """
+        if not config.ENABLE_GET_IMAGES:
+            utils.logger.info(f"[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
+            return
+
+        # "pics" is iterated as a sequence of dicts; a missing or empty value
+        # simply yields no iterations.
+        for pic in mblog.get("pics") or []:
+            url, pid = pic.get("url"), pic.get("pid")
+            if not url or not pid:
+                continue  # entry can be neither fetched nor named; skip it instead of raising
+            content = await self.wb_client.get_note_image(url)
+            if content is None:
+                continue  # nothing was downloaded for this picture
+            extension_file_name = url.split(".")[-1]
+            await weibo_store.update_weibo_note_image(pid, content, extension_file_name)
+
    async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
        """Create xhs client"""
        utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")