fix_words

This commit is contained in:
Bowenwin
2025-05-22 20:31:48 +08:00
parent a356358c21
commit 44e3d370ff
8 changed files with 338 additions and 59 deletions

View File

@@ -341,7 +341,8 @@ class BilibiliClient(AbstractApiClient):
         return await self.get(uri, post_data)
 
     async def get_creator_info(self, creator_id: int) -> Dict:
-        """get creator info
+        """
+        get creator info
         :param creator_id: creator ID
         """
         uri = "/x/space/wbi/acc/info"
@@ -355,7 +356,8 @@ class BilibiliClient(AbstractApiClient):
         pn: int,
         ps: int = 24,
     ) -> Dict:
-        """get video comments
+        """
+        get creator fans
         :param creator_id: creator ID
         :param pn: starting page number
         :param ps: items per page
@@ -376,7 +378,8 @@ class BilibiliClient(AbstractApiClient):
         pn: int,
         ps: int = 24,
     ) -> Dict:
-        """get video comments
+        """
+        get creator followings
         :param creator_id: creator ID
         :param pn: starting page number
         :param ps: items per page
@@ -391,11 +394,27 @@ class BilibiliClient(AbstractApiClient):
         }
         return await self.get(uri, post_data)
 
+    async def get_creator_dynamics(self, creator_id: int, offset: str = ""):
+        """
+        get creator dynamics
+        :param creator_id: creator ID
+        :param offset: pagination offset for the request
+        :return:
+        """
+        uri = "/x/polymer/web-dynamic/v1/feed/space"
+        post_data = {
+            "offset": offset,
+            "host_mid": creator_id,
+            "platform": "web",
+        }
+        return await self.get(uri, post_data)
+
     async def get_creator_all_fans(self, creator_info: Dict, crawl_interval: float = 1.0,
                                    callback: Optional[Callable] = None,
                                    max_count: int = 100) -> List:
         """
-        get video all comments include sub comments
+        get creator all fans
         :param creator_info:
         :param crawl_interval:
         :param callback:
@@ -419,16 +438,13 @@ class BilibiliClient(AbstractApiClient):
             if not fans_list:
                 break
             result.extend(fans_list)
-            utils.logger.info(
-                f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans successfully")
         return result
     async def get_creator_all_followings(self, creator_info: Dict, crawl_interval: float = 1.0,
                                          callback: Optional[Callable] = None,
                                          max_count: int = 100) -> List:
         """
-        get video all comments include sub comments
+        get creator all followings
         :param creator_info:
         :param crawl_interval:
         :param callback:
@@ -452,7 +468,33 @@ class BilibiliClient(AbstractApiClient):
             if not followings_list:
                 break
             result.extend(followings_list)
-            utils.logger.info(
-                f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings successfully")
         return result
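Both get_creator_all_fans and get_creator_all_followings above follow the same page-number pagination pattern: request page pn with page size ps, stop on an empty page or once max_count items are collected. A minimal self-contained sketch of that loop, assuming a generic fetch_page coroutine (the name and signature are illustrative stand-ins, not part of the commit):

import asyncio
from typing import Awaitable, Callable, Dict, List

async def paginate_by_page(fetch_page: Callable[..., Awaitable[List[Dict]]],
                           max_count: int = 100, ps: int = 24,
                           crawl_interval: float = 1.0) -> List[Dict]:
    # Illustrative sketch of the pn/ps loop used by get_creator_all_fans
    # and get_creator_all_followings.
    result: List[Dict] = []
    pn = 1
    while len(result) < max_count:
        page_items = await fetch_page(pn=pn, ps=ps)  # one page of results
        if not page_items:                           # empty page: no more data
            break
        result.extend(page_items[:max_count - len(result)])  # never exceed max_count
        pn += 1
        await asyncio.sleep(crawl_interval)          # pause between requests
    return result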
+    async def get_creator_all_dynamics(self, creator_info: Dict, crawl_interval: float = 1.0,
+                                       callback: Optional[Callable] = None,
+                                       max_count: int = 20) -> List:
+        """
+        get creator all dynamics
+        :param creator_info:
+        :param crawl_interval:
+        :param callback:
+        :param max_count: maximum number of dynamics to crawl for a single creator
+        :return: list of the creator's dynamics
+        """
+        creator_id = creator_info["id"]
+        result = []
+        offset = ""
+        has_more = True
+        while has_more and len(result) < max_count:
+            dynamics_res = await self.get_creator_dynamics(creator_id, offset)
+            dynamics_list: List[Dict] = dynamics_res["items"]
+            has_more = dynamics_res["has_more"]
+            offset = dynamics_res["offset"]
+            if len(result) + len(dynamics_list) > max_count:
+                dynamics_list = dynamics_list[:max_count - len(result)]
+            if callback:
+                await callback(creator_info, dynamics_list)
+            await asyncio.sleep(crawl_interval)
+            result.extend(dynamics_list)
+        return result
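The dynamics feed, by contrast, is cursor-based: each response carries items, has_more, and the offset to send with the next request, and each batch is clipped so the total never exceeds max_count before being handed to the callback. A hedged usage sketch for the new method; the client instance, creator ID, and callback below are illustrative stand-ins, not code from the commit:

import asyncio

async def demo_dynamics(bili_client) -> None:
    # bili_client is assumed to be an initialized BilibiliClient.
    creator_info = {"id": 123456}  # hypothetical creator ID

    async def on_batch(info, dynamics):
        # Called once per fetched page with the (possibly clipped) batch.
        print(f"creator {info['id']}: got {len(dynamics)} dynamics")

    dynamics = await bili_client.get_creator_all_dynamics(
        creator_info=creator_info,
        crawl_interval=0.5,
        callback=on_batch,
        max_count=40,
    )
    print(f"total dynamics collected: {len(dynamics)}")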

View File

@@ -119,14 +119,16 @@ class BilibiliCrawler(AbstractCrawler):
         start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
         end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
         if start_day > end_day:
-            raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
+            raise ValueError(
+                'Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
         elif start_day == end_day:  # searching within a single day
-            end_day = start_day + timedelta(days=1) - timedelta(seconds=1)  # set end_day to start_day + 1 day - 1 second
+            end_day = start_day + timedelta(days=1) - timedelta(
+                seconds=1)  # set end_day to start_day + 1 day - 1 second
         else:  # searching from start through end
             end_day = end_day + timedelta(days=1) - timedelta(seconds=1)  # set end_day to end_day + 1 day - 1 second
         # convert both back to timestamps
         return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
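The arithmetic above widens a date to cover the whole day: adding 1 day and subtracting 1 second lands on 23:59:59 of the same date. A standalone check of the same-day branch (plain datetime, no crawler state; note that timestamp() uses the local timezone):

from datetime import datetime, timedelta

start_day = datetime.strptime('2024-01-01', '%Y-%m-%d')          # 2024-01-01 00:00:00
end_day = start_day + timedelta(days=1) - timedelta(seconds=1)   # 2024-01-01 23:59:59
print(int(end_day.timestamp()) - int(start_day.timestamp()))     # 86399 == 24*3600 - 1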
async def search(self):
"""
search bilibili video with keywords
@@ -164,9 +166,11 @@ class BilibiliCrawler(AbstractCrawler):
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = []
try:
-            task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+            task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
+                         for video_item in video_list]
         except Exception as e:
-            utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
+            utils.logger.warning(
+                f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
@@ -180,21 +184,23 @@ class BilibiliCrawler(AbstractCrawler):
else:
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
                 # per-day timestamp parameters for the crawl
-                pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
+                pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'),
+                                                                                 end=day.strftime('%Y-%m-%d'))
                 page = 1
-                #! When an exception occurs (usually because the day's data is empty), this while block automatically jumps to the next day, so that as many of the keyword's videos for that day are crawled as possible
-                #! Keep only the existing try / except Exception; do NOT add any other exception handling!!! Otherwise this code breaks: it will only crawl a single day and never move on to the next
-                #! Do not modify this block unless you refactor its logic to achieve the same behavior!!!
+                # ! When an exception occurs (usually because the day's data is empty), this while block automatically jumps to the next day, so that as many of the keyword's videos for that day are crawled as possible
+                # ! Keep only the existing try / except Exception; do NOT add any other exception handling!!! Otherwise this code breaks: it will only crawl a single day and never move on to the next
+                # ! Do not modify this block unless you refactor its logic to achieve the same behavior!!!
                 while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                    #! Catch any error if response return nothing, go to next day
+                    # ! Catch any error if response return nothing, go to next day
                     try:
-                        #! Don't skip any page, to make sure gather all video in one day
+                        # ! Don't skip any page, to make sure gather all video in one day
                         # if page < start_page:
                         #     utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
                         #     page += 1
                         #     continue
-                        utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
+                        utils.logger.info(
+                            f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
@@ -207,7 +213,9 @@ class BilibiliCrawler(AbstractCrawler):
video_list: List[Dict] = videos_res.get("result")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                        task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+                        task_list = [
+                            self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for
+                            video_item in video_list]
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
@@ -467,7 +475,6 @@ class BilibiliCrawler(AbstractCrawler):
extension_file_name = f"video.mp4"
await bilibili_store.store_video(aid, content, extension_file_name)
async def get_all_creator_details(self, creator_id_list: List[int]):
"""
creator_id_list: get details for creator from creator_id_list
@@ -485,7 +492,8 @@ class BilibiliCrawler(AbstractCrawler):
creator_id, semaphore), name=creator_id)
task_list.append(task)
except Exception as e:
-            utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
+            utils.logger.warning(
+                f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")
await asyncio.gather(*task_list)
@@ -504,8 +512,9 @@ class BilibiliCrawler(AbstractCrawler):
"sign": creator_unhandled_info.get("sign"),
"avatar": creator_unhandled_info.get("face"),
}
-            await self.get_fans(creator_info, semaphore)
-            await self.get_followings(creator_info, semaphore)
+            # await self.get_fans(creator_info, semaphore)
+            # await self.get_followings(creator_info, semaphore)
+            await self.get_dynamics(creator_info, semaphore)
async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
"""
@@ -523,7 +532,7 @@ class BilibiliCrawler(AbstractCrawler):
creator_info=creator_info,
crawl_interval=random.random(),
callback=bilibili_store.batch_update_bilibili_creator_fans,
-                    max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
+                    max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)
except DataFetchError as ex:
@@ -549,7 +558,7 @@ class BilibiliCrawler(AbstractCrawler):
creator_info=creator_info,
crawl_interval=random.random(),
callback=bilibili_store.batch_update_bilibili_creator_followings,
-                    max_count=config.CRAWLER_MAX_FANS_COUNT_SINGLENOTES,
+                    max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
)
except DataFetchError as ex:
@@ -558,3 +567,29 @@ class BilibiliCrawler(AbstractCrawler):
except Exception as e:
utils.logger.error(
f"[BilibiliCrawler.get_followings] may be been blocked, err:{e}")
+    async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
+        """
+        get dynamics for creator id
+        :param creator_info:
+        :param semaphore:
+        :return:
+        """
+        creator_id = creator_info["id"]
+        async with semaphore:
+            try:
+                utils.logger.info(
+                    f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
+                await self.bili_client.get_creator_all_dynamics(
+                    creator_info=creator_info,
+                    crawl_interval=random.random(),
+                    callback=bilibili_store.batch_update_bilibili_creator_dynamics,
+                    max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
+                )
+            except DataFetchError as ex:
+                utils.logger.error(
+                    f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {ex}")
+            except Exception as e:
+                utils.logger.error(
+                    f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{e}")
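The limits referenced above, config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES and config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES, come from the project config module, one of the 8 changed files not shown in this view. A sketch of what those entries plausibly look like; the constant names are taken from the diff, but the file location, values, and comments are assumptions:

# config module (location and values assumed; not shown in this diff)

# max fans/followings (contacts) fetched per creator
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100

# max dynamics fetched per creator
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 20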