feat(bilibili): Add flexible search modes and fix limit logic

Refactors the Bilibili keyword search functionality to provide more flexible crawling strategies and corrects a flaw in how crawl limits were applied.

Previously, the `ALL_DAY` boolean flag offered a rigid choice for time-based searching and contained a logical issue where `CRAWLER_MAX_NOTES_COUNT` was incorrectly applied on a per-day basis instead of as an overall total.

This commit introduces the `BILI_SEARCH_MODE` configuration option with three distinct modes:

- `normal`: The default search behavior without time constraints.
- `all_in_time_range`: Maximizes data collection within a specified date range, replicating the original intent of `ALL_DAY=True`.
- `daily_limit_in_time_range`: A new mode that strictly enforces both the daily `MAX_NOTES_PER_DAY` and the total `CRAWLER_MAX_NOTES_COUNT` limits across the entire date range.

This change resolves the limit logic bug and gives users more precise control over the crawling process.

Changes include:

- Modified `config/base_config.py` to replace `ALL_DAY` with `BILI_SEARCH_MODE`.
- Refactored `media_platform/bilibili/core.py` to implement the new search mode logic.
2026-06-09 03:17:25 +08:00 · 2025-07-13 06:07:13 +08:00
parent e103bfa1f3
commit d0d7293926
2 changed files with 128 additions and 95 deletions
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -87,7 +87,7 @@ START_PAGE = 1
 CRAWLER_MAX_NOTES_COUNT = 200
 # 每天爬取视频/帖子的数量控制
-MAX_NOTES_PER_DAY = 20
+MAX_NOTES_PER_DAY = 1
 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 1
@@ -99,7 +99,7 @@ ENABLE_GET_IMAGES = False
 ENABLE_GET_COMMENTS = True
 # 爬取一级评论的数量控制(单视频/帖子)
-CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10
+CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 1
 # 是否开启爬二级评论模式, 默认不开启爬二级评论
 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
@@ -216,16 +216,17 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
 # 中文字体文件路径
 FONT_PATH = "./docs/STZHONGS.TTF"
-# 爬取开始的天数，仅支持 bilibili 关键字搜索，YYYY-MM-DD 格式，若为 None 则表示不设置时间范围，按照默认关键字最多返回 1000 条视频的结果处理
+# 爬取开始的天数，仅支持 bilibili 关键字搜索，YYYY-MM-DD 格式
 START_DAY = "2024-01-01"
-# 爬取结束的天数，仅支持 bilibili 关键字搜索，YYYY-MM-DD 格式，若为 None 则表示不设置时间范围，按照默认关键字最多返回 1000 条视频的结果处理
+# 爬取结束的天数，仅支持 bilibili 关键字搜索，YYYY-MM-DD 格式
 END_DAY = "2024-01-01"
-# 是否开启按每一天进行爬取的选项，仅支持 bilibili 关键字搜索
+# Bilibili 搜索模式，仅在 CRAWLER_TYPE="search" 时生效
-# 若为 False，则忽略 START_DAY 与 END_DAY 设置的值
+# "normal": 不指定时间范围进行搜索，最多返回约1000条结果。
-# 若为 True，则按照 START_DAY 至 END_DAY 按照每一天进行筛选，这样能够突破 1000 条视频的限制，最大程度爬取该关键词下的所有视频
+# "all_in_time_range": 在 START_DAY 和 END_DAY 指定的时间范围内，尽可能多地爬取数据，每日上限受 MAX_NOTES_PER_DAY 影响，但总数可能超过 CRAWLER_MAX_NOTES_COUNT。
-ALL_DAY = False
+# "daily_limit_in_time_range": 在指定时间范围内，严格遵守 MAX_NOTES_PER_DAY 的每日上限和 CRAWLER_MAX_NOTES_COUNT 的总上限。
 BILI_SEARCH_MODE = "all_in_time_range"
 #!!! 下面仅支持 bilibili creator搜索
 # 爬取评论creator主页还是爬取creator动态和关系列表(True为前者)
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -96,7 +96,14 @@ class BilibiliCrawler(AbstractCrawler):
            crawler_type_var.set(config.CRAWLER_TYPE)
            if config.CRAWLER_TYPE == "search":
                # Search for video and retrieve their comment information.
-                await self.search()
+                if config.BILI_SEARCH_MODE == "normal":
                    await self.search_by_keywords()
                elif config.BILI_SEARCH_MODE == "all_in_time_range":
                    await self.search_by_keywords_in_time_range(daily_limit=False)
                elif config.BILI_SEARCH_MODE == "daily_limit_in_time_range":
                    await self.search_by_keywords_in_time_range(daily_limit=True)
                else:
                    utils.logger.warning(f"Unknown BILI_SEARCH_MODE: {config.BILI_SEARCH_MODE}")
            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
@@ -141,106 +148,131 @@ class BilibiliCrawler(AbstractCrawler):
        # 将其重新转换为时间戳
        return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
-    async def search(self):
+    async def search_by_keywords(self):
        """
-        search bilibili video with keywords
+        search bilibili video with keywords in normal mode
        :return:
        """
-        utils.logger.info("[BilibiliCrawler.search] Begin search bilibli keywords")
+        utils.logger.info("[BilibiliCrawler.search_by_keywords] Begin search bilibli keywords")
        bili_limit_count = 20  # bilibili limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
        start_page = config.START_PAGE  # start page number
        for keyword in config.KEYWORDS.split(","):
            source_keyword_var.set(keyword)
-            utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
+            utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Current search keyword: {keyword}")
-            # 每个关键词最多返回 1000 条数据
+            page = 1
-            if not config.ALL_DAY:
+            while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                page = 1
+                if page < start_page:
-                while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                    utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Skip page: {page}")
                    if page < start_page:
                        utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
                        page += 1
                        continue
                    utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
                    video_id_list: List[str] = []
                    videos_res = await self.bili_client.search_video_by_keyword(
                        keyword=keyword,
                        page=page,
                        page_size=bili_limit_count,
                        order=SearchOrderType.DEFAULT,
                        pubtime_begin_s=0,  # 作品发布日期起始时间戳
                        pubtime_end_s=0  # 作品发布日期结束日期时间戳
                    )
                    video_list: List[Dict] = videos_res.get("result")
                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                    task_list = []
                    try:
                        task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
                    except Exception as e:
                        utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
                    video_items = await asyncio.gather(*task_list)
                    for video_item in video_items:
                        if video_item:
                            video_id_list.append(video_item.get("View").get("aid"))
                            await bilibili_store.update_bilibili_video(video_item)
                            await bilibili_store.update_up_info(video_item)
                            await self.get_bilibili_video(video_item, semaphore)
                    page += 1
-                    await self.batch_get_video_comments(video_id_list)
+                    continue
            # 按照 START_DAY 至 END_DAY 按照每一天进行筛选，这样能够突破 1000 条视频的限制，最大程度爬取该关键词下每一天的所有视频
            else:
                for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
                    # 按照每一天进行爬取的时间戳参数
                    pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
                    page = 1
                    notes_count_this_day = 0
                    #!该段 while 语句在发生异常时（通常情况下为当天数据为空时）会自动跳转到下一天，以实现最大程度爬取该关键词下当天的所有视频
                    #!除了仅保留现在原有的 try, except Exception 语句外，不要再添加其他的异常处理！！！否则将使该段代码失效，使其仅能爬取当天一天数据而无法跳转到下一天
                    #!除非将该段代码的逻辑进行重构以实现相同的功能，否则不要进行修改！！！
                    while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                        if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
                            utils.logger.info(f"[BilibiliCrawler.search] Reached the maximum number of notes for today {day.ctime()}.")
                            break
                        #! Catch any error if response return nothing, go to next day
                        try:
                            #! Don't skip any page, to make sure gather all video in one day
                            # if page < start_page:
                            #     utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
                            #     page += 1
                            #     continue
-                            utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
+                utils.logger.info(f"[BilibiliCrawler.search_by_keywords] search bilibili keyword: {keyword}, page: {page}")
-                            video_id_list: List[str] = []
+                video_id_list: List[str] = []
-                            videos_res = await self.bili_client.search_video_by_keyword(
+                videos_res = await self.bili_client.search_video_by_keyword(
-                                keyword=keyword,
+                    keyword=keyword,
-                                page=page,
+                    page=page,
-                                page_size=bili_limit_count,
+                    page_size=bili_limit_count,
-                                order=SearchOrderType.DEFAULT,
+                    order=SearchOrderType.DEFAULT,
-                                pubtime_begin_s=pubtime_begin_s,  # 作品发布日期起始时间戳
+                    pubtime_begin_s=0,  # 作品发布日期起始时间戳
-                                pubtime_end_s=pubtime_end_s  # 作品发布日期结束日期时间戳
+                    pubtime_end_s=0  # 作品发布日期结束日期时间戳
-                            )
+                )
-                            video_list: List[Dict] = videos_res.get("result")
+                video_list: List[Dict] = videos_res.get("result")
-                            semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                if not video_list:
-                            task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+                    utils.logger.info(f"[BilibiliCrawler.search_by_keywords] No more videos for '{keyword}', moving to next keyword.")
-                            video_items = await asyncio.gather(*task_list)
+                    break
-                            for video_item in video_items:
+
-                                if video_item:
+                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                                    notes_count_this_day += 1
+                task_list = []
-                                    video_id_list.append(video_item.get("View").get("aid"))
+                try:
-                                    await bilibili_store.update_bilibili_video(video_item)
+                    task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
-                                    await bilibili_store.update_up_info(video_item)
+                except Exception as e:
-                                    await self.get_bilibili_video(video_item, semaphore)
+                    utils.logger.warning(f"[BilibiliCrawler.search_by_keywords] error in the task list. The video for this page will not be included. {e}")
-                            page += 1
+                video_items = await asyncio.gather(*task_list)
-                            await self.batch_get_video_comments(video_id_list)
+                for video_item in video_items:
-                        # go to next day
+                    if video_item:
-                        except Exception as e:
+                        video_id_list.append(video_item.get("View").get("aid"))
-                            print(e)
+                        await bilibili_store.update_bilibili_video(video_item)
                        await bilibili_store.update_up_info(video_item)
                        await self.get_bilibili_video(video_item, semaphore)
                page += 1
                await self.batch_get_video_comments(video_id_list)
    async def search_by_keywords_in_time_range(self, daily_limit: bool):
        """
        Search bilibili videos by keyword, one calendar day at a time.

        Every keyword in ``config.KEYWORDS`` (comma separated) is searched for
        each day between ``config.START_DAY`` and ``config.END_DAY``. Paging for
        a day stops when the search returns no results, when a limit is
        reached, or when any exception is raised while handling a page.

        :param daily_limit: if True, strictly enforce both
            ``config.MAX_NOTES_PER_DAY`` (per day) and
            ``config.CRAWLER_MAX_NOTES_COUNT`` (total per keyword), checking
            them before every stored video. If False, ``MAX_NOTES_PER_DAY`` is
            only checked between pages (so a page may overshoot it) and
            ``CRAWLER_MAX_NOTES_COUNT`` only bounds the number of pages
            requested per day.
        :return: None
        """
        log_tag = "[BilibiliCrawler.search_by_keywords_in_time_range]"
        utils.logger.info(f"{log_tag} Begin search with daily_limit={daily_limit}")
        bili_limit_count = 20  # page size passed to the search request
        start_page = config.START_PAGE
        for keyword in config.KEYWORDS.split(","):
            source_keyword_var.set(keyword)
            utils.logger.info(f"{log_tag} Current search keyword: {keyword}")
            # The total budget is tracked per keyword, across all days.
            total_notes_crawled = 0

            for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
                if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT:
                    utils.logger.info(f"{log_tag} Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}', skipping remaining days.")
                    break

                # Restrict the search window to this single day.
                day_str = day.strftime('%Y-%m-%d')
                pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day_str, end=day_str)
                # Paging always starts at 1 here; START_PAGE only feeds the
                # loose page budget used when daily_limit is False.
                page = 1
                notes_count_this_day = 0

                while True:
                    if notes_count_this_day >= config.MAX_NOTES_PER_DAY:
                        utils.logger.info(f"{log_tag} Reached MAX_NOTES_PER_DAY limit for {day.ctime()}.")
                        break
                    if daily_limit and total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT:
                        utils.logger.info(f"{log_tag} Reached CRAWLER_MAX_NOTES_COUNT limit for keyword '{keyword}'.")
                        break
                    if not daily_limit and (page - start_page + 1) * bili_limit_count > config.CRAWLER_MAX_NOTES_COUNT:
                        # For non-daily-limit mode, we still respect the total count in a loose way per day.
                        break

                    # Any failure while handling a page ends this day and moves
                    # on to the next one; keep this as the only handler so one
                    # bad day cannot abort the whole date range.
                    try:
                        utils.logger.info(f"{log_tag} search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
                        video_id_list: List[str] = []
                        videos_res = await self.bili_client.search_video_by_keyword(
                            keyword=keyword,
                            page=page,
                            page_size=bili_limit_count,
                            order=SearchOrderType.DEFAULT,
                            pubtime_begin_s=pubtime_begin_s,
                            pubtime_end_s=pubtime_end_s
                        )
                        video_list: List[Dict] = videos_res.get("result")
                        if not video_list:
                            utils.logger.info(f"{log_tag} No more videos for '{keyword}' on {day.ctime()}, moving to next day.")
                            break

                        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                        task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
                        video_items = await asyncio.gather(*task_list)
                        for video_item in video_items:
                            if not video_item:
                                continue
                            # Strict mode: re-check both budgets before every
                            # stored video, otherwise one page of results could
                            # overshoot MAX_NOTES_PER_DAY or the total cap.
                            if daily_limit and (
                                total_notes_crawled >= config.CRAWLER_MAX_NOTES_COUNT
                                or notes_count_this_day >= config.MAX_NOTES_PER_DAY
                            ):
                                break
                            notes_count_this_day += 1
                            total_notes_crawled += 1
                            video_id_list.append(video_item.get("View").get("aid"))
                            await bilibili_store.update_bilibili_video(video_item)
                            await bilibili_store.update_up_info(video_item)
                            await self.get_bilibili_video(video_item, semaphore)
                        page += 1
                        # Comments are fetched only for videos actually stored from this page.
                        await self.batch_get_video_comments(video_id_list)
                    except Exception as e:
                        utils.logger.error(f"{log_tag} Error searching on {day.ctime()}: {e}")
                        break
    async def batch_get_video_comments(self, video_id_list: List[str]):
        """
        batch get video comments