From 66843f216a36ac47440ad10298b0cbdb84eb5e91 Mon Sep 17 00:00:00 2001
From: Bowenwin <bowenxu505@gmail.com>
Date: Thu, 22 May 2025 22:26:30 +0800
Subject: [PATCH] bilibili: add CREATOR_MODE switch for creator crawling and restore default configs

---
 config/base_config.py                 | 24 ++++++++--------
 config/db_config.py                   | 15 ++++------
 media_platform/bilibili/client.py     |  7 ++---
 media_platform/bilibili/core.py       | 40 ++++++++++++---------------
 store/bilibili/bilibili_store_impl.py | 15 +++++-----
 store/bilibili/bilibili_store_sql.py  |  1 +
 store/douyin/douyin_store_impl.py     |  2 +-
 store/kuaishou/kuaishou_store_impl.py |  2 +-
 store/tieba/tieba_store_impl.py       |  2 +-
 store/weibo/weibo_store_impl.py       |  2 +-
 store/xhs/xhs_store_impl.py           |  2 +-
 store/zhihu/zhihu_store_impl.py       |  2 +-
 12 files changed, 51 insertions(+), 63 deletions(-)

diff --git a/config/base_config.py b/config/base_config.py
index d05347a..102e567 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -10,16 +10,16 @@
 
 
 # 基础配置
-PLATFORM = "bili"
+PLATFORM = "xhs"
 KEYWORDS = "编程副业,编程兼职"  # 关键词搜索配置，以英文逗号分隔
-LOGIN_TYPE = "phone"  # qrcode or phone or cookie
+LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
 # 具体值参见media_platform.xxx.field下的枚举值，暂时只支持小红书
 SORT_TYPE = "popularity_descending"
 # 具体值参见media_platform.xxx.field下的枚举值，暂时只支持抖音
 PUBLISH_TIME_TYPE = 0
 CRAWLER_TYPE = (
-    "creator"  # 爬取类型，search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
+    "search"  # 爬取类型，search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
 )
 # 自定义User Agent（暂时仅对XHS有效）
 UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
@@ -54,9 +54,6 @@ USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
 # 爬取开始页数 默认从第一页开始
 START_PAGE = 1
 
-# 爬取粉丝列表开始页数 默认从第一页开始
-START_CONTACTS_PAGE = 1
-
 # 爬取视频/帖子的数量控制
 CRAWLER_MAX_NOTES_COUNT = 200
 
@@ -147,11 +144,7 @@ DY_CREATOR_ID_LIST = [
 
 # 指定bili创作者ID列表(sec_id)
 BILI_CREATOR_ID_LIST = [
-    # "20813884",
-    "520819684",
-    # "472747194",
-    # "519872016",
-    # "372201438",
+    "20813884",
     # ........................
 ]
 
@@ -202,8 +195,15 @@ END_DAY = '2024-01-01'
 # 若为 True，则按照 START_DAY 至 END_DAY 按照每一天进行筛选，这样能够突破 1000 条视频的限制，最大程度爬取该关键词下的所有视频
 ALL_DAY = False
 
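+# NOTE: the options below are only supported for bilibili creator crawling (CRAWLER_TYPE = "creator")
+# True: crawl each creator's homepage videos and comments; False: crawl the creator's dynamics and relation (fans/followings) lists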
+CREATOR_MODE = True
+
+# 爬取creator粉丝列表时起始爬取页数
+START_CONTACTS_PAGE = 1
+
 # 爬取作者粉丝和关注列表数量控制(单作者)
 CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
 
-# 爬取作者动态粉丝和关注列表数量控制(单作者)
+# 爬取作者动态数量控制(单作者)
 CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50
\ No newline at end of file
diff --git a/config/db_config.py b/config/db_config.py
index 2b9c2d5..51d3fd0 100644
--- a/config/db_config.py
+++ b/config/db_config.py
@@ -12,16 +12,11 @@
 import os
 
 # mysql config
-# RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")
-# RELATION_DB_USER = os.getenv("RELATION_DB_USER", "root")
-# RELATION_DB_HOST = os.getenv("RELATION_DB_HOST", "localhost")
-# RELATION_DB_PORT = os.getenv("RELATION_DB_PORT", 3306)
-# RELATION_DB_NAME = os.getenv("RELATION_DB_NAME", "media_crawler")
-RELATION_DB_HOST = "47.94.233.47"    # 替换为你的数据库域名/公网IP
-RELATION_DB_PORT = 3306                 # 替换为你的数据库端口（通常3306）
-RELATION_DB_USER = "remote_user"      # 替换为你的数据库用户名
-RELATION_DB_PWD = "314159"       # 替换为你的数据库密码
-RELATION_DB_NAME = "Test"      # 替换为你的数据库名称
+RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")
+RELATION_DB_USER = os.getenv("RELATION_DB_USER", "root")
+RELATION_DB_HOST = os.getenv("RELATION_DB_HOST", "localhost")
+RELATION_DB_PORT = int(os.getenv("RELATION_DB_PORT", "3306"))  # env values are strings; cast so the port is always an int
+RELATION_DB_NAME = os.getenv("RELATION_DB_NAME", "media_crawler")
 
 
 # redis config
diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py
index d03d105..32af357 100644
--- a/media_platform/bilibili/client.py
+++ b/media_platform/bilibili/client.py
@@ -224,7 +224,7 @@ class BilibiliClient(AbstractApiClient):
 
     async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
                                      callback: Optional[Callable] = None,
-                                     max_count: int = 10, ):
+                                     max_count: int = 10,):
         """
         get video all comments include sub comments
         :param video_id:
@@ -251,7 +251,7 @@ class BilibiliClient(AbstractApiClient):
                     if (comment.get("rcount", 0) > 0):
                         {
                             await self.get_video_all_level_two_comments(
-                                video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
+                                video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval,  callback)
                         }
             if len(result) + len(comment_list) > max_count:
                 comment_list = comment_list[:max_count - len(result)]
@@ -321,8 +321,7 @@ class BilibiliClient(AbstractApiClient):
         result = await self.get(uri, post_data)
         return result
 
-    async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30,
-                                 order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
+    async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
         """get all videos for a creator
         :param creator_id: 创作者 ID
         :param pn: 页数
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index 8c66747..1836ba2 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -89,9 +89,11 @@ class BilibiliCrawler(AbstractCrawler):
                 # Get the information and comments of the specified post
                 await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
             elif config.CRAWLER_TYPE == "creator":
-                # for creator_id in config.BILI_CREATOR_ID_LIST:
-                #     await self.get_creator_videos(int(creator_id))
-                await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
+                if config.CREATOR_MODE:
+                    for creator_id in config.BILI_CREATOR_ID_LIST:
+                        await self.get_creator_videos(int(creator_id))
+                else:
+                    await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
             else:
                 pass
             utils.logger.info(
@@ -119,11 +121,9 @@ class BilibiliCrawler(AbstractCrawler):
         start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
         end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
         if start_day > end_day:
-            raise ValueError(
-                'Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
+            raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
         elif start_day == end_day:  # 搜索同一天的内容
-            end_day = start_day + timedelta(days=1) - timedelta(
-                seconds=1)  # 则将 end_day 设置为 start_day + 1 day - 1 second
+            end_day = start_day + timedelta(days=1) - timedelta(seconds=1)  # 则将 end_day 设置为 start_day + 1 day - 1 second
         else:  # 搜索 start 至 end
             end_day = end_day + timedelta(days=1) - timedelta(seconds=1)  # 则将 end_day 设置为 end_day + 1 day - 1 second
         # 将其重新转换为时间戳
@@ -166,11 +166,9 @@ class BilibiliCrawler(AbstractCrawler):
                     semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                     task_list = []
                     try:
-                        task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
-                                     for video_item in video_list]
+                        task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
                     except Exception as e:
-                        utils.logger.warning(
-                            f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
+                        utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
                     video_items = await asyncio.gather(*task_list)
                     for video_item in video_items:
                         if video_item:
@@ -184,23 +182,21 @@ class BilibiliCrawler(AbstractCrawler):
             else:
                 for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
                     # 按照每一天进行爬取的时间戳参数
-                    pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'),
-                                                                                     end=day.strftime('%Y-%m-%d'))
+                    pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
                     page = 1
-                    # !该段 while 语句在发生异常时（通常情况下为当天数据为空时）会自动跳转到下一天，以实现最大程度爬取该关键词下当天的所有视频
-                    # !除了仅保留现在原有的 try, except Exception 语句外，不要再添加其他的异常处理！！！否则将使该段代码失效，使其仅能爬取当天一天数据而无法跳转到下一天
-                    # !除非将该段代码的逻辑进行重构以实现相同的功能，否则不要进行修改！！！
+                    #!该段 while 语句在发生异常时（通常情况下为当天数据为空时）会自动跳转到下一天，以实现最大程度爬取该关键词下当天的所有视频
+                    #!除了仅保留现在原有的 try, except Exception 语句外，不要再添加其他的异常处理！！！否则将使该段代码失效，使其仅能爬取当天一天数据而无法跳转到下一天
+                    #!除非将该段代码的逻辑进行重构以实现相同的功能，否则不要进行修改！！！
                     while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                        # ! Catch any error if response return nothing, go to next day
+                        #! Catch any error if response return nothing, go to next day
                         try:
-                            # ! Don't skip any page, to make sure gather all video in one day
+                            #! Don't skip any page, to make sure gather all video in one day
                             # if page < start_page:
                             #     utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
                             #     page += 1
                             #     continue
 
-                            utils.logger.info(
-                                f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
+                            utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
                             video_id_list: List[str] = []
                             videos_res = await self.bili_client.search_video_by_keyword(
                                 keyword=keyword,
@@ -213,9 +209,7 @@ class BilibiliCrawler(AbstractCrawler):
                             video_list: List[Dict] = videos_res.get("result")
 
                             semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                            task_list = [
-                                self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for
-                                video_item in video_list]
+                            task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
                             video_items = await asyncio.gather(*task_list)
                             for video_item in video_items:
                                 if video_item:
diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py
index 00ccd79..0fa1504 100644
--- a/store/bilibili/bilibili_store_impl.py
+++ b/store/bilibili/bilibili_store_impl.py
@@ -38,15 +38,13 @@ def calculate_number_of_files(file_store_path: str) -> int:
     if not os.path.exists(file_store_path):
         return 1
     try:
-        return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
+        return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
     except ValueError:
         return 1
 
-
 class BiliCsvStoreImplement(AbstractStore):
     csv_store_path: str = "data/bilibili"
-    file_count: int = calculate_number_of_files(csv_store_path)
-
+    file_count:int=calculate_number_of_files(csv_store_path)
     def make_save_file_name(self, store_type: str) -> str:
         """
         make save file name by store type
@@ -196,7 +194,7 @@ class BiliDbStoreImplement(AbstractStore):
             creator["add_ts"] = utils.get_current_timestamp()
             await add_new_creator(creator)
         else:
-            await update_creator_by_creator_id(creator_id, creator_item=creator)
+            await update_creator_by_creator_id(creator_id,creator_item=creator)
 
     async def store_contact(self, contact_item: Dict):
         """
@@ -249,10 +247,11 @@ class BiliJsonStoreImplement(AbstractStore):
     json_store_path: str = "data/bilibili/json"
     words_store_path: str = "data/bilibili/words"
     lock = asyncio.Lock()
-    file_count: int = calculate_number_of_files(json_store_path)
+    file_count:int=calculate_number_of_files(json_store_path)
     WordCloud = words.AsyncWordCloudGenerator()
 
-    def make_save_file_name(self, store_type: str) -> (str, str):
+
+    def make_save_file_name(self, store_type: str) -> (str,str):
         """
         make save file name by store type
         Args:
@@ -279,7 +278,7 @@ class BiliJsonStoreImplement(AbstractStore):
         """
         pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
         pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
-        save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
+        save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
         save_data = []
 
         async with self.lock:
diff --git a/store/bilibili/bilibili_store_sql.py b/store/bilibili/bilibili_store_sql.py
index 6ee4048..02b146c 100644
--- a/store/bilibili/bilibili_store_sql.py
+++ b/store/bilibili/bilibili_store_sql.py
@@ -66,6 +66,7 @@ async def update_content_by_content_id(content_id: str, content_item: Dict) -> i
     return effect_row
 
 
+
 async def query_comment_by_comment_id(comment_id: str) -> Dict:
     """
     查询一条评论内容
diff --git a/store/douyin/douyin_store_impl.py b/store/douyin/douyin_store_impl.py
index c6308b1..30c993c 100644
--- a/store/douyin/douyin_store_impl.py
+++ b/store/douyin/douyin_store_impl.py
@@ -238,7 +238,7 @@ class DouyinJsonStoreImplement(AbstractStore):
 
     async def store_comment(self, comment_item: Dict):
         """
-        comment JSON storage implementatio
+        comment JSON storage implementation
         Args:
             comment_item:
 
diff --git a/store/kuaishou/kuaishou_store_impl.py b/store/kuaishou/kuaishou_store_impl.py
index 0ee9f64..4866796 100644
--- a/store/kuaishou/kuaishou_store_impl.py
+++ b/store/kuaishou/kuaishou_store_impl.py
@@ -215,7 +215,7 @@ class KuaishouJsonStoreImplement(AbstractStore):
 
     async def store_comment(self, comment_item: Dict):
         """
-        comment JSON storage implementatio
+        comment JSON storage implementation
         Args:
             comment_item:
 
diff --git a/store/tieba/tieba_store_impl.py b/store/tieba/tieba_store_impl.py
index ade65e5..ff5da80 100644
--- a/store/tieba/tieba_store_impl.py
+++ b/store/tieba/tieba_store_impl.py
@@ -235,7 +235,7 @@ class TieBaJsonStoreImplement(AbstractStore):
 
     async def store_comment(self, comment_item: Dict):
         """
-        comment JSON storage implementatio
+        comment JSON storage implementation
         Args:
             comment_item:
 
diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py
index 0db1e51..7348fec 100644
--- a/store/weibo/weibo_store_impl.py
+++ b/store/weibo/weibo_store_impl.py
@@ -241,7 +241,7 @@ class WeiboJsonStoreImplement(AbstractStore):
 
     async def store_comment(self, comment_item: Dict):
         """
-        comment JSON storage implementatio
+        comment JSON storage implementation
         Args:
             comment_item:
 
diff --git a/store/xhs/xhs_store_impl.py b/store/xhs/xhs_store_impl.py
index 54ce528..5ad4979 100644
--- a/store/xhs/xhs_store_impl.py
+++ b/store/xhs/xhs_store_impl.py
@@ -236,7 +236,7 @@ class XhsJsonStoreImplement(AbstractStore):
 
     async def store_comment(self, comment_item: Dict):
         """
-        comment JSON storage implementatio
+        comment JSON storage implementation
         Args:
             comment_item:
 
diff --git a/store/zhihu/zhihu_store_impl.py b/store/zhihu/zhihu_store_impl.py
index a5c24a3..34d0a5b 100644
--- a/store/zhihu/zhihu_store_impl.py
+++ b/store/zhihu/zhihu_store_impl.py
@@ -235,7 +235,7 @@ class ZhihuJsonStoreImplement(AbstractStore):
 
     async def store_comment(self, comment_item: Dict):
         """
-        comment JSON storage implementatio
+        comment JSON storage implementation
         Args:
             comment_item: