From 66843f216a36ac47440ad10298b0cbdb84eb5e91 Mon Sep 17 00:00:00 2001 From: Bowenwin Date: Thu, 22 May 2025 22:26:30 +0800 Subject: [PATCH] finish_all_for_expand_bili --- config/base_config.py | 24 ++++++++-------- config/db_config.py | 15 ++++------ media_platform/bilibili/client.py | 7 ++--- media_platform/bilibili/core.py | 40 ++++++++++++--------------- store/bilibili/bilibili_store_impl.py | 15 +++++----- store/bilibili/bilibili_store_sql.py | 1 + store/douyin/douyin_store_impl.py | 2 +- store/kuaishou/kuaishou_store_impl.py | 2 +- store/tieba/tieba_store_impl.py | 2 +- store/weibo/weibo_store_impl.py | 2 +- store/xhs/xhs_store_impl.py | 2 +- store/zhihu/zhihu_store_impl.py | 2 +- 12 files changed, 51 insertions(+), 63 deletions(-) diff --git a/config/base_config.py b/config/base_config.py index d05347a..102e567 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -10,16 +10,16 @@ # 基础配置 -PLATFORM = "bili" +PLATFORM = "xhs" KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔 -LOGIN_TYPE = "phone" # qrcode or phone or cookie +LOGIN_TYPE = "qrcode" # qrcode or phone or cookie COOKIES = "" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持抖音 PUBLISH_TIME_TYPE = 0 CRAWLER_TYPE = ( - "creator" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据) + "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据) ) # 自定义User Agent(暂时仅对XHS有效) UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0' @@ -54,9 +54,6 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name # 爬取开始页数 默认从第一页开始 START_PAGE = 1 -# 爬取粉丝列表开始页数 默认从第一页开始 -START_CONTACTS_PAGE = 1 - # 爬取视频/帖子的数量控制 CRAWLER_MAX_NOTES_COUNT = 200 @@ -147,11 +144,7 @@ DY_CREATOR_ID_LIST = [ # 指定bili创作者ID列表(sec_id) BILI_CREATOR_ID_LIST = [ - # "20813884", - "520819684", - # "472747194", - # "519872016", - # "372201438", + "20813884", # ........................ ] @@ -202,8 +195,15 @@ END_DAY = '2024-01-01' # 若为 True,则按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下的所有视频 ALL_DAY = False +#!!! 下面仅支持 bilibili creator搜索 +# 爬取评论creator主页还是爬取creator动态和关系列表(True为前者) +CREATOR_MODE = True + +# 爬取creator粉丝列表时起始爬取页数 +START_CONTACTS_PAGE = 1 + # 爬取作者粉丝和关注列表数量控制(单作者) CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100 -# 爬取作者动态粉丝和关注列表数量控制(单作者) +# 爬取作者动态数量控制(单作者) CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50 \ No newline at end of file diff --git a/config/db_config.py b/config/db_config.py index 2b9c2d5..51d3fd0 100644 --- a/config/db_config.py +++ b/config/db_config.py @@ -12,16 +12,11 @@ import os # mysql config -# RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456") -# RELATION_DB_USER = os.getenv("RELATION_DB_USER", "root") -# RELATION_DB_HOST = os.getenv("RELATION_DB_HOST", "localhost") -# RELATION_DB_PORT = os.getenv("RELATION_DB_PORT", 3306) -# RELATION_DB_NAME = os.getenv("RELATION_DB_NAME", "media_crawler") -RELATION_DB_HOST = "47.94.233.47" # 替换为你的数据库域名/公网IP -RELATION_DB_PORT = 3306 # 替换为你的数据库端口(通常3306) -RELATION_DB_USER = "remote_user" # 替换为你的数据库用户名 -RELATION_DB_PWD = "314159" # 替换为你的数据库密码 -RELATION_DB_NAME = "Test" # 替换为你的数据库名称 +RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456") +RELATION_DB_USER = os.getenv("RELATION_DB_USER", "root") +RELATION_DB_HOST = os.getenv("RELATION_DB_HOST", "localhost") +RELATION_DB_PORT = os.getenv("RELATION_DB_PORT", 3306) +RELATION_DB_NAME = os.getenv("RELATION_DB_NAME", "media_crawler") # redis config diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index d03d105..32af357 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -224,7 +224,7 @@ class BilibiliClient(AbstractApiClient): async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False, callback: Optional[Callable] = None, - max_count: int = 10, ): + max_count: int = 10,): """ get video all comments include sub comments :param video_id: @@ -251,7 +251,7 @@ class BilibiliClient(AbstractApiClient): if (comment.get("rcount", 0) > 0): { await self.get_video_all_level_two_comments( - video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback) + video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback) } if len(result) + len(comment_list) > max_count: comment_list = comment_list[:max_count - len(result)] @@ -321,8 +321,7 @@ class BilibiliClient(AbstractApiClient): result = await self.get(uri, post_data) return result - async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, - order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict: + async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict: """get all videos for a creator :param creator_id: 创作者 ID :param pn: 页数 diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 8c66747..1836ba2 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -89,9 +89,11 @@ class BilibiliCrawler(AbstractCrawler): # Get the information and comments of the specified post await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST) elif config.CRAWLER_TYPE == "creator": - # for creator_id in config.BILI_CREATOR_ID_LIST: - # await self.get_creator_videos(int(creator_id)) - await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST) + if config.CREATOR_MODE: + for creator_id in config.BILI_CREATOR_ID_LIST: + await self.get_creator_videos(int(creator_id)) + else: + await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST) else: pass utils.logger.info( @@ -119,11 +121,9 @@ class BilibiliCrawler(AbstractCrawler): start_day: datetime = datetime.strptime(start, '%Y-%m-%d') end_day: datetime = datetime.strptime(end, '%Y-%m-%d') if start_day > end_day: - raise ValueError( - 'Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end') + raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end') elif start_day == end_day: # 搜索同一天的内容 - end_day = start_day + timedelta(days=1) - timedelta( - seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second + end_day = start_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second else: # 搜索 start 至 end end_day = end_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 end_day + 1 day - 1 second # 将其重新转换为时间戳 @@ -166,11 +166,9 @@ class BilibiliCrawler(AbstractCrawler): semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [] try: - task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) - for video_item in video_list] + task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list] except Exception as e: - utils.logger.warning( - f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}") + utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}") video_items = await asyncio.gather(*task_list) for video_item in video_items: if video_item: @@ -184,23 +182,21 @@ class BilibiliCrawler(AbstractCrawler): else: for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'): # 按照每一天进行爬取的时间戳参数 - pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), - end=day.strftime('%Y-%m-%d')) + pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d')) page = 1 - # !该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频 - # !除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天 - # !除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!! + #!该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频 + #!除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天 + #!除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!! while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: - # ! Catch any error if response return nothing, go to next day + #! Catch any error if response return nothing, go to next day try: - # ! Don't skip any page, to make sure gather all video in one day + #! Don't skip any page, to make sure gather all video in one day # if page < start_page: # utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}") # page += 1 # continue - utils.logger.info( - f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}") + utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}") video_id_list: List[str] = [] videos_res = await self.bili_client.search_video_by_keyword( keyword=keyword, @@ -213,9 +209,7 @@ class BilibiliCrawler(AbstractCrawler): video_list: List[Dict] = videos_res.get("result") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) - task_list = [ - self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for - video_item in video_list] + task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list] video_items = await asyncio.gather(*task_list) for video_item in video_items: if video_item: diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py index 00ccd79..0fa1504 100644 --- a/store/bilibili/bilibili_store_impl.py +++ b/store/bilibili/bilibili_store_impl.py @@ -38,15 +38,13 @@ def calculate_number_of_files(file_store_path: str) -> int: if not os.path.exists(file_store_path): return 1 try: - return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1 + return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1 except ValueError: return 1 - class BiliCsvStoreImplement(AbstractStore): csv_store_path: str = "data/bilibili" - file_count: int = calculate_number_of_files(csv_store_path) - + file_count:int=calculate_number_of_files(csv_store_path) def make_save_file_name(self, store_type: str) -> str: """ make save file name by store type @@ -196,7 +194,7 @@ class BiliDbStoreImplement(AbstractStore): creator["add_ts"] = utils.get_current_timestamp() await add_new_creator(creator) else: - await update_creator_by_creator_id(creator_id, creator_item=creator) + await update_creator_by_creator_id(creator_id,creator_item=creator) async def store_contact(self, contact_item: Dict): """ @@ -249,10 +247,11 @@ class BiliJsonStoreImplement(AbstractStore): json_store_path: str = "data/bilibili/json" words_store_path: str = "data/bilibili/words" lock = asyncio.Lock() - file_count: int = calculate_number_of_files(json_store_path) + file_count:int=calculate_number_of_files(json_store_path) WordCloud = words.AsyncWordCloudGenerator() - def make_save_file_name(self, store_type: str) -> (str, str): + + def make_save_file_name(self, store_type: str) -> (str,str): """ make save file name by store type Args: @@ -279,7 +278,7 @@ class BiliJsonStoreImplement(AbstractStore): """ pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) - save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type) + save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type) save_data = [] async with self.lock: diff --git a/store/bilibili/bilibili_store_sql.py b/store/bilibili/bilibili_store_sql.py index 6ee4048..02b146c 100644 --- a/store/bilibili/bilibili_store_sql.py +++ b/store/bilibili/bilibili_store_sql.py @@ -66,6 +66,7 @@ async def update_content_by_content_id(content_id: str, content_item: Dict) -> i return effect_row + async def query_comment_by_comment_id(comment_id: str) -> Dict: """ 查询一条评论内容 diff --git a/store/douyin/douyin_store_impl.py b/store/douyin/douyin_store_impl.py index c6308b1..30c993c 100644 --- a/store/douyin/douyin_store_impl.py +++ b/store/douyin/douyin_store_impl.py @@ -238,7 +238,7 @@ class DouyinJsonStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - comment JSON storage implementatio + comment JSON storage implementation Args: comment_item: diff --git a/store/kuaishou/kuaishou_store_impl.py b/store/kuaishou/kuaishou_store_impl.py index 0ee9f64..4866796 100644 --- a/store/kuaishou/kuaishou_store_impl.py +++ b/store/kuaishou/kuaishou_store_impl.py @@ -215,7 +215,7 @@ class KuaishouJsonStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - comment JSON storage implementatio + comment JSON storage implementation Args: comment_item: diff --git a/store/tieba/tieba_store_impl.py b/store/tieba/tieba_store_impl.py index ade65e5..ff5da80 100644 --- a/store/tieba/tieba_store_impl.py +++ b/store/tieba/tieba_store_impl.py @@ -235,7 +235,7 @@ class TieBaJsonStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - comment JSON storage implementatio + comment JSON storage implementation Args: comment_item: diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py index 0db1e51..7348fec 100644 --- a/store/weibo/weibo_store_impl.py +++ b/store/weibo/weibo_store_impl.py @@ -241,7 +241,7 @@ class WeiboJsonStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - comment JSON storage implementatio + comment JSON storage implementation Args: comment_item: diff --git a/store/xhs/xhs_store_impl.py b/store/xhs/xhs_store_impl.py index 54ce528..5ad4979 100644 --- a/store/xhs/xhs_store_impl.py +++ b/store/xhs/xhs_store_impl.py @@ -236,7 +236,7 @@ class XhsJsonStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - comment JSON storage implementatio + comment JSON storage implementation Args: comment_item: diff --git a/store/zhihu/zhihu_store_impl.py b/store/zhihu/zhihu_store_impl.py index a5c24a3..34d0a5b 100644 --- a/store/zhihu/zhihu_store_impl.py +++ b/store/zhihu/zhihu_store_impl.py @@ -235,7 +235,7 @@ class ZhihuJsonStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - comment JSON storage implementatio + comment JSON storage implementation Args: comment_item: