diff --git a/.gitignore b/.gitignore index d44b7a1..c9a9ac8 100644 --- a/.gitignore +++ b/.gitignore @@ -169,4 +169,8 @@ cython_debug/ */.DS_Store .vscode /node_modules -docs/.vitepress/cache \ No newline at end of file +docs/.vitepress/cache + +# other gitignore +.venv +.refer \ No newline at end of file diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index ba24fd7..12643ee 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -17,25 +17,29 @@ from tools.utils import str2bool async def parse_cmd(): # 读取command arg - parser = argparse.ArgumentParser(description='Media crawler program.') - parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb | tieba | zhihu)', + parser = argparse.ArgumentParser(description='Media crawler program. / 媒体爬虫程序') + parser.add_argument('--platform', type=str, + help='Media platform select / 选择媒体平台 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)', choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM) - parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', + parser.add_argument('--lt', type=str, + help='Login type / 登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)', choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE) - parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)', + parser.add_argument('--type', type=str, + help='Crawler type / 爬取类型 (search=搜索 | detail=详情 | creator=创作者)', choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE) parser.add_argument('--start', type=int, - help='number of start page', default=config.START_PAGE) + help='Number of start page / 起始页码', default=config.START_PAGE) parser.add_argument('--keywords', type=str, - help='please input keywords', default=config.KEYWORDS) + help='Please input keywords / 请输入关键词', default=config.KEYWORDS) parser.add_argument('--get_comment', type=str2bool, - help='''whether to crawl level one comment, 
supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS) + help='''Whether to crawl level one comment / 是否爬取一级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS) parser.add_argument('--get_sub_comment', type=str2bool, - help=''''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS) + help='''Whether to crawl level two comment / 是否爬取二级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS) parser.add_argument('--save_data_option', type=str, - help='where to save the data (csv or db or json or sqlite)', choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION) + help='Where to save the data / 数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库)', + choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION) parser.add_argument('--cookies', type=str, - help='cookies used for cookie login type', default=config.COOKIES) + help='Cookies used for cookie login type / Cookie登录方式使用的Cookie值', default=config.COOKIES) args = parser.parse_args() diff --git a/config/base_config.py b/config/base_config.py index 7302238..c97a3c5 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -39,7 +39,7 @@ SAVE_LOGIN_STATE = True # 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力 # 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制 # 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险 -ENABLE_CDP_MODE = True +ENABLE_CDP_MODE = False # CDP调试端口,用于与浏览器通信 # 如果端口被占用,系统会自动尝试下一个可用端口 diff --git a/schema/sqlite_tables.db b/schema/sqlite_tables.db index e1a6d7f..5bc1f8d 100644 Binary files a/schema/sqlite_tables.db and b/schema/sqlite_tables.db differ diff --git
a/schema/sqlite_tables.sql b/schema/sqlite_tables.sql index dc5adb7..84696a0 100644 --- a/schema/sqlite_tables.sql +++ b/schema/sqlite_tables.sql @@ -149,6 +149,7 @@ CREATE TABLE douyin_aweme ( aweme_url TEXT DEFAULT NULL, cover_url TEXT DEFAULT NULL, video_download_url TEXT DEFAULT NULL, + music_download_url TEXT DEFAULT NULL, source_keyword TEXT DEFAULT '' ); diff --git a/schema/tables.sql b/schema/tables.sql index 6e0092b..676e8a6 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -149,6 +149,7 @@ CREATE TABLE `douyin_aweme` `aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL', `cover_url` varchar(500) DEFAULT NULL COMMENT '视频封面图URL', `video_download_url` varchar(1024) DEFAULT NULL COMMENT '视频下载地址', + `music_download_url` varchar(1024) DEFAULT NULL COMMENT '音乐下载地址', PRIMARY KEY (`id`), KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`), KEY `idx_douyin_awem_create__299dfe` (`create_time`) diff --git a/store/douyin/__init__.py b/store/douyin/__init__.py index e4958e8..1c3b6a3 100644 --- a/store/douyin/__init__.py +++ b/store/douyin/__init__.py @@ -105,6 +105,22 @@ def _extract_video_download_url(aweme_detail: Dict) -> str: return actual_url_list[-1] +def _extract_music_download_url(aweme_detail: Dict) -> str: + """ + 提取音乐下载地址 + + Args: + aweme_detail (Dict): 抖音视频 + + Returns: + str: 音乐下载地址 + """ + music_item = aweme_detail.get("music", {}) + play_url = music_item.get("play_url", {}) + music_url = play_url.get("uri", "") + return music_url + + async def update_douyin_aweme(aweme_item: Dict): aweme_id = aweme_item.get("aweme_id") user_info = aweme_item.get("author", {}) @@ -131,6 +147,7 @@ async def update_douyin_aweme(aweme_item: Dict): "aweme_url": f"https://www.douyin.com/video/{aweme_id}", "cover_url": _extract_content_cover_url(aweme_item), "video_download_url": _extract_video_download_url(aweme_item), + "music_download_url": _extract_music_download_url(aweme_item), "source_keyword": source_keyword_var.get(), } utils.logger.info(