feat: 完成快手视频评论爬取；支持将数据保存到 DB 和 CSV

2026-06-06 18:07:26 +08:00 · 2023-11-26 21:43:39 +08:00
parent 2f8541a351
commit dfb1788141
9 changed files with 197 additions and 52 deletions
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -1,42 +1,44 @@
-# Desc: base config
+# 基础配置
 PLATFORM = "xhs"
 KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
-COOKIES = ""  # login by cookie, if login_type is cookie, you must set this value
+COOKIES = ""
 CRAWLER_TYPE = "search"

-# enable ip proxy
+# 是否开启 IP 代理
 ENABLE_IP_PROXY = False

-# retry_interval
+# 重试间隔时间
 RETRY_INTERVAL = 60 * 30  # 30 minutes

 # playwright headless
 HEADLESS = True

-# save login state
+# 是否保存登录状态
 SAVE_LOGIN_STATE = True

-# save user data dir
+# 浏览器用户数据(缓存)的保存目录
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name

-# crawler max notes count
+# 爬取视频/帖子的数量控制
 CRAWLER_MAX_NOTES_COUNT = 20

-# max concurrency num
+# 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 10


-# xhs specified note id list
+# 指定小红书需要爬取的笔记ID列表
 XHS_SPECIFIED_ID_LIST = [
 "6422c2750000000027000d88",
 "64ca1b73000000000b028dd2",
 "630d5b85000000001203ab41",
+# ........................
 ]


-# douyin specified note id list
+# 指定抖音需要爬取的ID列表
 DY_SPECIFIED_ID_LIST = [
 "7280854932641664319",
 "7202432992642387233"
+# 可在此处继续添加ID（添加前请先在上一项末尾补上逗号，否则相邻字符串会被隐式拼接）
 ]