mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-05 17:37:35 +08:00
feat: 百度贴吧done
This commit is contained in:
@@ -28,7 +28,7 @@ HEADLESS = False
|
||||
SAVE_LOGIN_STATE = True
|
||||
|
||||
# 数据保存类型选项配置,支持三种类型:csv、db、json
|
||||
SAVE_DATA_OPTION = "db" # csv or db or json
|
||||
SAVE_DATA_OPTION = "csv" # csv or db or json
|
||||
|
||||
# 用户浏览器缓存的浏览器文件配置
|
||||
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||
@@ -46,18 +46,18 @@ MAX_CONCURRENCY_NUM = 1
|
||||
ENABLE_GET_IMAGES = False
|
||||
|
||||
# 是否开启爬评论模式, 默认不开启爬评论
|
||||
ENABLE_GET_COMMENTS = True
|
||||
ENABLE_GET_COMMENTS = False
|
||||
|
||||
# 是否开启爬二级评论模式, 默认不开启爬二级评论
|
||||
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
|
||||
ENABLE_GET_SUB_COMMENTS = True
|
||||
ENABLE_GET_SUB_COMMENTS = False
|
||||
|
||||
# 指定小红书需要爬虫的笔记ID列表
|
||||
XHS_SPECIFIED_ID_LIST = [
|
||||
"6422c2750000000027000d88",
|
||||
"64ca1b73000000000b028dd2",
|
||||
"630d5b85000000001203ab41",
|
||||
"668fe13000000000030241fa", # 图文混合
|
||||
"668fe13000000000030241fa", # 图文混合
|
||||
# ........................
|
||||
]
|
||||
|
||||
@@ -93,6 +93,10 @@ TIEBA_SPECIFIED_ID_LIST = [
|
||||
|
||||
]
|
||||
|
||||
# 指定贴吧名称列表,爬取该贴吧下的帖子
|
||||
TIEBA_NAME_LIST = [
|
||||
# "盗墓笔记"
|
||||
]
|
||||
|
||||
# 指定小红书创作者ID列表
|
||||
XHS_CREATOR_ID_LIST = [
|
||||
@@ -118,19 +122,18 @@ KS_CREATOR_ID_LIST = [
|
||||
# ........................
|
||||
]
|
||||
|
||||
|
||||
#词云相关
|
||||
#是否开启生成评论词云图
|
||||
# 词云相关
|
||||
# 是否开启生成评论词云图
|
||||
ENABLE_GET_WORDCLOUD = False
|
||||
# 自定义词语及其分组
|
||||
#添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。
|
||||
# 添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。
|
||||
CUSTOM_WORDS = {
|
||||
'零几': '年份', # 将“零几”识别为一个整体
|
||||
'高频词': '专业术语' # 示例自定义词
|
||||
}
|
||||
|
||||
#停用(禁用)词文件路径
|
||||
# 停用(禁用)词文件路径
|
||||
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
|
||||
|
||||
#中文字体文件路径
|
||||
FONT_PATH= "./docs/STZHONGS.TTF"
|
||||
# 中文字体文件路径
|
||||
FONT_PATH = "./docs/STZHONGS.TTF"
|
||||
|
||||
Reference in New Issue
Block a user