mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-08 19:07:33 +08:00
feat: 百度贴吧支持创作者主页帖子爬取
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from typing import List
|
||||
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaCreator, TiebaNote
|
||||
from var import source_keyword_var
|
||||
|
||||
from . import tieba_store_impl
|
||||
@@ -23,6 +23,7 @@ class TieBaStoreFactory:
|
||||
"[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...")
|
||||
return store_class()
|
||||
|
||||
|
||||
async def batch_update_tieba_notes(note_list: List[TiebaNote]):
|
||||
"""
|
||||
Batch update tieba notes
|
||||
@@ -37,6 +38,7 @@ async def batch_update_tieba_notes(note_list: List[TiebaNote]):
|
||||
for note_item in note_list:
|
||||
await update_tieba_note(note_item)
|
||||
|
||||
|
||||
async def update_tieba_note(note_item: TiebaNote):
|
||||
"""
|
||||
Add or Update tieba note
|
||||
@@ -54,7 +56,7 @@ async def update_tieba_note(note_item: TiebaNote):
|
||||
await TieBaStoreFactory.create_store().store_content(save_note_item)
|
||||
|
||||
|
||||
async def batch_update_tieba_note_comments(note_id:str, comments: List[TiebaComment]):
|
||||
async def batch_update_tieba_note_comments(note_id: str, comments: List[TiebaComment]):
|
||||
"""
|
||||
Batch update tieba note comments
|
||||
Args:
|
||||
@@ -86,27 +88,16 @@ async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment):
|
||||
await TieBaStoreFactory.create_store().store_comment(save_comment_item)
|
||||
|
||||
|
||||
async def save_creator(user_id: str, user_info: Dict):
|
||||
async def save_creator(user_info: TiebaCreator):
|
||||
"""
|
||||
Save creator information to local
|
||||
Args:
|
||||
user_id:
|
||||
user_info:
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
local_db_item = {
|
||||
'user_id': user_id,
|
||||
'nickname': user_info.get('nickname'),
|
||||
'gender': '女' if user_info.get('gender') == "f" else '男',
|
||||
'avatar': user_info.get('avatar'),
|
||||
'ip_location': user_info.get("ip_location", ""),
|
||||
'follows': user_info.get('follow_count', ''),
|
||||
'fans': user_info.get('followers_count', ''),
|
||||
'follow_tieba_list': user_info.get("tieba_list", ''),
|
||||
'last_modify_ts': utils.get_current_timestamp(),
|
||||
'registration_duration': user_info.get("registration_duration", ""),
|
||||
}
|
||||
local_db_item = user_info.model_dump()
|
||||
local_db_item["last_modify_ts"] = utils.get_current_timestamp()
|
||||
utils.logger.info(f"[store.tieba.save_creator] creator:{local_db_item}")
|
||||
await TieBaStoreFactory.create_store().store_creator(local_db_item)
|
||||
await TieBaStoreFactory.create_store().store_creator(local_db_item)
|
||||
|
||||
@@ -24,14 +24,14 @@ def calculate_number_of_files(file_store_path: str) -> int:
|
||||
if not os.path.exists(file_store_path):
|
||||
return 1
|
||||
try:
|
||||
return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
|
||||
return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
|
||||
except ValueError:
|
||||
return 1
|
||||
|
||||
|
||||
class TieBaCsvStoreImplement(AbstractStore):
|
||||
csv_store_path: str = "data/tieba"
|
||||
file_count:int=calculate_number_of_files(csv_store_path)
|
||||
file_count: int = calculate_number_of_files(csv_store_path)
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> str:
|
||||
"""
|
||||
@@ -65,7 +65,7 @@ class TieBaCsvStoreImplement(AbstractStore):
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
Xiaohongshu content CSV storage implementation
|
||||
tieba content CSV storage implementation
|
||||
Args:
|
||||
content_item: note item dict
|
||||
|
||||
@@ -76,7 +76,7 @@ class TieBaCsvStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
Xiaohongshu comment CSV storage implementation
|
||||
tieba comment CSV storage implementation
|
||||
Args:
|
||||
comment_item: comment item dict
|
||||
|
||||
@@ -87,7 +87,7 @@ class TieBaCsvStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
"""
|
||||
Xiaohongshu content CSV storage implementation
|
||||
tieba content CSV storage implementation
|
||||
Args:
|
||||
creator: creator dict
|
||||
|
||||
@@ -100,7 +100,7 @@ class TieBaCsvStoreImplement(AbstractStore):
|
||||
class TieBaDbStoreImplement(AbstractStore):
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
Xiaohongshu content DB storage implementation
|
||||
tieba content DB storage implementation
|
||||
Args:
|
||||
content_item: content item dict
|
||||
|
||||
@@ -120,7 +120,7 @@ class TieBaDbStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
Xiaohongshu content DB storage implementation
|
||||
tieba content DB storage implementation
|
||||
Args:
|
||||
comment_item: comment item dict
|
||||
|
||||
@@ -140,7 +140,7 @@ class TieBaDbStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
"""
|
||||
Xiaohongshu content DB storage implementation
|
||||
tieba content DB storage implementation
|
||||
Args:
|
||||
creator: creator dict
|
||||
|
||||
@@ -163,10 +163,10 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
json_store_path: str = "data/tieba/json"
|
||||
words_store_path: str = "data/tieba/words"
|
||||
lock = asyncio.Lock()
|
||||
file_count:int=calculate_number_of_files(json_store_path)
|
||||
file_count: int = calculate_number_of_files(json_store_path)
|
||||
WordCloud = words.AsyncWordCloudGenerator()
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> (str,str):
|
||||
def make_save_file_name(self, store_type: str) -> (str, str):
|
||||
"""
|
||||
make save file name by store type
|
||||
Args:
|
||||
@@ -193,7 +193,7 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
"""
|
||||
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_data = []
|
||||
|
||||
async with self.lock:
|
||||
@@ -210,6 +210,7 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
|
||||
except:
|
||||
pass
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
content JSON storage implementation
|
||||
@@ -234,7 +235,7 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
"""
|
||||
Xiaohongshu content JSON storage implementation
|
||||
tieba content JSON storage implementation
|
||||
Args:
|
||||
creator: creator dict
|
||||
|
||||
|
||||
Reference in New Issue
Block a user