mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-07 02:17:25 +08:00
feat: 百度贴吧支持创作者主页帖子爬取
This commit is contained in:
@@ -24,14 +24,14 @@ def calculate_number_of_files(file_store_path: str) -> int:
|
||||
if not os.path.exists(file_store_path):
|
||||
return 1
|
||||
try:
|
||||
return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
|
||||
return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
|
||||
except ValueError:
|
||||
return 1
|
||||
|
||||
|
||||
class TieBaCsvStoreImplement(AbstractStore):
|
||||
csv_store_path: str = "data/tieba"
|
||||
file_count:int=calculate_number_of_files(csv_store_path)
|
||||
file_count: int = calculate_number_of_files(csv_store_path)
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> str:
|
||||
"""
|
||||
@@ -65,7 +65,7 @@ class TieBaCsvStoreImplement(AbstractStore):
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
Xiaohongshu content CSV storage implementation
|
||||
tieba content CSV storage implementation
|
||||
Args:
|
||||
content_item: note item dict
|
||||
|
||||
@@ -76,7 +76,7 @@ class TieBaCsvStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
Xiaohongshu comment CSV storage implementation
|
||||
tieba comment CSV storage implementation
|
||||
Args:
|
||||
comment_item: comment item dict
|
||||
|
||||
@@ -87,7 +87,7 @@ class TieBaCsvStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
"""
|
||||
Xiaohongshu content CSV storage implementation
|
||||
tieba creator CSV storage implementation
|
||||
Args:
|
||||
creator: creator dict
|
||||
|
||||
@@ -100,7 +100,7 @@ class TieBaCsvStoreImplement(AbstractStore):
|
||||
class TieBaDbStoreImplement(AbstractStore):
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
Xiaohongshu content DB storage implementation
|
||||
tieba content DB storage implementation
|
||||
Args:
|
||||
content_item: content item dict
|
||||
|
||||
@@ -120,7 +120,7 @@ class TieBaDbStoreImplement(AbstractStore):
|
||||
|
||||
async def store_comment(self, comment_item: Dict):
|
||||
"""
|
||||
Xiaohongshu content DB storage implementation
|
||||
tieba comment DB storage implementation
|
||||
Args:
|
||||
comment_item: comment item dict
|
||||
|
||||
@@ -140,7 +140,7 @@ class TieBaDbStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
"""
|
||||
Xiaohongshu content DB storage implementation
|
||||
tieba creator DB storage implementation
|
||||
Args:
|
||||
creator: creator dict
|
||||
|
||||
@@ -163,10 +163,10 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
json_store_path: str = "data/tieba/json"
|
||||
words_store_path: str = "data/tieba/words"
|
||||
lock = asyncio.Lock()
|
||||
file_count:int=calculate_number_of_files(json_store_path)
|
||||
file_count: int = calculate_number_of_files(json_store_path)
|
||||
WordCloud = words.AsyncWordCloudGenerator()
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> (str,str):
|
||||
def make_save_file_name(self, store_type: str) -> (str, str):
|
||||
"""
|
||||
make save file name by store type
|
||||
Args:
|
||||
@@ -193,7 +193,7 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
"""
|
||||
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_data = []
|
||||
|
||||
async with self.lock:
|
||||
@@ -210,6 +210,7 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
|
||||
except:
|
||||
pass
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
content JSON storage implementation
|
||||
@@ -234,7 +235,7 @@ class TieBaJsonStoreImplement(AbstractStore):
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
"""
|
||||
Xiaohongshu content JSON storage implementation
|
||||
tieba creator JSON storage implementation
|
||||
Args:
|
||||
creator: creator dict
|
||||
|
||||
|
||||
Reference in New Issue
Block a user