feat: 百度贴吧支持创作者主页帖子爬取

This commit is contained in:
Relakkes Yang
2024-08-24 11:03:23 +08:00
parent 8adb593ba6
commit acb29add28
8 changed files with 210 additions and 167 deletions

View File

@@ -24,14 +24,14 @@ def calculate_number_of_files(file_store_path: str) -> int:
if not os.path.exists(file_store_path):
return 1
try:
return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
except ValueError:
return 1
class TieBaCsvStoreImplement(AbstractStore):
csv_store_path: str = "data/tieba"
file_count:int=calculate_number_of_files(csv_store_path)
file_count: int = calculate_number_of_files(csv_store_path)
def make_save_file_name(self, store_type: str) -> str:
"""
@@ -65,7 +65,7 @@ class TieBaCsvStoreImplement(AbstractStore):
async def store_content(self, content_item: Dict):
"""
Xiaohongshu content CSV storage implementation
tieba content CSV storage implementation
Args:
content_item: note item dict
@@ -76,7 +76,7 @@ class TieBaCsvStoreImplement(AbstractStore):
async def store_comment(self, comment_item: Dict):
"""
Xiaohongshu comment CSV storage implementation
tieba comment CSV storage implementation
Args:
comment_item: comment item dict
@@ -87,7 +87,7 @@ class TieBaCsvStoreImplement(AbstractStore):
async def store_creator(self, creator: Dict):
"""
Xiaohongshu content CSV storage implementation
tieba creator CSV storage implementation
Args:
creator: creator dict
@@ -100,7 +100,7 @@ class TieBaCsvStoreImplement(AbstractStore):
class TieBaDbStoreImplement(AbstractStore):
async def store_content(self, content_item: Dict):
"""
Xiaohongshu content DB storage implementation
tieba content DB storage implementation
Args:
content_item: content item dict
@@ -120,7 +120,7 @@ class TieBaDbStoreImplement(AbstractStore):
async def store_comment(self, comment_item: Dict):
"""
Xiaohongshu content DB storage implementation
tieba comment DB storage implementation
Args:
comment_item: comment item dict
@@ -140,7 +140,7 @@ class TieBaDbStoreImplement(AbstractStore):
async def store_creator(self, creator: Dict):
"""
Xiaohongshu content DB storage implementation
tieba creator DB storage implementation
Args:
creator: creator dict
@@ -163,10 +163,10 @@ class TieBaJsonStoreImplement(AbstractStore):
json_store_path: str = "data/tieba/json"
words_store_path: str = "data/tieba/words"
lock = asyncio.Lock()
file_count:int=calculate_number_of_files(json_store_path)
file_count: int = calculate_number_of_files(json_store_path)
WordCloud = words.AsyncWordCloudGenerator()
def make_save_file_name(self, store_type: str) -> (str,str):
def make_save_file_name(self, store_type: str) -> (str, str):
"""
make save file name by store type
Args:
@@ -193,7 +193,7 @@ class TieBaJsonStoreImplement(AbstractStore):
"""
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_data = []
async with self.lock:
@@ -210,6 +210,7 @@ class TieBaJsonStoreImplement(AbstractStore):
await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
except:
pass
async def store_content(self, content_item: Dict):
"""
content JSON storage implementation
@@ -234,7 +235,7 @@ class TieBaJsonStoreImplement(AbstractStore):
async def store_creator(self, creator: Dict):
"""
Xiaohongshu content JSON storage implementation
tieba creator JSON storage implementation
Args:
creator: creator dict