mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-09 03:17:25 +08:00
完成词云图生成函数并添加至存储逻辑中
This commit is contained in:
@@ -11,10 +11,11 @@ from typing import Dict
|
||||
|
||||
import aiofiles
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractStore
|
||||
from tools import utils
|
||||
from var import crawler_type_var
|
||||
|
||||
from tools import words
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
|
||||
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
||||
@@ -130,12 +131,14 @@ class BiliDbStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class BiliJsonStoreImplement(AbstractStore):
|
||||
json_store_path: str = "data/bilibili"
|
||||
json_store_path: str = "data/bilibili/json"
|
||||
words_store_path: str = "data/bilibili/words"
|
||||
lock = asyncio.Lock()
|
||||
file_count:int=calculate_number_of_files(json_store_path)
|
||||
WordCloud = words.AsyncWordCloudGenerator()
|
||||
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> str:
|
||||
def make_save_file_name(self, store_type: str) -> (str,str):
|
||||
"""
|
||||
make save file name by store type
|
||||
Args:
|
||||
@@ -145,7 +148,10 @@ class BiliJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
|
||||
return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
|
||||
return (
|
||||
f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
|
||||
f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
|
||||
)
|
||||
|
||||
async def save_data_to_json(self, save_item: Dict, store_type: str):
|
||||
"""
|
||||
@@ -158,7 +164,8 @@ class BiliJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name = self.make_save_file_name(store_type=store_type)
|
||||
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_data = []
|
||||
|
||||
async with self.lock:
|
||||
@@ -170,6 +177,12 @@ class BiliJsonStoreImplement(AbstractStore):
|
||||
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
|
||||
await file.write(json.dumps(save_data, ensure_ascii=False))
|
||||
|
||||
if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
|
||||
try:
|
||||
await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
|
||||
except:
|
||||
pass
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
content JSON storage implementation
|
||||
|
||||
@@ -12,8 +12,9 @@ from typing import Dict
|
||||
import aiofiles
|
||||
|
||||
from base.base_crawler import AbstractStore
|
||||
from tools import utils
|
||||
from tools import utils,words
|
||||
from var import crawler_type_var
|
||||
import config
|
||||
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
|
||||
@@ -162,11 +163,14 @@ class DouyinDbStoreImplement(AbstractStore):
|
||||
await update_creator_by_user_id(user_id, creator)
|
||||
|
||||
class DouyinJsonStoreImplement(AbstractStore):
|
||||
json_store_path: str = "data/douyin"
|
||||
json_store_path: str = "data/douyin/json"
|
||||
words_store_path: str = "data/douyin/words"
|
||||
|
||||
lock = asyncio.Lock()
|
||||
file_count: int = calculate_number_of_files(json_store_path)
|
||||
WordCloud = words.AsyncWordCloudGenerator()
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> str:
|
||||
def make_save_file_name(self, store_type: str) -> (str,str):
|
||||
"""
|
||||
make save file name by store type
|
||||
Args:
|
||||
@@ -176,8 +180,10 @@ class DouyinJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
|
||||
return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
|
||||
|
||||
return (
|
||||
f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
|
||||
f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
|
||||
)
|
||||
async def save_data_to_json(self, save_item: Dict, store_type: str):
|
||||
"""
|
||||
Below is a simple way to save it in json format.
|
||||
@@ -189,7 +195,8 @@ class DouyinJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name = self.make_save_file_name(store_type=store_type)
|
||||
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_data = []
|
||||
|
||||
async with self.lock:
|
||||
@@ -201,6 +208,12 @@ class DouyinJsonStoreImplement(AbstractStore):
|
||||
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
|
||||
await file.write(json.dumps(save_data, ensure_ascii=False))
|
||||
|
||||
if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
|
||||
try:
|
||||
await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
|
||||
except:
|
||||
pass
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
content JSON storage implementation
|
||||
|
||||
@@ -12,9 +12,9 @@ from typing import Dict
|
||||
import aiofiles
|
||||
|
||||
from base.base_crawler import AbstractStore
|
||||
from tools import utils
|
||||
from tools import utils,words
|
||||
from var import crawler_type_var
|
||||
|
||||
import config
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
|
||||
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
||||
@@ -131,12 +131,15 @@ class KuaishouDbStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class KuaishouJsonStoreImplement(AbstractStore):
|
||||
json_store_path: str = "data/kuaishou"
|
||||
json_store_path: str = "data/kuaishou/json"
|
||||
words_store_path: str = "data/kuaishou/words"
|
||||
lock = asyncio.Lock()
|
||||
file_count:int=calculate_number_of_files(json_store_path)
|
||||
WordCloud = words.AsyncWordCloudGenerator()
|
||||
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> str:
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> (str,str):
|
||||
"""
|
||||
make save file name by store type
|
||||
Args:
|
||||
@@ -146,8 +149,10 @@ class KuaishouJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
|
||||
|
||||
return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
|
||||
return (
|
||||
f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
|
||||
f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
|
||||
)
|
||||
|
||||
async def save_data_to_json(self, save_item: Dict, store_type: str):
|
||||
"""
|
||||
@@ -160,7 +165,8 @@ class KuaishouJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name = self.make_save_file_name(store_type=store_type)
|
||||
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_data = []
|
||||
|
||||
async with self.lock:
|
||||
@@ -172,6 +178,12 @@ class KuaishouJsonStoreImplement(AbstractStore):
|
||||
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
|
||||
await file.write(json.dumps(save_data, ensure_ascii=False))
|
||||
|
||||
if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
|
||||
try:
|
||||
await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
|
||||
except:
|
||||
pass
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
content JSON storage implementation
|
||||
|
||||
@@ -12,9 +12,9 @@ from typing import Dict
|
||||
import aiofiles
|
||||
|
||||
from base.base_crawler import AbstractStore
|
||||
from tools import utils
|
||||
from tools import utils,words
|
||||
from var import crawler_type_var
|
||||
|
||||
import config
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
|
||||
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
||||
@@ -132,12 +132,14 @@ class WeiboDbStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class WeiboJsonStoreImplement(AbstractStore):
|
||||
json_store_path: str = "data/weibo"
|
||||
json_store_path: str = "data/weibo/json"
|
||||
words_store_path: str = "data/weibo/words"
|
||||
lock = asyncio.Lock()
|
||||
file_count:int=calculate_number_of_files(json_store_path)
|
||||
WordCloud = words.AsyncWordCloudGenerator()
|
||||
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> str:
|
||||
def make_save_file_name(self, store_type: str) -> (str,str):
|
||||
"""
|
||||
make save file name by store type
|
||||
Args:
|
||||
@@ -147,7 +149,10 @@ class WeiboJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
|
||||
return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
|
||||
return (
|
||||
f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
|
||||
f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
|
||||
)
|
||||
|
||||
async def save_data_to_json(self, save_item: Dict, store_type: str):
|
||||
"""
|
||||
@@ -160,7 +165,8 @@ class WeiboJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name = self.make_save_file_name(store_type=store_type)
|
||||
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_data = []
|
||||
|
||||
async with self.lock:
|
||||
@@ -172,6 +178,12 @@ class WeiboJsonStoreImplement(AbstractStore):
|
||||
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
|
||||
await file.write(json.dumps(save_data, ensure_ascii=False))
|
||||
|
||||
if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
|
||||
try:
|
||||
await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
|
||||
except:
|
||||
pass
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
content JSON storage implementation
|
||||
|
||||
@@ -12,9 +12,9 @@ from typing import Dict
|
||||
import aiofiles
|
||||
|
||||
from base.base_crawler import AbstractStore
|
||||
from tools import utils
|
||||
from tools import utils,words
|
||||
from var import crawler_type_var
|
||||
|
||||
import config
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
|
||||
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
|
||||
@@ -161,11 +161,13 @@ class XhsDbStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class XhsJsonStoreImplement(AbstractStore):
|
||||
json_store_path: str = "data/xhs"
|
||||
json_store_path: str = "data/xhs/json"
|
||||
words_store_path: str = "data/xhs/words"
|
||||
lock = asyncio.Lock()
|
||||
file_count:int=calculate_number_of_files(json_store_path)
|
||||
WordCloud = words.AsyncWordCloudGenerator()
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> str:
|
||||
def make_save_file_name(self, store_type: str) -> (str,str):
|
||||
"""
|
||||
make save file name by store type
|
||||
Args:
|
||||
@@ -175,7 +177,10 @@ class XhsJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
|
||||
return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
|
||||
return (
|
||||
f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
|
||||
f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
|
||||
)
|
||||
|
||||
async def save_data_to_json(self, save_item: Dict, store_type: str):
|
||||
"""
|
||||
@@ -188,7 +193,8 @@ class XhsJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name = self.make_save_file_name(store_type=store_type)
|
||||
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_data = []
|
||||
|
||||
async with self.lock:
|
||||
@@ -200,6 +206,11 @@ class XhsJsonStoreImplement(AbstractStore):
|
||||
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
|
||||
await file.write(json.dumps(save_data, ensure_ascii=False))
|
||||
|
||||
if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
|
||||
try:
|
||||
await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
|
||||
except:
|
||||
pass
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
content JSON storage implementation
|
||||
|
||||
Reference in New Issue
Block a user