mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-06-08 19:07:33 +08:00
feat: 小红书增加指定帖子爬取功能
fix: 修复程序一些异常 bug
refactor: 优化部分代码逻辑
This commit is contained in:
@@ -21,6 +21,7 @@ from .login import DouYinLogin
|
||||
class DouYinCrawler(AbstractCrawler):
|
||||
platform: str
|
||||
login_type: str
|
||||
crawler_type: str
|
||||
context_page: Page
|
||||
dy_client: DOUYINClient
|
||||
account_pool: AccountPool
|
||||
@@ -30,10 +31,11 @@ class DouYinCrawler(AbstractCrawler):
|
||||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
|
||||
self.index_url = "https://www.douyin.com"
|
||||
|
||||
def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
|
||||
def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str) -> None:
|
||||
self.platform = platform
|
||||
self.login_type = login_type
|
||||
self.account_pool = account_pool
|
||||
self.crawler_type = crawler_type
|
||||
|
||||
async def start(self) -> None:
|
||||
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
|
||||
@@ -63,8 +65,12 @@ class DouYinCrawler(AbstractCrawler):
|
||||
await login_obj.begin()
|
||||
await self.dy_client.update_cookies(browser_context=self.browser_context)
|
||||
|
||||
# search_posts
|
||||
await self.search()
|
||||
if self.crawler_type == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
await self.search()
|
||||
elif self.crawler_type == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_notes()
|
||||
|
||||
utils.logger.info("Douyin Crawler finished ...")
|
||||
|
||||
@@ -74,7 +80,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||
request_keyword_var.set(keyword)
|
||||
utils.logger.info(f"Current keyword: {keyword}")
|
||||
aweme_list: List[str] = []
|
||||
dy_limit_count = 10 # douyin fixed limit page 10
|
||||
dy_limit_count = 10
|
||||
page = 0
|
||||
while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
try:
|
||||
@@ -95,6 +101,11 @@ class DouYinCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
|
||||
await self.batch_get_note_comments(aweme_list)
|
||||
|
||||
async def get_specified_notes(self):
|
||||
"""Get the information and comments of the specified post"""
|
||||
# todo douyin support
|
||||
pass
|
||||
|
||||
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
|
||||
task_list: List[Task] = []
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
|
||||
Reference in New Issue
Block a user