Merge branch 'main' into bugfix-null-key

This commit is contained in:
leonardoqiuyu
2023-11-18 14:59:03 +08:00
committed by GitHub
9 changed files with 102 additions and 21 deletions

View File

@@ -21,6 +21,7 @@ from .login import DouYinLogin
class DouYinCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page
dy_client: DOUYINClient
account_pool: AccountPool
@@ -30,10 +31,11 @@ class DouYinCrawler(AbstractCrawler):
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.index_url = "https://www.douyin.com"
def init_config(self, platform: str, login_type: str, account_pool: "AccountPool", crawler_type: str = "search") -> None:
    """Store the crawler's runtime configuration on the instance.

    Args:
        platform: platform identifier this crawler runs against (e.g. "dy").
        login_type: login method to use (e.g. "qrcode", "phone", "cookie").
        account_pool: pool of accounts/proxies the crawler rotates through.
        crawler_type: crawl mode, "search" or "detail". Defaults to
            "search" so callers written before this parameter was added
            keep working unchanged.
    """
    self.platform = platform
    self.login_type = login_type
    self.account_pool = account_pool
    self.crawler_type = crawler_type
async def start(self) -> None:
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -63,8 +65,12 @@ class DouYinCrawler(AbstractCrawler):
await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context)
# search_posts
await self.search()
if self.crawler_type == "search":
# Search for notes and retrieve their comment information.
await self.search()
elif self.crawler_type == "detail":
# Get the information and comments of the specified post
await self.get_specified_notes()
utils.logger.info("Douyin Crawler finished ...")
@@ -74,7 +80,7 @@ class DouYinCrawler(AbstractCrawler):
request_keyword_var.set(keyword)
utils.logger.info(f"Current keyword: {keyword}")
aweme_list: List[str] = []
dy_limit_count = 10 # douyin fixed limit page 10
dy_limit_count = 10
page = 0
while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
try:
@@ -95,6 +101,11 @@ class DouYinCrawler(AbstractCrawler):
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
async def get_specified_notes(self):
    """Get the information and comments of the specified posts.

    Placeholder: Douyin does not support this crawl mode yet.
    """
    # TODO: implement specified-post crawling for Douyin.
    return None
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
task_list: List[Task] = []
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)

View File

@@ -12,7 +12,7 @@ from base.base_crawler import AbstractCrawler
from base.proxy_account_pool import AccountPool
from models import xiaohongshu as xhs_model
from tools import utils
from var import request_keyword_var
from var import crawler_type_var
from .client import XHSClient
from .exception import DataFetchError
@@ -22,6 +22,7 @@ from .login import XHSLogin
class XiaoHongShuCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page
xhs_client: XHSClient
account_pool: AccountPool
@@ -31,10 +32,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.index_url = "https://www.xiaohongshu.com"
self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, account_pool: "AccountPool", crawler_type: str = "search") -> None:
    """Store the crawler's runtime configuration on the instance.

    Args:
        platform: platform identifier this crawler runs against (e.g. "xhs").
        login_type: login method to use (e.g. "qrcode", "phone", "cookie").
        account_pool: pool of accounts/proxies the crawler rotates through.
        crawler_type: crawl mode, "search" or "detail". Defaults to
            "search" so callers written before this parameter was added
            keep working unchanged.
    """
    self.platform = platform
    self.login_type = login_type
    self.account_pool = account_pool
    # Normal assignment spacing (original had "=crawler_type").
    self.crawler_type = crawler_type
async def start(self) -> None:
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -72,8 +74,16 @@ class XiaoHongShuCrawler(AbstractCrawler):
await login_obj.begin()
await self.xhs_client.update_cookies(browser_context=self.browser_context)
# Search for notes and retrieve their comment information.
await self.search()
if self.crawler_type == "search":
# Search for notes and retrieve their comment information.
crawler_type_var.set("search")
await self.search()
elif self.crawler_type == "detail":
# Get the information and comments of the specified post
crawler_type_var.set("detail")
await self.get_specified_notes()
else:
pass
utils.logger.info("Xhs Crawler finished ...")
@@ -82,8 +92,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
utils.logger.info("Begin search xiaohongshu keywords")
xhs_limit_count = 20 # xhs limit page fixed value
for keyword in config.KEYWORDS.split(","):
# set keyword to context var
request_keyword_var.set(keyword)
utils.logger.info(f"Current search keyword: {keyword}")
page = 1
while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@@ -107,6 +115,19 @@ class XiaoHongShuCrawler(AbstractCrawler):
utils.logger.info(f"Note details: {note_details}")
await self.batch_get_note_comments(note_id_list)
async def get_specified_notes(self):
    """Fetch detail and comment data for the posts in config.SPECIFIED_ID_LIST.

    Details are fetched concurrently (bounded by config.MAX_CONCURRENCY_NUM),
    persisted via the xhs model, then comments are pulled for every listed id.
    """
    sem = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    pending = [
        self.get_note_detail(note_id=nid, semaphore=sem)
        for nid in config.SPECIFIED_ID_LIST
    ]
    fetched = await asyncio.gather(*pending)
    for detail in fetched:
        if detail is None:
            # get_note_detail returns None on fetch errors; nothing to store.
            continue
        await xhs_model.update_xhs_note(detail)
    await self.batch_get_note_comments(config.SPECIFIED_ID_LIST)
async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
"""Get note detail"""
async with semaphore:
@@ -115,6 +136,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
except DataFetchError as ex:
utils.logger.error(f"Get note detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"have not fund note detail note_id:{note_id}, err: {ex}")
return None
async def batch_get_note_comments(self, note_list: List[str]):
"""Batch get note comments"""