Improve the base config logic for reading command-line arguments

2026-05-25 20:17:30 +08:00 · 2024-06-09 09:35:52 +08:00
parent 3c7c678d7a
commit eace7d1750
15 changed files with 91 additions and 139 deletions
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -21,9 +21,6 @@ from .login import XiaoHongShuLogin


 class XiaoHongShuCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
    context_page: Page
    xhs_client: XiaoHongShuClient
    browser_context: BrowserContext
@@ -32,13 +29,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
        self.index_url = "https://www.xiaohongshu.com"
        self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
    async def start(self) -> None:
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
@@ -71,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
            self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
            if not await self.xhs_client.pong():
                login_obj = XiaoHongShuLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                    login_phone="",  # input your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
@@ -80,14 +70,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
                await login_obj.begin()
                await self.xhs_client.update_cookies(browser_context=self.browser_context)

-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_notes()
-            elif self.crawler_type == "creator":
+            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their notes and comments
                await self.get_creators_and_notes()
            else:
@@ -101,8 +91,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
        xhs_limit_count = 20  # xhs limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
-        start_page = self.start_page
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
            page = 1
            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@@ -264,7 +254,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
            # feat issue #14
            # we will save login state to avoid login every time
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,