# MediaCrawler/tools/file_header_manager.py

# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/tools/file_header_manager.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。

"""
File header copyright declaration management tool

Features:
- Automatically add standardized copyright declaration and disclaimer to Python files
- Intelligently detect existing file headers (encoding declaration, author info, disclaimer, etc.)
- Insert copyright info at appropriate position without breaking existing content
- Support batch processing and single file check mode
"""

import os
import re
import sys
from typing import List, Tuple

# Project configuration
# The values below are interpolated into the header text built by
# generate_copyright_header().
REPO_URL = "https://github.com/NanmiCoder/MediaCrawler"  # base of the per-file "Repository:" URL
GITHUB_PROFILE = "https://github.com/NanmiCoder"  # printed on the "GitHub:" line
EMAIL = "relakkes@gmail.com"  # copyright holder shown on the "Copyright (c)" line
COPYRIGHT_YEAR = "2025"  # year shown on the "Copyright (c)" line
LICENSE_TYPE = "NON-COMMERCIAL LEARNING LICENSE 1.1"  # printed on the "Licensed under" line

# Disclaimer standard text
# process_file() appends this block right after the copyright header when
# has_disclaimer() reports that the file does not contain it yet;
# has_disclaimer() recognises the block by the opening sentence of its first
# line. The text is written into files verbatim, so it must stay a sequence of
# '#'-prefixed comment lines.
DISCLAIMER = """# 声明：本代码仅供学习和研究目的使用。使用者应遵守以下原则：
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率，避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。"""


def get_file_relative_path(file_path: str, project_root: str) -> str:
    """
    Express a file location relative to the project root

    Thin wrapper around ``os.path.relpath``; the result uses the path
    separator of the running platform.

    Args:
        file_path: Path of the file to describe
        project_root: Directory the result is computed against

    Returns:
        Path leading from ``project_root`` to ``file_path``
    """
    relative_path = os.path.relpath(file_path, start=project_root)
    return relative_path


def generate_copyright_header(relative_path: str) -> str:
    """
    Generate copyright declaration header

    The header is a block of ``#`` comment lines carrying the copyright
    notice, the URL of the file inside the repository, the GitHub profile and
    the license name. It ends with a bare ``#`` line and has no trailing
    newline.

    Args:
        relative_path: File path relative to project root. Platform-specific
            separators (backslashes on Windows) are accepted and converted to
            forward slashes for the URL.

    Returns:
        Formatted copyright declaration string
    """
    # os.path.relpath() produces the platform separator, but a URL always uses
    # '/'. On POSIX this replacement is a no-op.
    url_path = relative_path.replace(os.sep, "/")
    file_url = f"{REPO_URL}/blob/main/{url_path}"

    header = f"""# Copyright (c) {COPYRIGHT_YEAR} {EMAIL}
#
# This file is part of MediaCrawler project.
# Repository: {file_url}
# GitHub: {GITHUB_PROFILE}
# Licensed under {LICENSE_TYPE}
#"""

    return header


def has_copyright_header(content: str) -> bool:
    """
    Tell whether the project copyright declaration is already present

    The declaration counts as present only when both marker phrases occur
    somewhere in the text.

    Args:
        content: Full text of the file

    Returns:
        True when every marker phrase is found, False otherwise
    """
    required_markers = ("Copyright (c)", "MediaCrawler project")
    return all(marker in content for marker in required_markers)


def has_disclaimer(content: str) -> bool:
    """
    Tell whether the standard disclaimer is already present

    Detection relies on the opening sentence of the disclaimer appearing
    anywhere in the text.

    Args:
        content: Full text of the file

    Returns:
        True when the opening sentence is found, False otherwise
    """
    opening_sentence = "声明：本代码仅供学习和研究目的使用"
    found = opening_sentence in content
    return found


def find_insert_position(lines: List[str]) -> Tuple[int, bool]:
    """
    Locate the line index at which the copyright declaration belongs

    A leading shebang line is skipped first. The next two lines are then
    scanned for an encoding declaration such as ``# -*- coding: utf-8 -*-``
    or ``# coding: utf-8``; when one is found the insert position moves to
    the line right after it.

    Args:
        lines: File content split into lines

    Returns:
        (index to insert at, whether an encoding declaration was found)
    """
    # A shebang has to stay the very first line, so insertion starts after it.
    start = 1 if lines and lines[0].startswith('#!') else 0

    # Slicing clamps to the end of the list, so short files need no bounds check.
    for offset, raw_line in enumerate(lines[start:start + 2]):
        if re.match(r'#.*coding[:=]\s*([-\w.]+)', raw_line.strip()):
            return start + offset + 1, True

    return start, False


def process_file(file_path: str, project_root: str, dry_run: bool = False) -> Tuple[bool, str]:
    """
    Process single Python file

    Files that already carry the copyright declaration are left untouched.
    Otherwise the new content is laid out as:

        1. shebang and/or existing encoding declaration (kept as they are)
        2. ``# -*- coding: utf-8 -*-`` when no encoding declaration was found
        3. the generated copyright header
        4. the standard disclaimer, unless the file already contains it
        5. a blank separator line when the following line is not blank
        6. the rest of the original file

    Args:
        file_path: File path
        project_root: Project root directory
        dry_run: Check only without modification

    Returns:
        (whether modification needed, status message). Failures are not
        raised; they are reported as ``(False, "✗ Error processing ...")``.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Skip if already has copyright header
        if has_copyright_header(content):
            return False, f"✓ Already has copyright header: {file_path}"

        lines = content.splitlines(keepends=True)

        relative_path = get_file_relative_path(file_path, project_root)
        copyright_header = generate_copyright_header(relative_path)
        insert_pos, has_encoding = find_insert_position(lines)

        # Front part first (shebang and/or existing encoding declaration): a
        # shebang only works on the very first line, so nothing may precede it.
        new_lines = list(lines[:insert_pos])

        # If the file ends inside the front part without a final newline, the
        # inserted text would otherwise be glued onto that line.
        if new_lines and not new_lines[-1].endswith("\n"):
            new_lines[-1] += "\n"

        # Add encoding declaration if not present. Placed after the shebang it
        # is still within the first two lines of the file.
        if not has_encoding:
            new_lines.append("# -*- coding: utf-8 -*-\n")

        # Add copyright header
        new_lines.append(copyright_header + "\n")

        # Add disclaimer if file doesn't have one
        if not has_disclaimer(content):
            new_lines.append(DISCLAIMER + "\n")

        # Add empty line (if next line is not empty)
        if insert_pos < len(lines) and lines[insert_pos].strip():
            new_lines.append("\n")

        # Add remaining content
        new_lines.extend(lines[insert_pos:])

        if dry_run:
            return True, f"→ Would update: {file_path}"

        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(new_lines)
        return True, f"✓ Updated: {file_path}"

    except Exception as e:
        # Deliberate per-file boundary: one unreadable or undecodable file
        # must not abort a batch run, so the failure is returned as a message.
        return False, f"✗ Error processing {file_path}: {str(e)}"


def find_python_files(root_dir: str, exclude_patterns: List[str] = None) -> List[str]:
    """
    Collect every ``.py`` file below a directory

    Directories whose name is listed in ``exclude_patterns`` are not entered,
    and neither are directories whose name starts with a dot. Names are
    compared exactly; no wildcard matching is performed.

    Args:
        root_dir: Directory where the walk starts
        exclude_patterns: Directory names to skip; when omitted a built-in
            list of virtualenv, cache, VCS and build directories is used

    Returns:
        Sorted list of paths, each joined onto ``root_dir``
    """
    skipped_names = exclude_patterns
    if skipped_names is None:
        skipped_names = ['venv', '.venv', 'node_modules', '__pycache__', '.git', 'build', 'dist', '.eggs']

    collected = []

    for current_dir, subdirs, filenames in os.walk(root_dir):
        # Prune in place so that os.walk does not descend into skipped directories.
        subdirs[:] = [
            name for name in subdirs
            if not (name in skipped_names or name.startswith('.'))
        ]

        collected.extend(
            os.path.join(current_dir, name)
            for name in filenames
            if name.endswith('.py')
        )

    collected.sort()
    return collected


def main():
    """
    Command-line entry point

    Parses the arguments, processes either the given ``.py`` files or every
    Python file found under the project root, prints one status line per file
    followed by a summary, and terminates the process via ``sys.exit``.

    Exit codes:
        0: no errors (and, with ``--check``, no file is missing the header)
        1: at least one file failed to process, or ``--check`` was given and
           at least one file needs the header
    """
    import argparse

    parser = argparse.ArgumentParser(description='Python file header copyright declaration management tool')
    parser.add_argument('files', nargs='*', help='File paths to process (optional, defaults to all .py files)')
    parser.add_argument('--dry-run', action='store_true', help='Check only without modifying files')
    parser.add_argument('--project-root', default=None, help='Project root directory (defaults to current directory)')
    parser.add_argument('--check', action='store_true', help='Check mode, return non-zero exit code if files missing copyright declaration')

    args = parser.parse_args()

    # Determine project root directory
    if args.project_root:
        project_root = os.path.abspath(args.project_root)
    else:
        # Assume this script is in tools/ directory
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # --check never writes to disk, so it is handled as a dry run everywhere,
    # including the mode banner below.
    dry_run = args.dry_run or args.check

    print(f"Project root: {project_root}")
    print(f"Mode: {'DRY RUN' if dry_run else 'UPDATE'}")
    print("-" * 60)

    # Get list of files to process
    if args.files:
        # Process specified files; arguments that are not .py files are ignored
        files_to_process = [os.path.abspath(f) for f in args.files if f.endswith('.py')]
    else:
        # Process all Python files
        files_to_process = find_python_files(project_root)

    print(f"Found {len(files_to_process)} Python files to process\n")

    # Process files
    updated_count = 0
    skipped_count = 0
    error_count = 0

    for file_path in files_to_process:
        modified, message = process_file(file_path, project_root, dry_run)
        print(message)

        # Failure messages from process_file() start with the "✗" marker.
        # Searching the whole message for "Error" would misclassify files
        # whose path merely contains that word.
        if message.startswith("✗"):
            error_count += 1
        elif modified:
            updated_count += 1
        else:
            skipped_count += 1

    # Print summary
    print("\n" + "=" * 60)
    print("Summary:")
    print(f"  Total files: {len(files_to_process)}")
    print(f"  Updated/Need update: {updated_count}")
    print(f"  Already compliant: {skipped_count}")
    print(f"  Errors: {error_count}")
    print("=" * 60)

    # Non-zero exit code on errors, or in check mode if files need update
    needs_update = args.check and updated_count > 0
    sys.exit(1 if needs_update or error_count > 0 else 0)


if __name__ == '__main__':
    main()