#!/usr/bin/env python3
"""
Lists the longest and shortest code files in the project and reports function names that are defined in more than one TypeScript file. Useful for identifying potential refactoring targets and enforcing code size guidelines.
Thresholds can be set to warn about files longer or shorter than a given number of lines.
"""

import os
import re
import argparse
from pathlib import Path
from typing import List, Tuple, Dict, Set
from collections import defaultdict

# Suffixes (compared lower-cased) that mark a file as source code.
CODE_EXTENSIONS = {
    # TypeScript / JavaScript
    '.ts',
    '.tsx',
    '.js',
    '.jsx',
    '.mjs',
    '.cjs',
    # Swift (macOS / iOS)
    '.swift',
    # Kotlin / Java (Android)
    '.kt',
    '.java',
    # Scripts
    '.py',
    '.sh',
}

# Directory names that are pruned from the walk and never scanned.
SKIP_DIRS = {
    # Version control and editor state
    '.git',
    '.worktrees',
    '.idea',
    # Third-party dependencies
    'node_modules',
    'vendor',
    'Pods',
    # Build output and caches
    'dist',
    'build',
    'out',
    'coverage',
    '__pycache__',
    '.turbo',
    '.gradle',
    'DerivedData',
}

# Exact filenames exempt from short-file warnings (barrel exports, stubs).
SKIP_SHORT_PATTERNS = {
    'index.js',
    'index.ts',
    'postinstall.js',
}
# Filename endings exempt from short-file warnings.
SKIP_SHORT_SUFFIXES = ('-cli.ts',)

# Exact function names ignored by duplicate detection: generic lifecycle and
# utility names that are expected to recur across unrelated modules.
SKIP_DUPLICATE_FUNCTIONS = {
    # Common utility names
    'main', 'init', 'setup', 'teardown', 'cleanup', 'dispose', 'destroy',
    'open', 'close', 'connect', 'disconnect', 'execute', 'run', 'start', 'stop',
    'render', 'update', 'refresh', 'reset', 'clear', 'flush',
}

# Function-name prefixes ignored by duplicate detection. Matching is a plain
# ``str.startswith`` check, so a prefix also covers longer words that merely
# begin with it.
SKIP_DUPLICATE_PREFIXES = (
    # Transformers
    'normalize', 'parse', 'validate', 'serialize', 'deserialize',
    'convert', 'transform', 'extract', 'encode', 'decode',
    # Predicates
    'is', 'has', 'can', 'should', 'will',
    # Constructors/factories
    'create', 'make', 'build', 'generate', 'new',
    # Accessors
    'get', 'set', 'read', 'write', 'load', 'save', 'fetch',
    # Handlers
    'handle', 'on', 'emit',
    # Modifiers
    'add', 'remove', 'delete', 'update', 'insert', 'append',
    # Other common
    'to', 'from', 'with', 'apply', 'process', 'resolve', 'ensure', 'check',
    'filter', 'map', 'reduce', 'merge', 'split', 'join', 'find', 'search',
    'register', 'unregister', 'subscribe', 'unsubscribe',
)

# Filename endings that identify test files, which are excluded from duplicate
# detection. Both the .test and .spec conventions are covered for .ts and
# .tsx ('.spec.tsx' was previously missing, so TSX spec files were scanned).
SKIP_DUPLICATE_FILE_PATTERNS = ('.test.ts', '.test.tsx', '.spec.ts', '.spec.tsx')

# Top-level directory names that get their own row in the per-package
# breakdown; files anywhere else are grouped under 'root'.
PACKAGES = {
    'src',
    'apps',
    'extensions',
    'packages',
    'scripts',
    'ui',
    'test',
    'docs',
}


def get_package(file_path: Path, root_dir: Path) -> str:
    """Return the top-level package directory that contains a file.

    The package is the first component of ``file_path`` relative to
    ``root_dir`` when that component is listed in ``PACKAGES``. Files that are
    not located under ``root_dir``, or that sit under any other top-level
    entry, are reported as ``'root'``.
    """
    try:
        components = file_path.relative_to(root_dir).parts
    except ValueError:
        # file_path is not inside root_dir.
        return 'root'

    if components and components[0] in PACKAGES:
        return components[0]
    return 'root'


def count_lines(file_path: Path) -> int:
    """Return how many lines a file contains, or 0 if it cannot be read.

    The file is decoded as UTF-8 with undecodable bytes dropped, so binary or
    mis-encoded content never aborts the count.
    """
    total = 0
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            for _line in handle:
                total += 1
    except Exception:
        # Best effort: any failure while opening or reading counts as empty.
        return 0
    return total


def find_code_files(root_dir: Path) -> List[Tuple[Path, int]]:
    """Collect every code file under ``root_dir`` together with its line count.

    Directories named in ``SKIP_DIRS`` are pruned and never entered. A file
    qualifies when its lower-cased suffix is in ``CODE_EXTENSIONS``.

    Directories and files are visited in sorted name order, so the returned
    list (and the tie order of any stable sort applied to it) is the same on
    every platform instead of depending on the OS directory listing order.

    Returns:
        ``(path, line_count)`` tuples in traversal order; ``line_count`` is
        whatever ``count_lines`` reports for the file.
    """
    files_with_counts: List[Tuple[Path, int]] = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Slice-assign so os.walk sees the change: this both prunes skipped
        # directories and fixes the order in which the rest are visited.
        dirnames[:] = sorted(d for d in dirnames if d not in SKIP_DIRS)

        directory = Path(dirpath)
        for filename in sorted(filenames):
            file_path = directory / filename
            if file_path.suffix.lower() in CODE_EXTENSIONS:
                files_with_counts.append((file_path, count_lines(file_path)))

    return files_with_counts


# Regex patterns for TypeScript function definitions (exported and internal).
# Every pattern is anchored at the start of a line, so only unindented
# definitions match, and captures the function name in group 1.
TS_FUNCTION_PATTERNS = [
    # function name(...), optionally preceded by `export`, `export default`
    # and/or `async`. Anonymous `export default function (...)` has no name
    # and is not matched.
    re.compile(
        r'^(?:export\s+(?:default\s+)?)?(?:async\s+)?function\s+(\w+)',
        re.MULTILINE,
    ),
    # const name = (...) =>  /  const name = arg =>, optionally exported and
    # optionally `async`. Also accepts a type annotation on the constant
    # (`const name: Handler = ...`) and a return type after the parameter
    # list (`(...): string =>`). Annotations are limited to one line and may
    # not contain '=', which keeps ordinary assignments from matching.
    re.compile(
        r'^(?:export\s+)?const\s+(\w+)(?:\s*:[^=\n]+)?\s*=\s*(?:async\s+)?'
        r'(?:\([^)]*\)(?:\s*:[^=\n]+)?|\w+)\s*=>',
        re.MULTILINE,
    ),
]


def extract_functions(file_path: Path) -> Set[str]:
    """Extract the names of functions defined in a TypeScript file.

    Only files with a ``.ts`` or ``.tsx`` suffix (case-insensitive) are
    inspected; any other file yields an empty set. Names are collected with
    ``TS_FUNCTION_PATTERNS``, i.e. unindented ``function`` declarations and
    ``const`` arrow functions, including their ``async`` forms.

    Returns:
        The set of distinct function names found, or an empty set when the
        file cannot be read.
    """
    if file_path.suffix.lower() not in {'.ts', '.tsx'}:
        return set()

    try:
        # errors='ignore' drops undecodable bytes, so only I/O failures remain.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        return set()

    return {
        match.group(1)
        for pattern in TS_FUNCTION_PATTERNS
        for match in pattern.finditer(content)
    }


def find_duplicate_functions(files: List[Tuple[Path, int]], root_dir: Path) -> Dict[str, List[Path]]:
    """Map each function name defined in more than one file to those files.

    Files whose name ends with one of ``SKIP_DUPLICATE_FILE_PATTERNS`` are
    left out entirely. Within the remaining files a name is ignored when it is
    listed in ``SKIP_DUPLICATE_FUNCTIONS`` or starts with one of
    ``SKIP_DUPLICATE_PREFIXES``. The prefix test is a plain ``startswith``, so
    for example ``'has'`` also suppresses a name such as ``hashPassword``.

    Args:
        files: ``(path, line_count)`` pairs; only the path is used.
        root_dir: Accepted for interface compatibility; not used here.

    Returns:
        ``{function_name: [paths...]}`` restricted to names found in at least
        two files.
    """
    seen_in: Dict[str, List[Path]] = defaultdict(list)

    for source_path, _line_count in files:
        # Test files routinely repeat helper names; leave them out.
        if source_path.name.endswith(tuple(SKIP_DUPLICATE_FILE_PATTERNS)):
            continue

        for name in extract_functions(source_path):
            is_common = (
                name in SKIP_DUPLICATE_FUNCTIONS
                or name.startswith(tuple(SKIP_DUPLICATE_PREFIXES))
            )
            if not is_common:
                seen_in[name].append(source_path)

    # Keep only the names that occur in two or more files.
    duplicates = {}
    for name, paths in seen_in.items():
        if len(paths) > 1:
            duplicates[name] = paths
    return duplicates


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the report."""
    parser = argparse.ArgumentParser(
        description='Analyze code files: list longest/shortest files, find duplicate function names'
    )
    parser.add_argument(
        '-t', '--threshold',
        type=int,
        default=1000,
        help='Warn about files longer than this many lines (default: 1000)'
    )
    parser.add_argument(
        '--min-threshold',
        type=int,
        default=10,
        help='Warn about files with this many lines or fewer (default: 10)'
    )
    parser.add_argument(
        '-n', '--top',
        type=int,
        default=20,
        help='Show top N longest files (default: 20)'
    )
    parser.add_argument(
        '-b', '--bottom',
        type=int,
        default=10,
        help='Show bottom N shortest files (default: 10)'
    )
    parser.add_argument(
        '-d', '--directory',
        type=str,
        default='.',
        help='Directory to scan (default: current directory)'
    )
    return parser


def _is_expected_short(filename: str) -> bool:
    """Return True for files that are short by design (barrel exports, stubs)."""
    return filename in SKIP_SHORT_PATTERNS or filename.endswith(tuple(SKIP_SHORT_SUFFIXES))


def _print_ranking(heading: str, ranked: List[Tuple[Path, int]], root_dir: Path, flagged: Set[Path]) -> None:
    """Print one line-count table, marking every path contained in ``flagged``."""
    print(heading)
    print(f"{'Lines':>8}  {'File'}")
    print("-" * 60)
    for file_path, line_count in ranked:
        marker = " ⚠️" if file_path in flagged else ""
        print(f"{line_count:>8}  {file_path.relative_to(root_dir)}{marker}")


def _print_package_breakdown(files: List[Tuple[Path, int]], root_dir: Path) -> None:
    """Print file and line totals per package, largest package first."""
    package_stats: Dict[str, Dict[str, int]] = {}
    for file_path, line_count in files:
        stats = package_stats.setdefault(
            get_package(file_path, root_dir), {'files': 0, 'lines': 0}
        )
        stats['files'] += 1
        stats['lines'] += line_count

    print("\n📦 Per-package breakdown:\n")
    print(f"{'Package':<15} {'Files':>8} {'Lines':>10} {'Avg':>8}")
    print("-" * 45)

    by_lines = sorted(package_stats.items(), key=lambda item: item[1]['lines'], reverse=True)
    for pkg, stats in by_lines:
        avg = stats['lines'] // stats['files'] if stats['files'] else 0
        print(f"{pkg:<15} {stats['files']:>8,} {stats['lines']:>10,} {avg:>8,}")


def main():
    """Scan a directory tree and print the code-size report.

    The report lists the longest and shortest code files, overall and
    per-package totals, warnings for files outside the size thresholds, and
    function names that appear in more than one file.

    Size warnings are evaluated over every scanned file, not only the rows
    shown in the top/bottom tables, so the warning counts and the "no files"
    confirmations stay correct regardless of ``--top`` and ``--bottom``.
    A file is too long when it has strictly more than ``--threshold`` lines
    and too short when it has ``--min-threshold`` lines or fewer.

    Exits through ``parser.error`` (status 2) when the directory to scan does
    not exist or is not a directory.
    """
    parser = _build_parser()
    args = parser.parse_args()

    root_dir = Path(args.directory).resolve()
    if not root_dir.is_dir():
        parser.error(f"not a directory: {root_dir}")

    # Negative counts would slice from the wrong end; treat them as zero.
    top_n = max(args.top, 0)
    bottom_n = max(args.bottom, 0)

    print(f"\n📂 Scanning: {root_dir}\n")

    # Find and sort files by line count
    files = find_code_files(root_dir)
    files_desc = sorted(files, key=lambda x: x[1], reverse=True)
    files_asc = sorted(files, key=lambda x: x[1])

    # Threshold violations across ALL files (longest / shortest first).
    too_long = [(path, count) for path, count in files_desc if count > args.threshold]
    too_short = [
        (path, count)
        for path, count in files_asc
        if count <= args.min_threshold and not _is_expected_short(path.name)
    ]

    # Show top N longest files
    top_files = files_desc[:top_n]
    _print_ranking(
        f"📊 Top {len(top_files)} longest code files:\n",
        top_files,
        root_dir,
        {path for path, _ in too_long},
    )

    # Show bottom N shortest files
    bottom_files = files_asc[:bottom_n]
    _print_ranking(
        f"\n📉 Bottom {len(bottom_files)} shortest code files:\n",
        bottom_files,
        root_dir,
        {path for path, _ in too_short},
    )

    # Summary
    total_files = len(files)
    total_lines = sum(count for _, count in files)

    print("-" * 60)
    print("\n📈 Summary:")
    print(f"   Total code files: {total_files:,}")
    print(f"   Total lines: {total_lines:,}")
    print(f"   Average lines/file: {total_lines // total_files if total_files else 0:,}")

    # Per-package breakdown
    _print_package_breakdown(files, root_dir)

    # Long file warnings
    if too_long:
        print(f"\n⚠️  Warning: {len(too_long)} file(s) exceed {args.threshold} lines (consider refactoring):")
        for path, count in too_long:
            print(f"   - {path.relative_to(root_dir)} ({count:,} lines)")
    else:
        print(f"\n✅ No files exceed {args.threshold} lines")

    # Short file warnings
    if too_short:
        print(f"\n⚠️  Warning: {len(too_short)} file(s) are {args.min_threshold} lines or less (check if needed):")
        for path, count in too_short:
            print(f"   - {path.relative_to(root_dir)} ({count} lines)")
    else:
        print(f"\n✅ No files are {args.min_threshold} lines or less")

    # Duplicate function names
    duplicates = find_duplicate_functions(files, root_dir)
    if duplicates:
        print(f"\n⚠️  Warning: {len(duplicates)} function name(s) appear in multiple files (consider renaming):")
        for func_name in sorted(duplicates):
            print(f"   - {func_name}:")
            for path in duplicates[func_name]:
                print(f"       {path.relative_to(root_dir)}")
    else:
        print("\n✅ No duplicate function names")

    print()


# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()