CI: add code-size check for files crossing LOC threshold (#12810)

* CI: add code-size check for files crossing LOC threshold * feat(ci): add duplicate function detection to CI code-size check The --compare-to mode now also detects new duplicate function names introduced by a PR. Uses git diff to scope checks to changed files only, keeping CI fast. * fix(ci): address review feedback for code-size check - Validate git ref upfront; exit 2 if ref doesn't exist - Distinguish 'file missing at ref' from genuine git errors - Explicitly fetch base branch ref in CI workflow - Raise threshold from 700 to 1000 lines * fix(ci): exclude Swabble, skills, .pi from code analysis * update gitignore for pycache * ci: make code-size check informational (no failure on violations)
2026-03-12 07:20:45 +00:00 · 2026-02-09 11:34:18 -08:00
parent 268094938b
commit 50b3d32d3c
3 changed files with 268 additions and 8 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -386,6 +386,31 @@ jobs:
      - name: Run ${{ matrix.task }}
        run: ${{ matrix.command }}
  # Check for files that grew past LOC threshold in this PR (delta-only).
  code-size:
    if: github.event_name == 'pull_request'
    runs-on: blacksmith-4vcpu-ubuntu-2404
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: false
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Fetch base branch
        run: git fetch origin ${{ github.base_ref }}:refs/remotes/origin/${{ github.base_ref }}
      - name: Check code file sizes
        run: |
          python scripts/analyze_code_files.py \
            --compare-to origin/${{ github.base_ref }} \
            --threshold 1000
  secrets:
    runs-on: blacksmith-4vcpu-ubuntu-2404
    steps:
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,8 @@ pnpm-lock.yaml
 bun.lock
 bun.lockb
 coverage
 __pycache__/
 *.pyc
 .tsbuildinfo
 .pnpm-store
 .worktrees/
--- a/scripts/analyze_code_files.py
+++ b/scripts/analyze_code_files.py
@@ -2,13 +2,18 @@
 """
 Lists the longest and shortest code files in the project, and counts duplicated function names across files. Useful for identifying potential refactoring targets and enforcing code size guidelines.
 Threshold can be set to warn about files longer or shorter than a certain number of lines.
 CI mode (--compare-to): Only warns about files that grew past threshold compared to a base ref.
 Use --strict to exit non-zero on violations for CI gating.
 """
 import os
 import re
 import sys
 import subprocess
 import argparse
 from pathlib import Path
-from typing import List, Tuple, Dict, Set
+from typing import List, Tuple, Dict, Set, Optional
 from collections import defaultdict
 # File extensions to consider as code files
@@ -23,7 +28,10 @@ CODE_EXTENSIONS = {
 SKIP_DIRS = {
    'node_modules', '.git', 'dist', 'build', 'coverage',
    '__pycache__', '.turbo', 'out', '.worktrees', 'vendor',
-    'Pods', 'DerivedData', '.gradle', '.idea'
+    'Pods', 'DerivedData', '.gradle', '.idea',
    'Swabble',  # Separate Swift package
    'skills',   # Standalone skill scripts
    '.pi',      # Pi editor extensions
 }
 # Filename patterns to skip in short-file warnings (barrel exports, stubs)
@@ -125,12 +133,7 @@ def extract_functions(file_path: Path) -> Set[str]:
    except Exception:
        return set()
-    functions = set()
+    return extract_functions_from_content(content)
    for pattern in TS_FUNCTION_PATTERNS:
        for match in pattern.finditer(content):
            functions.add(match.group(1))
    return functions
 def find_duplicate_functions(files: List[Tuple[Path, int]], root_dir: Path) -> Dict[str, List[Path]]:
@@ -155,6 +158,170 @@ def find_duplicate_functions(files: List[Tuple[Path, int]], root_dir: Path) -> D
    return {name: paths for name, paths in function_locations.items() if len(paths) > 1}
 def validate_git_ref(root_dir: Path, ref: str) -> bool:
    """Validate that a git ref exists. Exits with error if not."""
    try:
        result = subprocess.run(
            ['git', 'rev-parse', '--verify', ref],
            capture_output=True,
            cwd=root_dir,
            encoding='utf-8',
        )
        return result.returncode == 0
    except Exception:
        return False
 def get_file_content_at_ref(file_path: Path, root_dir: Path, ref: str) -> Optional[str]:
    """Get content of a file at a specific git ref. Returns None if file doesn't exist at ref."""
    try:
        relative_path = file_path.relative_to(root_dir)
        # Use forward slashes for git paths
        git_path = str(relative_path).replace('\\', '/')
        result = subprocess.run(
            ['git', 'show', f'{ref}:{git_path}'],
            capture_output=True,
            cwd=root_dir,
            encoding='utf-8',
            errors='ignore',
        )
        if result.returncode != 0:
            stderr = result.stderr.strip()
            # "does not exist" or "exists on disk, but not in" = file missing at ref (OK)
            if 'does not exist' in stderr or 'exists on disk' in stderr:
                return None
            # Other errors (bad ref, git broken) = genuine failure
            if stderr:
                print(f"⚠️  git show error for {git_path}: {stderr}", file=sys.stderr)
            return None
        return result.stdout
    except Exception as e:
        print(f"⚠️  failed to read {file_path} at {ref}: {e}", file=sys.stderr)
        return None
 def get_line_count_at_ref(file_path: Path, root_dir: Path, ref: str) -> Optional[int]:
    """Get line count of a file at a specific git ref. Returns None if file doesn't exist at ref."""
    content = get_file_content_at_ref(file_path, root_dir, ref)
    if content is None:
        return None
    return len(content.splitlines())
 def extract_functions_from_content(content: str) -> Set[str]:
    """Extract function names from TypeScript content string."""
    functions = set()
    for pattern in TS_FUNCTION_PATTERNS:
        for match in pattern.finditer(content):
            functions.add(match.group(1))
    return functions
 def get_changed_files(root_dir: Path, compare_ref: str) -> Set[str]:
    """Get set of files changed between compare_ref and HEAD (relative paths with forward slashes)."""
    try:
        result = subprocess.run(
            ['git', 'diff', '--name-only', compare_ref, 'HEAD'],
            capture_output=True,
            cwd=root_dir,
            encoding='utf-8',
            errors='ignore',
        )
        if result.returncode != 0:
            return set()
        return {line.strip() for line in result.stdout.splitlines() if line.strip()}
    except Exception:
        return set()
 def find_duplicate_regressions(
    files: List[Tuple[Path, int]],
    root_dir: Path,
    compare_ref: str,
 ) -> Dict[str, List[Path]]:
    """
    Find new duplicate function names that didn't exist at the base ref.
    Only checks functions in files that changed to keep CI fast.
    Returns dict of function_name -> list of current file paths, only for
    duplicates that are new (weren't duplicated at compare_ref).
    """
    # Build current duplicate map
    current_dupes = find_duplicate_functions(files, root_dir)
    if not current_dupes:
        return {}
    # Get changed files to scope the comparison
    changed_files = get_changed_files(root_dir, compare_ref)
    if not changed_files:
        return {}  # Nothing changed, no new duplicates possible
    # Only check duplicate functions that involve at least one changed file
    relevant_dupes: Dict[str, List[Path]] = {}
    for func_name, paths in current_dupes.items():
        involves_changed = any(
            str(p.relative_to(root_dir)).replace('\\', '/') in changed_files
            for p in paths
        )
        if involves_changed:
            relevant_dupes[func_name] = paths
    if not relevant_dupes:
        return {}
    # For relevant duplicates, check if they were already duplicated at base ref
    # Only need to read base versions of files involved in these duplicates
    files_to_check: Set[Path] = set()
    for paths in relevant_dupes.values():
        files_to_check.update(paths)
    base_function_locations: Dict[str, List[Path]] = defaultdict(list)
    for file_path in files_to_check:
        if file_path.suffix.lower() not in {'.ts', '.tsx'}:
            continue
        content = get_file_content_at_ref(file_path, root_dir, compare_ref)
        if content is None:
            continue
        functions = extract_functions_from_content(content)
        for func in functions:
            if func in SKIP_DUPLICATE_FUNCTIONS:
                continue
            if any(func.startswith(prefix) for prefix in SKIP_DUPLICATE_PREFIXES):
                continue
            base_function_locations[func].append(file_path)
    base_dupes = {name for name, paths in base_function_locations.items() if len(paths) > 1}
    # Return only new duplicates
    return {name: paths for name, paths in relevant_dupes.items() if name not in base_dupes}
 def find_threshold_regressions(
    files: List[Tuple[Path, int]],
    root_dir: Path,
    compare_ref: str,
    threshold: int,
 ) -> List[Tuple[Path, int, Optional[int]]]:
    """
    Find files that crossed the threshold compared to a base ref.
    Returns list of (path, current_lines, base_lines) for files that:
    - Were under threshold (or didn't exist) at compare_ref
    - Are now at or over threshold
    """
    regressions = []
    for file_path, current_lines in files:
        if current_lines < threshold:
            continue  # Not over threshold now, skip
        base_lines = get_line_count_at_ref(file_path, root_dir, compare_ref)
        # Regression if: file is new OR was under threshold before
        if base_lines is None or base_lines < threshold:
            regressions.append((file_path, current_lines, base_lines))
    return regressions
 def main():
    parser = argparse.ArgumentParser(
        description='Analyze code files: list longest/shortest files, find duplicate function names'
@@ -189,10 +356,72 @@ def main():
        default='.',
        help='Directory to scan (default: current directory)'
    )
    parser.add_argument(
        '--compare-to',
        type=str,
        default=None,
        help='Git ref to compare against (e.g., origin/main). Only warn about files that grew past threshold.'
    )
    parser.add_argument(
        '--strict',
        action='store_true',
        help='Exit with non-zero status if any violations found (for CI)'
    )
    args = parser.parse_args()
    root_dir = Path(args.directory).resolve()
    # CI delta mode: only show regressions
    if args.compare_to:
        print(f"\n📂 Scanning: {root_dir}")
        print(f"🔍 Comparing to: {args.compare_to}\n")
        if not validate_git_ref(root_dir, args.compare_to):
            print(f"❌ Invalid git ref: {args.compare_to}", file=sys.stderr)
            print("   Make sure the ref exists (e.g. run 'git fetch origin <branch>')", file=sys.stderr)
            sys.exit(2)
        files = find_code_files(root_dir)
        violations = False
        # Check file length regressions
        regressions = find_threshold_regressions(files, root_dir, args.compare_to, args.threshold)
        if regressions:
            print(f"⚠️  {len(regressions)} file(s) crossed {args.threshold} line threshold:\n")
            for file_path, current, base in regressions:
                relative_path = file_path.relative_to(root_dir)
                if base is None:
                    print(f"   {relative_path}: {current:,} lines (new file)")
                else:
                    print(f"   {relative_path}: {base:,} → {current:,} lines (+{current - base:,})")
            print()
            violations = True
        else:
            print(f"✅ No files crossed {args.threshold} line threshold")
        # Check new duplicate function names
        new_dupes = find_duplicate_regressions(files, root_dir, args.compare_to)
        if new_dupes:
            print(f"⚠️  {len(new_dupes)} new duplicate function name(s):\n")
            for func_name in sorted(new_dupes.keys()):
                paths = new_dupes[func_name]
                print(f"   {func_name}:")
                for path in paths:
                    print(f"       {path.relative_to(root_dir)}")
            print()
            violations = True
        else:
            print(f"✅ No new duplicate function names")
        print()
        if args.strict and violations:
            sys.exit(1)
        return
    print(f"\n📂 Scanning: {root_dir}\n")
    # Find and sort files by line count
@@ -306,6 +535,10 @@ def main():
        print(f"\n✅ No duplicate function names")
    print()
    # Exit with error if --strict and there are violations
    if args.strict and long_warnings:
        sys.exit(1)
 if __name__ == '__main__':