Files
GLM-Tools-Skills-Agents/hooks/claude-codex-settings/scripts/format_python_docstrings.py
uroma 5889d3428b Add comprehensive skills, agents, commands collection
- Added 44 external skills from obra/superpowers, ui-ux-pro-max-skill, claude-codex-settings
- Added 8 autonomous agents (commit-creator, pr-creator, pr-reviewer, etc.)
- Added 23 slash commands for Git/GitHub, setup, and plugin development
- Added hooks for code formatting, notifications, and validation
- Added MCP configurations for Azure, GCloud, Supabase, MongoDB, etc.
- Added awesome-openclaw-skills registry (3,002 skills referenced)
- Updated comprehensive README with full documentation

Sources:
- github.com/obra/superpowers (14 skills)
- github.com/nextlevelbuilder/ui-ux-pro-max-skill (1 skill)
- github.com/fcakyon/claude-codex-settings (29 skills, 8 agents, 23 commands)
- github.com/VoltAgent/awesome-openclaw-skills (registry)
- skills.sh (reference)
- buildwithclaude.com (reference)
2026-02-13 10:30:11 +00:00

548 lines
18 KiB
Python
Executable File

#!/usr/bin/env python3
"""Format Python docstrings in Google style without external dependencies."""
from __future__ import annotations
import ast
import json
import re
import sys
from pathlib import Path
# Schemes that may legitimately appear as "scheme: ..." text; these must not
# be mistaken for "name: description" parameter lines.
URLS = {"https", "http", "ftp"}
# Google-style section headers recognized verbatim.
SECTIONS = (
    "Args",
    "Attributes",
    "Methods",
    "Returns",
    "Yields",
    "Raises",
    "Example",
    "Examples",
    "Notes",
    "References",
)
# Common non-canonical header spellings mapped to their canonical section.
SECTION_ALIASES = {
    "Arguments": "Args",
    "Usage": "Examples",
    "Usage Example": "Examples",
    "Usage Examples": "Examples",
    "Example Usage": "Examples",
    "Example": "Examples",
    "Return": "Returns",
    "Yield": "Yields",
    "Raise": "Raises",
    "Note": "Notes",
    "Reference": "References",
}
# Bullet ("-", "*", "•") or enumerated ("1.", "a)") list-item prefixes.
LIST_RX = re.compile(r"""^(\s*)(?:[-*•]\s+|(?:\d+|[A-Za-z]+)[\.\)]\s+)""")
TABLE_RX = re.compile(r"^\s*\|.*\|\s*$")
TABLE_RULE_RX = re.compile(r"^\s*[:\-\|\s]{3,}$")
# Box-drawing characters used in ASCII-art directory trees.  Bug fix: this
# tuple held four EMPTY strings, and since `"" in s` is always True, every
# line was classified as tree-like and paragraph wrapping never happened.
TREE_CHARS = ("│", "├", "└", "─")
# Antipatterns for non-Google docstring styles
RST_FIELD_RX = re.compile(r"^\s*:(param|type|return|rtype|raises)\b", re.M)
EPYDOC_RX = re.compile(r"^\s*@(?:param|type|return|rtype|raise)\b", re.M)
NUMPY_UNDERLINE_SECTION_RX = re.compile(r"^\s*(Parameters|Returns|Yields|Raises|Notes|Examples)\n[-]{3,}\s*$", re.M)
GOOGLE_SECTION_RX = re.compile(
    r"^\s*(Args|Attributes|Methods|Returns|Yields|Raises|Example|Examples|Notes|References):\s*$", re.M
)
# Docstring styles the formatter refuses to rewrite (preserved verbatim).
NON_GOOGLE = {"numpy", "rest", "epydoc"}
def wrap_words(words: list[str], width: int, indent: int, min_words_per_line: int = 1) -> list[str]:
    """Wrap words to width with indent; optionally avoid very short orphan lines.

    Args:
        words: Tokens to lay out, in order.
        width: Maximum total line width, including the indent.
        indent: Number of leading spaces applied to every produced line.
        min_words_per_line: When > 1, borrow a word from the previous line so
            continuation lines are not left with a lone orphan word.

    Returns:
        list[str]: Wrapped lines, each prefixed with the indent padding.
    """
    pad = " " * indent
    if not words:
        return []
    lines: list[list[str]] = []
    cur: list[str] = []
    cur_len = indent  # running width of `cur`, including the indent
    for w in words:
        need = len(w) + (1 if cur else 0)  # +1 for the joining space
        if cur and cur_len + need > width:
            lines.append(cur)
            cur, cur_len = [w], indent + len(w)
        else:
            cur.append(w)
            cur_len += need
    if cur:
        lines.append(cur)
    if min_words_per_line > 1:
        # Orphan repair: pull the last word of the previous line down when a
        # line is too short, provided the widened line still fits in `width`.
        i = 1
        while i < len(lines):
            if len(lines[i]) < min_words_per_line and len(lines[i - 1]) > 1:
                donor = lines[i - 1][-1]
                this_len = len(pad) + sum(len(x) for x in lines[i]) + (len(lines[i]) - 1)
                if this_len + (1 if lines[i] else 0) + len(donor) <= width:
                    lines[i - 1].pop()
                    lines[i].insert(0, donor)
                    # The previous line may itself have become a single word;
                    # step back and re-examine it before advancing.
                    if i > 1 and len(lines[i - 1]) == 1:
                        i -= 1
                        continue
            i += 1
    return [pad + " ".join(line) for line in lines]
def wrap_para(text: str, width: int, indent: int, min_words_per_line: int = 1) -> list[str]:
    """Wrap a single paragraph of text; whitespace-only input yields no lines."""
    stripped = text.strip()
    if not stripped:
        return []
    return wrap_words(stripped.split(), width, indent, min_words_per_line)
def wrap_hanging(head: str, desc: str, width: int, cont_indent: int) -> list[str]:
    """Lay out head+desc with a hanging indent; continuation lines carry >=2 words."""
    words = desc.split()
    if not words:
        return [head.rstrip()]
    # Greedily pack as many words as fit after the head on the first line.
    budget = width - len(head)
    consumed = 0
    count = 0
    for w in words:
        cost = len(w) if count == 0 else len(w) + 1
        if consumed + cost > budget:
            break
        consumed += cost
        count += 1
    result: list[str] = []
    if count:
        result.append(head + " ".join(words[:count]))
        remainder = words[count:]
    else:
        # Nothing fits next to the head; emit it alone.
        result.append(head.rstrip())
        remainder = words
    result.extend(wrap_words(remainder, width, cont_indent, min_words_per_line=2))
    return result
def is_list_item(s: str) -> bool:
    """Report whether *s* begins like a bulleted or enumerated list entry."""
    return LIST_RX.match(s.lstrip()) is not None
def is_fence_line(s: str) -> bool:
    """Report whether *s* opens or closes a Markdown code fence (```)."""
    return s.lstrip().startswith("```")
def is_table_like(s: str) -> bool:
    """Report whether *s* resembles a Markdown table row or separator rule."""
    if TABLE_RX.match(s):
        return True
    return bool(TABLE_RULE_RX.match(s))
def is_tree_like(s: str) -> bool:
    """Report whether *s* contains directory-tree drawing characters."""
    for ch in TREE_CHARS:
        if ch in s:
            return True
    return False
def is_indented_block_line(s: str) -> bool:
    """Return True if s looks like a deeply-indented preformatted block.

    A line starting with a tab or at least eight spaces is passed through
    verbatim instead of being rewrapped.  The previous check fired on a
    single leading space; because docstrings are extracted with
    ``clean=False`` (interior indentation intact), that classified nearly
    every continuation line as preformatted and defeated wrapping.
    NOTE(review): 8 spaces matches the "deeply-indented" contract in the
    docstring — confirm against upstream if a different threshold was meant.
    """
    return s.startswith("\t") or s.startswith(" " * 8)
def header_name(line: str) -> str | None:
    """Map a line ending in ':' to its canonical section name, or None."""
    candidate = line.strip()
    if len(candidate) < 2 or not candidate.endswith(":"):
        return None
    raw = candidate[:-1].strip()
    # Normalize alternate spellings (e.g. "Arguments" -> "Args") first.
    title = SECTION_ALIASES.get(raw, raw)
    if title in SECTIONS:
        return title
    return None
def add_header(lines: list[str], indent: int, title: str) -> None:
    """Append '<title>:' to *lines*, separated from prior content by one blank line."""
    # Collapse any trailing blanks so exactly one separator is produced.
    while lines and not lines[-1]:
        lines.pop()
    if lines:
        lines.append("")
    lines.append(f"{' ' * indent}{title}:")
def emit_paragraphs(
    src: list[str], width: int, indent: int, list_indent: int | None = None, orphan_min: int = 1
) -> list[str]:
    """Rewrap prose while passing lists, fenced code, tables, trees, and preformatted lines through verbatim."""
    result: list[str] = []
    pending: list[str] = []
    inside_fence = False

    def emit_pending() -> None:
        """Wrap and flush the buffered paragraph, if any."""
        nonlocal pending
        if pending:
            joined = " ".join(part.strip() for part in pending)
            result.extend(wrap_para(joined, width, indent, min_words_per_line=orphan_min))
            pending = []

    for raw in src:
        line = raw.rstrip("\n")
        if not line.strip():
            # Blank line ends the current paragraph.
            emit_pending()
            result.append("")
        elif is_fence_line(line):
            emit_pending()
            result.append(line.rstrip())
            inside_fence = not inside_fence
        elif inside_fence or is_table_like(line) or is_tree_like(line) or is_indented_block_line(line):
            # Structural content is emitted untouched.
            emit_pending()
            result.append(line.rstrip())
        elif is_list_item(line):
            emit_pending()
            if list_indent is None:
                result.append(line.rstrip())
            else:
                result.append(" " * list_indent + line.strip())
        else:
            pending.append(line)
    emit_pending()
    while result and not result[-1]:
        result.pop()
    return result
def parse_sections(text: str) -> dict[str, list[str]]:
    """Split a Google-style docstring into summary/description/section buckets."""
    buckets: dict[str, list[str]] = {key: [] for key in ("summary", "description", *SECTIONS)}
    active = "summary"
    for raw in text.splitlines():
        line = raw.rstrip("\n")
        section = header_name(line)
        if section:
            active = section
            continue
        if line.strip():
            buckets[active].append(line)
            continue
        # Blank line: the summary ends at the first blank; interior blanks
        # within an already-started bucket are preserved.
        if active == "summary" and buckets["summary"]:
            active = "description"
        if buckets[active]:
            buckets[active].append("")
    return buckets
def looks_like_param(s: str) -> bool:
    """Heuristic: does *s* read as a 'name: description' entry (not a list item or URL)?"""
    if ":" not in s or is_list_item(s):
        return False
    name = s.split(":", 1)[0].strip()
    # "https: ..." etc. are URL fragments, not parameter names.
    if not name or name in URLS:
        return False
    return True
def iter_items(lines: list[str]) -> list[list[str]]:
    """Split *lines* into items, each beginning at a param-like line."""
    items: list[list[str]] = []
    pos = 0
    total = len(lines)
    while pos < total:
        # Skip blank separator lines before the next item.
        while pos < total and not lines[pos].strip():
            pos += 1
        if pos == total:
            break
        current = [lines[pos]]
        pos += 1
        # Accumulate continuation lines until the next param-like line.
        while pos < total:
            text = lines[pos].strip()
            if text and looks_like_param(text):
                break
            current.append(lines[pos])
            pos += 1
        items.append(current)
    return items
def format_structured_block(lines: list[str], width: int, base: int) -> list[str]:
    """Format Args/Returns/etc.; continuation at base+4, lists at base+8.

    Args:
        lines: Raw lines belonging to one structured section.
        width: Maximum output line width.
        base: Indent column of the section header.

    Returns:
        list[str]: Formatted lines for the section body.
    """
    out: list[str] = []
    cont, lst = base + 4, base + 8
    for item in iter_items(lines):
        first = item[0].strip()
        # Split "name: desc"; the padded "" keeps unpacking safe when ":" is absent.
        name, desc = ([*first.split(":", 1), ""])[:2]
        name, desc = name.strip(), desc.strip()
        had_colon = ":" in first
        # A "name" containing spaces without parentheses (unlike "x (int)")
        # is treated as prose rather than a parameter entry.
        if not name or (" " in name and "(" not in name and ")" not in name):
            out.extend(emit_paragraphs(item, width, cont, lst, orphan_min=2))
            continue
        # Join continuation lines that aren't new paragraphs into desc
        parts = [desc] if desc else []
        tail, i = [], 1
        while i < len(item):
            line = item[i].strip()
            if not line or is_list_item(item[i]) or is_fence_line(item[i]) or is_table_like(item[i]):
                # Structural content begins here; keep it out of the joined desc.
                tail = item[i:]
                break
            parts.append(line)
            i += 1
        else:
            # Every continuation line was plain prose; nothing structural follows.
            tail = []
        desc = " ".join(parts)
        head = " " * cont + (f"{name}: " if (desc or had_colon) else name)
        out.extend(wrap_hanging(head, desc, width, cont + 4))
        if tail:
            if body := emit_paragraphs(tail, width, cont + 4, lst, orphan_min=2):
                out.extend(body)
    return out
def detect_opener(original_literal: str) -> tuple[str, str, bool]:
    """Inspect a docstring token: return (kept prefix, quote style, text-follows-opener flag)."""
    token = original_literal.lstrip()
    # Scan past any string-prefix letters (r/R/u/U/b/B/f/F).
    n = 0
    while n < len(token) and token[n] in "rRuUbBfF":
        n += 1
    quotes = '"""'
    candidate = token[n : n + 3]
    if len(token) >= n + 3 and candidate in ('"""', "'''"):
        quotes = candidate
    # Only raw/unicode prefixes survive; b/f are dropped from the rebuilt opener.
    prefix = "".join(c for c in token[:n] if c in "rRuU")
    after = n + len(quotes)
    inline_hint = after < len(token) and token[after : after + 1] not in {"", "\n", "\r"}
    return prefix, quotes, inline_hint
def format_google(text: str, indent: int, width: int, quotes: str, prefix: str, start_newline: bool) -> str:
    """Format multi-line Google-style docstring with start_newline controlling summary placement.

    Args:
        text: Stripped docstring content.
        indent: Column of the docstring's opening quotes.
        width: Maximum output line width.
        quotes: Triple-quote style to reuse ('\"\"\"' or \"'''\").
        prefix: String prefix to keep (r/u only).
        start_newline: When True, the summary starts on the line after the opener.

    Returns:
        str: The rebuilt docstring, including opening and closing quotes.
    """
    p = parse_sections(text)
    opener = prefix + quotes
    out: list[str] = []
    if p["summary"]:
        summary_text = " ".join(x.strip() for x in p["summary"]).strip()
        # Ensure the summary ends with terminal punctuation.
        if summary_text and summary_text[-1] not in ".!?":
            summary_text += "."
        if start_newline:
            out.append(opener)
            out.extend(emit_paragraphs([summary_text], width, indent, list_indent=indent, orphan_min=1))
        else:
            # The opener carries no leading indent here, so shrink the width
            # budget by the indent before hanging-wrapping the summary.
            eff_width = max(1, width - indent)
            out.extend(wrap_hanging(opener, summary_text, eff_width, indent))
    else:
        out.append(opener)
    if any(x.strip() for x in p["description"]):
        out.append("")
        out.extend(emit_paragraphs(p["description"], width, indent, list_indent=indent, orphan_min=1))
    # has_content tracks whether anything precedes the next section header,
    # which decides whether a blank separator line is needed.
    has_content = bool(p["summary"]) or any(x.strip() for x in p["description"])
    for sec in ("Args", "Attributes", "Methods", "Returns", "Yields", "Raises"):
        if any(x.strip() for x in p[sec]):
            if has_content:
                add_header(out, indent, sec)
            else:
                out.append(" " * indent + f"{sec}:")
            has_content = True
            out.extend(format_structured_block(p[sec], width, indent))
    # Free-form sections are re-emitted verbatim (no rewrapping).
    for sec in ("Examples", "Notes", "References"):
        if any(x.strip() for x in p[sec]):
            add_header(out, indent, sec)
            out.extend(x.rstrip() for x in p[sec])
    while out and out[-1] == "":
        out.pop()
    out.append(" " * indent + quotes)
    return "\n".join(out)
def likely_docstring_style(text: str) -> str:
    """Classify docstring markup as 'rest', 'epydoc', 'numpy', 'google', or 'unknown'."""
    normalized = "\n".join(line.rstrip() for line in text.strip().splitlines())
    # Order matters: reST and epydoc field markers take precedence, then
    # NumPy underlined headers, then Google 'Section:' headers.
    for style, pattern in (
        ("rest", RST_FIELD_RX),
        ("epydoc", EPYDOC_RX),
        ("numpy", NUMPY_UNDERLINE_SECTION_RX),
        ("google", GOOGLE_SECTION_RX),
    ):
        if pattern.search(normalized):
            return style
    return "unknown"
def format_docstring(
    content: str, indent: int, width: int, quotes: str, prefix: str, start_newline: bool = False
) -> str:
    """Render a docstring: empty -> bare quotes, non-Google -> body kept, short -> one line, else Google layout."""
    if not content or not content.strip():
        return f"{prefix}{quotes}{quotes}"
    if likely_docstring_style(content) in NON_GOOGLE:
        # NumPy/reST/epydoc docstrings are preserved apart from trailing whitespace.
        kept = "\n".join(line.rstrip() for line in content.rstrip("\n").splitlines())
        return f"{prefix}{quotes}{kept}{quotes}"
    text = content.strip()
    section_present = any(f"{name}:" in text for name in SECTIONS)
    list_present = any(is_list_item(line) for line in text.splitlines())
    fits_one_line = indent + len(prefix) + len(quotes) * 2 + len(text) <= width
    if "\n" not in text and not section_present and not list_present and fits_one_line:
        words = text.split()
        # Capitalize the first word unless it is a URL.
        if words and not words[0].startswith(("http://", "https://")) and not words[0][0].isupper():
            words[0] = words[0][0].upper() + words[0][1:]
        summary = " ".join(words)
        if summary and summary[-1] not in ".!?":
            summary += "."
        return f"{prefix}{quotes}{summary}{quotes}"
    return format_google(text, indent, width, quotes, prefix, start_newline)
class Visitor(ast.NodeVisitor):
    """Collect docstring replacements for classes and functions.

    Walks the AST and records, for each class/function docstring that needs
    reformatting, a (start_line, end_line, start_col, end_col, new_text)
    tuple in ``self.repl`` (0-based indices into ``self.src``).
    """

    def __init__(self, src: list[str], width: int = 120, start_newline: bool = False):
        """Init with source lines, target width, and start_newline flag."""
        self.src, self.width, self.repl, self.start_newline = src, width, [], start_newline

    def visit_Module(self, node):
        """Skip module docstring; visit children."""
        self.generic_visit(node)

    def visit_ClassDef(self, node):
        """Visit class definition and handle its docstring."""
        self._handle(node)
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        """Visit function definition and handle its docstring."""
        self._handle(node)
        self.generic_visit(node)

    def visit_AsyncFunctionDef(self, node):
        """Visit async function definition and handle its docstring."""
        self._handle(node)
        self.generic_visit(node)

    def _handle(self, node):
        """If first stmt is a string expr, schedule replacement."""
        try:
            # clean=False keeps original indentation so offsets stay valid.
            doc = ast.get_docstring(node, clean=False)
            if not doc or not node.body or not isinstance(node.body[0], ast.Expr):
                return
            s = node.body[0].value
            if not (isinstance(s, ast.Constant) and isinstance(s.value, str)):
                return
            # Non-Google styles are left untouched (see format_docstring too).
            if likely_docstring_style(doc) in NON_GOOGLE:
                return
            # Convert AST 1-based line numbers to 0-based indices into src.
            sl, el = node.body[0].lineno - 1, node.body[0].end_lineno - 1
            sc, ec = node.body[0].col_offset, node.body[0].end_col_offset
            if sl < 0 or el >= len(self.src):
                return
            # Recover the exact original token text, possibly spanning lines.
            original = (
                self.src[sl][sc:ec]
                if sl == el
                else "\n".join([self.src[sl][sc:], *self.src[sl + 1 : el], self.src[el][:ec]])
            )
            prefix, quotes, _ = detect_opener(original)
            formatted = format_docstring(doc, sc, self.width, quotes, prefix, self.start_newline)
            # Only record a replacement when formatting actually changed text.
            if formatted.strip() != original.strip():
                self.repl.append((sl, el, sc, ec, formatted))
        except Exception:
            # Be conservative: skip any node we cannot process cleanly.
            return
def format_python_file(text: str, width: int = 120, start_newline: bool = False) -> str:
    """Return source with reformatted docstrings; on failure, return original.

    Args:
        text: Full Python source text.
        width: Maximum docstring line width.
        start_newline: Whether summaries start on the line after the opener.

    Returns:
        str: Reformatted source, or *text* unchanged on any parse/visit failure.
    """
    s = text
    if not s.strip():
        return s
    # Cheap pre-filter: bail out when nothing could hold a docstring.
    if ('"""' not in s and "'''" not in s) or ("def " not in s and "class " not in s and "async def " not in s):
        return s
    try:
        tree = ast.parse(s)
    except SyntaxError:
        return s
    src = s.splitlines()
    v = Visitor(src, width, start_newline=start_newline)
    try:
        v.visit(tree)
    except Exception:
        return s
    if not v.repl:
        return s
    # Apply replacements bottom-up so earlier line/column offsets stay valid.
    for sl, el, sc, ec, rep in reversed(v.repl):
        try:
            if sl == el:
                src[sl] = src[sl][:sc] + rep + src[sl][ec:]
            else:
                # Multi-line splice: keep text before/after the token intact.
                nl = rep.splitlines()
                nl[0] = src[sl][:sc] + nl[0]
                nl[-1] += src[el][ec:]
                src[sl : el + 1] = nl
        except Exception:
            continue
    return "\n".join(src)
def preserve_trailing_newlines(original: str, formatted: str) -> str:
    """Give *formatted* the same count of trailing newlines as *original*."""
    wanted = len(original) - len(original.rstrip("\n"))
    current = len(formatted) - len(formatted.rstrip("\n"))
    if wanted == current:
        return formatted
    return formatted.rstrip("\n") + "\n" * wanted
def read_python_path() -> Path | None:
    """Read the Python path from stdin payload.

    Expects a JSON object with a ``tool_input.file_path`` string entry.

    Returns:
        (Path | None): Python file path when present and valid; None when the
            payload is malformed, the file is not an existing ``.py`` file,
            or it lives under a VCS/venv/cache/build directory.
    """
    # Directories whose contents should never be reformatted.
    skip_parts = frozenset(
        {
            ".git",
            ".venv",
            "venv",
            "env",
            ".env",
            "__pycache__",
            ".mypy_cache",
            ".pytest_cache",
            ".tox",
            ".nox",
            ".eggs",
            "eggs",
            ".idea",
            ".vscode",
            "node_modules",
            "site-packages",
            "build",
            "dist",
            ".claude",
        }
    )
    try:
        data = json.load(sys.stdin)
    except Exception:
        return None
    # Fix: guard non-object payloads (e.g. a JSON array or string), which
    # previously raised AttributeError on .get() outside the try block.
    if not isinstance(data, dict):
        return None
    tool_input = data.get("tool_input", {})
    if not isinstance(tool_input, dict):
        return None
    file_path = tool_input.get("file_path", "")
    # Fix: a non-string file_path would make Path() raise TypeError.
    if not isinstance(file_path, str) or not file_path:
        return None
    path = Path(file_path)
    if path.suffix != ".py" or not path.exists():
        return None
    if skip_parts.intersection(path.parts):
        return None
    return path
def main() -> None:
    """Hook entry point: reformat docstrings in the file named by the stdin payload."""
    target = read_python_path()
    if target:
        try:
            original = target.read_text()
            updated = preserve_trailing_newlines(original, format_python_file(original))
            # Only touch the file when formatting actually changed it.
            if updated != original:
                target.write_text(updated)
                print(f"Formatted: {target}")
        except Exception as e:
            # Report failures back to the hook runner as structured JSON.
            payload = {
                "hookSpecificOutput": {
                    "hookEventName": "PostToolUse",
                    "additionalContext": f"Docstring formatting failed for {target.name}: {e}",
                }
            }
            print(json.dumps(payload))
    sys.exit(0)
# Run only when executed as a script (hook invocation), not when imported.
if __name__ == "__main__":
    main()