#!/usr/bin/env python3

# ---
# Copyright (c) 2026 Carl L. Wuebker & Claude.ai Sonnet 4.6

# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.

# Note that Claude.ai Sonnet 4.6, guided by my requests, testing & feedback,
# wrote the code.  I've minimally tested the code, so there still may be bugs.
# ---


"""
pphj - HTML Pretty Printer
Reads an HTML file and outputs indented, pretty-printed HTML,
reporting structural errors as HTML comments in output and to stderr.

Usage: pphj.py [-i N] [-l N] <html_file>
  -i N   Spaces per indent level (default: 2)
  -l N   Soft maximum line length for collapsing pairs (default: 100)

pphj is a prettyprinter which reports some errors.  vnu.jar is an
excellent HTML validator and can be downloaded from:
  https://github.com/validator/validator/releases/tag/latest
Usage is: java -jar /path/to/vnu.jar file.html
"""

import re
import os
import sys
import argparse

# Void elements: tags that never have a closing tag.  The printer emits them
# as "<tag ...>" and never pushes them on the open-tag stack.
VOID_TAGS = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr'
}

# Phrasing-inline tags: gathered onto the current line without a line break.
INLINE_TAGS = {
    'a', 'abbr', 'acronym', 'b', 'bdi', 'bdo', 'big', 'br', 'button',
    'cite', 'code', 'data', 'del', 'dfn', 'em', 'i', 'img', 'input',
    'ins', 'kbd', 'label', 'mark', 'output', 'q', 's', 'samp', 'select',
    'small', 'span', 'strong', 'sub', 'sup', 'textarea', 'time',
    'tt', 'u', 'var', 'wbr',
}

# Block-inline tags: their full content is collected on one line (like inline
# tags), but they always occupy their OWN line: any prior inline buffer is
# flushed before they start, and they are written out as soon as they end.
BLOCK_INLINE_TAGS = {
    'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'p', 'dt', 'dd', 'caption', 'legend', 'summary',
}

# Mixed tags: open on their own line and seed inline_buf with the opener,
# so inline children (text, <a>, <i>, etc.) accumulate on that same line.
# If a block child arrives, flush_inline() is called first (normal behaviour).
# The closing tag is appended to whatever is in inline_buf rather than being
# emitted on its own indented line.
MIXED_TAGS = {'li', 'td', 'th'}

# Known child-parent relationships: child tag -> set of tags it is expected
# to appear directly inside.  When one of these children is opened and the
# innermost open tag on the stack is not in its set, an error is reported.
REQUIRED_PARENTS = {
    'li': {'ul', 'ol'},
    'dt': {'dl'},
    'dd': {'dl'},
    'td': {'tr'},
    'th': {'tr'},
    'tr': {'table', 'thead', 'tbody', 'tfoot'},
    'thead': {'table'},
    'tbody': {'table'},
    'tfoot': {'table'},
    'caption': {'table'},
    'colgroup': {'table'},
    'col': {'colgroup'},
    'option': {'select', 'datalist', 'optgroup'},
    'optgroup': {'select'},
}


def report_error(msg, out_lines, indent, indent_unit):
    """
    Record one structural error in both output channels.

    msg         -- human-readable description of the problem
    out_lines   -- list of output lines; the error is appended to it as an
                   HTML comment
    indent      -- nesting depth at which the comment should appear
    indent_unit -- string used for one level of indentation

    The same message is also printed to stderr as plain text.
    """
    padding = indent_unit * indent
    out_lines.append("{}<!-- HTML Error: {} -->".format(padding, msg))
    print("HTML Error: {}".format(msg), file=sys.stderr)


class HTMLPrettyPrinter:
    """
    Single-pass HTML re-indenter that also reports structural errors.

    The input is scanned left to right with ``self.pos`` as the cursor.
    Finished lines accumulate in ``self.out_lines`` and ``parse()`` returns
    them joined with newlines.  Tags that open a nested block are tracked on
    ``self.stack`` so indentation follows nesting and unclosed or mismatched
    tags can be reported.  Errors go through ``err()``, which writes an HTML
    comment into the output and (via report_error) a message to stderr.
    """

    # Token patterns, compiled once and matched in place with
    # pattern.match(text, pos) so the remaining input is never copied.
    _WS_RE = re.compile(r'[ \t\r\n]+')
    _TEXT_RE = re.compile(r'[^<]+')
    _DECL_RE = re.compile(r'<![^>]*>')
    _CLOSE_TAG_RE = re.compile(r'</([A-Za-z][A-Za-z0-9_:-]*)\s*>')
    _OPEN_TAG_RE = re.compile(
        r'<([A-Za-z][A-Za-z0-9_:-]*)'
        r'((?:[^>"\'`]|"[^"]*"|\'[^\']*\'|`[^`]*`)*?)'
        r'(\s*/>|>)',
        re.DOTALL
    )

    def __init__(self, text, indent_size=2, max_len=100):
        """
        text        -- the HTML source to pretty-print
        indent_size -- number of spaces per nesting level
        max_len     -- soft limit on the length of a gathered inline line
        """
        self.text = text
        self.pos = 0
        self.length = len(text)
        self.out_lines = []
        self.stack = []        # list of (tag_name, line_no, col_no)
        self.indent = 0
        self.indent_size = indent_size
        self.max_len = max_len
        # When not None, we are accumulating inline content onto this line
        # instead of emitting new indented lines.
        self.inline_buf = None   # str: the line being built up
        # Parallel to self.stack.  None means "no output line recorded yet";
        # flush_inline() replaces every None with the index of the line it
        # just wrote.  Only entries belonging to MIXED_TAGS are ever read.
        self._mixed_opener_idx = []

    # ------------------------------------------------------------------
    def line_col(self, pos=None):
        """Return the 1-based (line, column) of pos (default: the cursor)."""
        if pos is None:
            pos = self.pos
        # Bounded count/rfind give the same answer as slicing text[:pos]
        # without copying the prefix on every call.
        line = self.text.count('\n', 0, pos) + 1
        col = pos - self.text.rfind('\n', 0, pos)
        return line, col

    def ind(self):
        """Return the current indentation string."""
        return ' ' * (self.indent_size * self.indent)

    def flush_inline(self):
        """If we have a buffered inline line, emit it and clear the buffer."""
        if self.inline_buf is None:
            return
        flushed_idx = len(self.out_lines)
        self.out_lines.append(self.inline_buf.rstrip())
        self.inline_buf = None
        # Any opener that had no recorded line yet is now on this line.
        self._mixed_opener_idx = [
            flushed_idx if idx is None else idx
            for idx in self._mixed_opener_idx
        ]

    def emit(self, s):
        """Emit a complete line (flushing any inline buffer first)."""
        self.flush_inline()
        self.out_lines.append(s)

    def emit_inline(self, fragment):
        """
        Append fragment to the current inline buffer.
        If the buffer would exceed max_len, flush first and start a new
        indented line (soft limit: a fragment itself is never broken).
        """
        if self.inline_buf is None:
            self.inline_buf = self.ind() + fragment
            return
        candidate = self.inline_buf + fragment
        if len(candidate) <= self.max_len:
            self.inline_buf = candidate
        else:
            self.flush_inline()
            self.inline_buf = self.ind() + fragment

    def err(self, msg):
        """
        Report a structural error at the current indentation level.

        The inline buffer is flushed first so that the error comment lands
        after the content that precedes it in the source, not before it.
        """
        self.flush_inline()
        report_error(msg, self.out_lines, self.indent,
                     ' ' * self.indent_size)

    def _keep_trailing_space(self):
        """
        Add one space to the inline buffer if the source has whitespace at
        the cursor.  The main loop discards inter-token whitespace, so
        without this the text following an inline element would be glued
        onto it.
        """
        if (self.pos < self.length
                and self.text[self.pos] in ' \t\r\n'
                and self.inline_buf is not None
                and not self.inline_buf.endswith(' ')):
            self.inline_buf += ' '

    # ------------------------------------------------------------------
    def parse(self):
        """
        Run the printer over the whole input and return the formatted text.

        Tags still open at end of input are reported as errors, innermost
        first, after all pending content has been written.
        """
        text = self.text
        while self.pos < self.length:
            # Skip inter-token whitespace (we control indentation)
            m = self._WS_RE.match(text, self.pos)
            if m:
                self.pos = m.end()
                continue

            # Order matters: '<!--' and '</' must be tested before '<'.
            if text.startswith('<!--', self.pos):
                self.parse_comment()
            elif text.startswith('<!', self.pos):
                self.parse_doctype()
            elif text.startswith('</', self.pos):
                self.parse_closing_tag()
            elif text.startswith('<', self.pos):
                self.parse_opening_tag()
            else:
                self.parse_text()

        # Write any buffered content before the end-of-input diagnostics.
        self.flush_inline()

        # Anything left on the stack is unclosed
        while self.stack:
            tag, lno, cno = self.stack.pop()
            if self._mixed_opener_idx:
                self._mixed_opener_idx.pop()
            msg = (f"unclosed <{tag}> opened at line {lno} col {cno}"
                   f" — missing </{tag}>")
            self.indent = len(self.stack)
            self.err(msg)

        return '\n'.join(self.out_lines)

    # ------------------------------------------------------------------
    def parse_comment(self):
        """
        Consume a comment starting at the cursor and emit it on its own line.

        Short single-line comments are normalised to "<!-- text -->";
        multi-line comments or those longer than 60 characters keep their
        body verbatim.  A comment with no "-->" swallows the rest of the
        input; it is reported as an error and closed in the output.
        """
        start = self.pos
        end = self.text.find('-->', start + 4)
        if end == -1:
            lno, cno = self.line_col(start)
            body = self.text[start + 4:]
            self.pos = self.length
            self.err(f"unterminated comment opened at line {lno} col {cno}")
        else:
            body = self.text[start + 4:end]
            self.pos = end + 3
        inner = body.strip()
        if '\n' in inner or len(inner) > 60:
            self.emit(f"{self.ind()}<!--{body}-->")
        else:
            self.emit(f"{self.ind()}<!-- {inner} -->")

    def parse_doctype(self):
        """
        Consume a "<!...>" declaration (e.g. a doctype) and emit it verbatim,
        without indentation.  If no '>' follows, the '<' is kept as literal
        text so no input character is lost.
        """
        m = self._DECL_RE.match(self.text, self.pos)
        if m:
            self.emit(m.group(0))
            self.pos = m.end()
        else:
            self.emit_inline(self.text[self.pos])
            self.pos += 1

    def parse_opening_tag(self):
        """
        Consume an opening (or self-closing) tag at the cursor and dispatch
        on its category: void/self-closing, script/style, inline,
        block-inline, mixed, or plain block.  A '<' that does not start a
        well-formed tag is passed through as literal text.
        """
        m = self._OPEN_TAG_RE.match(self.text, self.pos)
        if not m:
            self.emit_inline(self.text[self.pos])
            self.pos += 1
            return

        tag = m.group(1).lower()
        attr_str = m.group(2).strip()
        closer = m.group(3).strip()
        lno, cno = self.line_col()

        # Check parent relationship against the innermost open tag
        if tag in REQUIRED_PARENTS:
            allowed = REQUIRED_PARENTS[tag]
            parent = self.stack[-1][0] if self.stack else None
            if parent not in allowed:
                loc = f"inside <{parent}>" if parent else "with no parent"
                self.err(f"<{tag}> at line {lno} col {cno} should be inside "
                         f"{sorted(allowed)} but is {loc}")

        self.pos = m.end()

        is_void = tag in VOID_TAGS
        is_self_closing = closer == '/>' or is_void

        tag_open_noind = (f"<{tag} {attr_str}" if attr_str else f"<{tag}")

        if is_self_closing:
            suffix = ">" if is_void else " />"
            if tag in INLINE_TAGS:
                self.emit_inline(tag_open_noind + suffix)
                # e.g. '<input type="checkbox"> Label' must keep its space
                self._keep_trailing_space()
            else:
                self.emit(self.ind() + tag_open_noind + suffix)
            return

        # Script / style: preserve body lines verbatim (always block-level)
        if tag in ('script', 'style'):
            self.flush_inline()
            end_pat = re.compile(r'</' + re.escape(tag) + r'\s*>',
                                 re.IGNORECASE)
            em = end_pat.search(self.text, self.pos)
            if em:
                body = self.text[self.pos:em.start()]
                body_lines = [ln.rstrip() for ln in body.split('\n')]
                # Drop blank lines at both ends; otherwise the newline after
                # the opener and before the closer would add one more blank
                # line each time the output is re-formatted.
                while body_lines and not body_lines[0]:
                    body_lines.pop(0)
                while body_lines and not body_lines[-1]:
                    body_lines.pop()
                if body_lines:
                    self.emit(self.ind() + tag_open_noind + ">")
                    for ln in body_lines:
                        self.emit(ln)
                    self.emit(f"{self.ind()}</{tag}>")
                else:
                    # Empty body, e.g. <script src="..."></script>
                    self.emit(f"{self.ind()}{tag_open_noind}></{tag}>")
                self.pos = em.end()
            else:
                self.emit(self.ind() + tag_open_noind + ">")
                self.err(f"unclosed <{tag}> opened at line {lno} col {cno}"
                         f" — missing </{tag}>")
                self.pos = self.length
            return

        # Inline tag: gather it and its content as one inline fragment
        if tag in INLINE_TAGS:
            fragment = self._parse_inline_tag(tag, tag_open_noind, lno, cno)
            self.emit_inline(fragment)
            self._keep_trailing_space()
            return

        # Block-inline tag: collect content on one line but always its own line
        if tag in BLOCK_INLINE_TAGS:
            self.flush_inline()
            fragment = self._parse_inline_tag(tag, tag_open_noind, lno, cno)
            self.out_lines.append(self.ind() + fragment)
            return

        # Mixed tag (e.g. <li>): seed inline_buf with the opener so that
        # inline children accumulate on the same line; block children will
        # trigger flush_inline() themselves when they are encountered.
        if tag in MIXED_TAGS:
            self.flush_inline()
            self.inline_buf = self.ind() + tag_open_noind + ">"
            self.stack.append((tag, lno, cno))
            self.indent += 1
            # Record a sentinel: None means opener is still in inline_buf
            self._mixed_opener_idx.append(None)
            return

        # Block tag: flush any pending inline, emit on its own line, push stack
        self.flush_inline()
        self.emit(self.ind() + tag_open_noind + ">")
        self.stack.append((tag, lno, cno))
        self._mixed_opener_idx.append(None)
        self.indent += 1

    def _parse_inline_tag(self, tag, tag_open_noind, lno, cno):
        """
        Consume everything up to the matching </tag> and return the whole
        element as a single string fragment.

        tag            -- lower-cased name of the element being gathered
        tag_open_noind -- its opening tag without the final '>' or indent
        lno, cno       -- source position of the opener, for error messages

        Nested openers of the same name are balanced with a depth counter.
        Every other tag, opening or closing, is copied into the fragment
        as-is; block-level tags get no special treatment here.  Reaching
        end of input before the closer is reported as an error and the
        closer is supplied.

        Whitespace between tokens, and at the end of a text run, is reduced
        to a single space via a pending-space flag, so spaces are neither
        lost nor doubled around tags.  Whitespace just before the final
        closer is dropped.
        """
        text = self.text
        buf = tag_open_noind + ">"
        pending_space = False   # True when whitespace was seen since last token
        depth = 1

        def flush_space(closing=False):
            # Emit the pending space before the next token, except before
            # the closer of the element itself, where it is unwanted.
            nonlocal pending_space, buf
            if pending_space and not closing:
                buf += ' '
            pending_space = False

        while self.pos < self.length and depth > 0:
            # Whitespace: set flag, do not emit yet
            m = self._WS_RE.match(text, self.pos)
            if m:
                self.pos = m.end()
                pending_space = True
                continue

            # Closing tag
            cm = self._CLOSE_TAG_RE.match(text, self.pos)
            if cm:
                ctag = cm.group(1).lower()
                self.pos = cm.end()
                if ctag == tag:
                    depth -= 1
                    if depth == 0:
                        flush_space(closing=True)
                        buf += f"</{tag}>"
                        break
                flush_space()
                buf += f"</{ctag}>"
                continue

            # Opening tag
            om = self._OPEN_TAG_RE.match(text, self.pos)
            if om:
                itag = om.group(1).lower()
                iattrs = om.group(2).strip()
                icloser = om.group(3).strip()
                self.pos = om.end()
                tag_str = (f"<{itag} {iattrs}" if iattrs else f"<{itag}")
                is_void = itag in VOID_TAGS
                flush_space()
                if icloser == '/>' or is_void:
                    buf += tag_str + (">" if is_void else " />")
                else:
                    buf += tag_str + ">"
                    if itag == tag:
                        depth += 1
                continue

            # Comment
            if text.startswith('<!--', self.pos):
                end = text.find('-->', self.pos + 4)
                flush_space()
                if end == -1:
                    buf += text[self.pos:]
                    self.pos = self.length
                else:
                    buf += text[self.pos:end + 3]
                    self.pos = end + 3
                continue

            # Plain text
            tm = self._TEXT_RE.match(text, self.pos)
            if tm:
                raw = tm.group(0)
                stripped = raw.strip()
                if stripped:
                    flush_space()
                    buf += stripped
                    # The run swallowed its own trailing whitespace; keep it
                    # as a pending space so "word <b>x</b>" stays separated.
                    if raw[-1].isspace():
                        pending_space = True
                self.pos = tm.end()
                continue

            # A '<' that starts neither a tag nor a comment: copy it through
            flush_space()
            buf += text[self.pos]
            self.pos += 1

        if depth > 0:
            self.err(f"unclosed <{tag}> opened at line {lno} col {cno}"
                     f" — missing </{tag}>")
            buf += f"</{tag}>"

        return buf

    def parse_closing_tag(self):
        """
        Consume a closing tag at the cursor and unwind the open-tag stack.

        A closer with no matching opener is reported and dropped.  Tags left
        open above the match are force-closed with an error each.  For
        MIXED_TAGS the closer is appended to the opener's line when nothing
        has been written after it; otherwise it goes on its own line.
        """
        m = self._CLOSE_TAG_RE.match(self.text, self.pos)
        if not m:
            self.emit_inline(self.text[self.pos])
            self.pos += 1
            return

        tag = m.group(1).lower()
        lno, cno = self.line_col()
        self.pos = m.end()

        # Find matching opener on the stack, innermost first
        stack_idx = next(
            (i for i in range(len(self.stack) - 1, -1, -1)
             if self.stack[i][0] == tag),
            None
        )

        if stack_idx is None:
            self.err(f"</{tag}> at line {lno} col {cno} has no matching "
                     f"opening tag")
            return

        # Auto-close any unclosed tags above the match
        while len(self.stack) - 1 > stack_idx:
            utag, u_lno, u_cno = self.stack.pop()
            if self._mixed_opener_idx:
                self._mixed_opener_idx.pop()
            self.indent -= 1
            self.err(f"missing </{utag}> for <{utag}> opened at line "
                     f"{u_lno} col {u_cno}, forced closed by "
                     f"</{tag}> at line {lno} col {cno}")

        mixed_opener_idx = (self._mixed_opener_idx.pop()
                            if self._mixed_opener_idx else None)
        self.stack.pop()
        self.indent -= 1
        is_mixed = tag in MIXED_TAGS
        if (is_mixed
                and self.inline_buf is not None
                and mixed_opener_idx is None):
            # Opener is still in inline_buf: append closer on the same line
            self.inline_buf = self.inline_buf.rstrip() + f"</{tag}>"
            self.flush_inline()
        elif (is_mixed
                and mixed_opener_idx is not None
                and mixed_opener_idx == len(self.out_lines) - 1
                and self.inline_buf is None):
            # Opener was the very last flushed line and nothing followed:
            # append closer directly to that line
            self.out_lines[-1] = self.out_lines[-1].rstrip() + f"</{tag}>"
        else:
            self.flush_inline()
            self.emit(f"{self.ind()}</{tag}>")

    def parse_text(self):
        """
        Consume a run of text up to the next '<' and add it to the inline
        buffer, keeping at most one space at each end so that neighbouring
        inline tags stay separated from the words around them.
        """
        m = self._TEXT_RE.match(self.text, self.pos)
        if not m:
            self.pos += 1
            return

        raw = m.group(0)
        text = raw.strip()
        if text:
            # Preserve a single inter-token space at each boundary, e.g.
            # "word <i>...</i> word".  Only add a leading space when the
            # inline buffer doesn't already end with one.  (parse() skips
            # space/tab/CR/LF before calling this, so a leading space is
            # only seen for other whitespace characters or direct calls.)
            if raw[0].isspace():
                buf = self.inline_buf or ''
                if buf and not buf.endswith(' '):
                    text = ' ' + text
            if raw[-1].isspace():
                text = text + ' '
            self.emit_inline(text)
        elif self.inline_buf is not None:
            # Whitespace-only text node between tags: ensure one space
            if not self.inline_buf.endswith(' '):
                self.inline_buf += ' '
        self.pos = m.end()


# ----------------------------------------------------------------------
def main():
    """
    Command-line entry point.

    Parses the options, validates the numeric ones, reads the HTML file as
    UTF-8 (undecodable bytes are replaced), and prints the pretty-printed
    result to stdout.  Exits with status 1 on a bad option value or if the
    file cannot be read.
    """
    cli = argparse.ArgumentParser(
        description='HTML Pretty Printer — indents HTML and reports errors.',
        prog=os.path.basename(sys.argv[0]),
    )
    cli.add_argument('html_file', help='HTML file to pretty-print')
    cli.add_argument(
        '-i', '--indent',
        metavar='N', type=int, default=2,
        help='spaces per indent level (default: 2)',
    )
    cli.add_argument(
        '-l', '--length',
        metavar='N', type=int, default=100,
        help='soft max line length for collapsing pairs (default: 100)',
    )
    opts = cli.parse_args()

    # Range checks; only the first failing one is reported.
    problem = None
    if opts.indent < 0:
        problem = "Error: indent must be >= 0"
    elif opts.length < 1:
        problem = "Error: line length must be >= 1"
    if problem is not None:
        print(problem, file=sys.stderr)
        sys.exit(1)

    try:
        with open(opts.html_file, 'r',
                  encoding='utf-8', errors='replace') as src:
            source = src.read()
    except OSError as exc:
        print(f"Error reading file: {exc}", file=sys.stderr)
        sys.exit(1)

    formatted = HTMLPrettyPrinter(
        source,
        indent_size=opts.indent,
        max_len=opts.length,
    ).parse()
    print(formatted)


# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == '__main__':
    main()
