#!/usr/bin/env python3
# ---
# Copyright (c) 2026 Carl L. Wuebker & Claude.ai Sonnet 4.6
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
# Note that Claude.ai Sonnet 4.6, guided by my requests, testing & feedback,
# wrote the code. I've minimally tested the code, so there still may be bugs.
# ---
"""
pphj - HTML Pretty Printer

Reads an HTML file and outputs indented, pretty-printed HTML,
reporting structural errors as HTML comments in output and to stderr.

Usage: pphj.py [-i N] [-l N] <html_file>
  -i N   Spaces per indent level (default: 2)
  -l N   Soft maximum line length for collapsing pairs (default: 100)

pphj is a prettyprinter which reports some errors.
vnu.jar is an excellent HTML validator and can be downloaded from:
  https://github.com/validator/validator/releases/tag/latest
Usage is: java -jar /path/to/vnu.jar file.html
"""

import re
import os
import sys
import argparse

# Tags that are self-closing (void elements)
VOID_TAGS = {
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr'
}

# Phrasing-inline tags: gathered onto the current line without a line break.
INLINE_TAGS = {
    'a', 'abbr', 'acronym', 'b', 'bdi', 'bdo', 'big', 'br', 'button',
    'cite', 'code', 'data', 'del', 'dfn', 'em', 'i', 'img', 'input',
    'ins', 'kbd', 'label', 'mark', 'output', 'q', 's', 'samp', 'select',
    'small', 'span', 'strong', 'sub', 'sup', 'textarea', 'time', 'tt',
    'u', 'var', 'wbr',
}

# Block-inline tags: their full content is collected on one line (like inline
# tags), but they always occupy their OWN line — flushing any prior inline
# buffer before starting, and flushing themselves afterwards.
BLOCK_INLINE_TAGS = {
    'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'dt', 'dd',
    'caption', 'legend', 'summary',
}

# Mixed tags: open on their own line and seed inline_buf with the opener,
# so inline children (text, <a>, <em>, etc.) accumulate on that same line.
# If a block child arrives, flush_inline() is called first (normal behaviour).
# The closing tag is appended to whatever is in inline_buf rather than being
# emitted on its own indented line.
MIXED_TAGS = {'li', 'td', 'th'}

# Raw-text tags: the body is copied line by line without being parsed.
RAW_TEXT_TAGS = {'script', 'style'}

# Whitespace-sensitive tags: the body is copied byte-for-byte, because
# re-indenting or trimming it would change what the browser renders.
PRESERVED_TAGS = {'pre', 'textarea'}

# Known parent-child relationships: child tag -> set of allowed parent tags
REQUIRED_PARENTS = {
    'li': {'ul', 'ol'},
    'dt': {'dl'},
    'dd': {'dl'},
    'td': {'tr'},
    'th': {'tr'},
    'tr': {'table', 'thead', 'tbody', 'tfoot'},
    'thead': {'table'},
    'tbody': {'table'},
    'tfoot': {'table'},
    'caption': {'table'},
    'colgroup': {'table'},
    'col': {'colgroup'},
    'option': {'select', 'datalist', 'optgroup'},
    'optgroup': {'select'},
}

# Token patterns, compiled once and matched at an offset (pattern.match(text,
# pos)) so the scanner never has to copy the unread tail of the document.
_WS_RE = re.compile(r'[ \t\r\n]+')
_TEXT_RE = re.compile(r'[^<]+')
_DOCTYPE_RE = re.compile(r'<![^>]*>', re.DOTALL)
_OPEN_TAG_RE = re.compile(
    r'<([A-Za-z][A-Za-z0-9_:-]*)'
    r'((?:[^>"\'`]|"[^"]*"|\'[^\']*\'|`[^`]*`)*?)'
    r'(\s*/>|>)',
    re.DOTALL,
)
_CLOSE_TAG_RE = re.compile(r'</([A-Za-z][A-Za-z0-9_:-]*)\s*>')


def report_error(msg, out_lines, indent, indent_unit):
    """Emit an error as an HTML comment to output and plain text to stderr.

    Args:
        msg: Human-readable description of the problem.
        out_lines: Output line list; the comment line is appended to it.
        indent: Current indent level (number of indent units).
        indent_unit: The string used for one indent level.
    """
    # A double hyphen inside the message would terminate the comment early,
    # so break up hyphen runs in the copy that goes into the document.
    safe = re.sub(r'-(?=-)', '- ', msg)
    out_lines.append(f"{indent_unit * indent}<!-- ERROR: {safe} -->")
    print(f"HTML Error: {msg}", file=sys.stderr)


def _unclosed_msg(tag, lno, cno):
    """Build the standard "unclosed tag" error message."""
    return (f"unclosed <{tag}> opened at line {lno} col {cno}"
            f" — missing </{tag}>")


class HTMLPrettyPrinter:
    """Single-pass scanner that re-indents HTML and reports nesting errors.

    Construct with the document text, then call parse() once to obtain the
    pretty-printed document as a string.
    """

    def __init__(self, text, indent_size=2, max_len=100):
        """Set up scanner state.

        Args:
            text: The HTML source to format.
            indent_size: Number of spaces per indent level.
            max_len: Soft limit on the length of a gathered inline line.
        """
        self.text = text
        self.pos = 0
        self.length = len(text)
        self.out_lines = []
        self.stack = []  # list of (tag_name, line_no, col_no)
        self.indent = 0
        self.indent_size = indent_size
        self.max_len = max_len
        # When not None, we are accumulating inline content onto this line
        # instead of emitting new indented lines.
        self.inline_buf = None  # str: the line being built up
        # Parallel to self.stack. An entry is None until the next inline
        # flush, then holds the out_lines index of that flushed line. Only
        # MIXED_TAGS entries are ever read back: for them None means "the
        # opener is still sitting in inline_buf".
        self._mixed_opener_idx = []

    # ------------------------------------------------------------------

    def line_col(self, pos=None):
        """Return the 1-based (line, column) of *pos* (default: self.pos)."""
        if pos is None:
            pos = self.pos
        line = self.text.count('\n', 0, pos) + 1
        # rfind() yields -1 on the first line, which makes the column 1-based.
        col = pos - self.text.rfind('\n', 0, pos)
        return line, col

    def ind(self):
        """Return the current indentation string."""
        return ' ' * (self.indent_size * self.indent)

    def flush_inline(self):
        """If we have a buffered inline line, emit it and clear the buffer."""
        if self.inline_buf is None:
            return
        flushed_idx = len(self.out_lines)
        self.out_lines.append(self.inline_buf.rstrip())
        self.inline_buf = None
        # Any opener that was still waiting in the buffer now lives on the
        # line just written; remember where.
        for i, v in enumerate(self._mixed_opener_idx):
            if v is None:
                self._mixed_opener_idx[i] = flushed_idx

    def emit(self, s):
        """Emit a complete line (flushing any inline buffer first)."""
        self.flush_inline()
        self.out_lines.append(s)

    def emit_inline(self, fragment):
        """
        Append fragment to the current inline buffer. If the buffer would
        exceed max_len, flush first and start a new indented line
        (soft limit — we never break a fragment itself).
        """
        if self.inline_buf is None:
            self.inline_buf = self.ind() + fragment
            return
        candidate = self.inline_buf + fragment
        if len(candidate) <= self.max_len:
            self.inline_buf = candidate
        else:
            self.flush_inline()
            self.inline_buf = self.ind() + fragment

    def err(self, msg):
        """Report *msg* at the current indent level.

        The comment line is appended straight to out_lines, so it lands
        ahead of any inline content that has not been flushed yet.
        """
        report_error(msg, self.out_lines, self.indent, ' ' * self.indent_size)

    def _keep_space_after(self):
        """Keep one space in the buffer if the source has whitespace next.

        The main loop discards inter-token whitespace, so without this a
        space that separates an inline element (or a stray '<') from the
        text that follows would disappear.
        """
        if (self.pos < self.length
                and self.text[self.pos] in ' \t\r\n'
                and self.inline_buf is not None
                and not self.inline_buf.endswith(' ')):
            self.inline_buf += ' '

    def _emit_stray_char(self):
        """Copy one unparseable character through as inline text."""
        self.emit_inline(self.text[self.pos])
        self.pos += 1
        self._keep_space_after()

    # ------------------------------------------------------------------

    def parse(self):
        """Format the whole document and return it as one string.

        Tags still open at end of input are reported as errors; no closing
        tags are synthesised for them.
        """
        text = self.text
        while self.pos < self.length:
            # Skip inter-token whitespace (we control indentation)
            m = _WS_RE.match(text, self.pos)
            if m:
                self.pos = m.end()
                continue

            if text.startswith('<!--', self.pos):
                self.parse_comment()
            elif text.startswith('<!', self.pos):
                self.parse_doctype()
            elif text.startswith('</', self.pos):
                self.parse_closing_tag()
            elif text.startswith('<', self.pos):
                self.parse_opening_tag()
            else:
                self.parse_text()

        self.flush_inline()
        while self.stack:
            tag, lno, cno = self.stack.pop()
            if self._mixed_opener_idx:
                self._mixed_opener_idx.pop()
            self.indent -= 1
            self.err(_unclosed_msg(tag, lno, cno))
        return '\n'.join(self.out_lines)

    def parse_comment(self):
        """Emit the comment starting at self.pos on a line of its own.

        Short single-line comments are normalised to one space of padding;
        long or multi-line comments keep their body exactly as written.
        An unterminated comment runs to end of input and is reported.
        """
        start = self.pos
        end = self.text.find('-->', start + 4)
        if end == -1:
            raw_inner = self.text[start + 4:]
            self.pos = self.length
            lno, cno = self.line_col(start)
            # Flush first so the error sits directly above the comment.
            self.flush_inline()
            self.err(f"unclosed comment opened at line {lno} col {cno}"
                     f" — missing closing delimiter")
        else:
            raw_inner = self.text[start + 4:end]
            self.pos = end + 3

        inner = raw_inner.strip()
        if '\n' in inner or len(inner) > 60:
            self.emit(f"{self.ind()}<!--{raw_inner}-->")
        else:
            self.emit(f"{self.ind()}<!-- {inner} -->")

    def parse_doctype(self):
        """Emit a '<!...>' declaration verbatim on its own line."""
        m = _DOCTYPE_RE.match(self.text, self.pos)
        if m:
            self.emit(m.group(0))
            self.pos = m.end()
        else:
            # No closing '>' anywhere: keep the '<' as text rather than
            # dropping it.
            self._emit_stray_char()

    def parse_opening_tag(self):
        """Handle the opening tag at self.pos according to its category."""
        m = _OPEN_TAG_RE.match(self.text, self.pos)
        if not m:
            self._emit_stray_char()
            return

        tag = m.group(1).lower()
        attr_str = m.group(2).strip()
        closer = m.group(3).strip()
        lno, cno = self.line_col()

        # Check parent relationship
        allowed = REQUIRED_PARENTS.get(tag)
        if allowed is not None:
            parent = self.stack[-1][0] if self.stack else None
            if parent not in allowed:
                loc = f"inside <{parent}>" if parent else "with no parent"
                self.err(f"<{tag}> at line {lno} col {cno} should be inside "
                         f"{sorted(allowed)} but is {loc}")

        self.pos = m.end()
        is_void = tag in VOID_TAGS
        tag_open = f"<{tag} {attr_str}" if attr_str else f"<{tag}"

        if is_void or closer == '/>':
            suffix = ">" if is_void else " />"
            if tag in INLINE_TAGS:
                self.emit_inline(tag_open + suffix)
                # e.g. '<img ...> caption' must not become '<img ...>caption'
                self._keep_space_after()
            else:
                self.emit(self.ind() + tag_open + suffix)
            return

        # Script / style: preserve body verbatim (always block-level)
        if tag in RAW_TEXT_TAGS:
            self._emit_raw_block(tag, tag_open, lno, cno)
            return

        # pre / textarea: whitespace is significant, copy untouched
        if tag in PRESERVED_TAGS:
            self._emit_preserved(tag, tag_open, lno, cno)
            return

        # Inline tag: gather its whole content as one inline fragment
        if tag in INLINE_TAGS:
            fragment = self._parse_inline_tag(tag, tag_open, lno, cno)
            self.emit_inline(fragment)
            self._keep_space_after()
            return

        # Block-inline tag: collect content on one line but always its own line
        if tag in BLOCK_INLINE_TAGS:
            self.flush_inline()
            fragment = self._parse_inline_tag(tag, tag_open, lno, cno)
            self.out_lines.append(self.ind() + fragment)
            return

        # Mixed tag (e.g. <li>): seed inline_buf with the opener so that
        # inline children accumulate on the same line; block children will
        # trigger flush_inline() themselves when they are encountered.
        if tag in MIXED_TAGS:
            self.flush_inline()
            self.inline_buf = self.ind() + tag_open + ">"
            self.stack.append((tag, lno, cno))
            self.indent += 1
            # Record a sentinel: None means opener is still in inline_buf
            self._mixed_opener_idx.append(None)
            return

        # Block tag: flush any pending inline, emit on its own line, push stack
        self.emit(self.ind() + tag_open + ">")
        self.stack.append((tag, lno, cno))
        self._mixed_opener_idx.append(None)
        self.indent += 1

    def _take_raw_body(self, tag):
        """Consume source up to and including the closer for *tag*.

        Returns:
            (body, closed): the text before the closer, and whether a closer
            was found. Without one the body is the rest of the input.
        """
        end_pat = re.compile(rf'</{re.escape(tag)}\s*>', re.IGNORECASE)
        em = end_pat.search(self.text, self.pos)
        if em:
            body = self.text[self.pos:em.start()]
            self.pos = em.end()
            return body, True
        body = self.text[self.pos:]
        self.pos = self.length
        return body, False

    def _emit_raw_block(self, tag, tag_open, lno, cno):
        """Emit a script/style element with its body lines copied as-is.

        Body lines keep their own indentation (only trailing whitespace is
        removed); blank lines at the very start and end of the body are
        dropped, and an empty body collapses the element onto one line.
        """
        self.flush_inline()
        body, closed = self._take_raw_body(tag)
        lines = [ln.rstrip() for ln in body.split('\n')]
        while lines and not lines[0]:
            lines.pop(0)
        while lines and not lines[-1]:
            lines.pop()

        if closed and not lines:
            self.emit(f"{self.ind()}{tag_open}></{tag}>")
            return

        self.emit(f"{self.ind()}{tag_open}>")
        for ln in lines:
            self.emit(ln)
        if closed:
            self.emit(f"{self.ind()}</{tag}>")
        else:
            # The remaining input was still written out above, so nothing
            # is lost; only the missing closer is reported.
            self.err(_unclosed_msg(tag, lno, cno))

    def _emit_preserved(self, tag, tag_open, lno, cno):
        """Emit a pre/textarea element with its body byte-for-byte intact."""
        is_inline = tag in INLINE_TAGS
        if not is_inline:
            self.flush_inline()
        body, closed = self._take_raw_body(tag)
        if not closed:
            self.err(_unclosed_msg(tag, lno, cno))
        fragment = f"{tag_open}>{body}</{tag}>"
        if is_inline:
            self.emit_inline(fragment)
            self._keep_space_after()
        else:
            self.emit(self.ind() + fragment)

    def _parse_inline_tag(self, tag, tag_open_noind, lno, cno):
        """
        Consume everything up to the matching </tag> and return the whole
        element as a single string fragment.

        Nested tags of any kind are copied through (attributes trimmed, void
        tags normalised); same-named nested tags are depth-counted. Reaching
        end-of-input before the closer is reported as an error and the closer
        is added. Whitespace between tokens is normalised to a single space
        via a pending_space flag, except that whitespace directly before the
        final closer is dropped. Whitespace inside a text run is left as-is.
        """
        text = self.text
        parts = [tag_open_noind + ">"]
        pending_space = False   # True when whitespace was seen since last token
        depth = 1

        def add(piece, closing=False):
            # Emit the pending space before the next token (but not before
            # closing the tag we opened, where trailing space is unwanted).
            nonlocal pending_space
            if pending_space and not closing:
                parts.append(' ')
            pending_space = False
            parts.append(piece)

        while self.pos < self.length and depth > 0:
            # Whitespace: set flag, do not emit yet
            m = _WS_RE.match(text, self.pos)
            if m:
                self.pos = m.end()
                pending_space = True
                continue

            # Closing tag
            cm = _CLOSE_TAG_RE.match(text, self.pos)
            if cm:
                ctag = cm.group(1).lower()
                self.pos = cm.end()
                if ctag == tag:
                    depth -= 1
                    add(f"</{tag}>", closing=(depth == 0))
                else:
                    add(f"</{ctag}>")
                continue

            # Opening tag
            om = _OPEN_TAG_RE.match(text, self.pos)
            if om:
                itag = om.group(1).lower()
                iattrs = om.group(2).strip()
                icloser = om.group(3).strip()
                self.pos = om.end()
                tag_str = f"<{itag} {iattrs}" if iattrs else f"<{itag}"
                if itag in VOID_TAGS:
                    add(tag_str + ">")
                elif icloser == '/>':
                    add(tag_str + " />")
                else:
                    add(tag_str + ">")
                    if itag == tag:
                        depth += 1
                continue

            # Comment
            if text.startswith('<!--', self.pos):
                end = text.find('-->', self.pos + 4)
                if end == -1:
                    add(text[self.pos:])
                    self.pos = self.length
                else:
                    add(text[self.pos:end + 3])
                    self.pos = end + 3
                continue

            # Plain text
            tm = _TEXT_RE.match(text, self.pos)
            if tm:
                raw = tm.group(0)
                chunk = raw.strip()
                if chunk:
                    add(chunk)
                # The run swallowed its own trailing whitespace; carry it
                # over so 'word <b>x</b>' keeps the space before the tag.
                if raw[-1].isspace():
                    pending_space = True
                self.pos = tm.end()
                continue

            # A '<' that starts no recognisable token: copy it through
            add(text[self.pos])
            self.pos += 1

        if depth > 0:
            self.err(_unclosed_msg(tag, lno, cno))
            parts.append(f"</{tag}>")

        return ''.join(parts)

    def parse_closing_tag(self):
        """Handle the closing tag at self.pos and unwind the open-tag stack.

        A closer with no open match is reported and dropped. Tags opened
        after the match are force-closed with an error each (no closing
        tags are written for them).
        """
        m = _CLOSE_TAG_RE.match(self.text, self.pos)
        if not m:
            self._emit_stray_char()
            return

        tag = m.group(1).lower()
        lno, cno = self.line_col()
        self.pos = m.end()

        # Find matching opener on the stack
        stack_idx = next(
            (i for i in range(len(self.stack) - 1, -1, -1)
             if self.stack[i][0] == tag),
            None
        )

        if stack_idx is None:
            self.err(f"</{tag}> at line {lno} col {cno} has no matching "
                     f"opening tag")
            return

        # Auto-close any unclosed tags above the match
        if len(self.stack) - 1 > stack_idx:
            # Write out the unclosed element's pending content first so the
            # error comments follow it instead of preceding it.
            self.flush_inline()
        while len(self.stack) - 1 > stack_idx:
            utag, u_lno, u_cno = self.stack.pop()
            if self._mixed_opener_idx:
                self._mixed_opener_idx.pop()
            self.indent -= 1
            self.err(f"missing </{utag}> for <{utag}> opened at line "
                     f"{u_lno} col {u_cno}, forced closed by "
                     f"</{tag}> at line {lno} col {cno}")

        mixed_opener_idx = (self._mixed_opener_idx.pop()
                            if self._mixed_opener_idx else None)
        self.stack.pop()
        self.indent -= 1

        is_mixed = tag in MIXED_TAGS
        if (is_mixed and self.inline_buf is not None
                and mixed_opener_idx is None):
            # Opener is still in inline_buf — append closer on the same line
            self.inline_buf = self.inline_buf.rstrip() + f"</{tag}>"
            self.flush_inline()
        elif (is_mixed and mixed_opener_idx is not None
                and mixed_opener_idx == len(self.out_lines) - 1
                and self.inline_buf is None):
            # Opener was the very last flushed line and nothing followed —
            # append closer directly to that line
            self.out_lines[-1] = self.out_lines[-1].rstrip() + f"</{tag}>"
        else:
            self.flush_inline()
            self.emit(f"{self.ind()}</{tag}>")

    def parse_text(self):
        """Append the text run at self.pos to the inline buffer."""
        m = _TEXT_RE.match(self.text, self.pos)
        if not m:
            self.pos += 1
            return

        raw = m.group(0)
        text = raw.strip()
        if text:
            # Preserve a single inter-token space at each boundary so
            # that text surrounding an inline tag keeps its spaces,
            # e.g. "word <b>...</b> word". Only add a leading space
            # when the inline buffer doesn't already end with one.
            if raw[0].isspace():
                buf = self.inline_buf or ''
                if buf and not buf.endswith(' '):
                    text = ' ' + text
            if raw[-1].isspace():
                text = text + ' '
            self.emit_inline(text)
        elif self.inline_buf is not None:
            # Whitespace-only text node between tags: ensure one space
            if not self.inline_buf.endswith(' '):
                self.inline_buf += ' '
        self.pos = m.end()


# ----------------------------------------------------------------------

def main():
    """Command-line entry point: format one HTML file to stdout."""
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='HTML Pretty Printer — indents HTML and reports errors.'
    )
    parser.add_argument('html_file', help='HTML file to pretty-print')
    parser.add_argument('-i', '--indent', type=int, default=2, metavar='N',
                        help='spaces per indent level (default: 2)')
    parser.add_argument('-l', '--length', type=int, default=100, metavar='N',
                        help='soft max line length for collapsing pairs '
                             '(default: 100)')
    args = parser.parse_args()

    if args.indent < 0:
        print("Error: indent must be >= 0", file=sys.stderr)
        sys.exit(1)
    if args.length < 1:
        print("Error: line length must be >= 1", file=sys.stderr)
        sys.exit(1)

    try:
        # errors='replace' keeps the tool usable on files with stray bytes
        with open(args.html_file, 'r', encoding='utf-8',
                  errors='replace') as f:
            html_text = f.read()
    except OSError as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)

    printer = HTMLPrettyPrinter(html_text, indent_size=args.indent,
                                max_len=args.length)
    print(printer.parse())


if __name__ == '__main__':
    main()