"""HTML tree walking for browser-cli Markdown rendering."""

from __future__ import annotations

import re
from collections.abc import Callable
from html.parser import HTMLParser


def _normalize_text(value: str | None) -> str:
    """Collapse every whitespace run (newlines included) to one space and trim."""
    return re.sub(r"\s+", " ", value or "").strip()


def _normalize_inline(value: str) -> str:
    """Normalise horizontal whitespace while keeping explicit line breaks.

    Non-breaking spaces become plain spaces, runs of spaces/tabs collapse to
    one space, and spaces surrounding a newline are dropped.
    """
    value = value.replace("\xa0", " ")
    value = re.sub(r"[ \t\r\f\v]+", " ", value)
    value = re.sub(r" *\n *", "\n", value)
    return value.strip()


def _collapse_blank_lines(value: str) -> str:
    """Strip trailing blanks on each line and cap blank-line runs at one."""
    value = re.sub(r"[ \t]+\n", "\n", value)
    value = re.sub(r"\n{3,}", "\n\n", value)
    return value.strip()


def _escape_markdown(text: str) -> str:
    """Backslash-escape backslashes, backticks and square brackets."""
    return re.sub(r"([\\`[\]])", r"\\\1", text)


def _escape_table_cell(text: str) -> str:
    """Make *text* safe for a single-line Markdown table cell."""
    return text.replace("|", r"\|").replace("\n", " ").strip()


class _HtmlNode:
    """A node of the parsed tree: either an element (``tag``) or text (``text``)."""

    def __init__(
        self,
        tag: str | None = None,
        attrs: dict[str, str | None] | None = None,
        text: str | None = None,
    ) -> None:
        self.tag = tag  # lower-cased element name, None for text nodes
        self.attrs = attrs or {}
        self.text = text  # raw character data, None for element nodes
        self.children: list[_HtmlNode] = []


class _HtmlTreeBuilder(HTMLParser):
    """Build a lightweight ``_HtmlNode`` tree from an HTML string.

    The finished tree hangs off ``self.root``, a synthetic ``document`` node.
    """

    # Void elements never have content or an end tag, so they must not be
    # pushed on the open-element stack; otherwise the siblings that follow
    # would be attached to them as children.
    _VOID_TAGS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.root = _HtmlNode(tag="document")
        self._stack = [self.root]

    def handle_starttag(self, tag, attrs):
        """Attach a new element to the current parent and open it if non-void."""
        node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
        self._stack[-1].children.append(node)
        if node.tag not in self._VOID_TAGS:
            self._stack.append(node)

    def handle_startendtag(self, tag, attrs):
        """Attach a self-closing element (``<x/>``) without opening it."""
        node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
        self._stack[-1].children.append(node)

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*, and everything inside it.

        An end tag with no matching open element is ignored. The search stops
        before index 0 so the synthetic root is never popped.
        """
        lowered = tag.lower()
        for index in range(len(self._stack) - 1, 0, -1):
            if self._stack[index].tag == lowered:
                del self._stack[index:]
                break

    def handle_data(self, data):
        """Attach non-empty character data as a text node."""
        if data:
            self._stack[-1].children.append(_HtmlNode(text=data))


def _iter_descendants(node):
    """Yield every descendant of *node* in document (pre-order) order."""
    for child in getattr(node, "children", []):
        yield child
        yield from _iter_descendants(child)


def _has_class(node: _HtmlNode, class_name: str) -> bool:
    """Return True if *class_name* is one of the node's ``class`` tokens."""
    classes = (node.attrs.get("class") or "").split()
    return class_name in classes


def _is_code_block_node(node: _HtmlNode | None) -> bool:
    """Return True for ``<pre>`` or any element with ``data-is-code-block-view="true"``."""
    if not node or not node.tag:
        return False
    if node.attrs.get("data-is-code-block-view") == "true":
        return True
    return node.tag == "pre"


def _inline_children(node: _HtmlNode) -> str:
    """Render all children of *node* inline and normalise the whitespace."""
    return _normalize_inline("".join(_inline_text(child) for child in node.children))


def _inline_text(node: _HtmlNode) -> str:
    """Render *node* as inline Markdown.

    Text nodes are Markdown-escaped; ``br``, ``img``, ``a``, ``code``,
    ``strong``/``b`` and ``em``/``i`` get their Markdown equivalents. Any other
    element is transparent: its children are rendered in order, with a newline
    appended after block-like children.
    """
    if node.text is not None:
        return _escape_markdown(node.text)
    tag = node.tag
    if not tag:
        return ""
    if tag == "br":
        return "\n"
    if tag == "img":
        src = node.attrs.get("src") or ""
        if not src:
            return ""
        alt = _normalize_text(node.attrs.get("alt") or "")
        return f"![{_escape_markdown(alt)}]({src})"
    if tag == "a":
        text = _inline_children(node)
        href = node.attrs.get("href") or ""
        # Without an href there is nothing to link to; keep just the text.
        return f"[{text or href}]({href})" if href else text
    if tag == "code":
        text = _inline_children(node)
        if not text:
            return ""
        # Computed outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        # NOTE(review): the text has already been through _escape_markdown, so
        # a literal backtick ends up preceded by three backslashes -- confirm
        # that this double escaping is what downstream cleaning expects.
        escaped = text.replace("`", r"\\`")
        return f"`{escaped}`"
    if tag in {"strong", "b"}:
        text = _inline_children(node)
        return f"**{text}**" if text else ""
    if tag in {"em", "i"}:
        text = _inline_children(node)
        return f"*{text}*" if text else ""
    chunks = []
    for child in node.children:
        rendered = _inline_text(child)
        if rendered:
            chunks.append(rendered)
        # Keep block-like children on their own line even in inline context.
        if child.tag in {"p", "div", "table", "ul", "ol", "pre"}:
            chunks.append("\n")
    return "".join(chunks)


def _text_block(node: _HtmlNode) -> str:
    """Render the children of *node* as one block of inline Markdown."""
    return _collapse_blank_lines(_inline_children(node))


def _inner_text_preserve(node: _HtmlNode) -> str:
    """Return the concatenated raw text under *node*, whitespace untouched.

    ``<br>`` contributes nothing here; no escaping is applied.
    """
    if node.text is not None:
        return node.text
    if not node.tag:
        return ""
    if node.tag == "br":
        return ""
    return "".join(_inner_text_preserve(child) for child in node.children)


def _table_to_markdown(node: _HtmlNode) -> str:
    """Render a ``<table>`` as a pipe table, or ``""`` if it has no cells.

    Every ``<tr>`` descendant with at least one ``td``/``th`` child becomes a
    row; short rows are padded to the widest row. The first row is used as the
    header when the table has a ``<thead>`` child or the first row contains a
    ``<th>``. A blank first row is skipped in favour of the second one.
    Otherwise an empty header row is emitted and all rows go in the body.
    """
    rows = []
    for descendant in _iter_descendants(node):
        if descendant.tag != "tr":
            continue
        row = [
            _escape_table_cell(_text_block(cell))
            for cell in descendant.children
            if cell.tag in {"td", "th"}
        ]
        if row:
            rows.append(row)
    if not rows:
        return ""

    widths = max(len(row) for row in rows)
    normalized_rows = [row + [""] * (widths - len(row)) for row in rows]
    headers = normalized_rows[0]
    body_rows = normalized_rows[1:]

    first_row_blank = all(not cell.strip() for cell in headers)
    if first_row_blank and len(normalized_rows) > 1:
        headers = normalized_rows[1]
        body_rows = normalized_rows[2:]

    has_thead = any(child.tag == "thead" for child in node.children)
    first_row = next(
        (child for child in _iter_descendants(node) if child.tag == "tr"), None
    )
    first_row_has_th = bool(
        first_row and any(child.tag == "th" for child in first_row.children)
    )
    if not (has_thead or first_row_has_th or first_row_blank):
        # No evidence of a header row: Markdown still requires one, so emit
        # an empty header and keep every source row as data.
        headers = [""] * widths
        body_rows = normalized_rows

    separator = ["---"] * widths
    lines = [
        f"| {' | '.join(headers)} |",
        f"| {' | '.join(separator)} |",
    ]
    lines.extend(f"| {' | '.join(row)} |" for row in body_rows)
    return "\n".join(lines)


def _list_to_markdown(node: _HtmlNode, depth: int = 0) -> str:
    """Render a ``<ul>``/``<ol>`` (and nested lists) as Markdown list lines.

    Only ``<li>`` children are considered. Nested ``ul``/``ol`` elements inside
    an item are rendered after the item's own text, one level deeper.
    """
    ordered = node.tag == "ol"
    # NOTE(review): one space of indent per nesting level; CommonMark nests a
    # child list only when it is indented to the parent item's content column
    # -- confirm this width is intended.
    indent = " " * depth
    items = []
    index = 1
    for child in node.children:
        if child.tag != "li":
            continue
        marker = f"{index}. " if ordered else "- "
        index += 1
        content = []
        nested = []
        for item_child in child.children:
            if item_child.tag in {"ul", "ol"}:
                nested.append(_list_to_markdown(item_child, depth + 1))
            else:
                content.append(_inline_text(item_child))
        line = _collapse_blank_lines(_normalize_inline("".join(content)))
        if line:
            line_parts = line.splitlines()
            items.append(f"{indent}{marker}{line_parts[0]}")
            # Continuation lines align with the text after the marker.
            continuation_indent = f"{indent}{' ' * len(marker)}"
            items.extend(f"{continuation_indent}{part}" for part in line_parts[1:])
        items.extend(block for block in nested if block)
    return "\n".join(items)


def _code_block_to_markdown(node: _HtmlNode) -> str:
    """Render a code-block node as a fenced block, or ``""`` if it is empty.

    ``<pre>`` uses its raw inner text. Any other code-block node contributes
    one line per descendant element carrying the ``cm-line`` class.
    """
    if node.tag == "pre":
        text = _inner_text_preserve(node).rstrip("\n")
        return f"```\n{text}\n```" if text else ""
    lines = [
        _inner_text_preserve(descendant)
        for descendant in _iter_descendants(node)
        if descendant.tag and _has_class(descendant, "cm-line")
    ]
    code = "\n".join(lines).rstrip("\n")
    return f"```\n{code}\n```" if code else ""


def _block_to_markdown(node: _HtmlNode) -> str:
    """Render *node* as block-level Markdown.

    Code blocks, tables, lists, headings, paragraphs/figcaptions, blockquotes,
    ``hr`` and ``img`` have dedicated renderings. Any other element is treated
    as a container: each child is rendered as a block of its own and the
    non-empty results are separated by blank lines. If no child produces
    output, the node is rendered as inline text instead.

    NOTE(review): because container children are rendered one block each, an
    inline element such as ``<b>`` sitting directly inside e.g. a ``<div>``
    becomes its own paragraph and loses its inline markup -- confirm intended.
    """
    if node.text is not None:
        return _normalize_text(node.text)
    if not node.tag:
        return ""
    if _is_code_block_node(node):
        return _code_block_to_markdown(node)
    if node.tag == "table":
        return _table_to_markdown(node)
    if node.tag in {"ul", "ol"}:
        return _list_to_markdown(node)
    if re.fullmatch(r"h[1-6]", node.tag):
        text = _text_block(node)
        return f"{'#' * int(node.tag[1])} {text}" if text else ""
    if node.tag in {"p", "figcaption"}:
        return _text_block(node)
    if node.tag == "blockquote":
        content = _collapse_blank_lines(
            "\n\n".join(
                filter(None, (_block_to_markdown(child) for child in node.children))
            )
        )
        if not content:
            return ""
        return "\n".join(f"> {line}" if line else ">" for line in content.splitlines())
    if node.tag == "hr":
        return "---"
    if node.tag == "img":
        return _inline_text(node)
    child_blocks = [
        block for block in (_block_to_markdown(child) for child in node.children) if block
    ]
    if child_blocks:
        return _collapse_blank_lines("\n\n".join(child_blocks))
    return _text_block(node)


def convert_html_to_markdown(
    html: str | None, clean_markdown_output: Callable[[str], str]
) -> str:
    """Convert an HTML string to Markdown.

    Args:
        html: HTML source; ``None`` or ``""`` is treated as an empty document.
        clean_markdown_output: Callable applied to the generated Markdown; its
            return value is returned unchanged.

    Returns:
        Whatever ``clean_markdown_output`` returns for the rendered Markdown.
    """
    parser = _HtmlTreeBuilder()
    parser.feed(html or "")
    markdown = _block_to_markdown(parser.root)
    return clean_markdown_output(markdown)