"""HTML → Markdown conversion and Markdown clean-up. Pure, presentation-agnostic text transforms shared by the SDK (:meth:`browser_cli.sdk.dom.ExtractNS.markdown`) and the ``extract markdown`` CLI command. No Click/Rich/IPC dependencies — just an HTML tree walker plus a set of repair passes for the markdown the page (or a markdown editor like Obsidian/CodeMirror) hands back. """ from __future__ import annotations import re from html.parser import HTMLParser _FENCE_RE = re.compile(r"```(?:[^\n`]*)\n.*?\n```", re.DOTALL) _ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])") _TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$") class _HtmlNode: def __init__(self, tag=None, attrs=None, text=None): self.tag = tag self.attrs = attrs or {} self.text = text self.children = [] class _HtmlTreeBuilder(HTMLParser): _VOID_TAGS = {"br", "hr", "img"} def __init__(self): super().__init__(convert_charrefs=True) self.root = _HtmlNode(tag="document") self._stack = [self.root] def handle_starttag(self, tag, attrs): node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs)) self._stack[-1].children.append(node) if node.tag not in self._VOID_TAGS: self._stack.append(node) def handle_startendtag(self, tag, attrs): node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs)) self._stack[-1].children.append(node) def handle_endtag(self, tag): lowered = tag.lower() for index in range(len(self._stack) - 1, 0, -1): if self._stack[index].tag == lowered: del self._stack[index:] break def handle_data(self, data): if data: self._stack[-1].children.append(_HtmlNode(text=data)) def _normalize_text(value): return re.sub(r"\s+", " ", value or "").strip() def _normalize_inline(value): value = value.replace("\xa0", " ") value = re.sub(r"[ \t\r\f\v]+", " ", value) value = re.sub(r" *\n *", "\n", value) return value.strip() def _collapse_blank_lines(value): value = re.sub(r"[ \t]+\n", "\n", value) value = re.sub(r"\n{3,}", "\n\n", value) return value.strip() def _escape_markdown(text): return re.sub(r"([\\`[\]])", r"\\\1", text) def _escape_table_cell(text): return text.replace("|", r"\|").replace("\n", " ").strip() def _iter_descendants(node): for child in getattr(node, "children", []): yield child yield from _iter_descendants(child) def _has_class(node, class_name): classes = (node.attrs.get("class") or "").split() return class_name in classes def _is_code_block_node(node): if not node or not node.tag: return False if node.attrs.get("data-is-code-block-view") == "true": return True return node.tag == "pre" def _inline_text(node): if node.text is not None: return _escape_markdown(node.text) if not node.tag: return "" tag = node.tag if tag == "br": return "\n" if tag == "img": src = node.attrs.get("src") or "" alt = _normalize_text(node.attrs.get("alt") or "") if not src: return "" return f"![{_escape_markdown(alt)}]({src})" if alt else f"![]({src})" if tag == "a": text = _normalize_inline("".join(_inline_text(child) for child in node.children)) href = node.attrs.get("href") or "" return f"[{text or href}]({href})" if href else text if tag == "code": text = _normalize_inline("".join(_inline_text(child) for child in node.children)) return f"`{text.replace('`', r'\\`')}`" if text else "" if tag in {"strong", "b"}: text = _normalize_inline("".join(_inline_text(child) for child in node.children)) return f"**{text}**" if text else "" if tag in {"em", "i"}: text = _normalize_inline("".join(_inline_text(child) for child in node.children)) return f"*{text}*" if text else "" chunks = [] for child in node.children: rendered = _inline_text(child) if rendered: chunks.append(rendered) if child.tag in {"p", "div", "table", "ul", "ol", "pre"}: chunks.append("\n") return "".join(chunks) def _text_block(node): return _collapse_blank_lines(_normalize_inline("".join(_inline_text(child) for child in node.children))) def _inner_text_preserve(node): if node.text is not None: return node.text if not node.tag: return "" if node.tag == "br": return "" return "".join(_inner_text_preserve(child) for child in node.children) def _table_to_markdown(node): rows = [] for descendant in _iter_descendants(node): if descendant.tag != "tr": continue row = [] for cell in descendant.children: if cell.tag in {"td", "th"}: row.append(_escape_table_cell(_text_block(cell))) if row: rows.append(row) if not rows: return "" widths = max(len(row) for row in rows) normalized_rows = [row + [""] * (widths - len(row)) for row in rows] headers = normalized_rows[0] body_rows = normalized_rows[1:] first_row_blank = all(not cell.strip() for cell in headers) if first_row_blank and len(normalized_rows) > 1: headers = normalized_rows[1] body_rows = normalized_rows[2:] has_thead = any(child.tag == "thead" for child in node.children) first_row = next((child for child in _iter_descendants(node) if child.tag == "tr"), None) first_row_has_th = bool(first_row and any(child.tag == "th" for child in first_row.children)) if not (has_thead or first_row_has_th or first_row_blank): headers = [""] * widths body_rows = normalized_rows separator = ["---"] * widths lines = [ f"| {' | '.join(headers)} |", f"| {' | '.join(separator)} |", ] lines.extend(f"| {' | '.join(row)} |" for row in body_rows) return "\n".join(lines) def _list_to_markdown(node, depth=0): ordered = node.tag == "ol" items = [] index = 1 for child in node.children: if child.tag != "li": continue marker = f"{index}. " if ordered else "- " index += 1 content = [] nested = [] for item_child in child.children: if item_child.tag in {"ul", "ol"}: nested.append(_list_to_markdown(item_child, depth + 1)) else: content.append(_inline_text(item_child)) line = _collapse_blank_lines(_normalize_inline("".join(content))) indent = " " * depth if line: line_parts = line.splitlines() items.append(f"{indent}{marker}{line_parts[0]}") continuation_indent = f"{indent}{' ' * len(marker)}" items.extend(f"{continuation_indent}{part}" for part in line_parts[1:]) items.extend(block for block in nested if block) return "\n".join(items) def _code_block_to_markdown(node): if node.tag == "pre": text = _inner_text_preserve(node).rstrip("\n") return f"```\n{text}\n```" if text else "" lines = [] for descendant in _iter_descendants(node): if descendant.tag and _has_class(descendant, "cm-line"): lines.append(_inner_text_preserve(descendant)) code = "\n".join(lines).rstrip("\n") return f"```\n{code}\n```" if code else "" def _block_to_markdown(node): if node.text is not None: return _normalize_text(node.text) if not node.tag: return "" if _is_code_block_node(node): return _code_block_to_markdown(node) if node.tag == "table": return _table_to_markdown(node) if node.tag in {"ul", "ol"}: return _list_to_markdown(node) if re.fullmatch(r"h[1-6]", node.tag): text = _text_block(node) return f"{'#' * int(node.tag[1])} {text}" if text else "" if node.tag in {"p", "figcaption"}: return _text_block(node) if node.tag == "blockquote": content = _collapse_blank_lines("\n\n".join(filter(None, (_block_to_markdown(child) for child in node.children)))) return "\n".join(f"> {line}" if line else ">" for line in content.splitlines()) if content else "" if node.tag == "hr": return "---" if node.tag == "img": return _inline_text(node) child_blocks = [block for block in (_block_to_markdown(child) for child in node.children) if block] if child_blocks: return _collapse_blank_lines("\n\n".join(child_blocks)) return _text_block(node) def _parse_table_row(line): stripped = line.strip() if not stripped.startswith("|") or not stripped.endswith("|"): return None return [cell.strip() for cell in stripped.strip("|").split("|")] def _repair_table_headers(lines): repaired = [] index = 0 while index < len(lines): if ( index + 2 < len(lines) and _parse_table_row(lines[index]) is not None and _TABLE_SEPARATOR_RE.match(lines[index + 1].strip()) and _parse_table_row(lines[index + 2]) is not None ): first = _parse_table_row(lines[index]) third = _parse_table_row(lines[index + 2]) if first and all(not cell for cell in first) and any(cell for cell in third): repaired.append(lines[index + 2].strip()) repaired.append(lines[index + 1].strip()) index += 3 continue repaired.append(lines[index].strip()) index += 1 return repaired def _repair_list_continuations(lines): repaired = [] previous_was_list_item = False previous_continuation_indent = "" for line in lines: stripped = line.strip() list_match = re.match(r"^(\s*)([-*+]|\d+\.)\s+.+$", stripped) is_markdown_block_start = ( not stripped or stripped.startswith(("```", "#", ">", "|")) or _TABLE_SEPARATOR_RE.match(stripped) or re.match(r"^(\s*)([-*+]|\d+\.)\s+", stripped) ) if previous_was_list_item and stripped and not is_markdown_block_start: repaired.append(f"{previous_continuation_indent}{stripped}") previous_was_list_item = False continue repaired.append(stripped) if list_match: marker = list_match.group(2) base_indent = list_match.group(1) previous_continuation_indent = f"{base_indent}{' ' * (len(marker) + 1)}" previous_was_list_item = True else: previous_was_list_item = False return repaired def _repair_flattened_diagram(text): if "\n" in text: return text if sum(text.count(char) for char in "│▼├└") < 2: return text text = re.sub(r"\s{2,}([│▼])", r"\n \1", text) text = re.sub(r"([│▼])\s{2,}", r"\1\n", text) text = re.sub(r"([│▼])(?=[^\s\n│▼├└])", r"\1\n", text) text = re.sub(r"(?<=[^\s\n])([├└])", r"\n\1", text) text = re.sub(r"([^\s\n])(\()", r"\1\n\2", text) return "\n".join(line.rstrip() for line in text.splitlines() if line.strip()) def _convert_dash_lists_to_branches(lines): converted = [] index = 0 while index < len(lines): match = re.match(r"^(\s*)-\s+(.*)$", lines[index]) if not match: converted.append(lines[index]) index += 1 continue indent = match.group(1) items = [] while index < len(lines): next_match = re.match(rf"^{re.escape(indent)}-\s+(.*)$", lines[index]) if not next_match: break items.append(next_match.group(1)) index += 1 for item_index, item in enumerate(items): branch = "└" if item_index == len(items) - 1 else "├" converted.append(f"{indent}{branch} {item}") return converted def _clean_code_block(code): lines = [line.rstrip() for line in code.splitlines()] while lines and not lines[0].strip(): lines.pop(0) while lines and not lines[-1].strip(): lines.pop() flattened = _repair_flattened_diagram("\n".join(lines)) lines = flattened.splitlines() if flattened else [] lines = [ f" {line.strip()}" if line.strip() in {"│", "▼"} and not re.match(r"^\s+[│▼]\s*$", line) else line for line in lines ] lines = _convert_dash_lists_to_branches(lines) return "\n".join(lines) def _clean_markdown_output(markdown): if not markdown: return "" pieces = [] last_index = 0 for match in _FENCE_RE.finditer(markdown): prose = markdown[last_index:match.start()] if prose: cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", prose) lines = [line.strip() for line in cleaned.splitlines()] lines = _repair_table_headers(lines) lines = _repair_list_continuations(lines) cleaned = "\n".join(lines) cleaned = _collapse_blank_lines(cleaned) if cleaned: pieces.append(cleaned) fence = match.group(0) header, _, tail = fence.partition("\n") body, _, _ = tail.rpartition("\n") cleaned_body = _clean_code_block(body) pieces.append(f"{header}\n{cleaned_body}\n```" if cleaned_body else f"{header}\n```") last_index = match.end() trailing = markdown[last_index:] if trailing: cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", trailing) lines = [line.strip() for line in cleaned.splitlines()] lines = _repair_table_headers(lines) lines = _repair_list_continuations(lines) cleaned = "\n".join(lines) cleaned = _collapse_blank_lines(cleaned) if cleaned: pieces.append(cleaned) return "\n\n".join(piece for piece in pieces if piece) def _convert_html_to_markdown(html): parser = _HtmlTreeBuilder() parser.feed(html or "") markdown = _block_to_markdown(parser.root) return _clean_markdown_output(markdown) def render_markdown(raw: str | None) -> str: """Normalize *raw* extractor output into clean Markdown. If the payload looks like HTML (first non-space char is ``<``) it is run through the HTML→Markdown converter; otherwise it is treated as Markdown and only the clean-up/repair passes are applied. """ raw = raw or "" if raw.lstrip().startswith("<"): return _convert_html_to_markdown(raw) return _clean_markdown_output(raw)