"""HTML → Markdown conversion and Markdown clean-up.

Pure, presentation-agnostic text transforms shared by the SDK
(:meth:`browser_cli.sdk.dom.ExtractNS.markdown`) and the
``extract markdown`` CLI command. No Click/Rich/IPC dependencies — just an
HTML tree walker plus a set of repair passes for the markdown the page (or a
markdown editor like Obsidian/CodeMirror) hands back.
"""

from __future__ import annotations

import re

from browser_cli.markdown.html import convert_html_to_markdown

# A fenced code block. The first alternative recognises an *empty* block (the
# closing fence sits on the line directly below the opening one); without it
# the pattern needs two newlines between the fences, so the ``header\n```` form
# that `_clean_markdown_output` itself emits for empty blocks would not be
# re-parsed and its opener would pair with the opener of the next block.
_FENCE_RE = re.compile(
    r"```[^\n`]*\n"
    r"(?:```(?=[ \t]*(?:\n|\Z))"
    r"|.*?\n```)",
    re.DOTALL,
)
_ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])")
_TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$")
_LIST_ITEM_RE = re.compile(r"^(\s*)([-*+]|\d+\.)\s+.+$")
_LIST_MARKER_RE = re.compile(r"^(\s*)([-*+]|\d+\.)\s+")
_DASH_ITEM_RE = re.compile(r"^(\s*)-\s+(.*)$")
_INDENTED_CONNECTOR_RE = re.compile(r"^\s+[│▼]\s*$")


def _collapse_blank_lines(value: str) -> str:
    """Drop trailing spaces/tabs on each line, squeeze 3+ newlines to 2, strip."""
    value = re.sub(r"[ \t]+\n", "\n", value)
    value = re.sub(r"\n{3,}", "\n\n", value)
    return value.strip()


def _parse_table_row(line: str) -> list[str] | None:
    """Return the stripped cells of a ``| a | b |`` row, or ``None``.

    A line counts as a row only when, after stripping, it both starts and ends
    with a pipe.
    """
    stripped = line.strip()
    if not stripped.startswith("|") or not stripped.endswith("|"):
        return None
    return [cell.strip() for cell in stripped.strip("|").split("|")]


def _has_blank_header(header: str, separator: str, first_row: str) -> bool:
    """Tell whether three lines form a table whose header row is entirely empty.

    True when *header* parses as a row with only empty cells, *separator* is a
    ``|---|`` line, and *first_row* parses as a row with at least one
    non-empty cell.
    """
    header_cells = _parse_table_row(header)
    if header_cells is None or any(header_cells):
        return False
    if not _TABLE_SEPARATOR_RE.match(separator.strip()):
        return False
    row_cells = _parse_table_row(first_row)
    return row_cells is not None and any(row_cells)


def _repair_table_headers(lines: list[str]) -> list[str]:
    """Promote the first body row to header when a table's header is blank.

    For every ``blank header / separator / row`` triple the blank header is
    dropped and the row is emitted above the separator. All returned lines are
    stripped.
    """
    repaired: list[str] = []
    total = len(lines)
    index = 0
    while index < total:
        if index + 2 < total and _has_blank_header(
            lines[index], lines[index + 1], lines[index + 2]
        ):
            repaired.append(lines[index + 2].strip())
            repaired.append(lines[index + 1].strip())
            index += 3
            continue
        repaired.append(lines[index].strip())
        index += 1
    return repaired


def _repair_list_continuations(lines: list[str]) -> list[str]:
    """Indent the single line that directly follows a list item.

    Every line is stripped. When a non-blank line that does not itself start a
    block (fence, heading, quote, table row, list item) comes right after a
    list item, it is prefixed with enough spaces to sit under the item's text.
    Only that one following line is treated as a continuation.
    """
    repaired: list[str] = []
    # Indent to apply to a continuation line; None when the previous line was
    # not a list item.
    pending_indent: str | None = None
    for line in lines:
        stripped = line.strip()
        starts_block = bool(
            stripped.startswith(("```", "#", ">", "|"))
            or _LIST_MARKER_RE.match(stripped)
        )
        if pending_indent is not None and stripped and not starts_block:
            repaired.append(f"{pending_indent}{stripped}")
            pending_indent = None
            continue
        repaired.append(stripped)
        item = _LIST_ITEM_RE.match(stripped)
        if item:
            pending_indent = f"{item.group(1)}{' ' * (len(item.group(2)) + 1)}"
        else:
            pending_indent = None
    return repaired


def _repair_flattened_diagram(text: str) -> str:
    """Re-insert line breaks into a box-drawing diagram squashed onto one line.

    Text that already contains a newline, or that has fewer than two of the
    characters ``│▼├└``, is returned unchanged. Otherwise breaks are inserted
    around connectors, before branch glyphs and before ``(``, and blank lines
    are dropped.
    """
    if "\n" in text:
        return text
    if sum(text.count(char) for char in "│▼├└") < 2:
        return text
    text = re.sub(r"\s{2,}([│▼])", r"\n \1", text)
    text = re.sub(r"([│▼])\s{2,}", r"\1\n", text)
    text = re.sub(r"([│▼])(?=[^\s\n│▼├└])", r"\1\n", text)
    text = re.sub(r"(?<=[^\s\n])([├└])", r"\n\1", text)
    text = re.sub(r"([^\s\n])(\()", r"\1\n\2", text)
    return "\n".join(line.rstrip() for line in text.splitlines() if line.strip())


def _convert_dash_lists_to_branches(lines: list[str]) -> list[str]:
    """Rewrite runs of ``- item`` lines as ``├ item`` / ``└ item`` branches.

    A run is a maximal sequence of consecutive dash items sharing exactly the
    same leading indent; the last item of each run gets ``└``, the others
    ``├``. Other lines pass through untouched.
    """
    converted: list[str] = []
    index = 0
    while index < len(lines):
        first = _DASH_ITEM_RE.match(lines[index])
        if not first:
            converted.append(lines[index])
            index += 1
            continue
        indent = first.group(1)
        sibling_re = re.compile(rf"^{re.escape(indent)}-\s+(.*)$")
        items: list[str] = []
        # The line that opened the run always matches, so the run is never empty.
        while index < len(lines):
            sibling = sibling_re.match(lines[index])
            if not sibling:
                break
            items.append(sibling.group(1))
            index += 1
        last = len(items) - 1
        for position, item in enumerate(items):
            branch = "└" if position == last else "├"
            converted.append(f"{indent}{branch} {item}")
    return converted


def _clean_code_block(code: str) -> str:
    """Normalise the body of a fenced code block.

    Right-strips every line, removes leading and trailing blank lines, repairs
    a flattened diagram, gives unindented lone ``│`` / ``▼`` connector lines a
    one-space indent, and converts dash lists to branch glyphs.
    """
    trimmed = "\n".join(line.rstrip() for line in code.splitlines()).strip("\n")
    lines = _repair_flattened_diagram(trimmed).splitlines()
    lines = [
        f" {line.strip()}"
        if line.strip() in {"│", "▼"} and not _INDENTED_CONNECTOR_RE.match(line)
        else line
        for line in lines
    ]
    # NOTE(review): this runs on every fenced block regardless of its info
    # string, so dash-led lines in e.g. YAML or diff blocks are rewritten too —
    # confirm that is intended.
    return "\n".join(_convert_dash_lists_to_branches(lines))


def _clean_prose(prose: str) -> str:
    """Apply the prose repair passes to text that lies outside code fences.

    Unescapes ``\\_`` and ``\\-``, strips every line, repairs blank table
    headers and list continuations, then collapses blank lines. Returns ``""``
    when nothing is left.
    """
    unescaped = _ESCAPED_MARKDOWN_RE.sub(r"\1", prose)
    lines = [line.strip() for line in unescaped.splitlines()]
    lines = _repair_table_headers(lines)
    lines = _repair_list_continuations(lines)
    return _collapse_blank_lines("\n".join(lines))


def _clean_markdown_output(markdown: str | None) -> str:
    """Clean a Markdown document, treating fenced blocks and prose separately.

    Prose between fences goes through `_clean_prose`; each fence body goes
    through `_clean_code_block`. Non-empty pieces are joined with one blank
    line. A falsy *markdown* yields ``""``.
    """
    if not markdown:
        return ""
    pieces: list[str] = []
    cursor = 0
    for match in _FENCE_RE.finditer(markdown):
        pieces.append(_clean_prose(markdown[cursor:match.start()]))
        # Split the fence into its opening line and body; the closing ``` line
        # is discarded and re-emitted below.
        header, _, tail = match.group(0).partition("\n")
        body, _, _ = tail.rpartition("\n")
        cleaned_body = _clean_code_block(body)
        pieces.append(
            f"{header}\n{cleaned_body}\n```" if cleaned_body else f"{header}\n```"
        )
        cursor = match.end()
    pieces.append(_clean_prose(markdown[cursor:]))
    return "\n\n".join(piece for piece in pieces if piece)


def _convert_html_to_markdown(html: str) -> str:
    """Run *html* through the HTML converter, passing it the Markdown cleaner."""
    return convert_html_to_markdown(html, _clean_markdown_output)


def render_markdown(raw: str | None) -> str:
    """Normalize *raw* extractor output into clean Markdown.

    If the payload looks like HTML (first non-space char is ``<``) it is run
    through the HTML→Markdown converter; otherwise it is treated as Markdown
    and only the clean-up/repair passes are applied. ``None`` is treated as an
    empty string.
    """
    raw = raw or ""
    if raw.lstrip().startswith("<"):
        return _convert_html_to_markdown(raw)
    return _clean_markdown_output(raw)