browser-cli/browser_cli/markdown/render.py

"""HTML → Markdown conversion and Markdown clean-up.

Pure, presentation-agnostic text transforms shared by the SDK
(:meth:`browser_cli.sdk.dom.ExtractNS.markdown`) and the ``extract markdown``
CLI command. No Click/Rich/IPC dependencies — just an HTML tree walker plus a
set of repair passes for the markdown the page (or a markdown editor like
Obsidian/CodeMirror) hands back.
"""
from __future__ import annotations

import re

from browser_cli.markdown.html import convert_html_to_markdown

# Fenced code block: an opening ``` with an optional info string (no newline or
# backtick), a newline, a lazily matched body that may span lines (DOTALL), and
# a closing ``` that directly follows a newline.
_FENCE_RE = re.compile(r"```(?:[^\n`]*)\n.*?\n```", re.DOTALL)
# A backslash immediately followed by "_" or "-"; group 1 is the escaped char.
_ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])")
# Table delimiter row such as "| --- | :---: |": a leading pipe, then one or
# more cells of at least three dashes with an optional colon on either side.
_TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$")

def _collapse_blank_lines(value):
  value = re.sub(r"[ \t]+\n", "\n", value)
  value = re.sub(r"\n{3,}", "\n\n", value)
  return value.strip()

def _parse_table_row(line):
  stripped = line.strip()
  if not stripped.startswith("|") or not stripped.endswith("|"):
    return None
  return [cell.strip() for cell in stripped.strip("|").split("|")]

def _repair_table_headers(lines):
  repaired = []
  index = 0
  while index < len(lines):
    if (
      index + 2 < len(lines)
      and _parse_table_row(lines[index]) is not None
      and _TABLE_SEPARATOR_RE.match(lines[index + 1].strip())
      and _parse_table_row(lines[index + 2]) is not None
    ):
      first = _parse_table_row(lines[index])
      third = _parse_table_row(lines[index + 2])
      if first and all(not cell for cell in first) and any(cell for cell in third):
        repaired.append(lines[index + 2].strip())
        repaired.append(lines[index + 1].strip())
        index += 3
        continue
    repaired.append(lines[index].strip())
    index += 1
  return repaired

def _repair_list_continuations(lines):
  repaired = []
  previous_was_list_item = False
  previous_continuation_indent = ""

  for line in lines:
    stripped = line.strip()
    list_match = re.match(r"^(\s*)([-*+]|\d+\.)\s+.+$", stripped)
    is_markdown_block_start = (
      not stripped
      or stripped.startswith(("```", "#", ">", "|"))
      or _TABLE_SEPARATOR_RE.match(stripped)
      or re.match(r"^(\s*)([-*+]|\d+\.)\s+", stripped)
    )

    if previous_was_list_item and stripped and not is_markdown_block_start:
      repaired.append(f"{previous_continuation_indent}{stripped}")
      previous_was_list_item = False
      continue

    repaired.append(stripped)
    if list_match:
      marker = list_match.group(2)
      base_indent = list_match.group(1)
      previous_continuation_indent = f"{base_indent}{' ' * (len(marker) + 1)}"
      previous_was_list_item = True
    else:
      previous_was_list_item = False

  return repaired

def _repair_flattened_diagram(text):
  if "\n" in text:
    return text
  if sum(text.count(char) for char in "│▼├└") < 2:
    return text

  text = re.sub(r"\s{2,}([│▼])", r"\n   \1", text)
  text = re.sub(r"([│▼])\s{2,}", r"\1\n", text)
  text = re.sub(r"([│▼])(?=[^\s\n│▼├└])", r"\1\n", text)
  text = re.sub(r"(?<=[^\s\n])([├└])", r"\n\1", text)
  text = re.sub(r"([^\s\n])(\()", r"\1\n\2", text)
  return "\n".join(line.rstrip() for line in text.splitlines() if line.strip())

def _convert_dash_lists_to_branches(lines):
  converted = []
  index = 0
  while index < len(lines):
    match = re.match(r"^(\s*)-\s+(.*)$", lines[index])
    if not match:
      converted.append(lines[index])
      index += 1
      continue

    indent = match.group(1)
    items = []
    while index < len(lines):
      next_match = re.match(rf"^{re.escape(indent)}-\s+(.*)$", lines[index])
      if not next_match:
        break
      items.append(next_match.group(1))
      index += 1

    for item_index, item in enumerate(items):
      branch = "└" if item_index == len(items) - 1 else "├"
      converted.append(f"{indent}{branch} {item}")
  return converted

def _clean_code_block(code):
  """Normalize the body of a fenced code block.

  Trailing whitespace is trimmed from every line and blank lines at the top
  and bottom are dropped. The text is then passed through
  ``_repair_flattened_diagram``, any line consisting solely of an unindented
  ``│`` or ``▼`` is given a three-space indent, and dash lists are rewritten
  by ``_convert_dash_lists_to_branches``. Returns the joined text.
  """
  rows = [row.rstrip() for row in code.splitlines()]

  # Find the first and last non-blank rows and keep only that window.
  start, stop = 0, len(rows)
  while start < stop and not rows[start].strip():
    start += 1
  while stop > start and not rows[stop - 1].strip():
    stop -= 1
  rows = rows[start:stop]

  repaired = _repair_flattened_diagram("\n".join(rows))
  rows = repaired.splitlines() if repaired else []

  aligned = []
  for row in rows:
    bare = row.strip()
    # A lone connector glyph with no leading whitespace gets the indent;
    # one that is already indented is left exactly as it is.
    if bare in {"│", "▼"} and not re.match(r"^\s+[│▼]\s*$", row):
      aligned.append(f"   {bare}")
    else:
      aligned.append(row)

  return "\n".join(_convert_dash_lists_to_branches(aligned))

def _clean_markdown_output(markdown):
  if not markdown:
    return ""

  pieces = []
  last_index = 0
  for match in _FENCE_RE.finditer(markdown):
    prose = markdown[last_index:match.start()]
    if prose:
      cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", prose)
      lines = [line.strip() for line in cleaned.splitlines()]
      lines = _repair_table_headers(lines)
      lines = _repair_list_continuations(lines)
      cleaned = "\n".join(lines)
      cleaned = _collapse_blank_lines(cleaned)
      if cleaned:
        pieces.append(cleaned)

    fence = match.group(0)
    header, _, tail = fence.partition("\n")
    body, _, _ = tail.rpartition("\n")
    cleaned_body = _clean_code_block(body)
    pieces.append(f"{header}\n{cleaned_body}\n```" if cleaned_body else f"{header}\n```")
    last_index = match.end()

  trailing = markdown[last_index:]
  if trailing:
    cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", trailing)
    lines = [line.strip() for line in cleaned.splitlines()]
    lines = _repair_table_headers(lines)
    lines = _repair_list_continuations(lines)
    cleaned = "\n".join(lines)
    cleaned = _collapse_blank_lines(cleaned)
    if cleaned:
      pieces.append(cleaned)

  return "\n\n".join(piece for piece in pieces if piece)

def _convert_html_to_markdown(html: str) -> str:
  """Convert *html* with ``convert_html_to_markdown``.

  ``_clean_markdown_output`` is handed over as the second argument.
  NOTE(review): presumably the converter applies it to the Markdown it
  produces — confirm against ``browser_cli.markdown.html``.
  """
  return convert_html_to_markdown(html, _clean_markdown_output)

def render_markdown(raw: str | None) -> str:
  """Turn *raw* extractor output into clean Markdown.

  ``None`` is treated as an empty string. A payload whose first
  non-whitespace character is ``<`` is assumed to be HTML and goes through
  the HTML→Markdown converter; anything else is taken to be Markdown already
  and only receives the clean-up/repair passes.
  """
  payload = raw if raw else ""
  looks_like_html = payload.lstrip()[:1] == "<"
  if not looks_like_html:
    return _clean_markdown_output(payload)
  return _convert_html_to_markdown(payload)