refactor: reorganize client transport and extension internals

- Split client, native, remote, serve, markdown, and SDK internals into focused packages with direct imports.
- Move local and remote transport framing/protocol helpers behind clearer module boundaries.
- Break up the extension's injected DOM logic into a separate content dispatch bundle and dedicated content modules.
- Add explicit client handling for passive remote discovery without noisy PQ warnings.
- Keep behavior covered with updated unit, integration, and extension tests.
2026-06-13 23:31:24 +02:00
parent fd5447cbb9
commit 076914e5b7
88 changed files with 7491 additions and 5228 deletions
@@ -0,0 +1,8 @@
+"""Markdown rendering and HTML-to-Markdown conversion helpers."""
+from browser_cli.markdown.render import (
+  _clean_markdown_output,
+  _convert_html_to_markdown,
+  render_markdown,
+)
+
+__all__ = ["_clean_markdown_output", "_convert_html_to_markdown", "render_markdown"]
@@ -0,0 +1,259 @@
+"""HTML tree walking for browser-cli Markdown rendering."""
+from __future__ import annotations
+
+import re
+from html.parser import HTMLParser
+
+def _normalize_text(value):
+  return re.sub(r"\s+", " ", value or "").strip()
+
+def _normalize_inline(value):
+  value = value.replace("\xa0", " ")
+  value = re.sub(r"[ \t\r\f\v]+", " ", value)
+  value = re.sub(r" *\n *", "\n", value)
+  return value.strip()
+
+def _collapse_blank_lines(value):
+  value = re.sub(r"[ \t]+\n", "\n", value)
+  value = re.sub(r"\n{3,}", "\n\n", value)
+  return value.strip()
+
+def _escape_markdown(text):
+  return re.sub(r"([\\`[\]])", r"\\\1", text)
+
+def _escape_table_cell(text):
+  return text.replace("|", r"\|").replace("\n", " ").strip()
+
+class _HtmlNode:
+  def __init__(self, tag=None, attrs=None, text=None):
+    self.tag = tag
+    self.attrs = attrs or {}
+    self.text = text
+    self.children = []
+
+class _HtmlTreeBuilder(HTMLParser):
+  _VOID_TAGS = {"br", "hr", "img"}
+
+  def __init__(self):
+    super().__init__(convert_charrefs=True)
+    self.root = _HtmlNode(tag="document")
+    self._stack = [self.root]
+
+  def handle_starttag(self, tag, attrs):
+    node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
+    self._stack[-1].children.append(node)
+    if node.tag not in self._VOID_TAGS:
+      self._stack.append(node)
+
+  def handle_startendtag(self, tag, attrs):
+    node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
+    self._stack[-1].children.append(node)
+
+  def handle_endtag(self, tag):
+    lowered = tag.lower()
+    for index in range(len(self._stack) - 1, 0, -1):
+      if self._stack[index].tag == lowered:
+        del self._stack[index:]
+        break
+
+  def handle_data(self, data):
+    if data:
+      self._stack[-1].children.append(_HtmlNode(text=data))
+
+def _normalize_text(value):
+  return re.sub(r"\s+", " ", value or "").strip()
+
+def _normalize_inline(value):
+  value = value.replace("\xa0", " ")
+  value = re.sub(r"[ \t\r\f\v]+", " ", value)
+  value = re.sub(r" *\n *", "\n", value)
+  return value.strip()
+
+def _collapse_blank_lines(value):
+  value = re.sub(r"[ \t]+\n", "\n", value)
+  value = re.sub(r"\n{3,}", "\n\n", value)
+  return value.strip()
+
+def _escape_markdown(text):
+  return re.sub(r"([\\`[\]])", r"\\\1", text)
+
+def _escape_table_cell(text):
+  return text.replace("|", r"\|").replace("\n", " ").strip()
+
+def _iter_descendants(node):
+  for child in getattr(node, "children", []):
+    yield child
+    yield from _iter_descendants(child)
+
+def _has_class(node, class_name):
+  classes = (node.attrs.get("class") or "").split()
+  return class_name in classes
+
+def _is_code_block_node(node):
+  if not node or not node.tag:
+    return False
+  if node.attrs.get("data-is-code-block-view") == "true":
+    return True
+  return node.tag == "pre"
+
+def _inline_text(node):
+  if node.text is not None:
+    return _escape_markdown(node.text)
+  if not node.tag:
+    return ""
+
+  tag = node.tag
+  if tag == "br":
+    return "\n"
+  if tag == "img":
+    src = node.attrs.get("src") or ""
+    alt = _normalize_text(node.attrs.get("alt") or "")
+    if not src:
+      return ""
+    return f"![{_escape_markdown(alt)}]({src})" if alt else f"![]({src})"
+  if tag == "a":
+    text = _normalize_inline("".join(_inline_text(child) for child in node.children))
+    href = node.attrs.get("href") or ""
+    return f"[{text or href}]({href})" if href else text
+  if tag == "code":
+    text = _normalize_inline("".join(_inline_text(child) for child in node.children))
+    return f"`{text.replace('`', r'\\`')}`" if text else ""
+  if tag in {"strong", "b"}:
+    text = _normalize_inline("".join(_inline_text(child) for child in node.children))
+    return f"**{text}**" if text else ""
+  if tag in {"em", "i"}:
+    text = _normalize_inline("".join(_inline_text(child) for child in node.children))
+    return f"*{text}*" if text else ""
+
+  chunks = []
+  for child in node.children:
+    rendered = _inline_text(child)
+    if rendered:
+      chunks.append(rendered)
+      if child.tag in {"p", "div", "table", "ul", "ol", "pre"}:
+        chunks.append("\n")
+  return "".join(chunks)
+
+def _text_block(node):
+  return _collapse_blank_lines(_normalize_inline("".join(_inline_text(child) for child in node.children)))
+
+def _inner_text_preserve(node):
+  if node.text is not None:
+    return node.text
+  if not node.tag:
+    return ""
+  if node.tag == "br":
+    return ""
+  return "".join(_inner_text_preserve(child) for child in node.children)
+
+def _table_to_markdown(node):
+  rows = []
+  for descendant in _iter_descendants(node):
+    if descendant.tag != "tr":
+      continue
+    row = []
+    for cell in descendant.children:
+      if cell.tag in {"td", "th"}:
+        row.append(_escape_table_cell(_text_block(cell)))
+    if row:
+      rows.append(row)
+  if not rows:
+    return ""
+
+  widths = max(len(row) for row in rows)
+  normalized_rows = [row + [""] * (widths - len(row)) for row in rows]
+
+  headers = normalized_rows[0]
+  body_rows = normalized_rows[1:]
+  first_row_blank = all(not cell.strip() for cell in headers)
+  if first_row_blank and len(normalized_rows) > 1:
+    headers = normalized_rows[1]
+    body_rows = normalized_rows[2:]
+
+  has_thead = any(child.tag == "thead" for child in node.children)
+  first_row = next((child for child in _iter_descendants(node) if child.tag == "tr"), None)
+  first_row_has_th = bool(first_row and any(child.tag == "th" for child in first_row.children))
+  if not (has_thead or first_row_has_th or first_row_blank):
+    headers = [""] * widths
+    body_rows = normalized_rows
+
+  separator = ["---"] * widths
+  lines = [
+    f"| {' | '.join(headers)} |",
+    f"| {' | '.join(separator)} |",
+  ]
+  lines.extend(f"| {' | '.join(row)} |" for row in body_rows)
+  return "\n".join(lines)
+
+def _list_to_markdown(node, depth=0):
+  ordered = node.tag == "ol"
+  items = []
+  index = 1
+  for child in node.children:
+    if child.tag != "li":
+      continue
+    marker = f"{index}. " if ordered else "- "
+    index += 1
+    content = []
+    nested = []
+    for item_child in child.children:
+      if item_child.tag in {"ul", "ol"}:
+        nested.append(_list_to_markdown(item_child, depth + 1))
+      else:
+        content.append(_inline_text(item_child))
+    line = _collapse_blank_lines(_normalize_inline("".join(content)))
+    indent = "  " * depth
+    if line:
+      line_parts = line.splitlines()
+      items.append(f"{indent}{marker}{line_parts[0]}")
+      continuation_indent = f"{indent}{' ' * len(marker)}"
+      items.extend(f"{continuation_indent}{part}" for part in line_parts[1:])
+    items.extend(block for block in nested if block)
+  return "\n".join(items)
+
+def _code_block_to_markdown(node):
+  if node.tag == "pre":
+    text = _inner_text_preserve(node).rstrip("\n")
+    return f"```\n{text}\n```" if text else ""
+
+  lines = []
+  for descendant in _iter_descendants(node):
+    if descendant.tag and _has_class(descendant, "cm-line"):
+      lines.append(_inner_text_preserve(descendant))
+  code = "\n".join(lines).rstrip("\n")
+  return f"```\n{code}\n```" if code else ""
+
+def _block_to_markdown(node):
+  if node.text is not None:
+    return _normalize_text(node.text)
+  if not node.tag:
+    return ""
+  if _is_code_block_node(node):
+    return _code_block_to_markdown(node)
+  if node.tag == "table":
+    return _table_to_markdown(node)
+  if node.tag in {"ul", "ol"}:
+    return _list_to_markdown(node)
+  if re.fullmatch(r"h[1-6]", node.tag):
+    text = _text_block(node)
+    return f"{'#' * int(node.tag[1])} {text}" if text else ""
+  if node.tag in {"p", "figcaption"}:
+    return _text_block(node)
+  if node.tag == "blockquote":
+    content = _collapse_blank_lines("\n\n".join(filter(None, (_block_to_markdown(child) for child in node.children))))
+    return "\n".join(f"> {line}" if line else ">" for line in content.splitlines()) if content else ""
+  if node.tag == "hr":
+    return "---"
+  if node.tag == "img":
+    return _inline_text(node)
+
+  child_blocks = [block for block in (_block_to_markdown(child) for child in node.children) if block]
+  if child_blocks:
+    return _collapse_blank_lines("\n\n".join(child_blocks))
+  return _text_block(node)
+
+def convert_html_to_markdown(html, clean_markdown_output):
+  parser = _HtmlTreeBuilder()
+  parser.feed(html or "")
+  markdown = _block_to_markdown(parser.root)
+  return clean_markdown_output(markdown)
@@ -0,0 +1,188 @@
+"""HTML → Markdown conversion and Markdown clean-up.
+
+Pure, presentation-agnostic text transforms shared by the SDK
+(:meth:`browser_cli.sdk.dom.ExtractNS.markdown`) and the ``extract markdown``
+CLI command. No Click/Rich/IPC dependencies — just an HTML tree walker plus a
+set of repair passes for the markdown the page (or a markdown editor like
+Obsidian/CodeMirror) hands back.
+"""
+from __future__ import annotations
+
+import re
+
+from browser_cli.markdown.html import convert_html_to_markdown
+
+# A fenced code block: an opening ``` with an optional info string (no
+# backticks), a newline, the body, and a closing ``` preceded by a newline.
+_FENCE_RE = re.compile(r"```(?:[^\n`]*)\n.*?\n```", re.DOTALL)
+# A backslash-escaped underscore or hyphen; group 1 is the bare character.
+_ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])")
+# A Markdown table separator row such as "| --- | :---: |".
+_TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$")
+
+def _collapse_blank_lines(value):
+  value = re.sub(r"[ \t]+\n", "\n", value)
+  value = re.sub(r"\n{3,}", "\n\n", value)
+  return value.strip()
+
+def _parse_table_row(line):
+  stripped = line.strip()
+  if not stripped.startswith("|") or not stripped.endswith("|"):
+    return None
+  return [cell.strip() for cell in stripped.strip("|").split("|")]
+
+def _repair_table_headers(lines):
+  repaired = []
+  index = 0
+  while index < len(lines):
+    if (
+      index + 2 < len(lines)
+      and _parse_table_row(lines[index]) is not None
+      and _TABLE_SEPARATOR_RE.match(lines[index + 1].strip())
+      and _parse_table_row(lines[index + 2]) is not None
+    ):
+      first = _parse_table_row(lines[index])
+      third = _parse_table_row(lines[index + 2])
+      if first and all(not cell for cell in first) and any(cell for cell in third):
+        repaired.append(lines[index + 2].strip())
+        repaired.append(lines[index + 1].strip())
+        index += 3
+        continue
+    repaired.append(lines[index].strip())
+    index += 1
+  return repaired
+
+def _repair_list_continuations(lines):
+  repaired = []
+  previous_was_list_item = False
+  previous_continuation_indent = ""
+
+  for line in lines:
+    stripped = line.strip()
+    list_match = re.match(r"^(\s*)([-*+]|\d+\.)\s+.+$", stripped)
+    is_markdown_block_start = (
+      not stripped
+      or stripped.startswith(("```", "#", ">", "|"))
+      or _TABLE_SEPARATOR_RE.match(stripped)
+      or re.match(r"^(\s*)([-*+]|\d+\.)\s+", stripped)
+    )
+
+    if previous_was_list_item and stripped and not is_markdown_block_start:
+      repaired.append(f"{previous_continuation_indent}{stripped}")
+      previous_was_list_item = False
+      continue
+
+    repaired.append(stripped)
+    if list_match:
+      marker = list_match.group(2)
+      base_indent = list_match.group(1)
+      previous_continuation_indent = f"{base_indent}{' ' * (len(marker) + 1)}"
+      previous_was_list_item = True
+    else:
+      previous_was_list_item = False
+
+  return repaired
+
+def _repair_flattened_diagram(text):
+  if "\n" in text:
+    return text
+  if sum(text.count(char) for char in "│▼├└") < 2:
+    return text
+
+  text = re.sub(r"\s{2,}([│▼])", r"\n   \1", text)
+  text = re.sub(r"([│▼])\s{2,}", r"\1\n", text)
+  text = re.sub(r"([│▼])(?=[^\s\n│▼├└])", r"\1\n", text)
+  text = re.sub(r"(?<=[^\s\n])([├└])", r"\n\1", text)
+  text = re.sub(r"([^\s\n])(\()", r"\1\n\2", text)
+  return "\n".join(line.rstrip() for line in text.splitlines() if line.strip())
+
+def _convert_dash_lists_to_branches(lines):
+  converted = []
+  index = 0
+  while index < len(lines):
+    match = re.match(r"^(\s*)-\s+(.*)$", lines[index])
+    if not match:
+      converted.append(lines[index])
+      index += 1
+      continue
+
+    indent = match.group(1)
+    items = []
+    while index < len(lines):
+      next_match = re.match(rf"^{re.escape(indent)}-\s+(.*)$", lines[index])
+      if not next_match:
+        break
+      items.append(next_match.group(1))
+      index += 1
+
+    for item_index, item in enumerate(items):
+      branch = "└" if item_index == len(items) - 1 else "├"
+      converted.append(f"{indent}{branch} {item}")
+  return converted
+
+def _clean_code_block(code):
+  lines = [line.rstrip() for line in code.splitlines()]
+  while lines and not lines[0].strip():
+    lines.pop(0)
+  while lines and not lines[-1].strip():
+    lines.pop()
+
+  flattened = _repair_flattened_diagram("\n".join(lines))
+  lines = flattened.splitlines() if flattened else []
+  lines = [
+    f"   {line.strip()}"
+    if line.strip() in {"│", "▼"} and not re.match(r"^\s+[│▼]\s*$", line)
+    else line
+    for line in lines
+  ]
+  lines = _convert_dash_lists_to_branches(lines)
+  return "\n".join(lines)
+
+def _clean_markdown_output(markdown):
+  if not markdown:
+    return ""
+
+  pieces = []
+  last_index = 0
+  for match in _FENCE_RE.finditer(markdown):
+    prose = markdown[last_index:match.start()]
+    if prose:
+      cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", prose)
+      lines = [line.strip() for line in cleaned.splitlines()]
+      lines = _repair_table_headers(lines)
+      lines = _repair_list_continuations(lines)
+      cleaned = "\n".join(lines)
+      cleaned = _collapse_blank_lines(cleaned)
+      if cleaned:
+        pieces.append(cleaned)
+
+    fence = match.group(0)
+    header, _, tail = fence.partition("\n")
+    body, _, _ = tail.rpartition("\n")
+    cleaned_body = _clean_code_block(body)
+    pieces.append(f"{header}\n{cleaned_body}\n```" if cleaned_body else f"{header}\n```")
+    last_index = match.end()
+
+  trailing = markdown[last_index:]
+  if trailing:
+    cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", trailing)
+    lines = [line.strip() for line in cleaned.splitlines()]
+    lines = _repair_table_headers(lines)
+    lines = _repair_list_continuations(lines)
+    cleaned = "\n".join(lines)
+    cleaned = _collapse_blank_lines(cleaned)
+    if cleaned:
+      pieces.append(cleaned)
+
+  return "\n\n".join(piece for piece in pieces if piece)
+
+def _convert_html_to_markdown(html):
+  return convert_html_to_markdown(html, _clean_markdown_output)
+
+def render_markdown(raw: str | None) -> str:
+  """Normalize *raw* extractor output into clean Markdown.
+
+  If the payload looks like HTML (first non-space char is ``<``) it is run
+  through the HTML→Markdown converter; otherwise it is treated as Markdown and
+  only the clean-up/repair passes are applied.
+  """
+  raw = raw or ""
+  if raw.lstrip().startswith("<"):
+    return _convert_html_to_markdown(raw)
+  return _clean_markdown_output(raw)