refactor: reorganize client transport and extension internals
Changes:
- Split client, native, remote, serve, markdown, and SDK internals into focused packages with direct imports.
- Move local and remote transport framing/protocol helpers behind clearer module boundaries.
- Break up the extension's injected DOM logic into a separate content dispatch bundle and dedicated content modules.
- Add explicit client handling for passive remote discovery without noisy PQ warnings.
- Keep behavior covered with updated unit, integration, and extension tests.
This commit is contained in:
@@ -0,0 +1,259 @@
|
||||
"""HTML tree walking for browser-cli Markdown rendering."""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
|
||||
def _normalize_text(value):
|
||||
return re.sub(r"\s+", " ", value or "").strip()
|
||||
|
||||
def _normalize_inline(value):
|
||||
value = value.replace("\xa0", " ")
|
||||
value = re.sub(r"[ \t\r\f\v]+", " ", value)
|
||||
value = re.sub(r" *\n *", "\n", value)
|
||||
return value.strip()
|
||||
|
||||
def _collapse_blank_lines(value):
|
||||
value = re.sub(r"[ \t]+\n", "\n", value)
|
||||
value = re.sub(r"\n{3,}", "\n\n", value)
|
||||
return value.strip()
|
||||
|
||||
def _escape_markdown(text):
|
||||
return re.sub(r"([\\`[\]])", r"\\\1", text)
|
||||
|
||||
def _escape_table_cell(text):
|
||||
return text.replace("|", r"\|").replace("\n", " ").strip()
|
||||
|
||||
class _HtmlNode:
|
||||
def __init__(self, tag=None, attrs=None, text=None):
|
||||
self.tag = tag
|
||||
self.attrs = attrs or {}
|
||||
self.text = text
|
||||
self.children = []
|
||||
|
||||
class _HtmlTreeBuilder(HTMLParser):
|
||||
_VOID_TAGS = {"br", "hr", "img"}
|
||||
|
||||
def __init__(self):
|
||||
super().__init__(convert_charrefs=True)
|
||||
self.root = _HtmlNode(tag="document")
|
||||
self._stack = [self.root]
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
|
||||
self._stack[-1].children.append(node)
|
||||
if node.tag not in self._VOID_TAGS:
|
||||
self._stack.append(node)
|
||||
|
||||
def handle_startendtag(self, tag, attrs):
|
||||
node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
|
||||
self._stack[-1].children.append(node)
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
lowered = tag.lower()
|
||||
for index in range(len(self._stack) - 1, 0, -1):
|
||||
if self._stack[index].tag == lowered:
|
||||
del self._stack[index:]
|
||||
break
|
||||
|
||||
def handle_data(self, data):
|
||||
if data:
|
||||
self._stack[-1].children.append(_HtmlNode(text=data))
|
||||
|
||||
def _normalize_text(value):
|
||||
return re.sub(r"\s+", " ", value or "").strip()
|
||||
|
||||
def _normalize_inline(value):
|
||||
value = value.replace("\xa0", " ")
|
||||
value = re.sub(r"[ \t\r\f\v]+", " ", value)
|
||||
value = re.sub(r" *\n *", "\n", value)
|
||||
return value.strip()
|
||||
|
||||
def _collapse_blank_lines(value):
|
||||
value = re.sub(r"[ \t]+\n", "\n", value)
|
||||
value = re.sub(r"\n{3,}", "\n\n", value)
|
||||
return value.strip()
|
||||
|
||||
def _escape_markdown(text):
|
||||
return re.sub(r"([\\`[\]])", r"\\\1", text)
|
||||
|
||||
def _escape_table_cell(text):
|
||||
return text.replace("|", r"\|").replace("\n", " ").strip()
|
||||
|
||||
def _iter_descendants(node):
|
||||
for child in getattr(node, "children", []):
|
||||
yield child
|
||||
yield from _iter_descendants(child)
|
||||
|
||||
def _has_class(node, class_name):
|
||||
classes = (node.attrs.get("class") or "").split()
|
||||
return class_name in classes
|
||||
|
||||
def _is_code_block_node(node):
|
||||
if not node or not node.tag:
|
||||
return False
|
||||
if node.attrs.get("data-is-code-block-view") == "true":
|
||||
return True
|
||||
return node.tag == "pre"
|
||||
|
||||
def _inline_text(node):
|
||||
if node.text is not None:
|
||||
return _escape_markdown(node.text)
|
||||
if not node.tag:
|
||||
return ""
|
||||
|
||||
tag = node.tag
|
||||
if tag == "br":
|
||||
return "\n"
|
||||
if tag == "img":
|
||||
src = node.attrs.get("src") or ""
|
||||
alt = _normalize_text(node.attrs.get("alt") or "")
|
||||
if not src:
|
||||
return ""
|
||||
return f"" if alt else f""
|
||||
if tag == "a":
|
||||
text = _normalize_inline("".join(_inline_text(child) for child in node.children))
|
||||
href = node.attrs.get("href") or ""
|
||||
return f"[{text or href}]({href})" if href else text
|
||||
if tag == "code":
|
||||
text = _normalize_inline("".join(_inline_text(child) for child in node.children))
|
||||
return f"`{text.replace('`', r'\\`')}`" if text else ""
|
||||
if tag in {"strong", "b"}:
|
||||
text = _normalize_inline("".join(_inline_text(child) for child in node.children))
|
||||
return f"**{text}**" if text else ""
|
||||
if tag in {"em", "i"}:
|
||||
text = _normalize_inline("".join(_inline_text(child) for child in node.children))
|
||||
return f"*{text}*" if text else ""
|
||||
|
||||
chunks = []
|
||||
for child in node.children:
|
||||
rendered = _inline_text(child)
|
||||
if rendered:
|
||||
chunks.append(rendered)
|
||||
if child.tag in {"p", "div", "table", "ul", "ol", "pre"}:
|
||||
chunks.append("\n")
|
||||
return "".join(chunks)
|
||||
|
||||
def _text_block(node):
    """Render the children of *node* as one normalized block of inline Markdown."""
    pieces = [_inline_text(child) for child in node.children]
    inline = _normalize_inline("".join(pieces))
    return _collapse_blank_lines(inline)
|
||||
|
||||
def _inner_text_preserve(node):
|
||||
if node.text is not None:
|
||||
return node.text
|
||||
if not node.tag:
|
||||
return ""
|
||||
if node.tag == "br":
|
||||
return ""
|
||||
return "".join(_inner_text_preserve(child) for child in node.children)
|
||||
|
||||
def _table_to_markdown(node):
    """Render a ``<table>`` subtree as a pipe-delimited Markdown table.

    Every ``tr`` descendant with at least one ``td``/``th`` child becomes a
    row, and short rows are padded with empty cells. The header row is chosen
    as follows:

    * if the first row is entirely blank, the second row (when present) is
      the header;
    * otherwise, if the table has a direct ``thead`` child or the first
      ``tr`` contains a ``th``, the first row is the header;
    * otherwise an empty header is emitted and every row goes into the body.

    Returns ``""`` when no row has any cell.
    """
    first_tr = None
    table_rows = []
    for element in _iter_descendants(node):
        if element.tag != "tr":
            continue
        if first_tr is None:
            first_tr = element
        cells = [
            _escape_table_cell(_text_block(cell))
            for cell in element.children
            if cell.tag in {"td", "th"}
        ]
        if cells:
            table_rows.append(cells)
    if not table_rows:
        return ""

    widths = max(map(len, table_rows))
    padded = [cells + [""] * (widths - len(cells)) for cells in table_rows]

    leading_blank = not any(cell.strip() for cell in padded[0])
    has_thead = any(child.tag == "thead" for child in node.children)
    first_tr_has_th = bool(first_tr and any(child.tag == "th" for child in first_tr.children))

    if leading_blank:
        # Skip the blank first row when a real row follows it.
        if len(padded) > 1:
            headers, body_rows = padded[1], padded[2:]
        else:
            headers, body_rows = padded[0], []
    elif has_thead or first_tr_has_th:
        headers, body_rows = padded[0], padded[1:]
    else:
        headers, body_rows = [""] * widths, padded

    def as_line(cells):
        return f"| {' | '.join(cells)} |"

    lines = [as_line(headers), as_line(["---"] * widths)]
    for cells in body_rows:
        lines.append(as_line(cells))
    return "\n".join(lines)
|
||||
|
||||
def _list_to_markdown(node, depth=0):
|
||||
ordered = node.tag == "ol"
|
||||
items = []
|
||||
index = 1
|
||||
for child in node.children:
|
||||
if child.tag != "li":
|
||||
continue
|
||||
marker = f"{index}. " if ordered else "- "
|
||||
index += 1
|
||||
content = []
|
||||
nested = []
|
||||
for item_child in child.children:
|
||||
if item_child.tag in {"ul", "ol"}:
|
||||
nested.append(_list_to_markdown(item_child, depth + 1))
|
||||
else:
|
||||
content.append(_inline_text(item_child))
|
||||
line = _collapse_blank_lines(_normalize_inline("".join(content)))
|
||||
indent = " " * depth
|
||||
if line:
|
||||
line_parts = line.splitlines()
|
||||
items.append(f"{indent}{marker}{line_parts[0]}")
|
||||
continuation_indent = f"{indent}{' ' * len(marker)}"
|
||||
items.extend(f"{continuation_indent}{part}" for part in line_parts[1:])
|
||||
items.extend(block for block in nested if block)
|
||||
return "\n".join(items)
|
||||
|
||||
def _code_block_to_markdown(node):
    """Render a code-block node as a fenced Markdown code block.

    For ``<pre>`` the raw inner text is used. For any other node, the text of
    each descendant carrying the ``cm-line`` class is taken as one line of
    code. Trailing newlines are removed, and ``""`` is returned when no code
    remains.

    The fence is at least three backticks and always longer than the longest
    backtick run in the code, so code that itself contains a fence cannot
    terminate the block early.
    """
    if node.tag == "pre":
        code = _inner_text_preserve(node)
    else:
        code = "\n".join(
            _inner_text_preserve(descendant)
            for descendant in _iter_descendants(node)
            if descendant.tag and _has_class(descendant, "cm-line")
        )
    code = code.rstrip("\n")
    if not code:
        return ""

    longest_run = max((len(run) for run in re.findall(r"`+", code)), default=0)
    fence = "`" * max(3, longest_run + 1)
    return f"{fence}\n{code}\n{fence}"
|
||||
|
||||
def _block_to_markdown(node):
|
||||
if node.text is not None:
|
||||
return _normalize_text(node.text)
|
||||
if not node.tag:
|
||||
return ""
|
||||
if _is_code_block_node(node):
|
||||
return _code_block_to_markdown(node)
|
||||
if node.tag == "table":
|
||||
return _table_to_markdown(node)
|
||||
if node.tag in {"ul", "ol"}:
|
||||
return _list_to_markdown(node)
|
||||
if re.fullmatch(r"h[1-6]", node.tag):
|
||||
text = _text_block(node)
|
||||
return f"{'#' * int(node.tag[1])} {text}" if text else ""
|
||||
if node.tag in {"p", "figcaption"}:
|
||||
return _text_block(node)
|
||||
if node.tag == "blockquote":
|
||||
content = _collapse_blank_lines("\n\n".join(filter(None, (_block_to_markdown(child) for child in node.children))))
|
||||
return "\n".join(f"> {line}" if line else ">" for line in content.splitlines()) if content else ""
|
||||
if node.tag == "hr":
|
||||
return "---"
|
||||
if node.tag == "img":
|
||||
return _inline_text(node)
|
||||
|
||||
child_blocks = [block for block in (_block_to_markdown(child) for child in node.children) if block]
|
||||
if child_blocks:
|
||||
return _collapse_blank_lines("\n\n".join(child_blocks))
|
||||
return _text_block(node)
|
||||
|
||||
def convert_html_to_markdown(html, clean_markdown_output):
    """Convert an HTML string to Markdown.

    Args:
        html: HTML source; a falsy value is treated as the empty string.
        clean_markdown_output: callable applied to the rendered Markdown.

    Returns:
        Whatever *clean_markdown_output* returns for the rendered Markdown.
    """
    parser = _HtmlTreeBuilder()
    parser.feed(html or "")
    # Flush buffered input: without close(), HTMLParser can hold back trailing
    # text (for example text ending in an unterminated "&...") and it would
    # never reach the tree.
    parser.close()
    markdown = _block_to_markdown(parser.root)
    return clean_markdown_output(markdown)
|
||||
Reference in New Issue
Block a user