076914e5b7
- Split client, native, remote, serve, markdown, and SDK internals into focused packages with direct imports. - Move local and remote transport framing/protocol helpers behind clearer module boundaries. - Break up the extension injected DOM logic into a separate content dispatch bundle and dedicated content modules. - Add explicit client handling for passive remote discovery without noisy PQ warnings. - Keep behavior covered with updated unit, integration, and extension tests.
189 lines
6.1 KiB
Python
189 lines
6.1 KiB
Python
"""HTML → Markdown conversion and Markdown clean-up.
|
|
|
|
Pure, presentation-agnostic text transforms shared by the SDK
(:meth:`browser_cli.sdk.dom.ExtractNS.markdown`) and the ``extract markdown``
CLI command. No Click/Rich/IPC dependencies — just an HTML tree walker plus a
set of repair passes for the Markdown that the page (or a Markdown editor like
Obsidian/CodeMirror) hands back.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
from browser_cli.markdown.html import convert_html_to_markdown
|
|
|
|
# A fenced code block: an opening ``` line (optional info string), a body, and
# a closing ``` at the start of a line.
#
# The first alternative recognises an *empty* block, i.e. the opening line is
# immediately followed by a bare closing fence line.  Without it the closing
# fence of an empty block is skipped and the pattern pairs the opener with the
# next fence further down, swallowing the prose in between as "code".  The
# lookahead insists that nothing but blanks follows the closer so that a body
# line such as "```python" is never mistaken for a closing fence.
_FENCE_RE = re.compile(
    r"```(?:[^\n`]*)\n(?:```(?=[ \t]*(?:\n|\Z))|.*?\n```)",
    re.DOTALL,
)
# A backslash escape in front of "_" or "-"; group 1 is the bare character.
_ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])")
# A pipe-delimited table delimiter row such as "| --- | :---: |".
_TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$")
|
|
|
|
def _collapse_blank_lines(value):
    """Tidy vertical whitespace in *value*.

    Spaces and tabs that sit directly before a newline are removed, any run of
    three or more newlines is reduced to exactly two, and the result is
    stripped of leading and trailing whitespace.
    """
    without_trailing_blanks = re.sub(r"[ \t]+\n", "\n", value)
    squeezed = re.sub(r"\n{3,}", "\n\n", without_trailing_blanks)
    return squeezed.strip()
|
|
|
|
def _parse_table_row(line):
    """Split a pipe-delimited table row into its stripped cell texts.

    Returns ``None`` when *line*, ignoring surrounding whitespace, does not
    both start and end with ``|``.  Otherwise returns one string per cell.
    Every ``|`` is treated as a delimiter; escaped pipes are not recognised.
    """
    stripped = line.strip()
    if not (stripped.startswith("|") and stripped.endswith("|")):
        return None
    # Drop exactly one delimiter from each end.  ``str.strip("|")`` would eat
    # *every* leading/trailing pipe and silently lose empty edge cells written
    # without padding (e.g. "||a|" must yield ["", "a"], not ["a"]).
    inner = stripped[1:-1]
    return [cell.strip() for cell in inner.split("|")]
|
|
|
|
def _repair_table_headers(lines):
    """Promote the first data row of a table whose header row is blank.

    Walks *lines* looking for three consecutive lines shaped like
    ``row / delimiter row / row``.  When every cell of the first row is empty
    and the third row has at least one non-empty cell, the empty row is
    dropped and the third row is emitted *before* the delimiter row, so it
    becomes the header.  Every other line is copied through.  All emitted
    lines are stripped of surrounding whitespace.
    """
    fixed = []
    total = len(lines)
    pos = 0
    while pos < total:
        if pos + 2 < total:
            header_cells = _parse_table_row(lines[pos])
            body_cells = _parse_table_row(lines[pos + 2])
            looks_like_table = (
                header_cells is not None
                and body_cells is not None
                and _TABLE_SEPARATOR_RE.match(lines[pos + 1].strip())
            )
            if (
                looks_like_table
                and header_cells
                and not any(header_cells)
                and any(body_cells)
            ):
                # Swap order: data row first, then the delimiter row; the
                # blank header row itself is discarded.
                fixed.extend((lines[pos + 2].strip(), lines[pos + 1].strip()))
                pos += 3
                continue
        fixed.append(lines[pos].strip())
        pos += 1
    return fixed
|
|
|
|
def _repair_list_continuations(lines):
    """Re-indent the line that directly follows a list item as its continuation.

    Every line is stripped.  When a line comes straight after a list item
    (``-``, ``*``, ``+`` or ``N.`` followed by text) and does not itself start
    a new block (blank line, code fence, heading, block quote, table row or
    another list item), it is emitted indented by the width of the marker
    plus one space so that it lines up with the item's text.  Only the single
    line immediately after the item is treated this way.

    Returns a new list with one entry per input line.
    """
    repaired = []
    # Indent for a continuation of the previous line, or ``None`` when the
    # previous line was not a list item.
    continuation_indent = None

    for line in lines:
        stripped = line.strip()
        # A table delimiter row always begins with "|", so the startswith
        # check below already covers it; no separate regex test is needed.
        starts_block = (
            not stripped
            or stripped.startswith(("```", "#", ">", "|"))
            or re.match(r"^(\s*)([-*+]|\d+\.)\s+", stripped)
        )

        if continuation_indent is not None and not starts_block:
            repaired.append(f"{continuation_indent}{stripped}")
            continuation_indent = None
            continue

        repaired.append(stripped)
        # The line is already stripped, so the marker sits at column 0 and
        # the continuation indent depends on the marker width alone.
        list_match = re.match(r"^([-*+]|\d+\.)\s+.+$", stripped)
        if list_match:
            continuation_indent = " " * (len(list_match.group(1)) + 1)
        else:
            continuation_indent = None

    return repaired
|
|
|
|
def _repair_flattened_diagram(text):
    """Re-insert line breaks into a box-drawing diagram squashed onto one line.

    *text* is returned untouched when it already contains a newline or when it
    holds fewer than two of the characters ``│ ▼ ├ └``.  Otherwise a fixed
    sequence of substitutions puts connector glyphs and ``(`` groups on their
    own lines; blank lines are then dropped and trailing whitespace removed.
    """
    glyph_total = sum(text.count(glyph) for glyph in "│▼├└")
    if "\n" in text or glyph_total < 2:
        return text

    # Order matters: each rewrite operates on the output of the previous one.
    rewrites = (
        # wide gap before a vertical connector -> break before it
        (r"\s{2,}([│▼])", r"\n \1"),
        # wide gap after a vertical connector -> break after it
        (r"([│▼])\s{2,}", r"\1\n"),
        # vertical connector glued to ordinary text -> break after it
        (r"([│▼])(?=[^\s\n│▼├└])", r"\1\n"),
        # branch glyph glued to preceding text -> break before it
        (r"(?<=[^\s\n])([├└])", r"\n\1"),
        # "(" glued to preceding text -> break before it
        (r"([^\s\n])(\()", r"\1\n\2"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    kept_rows = [row.rstrip() for row in text.splitlines() if row.strip()]
    return "\n".join(kept_rows)
|
|
|
|
def _convert_dash_lists_to_branches(lines):
    """Rewrite runs of ``- item`` lines as tree branches.

    Consecutive lines that share the same leading whitespace and start with
    ``-`` followed by whitespace form one run.  Within a run every item is
    prefixed with ``├`` except the last, which gets ``└``; the original
    indentation is kept.  Lines that are not dash items pass through as-is.
    """
    output = []
    total = len(lines)
    cursor = 0
    while cursor < total:
        opener = re.match(r"^(\s*)-\s+(.*)$", lines[cursor])
        if opener is None:
            output.append(lines[cursor])
            cursor += 1
            continue

        prefix = opener.group(1)
        # Only lines with exactly this indentation belong to the same run.
        sibling_pattern = rf"^{re.escape(prefix)}-\s+(.*)$"
        run = []
        while cursor < total:
            sibling = re.match(sibling_pattern, lines[cursor])
            if sibling is None:
                break
            run.append(sibling.group(1))
            cursor += 1

        final_position = len(run) - 1
        for position, text in enumerate(run):
            glyph = "├" if position != final_position else "└"
            output.append(f"{prefix}{glyph} {text}")
    return output
|
|
|
|
def _clean_code_block(code):
    """Normalise the body of a fenced code block.

    Trailing whitespace is removed from each line and blank lines at the very
    start and end are dropped.  The text is then passed through
    ``_repair_flattened_diagram``; a line consisting solely of ``│`` or ``▼``
    with no leading whitespace is given a one-space indent; finally
    ``_convert_dash_lists_to_branches`` is applied.  Returns the lines joined
    with newlines.
    """
    trimmed = [raw.rstrip() for raw in code.splitlines()]

    # Each entry is already right-stripped, so a blank line is exactly "".
    start, stop = 0, len(trimmed)
    while start < stop and not trimmed[start]:
        start += 1
    while stop > start and not trimmed[stop - 1]:
        stop -= 1

    repaired = _repair_flattened_diagram("\n".join(trimmed[start:stop]))

    rows = []
    for row in repaired.splitlines():
        glyph = row.strip()
        is_lone_connector = glyph in {"│", "▼"}
        if is_lone_connector and not re.match(r"^\s+[│▼]\s*$", row):
            row = f" {glyph}"
        rows.append(row)

    return "\n".join(_convert_dash_lists_to_branches(rows))
|
|
|
|
def _clean_prose_segment(prose):
    """Apply the prose repair passes to text that lies outside code fences.

    Unescapes ``\\_`` and ``\\-``, strips every line, then runs the table
    header repair, the list continuation repair and blank-line collapsing.
    Returns ``""`` for empty or whitespace-only input.
    """
    unescaped = _ESCAPED_MARKDOWN_RE.sub(r"\1", prose)
    lines = [line.strip() for line in unescaped.splitlines()]
    lines = _repair_table_headers(lines)
    lines = _repair_list_continuations(lines)
    return _collapse_blank_lines("\n".join(lines))


def _clean_markdown_output(markdown):
    """Clean up a Markdown document, treating prose and code fences separately.

    The text is split on fenced code blocks (``_FENCE_RE``).  Prose between
    fences goes through ``_clean_prose_segment``; each fence keeps its opening
    line verbatim while its body goes through ``_clean_code_block``.  The
    non-empty results are joined with one blank line between them.

    Returns ``""`` when *markdown* is empty or ``None``.
    """
    if not markdown:
        return ""

    pieces = []
    last_index = 0
    for match in _FENCE_RE.finditer(markdown):
        pieces.append(_clean_prose_segment(markdown[last_index:match.start()]))

        # Opening fence line (with info string) / body / closing fence.
        header, _, tail = match.group(0).partition("\n")
        body, _, _ = tail.rpartition("\n")
        cleaned_body = _clean_code_block(body)
        if cleaned_body:
            pieces.append(f"{header}\n{cleaned_body}\n```")
        else:
            pieces.append(f"{header}\n```")
        last_index = match.end()

    # Whatever follows the last fence (or the whole text if there was none).
    pieces.append(_clean_prose_segment(markdown[last_index:]))

    # Empty segments are skipped so they do not produce stray blank lines.
    return "\n\n".join(piece for piece in pieces if piece)
|
|
|
|
def _convert_html_to_markdown(html):
    """Convert *html* with ``convert_html_to_markdown``.

    ``_clean_markdown_output`` is handed over as the second argument;
    presumably the converter applies it to the Markdown it produces —
    confirm against ``browser_cli.markdown.html``.
    """
    converted = convert_html_to_markdown(html, _clean_markdown_output)
    return converted
|
|
|
|
def render_markdown(raw: str | None) -> str:
    """Turn *raw* extractor output into clean Markdown.

    ``None`` and the empty string are treated alike.  A payload whose first
    non-whitespace character is ``<`` is assumed to be HTML and is sent
    through the HTML→Markdown converter; anything else is taken to be
    Markdown already and only receives the clean-up/repair passes.
    """
    text = raw if raw else ""
    looks_like_html = text.lstrip().startswith("<")
    handler = _convert_html_to_markdown if looks_like_html else _clean_markdown_output
    return handler(text)
|