0d5c49c19a
Testing / test (push) Failing after 10m21s
- compat.py → compat/ package: auth.py (auth-field normalizers), commands.py (command-format shims), __init__.py (re-exports) - Add _auth_0_9_3 transformer: normalizes pubkey to lowercase before auth so clients < 0.9.3 sending uppercase hex are accepted - adapt_auth() now called before auth check in serve.py; command extracted after adapt_auth so future transformers can rename commands safely - serve.py: deduplicate _recv_exact (import from client), unify resp/resp_payload across Windows/Unix branches, require lowercase hex pubkey (re.fullmatch), reorganize imports, drop unused os import - client.py: move payload/framed construction inside branches (remote path no longer serializes JSON it never uses); fix _is_valid_key_spec operator precedence; import MAX_MSG_BYTES from version_manager - auth.py: narrow except clause (ValueError instead of bare Exception) - Bump version 0.9.2 → 0.9.3 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
493 lines
16 KiB
Python
493 lines
16 KiB
Python
import json
|
|
import re
|
|
from html.parser import HTMLParser
|
|
|
|
import click
|
|
from browser_cli.commands import _handle
|
|
from rich.console import Console
|
|
from rich.table import Table
|
|
|
|
# Shared Rich console used by the commands in this module for styled output.
console = Console()

# A fenced code block: an opening ``` line (optional info string), the body,
# and a closing ``` at the start of a line. DOTALL lets the body span lines.
_FENCE_RE = re.compile(r"```(?:[^\n`]*)\n.*?\n```", flags=re.DOTALL)

# A backslash-escaped underscore or hyphen; group 1 is the bare character.
_ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])")

# A Markdown table separator row, e.g. "| --- | :---: |".
_TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$")
|
|
|
|
|
|
class _HtmlNode:
|
|
def __init__(self, tag=None, attrs=None, text=None):
|
|
self.tag = tag
|
|
self.attrs = attrs or {}
|
|
self.text = text
|
|
self.children = []
|
|
|
|
|
|
class _HtmlTreeBuilder(HTMLParser):
    """Build a tree of ``_HtmlNode`` objects from HTML markup.

    Feed markup with ``feed()``; the result hangs off ``self.root``, a
    synthetic node tagged ``"document"``. ``self._stack`` is the chain of
    currently open elements, with the root always at index 0.
    """

    # Void elements as defined by the HTML standard. They never have an end
    # tag, so they must not be pushed onto the open-element stack; otherwise
    # the content that follows them would be attached as their children.
    _VOID_TAGS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.root = _HtmlNode(tag="document")
        self._stack = [self.root]

    def _append_element(self, tag, attrs):
        """Create an element node, attach it to the innermost open element, return it."""
        node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
        self._stack[-1].children.append(node)
        return node

    def handle_starttag(self, tag, attrs):
        """Attach a new element and open it unless it is a void element."""
        node = self._append_element(tag, attrs)
        if node.tag not in self._VOID_TAGS:
            self._stack.append(node)

    def handle_startendtag(self, tag, attrs):
        """Attach a self-closing element (``<x/>``) without opening it."""
        self._append_element(tag, attrs)

    def handle_endtag(self, tag):
        """Close the innermost open element with this tag, and everything inside it.

        End tags with no matching open element are ignored. The scan stops at
        index 1, so the root node is never popped.
        """
        lowered = tag.lower()
        for index in range(len(self._stack) - 1, 0, -1):
            if self._stack[index].tag == lowered:
                del self._stack[index:]
                break

    def handle_data(self, data):
        """Attach non-empty character data as a text node."""
        if data:
            self._stack[-1].children.append(_HtmlNode(text=data))
|
|
|
|
|
|
def _normalize_text(value):
|
|
return re.sub(r"\s+", " ", value or "").strip()
|
|
|
|
|
|
def _normalize_inline(value):
|
|
value = value.replace("\xa0", " ")
|
|
value = re.sub(r"[ \t\r\f\v]+", " ", value)
|
|
value = re.sub(r" *\n *", "\n", value)
|
|
return value.strip()
|
|
|
|
|
|
def _collapse_blank_lines(value):
|
|
value = re.sub(r"[ \t]+\n", "\n", value)
|
|
value = re.sub(r"\n{3,}", "\n\n", value)
|
|
return value.strip()
|
|
|
|
|
|
def _escape_markdown(text):
|
|
return re.sub(r"([\\`[\]])", r"\\\1", text)
|
|
|
|
|
|
def _escape_table_cell(text):
|
|
return text.replace("|", r"\|").replace("\n", " ").strip()
|
|
|
|
|
|
def _iter_descendants(node):
|
|
for child in getattr(node, "children", []):
|
|
yield child
|
|
yield from _iter_descendants(child)
|
|
|
|
|
|
def _has_class(node, class_name):
|
|
classes = (node.attrs.get("class") or "").split()
|
|
return class_name in classes
|
|
|
|
|
|
def _is_code_block_node(node):
|
|
if not node or not node.tag:
|
|
return False
|
|
if node.attrs.get("data-is-code-block-view") == "true":
|
|
return True
|
|
return node.tag == "pre"
|
|
|
|
|
|
def _inline_text(node):
    """Render *node* and its descendants as inline Markdown.

    Text nodes are escaped with ``_escape_markdown``. ``br`` becomes a
    newline, ``img`` becomes ``![alt](src)`` (or nothing when it has no
    ``src``), and ``a``, ``code``, ``strong``/``b`` and ``em``/``i`` are
    wrapped in the matching Markdown syntax. Any other element renders as the
    concatenation of its children, with a newline appended after each
    block-level child (``p``, ``div``, ``table``, ``ul``, ``ol``, ``pre``).

    Returns an empty string for a node that has neither text nor a tag.
    """
    if node.text is not None:
        return _escape_markdown(node.text)
    if not node.tag:
        return ""

    def rendered_children():
        # Inline rendering of all children, whitespace-normalised.
        return _normalize_inline("".join(_inline_text(child) for child in node.children))

    tag = node.tag
    if tag == "br":
        return "\n"
    if tag == "img":
        src = node.attrs.get("src") or ""
        if not src:
            return ""
        # Escape the alt text so a stray "]" or "[" cannot break the image syntax.
        alt = _escape_markdown(_normalize_text(node.attrs.get("alt") or ""))
        return f"![{alt}]({src})"
    if tag == "a":
        text = rendered_children()
        href = node.attrs.get("href") or ""
        # Without an href there is nothing to link to; emit the bare text.
        return f"[{text or href}]({href})" if href else text
    if tag == "code":
        text = rendered_children()
        if not text:
            return ""
        # Done outside the f-string: a backslash inside an f-string expression
        # is a SyntaxError before Python 3.12.
        escaped = text.replace("`", r"\\`")
        return f"`{escaped}`"
    if tag in {"strong", "b"}:
        text = rendered_children()
        return f"**{text}**" if text else ""
    if tag in {"em", "i"}:
        text = rendered_children()
        return f"*{text}*" if text else ""

    chunks = []
    for child in node.children:
        rendered = _inline_text(child)
        if rendered:
            chunks.append(rendered)
        # Block-level children end their line so adjacent blocks do not run together.
        if child.tag in {"p", "div", "table", "ul", "ol", "pre"}:
            chunks.append("\n")
    return "".join(chunks)
|
|
|
|
|
|
def _text_block(node):
    """Render the children of *node* as one normalised block of inline Markdown."""
    raw = "".join(map(_inline_text, node.children))
    tidied = _normalize_inline(raw)
    return _collapse_blank_lines(tidied)
|
|
|
|
|
|
def _inner_text_preserve(node):
|
|
if node.text is not None:
|
|
return node.text
|
|
if not node.tag:
|
|
return ""
|
|
if node.tag == "br":
|
|
return ""
|
|
return "".join(_inner_text_preserve(child) for child in node.children)
|
|
|
|
|
|
def _table_to_markdown(node):
    """Render a table subtree as a pipe-delimited Markdown table.

    Rows come from every ``tr`` descendant; only direct ``td``/``th``
    children count as cells, and rows without cells are skipped. Short rows
    are padded with empty cells. The first row is the header when the table
    has a direct ``thead`` child or its first ``tr`` holds a ``th``. A blank
    first row is skipped in favour of the second. Otherwise an empty header
    row is emitted and all rows become body rows. Returns ``""`` when no row
    has any cell.
    """
    row_nodes = [item for item in _iter_descendants(node) if item.tag == "tr"]

    rows = []
    for row_node in row_nodes:
        cells = [
            _escape_table_cell(_text_block(cell))
            for cell in row_node.children
            if cell.tag in {"td", "th"}
        ]
        if cells:
            rows.append(cells)
    if not rows:
        return ""

    column_count = max(map(len, rows))
    padded = [cells + [""] * (column_count - len(cells)) for cells in rows]

    leading_blank = not any(cell.strip() for cell in padded[0])
    if leading_blank and len(padded) > 1:
        headers, body_rows = padded[1], padded[2:]
    else:
        headers, body_rows = padded[0], padded[1:]

    has_thead = any(child.tag == "thead" for child in node.children)
    first_tr_has_th = any(child.tag == "th" for child in row_nodes[0].children)
    if not has_thead and not first_tr_has_th and not leading_blank:
        # Nothing marks the first row as a header: keep it as data.
        headers = [""] * column_count
        body_rows = padded

    def as_line(cells):
        return f"| {' | '.join(cells)} |"

    rendered = [as_line(headers), as_line(["---"] * column_count)]
    rendered.extend(as_line(cells) for cells in body_rows)
    return "\n".join(rendered)
|
|
|
|
|
|
def _list_to_markdown(node, depth=0):
    """Render a ``ul``/``ol`` element as Markdown list lines.

    Only direct ``li`` children are items. Ordered lists are numbered from 1;
    unordered lists use ``- ``. Each nesting level is indented by one space
    per *depth*. Extra lines of an item are aligned under its text, and
    nested lists are emitted after the item's own text.
    """
    is_ordered = node.tag == "ol"
    indent = " " * depth
    output = []
    position = 0
    for entry in node.children:
        if entry.tag != "li":
            continue
        position += 1
        marker = f"{position}. " if is_ordered else "- "

        inline_parts = []
        sublists = []
        for part in entry.children:
            if part.tag in {"ul", "ol"}:
                sublists.append(_list_to_markdown(part, depth + 1))
            else:
                inline_parts.append(_inline_text(part))

        text = _collapse_blank_lines(_normalize_inline("".join(inline_parts)))
        if text:
            first, *rest = text.splitlines()
            output.append(f"{indent}{marker}{first}")
            hanging = f"{indent}{' ' * len(marker)}"
            output.extend(f"{hanging}{extra}" for extra in rest)
        output.extend(filter(None, sublists))
    return "\n".join(output)
|
|
|
|
|
|
def _code_block_to_markdown(node):
    """Render a code-block element as a fenced Markdown block.

    For ``<pre>`` the preserved inner text is used. For any other element the
    code is rebuilt from descendants carrying the ``cm-line`` class, one line
    each. Trailing newlines are trimmed; empty code yields ``""``.
    """
    if node.tag == "pre":
        code = _inner_text_preserve(node).rstrip("\n")
    else:
        code = "\n".join(
            _inner_text_preserve(item)
            for item in _iter_descendants(node)
            if item.tag and _has_class(item, "cm-line")
        ).rstrip("\n")
    if not code:
        return ""
    return f"```\n{code}\n```"
|
|
|
|
|
|
def _block_to_markdown(node):
    """Render *node* as block-level Markdown.

    Text nodes are whitespace-normalised. Code blocks, tables, lists,
    headings, paragraphs/figcaptions, blockquotes, rules and images each have
    a dedicated rendering. Any other element becomes its non-empty child
    blocks separated by blank lines, falling back to an inline rendering of
    its children when no child produces a block.
    """
    if node.text is not None:
        return _normalize_text(node.text)

    tag = node.tag
    if not tag:
        return ""
    if _is_code_block_node(node):
        return _code_block_to_markdown(node)
    if tag == "table":
        return _table_to_markdown(node)
    if tag in {"ul", "ol"}:
        return _list_to_markdown(node)
    if re.fullmatch(r"h[1-6]", tag):
        heading = _text_block(node)
        if not heading:
            return ""
        level = int(tag[1])
        return f"{'#' * level} {heading}"
    if tag in {"p", "figcaption"}:
        return _text_block(node)
    if tag == "blockquote":
        parts = [block for block in map(_block_to_markdown, node.children) if block]
        quoted = _collapse_blank_lines("\n\n".join(parts))
        if not quoted:
            return ""
        # Blank lines inside the quote keep a bare ">" so the quote stays contiguous.
        return "\n".join(f"> {row}" if row else ">" for row in quoted.splitlines())
    if tag == "hr":
        return "---"
    if tag == "img":
        return _inline_text(node)

    blocks = list(filter(None, map(_block_to_markdown, node.children)))
    if not blocks:
        return _text_block(node)
    return _collapse_blank_lines("\n\n".join(blocks))
|
|
|
|
|
|
def _parse_table_row(line):
|
|
stripped = line.strip()
|
|
if not stripped.startswith("|") or not stripped.endswith("|"):
|
|
return None
|
|
return [cell.strip() for cell in stripped.strip("|").split("|")]
|
|
|
|
|
|
def _repair_table_headers(lines):
    """Drop an all-blank header row when the row below the separator has content.

    For each run of table row, separator row, table row where the first row's
    cells are all empty and the third has at least one non-empty cell, the
    third row is emitted as the header, followed by the separator. All other
    lines pass through stripped.
    """
    output = []
    total = len(lines)
    cursor = 0
    while cursor < total:
        window = lines[cursor:cursor + 3]
        if len(window) == 3:
            top, rule, below = window
            top_cells = _parse_table_row(top)
            below_cells = _parse_table_row(below)
            if (
                top_cells
                and below_cells is not None
                and _TABLE_SEPARATOR_RE.match(rule.strip())
                and not any(top_cells)
                and any(below_cells)
            ):
                output.append(below.strip())
                output.append(rule.strip())
                cursor += 3
                continue
        output.append(lines[cursor].strip())
        cursor += 1
    return output
|
|
|
|
|
|
def _repair_list_continuations(lines):
    """Re-indent the line that follows a list item so it reads as a continuation.

    Every line is stripped. A non-empty line directly after a list item is
    prefixed with spaces matching the item's marker width, unless it starts a
    new Markdown block (fence, heading, quote, table row, or another list
    item). Only the first line after an item is treated this way.
    """
    output = []
    # Indent for a continuation of the previous line, or None when the
    # previous line was not a list item.
    hanging_indent = None

    for raw in lines:
        text = raw.strip()
        starts_block = bool(
            not text
            or text.startswith(("```", "#", ">", "|"))
            or _TABLE_SEPARATOR_RE.match(text)
            or re.match(r"^(\s*)([-*+]|\d+\.)\s+", text)
        )

        if hanging_indent is not None and not starts_block:
            output.append(f"{hanging_indent}{text}")
            hanging_indent = None
            continue

        output.append(text)
        item = re.match(r"^(\s*)([-*+]|\d+\.)\s+.+$", text)
        if item:
            hanging_indent = f"{item.group(1)}{' ' * (len(item.group(2)) + 1)}"
        else:
            hanging_indent = None

    return output
|
|
|
|
|
|
def _repair_flattened_diagram(text):
|
|
if "\n" in text:
|
|
return text
|
|
if sum(text.count(char) for char in "│▼├└") < 2:
|
|
return text
|
|
|
|
text = re.sub(r"\s{2,}([│▼])", r"\n \1", text)
|
|
text = re.sub(r"([│▼])\s{2,}", r"\1\n", text)
|
|
text = re.sub(r"([│▼])(?=[^\s\n│▼├└])", r"\1\n", text)
|
|
text = re.sub(r"(?<=[^\s\n])([├└])", r"\n\1", text)
|
|
text = re.sub(r"([^\s\n])(\()", r"\1\n\2", text)
|
|
return "\n".join(line.rstrip() for line in text.splitlines() if line.strip())
|
|
|
|
|
|
def _convert_dash_lists_to_branches(lines):
|
|
converted = []
|
|
index = 0
|
|
while index < len(lines):
|
|
match = re.match(r"^(\s*)-\s+(.*)$", lines[index])
|
|
if not match:
|
|
converted.append(lines[index])
|
|
index += 1
|
|
continue
|
|
|
|
indent = match.group(1)
|
|
items = []
|
|
while index < len(lines):
|
|
next_match = re.match(rf"^{re.escape(indent)}-\s+(.*)$", lines[index])
|
|
if not next_match:
|
|
break
|
|
items.append(next_match.group(1))
|
|
index += 1
|
|
|
|
for item_index, item in enumerate(items):
|
|
branch = "└" if item_index == len(items) - 1 else "├"
|
|
converted.append(f"{indent}{branch} {item}")
|
|
return converted
|
|
|
|
|
|
def _clean_code_block(code):
    """Tidy the body of a fenced code block.

    Trailing whitespace is stripped from each line and blank lines at both
    ends are removed. The result goes through ``_repair_flattened_diagram``,
    lines consisting only of an unindented ``│`` or ``▼`` get a one-space
    indent, and dash lists are converted to branch glyphs.
    """
    rows = [row.rstrip() for row in code.splitlines()]
    filled = [position for position, row in enumerate(rows) if row.strip()]
    rows = rows[filled[0]:filled[-1] + 1] if filled else []

    repaired = _repair_flattened_diagram("\n".join(rows))
    rows = repaired.splitlines() if repaired else []

    realigned = []
    for row in rows:
        bare = row.strip()
        if bare in {"│", "▼"} and not re.match(r"^\s+[│▼]\s*$", row):
            realigned.append(f" {bare}")
        else:
            realigned.append(row)

    return "\n".join(_convert_dash_lists_to_branches(realigned))
|
|
|
|
|
|
def _clean_markdown_output(markdown):
    """Post-process Markdown, treating prose and fenced code blocks separately.

    Prose between fences has escaped ``_``/``-`` unescaped, lines stripped,
    table headers and list continuations repaired, and blank lines collapsed.
    Fence bodies go through ``_clean_code_block`` and are closed with a plain
    fence line. Non-empty pieces are joined with a blank line. Falsy input
    yields ``""``.
    """
    if not markdown:
        return ""

    def tidy_prose(segment):
        # Clean-up applied to every stretch of text outside a fence.
        unescaped = _ESCAPED_MARKDOWN_RE.sub(r"\1", segment)
        rows = [row.strip() for row in unescaped.splitlines()]
        rows = _repair_list_continuations(_repair_table_headers(rows))
        return _collapse_blank_lines("\n".join(rows))

    pieces = []
    cursor = 0
    for fence_match in _FENCE_RE.finditer(markdown):
        before = markdown[cursor:fence_match.start()]
        if before:
            pieces.append(tidy_prose(before))

        # Split the fence into its opening line, body, and closing line.
        opener, _, remainder = fence_match.group(0).partition("\n")
        body = remainder.rpartition("\n")[0]
        code = _clean_code_block(body)
        pieces.append(f"{opener}\n{code}\n```" if code else f"{opener}\n```")
        cursor = fence_match.end()

    tail = markdown[cursor:]
    if tail:
        pieces.append(tidy_prose(tail))

    # Prose that cleaned down to nothing is dropped here.
    return "\n\n".join(filter(None, pieces))
|
|
|
|
|
|
def _convert_html_to_markdown(html):
    """Parse *html* (``None`` is treated as empty) and return cleaned Markdown."""
    builder = _HtmlTreeBuilder()
    builder.feed(html if html else "")
    rendered = _block_to_markdown(builder.root)
    return _clean_markdown_output(rendered)
|
|
|
|
|
|
@click.group("extract")
def extract_group():
    """Extract content from the active tab."""
    # Click group named "extract"; the docstring above is its help text.
    # Sub-commands register themselves through @extract_group.command(...).
|
|
|
|
|
|
@extract_group.command("links")
def extract_links():
    """Extract all links from the active tab."""
    # Function-scope import so this fix does not depend on the module header.
    from rich.text import Text

    links = _handle("extract.links")
    if not links:
        console.print("[yellow]No links found[/yellow]")
        return

    table = Table(show_header=True, header_style="bold cyan")
    table.add_column("Text", width=40)
    table.add_column("URL")
    for lnk in links:
        # Link text and URLs come from the page. As plain ``str`` cells Rich
        # would parse them as console markup, so text such as "[1]" or "[/b]"
        # could vanish or raise MarkupError. ``Text`` renders them literally.
        table.add_row(
            Text((lnk.get("text") or "")[:60]),
            Text(lnk.get("href") or ""),
        )
    console.print(table)
|
|
|
|
|
|
@extract_group.command("images")
def extract_images():
    """Extract all images from the active tab."""
    # Function-scope import so this fix does not depend on the module header.
    from rich.text import Text

    images = _handle("extract.images")
    if not images:
        console.print("[yellow]No images found[/yellow]")
        return

    table = Table(show_header=True, header_style="bold cyan")
    table.add_column("Alt", width=30)
    table.add_column("Src")
    for img in images:
        # Alt text and URLs come from the page. Wrapping them in ``Text``
        # stops Rich from parsing them as console markup, where stray
        # brackets could vanish or raise MarkupError.
        table.add_row(
            Text((img.get("alt") or "")[:40]),
            Text(img.get("src") or ""),
        )
    console.print(table)
|
|
|
|
|
|
@extract_group.command("text")
def extract_text():
    """Extract all visible text from the active tab."""
    text = _handle("extract.text")
    # Page text is arbitrary content, not Rich markup. With markup parsing on,
    # bracketed text such as "[1]" or "[/b]" can be swallowed or raise
    # MarkupError, and ":name:" sequences are replaced by emoji.
    console.print(text or "", markup=False, emoji=False)
|
|
|
|
|
|
@extract_group.command("json")
@click.argument("selector")
def extract_json(selector):
    """Parse and pretty-print JSON content inside SELECTOR."""
    # The handler's result is re-serialised so Rich can pretty-print it.
    request = {"selector": selector}
    serialized = json.dumps(_handle("extract.json", request))
    console.print_json(serialized)
|
|
|
|
|
|
@extract_group.command("html")
def extract_html():
    """Print the full HTML of the active tab to stdout."""
    page_source = _handle("extract.html")
    if not page_source:
        # A falsy result still prints an (empty) line.
        page_source = ""
    click.echo(page_source)
|
|
|
|
|
|
@extract_group.command("markdown")
@click.option("--selector", help="Extract only the DOM subtree matching this CSS selector.")
def extract_markdown(selector):
    """Extract the page's main content as Markdown."""
    payload = _handle("extract.markdown", {"selector": selector}) or ""

    # A payload that starts with "<" is treated as HTML and converted here;
    # anything else is taken to be Markdown and only cleaned up.
    looks_like_html = payload.lstrip().startswith("<")
    convert = _convert_html_to_markdown if looks_like_html else _clean_markdown_output
    rendered = convert(payload) or ""

    # Add a trailing newline only if the output does not already end with one.
    click.echo(rendered, nl=not rendered.endswith("\n"))
|