Add better Markdown extraction and filtering to avoid broken output; allow session list to span multiple browsers

This commit is contained in:
2026-04-12 17:10:19 +02:00
parent 51054422fb
commit 64d804cf32
7 changed files with 899 additions and 22 deletions
+11
View File
@@ -346,6 +346,17 @@ class BrowserCLI:
return self._cmd("session.diff", {"nameA": name_a, "nameB": name_b})
def session_list(self) -> list[dict]:
    """List the saved sessions.

    When ``_collect_multi_browser`` yields per-browser results, the sessions
    of every browser are flattened into one list and each entry gains a
    ``browser`` key holding that target's ``display_name``. Otherwise the
    result of the plain ``session.list`` command is returned as-is.
    """
    per_browser = self._collect_multi_browser("session.list", {})
    if not per_browser:
        return self._cmd("session.list", {})

    merged = []
    for target, sessions in per_browser:
        # A browser may report ``None`` instead of a list; treat it as empty.
        for session in sessions or []:
            merged.append({**session, "browser": target.display_name})
    return merged
def session_remove(self, name: str) -> None:
+421 -1
View File
@@ -1,10 +1,426 @@
import click
import json
import re
from html.parser import HTMLParser
import click
from browser_cli.client import send_command, BrowserNotConnected
from rich.console import Console
from rich.table import Table
console = Console()
# Matches a complete fenced code block: an opening ``` line (with an optional
# info string containing no backticks), the body, and a closing ``` that
# directly follows a newline. DOTALL lets the body span multiple lines.
_FENCE_RE = re.compile(r"```(?:[^\n`]*)\n.*?\n```", re.DOTALL)
# Matches a backslash-escaped underscore or hyphen, capturing the bare character.
_ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])")
# Matches a Markdown table separator row such as ``| --- | :---: |``.
_TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$")
class _HtmlNode:
def __init__(self, tag=None, attrs=None, text=None):
self.tag = tag
self.attrs = attrs or {}
self.text = text
self.children = []
class _HtmlTreeBuilder(HTMLParser):
    """Build a tree of ``_HtmlNode`` objects from the HTML fed to the parser.

    The tree hangs off ``root`` (a synthetic ``document`` node). ``_stack``
    tracks the chain of currently open elements; new nodes are appended to
    the element on top of the stack.
    """

    # HTML void elements never have an end tag, so they must not be pushed on
    # the open-element stack. Leaving any of them out makes every following
    # sibling a child of the void element until an ancestor is closed.
    _VOID_TAGS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.root = _HtmlNode(tag="document")
        self._stack = [self.root]

    def handle_starttag(self, tag, attrs):
        """Append a new element and open it unless it is a void element."""
        node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
        self._stack[-1].children.append(node)
        if node.tag not in self._VOID_TAGS:
            self._stack.append(node)

    def handle_startendtag(self, tag, attrs):
        """Append a self-closing element (``<x/>``) without opening it."""
        node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
        self._stack[-1].children.append(node)

    def handle_endtag(self, tag):
        """Close the innermost open element named *tag*, if there is one.

        Elements opened after it are closed implicitly. An end tag with no
        matching open element is ignored. Index 0 (the root) is never popped.
        """
        lowered = tag.lower()
        for index in range(len(self._stack) - 1, 0, -1):
            if self._stack[index].tag == lowered:
                del self._stack[index:]
                break

    def handle_data(self, data):
        """Append non-empty character data as a text node."""
        if data:
            self._stack[-1].children.append(_HtmlNode(text=data))
def _normalize_text(value):
return re.sub(r"\s+", " ", value or "").strip()
def _normalize_inline(value):
value = value.replace("\xa0", " ")
value = re.sub(r"[ \t\r\f\v]+", " ", value)
value = re.sub(r" *\n *", "\n", value)
return value.strip()
def _collapse_blank_lines(value):
value = re.sub(r"[ \t]+\n", "\n", value)
value = re.sub(r"\n{3,}", "\n\n", value)
return value.strip()
def _escape_markdown(text):
return re.sub(r"([\\`[\]])", r"\\\1", text)
def _escape_table_cell(text):
return text.replace("|", r"\|").replace("\n", " ").strip()
def _iter_descendants(node):
for child in getattr(node, "children", []):
yield child
yield from _iter_descendants(child)
def _has_class(node, class_name):
classes = (node.attrs.get("class") or "").split()
return class_name in classes
def _is_code_block_node(node):
if not node or not node.tag:
return False
if node.attrs.get("data-is-code-block-view") == "true":
return True
return node.tag == "pre"
def _inline_code_span(node):
    """Render an inline ``<code>`` element as a Markdown code span."""
    # Backslash escapes are not interpreted inside code spans, so use the raw
    # text rather than the Markdown-escaped rendering of the children.
    raw = _normalize_inline(_inner_text_preserve(node))
    if not raw:
        return ""
    # The delimiter must be longer than any backtick run inside the content.
    longest_run = max((len(run) for run in re.findall(r"`+", raw)), default=0)
    delimiter = "`" * (longest_run + 1)
    # One space of padding keeps a leading/trailing backtick from merging
    # with the delimiter.
    padding = " " if raw.startswith("`") or raw.endswith("`") else ""
    return f"{delimiter}{padding}{raw}{padding}{delimiter}"


def _inline_text(node):
    """Render *node* and its subtree as inline Markdown.

    Text nodes are Markdown-escaped. ``<br>`` becomes a newline, ``<img>`` an
    image (dropped when it has no ``src``), ``<a>`` a link (plain text when it
    has no ``href``), ``<code>`` a code span, ``<strong>``/``<b>`` bold and
    ``<em>``/``<i>`` italic text. Any other element renders its children in
    order, adding a newline after block-like children so they stay separated.
    """
    if node.text is not None:
        return _escape_markdown(node.text)
    if not node.tag:
        return ""

    tag = node.tag
    if tag == "br":
        return "\n"
    if tag == "img":
        src = node.attrs.get("src") or ""
        if not src:
            return ""
        alt = _normalize_text(node.attrs.get("alt") or "")
        return f"![{_escape_markdown(alt)}]({src})" if alt else f"![]({src})"
    if tag == "code":
        return _inline_code_span(node)
    if tag in {"a", "strong", "b", "em", "i"}:
        text = _normalize_inline("".join(_inline_text(child) for child in node.children))
        if tag == "a":
            href = node.attrs.get("href") or ""
            return f"[{text or href}]({href})" if href else text
        if not text:
            return ""
        return f"**{text}**" if tag in {"strong", "b"} else f"*{text}*"

    chunks = []
    for child in node.children:
        rendered = _inline_text(child)
        if rendered:
            chunks.append(rendered)
        # Block-like children end with a newline so adjacent blocks do not
        # run together on one line.
        if child.tag in {"p", "div", "table", "ul", "ol", "pre"}:
            chunks.append("\n")
    return "".join(chunks)
def _text_block(node):
    """Render the children of *node* inline and normalize the whitespace."""
    fragments = [_inline_text(child) for child in node.children]
    inline = _normalize_inline("".join(fragments))
    return _collapse_blank_lines(inline)
def _inner_text_preserve(node):
if node.text is not None:
return node.text
if not node.tag:
return ""
if node.tag == "br":
return ""
return "".join(_inner_text_preserve(child) for child in node.children)
def _table_to_markdown(node):
    """Convert a ``<table>`` subtree into a Markdown pipe table.

    Every ``<tr>`` descendant with at least one ``<td>``/``<th>`` child becomes
    a row, and short rows are padded with empty cells. The first row is used
    as the header when the table has a direct ``<thead>`` child or its first
    ``<tr>`` contains a ``<th>``. A blank first row is skipped in favour of the
    second row (if any). Otherwise an empty header is emitted and all rows
    are body rows. Returns ``""`` when no row has cells.
    """
    tr_nodes = [item for item in _iter_descendants(node) if item.tag == "tr"]

    rows = []
    for tr in tr_nodes:
        cells = [
            _escape_table_cell(_text_block(cell))
            for cell in tr.children
            if cell.tag in {"td", "th"}
        ]
        if cells:
            rows.append(cells)
    if not rows:
        return ""

    column_count = max(map(len, rows))
    padded = [cells + [""] * (column_count - len(cells)) for cells in rows]

    blank_first = not any(cell.strip() for cell in padded[0])
    has_thead = any(child.tag == "thead" for child in node.children)
    first_has_th = any(child.tag == "th" for child in tr_nodes[0].children)

    if blank_first:
        # Promote the second row to header when one exists.
        if len(padded) > 1:
            header, body = padded[1], padded[2:]
        else:
            header, body = padded[0], padded[1:]
    elif has_thead or first_has_th:
        header, body = padded[0], padded[1:]
    else:
        # No recognizable header row: keep every row as data.
        header, body = [""] * column_count, padded

    def render(cells):
        return "| " + " | ".join(cells) + " |"

    output = [render(header), render(["---"] * column_count)]
    output.extend(render(cells) for cells in body)
    return "\n".join(output)
def _list_to_markdown(node, depth=0):
    """Convert a ``<ul>``/``<ol>`` subtree into Markdown list lines.

    Only ``<li>`` children are rendered. Ordered lists are numbered from 1;
    unordered lists use ``- ``. Nested lists inside an item are rendered
    recursively with ``depth + 1`` and placed after the item's own text.
    """
    is_ordered = node.tag == "ol"
    # NOTE(review): one space per nesting level, as in the original; confirm
    # this is the intended indentation width.
    indent = " " * depth
    output = []

    list_items = [child for child in node.children if child.tag == "li"]
    for number, li in enumerate(list_items, start=1):
        marker = f"{number}. " if is_ordered else "- "

        inline_parts = []
        sublists = []
        for part in li.children:
            if part.tag in {"ul", "ol"}:
                sublists.append(_list_to_markdown(part, depth + 1))
            else:
                inline_parts.append(_inline_text(part))

        text = _collapse_blank_lines(_normalize_inline("".join(inline_parts)))
        if text:
            first, *rest = text.splitlines()
            output.append(f"{indent}{marker}{first}")
            # Continuation lines align with the text after the marker.
            hanging = f"{indent}{' ' * len(marker)}"
            output.extend(f"{hanging}{extra}" for extra in rest)
        output.extend(sub for sub in sublists if sub)

    return "\n".join(output)
def _code_block_to_markdown(node):
    """Render a code-block element as a fenced Markdown block.

    For ``<pre>`` the raw inner text is used. For any other element the text
    of each descendant carrying the ``cm-line`` class becomes one line.
    Trailing newlines are dropped, and empty code yields ``""``.
    """
    if node.tag == "pre":
        code = _inner_text_preserve(node)
    else:
        code = "\n".join(
            _inner_text_preserve(item)
            for item in _iter_descendants(node)
            if item.tag and _has_class(item, "cm-line")
        )
    code = code.rstrip("\n")
    if not code:
        return ""
    return f"```\n{code}\n```"
# Elements rendered inline. A run of them (together with text nodes) inside a
# container forms a single paragraph instead of one block per node.
_INLINE_TAGS = frozenset(
    {
        "a", "abbr", "b", "br", "cite", "code", "del", "em", "i", "img",
        "ins", "kbd", "mark", "q", "s", "samp", "small", "span", "strong",
        "sub", "sup", "u", "var",
    }
)


def _is_inline_node(node):
    """Return whether *node* and its whole subtree consist of inline content."""
    if node.text is not None:
        return True
    if node.tag not in _INLINE_TAGS or _is_code_block_node(node):
        return False
    return all(_is_inline_node(child) for child in node.children)


def _children_to_markdown(node):
    """Render the children of a container, grouping inline runs into paragraphs."""
    blocks = []
    inline_run = []

    def flush_inline_run():
        if not inline_run:
            return
        rendered = "".join(_inline_text(child) for child in inline_run)
        inline_run.clear()
        text = _collapse_blank_lines(_normalize_inline(rendered))
        if text:
            blocks.append(text)

    for child in node.children:
        if _is_inline_node(child):
            inline_run.append(child)
            continue
        flush_inline_run()
        block = _block_to_markdown(child)
        if block:
            blocks.append(block)
    flush_inline_run()
    return _collapse_blank_lines("\n\n".join(blocks))


def _block_to_markdown(node):
    """Render *node* and its subtree as block-level Markdown.

    Code blocks, tables, lists, headings, paragraphs/captions, block quotes,
    horizontal rules and images have dedicated renderings. Inline elements
    are rendered with their inline formatting. Any other element is treated
    as a container: consecutive inline children are merged into a single
    paragraph, so links and emphasis placed directly inside a ``<div>`` keep
    their formatting, and the resulting blocks are separated by blank lines.
    Returns ``""`` when nothing is rendered.
    """
    if node.text is not None:
        return _normalize_text(node.text)
    if not node.tag:
        return ""
    if _is_code_block_node(node):
        return _code_block_to_markdown(node)
    if node.tag == "table":
        return _table_to_markdown(node)
    if node.tag in {"ul", "ol"}:
        return _list_to_markdown(node)
    if re.fullmatch(r"h[1-6]", node.tag):
        text = _text_block(node)
        return f"{'#' * int(node.tag[1])} {text}" if text else ""
    if node.tag in {"p", "figcaption"}:
        return _text_block(node)
    if node.tag == "blockquote":
        content = _children_to_markdown(node)
        if not content:
            return ""
        # Blank lines inside the quote keep a bare ">" so the quote is not split.
        return "\n".join(f"> {line}" if line else ">" for line in content.splitlines())
    if node.tag == "hr":
        return "---"
    if node.tag == "img":
        return _inline_text(node)
    if _is_inline_node(node):
        return _collapse_blank_lines(_normalize_inline(_inline_text(node)))
    return _children_to_markdown(node) or _text_block(node)
def _parse_table_row(line):
stripped = line.strip()
if not stripped.startswith("|") or not stripped.endswith("|"):
return None
return [cell.strip() for cell in stripped.strip("|").split("|")]
def _repair_table_headers(lines):
    """Replace blank table headers with the first data row.

    Whenever a row whose cells are all empty is followed by a separator row
    and then a row with at least one non-empty cell, the blank row is dropped
    and the data row is emitted before the separator. All returned lines are
    stripped.
    """
    output = []
    position = 0
    total = len(lines)
    while position < total:
        window = lines[position:position + 3]
        if len(window) == 3:
            header_cells = _parse_table_row(window[0])
            data_cells = _parse_table_row(window[2])
            promote = (
                header_cells is not None
                and data_cells is not None
                and _TABLE_SEPARATOR_RE.match(window[1].strip())
                and header_cells
                and all(not cell for cell in header_cells)
                and any(cell for cell in data_cells)
            )
            if promote:
                output.append(window[2].strip())
                output.append(window[1].strip())
                position += 3
                continue
        output.append(lines[position].strip())
        position += 1
    return output
def _repair_list_continuations(lines):
    """Indent the line that directly follows a list item as its continuation.

    Every line is stripped. A non-empty line right after a list item is
    indented to align with the item text, unless it starts a new Markdown
    block (fence, heading, quote, table row/separator, or another list item).
    Only the first line after an item is treated as a continuation.
    """
    item_pattern = re.compile(r"^(\s*)([-*+]|\d+\.)\s+.+$")
    marker_pattern = re.compile(r"^(\s*)([-*+]|\d+\.)\s+")

    output = []
    # Indentation for a continuation line, or None when the previous line
    # was not a list item.
    pending_indent = None
    for raw_line in lines:
        text = raw_line.strip()
        starts_block = (
            not text
            or text.startswith(("```", "#", ">", "|"))
            or bool(_TABLE_SEPARATOR_RE.match(text))
            or bool(marker_pattern.match(text))
        )
        if pending_indent is not None and not starts_block:
            output.append(f"{pending_indent}{text}")
            pending_indent = None
            continue

        output.append(text)
        item = item_pattern.match(text)
        if item is None:
            pending_indent = None
        else:
            pending_indent = f"{item.group(1)}{' ' * (len(item.group(2)) + 1)}"
    return output
def _repair_flattened_diagram(text):
if "\n" in text:
return text
if sum(text.count(char) for char in "│▼├└") < 2:
return text
text = re.sub(r"\s{2,}([│▼])", r"\n \1", text)
text = re.sub(r"([│▼])\s{2,}", r"\1\n", text)
text = re.sub(r"([│▼])(?=[^\s\n│▼├└])", r"\1\n", text)
text = re.sub(r"(?<=[^\s\n])([├└])", r"\n\1", text)
text = re.sub(r"([^\s\n])(\()", r"\1\n\2", text)
return "\n".join(line.rstrip() for line in text.splitlines() if line.strip())
def _convert_dash_lists_to_branches(lines):
converted = []
index = 0
while index < len(lines):
match = re.match(r"^(\s*)-\s+(.*)$", lines[index])
if not match:
converted.append(lines[index])
index += 1
continue
indent = match.group(1)
items = []
while index < len(lines):
next_match = re.match(rf"^{re.escape(indent)}-\s+(.*)$", lines[index])
if not next_match:
break
items.append(next_match.group(1))
index += 1
for item_index, item in enumerate(items):
branch = "" if item_index == len(items) - 1 else ""
converted.append(f"{indent}{branch} {item}")
return converted
def _clean_code_block(code):
    """Tidy the body of a fenced code block.

    Trailing whitespace is removed from every line, leading and trailing
    blank lines are dropped, a flattened diagram is re-broken into lines,
    selected lines are re-indented, and dash lists are turned into branch
    lines.
    """
    lines = [line.rstrip() for line in code.splitlines()]
    non_blank = [index for index, line in enumerate(lines) if line.strip()]
    lines = lines[non_blank[0]:non_blank[-1] + 1] if non_blank else []

    repaired = _repair_flattened_diagram("\n".join(lines))
    lines = repaired.splitlines() if repaired else []

    adjusted = []
    for line in lines:
        bare = line.strip()
        # NOTE(review): the membership set holds only empty strings here;
        # presumably it was meant to list connector glyphs. Confirm against
        # the original source.
        if bare in {"", ""} and not re.match(r"^\s+[│▼]\s*$", line):
            adjusted.append(f" {bare}")
        else:
            adjusted.append(line)

    return "\n".join(_convert_dash_lists_to_branches(adjusted))
def _clean_markdown_output(markdown):
    """Clean up Markdown, treating prose and fenced code blocks separately.

    Prose segments get ``\\_``/``\\-`` unescaped, their lines stripped, blank
    table headers and list continuations repaired, and blank lines collapsed.
    Fenced blocks keep their opening line and have their body cleaned by
    ``_clean_code_block``. The pieces are joined with a blank line between
    them. A falsy *markdown* yields ``""``.
    """
    if not markdown:
        return ""

    def tidy_prose(segment):
        if not segment:
            return ""
        unescaped = _ESCAPED_MARKDOWN_RE.sub(r"\1", segment)
        stripped_lines = [line.strip() for line in unescaped.splitlines()]
        repaired = _repair_list_continuations(_repair_table_headers(stripped_lines))
        return _collapse_blank_lines("\n".join(repaired))

    pieces = []
    cursor = 0
    for fence_match in _FENCE_RE.finditer(markdown):
        pieces.append(tidy_prose(markdown[cursor:fence_match.start()]))

        # Split the fence into its opening line and body; the closing
        # ``` line is discarded and re-added below.
        header, _, tail = fence_match.group(0).partition("\n")
        body = tail.rpartition("\n")[0]
        cleaned_body = _clean_code_block(body)
        if cleaned_body:
            pieces.append(f"{header}\n{cleaned_body}\n```")
        else:
            pieces.append(f"{header}\n```")
        cursor = fence_match.end()

    pieces.append(tidy_prose(markdown[cursor:]))
    return "\n\n".join(piece for piece in pieces if piece)
def _convert_html_to_markdown(html):
    """Convert an HTML string to cleaned Markdown.

    The HTML is parsed into a node tree, rendered as block-level Markdown,
    and passed through the Markdown cleanup. A falsy *html* is treated as an
    empty document.
    """
    parser = _HtmlTreeBuilder()
    parser.feed(html or "")
    # ``feed`` may hold back trailing data (for example text ending in an
    # unterminated ``&`` reference); ``close`` forces it to be processed.
    parser.close()
    markdown = _block_to_markdown(parser.root)
    return _clean_markdown_output(markdown)
def _handle(command, args=None):
@@ -80,4 +496,8 @@ def extract_html():
def extract_markdown(selector):
    """Extract the page's main content as Markdown."""
    raw = _handle("extract.markdown", {"selector": selector})
    text = raw or ""
    # A result whose first non-blank character is "<" is treated as HTML and
    # converted; anything else is only cleaned up as Markdown.
    if text.lstrip().startswith("<"):
        result = _convert_html_to_markdown(raw)
    else:
        result = _clean_markdown_output(text)
    output = result or ""
    # Only add a newline when the output does not already end with one.
    click.echo(output, nl=not output.endswith("\n"))
+39 -6
View File
@@ -1,14 +1,13 @@
import click
import json
from browser_cli.client import send_command, BrowserNotConnected
from browser_cli.client import active_browser_targets, send_command, BrowserNotConnected
from rich.console import Console
console = Console()
def _handle(command, args=None):
def _handle(command, args=None, profile=None):
try:
return send_command(command, args or {})
return send_command(command, args or {}, profile=profile)
except BrowserNotConnected as e:
console.print(f"[red]Error:[/red] {e}")
raise SystemExit(1)
@@ -17,6 +16,23 @@ def _handle(command, args=None):
raise SystemExit(1)
def _handle_multi(command, args=None, profile=None):
    """Send *command* to one browser profile, returning ``None`` on failure.

    ``BrowserNotConnected`` and ``RuntimeError`` are swallowed so a caller
    querying several browsers can skip the ones that fail.
    """
    payload = args or {}
    try:
        result = send_command(command, payload, profile=profile)
    except (BrowserNotConnected, RuntimeError):
        return None
    return result
def _multi_browser_targets():
    """Return the active browser targets when several should be queried.

    The result is empty when the root Click context's ``obj`` has a truthy
    ``browser_explicit`` entry, or when at most one target is active.
    """
    root_ctx = click.get_current_context().find_root()
    if not root_ctx.obj.get("browser_explicit"):
        candidates = active_browser_targets()
        if len(candidates) > 1:
            return candidates
    return []
# Click command group named ``session``. Click uses the docstring below as
# the group's help text, so it is kept verbatim.
@click.group("session")
def session_group():
    """Save and restore browser sessions."""
@@ -71,18 +87,35 @@ def session_diff(name_a, name_b):
def session_list():
    """List all saved sessions."""
    # Imported locally, as in the original; hoisted out of the row loop.
    from datetime import datetime

    from rich.table import Table

    targets = _multi_browser_targets()
    show_browser = bool(targets)
    if targets:
        sessions = []
        any_browser_answered = False
        for target in targets:
            result = _handle_multi("session.list", profile=target.profile)
            if result is None:
                # This browser could not be reached; try the others.
                continue
            any_browser_answered = True
            sessions.extend({**session, "browser": target.display_name} for session in result)
        # Only fail when no browser answered at all. Browsers that answered
        # with zero sessions fall through to the "No saved sessions" message.
        if not any_browser_answered:
            console.print("[red]Error:[/red] Cannot resolve a browser socket automatically.")
            raise SystemExit(1)
    else:
        sessions = _handle("session.list")

    if not sessions:
        console.print("[yellow]No saved sessions[/yellow]")
        return

    table = Table(show_header=True, header_style="bold cyan")
    if show_browser:
        table.add_column("Browser")
    table.add_column("Name")
    table.add_column("Tabs", width=6)
    table.add_column("Saved at")
    for s in sessions:
        # ``savedAt`` is divided by 1000 before conversion, i.e. it is
        # handled as a millisecond timestamp.
        saved = datetime.fromtimestamp(s["savedAt"] / 1000).strftime("%Y-%m-%d %H:%M") if s.get("savedAt") else ""
        # Exactly one row per session, with as many cells as columns.
        row = [s.get("browser", "")] if show_browser else []
        row.extend([s["name"], str(s["tabs"]), saved])
        table.add_row(*row)
    console.print(table)