Improve Markdown extraction and filter the output so it does not contain broken stuff; allow `session list` to list sessions across multiple browsers

2026-04-12 17:10:19 +02:00
parent 51054422fb
commit 64d804cf32
7 changed files with 899 additions and 22 deletions
@@ -1,10 +1,426 @@
-import click
 import json
+import re
+from html.parser import HTMLParser
+
+import click
 from browser_cli.client import send_command, BrowserNotConnected
 from rich.console import Console
 from rich.table import Table

 console = Console()
+_FENCE_RE = re.compile(r"```(?:[^\n`]*)\n.*?\n```", re.DOTALL)
+_ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])")
+_TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$")
+
+
+class _HtmlNode:
+    def __init__(self, tag=None, attrs=None, text=None):
+        self.tag = tag
+        self.attrs = attrs or {}
+        self.text = text
+        self.children = []
+
+
+class _HtmlTreeBuilder(HTMLParser):
+    _VOID_TAGS = {"br", "hr", "img"}
+
+    def __init__(self):
+        super().__init__(convert_charrefs=True)
+        self.root = _HtmlNode(tag="document")
+        self._stack = [self.root]
+
+    def handle_starttag(self, tag, attrs):
+        node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
+        self._stack[-1].children.append(node)
+        if node.tag not in self._VOID_TAGS:
+            self._stack.append(node)
+
+    def handle_startendtag(self, tag, attrs):
+        node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
+        self._stack[-1].children.append(node)
+
+    def handle_endtag(self, tag):
+        lowered = tag.lower()
+        for index in range(len(self._stack) - 1, 0, -1):
+            if self._stack[index].tag == lowered:
+                del self._stack[index:]
+                break
+
+    def handle_data(self, data):
+        if data:
+            self._stack[-1].children.append(_HtmlNode(text=data))
+
+
+def _normalize_text(value):
+    return re.sub(r"\s+", " ", value or "").strip()
+
+
+def _normalize_inline(value):
+    value = value.replace("\xa0", " ")
+    value = re.sub(r"[ \t\r\f\v]+", " ", value)
+    value = re.sub(r" *\n *", "\n", value)
+    return value.strip()
+
+
+def _collapse_blank_lines(value):
+    value = re.sub(r"[ \t]+\n", "\n", value)
+    value = re.sub(r"\n{3,}", "\n\n", value)
+    return value.strip()
+
+
+def _escape_markdown(text):
+    return re.sub(r"([\\`[\]])", r"\\\1", text)
+
+
+def _escape_table_cell(text):
+    return text.replace("|", r"\|").replace("\n", " ").strip()
+
+
+def _iter_descendants(node):
+    for child in getattr(node, "children", []):
+        yield child
+        yield from _iter_descendants(child)
+
+
+def _has_class(node, class_name):
+    classes = (node.attrs.get("class") or "").split()
+    return class_name in classes
+
+
+def _is_code_block_node(node):
+    if not node or not node.tag:
+        return False
+    if node.attrs.get("data-is-code-block-view") == "true":
+        return True
+    return node.tag == "pre"
+
+
+def _inline_text(node):
+    if node.text is not None:
+        return _escape_markdown(node.text)
+    if not node.tag:
+        return ""
+
+    tag = node.tag
+    if tag == "br":
+        return "\n"
+    if tag == "img":
+        src = node.attrs.get("src") or ""
+        alt = _normalize_text(node.attrs.get("alt") or "")
+        if not src:
+            return ""
+        return f"![{_escape_markdown(alt)}]({src})" if alt else f"![]({src})"
+    if tag == "a":
+        text = _normalize_inline("".join(_inline_text(child) for child in node.children))
+        href = node.attrs.get("href") or ""
+        return f"[{text or href}]({href})" if href else text
+    if tag == "code":
+        text = _normalize_inline("".join(_inline_text(child) for child in node.children))
+        return f"`{text.replace('`', r'\\`')}`" if text else ""
+    if tag in {"strong", "b"}:
+        text = _normalize_inline("".join(_inline_text(child) for child in node.children))
+        return f"**{text}**" if text else ""
+    if tag in {"em", "i"}:
+        text = _normalize_inline("".join(_inline_text(child) for child in node.children))
+        return f"*{text}*" if text else ""
+
+    chunks = []
+    for child in node.children:
+        rendered = _inline_text(child)
+        if rendered:
+            chunks.append(rendered)
+            if child.tag in {"p", "div", "table", "ul", "ol", "pre"}:
+                chunks.append("\n")
+    return "".join(chunks)
+
+
+def _text_block(node):
+    return _collapse_blank_lines(_normalize_inline("".join(_inline_text(child) for child in node.children)))
+
+
+def _inner_text_preserve(node):
+    if node.text is not None:
+        return node.text
+    if not node.tag:
+        return ""
+    if node.tag == "br":
+        return ""
+    return "".join(_inner_text_preserve(child) for child in node.children)
+
+
+def _table_to_markdown(node):
+    rows = []
+    for descendant in _iter_descendants(node):
+        if descendant.tag != "tr":
+            continue
+        row = []
+        for cell in descendant.children:
+            if cell.tag in {"td", "th"}:
+                row.append(_escape_table_cell(_text_block(cell)))
+        if row:
+            rows.append(row)
+    if not rows:
+        return ""
+
+    widths = max(len(row) for row in rows)
+    normalized_rows = [row + [""] * (widths - len(row)) for row in rows]
+
+    headers = normalized_rows[0]
+    body_rows = normalized_rows[1:]
+    first_row_blank = all(not cell.strip() for cell in headers)
+    if first_row_blank and len(normalized_rows) > 1:
+        headers = normalized_rows[1]
+        body_rows = normalized_rows[2:]
+
+    has_thead = any(child.tag == "thead" for child in node.children)
+    first_row = next((child for child in _iter_descendants(node) if child.tag == "tr"), None)
+    first_row_has_th = bool(first_row and any(child.tag == "th" for child in first_row.children))
+    if not (has_thead or first_row_has_th or first_row_blank):
+        headers = [""] * widths
+        body_rows = normalized_rows
+
+    separator = ["---"] * widths
+    lines = [
+        f"| {' | '.join(headers)} |",
+        f"| {' | '.join(separator)} |",
+    ]
+    lines.extend(f"| {' | '.join(row)} |" for row in body_rows)
+    return "\n".join(lines)
+
+
+def _list_to_markdown(node, depth=0):
+    ordered = node.tag == "ol"
+    items = []
+    index = 1
+    for child in node.children:
+        if child.tag != "li":
+            continue
+        marker = f"{index}. " if ordered else "- "
+        index += 1
+        content = []
+        nested = []
+        for item_child in child.children:
+            if item_child.tag in {"ul", "ol"}:
+                nested.append(_list_to_markdown(item_child, depth + 1))
+            else:
+                content.append(_inline_text(item_child))
+        line = _collapse_blank_lines(_normalize_inline("".join(content)))
+        indent = "  " * depth
+        if line:
+            line_parts = line.splitlines()
+            items.append(f"{indent}{marker}{line_parts[0]}")
+            continuation_indent = f"{indent}{' ' * len(marker)}"
+            items.extend(f"{continuation_indent}{part}" for part in line_parts[1:])
+        items.extend(block for block in nested if block)
+    return "\n".join(items)
+
+
+def _code_block_to_markdown(node):
+    if node.tag == "pre":
+        text = _inner_text_preserve(node).rstrip("\n")
+        return f"```\n{text}\n```" if text else ""
+
+    lines = []
+    for descendant in _iter_descendants(node):
+        if descendant.tag and _has_class(descendant, "cm-line"):
+            lines.append(_inner_text_preserve(descendant))
+    code = "\n".join(lines).rstrip("\n")
+    return f"```\n{code}\n```" if code else ""
+
+
+def _block_to_markdown(node):
+    if node.text is not None:
+        return _normalize_text(node.text)
+    if not node.tag:
+        return ""
+    if _is_code_block_node(node):
+        return _code_block_to_markdown(node)
+    if node.tag == "table":
+        return _table_to_markdown(node)
+    if node.tag in {"ul", "ol"}:
+        return _list_to_markdown(node)
+    if re.fullmatch(r"h[1-6]", node.tag):
+        text = _text_block(node)
+        return f"{'#' * int(node.tag[1])} {text}" if text else ""
+    if node.tag in {"p", "figcaption"}:
+        return _text_block(node)
+    if node.tag == "blockquote":
+        content = _collapse_blank_lines("\n\n".join(filter(None, (_block_to_markdown(child) for child in node.children))))
+        return "\n".join(f"> {line}" if line else ">" for line in content.splitlines()) if content else ""
+    if node.tag == "hr":
+        return "---"
+    if node.tag == "img":
+        return _inline_text(node)
+
+    child_blocks = [block for block in (_block_to_markdown(child) for child in node.children) if block]
+    if child_blocks:
+        return _collapse_blank_lines("\n\n".join(child_blocks))
+    return _text_block(node)
+
+
+def _parse_table_row(line):
+    stripped = line.strip()
+    if not stripped.startswith("|") or not stripped.endswith("|"):
+        return None
+    return [cell.strip() for cell in stripped.strip("|").split("|")]
+
+
+def _repair_table_headers(lines):
+    repaired = []
+    index = 0
+    while index < len(lines):
+        if (
+            index + 2 < len(lines)
+            and _parse_table_row(lines[index]) is not None
+            and _TABLE_SEPARATOR_RE.match(lines[index + 1].strip())
+            and _parse_table_row(lines[index + 2]) is not None
+        ):
+            first = _parse_table_row(lines[index])
+            third = _parse_table_row(lines[index + 2])
+            if first and all(not cell for cell in first) and any(cell for cell in third):
+                repaired.append(lines[index + 2].strip())
+                repaired.append(lines[index + 1].strip())
+                index += 3
+                continue
+        repaired.append(lines[index].strip())
+        index += 1
+    return repaired
+
+
+def _repair_list_continuations(lines):
+    repaired = []
+    previous_was_list_item = False
+    previous_continuation_indent = ""
+
+    for line in lines:
+        stripped = line.strip()
+        list_match = re.match(r"^(\s*)([-*+]|\d+\.)\s+.+$", stripped)
+        is_markdown_block_start = (
+            not stripped
+            or stripped.startswith(("```", "#", ">", "|"))
+            or _TABLE_SEPARATOR_RE.match(stripped)
+            or re.match(r"^(\s*)([-*+]|\d+\.)\s+", stripped)
+        )
+
+        if previous_was_list_item and stripped and not is_markdown_block_start:
+            repaired.append(f"{previous_continuation_indent}{stripped}")
+            previous_was_list_item = False
+            continue
+
+        repaired.append(stripped)
+        if list_match:
+            marker = list_match.group(2)
+            base_indent = list_match.group(1)
+            previous_continuation_indent = f"{base_indent}{' ' * (len(marker) + 1)}"
+            previous_was_list_item = True
+        else:
+            previous_was_list_item = False
+
+    return repaired
+
+
+def _repair_flattened_diagram(text):
+    if "\n" in text:
+        return text
+    if sum(text.count(char) for char in "│▼├└") < 2:
+        return text
+
+    text = re.sub(r"\s{2,}([│▼])", r"\n   \1", text)
+    text = re.sub(r"([│▼])\s{2,}", r"\1\n", text)
+    text = re.sub(r"([│▼])(?=[^\s\n│▼├└])", r"\1\n", text)
+    text = re.sub(r"(?<=[^\s\n])([├└])", r"\n\1", text)
+    text = re.sub(r"([^\s\n])(\()", r"\1\n\2", text)
+    return "\n".join(line.rstrip() for line in text.splitlines() if line.strip())
+
+
+def _convert_dash_lists_to_branches(lines):
+    converted = []
+    index = 0
+    while index < len(lines):
+        match = re.match(r"^(\s*)-\s+(.*)$", lines[index])
+        if not match:
+            converted.append(lines[index])
+            index += 1
+            continue
+
+        indent = match.group(1)
+        items = []
+        while index < len(lines):
+            next_match = re.match(rf"^{re.escape(indent)}-\s+(.*)$", lines[index])
+            if not next_match:
+                break
+            items.append(next_match.group(1))
+            index += 1
+
+        for item_index, item in enumerate(items):
+            branch = "└" if item_index == len(items) - 1 else "├"
+            converted.append(f"{indent}{branch} {item}")
+    return converted
+
+
+def _clean_code_block(code):
+    lines = [line.rstrip() for line in code.splitlines()]
+    while lines and not lines[0].strip():
+        lines.pop(0)
+    while lines and not lines[-1].strip():
+        lines.pop()
+
+    flattened = _repair_flattened_diagram("\n".join(lines))
+    lines = flattened.splitlines() if flattened else []
+    lines = [
+        f"   {line.strip()}"
+        if line.strip() in {"│", "▼"} and not re.match(r"^\s+[│▼]\s*$", line)
+        else line
+        for line in lines
+    ]
+    lines = _convert_dash_lists_to_branches(lines)
+    return "\n".join(lines)
+
+
+def _clean_markdown_output(markdown):
+    if not markdown:
+        return ""
+
+    pieces = []
+    last_index = 0
+    for match in _FENCE_RE.finditer(markdown):
+        prose = markdown[last_index:match.start()]
+        if prose:
+            cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", prose)
+            lines = [line.strip() for line in cleaned.splitlines()]
+            lines = _repair_table_headers(lines)
+            lines = _repair_list_continuations(lines)
+            cleaned = "\n".join(lines)
+            cleaned = _collapse_blank_lines(cleaned)
+            if cleaned:
+                pieces.append(cleaned)
+
+        fence = match.group(0)
+        header, _, tail = fence.partition("\n")
+        body, _, _ = tail.rpartition("\n")
+        cleaned_body = _clean_code_block(body)
+        pieces.append(f"{header}\n{cleaned_body}\n```" if cleaned_body else f"{header}\n```")
+        last_index = match.end()
+
+    trailing = markdown[last_index:]
+    if trailing:
+        cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", trailing)
+        lines = [line.strip() for line in cleaned.splitlines()]
+        lines = _repair_table_headers(lines)
+        lines = _repair_list_continuations(lines)
+        cleaned = "\n".join(lines)
+        cleaned = _collapse_blank_lines(cleaned)
+        if cleaned:
+            pieces.append(cleaned)
+
+    return "\n\n".join(piece for piece in pieces if piece)
+
+
+def _convert_html_to_markdown(html):
+    parser = _HtmlTreeBuilder()
+    parser.feed(html or "")
+    markdown = _block_to_markdown(parser.root)
+    return _clean_markdown_output(markdown)


 def _handle(command, args=None):
@@ -80,4 +496,8 @@ def extract_html():
 def extract_markdown(selector):
    """Extract the page's main content as Markdown."""
    markdown = _handle("extract.markdown", {"selector": selector})
+    if (markdown or "").lstrip().startswith("<"):
+        markdown = _convert_html_to_markdown(markdown)
+    else:
+        markdown = _clean_markdown_output(markdown or "")
    click.echo(markdown or "", nl=not (markdown or "").endswith("\n"))
@@ -1,14 +1,13 @@
 import click
-import json
-from browser_cli.client import send_command, BrowserNotConnected
+from browser_cli.client import active_browser_targets, send_command, BrowserNotConnected
 from rich.console import Console

 console = Console()


-def _handle(command, args=None):
+def _handle(command, args=None, profile=None):
    try:
-        return send_command(command, args or {})
+        return send_command(command, args or {}, profile=profile)
    except BrowserNotConnected as e:
        console.print(f"[red]Error:[/red] {e}")
        raise SystemExit(1)
@@ -17,6 +16,23 @@ def _handle(command, args=None):
        raise SystemExit(1)


+def _handle_multi(command, args=None, profile=None):
+    try:
+        return send_command(command, args or {}, profile=profile)
+    except (BrowserNotConnected, RuntimeError):
+        return None
+
+
+def _multi_browser_targets():
+    root = click.get_current_context().find_root()
+    if root.obj.get("browser_explicit"):
+        return []
+    targets = active_browser_targets()
+    if len(targets) <= 1:
+        return []
+    return targets
+
+
@click.group("session")
 def session_group():
    """Save and restore browser sessions."""
@@ -71,18 +87,35 @@ def session_diff(name_a, name_b):
 def session_list():
    """List all saved sessions."""
    from rich.table import Table
-    sessions = _handle("session.list")
+    targets = _multi_browser_targets()
+    show_browser = bool(targets)
+    if targets:
+        sessions = []
+        for target in targets:
+            result = _handle_multi("session.list", profile=target.profile)
+            if result is None:
+                continue
+            sessions.extend({**session, "browser": target.display_name} for session in result)
+        if not sessions:
+            console.print("[red]Error:[/red] Cannot resolve a browser socket automatically.")
+            raise SystemExit(1)
+    else:
+        sessions = _handle("session.list")
    if not sessions:
        console.print("[yellow]No saved sessions[/yellow]")
        return
    table = Table(show_header=True, header_style="bold cyan")
+    if show_browser:
+        table.add_column("Browser")
    table.add_column("Name")
    table.add_column("Tabs", width=6)
    table.add_column("Saved at")
    for s in sessions:
        from datetime import datetime
        saved = datetime.fromtimestamp(s["savedAt"] / 1000).strftime("%Y-%m-%d %H:%M") if s.get("savedAt") else ""
-        table.add_row(s["name"], str(s["tabs"]), saved)
+        row = [s.get("browser", "")] if show_browser else []
+        row.extend([s["name"], str(s["tabs"]), saved])
+        table.add_row(*row)
    console.print(table)