diff --git a/README.md b/README.md index d2cf816..a8edeea 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ browser-cli/ All commands are run with `uv run browser-cli [--browser ALIAS] `. -If exactly one browser instance is connected, commands auto-target it. Use `--browser ALIAS` when multiple browser instances are connected. `tabs list`, `tabs count`, `groups list`, `groups count`, and `windows list` are the only commands that aggregate across all active browsers when `--browser` is omitted; in that mode they show the source browser alias or UUID. You can inspect the active instances with `browser-cli clients` and assign a persistent profile alias from inside the target browser with `browser-cli clients rename --browser `. Closed browsers are removed from the client registry automatically. +If exactly one browser instance is connected, commands auto-target it. Use `--browser ALIAS` when multiple browser instances are connected. `tabs list`, `tabs count`, `groups list`, `groups count`, `windows list`, and `session list` aggregate across all active browsers when `--browser` is omitted; in that mode they show the source browser alias or UUID. You can inspect the active instances with `browser-cli clients` and assign a persistent profile alias from inside the target browser with `browser-cli clients rename --browser `. Closed browsers are removed from the client registry automatically. Important: profile aliases are browser-instance aliases, not window aliases. Window aliases created with `windows rename` are only for targeting windows in commands like `nav open --window work`. If a browser instance has no explicit profile alias set, the native host gives it a generated UUID alias so multiple unaliased browsers stay distinct. 
diff --git a/browser_cli/__init__.py b/browser_cli/__init__.py index 22b9caa..f0b3644 100644 --- a/browser_cli/__init__.py +++ b/browser_cli/__init__.py @@ -346,6 +346,17 @@ class BrowserCLI: return self._cmd("session.diff", {"nameA": name_a, "nameB": name_b}) def session_list(self) -> list[dict]: + """Return saved sessions. + + In implicit multi-browser mode each session dict includes a ``browser`` key. + """ + multi_results = self._collect_multi_browser("session.list", {}) + if multi_results: + return [ + {**session, "browser": target.display_name} + for target, sessions in multi_results + for session in (sessions or []) + ] return self._cmd("session.list", {}) def session_remove(self, name: str) -> None: diff --git a/browser_cli/commands/extract.py b/browser_cli/commands/extract.py index 34c12e8..6918f9f 100644 --- a/browser_cli/commands/extract.py +++ b/browser_cli/commands/extract.py @@ -1,10 +1,426 @@ -import click import json +import re +from html.parser import HTMLParser + +import click from browser_cli.client import send_command, BrowserNotConnected from rich.console import Console from rich.table import Table console = Console() +_FENCE_RE = re.compile(r"```(?:[^\n`]*)\n.*?\n```", re.DOTALL) +_ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])") +_TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$") + + +class _HtmlNode: + def __init__(self, tag=None, attrs=None, text=None): + self.tag = tag + self.attrs = attrs or {} + self.text = text + self.children = [] + + +class _HtmlTreeBuilder(HTMLParser): + _VOID_TAGS = {"br", "hr", "img"} + + def __init__(self): + super().__init__(convert_charrefs=True) + self.root = _HtmlNode(tag="document") + self._stack = [self.root] + + def handle_starttag(self, tag, attrs): + node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs)) + self._stack[-1].children.append(node) + if node.tag not in self._VOID_TAGS: + self._stack.append(node) + + def handle_startendtag(self, tag, attrs): + node = _HtmlNode(tag=tag.lower(), 
attrs=dict(attrs)) + self._stack[-1].children.append(node) + + def handle_endtag(self, tag): + lowered = tag.lower() + for index in range(len(self._stack) - 1, 0, -1): + if self._stack[index].tag == lowered: + del self._stack[index:] + break + + def handle_data(self, data): + if data: + self._stack[-1].children.append(_HtmlNode(text=data)) + + +def _normalize_text(value): + return re.sub(r"\s+", " ", value or "").strip() + + +def _normalize_inline(value): + value = value.replace("\xa0", " ") + value = re.sub(r"[ \t\r\f\v]+", " ", value) + value = re.sub(r" *\n *", "\n", value) + return value.strip() + + +def _collapse_blank_lines(value): + value = re.sub(r"[ \t]+\n", "\n", value) + value = re.sub(r"\n{3,}", "\n\n", value) + return value.strip() + + +def _escape_markdown(text): + return re.sub(r"([\\`[\]])", r"\\\1", text) + + +def _escape_table_cell(text): + return text.replace("|", r"\|").replace("\n", " ").strip() + + +def _iter_descendants(node): + for child in getattr(node, "children", []): + yield child + yield from _iter_descendants(child) + + +def _has_class(node, class_name): + classes = (node.attrs.get("class") or "").split() + return class_name in classes + + +def _is_code_block_node(node): + if not node or not node.tag: + return False + if node.attrs.get("data-is-code-block-view") == "true": + return True + return node.tag == "pre" + + +def _inline_text(node): + if node.text is not None: + return _escape_markdown(node.text) + if not node.tag: + return "" + + tag = node.tag + if tag == "br": + return "\n" + if tag == "img": + src = node.attrs.get("src") or "" + alt = _normalize_text(node.attrs.get("alt") or "") + if not src: + return "" + return f"![{_escape_markdown(alt)}]({src})" if alt else f"![]({src})" + if tag == "a": + text = _normalize_inline("".join(_inline_text(child) for child in node.children)) + href = node.attrs.get("href") or "" + return f"[{text or href}]({href})" if href else text + if tag == "code": + text = 
_normalize_inline("".join(_inline_text(child) for child in node.children)) + return "`{}`".format(text.replace('`', r'\\`')) if text else ""  # str.format, not an f-string: a backslash inside an f-string replacement field is a SyntaxError before Python 3.12 + if tag in {"strong", "b"}: + text = _normalize_inline("".join(_inline_text(child) for child in node.children)) + return f"**{text}**" if text else "" + if tag in {"em", "i"}: + text = _normalize_inline("".join(_inline_text(child) for child in node.children)) + return f"*{text}*" if text else "" + + chunks = [] + for child in node.children: + rendered = _inline_text(child) + if rendered: + chunks.append(rendered) + if child.tag in {"p", "div", "table", "ul", "ol", "pre"}: + chunks.append("\n") + return "".join(chunks) + + +def _text_block(node): + return _collapse_blank_lines(_normalize_inline("".join(_inline_text(child) for child in node.children))) + + +def _inner_text_preserve(node): + if node.text is not None: + return node.text + if not node.tag: + return "" + if node.tag == "br": + return "" + return "".join(_inner_text_preserve(child) for child in node.children) + + +def _table_to_markdown(node): + rows = [] + for descendant in _iter_descendants(node): + if descendant.tag != "tr": + continue + row = [] + for cell in descendant.children: + if cell.tag in {"td", "th"}: + row.append(_escape_table_cell(_text_block(cell))) + if row: + rows.append(row) + if not rows: + return "" + + widths = max(len(row) for row in rows) + normalized_rows = [row + [""] * (widths - len(row)) for row in rows] + + headers = normalized_rows[0] + body_rows = normalized_rows[1:] + first_row_blank = all(not cell.strip() for cell in headers) + if first_row_blank and len(normalized_rows) > 1: + headers = normalized_rows[1] + body_rows = normalized_rows[2:] + + has_thead = any(child.tag == "thead" for child in node.children) + first_row = next((child for child in _iter_descendants(node) if child.tag == "tr"), None) + first_row_has_th = bool(first_row and any(child.tag == "th" for child in first_row.children)) + if not (has_thead or first_row_has_th or 
first_row_blank): + headers = [""] * widths + body_rows = normalized_rows + + separator = ["---"] * widths + lines = [ + f"| {' | '.join(headers)} |", + f"| {' | '.join(separator)} |", + ] + lines.extend(f"| {' | '.join(row)} |" for row in body_rows) + return "\n".join(lines) + + +def _list_to_markdown(node, depth=0): + ordered = node.tag == "ol" + items = [] + index = 1 + for child in node.children: + if child.tag != "li": + continue + marker = f"{index}. " if ordered else "- " + index += 1 + content = [] + nested = [] + for item_child in child.children: + if item_child.tag in {"ul", "ol"}: + nested.append(_list_to_markdown(item_child, depth + 1)) + else: + content.append(_inline_text(item_child)) + line = _collapse_blank_lines(_normalize_inline("".join(content))) + indent = " " * depth + if line: + line_parts = line.splitlines() + items.append(f"{indent}{marker}{line_parts[0]}") + continuation_indent = f"{indent}{' ' * len(marker)}" + items.extend(f"{continuation_indent}{part}" for part in line_parts[1:]) + items.extend(block for block in nested if block) + return "\n".join(items) + + +def _code_block_to_markdown(node): + if node.tag == "pre": + text = _inner_text_preserve(node).rstrip("\n") + return f"```\n{text}\n```" if text else "" + + lines = [] + for descendant in _iter_descendants(node): + if descendant.tag and _has_class(descendant, "cm-line"): + lines.append(_inner_text_preserve(descendant)) + code = "\n".join(lines).rstrip("\n") + return f"```\n{code}\n```" if code else "" + + +def _block_to_markdown(node): + if node.text is not None: + return _normalize_text(node.text) + if not node.tag: + return "" + if _is_code_block_node(node): + return _code_block_to_markdown(node) + if node.tag == "table": + return _table_to_markdown(node) + if node.tag in {"ul", "ol"}: + return _list_to_markdown(node) + if re.fullmatch(r"h[1-6]", node.tag): + text = _text_block(node) + return f"{'#' * int(node.tag[1])} {text}" if text else "" + if node.tag in {"p", "figcaption"}: 
+ return _text_block(node) + if node.tag == "blockquote": + content = _collapse_blank_lines("\n\n".join(filter(None, (_block_to_markdown(child) for child in node.children)))) + return "\n".join(f"> {line}" if line else ">" for line in content.splitlines()) if content else "" + if node.tag == "hr": + return "---" + if node.tag == "img": + return _inline_text(node) + + child_blocks = [block for block in (_block_to_markdown(child) for child in node.children) if block] + if child_blocks: + return _collapse_blank_lines("\n\n".join(child_blocks)) + return _text_block(node) + + +def _parse_table_row(line): + stripped = line.strip() + if not stripped.startswith("|") or not stripped.endswith("|"): + return None + return [cell.strip() for cell in stripped.strip("|").split("|")] + + +def _repair_table_headers(lines): + repaired = [] + index = 0 + while index < len(lines): + if ( + index + 2 < len(lines) + and _parse_table_row(lines[index]) is not None + and _TABLE_SEPARATOR_RE.match(lines[index + 1].strip()) + and _parse_table_row(lines[index + 2]) is not None + ): + first = _parse_table_row(lines[index]) + third = _parse_table_row(lines[index + 2]) + if first and all(not cell for cell in first) and any(cell for cell in third): + repaired.append(lines[index + 2].strip()) + repaired.append(lines[index + 1].strip()) + index += 3 + continue + repaired.append(lines[index].strip()) + index += 1 + return repaired + + +def _repair_list_continuations(lines): + repaired = [] + previous_was_list_item = False + previous_continuation_indent = "" + + for line in lines: + stripped = line.strip() + list_match = re.match(r"^(\s*)([-*+]|\d+\.)\s+.+$", stripped) + is_markdown_block_start = ( + not stripped + or stripped.startswith(("```", "#", ">", "|")) + or _TABLE_SEPARATOR_RE.match(stripped) + or re.match(r"^(\s*)([-*+]|\d+\.)\s+", stripped) + ) + + if previous_was_list_item and stripped and not is_markdown_block_start: + repaired.append(f"{previous_continuation_indent}{stripped}") + 
previous_was_list_item = False + continue + + repaired.append(stripped) + if list_match: + marker = list_match.group(2) + base_indent = list_match.group(1) + previous_continuation_indent = f"{base_indent}{' ' * (len(marker) + 1)}" + previous_was_list_item = True + else: + previous_was_list_item = False + + return repaired + + +def _repair_flattened_diagram(text): + if "\n" in text: + return text + if sum(text.count(char) for char in "│▼├└") < 2: + return text + + text = re.sub(r"\s{2,}([│▼])", r"\n \1", text) + text = re.sub(r"([│▼])\s{2,}", r"\1\n", text) + text = re.sub(r"([│▼])(?=[^\s\n│▼├└])", r"\1\n", text) + text = re.sub(r"(?<=[^\s\n])([├└])", r"\n\1", text) + text = re.sub(r"([^\s\n])(\()", r"\1\n\2", text) + return "\n".join(line.rstrip() for line in text.splitlines() if line.strip()) + + +def _convert_dash_lists_to_branches(lines): + converted = [] + index = 0 + while index < len(lines): + match = re.match(r"^(\s*)-\s+(.*)$", lines[index]) + if not match: + converted.append(lines[index]) + index += 1 + continue + + indent = match.group(1) + items = [] + while index < len(lines): + next_match = re.match(rf"^{re.escape(indent)}-\s+(.*)$", lines[index]) + if not next_match: + break + items.append(next_match.group(1)) + index += 1 + + for item_index, item in enumerate(items): + branch = "└" if item_index == len(items) - 1 else "├" + converted.append(f"{indent}{branch} {item}") + return converted + + +def _clean_code_block(code): + lines = [line.rstrip() for line in code.splitlines()] + while lines and not lines[0].strip(): + lines.pop(0) + while lines and not lines[-1].strip(): + lines.pop() + + flattened = _repair_flattened_diagram("\n".join(lines)) + lines = flattened.splitlines() if flattened else [] + lines = [ + f" {line.strip()}" + if line.strip() in {"│", "▼"} and not re.match(r"^\s+[│▼]\s*$", line) + else line + for line in lines + ] + lines = _convert_dash_lists_to_branches(lines) + return "\n".join(lines) + + +def _clean_markdown_output(markdown): + 
if not markdown: + return "" + + pieces = [] + last_index = 0 + for match in _FENCE_RE.finditer(markdown): + prose = markdown[last_index:match.start()] + if prose: + cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", prose) + lines = [line.strip() for line in cleaned.splitlines()] + lines = _repair_table_headers(lines) + lines = _repair_list_continuations(lines) + cleaned = "\n".join(lines) + cleaned = _collapse_blank_lines(cleaned) + if cleaned: + pieces.append(cleaned) + + fence = match.group(0) + header, _, tail = fence.partition("\n") + body, _, _ = tail.rpartition("\n") + cleaned_body = _clean_code_block(body) + pieces.append(f"{header}\n{cleaned_body}\n```" if cleaned_body else f"{header}\n```") + last_index = match.end() + + trailing = markdown[last_index:] + if trailing: + cleaned = _ESCAPED_MARKDOWN_RE.sub(r"\1", trailing) + lines = [line.strip() for line in cleaned.splitlines()] + lines = _repair_table_headers(lines) + lines = _repair_list_continuations(lines) + cleaned = "\n".join(lines) + cleaned = _collapse_blank_lines(cleaned) + if cleaned: + pieces.append(cleaned) + + return "\n\n".join(piece for piece in pieces if piece) + + +def _convert_html_to_markdown(html): + parser = _HtmlTreeBuilder() + parser.feed(html or "") + markdown = _block_to_markdown(parser.root) + return _clean_markdown_output(markdown) def _handle(command, args=None): @@ -80,4 +496,8 @@ def extract_html(): def extract_markdown(selector): """Extract the page's main content as Markdown.""" markdown = _handle("extract.markdown", {"selector": selector}) + if (markdown or "").lstrip().startswith("<"): + markdown = _convert_html_to_markdown(markdown) + else: + markdown = _clean_markdown_output(markdown or "") click.echo(markdown or "", nl=not (markdown or "").endswith("\n")) diff --git a/browser_cli/commands/session.py b/browser_cli/commands/session.py index c0239f6..b6f718d 100644 --- a/browser_cli/commands/session.py +++ b/browser_cli/commands/session.py @@ -1,14 +1,13 @@ import click -import 
json -from browser_cli.client import send_command, BrowserNotConnected +from browser_cli.client import active_browser_targets, send_command, BrowserNotConnected from rich.console import Console console = Console() -def _handle(command, args=None): +def _handle(command, args=None, profile=None): try: - return send_command(command, args or {}) + return send_command(command, args or {}, profile=profile) except BrowserNotConnected as e: console.print(f"[red]Error:[/red] {e}") raise SystemExit(1) @@ -17,6 +16,23 @@ def _handle(command, args=None): raise SystemExit(1) +def _handle_multi(command, args=None, profile=None): + try: + return send_command(command, args or {}, profile=profile) + except (BrowserNotConnected, RuntimeError): + return None + + +def _multi_browser_targets(): + root = click.get_current_context().find_root() + if root.obj.get("browser_explicit"): + return [] + targets = active_browser_targets() + if len(targets) <= 1: + return [] + return targets + + @click.group("session") def session_group(): """Save and restore browser sessions.""" @@ -71,18 +87,38 @@ def session_diff(name_a, name_b): def session_list(): """List all saved sessions.""" from rich.table import Table - sessions = _handle("session.list") + targets = _multi_browser_targets() + show_browser = bool(targets) + if targets: + sessions = [] + any_reachable = False + for target in targets: + result = _handle_multi("session.list", profile=target.profile) + if result is None: + continue + any_reachable = True + sessions.extend({**session, "browser": target.display_name} for session in result) + # An empty list is a valid answer ("no saved sessions"); only fail when no browser responded at all. + if not any_reachable: + console.print("[red]Error:[/red] Cannot resolve a browser socket automatically.") + raise SystemExit(1) + else: + sessions = _handle("session.list") if not sessions: console.print("[yellow]No saved sessions[/yellow]") return table = Table(show_header=True, header_style="bold cyan") + if show_browser: + table.add_column("Browser") table.add_column("Name") table.add_column("Tabs", width=6) table.add_column("Saved at") for s in 
sessions: from datetime import datetime saved = datetime.fromtimestamp(s["savedAt"] / 1000).strftime("%Y-%m-%d %H:%M") if s.get("savedAt") else "" - table.add_row(s["name"], str(s["tabs"]), saved) + row = [s.get("browser", "")] if show_browser else [] + row.extend([s["name"], str(s["tabs"]), saved]) + table.add_row(*row) console.print(table) diff --git a/extension/background.js b/extension/background.js index 3caa789..1f440d2 100644 --- a/extension/background.js +++ b/extension/background.js @@ -659,6 +659,32 @@ function contentDispatch(funcName, args) { "li", "main", "nav", "ol", "p", "pre", "section", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "ul" ]); + const NOISE_SELECTOR = [ + "script", + "style", + "noscript", + "template", + "svg", + "canvas", + "iframe", + "dialog", + "button", + "input", + "textarea", + "select", + "option", + "form", + "[hidden]", + "[aria-hidden='true']", + ".sr-only", + "[class*='sr-only']", + "[class*='file-tile']", + "form[data-type='unified-composer']", + ".composer-btn", + "[data-composer-surface='true']", + "#thread-bottom-container", + "[data-testid*='action-button']", + ].join(", "); function normalizeText(value) { return value.replace(/\s+/g, " ").trim(); @@ -681,7 +707,7 @@ function contentDispatch(funcName, args) { } function escapeMarkdown(text) { - return text.replace(/([\\`*_{}\[\]()#+\-!|>])/g, "\\$1"); + return text.replace(/([\\`[\]])/g, "\\$1"); } function escapeTableCell(text) { @@ -692,12 +718,55 @@ function contentDispatch(funcName, args) { return attr || fallback || ""; } + function isNoiseElement(node) { + if (!node || node.nodeType !== Node.ELEMENT_NODE) return false; + const tag = node.tagName.toLowerCase(); + if (["script", "style", "noscript", "template", "svg", "canvas", "iframe", "dialog"].includes(tag)) return true; + if (["button", "input", "textarea", "select", "option", "form"].includes(tag)) return true; + if (node.hasAttribute("hidden")) return true; + if ((node.getAttribute("aria-hidden") 
|| "").toLowerCase() === "true") return true; + if (node.matches(".sr-only, [class*='sr-only']")) return true; + if (node.matches("[class*='file-tile'], form[data-type='unified-composer'], .composer-btn, [data-composer-surface='true'], #thread-bottom-container")) return true; + if (node.matches("[data-testid*='action-button']")) return true; + return false; + } + function stripNoise(root) { const clone = root.cloneNode(true); - clone.querySelectorAll("script, style, noscript, template").forEach(node => node.remove()); + clone.querySelectorAll(NOISE_SELECTOR).forEach(node => node.remove()); return clone; } + function candidateScore(node) { + const text = normalizeText(node.innerText || ""); + if (!text) return -Infinity; + + const headings = node.querySelectorAll("h1, h2, h3, h4, h5, h6").length; + const paragraphs = node.querySelectorAll("p").length; + const listItems = node.querySelectorAll("li").length; + const tables = node.querySelectorAll("table").length; + const codeBlocks = node.querySelectorAll("pre, code").length; + const images = node.querySelectorAll("img, figure").length; + const mainLike = node.matches("main, article, [role='main']") ? 1 : 0; + const proseBlocks = node.matches(".markdown, .prose, [data-message-author-role='assistant']") ? 
1 : 0; + const buttons = node.querySelectorAll("button, input, textarea, select").length; + const forms = node.querySelectorAll("form").length; + const svgs = node.querySelectorAll("svg, canvas").length; + + return text.length + + (mainLike * 4000) + + (proseBlocks * 5000) + + (headings * 250) + + (paragraphs * 60) + + (listItems * 35) + + (tables * 80) + + (codeBlocks * 60) + + (images * 25) + - (buttons * 120) + - (forms * 200) + - (svgs * 40); + } + function pickRoot() { if (selector) { const matched = document.querySelector(selector); @@ -705,10 +774,12 @@ function contentDispatch(funcName, args) { return matched; } - const candidates = Array.from(document.querySelectorAll("main, article, [role='main']")) + const candidates = Array.from(document.querySelectorAll( + "main, article, [role='main'], section, .markdown, .prose, [data-message-author-role]" + )) .filter(node => normalizeText(node.innerText || "").length > 0); if (!candidates.length) return document.body; - candidates.sort((a, b) => (b.innerText || "").length - (a.innerText || "").length); + candidates.sort((a, b) => candidateScore(b) - candidateScore(a)); return candidates[0]; } @@ -717,9 +788,9 @@ function contentDispatch(funcName, args) { return escapeMarkdown(node.textContent || ""); } if (node.nodeType !== Node.ELEMENT_NODE) return ""; + if (isNoiseElement(node)) return ""; const tag = node.tagName.toLowerCase(); - if (tag === "script" || tag === "style" || tag === "noscript" || tag === "template") return ""; if (tag === "br") return "\n"; if (tag === "img") { const src = absoluteUrl(node.getAttribute("src"), node.src); @@ -762,6 +833,92 @@ function contentDispatch(funcName, args) { return collapseBlankLines(normalizeInline(Array.from(node.childNodes).map(inlineText).join(""))); } + function preserveNodeText(node) { + if (node.nodeType === Node.TEXT_NODE) { + return node.textContent || ""; + } + if (node.nodeType !== Node.ELEMENT_NODE) return ""; + + const tag = node.tagName.toLowerCase(); + if 
(tag === "br") return "\n"; + + const parts = []; + for (const child of node.childNodes) { + const rendered = preserveNodeText(child); + if (!rendered) continue; + parts.push(rendered); + } + + if (["div", "p", "li"].includes(tag)) { + return `${parts.join("")}\n`; + } + return parts.join(""); + } + + function repairFlattenedDiagram(text) { + if (text.includes("\n")) return text; + const markerCount = (text.match(/[│▼├└]/g) || []).length; + if (markerCount < 2) return text; + + let repaired = text; + repaired = repaired.replace(/\s{2,}([│▼])/g, "\n $1"); + repaired = repaired.replace(/([│▼])\s{2,}/g, "$1\n"); + repaired = repaired.replace(/([│▼])(?=[^\s\n│▼├└])/g, "$1\n"); + repaired = repaired.replace(/(?<=[^\s\n])([├└])/g, "\n$1"); + repaired = repaired.replace(/([^\s\n])(\()/g, "$1\n$2"); + return repaired + .split("\n") + .map(line => line.replace(/\s+$/, "")) + .filter(line => line.trim()) + .join("\n"); + } + + function convertDashListsToBranches(lines) { + const converted = []; + let index = 0; + while (index < lines.length) { + const match = lines[index].match(/^(\s*)-\s+(.*)$/); + if (!match) { + converted.push(lines[index]); + index += 1; + continue; + } + + const indent = match[1]; + const items = []; + while (index < lines.length) { + const nextMatch = lines[index].match(new RegExp(`^${indent.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}-\\s+(.*)$`)); + if (!nextMatch) break; + items.push(nextMatch[1]); + index += 1; + } + + items.forEach((item, itemIndex) => { + const branch = itemIndex === items.length - 1 ? "└" : "├"; + converted.push(`${indent}${branch} ${item}`); + }); + } + return converted; + } + + function normalizeCodeBlock(text) { + let lines = text.replace(/\r\n?/g, "\n").split("\n").map(line => line.replace(/\s+$/, "")); + while (lines.length && !lines[0].trim()) lines.shift(); + while (lines.length && !lines[lines.length - 1].trim()) lines.pop(); + + const flattened = repairFlattenedDiagram(lines.join("\n")); + lines = flattened ? 
flattened.split("\n") : []; + lines = lines.map(line => { + const trimmed = line.trim(); + if ((trimmed === "│" || trimmed === "▼") && !/^\s+[│▼]\s*$/.test(line)) { + return ` ${trimmed}`; + } + return line; + }); + lines = convertDashListsToBranches(lines); + return lines.join("\n"); + } + function tableToMarkdown(table) { const rows = Array.from(table.querySelectorAll("tr")) .map(row => Array.from(row.children) @@ -780,10 +937,16 @@ function contentDispatch(funcName, args) { let headers = normalizedRows[0]; let bodyRows = normalizedRows.slice(1); + const firstRowIsBlank = headers.every(cell => !cell.trim()); + if (firstRowIsBlank && normalizedRows.length > 1) { + headers = normalizedRows[1]; + bodyRows = normalizedRows.slice(2); + } + const firstRow = table.querySelector("tr"); const thead = table.querySelector("thead"); const firstRowHasTh = firstRow && Array.from(firstRow.children).some(cell => cell.tagName === "TH"); - if (!(thead || firstRowHasTh)) { + if (!(thead || firstRowHasTh || firstRowIsBlank)) { headers = new Array(widths).fill(""); bodyRows = normalizedRows; } @@ -818,7 +981,12 @@ function contentDispatch(funcName, args) { } const line = collapseBlankLines(normalizeInline(content.join(""))); - if (line) items.push(`${indent}${marker}${line}`); + if (line) { + const lineParts = line.split("\n"); + items.push(`${indent}${marker}${lineParts[0]}`); + const continuationIndent = `${indent}${" ".repeat(marker.length)}`; + lineParts.slice(1).forEach(part => items.push(`${continuationIndent}${part}`)); + } nested.filter(Boolean).forEach(block => items.push(block)); }); return items.join("\n"); @@ -829,13 +997,21 @@ function contentDispatch(funcName, args) { return normalizeText(node.textContent || ""); } if (node.nodeType !== Node.ELEMENT_NODE) return ""; + if (isNoiseElement(node)) return ""; const tag = node.tagName.toLowerCase(); - if (tag === "script" || tag === "style" || tag === "noscript" || tag === "template") return ""; if (tag === "table") return 
tableToMarkdown(node); if (tag === "ul" || tag === "ol") return listToMarkdown(node); + if (node.matches(".cm-editor[data-is-code-block-view='true']")) { + const lines = Array.from(node.querySelectorAll(".cm-line")).map(line => { + const text = preserveNodeText(line); + return text === "\n" ? "" : text.replace(/\n$/, ""); + }); + const code = normalizeCodeBlock(lines.join("\n")); + return code ? `\`\`\`\n${code}\n\`\`\`` : ""; + } if (tag === "pre") { - const code = node.innerText.replace(/\n$/, ""); + const code = normalizeCodeBlock(preserveNodeText(node)); return code ? `\`\`\`\n${code}\n\`\`\`` : ""; } if (tag === "blockquote") { @@ -1011,9 +1187,21 @@ async function clientsRenameProfile({ alias }) { // ── Helpers ─────────────────────────────────────────────────────────────────── async function getActiveTab() { - const [tab] = await chrome.tabs.query({ active: true, lastFocusedWindow: true }); - if (!tab) throw new Error("No active tab found"); - return tab; + const activeTabs = await chrome.tabs.query({ active: true }); + if (!activeTabs.length) throw new Error("No active tab found"); + + const windows = await chrome.windows.getAll({ populate: false }); + const focusedWindowIds = new Set(windows.filter(window => window.focused).map(window => window.id)); + + const chooseTab = (predicate) => activeTabs.find(predicate); + const byFocusAndScriptable = tab => focusedWindowIds.has(tab.windowId) && isScriptableUrl(tab.url || tab.pendingUrl || ""); + const byScriptable = tab => isScriptableUrl(tab.url || tab.pendingUrl || ""); + const byFocus = tab => focusedWindowIds.has(tab.windowId); + + return chooseTab(byFocusAndScriptable) + || chooseTab(byScriptable) + || chooseTab(byFocus) + || activeTabs[0]; } async function resolveGroupId(nameOrId) { diff --git a/tests/test_api.py b/tests/test_api.py index 3547437..202b34f 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -488,6 +488,39 @@ class TestWindows: mock_send.assert_called_once_with("windows.open", {"url": 
"https://example.com"}, profile=None) +class TestSession: + def test_session_list(self, b, mock_send): + mock_send.return_value = [{"name": "saved", "tabs": 3, "savedAt": 1712707200000}] + + result = b.session_list() + + assert result == [{"name": "saved", "tabs": 3, "savedAt": 1712707200000}] + mock_send.assert_called_once_with("session.list", {}, profile=None) + + def test_session_list_multi_browser_adds_browser(self, b, mock_send): + with patch( + "browser_cli.active_browser_targets", + return_value=[ + BrowserTarget("default", "uuid-1", "/tmp/uuid-1.sock"), + BrowserTarget("work", "work", "/tmp/work.sock"), + ], + ): + mock_send.side_effect = [ + [{"name": "first", "tabs": 2, "savedAt": 1712707200000}], + [{"name": "second", "tabs": 5, "savedAt": 1712707300000}], + ] + result = b.session_list() + + assert result == [ + {"name": "first", "tabs": 2, "savedAt": 1712707200000, "browser": "uuid-1"}, + {"name": "second", "tabs": 5, "savedAt": 1712707300000, "browser": "work"}, + ] + assert mock_send.call_args_list == [ + call("session.list", {}, profile="default"), + call("session.list", {}, profile="work"), + ] + + # ── Tab model ───────────────────────────────────────────────────────────────── class TestTabModel: diff --git a/tests/test_cli.py b/tests/test_cli.py index f6c0f0e..09d4ee7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,6 +5,7 @@ from unittest.mock import patch from browser_cli.cli import main, _project_version from browser_cli.client import BrowserTarget +from browser_cli.commands.extract import _clean_markdown_output, _convert_html_to_markdown def _expected_version() -> str: pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml" @@ -204,6 +205,46 @@ def test_windows_list_multi_browser_shows_browser_column(): assert "work" in result.output +def test_session_list_multi_browser_shows_browser_column(): + def fake_send_command(command, args=None, profile=None): + assert command == "session.list" + return [{"name": 
f"{profile}-session", "tabs": 2, "savedAt": 1712707200000}] + + with patch( + "browser_cli.commands.session.active_browser_targets", + return_value=[ + BrowserTarget("default", "uuid-1", "/tmp/default.sock"), + BrowserTarget("work", "work", "/tmp/work.sock"), + ], + ), patch("browser_cli.commands.session.send_command", side_effect=fake_send_command): + result = CliRunner().invoke(main, ["session", "list"]) + + assert result.exit_code == 0 + assert "Browser" in result.output + assert "uuid-1" in result.output + assert "work" in result.output + assert "default-session" in result.output + assert "work-session" in result.output + + +def test_session_list_with_explicit_browser_does_not_show_browser_column(): + with patch( + "browser_cli.commands.session.active_browser_targets", + return_value=[ + BrowserTarget("default", "uuid-1", "/tmp/default.sock"), + BrowserTarget("work", "work", "/tmp/work.sock"), + ], + ), patch( + "browser_cli.commands.session.send_command", + return_value=[{"name": "work-session", "tabs": 2, "savedAt": 1712707200000}], + ) as send_command: + result = CliRunner().invoke(main, ["--browser", "work", "session", "list"]) + + assert result.exit_code == 0 + assert "Browser" not in result.output + send_command.assert_called_once_with("session.list", {}, profile=None) + + def test_windows_open_passes_url(): with patch("browser_cli.commands.windows.send_command", return_value={"id": 7}) as send_command: result = CliRunner().invoke(main, ["windows", "open", "https://example.com"]) @@ -213,7 +254,7 @@ def test_windows_open_passes_url(): send_command.assert_called_once_with("windows.open", {"url": "https://example.com"}, profile=None) def test_extract_markdown_command(): - with patch("browser_cli.commands.extract.send_command", return_value="# Title\n") as send_command: + with patch("browser_cli.commands.extract.send_command", return_value="# Title") as send_command: result = CliRunner().invoke(main, ["extract", "markdown"]) assert result.exit_code == 0 @@ 
-221,9 +262,160 @@ def test_extract_markdown_command(): send_command.assert_called_once_with("extract.markdown", {"selector": None}) def test_extract_markdown_command_with_selector(): - with patch("browser_cli.commands.extract.send_command", return_value="## Post\n") as send_command: + with patch("browser_cli.commands.extract.send_command", return_value="## Post") as send_command: result = CliRunner().invoke(main, ["extract", "markdown", "--selector", "article"]) assert result.exit_code == 0 assert result.output == "## Post\n" send_command.assert_called_once_with("extract.markdown", {"selector": "article"}) + + +def test_clean_markdown_output_removes_escaped_underscores_and_dashes(): + assert _clean_markdown_output(r"hello\_world \- item") == "hello_world - item" + + +def test_clean_markdown_output_trims_useless_whitespace(): + raw = " # Title \n\n\n paragraph with space \n next line\t \n" + assert _clean_markdown_output(raw) == "# Title\n\nparagraph with space\nnext line" + + +def test_clean_markdown_output_repairs_empty_table_header_rows(): + raw = ( + "| | | |\n" + "| --- | --- | --- |\n" + "| Bereich | Plan | Ist |\n" + "| A | B | C |\n" + ) + assert _clean_markdown_output(raw) == ( + "| Bereich | Plan | Ist |\n" + "| --- | --- | --- |\n" + "| A | B | C |" + ) + + +def test_clean_markdown_output_preserves_graph_code_blocks(): + raw = "```\n\nA\n │\n ▼\nB\n\n```" + assert _clean_markdown_output(raw) == "```\nA\n │\n ▼\nB\n```" + + +def test_clean_markdown_output_renders_code_block_list_branches(): + raw = "```\nPlattformen\n- Omnifact\n- Open WebUI + Ollama\n- Le Chat\n```" + assert _clean_markdown_output(raw) == ( + "```\n" + "Plattformen\n" + "├ Omnifact\n" + "├ Open WebUI + Ollama\n" + "└ Le Chat\n" + "```" + ) + + +def test_clean_markdown_output_unflattens_graph_code_blocks(): + raw = ( + "```\n" + "Golden Set │ ▼Promptfoo(Testausführung) │ ▼UpTrain(Qualitätsbewertung) │ " + "▼Langfuse(Logging / Observability) │ ▼Plattformen├ Omnifact├ Open WebUI + Ollama└ 
Le Chat\n" + "```" + ) + assert _clean_markdown_output(raw) == ( + "```\n" + "Golden Set\n" + " │\n" + " ▼\n" + "Promptfoo\n" + "(Testausführung)\n" + " │\n" + " ▼\n" + "UpTrain\n" + "(Qualitätsbewertung)\n" + " │\n" + " ▼\n" + "Langfuse\n" + "(Logging / Observability)\n" + " │\n" + " ▼\n" + "Plattformen\n" + "├ Omnifact\n" + "├ Open WebUI + Ollama\n" + "└ Le Chat\n" + "```" + ) + + +def test_extract_markdown_command_repairs_malformed_tables_and_code_blocks(): + raw = ( + "| | | |\n" + "| --- | --- | --- |\n" + "| Bereich | Plan | Ist |\n" + "| Eval-Stack | Testumgebung | funktionsfähig |\n\n" + "```\n" + "Golden Set │ ▼Promptfoo(Testausführung) │ ▼Plattformen├ Omnifact└ Le Chat\n" + "```" + ) + with patch("browser_cli.commands.extract.send_command", return_value=raw): + result = CliRunner().invoke(main, ["extract", "markdown"]) + + assert result.exit_code == 0 + assert "| Bereich | Plan | Ist |" in result.output + assert "| | | |" not in result.output + assert "Golden Set\n │\n ▼\nPromptfoo\n(Testausführung)" in result.output + assert "├ Omnifact" in result.output + assert "└ Le Chat" in result.output + + +def test_convert_html_to_markdown_normalizes_blank_table_header_rows(): + html = """ +
+ + + + +
RisikoBeschreibungAuswirkungGegenmaßnahme
DatenschutzXYZ
+
+ """ + markdown = _convert_html_to_markdown(html) + assert "| Risiko | Beschreibung | Auswirkung | Gegenmaßnahme |" in markdown + assert "| | | | |" not in markdown + + +def test_convert_html_to_markdown_preserves_codemirror_graph_blocks(): + html = """ +
+

Teil 5 - Eval-Stack Architektur

+
+
Golden Set
+
+
+
Promptfoo
+
(Testausführung)
+
+
+
Plattformen
+
- Omnifact
+
- Open WebUI + Ollama
+
- Le Chat
+
+
+ """ + markdown = _convert_html_to_markdown(html) + assert "```\nGolden Set\n │\n ▼\nPromptfoo" in markdown + assert "├ Omnifact" in markdown + assert "└ Le Chat" in markdown + + +def test_convert_html_to_markdown_indents_multiline_list_items(): + html = """ +
+

2. Zielarchitektur

+
    +
  • +

    Unternehmensdaten → RAG → KI-Orchestrierung →
    Local LLMs / API Modelle / Spezialmodelle

    +
  • +
+
+ """ + markdown = _convert_html_to_markdown(html) + assert ( + "- Unternehmensdaten → RAG → KI-Orchestrierung →\n" + " Local LLMs / API Modelle / Spezialmodelle" + ) in markdown