diff --git a/README.md b/README.md
index 1155a81..8ee7a71 100644
--- a/README.md
+++ b/README.md
@@ -249,6 +249,8 @@ browser-cli extract images # all tags (src + alt)
 browser-cli extract text # all visible text (innerText)
 browser-cli extract json "#data" # parse JSON inside a CSS selector
 browser-cli extract html # full HTML of the active tab
+browser-cli extract markdown # main page content as Markdown
+browser-cli extract markdown --selector "article" # specific DOM subtree as Markdown
 ```
 
 ### Sessions
diff --git a/browser_cli/__init__.py b/browser_cli/__init__.py
index af6b754..d085fee 100644
--- a/browser_cli/__init__.py
+++ b/browser_cli/__init__.py
@@ -247,6 +247,9 @@ class BrowserCLI:
     def extract_json(self, selector: str):
         return self._cmd("extract.json", {"selector": selector})
 
+    def extract_markdown(self, selector: str | None = None) -> str:
+        return self._cmd("extract.markdown", {"selector": selector}) or ""
+
     # ── Session ───────────────────────────────────────────────────────────
 
     def session_save(self, name: str) -> None:
diff --git a/browser_cli/commands/extract.py b/browser_cli/commands/extract.py
index 3c2c89f..34c12e8 100644
--- a/browser_cli/commands/extract.py
+++ b/browser_cli/commands/extract.py
@@ -73,3 +73,11 @@ def extract_html():
     """Print the full HTML of the active tab to stdout."""
     html = _handle("extract.html")
     click.echo(html or "")
+
+
+@extract_group.command("markdown")
+@click.option("--selector", help="Extract only the DOM subtree matching this CSS selector.")
+def extract_markdown(selector):
+    """Extract the page's main content as Markdown."""
+    markdown = _handle("extract.markdown", {"selector": selector})
+    click.echo(markdown or "", nl=not (markdown or "").endswith("\n"))
diff --git a/extension/background.js b/extension/background.js
index fd03615..da75f44 100644
--- a/extension/background.js
+++ b/extension/background.js
@@ -123,6 +123,7 @@ async function dispatch(command, args) {
     case "extract.images": return domOp("extractImages", args);
     case "extract.text": return domOp("extractText", args);
     case "extract.json": return domOp("extractJson", args);
+    case "extract.markdown": return domOp("extractMarkdown", args);
     case "extract.html": return tabsHtml({});
 
     // ── Session ───────────────────────────────────────────────────────────
@@ -605,9 +606,288 @@ function contentDispatch(funcName, args) {
     if (!el) throw new Error(`No element: ${selector}`);
     return JSON.parse(el.textContent);
   }
+  /**
+   * Convert the page's main content, or the subtree matched by `selector`,
+   * to Markdown. Runs inside the page, so it relies on `document` and `Node`.
+   *
+   * @param {{selector?: string|null}} args Optional CSS selector for the root.
+   * @returns {string} Markdown with surrounding whitespace trimmed.
+   * @throws {Error} "No element: <selector>" when the selector matches nothing.
+   */
+  function extractMarkdown({ selector }) {
+    // Tags rendered as their own block; every other element is inline content.
+    const BLOCKS = new Set([
+      "address", "article", "aside", "blockquote", "body", "dd", "details", "div",
+      "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2",
+      "h3", "h4", "h5", "h6", "header", "hr", "legend", "li", "main", "nav", "ol",
+      "p", "pre", "section", "summary", "table", "tbody", "td", "tfoot", "th",
+      "thead", "tr", "ul"
+    ]);
+    const BLOCK_SELECTOR = Array.from(BLOCKS).join(",");
+
+    function normalizeText(value) {
+      return value.replace(/\s+/g, " ").trim();
+    }
+
+    function normalizeInline(value) {
+      return value
+        .replace(/[ \t]+\n/g, "\n")
+        .replace(/\n[ \t]+/g, "\n")
+        .replace(/\n{3,}/g, "\n\n")
+        .replace(/[ \t]{2,}/g, " ")
+        .trim();
+    }
+
+    function collapseBlankLines(value) {
+      return value
+        .replace(/[ \t]+\n/g, "\n")
+        .replace(/\n{3,}/g, "\n\n")
+        .trim();
+    }
+
+    function escapeMarkdown(text) {
+      return text.replace(/([\\`*_{}\[\]()#+\-!|>])/g, "\\$1");
+    }
+
+    function escapeTableCell(text) {
+      // escapeMarkdown has already escaped "|" in plain text; escape only pipes
+      // that are still bare (e.g. in code spans) so none is escaped twice.
+      return text.replace(/(?<!\\)\|/g, "\\|").replace(/\n+/g, " ").trim();
+    }
+
+    function absoluteUrl(attr, fallback) {
+      // Prefer the resolved DOM property so relative URLs come out absolute.
+      return fallback || attr || "";
+    }
+
+    function codeFence(text, minimum) {
+      // A backtick fence must be longer than any backtick run it encloses.
+      const runs = text.match(/`+/g) || [];
+      const longest = runs.reduce((max, run) => Math.max(max, run.length), 0);
+      return "`".repeat(Math.max(minimum, longest + 1));
+    }
+
+    function stripNoise(root) {
+      const clone = root.cloneNode(true);
+      clone.querySelectorAll("script, style, noscript, template").forEach(node => node.remove());
+      return clone;
+    }
+
+    function pickRoot() {
+      if (selector) {
+        const matched = document.querySelector(selector);
+        if (!matched) throw new Error(`No element: ${selector}`);
+        return matched;
+      }
+
+      // No selector: use the landmark element holding the most visible text.
+      const candidates = Array.from(document.querySelectorAll("main, article, [role='main']"))
+        .filter(node => normalizeText(node.innerText || "").length > 0);
+      if (!candidates.length) return document.body;
+      candidates.sort((a, b) => (b.innerText || "").length - (a.innerText || "").length);
+      return candidates[0];
+    }
+
+    function inlineText(node) {
+      if (node.nodeType === Node.TEXT_NODE) {
+        // Whitespace in text nodes is source formatting; only <br> breaks a line.
+        return escapeMarkdown((node.textContent || "").replace(/\s+/g, " "));
+      }
+      if (node.nodeType !== Node.ELEMENT_NODE) return "";
+
+      const tag = node.tagName.toLowerCase();
+      if (tag === "script" || tag === "style" || tag === "noscript" || tag === "template") return "";
+      if (tag === "br") return "\n";
+      if (tag === "img") {
+        const src = absoluteUrl(node.getAttribute("src"), node.src);
+        if (!src) return "";
+        const alt = normalizeText(node.getAttribute("alt") || "");
+        return alt ? `![${escapeMarkdown(alt)}](${src})` : `![](${src})`;
+      }
+      if (tag === "a") {
+        const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
+        const href = absoluteUrl(node.getAttribute("href"), node.href);
+        if (!href) return text;
+        return `[${text || href}](${href})`;
+      }
+      if (tag === "code") {
+        // Backslash escapes are literal inside code spans, so emit the raw text.
+        const text = normalizeText(node.textContent || "");
+        if (!text) return "";
+        const fence = codeFence(text, 1);
+        const pad = text.startsWith("`") || text.endsWith("`") ? " " : "";
+        return `${fence}${pad}${text}${pad}${fence}`;
+      }
+      if (tag === "strong" || tag === "b") {
+        const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
+        return text ? `**${text}**` : "";
+      }
+      if (tag === "em" || tag === "i") {
+        const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
+        return text ? `*${text}*` : "";
+      }
+
+      const chunks = [];
+      for (const child of node.childNodes) {
+        const rendered = inlineText(child);
+        if (!rendered) continue;
+        chunks.push(rendered);
+        if (child.nodeType === Node.ELEMENT_NODE && BLOCKS.has(child.tagName.toLowerCase())) {
+          chunks.push("\n");
+        }
+      }
+      return chunks.join("");
+    }
+
+    function textBlock(node) {
+      return collapseBlankLines(normalizeInline(Array.from(node.childNodes).map(inlineText).join("")));
+    }
+
+    function tableToMarkdown(table) {
+      // table.rows lists only this table's own rows, so rows of nested tables
+      // are not repeated as rows of the outer table.
+      const ownRows = Array.from(table.rows || table.querySelectorAll("tr"));
+      const rows = ownRows
+        .map(row => Array.from(row.children)
+          .filter(cell => cell.tagName === "TD" || cell.tagName === "TH")
+          .map(cell => escapeTableCell(textBlock(cell)))
+        )
+        .filter(cells => cells.length > 0);
+      if (!rows.length) return "";
+
+      const widths = rows.reduce((max, row) => Math.max(max, row.length), 0);
+      const normalizedRows = rows.map(row => {
+        const next = row.slice();
+        while (next.length < widths) next.push("");
+        return next;
+      });
+
+      let headers = normalizedRows[0];
+      let bodyRows = normalizedRows.slice(1);
+      const firstRow = ownRows[0];
+      const firstRowHasTh = firstRow && Array.from(firstRow.children).some(cell => cell.tagName === "TH");
+      if (!(table.tHead || firstRowHasTh)) {
+        headers = new Array(widths).fill("");
+        bodyRows = normalizedRows;
+      }
+
+      const separator = new Array(widths).fill("---");
+      const lines = [
+        `| ${headers.join(" | ")} |`,
+        `| ${separator.join(" | ")} |`,
+      ];
+      for (const row of bodyRows) {
+        lines.push(`| ${row.join(" | ")} |`);
+      }
+      return lines.join("\n");
+    }
+
+    function listToMarkdown(list, depth = 0) {
+      const ordered = list.tagName.toLowerCase() === "ol";
+      const items = [];
+      const children = Array.from(list.children).filter(child => child.tagName === "LI");
+      children.forEach((item, index) => {
+        const marker = ordered ? `${index + 1}. ` : "- ";
+        // Four spaces per level nests under both "- " and "1. " parent markers.
+        const indent = "    ".repeat(depth);
+        const nested = [];
+        const content = [];
+
+        for (const child of item.childNodes) {
+          if (child.nodeType === Node.ELEMENT_NODE && (child.tagName === "UL" || child.tagName === "OL")) {
+            nested.push(listToMarkdown(child, depth + 1));
+          } else {
+            content.push(inlineText(child));
+          }
+        }
+
+        const line = collapseBlankLines(normalizeInline(content.join("")));
+        if (line) items.push(`${indent}${marker}${line}`);
+        nested.filter(Boolean).forEach(block => items.push(block));
+      });
+      return items.join("\n");
+    }
+
+    function isBlockLike(node) {
+      if (node.nodeType !== Node.ELEMENT_NODE) return false;
+      // Inline wrappers (span, a, custom elements) that contain blocks are
+      // walked as containers so the nested block structure is kept.
+      return BLOCKS.has(node.tagName.toLowerCase()) || node.querySelector(BLOCK_SELECTOR) !== null;
+    }
+
+    function childrenToMarkdown(node) {
+      // Consecutive inline children form one paragraph; rendering each child as
+      // its own block would split sentences and drop links and emphasis.
+      const parts = [];
+      let run = [];
+      const flush = () => {
+        const text = collapseBlankLines(normalizeInline(run.join("")));
+        if (text) parts.push(text);
+        run = [];
+      };
+      for (const child of node.childNodes) {
+        if (isBlockLike(child)) {
+          flush();
+          const rendered = blockToMarkdown(child);
+          if (rendered) parts.push(rendered);
+        } else {
+          run.push(inlineText(child));
+        }
+      }
+      flush();
+      return collapseBlankLines(parts.join("\n\n"));
+    }
+
+    function blockToMarkdown(node) {
+      if (node.nodeType === Node.TEXT_NODE) {
+        return normalizeInline(inlineText(node));
+      }
+      if (node.nodeType !== Node.ELEMENT_NODE) return "";
+
+      const tag = node.tagName.toLowerCase();
+      if (tag === "script" || tag === "style" || tag === "noscript" || tag === "template") return "";
+      if (tag === "table") return tableToMarkdown(node);
+      if (tag === "ul" || tag === "ol") return listToMarkdown(node);
+      if (tag === "pre") {
+        const code = node.innerText.replace(/\n$/, "");
+        if (!code) return "";
+        const fence = codeFence(code, 3);
+        return `${fence}\n${code}\n${fence}`;
+      }
+      if (tag === "blockquote") {
+        const content = childrenToMarkdown(node);
+        if (!content) return "";
+        return content
+          .split("\n")
+          .map(line => line ? `> ${line}` : ">")
+          .join("\n");
+      }
+      if (/^h[1-6]$/.test(tag)) {
+        const level = Number(tag.slice(1));
+        // An ATX heading is a single line, so fold <br> breaks into spaces.
+        const text = textBlock(node).replace(/\n+/g, " ");
+        return text ? `${"#".repeat(level)} ${text}` : "";
+      }
+      if (tag === "p" || tag === "figcaption") {
+        return textBlock(node);
+      }
+      if (tag === "hr") {
+        return "---";
+      }
+      if (tag === "img") {
+        return inlineText(node);
+      }
+
+      return childrenToMarkdown(node);
+    }
+
+    const root = stripNoise(pickRoot());
+    const markdown = blockToMarkdown(root);
+    return collapseBlankLines(markdown);
+  }
   const fns = {
     domQuery, domClick, domType, domAttr, domText, domExists,
-    extractLinks, extractImages, extractText, extractJson };
+    extractLinks, extractImages, extractText, extractJson, extractMarkdown };
   const fn = fns[funcName];
   if (!fn) throw new Error(`Unknown content function: ${funcName}`);
   return fn(args);
diff --git a/pyproject.toml b/pyproject.toml
index 0fa9f9c..6724caf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "browser-cli"
-version = "0.4.1"
+version = "0.5.0"
 description = "Control your real running browser from the terminal via a Chrome extension"
 requires-python = ">=3.10"
 dependencies = [
diff --git a/tests/test_api.py b/tests/test_api.py
index 2199592..9cdc336 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -182,6 +182,21 @@ class TestSearch:
         assert mock_send.call_args[0][1]["background"] is True
 
 
+class TestExtract:
+    def test_extract_markdown_default(self, b, mock_send):
+        mock_send.return_value = "# Title"
+
+        result = b.extract_markdown()
+
+        assert result == "# Title"
+        mock_send.assert_called_once_with("extract.markdown", {"selector": None}, profile=None)
+
+    def test_extract_markdown_selector(self, b, mock_send):
+        b.extract_markdown("article")
+
+        mock_send.assert_called_once_with("extract.markdown", {"selector": "article"}, profile=None)
+
+
 # ── Tabs ──────────────────────────────────────────────────────────────────────
 
 class TestTabs:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 74c771f..0485378 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -54,3 +54,19 @@ def test_clients_exits_cleanly_when_registry_is_missing():
 
     assert result.exit_code == 1
     assert "No browser clients found" in result.output
+
+def test_extract_markdown_command():
+    with patch("browser_cli.commands.extract.send_command", return_value="# Title\n") as send_command:
+        result = CliRunner().invoke(main, ["extract", "markdown"])
+
+    assert result.exit_code == 0
+    assert result.output == "# Title\n"
+    send_command.assert_called_once_with("extract.markdown", {"selector": None})
+
+def test_extract_markdown_command_with_selector():
+    with patch("browser_cli.commands.extract.send_command", return_value="## Post\n") as send_command:
+        result = CliRunner().invoke(main, ["extract", "markdown", "--selector", "article"])
+
+    assert result.exit_code == 0
+    assert result.output == "## Post\n"
+    send_command.assert_called_once_with("extract.markdown", {"selector": "article"})
diff --git a/tests/test_extract.py b/tests/test_extract.py
index b1e26ee..df9a6fb 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -7,18 +7,24 @@ def test_extract_links(browser, http_tab):
     browser("tabs.active", {"tabId": http_tab["id"]})
     links = browser("extract.links")
     assert isinstance(links, list)
+    hrefs = []
     for lnk in links:
         assert "href" in lnk
         assert "text" in lnk
+        hrefs.append(lnk["href"])
+    assert len(hrefs) == len(set(hrefs))
 
 
 def test_extract_images(browser, http_tab):
     browser("tabs.active", {"tabId": http_tab["id"]})
     images = browser("extract.images")
     assert isinstance(images, list)
+    sources = []
     for img in images:
         assert "src" in img
         assert img["src"] != ""
+        sources.append(img["src"])
+    assert len(sources) == len(set(sources))
 
 
 def test_extract_text(browser, http_tab):
@@ -35,6 +41,19 @@ def test_extract_html(browser, http_tab):
     assert "<" in html
 
 
+def test_extract_markdown(browser, http_tab):
+    browser("tabs.active", {"tabId": http_tab["id"]})
+    markdown = browser("extract.markdown")
+    assert isinstance(markdown, str)
+    assert len(markdown.strip()) > 0
+
+
+def test_extract_markdown_missing_selector_errors(browser, http_tab):
+    browser("tabs.active", {"tabId": http_tab["id"]})
+    with pytest.raises(RuntimeError, match="No element"):
+        browser("extract.markdown", {"selector": ".browser-cli-definitely-missing"})
+
+
 def test_dom_exists(browser, http_tab):
     browser("tabs.active", {"tabId": http_tab["id"]})
     result = browser("dom.exists", {"selector": "body"})