Add new extract command to extract a selector or the main content as Markdown; update version to 0.5.0
Package Extension / package-extension (push) Successful in 12s
Build & Publish Package / publish (push) Failing after 21s

This commit is contained in:
2026-04-10 03:44:49 +02:00
parent 79093ed558
commit f2a7f85ee3
8 changed files with 286 additions and 2 deletions
+2
View File
@@ -249,6 +249,8 @@ browser-cli extract images # all <img> tags (src + alt)
browser-cli extract text # all visible text (innerText) browser-cli extract text # all visible text (innerText)
browser-cli extract json "#data" # parse JSON inside a CSS selector browser-cli extract json "#data" # parse JSON inside a CSS selector
browser-cli extract html # full HTML of the active tab browser-cli extract html # full HTML of the active tab
browser-cli extract markdown # main page content as Markdown
browser-cli extract markdown --selector "article" # specific DOM subtree as Markdown
``` ```
### Sessions ### Sessions
+3
View File
@@ -247,6 +247,9 @@ class BrowserCLI:
def extract_json(self, selector: str): def extract_json(self, selector: str):
return self._cmd("extract.json", {"selector": selector}) return self._cmd("extract.json", {"selector": selector})
def extract_markdown(self, selector: str | None = None) -> str:
return self._cmd("extract.markdown", {"selector": selector}) or ""
# ── Session ─────────────────────────────────────────────────────────── # ── Session ───────────────────────────────────────────────────────────
def session_save(self, name: str) -> None: def session_save(self, name: str) -> None:
+8
View File
@@ -73,3 +73,11 @@ def extract_html():
"""Print the full HTML of the active tab to stdout.""" """Print the full HTML of the active tab to stdout."""
html = _handle("extract.html") html = _handle("extract.html")
click.echo(html or "") click.echo(html or "")
@extract_group.command("markdown")
@click.option("--selector", help="Extract only the DOM subtree matching this CSS selector.")
def extract_markdown(selector):
    """Extract the page's main content as Markdown."""
    # NOTE: the docstring above doubles as the command's --help text.
    rendered = _handle("extract.markdown", {"selector": selector}) or ""
    # Only let click append a newline when the payload does not already end
    # with one, so the output never finishes with a doubled line break.
    already_terminated = rendered.endswith("\n")
    click.echo(rendered, nl=not already_terminated)
+222 -1
View File
@@ -123,6 +123,7 @@ async function dispatch(command, args) {
case "extract.images": return domOp("extractImages", args); case "extract.images": return domOp("extractImages", args);
case "extract.text": return domOp("extractText", args); case "extract.text": return domOp("extractText", args);
case "extract.json": return domOp("extractJson", args); case "extract.json": return domOp("extractJson", args);
case "extract.markdown": return domOp("extractMarkdown", args);
case "extract.html": return tabsHtml({}); case "extract.html": return tabsHtml({});
// ── Session ─────────────────────────────────────────────────────────── // ── Session ───────────────────────────────────────────────────────────
@@ -605,9 +606,229 @@ function contentDispatch(funcName, args) {
if (!el) throw new Error(`No element: ${selector}`); if (!el) throw new Error(`No element: ${selector}`);
return JSON.parse(el.textContent); return JSON.parse(el.textContent);
} }
/**
 * Render part of the current document as Markdown.
 *
 * The root is the first element matching `selector` when one is given.
 * Otherwise it is the `main` / `article` / `[role='main']` element with the
 * most visible text, falling back to `document.body`. The root is cloned and
 * stripped of script/style/noscript/template nodes before conversion, so the
 * live page is never modified.
 *
 * @param {{selector?: string|null}} args  Optional CSS selector for the root.
 * @returns {string} Markdown text (may be empty).
 * @throws {Error} `No element: <selector>` when the selector matches nothing.
 */
function extractMarkdown({ selector }) {
  const BLOCKS = new Set([
    "article", "aside", "blockquote", "body", "div", "dl", "fieldset", "figcaption",
    "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
    "li", "main", "nav", "ol", "p", "pre", "section", "table", "tbody", "td", "tfoot",
    "th", "thead", "tr", "ul"
  ]);
  // Used to detect inline-looking wrappers (span, a, custom elements, ...)
  // that actually contain block content and must be rendered block-wise.
  const BLOCK_SELECTOR = Array.from(BLOCKS).join(",");
  const SKIPPED = new Set(["script", "style", "noscript", "template"]);
  // Four spaces nest correctly under both "- " and "N. " list markers.
  const LIST_INDENT = "    ";

  /** Collapse every whitespace run to a single space and trim. */
  function normalizeText(value) {
    return value.replace(/\s+/g, " ").trim();
  }

  /** Tidy inline output while keeping intentional line breaks. */
  function normalizeInline(value) {
    return value
      .replace(/[ \t]+\n/g, "\n")
      .replace(/\n[ \t]+/g, "\n")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/[ \t]{2,}/g, " ")
      .trim();
  }

  /** Strip trailing spaces and limit consecutive blank lines to one. */
  function collapseBlankLines(value) {
    return value
      .replace(/[ \t]+\n/g, "\n")
      .replace(/\n{3,}/g, "\n\n")
      .trim();
  }

  /** Backslash-escape characters that carry meaning in Markdown. */
  function escapeMarkdown(text) {
    return text.replace(/([\\`*_{}\[\]()#+\-!|>])/g, "\\$1");
  }

  /**
   * Make text safe for a table cell: one line, every pipe escaped exactly once.
   * Text nodes arrive with `|` already escaped by escapeMarkdown, so only pipes
   * preceded by an even number of backslashes (i.e. unescaped) get a backslash.
   */
  function escapeTableCell(text) {
    return text
      .replace(/(\\*)\|/g, (match, slashes) => (slashes.length % 2 ? match : `${slashes}\\|`))
      .replace(/\n+/g, " ")
      .trim();
  }

  /**
   * Prefer the browser-resolved URL (absolute) over the raw attribute value,
   * which may be relative; fall back to the attribute, then to "".
   */
  function absoluteUrl(attr, resolved) {
    return resolved || attr || "";
  }

  /** Length of the longest run of backticks in `text` (0 when none). */
  function longestBacktickRun(text) {
    const runs = text.match(/`+/g) || [];
    return runs.reduce((max, run) => Math.max(max, run.length), 0);
  }

  /** Unescaped text of a subtree, keeping <br> as a newline (for code). */
  function rawText(node) {
    if (node.nodeType === Node.TEXT_NODE) return node.textContent || "";
    if (node.nodeType !== Node.ELEMENT_NODE) return "";
    const tag = node.tagName.toLowerCase();
    if (SKIPPED.has(tag)) return "";
    if (tag === "br") return "\n";
    return Array.from(node.childNodes).map(rawText).join("");
  }

  /**
   * Wrap text in a code span. Backslash escapes do not work inside code
   * spans, so the delimiter is made longer than any backtick run inside.
   */
  function codeSpan(text) {
    if (!text) return "";
    const fence = "`".repeat(longestBacktickRun(text) + 1);
    const pad = text.startsWith("`") || text.endsWith("`") ? " " : "";
    return `${fence}${pad}${text}${pad}${fence}`;
  }

  /** Detached copy of `root` without script/style/noscript/template nodes. */
  function stripNoise(root) {
    const clone = root.cloneNode(true);
    clone.querySelectorAll("script, style, noscript, template").forEach(node => node.remove());
    return clone;
  }

  /** Choose the live element to convert (see the function-level comment). */
  function pickRoot() {
    if (selector) {
      const matched = document.querySelector(selector);
      if (!matched) throw new Error(`No element: ${selector}`);
      return matched;
    }
    const candidates = Array.from(document.querySelectorAll("main, article, [role='main']"))
      .filter(node => normalizeText(node.innerText || "").length > 0);
    if (!candidates.length) return document.body;
    candidates.sort((a, b) => (b.innerText || "").length - (a.innerText || "").length);
    return candidates[0];
  }

  /** True for elements that are, or contain, block-level content. */
  function isBlockLevel(node) {
    if (node.nodeType !== Node.ELEMENT_NODE) return false;
    if (BLOCKS.has(node.tagName.toLowerCase())) return true;
    return node.querySelector(BLOCK_SELECTOR) !== null;
  }

  /** Inline rendering of all children of `node`, normalised. */
  function inlineChildren(node) {
    return normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
  }

  /** Render a node as inline Markdown (links, images, emphasis, code). */
  function inlineText(node) {
    if (node.nodeType === Node.TEXT_NODE) {
      return escapeMarkdown(node.textContent || "");
    }
    if (node.nodeType !== Node.ELEMENT_NODE) return "";

    const tag = node.tagName.toLowerCase();
    if (SKIPPED.has(tag)) return "";
    if (tag === "br") return "\n";

    if (tag === "img") {
      const src = absoluteUrl(node.getAttribute("src"), node.src);
      if (!src) return "";
      const alt = normalizeText(node.getAttribute("alt") || "");
      return alt ? `![${escapeMarkdown(alt)}](${src})` : `![](${src})`;
    }

    if (tag === "a") {
      const text = inlineChildren(node);
      const href = absoluteUrl(node.getAttribute("href"), node.href);
      if (!href) return text;
      return `[${text || href}](${href})`;
    }

    if (tag === "code") {
      // Code content is literal: use the raw text, never Markdown-escaped.
      return codeSpan(normalizeInline(rawText(node)));
    }

    if (tag === "strong" || tag === "b") {
      const text = inlineChildren(node);
      return text ? `**${text}**` : "";
    }

    if (tag === "em" || tag === "i") {
      const text = inlineChildren(node);
      return text ? `*${text}*` : "";
    }

    const chunks = [];
    for (const child of node.childNodes) {
      const rendered = inlineText(child);
      if (!rendered) continue;
      chunks.push(rendered);
      // Keep block children on their own line when flattened into inline text.
      if (child.nodeType === Node.ELEMENT_NODE && BLOCKS.has(child.tagName.toLowerCase())) {
        chunks.push("\n");
      }
    }
    return chunks.join("");
  }

  /** Inline content of `node` as one tidy text block. */
  function textBlock(node) {
    return collapseBlankLines(inlineChildren(node));
  }

  /** Convert a <table> to a pipe table; header cells are blank without <th>/<thead>. */
  function tableToMarkdown(table) {
    // `table.rows` only lists this table's own rows, not rows of nested tables.
    const rowElements = Array.from(table.rows);
    const rows = rowElements
      .map(row => Array.from(row.children)
        .filter(cell => cell.tagName === "TD" || cell.tagName === "TH")
        .map(cell => escapeTableCell(textBlock(cell)))
      )
      .filter(cells => cells.length > 0);

    if (!rows.length) return "";

    const widths = rows.reduce((max, row) => Math.max(max, row.length), 0);
    const normalizedRows = rows.map(row => {
      const next = row.slice();
      while (next.length < widths) next.push("");
      return next;
    });

    let headers = normalizedRows[0];
    let bodyRows = normalizedRows.slice(1);

    const firstRow = rowElements[0];
    const firstRowHasTh = firstRow && Array.from(firstRow.children).some(cell => cell.tagName === "TH");
    if (!(table.tHead || firstRowHasTh)) {
      headers = new Array(widths).fill("");
      bodyRows = normalizedRows;
    }

    const separator = new Array(widths).fill("---");
    const lines = [
      `| ${headers.join(" | ")} |`,
      `| ${separator.join(" | ")} |`,
    ];
    for (const row of bodyRows) {
      lines.push(`| ${row.join(" | ")} |`);
    }
    return lines.join("\n");
  }

  /** Convert a <ul>/<ol> (and nested lists) to Markdown list items. */
  function listToMarkdown(list, depth = 0) {
    const ordered = list.tagName.toLowerCase() === "ol";
    // Honour <ol start="N">; `start` defaults to 1 on ordered lists.
    const first = ordered && Number.isInteger(list.start) ? list.start : 1;
    const indent = LIST_INDENT.repeat(depth);
    const items = [];
    const children = Array.from(list.children).filter(child => child.tagName === "LI");

    children.forEach((item, index) => {
      const marker = ordered ? `${first + index}. ` : "- ";
      const nested = [];
      const content = [];

      for (const child of item.childNodes) {
        if (child.nodeType === Node.ELEMENT_NODE && (child.tagName === "UL" || child.tagName === "OL")) {
          nested.push(listToMarkdown(child, depth + 1));
        } else {
          content.push(inlineText(child));
        }
      }

      const line = collapseBlankLines(normalizeInline(content.join("")));
      if (line) {
        // Continuation lines must align with the item text to stay in the item.
        const hanging = `\n${indent}${" ".repeat(marker.length)}`;
        items.push(`${indent}${marker}${line.replace(/\n(?=[^\n])/g, hanging)}`);
      }
      nested.filter(Boolean).forEach(block => items.push(block));
    });

    return items.join("\n");
  }

  /**
   * Render the children of a container. Consecutive inline children (text,
   * links, emphasis, images, ...) form ONE paragraph rendered with
   * inlineText, so their formatting and escaping survive; block-level
   * children are rendered on their own and separated by blank lines.
   */
  function renderChildren(node) {
    const blocks = [];
    let run = [];

    const flushRun = () => {
      if (!run.length) return;
      const text = collapseBlankLines(normalizeInline(run.map(inlineText).join("")));
      if (text) blocks.push(text);
      run = [];
    };

    for (const child of node.childNodes) {
      if (isBlockLevel(child)) {
        flushRun();
        const rendered = blockToMarkdown(child);
        if (rendered) blocks.push(rendered);
      } else {
        run.push(child);
      }
    }
    flushRun();

    return collapseBlankLines(blocks.join("\n\n"));
  }

  /** Render a node as block-level Markdown. */
  function blockToMarkdown(node) {
    if (node.nodeType === Node.TEXT_NODE) {
      return escapeMarkdown(normalizeText(node.textContent || ""));
    }
    if (node.nodeType !== Node.ELEMENT_NODE) return "";

    const tag = node.tagName.toLowerCase();
    if (SKIPPED.has(tag)) return "";

    if (tag === "table") return tableToMarkdown(node);
    if (tag === "ul" || tag === "ol") return listToMarkdown(node);

    if (tag === "pre") {
      const code = rawText(node).replace(/\n$/, "");
      if (!code) return "";
      // The fence must be longer than any backtick run inside the code.
      const fence = "`".repeat(Math.max(3, longestBacktickRun(code) + 1));
      return `${fence}\n${code}\n${fence}`;
    }

    if (tag === "blockquote") {
      const content = renderChildren(node);
      if (!content) return "";
      return content
        .split("\n")
        .map(line => line ? `> ${line}` : ">")
        .join("\n");
    }

    if (/^h[1-6]$/.test(tag)) {
      const level = Number(tag.slice(1));
      // ATX headings are single-line: fold any line break into a space.
      const text = textBlock(node).replace(/\s*\n\s*/g, " ");
      return text ? `${"#".repeat(level)} ${text}` : "";
    }

    if (tag === "p" || tag === "figcaption") {
      return textBlock(node);
    }

    if (tag === "hr") {
      return "---";
    }

    // Purely inline elements (img, a, strong, span without blocks, ...) keep
    // their inline formatting instead of being flattened to bare text.
    if (!isBlockLevel(node)) {
      return collapseBlankLines(normalizeInline(inlineText(node)));
    }

    return renderChildren(node);
  }

  const root = stripNoise(pickRoot());
  return collapseBlankLines(blockToMarkdown(root));
}
const fns = { domQuery, domClick, domType, domAttr, domText, domExists, const fns = { domQuery, domClick, domType, domAttr, domText, domExists,
extractLinks, extractImages, extractText, extractJson }; extractLinks, extractImages, extractText, extractJson, extractMarkdown };
const fn = fns[funcName]; const fn = fns[funcName];
if (!fn) throw new Error(`Unknown content function: ${funcName}`); if (!fn) throw new Error(`Unknown content function: ${funcName}`);
return fn(args); return fn(args);
+1 -1
View File
@@ -1,6 +1,6 @@
[project] [project]
name = "browser-cli" name = "browser-cli"
version = "0.4.1" version = "0.5.0"
description = "Control your real running browser from the terminal via a Chrome extension" description = "Control your real running browser from the terminal via a Chrome extension"
requires-python = ">=3.10" requires-python = ">=3.10"
dependencies = [ dependencies = [
+15
View File
@@ -182,6 +182,21 @@ class TestSearch:
assert mock_send.call_args[0][1]["background"] is True assert mock_send.call_args[0][1]["background"] is True
class TestExtract:
    """Client-level checks for ``extract_markdown``."""

    def test_extract_markdown_default(self, b, mock_send):
        """Without a selector, ``None`` is forwarded and the reply is returned."""
        mock_send.return_value = "# Title"

        rendered = b.extract_markdown()

        assert rendered == "# Title"
        expected_args = {"selector": None}
        mock_send.assert_called_once_with("extract.markdown", expected_args, profile=None)

    def test_extract_markdown_selector(self, b, mock_send):
        """A positional selector is forwarded unchanged in the command args."""
        chosen = "article"

        b.extract_markdown(chosen)

        mock_send.assert_called_once_with("extract.markdown", {"selector": chosen}, profile=None)
# ── Tabs ────────────────────────────────────────────────────────────────────── # ── Tabs ──────────────────────────────────────────────────────────────────────
class TestTabs: class TestTabs:
+16
View File
@@ -54,3 +54,19 @@ def test_clients_exits_cleanly_when_registry_is_missing():
assert result.exit_code == 1 assert result.exit_code == 1
assert "No browser clients found" in result.output assert "No browser clients found" in result.output
def test_extract_markdown_command():
    """`extract markdown` sends a null selector and prints the reply without an extra newline."""
    runner = CliRunner()
    argv = ["extract", "markdown"]
    with patch("browser_cli.commands.extract.send_command", return_value="# Title\n") as send_command:
        result = runner.invoke(main, argv)
    assert result.exit_code == 0
    assert result.output == "# Title\n"
    send_command.assert_called_once_with("extract.markdown", {"selector": None})
def test_extract_markdown_command_with_selector():
    """`--selector` is passed through to the `extract.markdown` command args."""
    runner = CliRunner()
    argv = ["extract", "markdown", "--selector", "article"]
    with patch("browser_cli.commands.extract.send_command", return_value="## Post\n") as send_command:
        result = runner.invoke(main, argv)
    assert result.exit_code == 0
    assert result.output == "## Post\n"
    send_command.assert_called_once_with("extract.markdown", {"selector": "article"})
+19
View File
@@ -7,18 +7,24 @@ def test_extract_links(browser, http_tab):
browser("tabs.active", {"tabId": http_tab["id"]}) browser("tabs.active", {"tabId": http_tab["id"]})
links = browser("extract.links") links = browser("extract.links")
assert isinstance(links, list) assert isinstance(links, list)
hrefs = []
for lnk in links: for lnk in links:
assert "href" in lnk assert "href" in lnk
assert "text" in lnk assert "text" in lnk
hrefs.append(lnk["href"])
assert len(hrefs) == len(set(hrefs))
def test_extract_images(browser, http_tab): def test_extract_images(browser, http_tab):
browser("tabs.active", {"tabId": http_tab["id"]}) browser("tabs.active", {"tabId": http_tab["id"]})
images = browser("extract.images") images = browser("extract.images")
assert isinstance(images, list) assert isinstance(images, list)
sources = []
for img in images: for img in images:
assert "src" in img assert "src" in img
assert img["src"] != "" assert img["src"] != ""
sources.append(img["src"])
assert len(sources) == len(set(sources))
def test_extract_text(browser, http_tab): def test_extract_text(browser, http_tab):
@@ -35,6 +41,19 @@ def test_extract_html(browser, http_tab):
assert "<" in html assert "<" in html
def test_extract_markdown(browser, http_tab):
    """Extracting Markdown from the activated HTTP tab yields a non-blank string."""
    tab_id = http_tab["id"]
    browser("tabs.active", {"tabId": tab_id})
    rendered = browser("extract.markdown")
    assert isinstance(rendered, str)
    assert rendered.strip() != ""
def test_extract_markdown_missing_selector_errors(browser, http_tab):
    """A selector that matches nothing surfaces as a ``No element`` RuntimeError."""
    tab_id = http_tab["id"]
    browser("tabs.active", {"tabId": tab_id})
    missing = {"selector": ".browser-cli-definitely-missing"}
    with pytest.raises(RuntimeError, match="No element"):
        browser("extract.markdown", missing)
def test_dom_exists(browser, http_tab): def test_dom_exists(browser, http_tab):
browser("tabs.active", {"tabId": http_tab["id"]}) browser("tabs.active", {"tabId": http_tab["id"]})
result = browser("dom.exists", {"selector": "body"}) result = browser("dom.exists", {"selector": "body"})