Add new extract command to extract a selector or the main content as Markdown; update version to 0.5.0
This commit is contained in:
@@ -249,6 +249,8 @@ browser-cli extract images # all <img> tags (src + alt)
|
|||||||
browser-cli extract text # all visible text (innerText)
|
browser-cli extract text # all visible text (innerText)
|
||||||
browser-cli extract json "#data" # parse JSON inside a CSS selector
|
browser-cli extract json "#data" # parse JSON inside a CSS selector
|
||||||
browser-cli extract html # full HTML of the active tab
|
browser-cli extract html # full HTML of the active tab
|
||||||
|
browser-cli extract markdown # main page content as Markdown
|
||||||
|
browser-cli extract markdown --selector "article" # specific DOM subtree as Markdown
|
||||||
```
|
```
|
||||||
|
|
||||||
### Sessions
|
### Sessions
|
||||||
|
|||||||
@@ -247,6 +247,9 @@ class BrowserCLI:
|
|||||||
def extract_json(self, selector: str):
|
def extract_json(self, selector: str):
|
||||||
return self._cmd("extract.json", {"selector": selector})
|
return self._cmd("extract.json", {"selector": selector})
|
||||||
|
|
||||||
|
def extract_markdown(self, selector: str | None = None) -> str:
|
||||||
|
return self._cmd("extract.markdown", {"selector": selector}) or ""
|
||||||
|
|
||||||
# ── Session ───────────────────────────────────────────────────────────
|
# ── Session ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def session_save(self, name: str) -> None:
|
def session_save(self, name: str) -> None:
|
||||||
|
|||||||
@@ -73,3 +73,11 @@ def extract_html():
|
|||||||
"""Print the full HTML of the active tab to stdout."""
|
"""Print the full HTML of the active tab to stdout."""
|
||||||
html = _handle("extract.html")
|
html = _handle("extract.html")
|
||||||
click.echo(html or "")
|
click.echo(html or "")
|
||||||
|
|
||||||
|
|
||||||
|
@extract_group.command("markdown")
|
||||||
|
@click.option("--selector", help="Extract only the DOM subtree matching this CSS selector.")
|
||||||
|
def extract_markdown(selector):
|
||||||
|
"""Extract the page's main content as Markdown."""
|
||||||
|
markdown = _handle("extract.markdown", {"selector": selector})
|
||||||
|
click.echo(markdown or "", nl=not (markdown or "").endswith("\n"))
|
||||||
|
|||||||
+222
-1
@@ -123,6 +123,7 @@ async function dispatch(command, args) {
|
|||||||
case "extract.images": return domOp("extractImages", args);
|
case "extract.images": return domOp("extractImages", args);
|
||||||
case "extract.text": return domOp("extractText", args);
|
case "extract.text": return domOp("extractText", args);
|
||||||
case "extract.json": return domOp("extractJson", args);
|
case "extract.json": return domOp("extractJson", args);
|
||||||
|
case "extract.markdown": return domOp("extractMarkdown", args);
|
||||||
case "extract.html": return tabsHtml({});
|
case "extract.html": return tabsHtml({});
|
||||||
|
|
||||||
// ── Session ───────────────────────────────────────────────────────────
|
// ── Session ───────────────────────────────────────────────────────────
|
||||||
@@ -605,9 +606,229 @@ function contentDispatch(funcName, args) {
|
|||||||
if (!el) throw new Error(`No element: ${selector}`);
|
if (!el) throw new Error(`No element: ${selector}`);
|
||||||
return JSON.parse(el.textContent);
|
return JSON.parse(el.textContent);
|
||||||
}
|
}
|
||||||
|
function extractMarkdown({ selector }) {
|
||||||
|
const BLOCKS = new Set([
|
||||||
|
"article", "aside", "blockquote", "body", "div", "dl", "fieldset", "figcaption",
|
||||||
|
"figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
|
||||||
|
"li", "main", "nav", "ol", "p", "pre", "section", "table", "tbody", "td", "tfoot",
|
||||||
|
"th", "thead", "tr", "ul"
|
||||||
|
]);
|
||||||
|
|
||||||
|
function normalizeText(value) {
|
||||||
|
return value.replace(/\s+/g, " ").trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
function normalizeInline(value) {
|
||||||
|
return value
|
||||||
|
.replace(/[ \t]+\n/g, "\n")
|
||||||
|
.replace(/\n[ \t]+/g, "\n")
|
||||||
|
.replace(/\n{3,}/g, "\n\n")
|
||||||
|
.replace(/[ \t]{2,}/g, " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
function collapseBlankLines(value) {
|
||||||
|
return value
|
||||||
|
.replace(/[ \t]+\n/g, "\n")
|
||||||
|
.replace(/\n{3,}/g, "\n\n")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
function escapeMarkdown(text) {
|
||||||
|
return text.replace(/([\\`*_{}\[\]()#+\-!|>])/g, "\\$1");
|
||||||
|
}
|
||||||
|
|
||||||
|
function escapeTableCell(text) {
|
||||||
|
return text.replace(/\|/g, "\\|").replace(/\n+/g, " ").trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
function absoluteUrl(attr, fallback) {
|
||||||
|
return attr || fallback || "";
|
||||||
|
}
|
||||||
|
|
||||||
|
function stripNoise(root) {
|
||||||
|
const clone = root.cloneNode(true);
|
||||||
|
clone.querySelectorAll("script, style, noscript, template").forEach(node => node.remove());
|
||||||
|
return clone;
|
||||||
|
}
|
||||||
|
|
||||||
|
function pickRoot() {
|
||||||
|
if (selector) {
|
||||||
|
const matched = document.querySelector(selector);
|
||||||
|
if (!matched) throw new Error(`No element: ${selector}`);
|
||||||
|
return matched;
|
||||||
|
}
|
||||||
|
|
||||||
|
const candidates = Array.from(document.querySelectorAll("main, article, [role='main']"))
|
||||||
|
.filter(node => normalizeText(node.innerText || "").length > 0);
|
||||||
|
if (!candidates.length) return document.body;
|
||||||
|
candidates.sort((a, b) => (b.innerText || "").length - (a.innerText || "").length);
|
||||||
|
return candidates[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
function inlineText(node) {
|
||||||
|
if (node.nodeType === Node.TEXT_NODE) {
|
||||||
|
return escapeMarkdown(node.textContent || "");
|
||||||
|
}
|
||||||
|
if (node.nodeType !== Node.ELEMENT_NODE) return "";
|
||||||
|
|
||||||
|
const tag = node.tagName.toLowerCase();
|
||||||
|
if (tag === "script" || tag === "style" || tag === "noscript" || tag === "template") return "";
|
||||||
|
if (tag === "br") return "\n";
|
||||||
|
if (tag === "img") {
|
||||||
|
const src = absoluteUrl(node.getAttribute("src"), node.src);
|
||||||
|
if (!src) return "";
|
||||||
|
const alt = normalizeText(node.getAttribute("alt") || "");
|
||||||
|
return alt ? `![${alt}](${src})` : `![](${src})`;
|
||||||
|
}
|
||||||
|
if (tag === "a") {
|
||||||
|
const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
|
||||||
|
const href = absoluteUrl(node.getAttribute("href"), node.href);
|
||||||
|
if (!href) return text;
|
||||||
|
return `[${text || href}](${href})`;
|
||||||
|
}
|
||||||
|
if (tag === "code") {
|
||||||
|
const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
|
||||||
|
return text ? `\`${text.replace(/`/g, "\\`")}\`` : "";
|
||||||
|
}
|
||||||
|
if (tag === "strong" || tag === "b") {
|
||||||
|
const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
|
||||||
|
return text ? `**${text}**` : "";
|
||||||
|
}
|
||||||
|
if (tag === "em" || tag === "i") {
|
||||||
|
const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
|
||||||
|
return text ? `*${text}*` : "";
|
||||||
|
}
|
||||||
|
|
||||||
|
const chunks = [];
|
||||||
|
for (const child of node.childNodes) {
|
||||||
|
const rendered = inlineText(child);
|
||||||
|
if (!rendered) continue;
|
||||||
|
chunks.push(rendered);
|
||||||
|
if (child.nodeType === Node.ELEMENT_NODE && BLOCKS.has(child.tagName.toLowerCase())) {
|
||||||
|
chunks.push("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return chunks.join("");
|
||||||
|
}
|
||||||
|
|
||||||
|
function textBlock(node) {
|
||||||
|
return collapseBlankLines(normalizeInline(Array.from(node.childNodes).map(inlineText).join("")));
|
||||||
|
}
|
||||||
|
|
||||||
|
function tableToMarkdown(table) {
|
||||||
|
const rows = Array.from(table.querySelectorAll("tr"))
|
||||||
|
.map(row => Array.from(row.children)
|
||||||
|
.filter(cell => cell.tagName === "TD" || cell.tagName === "TH")
|
||||||
|
.map(cell => escapeTableCell(textBlock(cell)))
|
||||||
|
)
|
||||||
|
.filter(cells => cells.length > 0);
|
||||||
|
if (!rows.length) return "";
|
||||||
|
|
||||||
|
const widths = rows.reduce((max, row) => Math.max(max, row.length), 0);
|
||||||
|
const normalizedRows = rows.map(row => {
|
||||||
|
const next = row.slice();
|
||||||
|
while (next.length < widths) next.push("");
|
||||||
|
return next;
|
||||||
|
});
|
||||||
|
|
||||||
|
let headers = normalizedRows[0];
|
||||||
|
let bodyRows = normalizedRows.slice(1);
|
||||||
|
const firstRow = table.querySelector("tr");
|
||||||
|
const thead = table.querySelector("thead");
|
||||||
|
const firstRowHasTh = firstRow && Array.from(firstRow.children).some(cell => cell.tagName === "TH");
|
||||||
|
if (!(thead || firstRowHasTh)) {
|
||||||
|
headers = new Array(widths).fill("");
|
||||||
|
bodyRows = normalizedRows;
|
||||||
|
}
|
||||||
|
|
||||||
|
const separator = new Array(widths).fill("---");
|
||||||
|
const lines = [
|
||||||
|
`| ${headers.join(" | ")} |`,
|
||||||
|
`| ${separator.join(" | ")} |`,
|
||||||
|
];
|
||||||
|
for (const row of bodyRows) {
|
||||||
|
lines.push(`| ${row.join(" | ")} |`);
|
||||||
|
}
|
||||||
|
return lines.join("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
function listToMarkdown(list, depth = 0) {
|
||||||
|
const ordered = list.tagName.toLowerCase() === "ol";
|
||||||
|
const items = [];
|
||||||
|
const children = Array.from(list.children).filter(child => child.tagName === "LI");
|
||||||
|
children.forEach((item, index) => {
|
||||||
|
const marker = ordered ? `${index + 1}. ` : "- ";
|
||||||
|
const indent = " ".repeat(depth);
|
||||||
|
const nested = [];
|
||||||
|
const content = [];
|
||||||
|
|
||||||
|
for (const child of item.childNodes) {
|
||||||
|
if (child.nodeType === Node.ELEMENT_NODE && (child.tagName === "UL" || child.tagName === "OL")) {
|
||||||
|
nested.push(listToMarkdown(child, depth + 1));
|
||||||
|
} else {
|
||||||
|
content.push(inlineText(child));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const line = collapseBlankLines(normalizeInline(content.join("")));
|
||||||
|
if (line) items.push(`${indent}${marker}${line}`);
|
||||||
|
nested.filter(Boolean).forEach(block => items.push(block));
|
||||||
|
});
|
||||||
|
return items.join("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
function blockToMarkdown(node) {
|
||||||
|
if (node.nodeType === Node.TEXT_NODE) {
|
||||||
|
return normalizeText(node.textContent || "");
|
||||||
|
}
|
||||||
|
if (node.nodeType !== Node.ELEMENT_NODE) return "";
|
||||||
|
|
||||||
|
const tag = node.tagName.toLowerCase();
|
||||||
|
if (tag === "script" || tag === "style" || tag === "noscript" || tag === "template") return "";
|
||||||
|
if (tag === "table") return tableToMarkdown(node);
|
||||||
|
if (tag === "ul" || tag === "ol") return listToMarkdown(node);
|
||||||
|
if (tag === "pre") {
|
||||||
|
const code = node.innerText.replace(/\n$/, "");
|
||||||
|
return code ? `\`\`\`\n${code}\n\`\`\`` : "";
|
||||||
|
}
|
||||||
|
if (tag === "blockquote") {
|
||||||
|
const content = collapseBlankLines(Array.from(node.childNodes).map(blockToMarkdown).join("\n\n"));
|
||||||
|
return content
|
||||||
|
.split("\n")
|
||||||
|
.map(line => line ? `> ${line}` : ">")
|
||||||
|
.join("\n");
|
||||||
|
}
|
||||||
|
if (/^h[1-6]$/.test(tag)) {
|
||||||
|
const level = Number(tag.slice(1));
|
||||||
|
const text = textBlock(node);
|
||||||
|
return text ? `${"#".repeat(level)} ${text}` : "";
|
||||||
|
}
|
||||||
|
if (tag === "p" || tag === "figcaption") {
|
||||||
|
return textBlock(node);
|
||||||
|
}
|
||||||
|
if (tag === "hr") {
|
||||||
|
return "---";
|
||||||
|
}
|
||||||
|
if (tag === "img") {
|
||||||
|
return inlineText(node);
|
||||||
|
}
|
||||||
|
|
||||||
|
const childBlocks = Array.from(node.childNodes)
|
||||||
|
.map(child => blockToMarkdown(child))
|
||||||
|
.filter(Boolean);
|
||||||
|
if (childBlocks.length) return collapseBlankLines(childBlocks.join("\n\n"));
|
||||||
|
|
||||||
|
return textBlock(node);
|
||||||
|
}
|
||||||
|
|
||||||
|
const root = stripNoise(pickRoot());
|
||||||
|
const markdown = blockToMarkdown(root);
|
||||||
|
return collapseBlankLines(markdown);
|
||||||
|
}
|
||||||
|
|
||||||
const fns = { domQuery, domClick, domType, domAttr, domText, domExists,
|
const fns = { domQuery, domClick, domType, domAttr, domText, domExists,
|
||||||
extractLinks, extractImages, extractText, extractJson };
|
extractLinks, extractImages, extractText, extractJson, extractMarkdown };
|
||||||
const fn = fns[funcName];
|
const fn = fns[funcName];
|
||||||
if (!fn) throw new Error(`Unknown content function: ${funcName}`);
|
if (!fn) throw new Error(`Unknown content function: ${funcName}`);
|
||||||
return fn(args);
|
return fn(args);
|
||||||
|
|||||||
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "browser-cli"
|
name = "browser-cli"
|
||||||
version = "0.4.1"
|
version = "0.5.0"
|
||||||
description = "Control your real running browser from the terminal via a Chrome extension"
|
description = "Control your real running browser from the terminal via a Chrome extension"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
|||||||
@@ -182,6 +182,21 @@ class TestSearch:
|
|||||||
assert mock_send.call_args[0][1]["background"] is True
|
assert mock_send.call_args[0][1]["background"] is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtract:
|
||||||
|
def test_extract_markdown_default(self, b, mock_send):
|
||||||
|
mock_send.return_value = "# Title"
|
||||||
|
|
||||||
|
result = b.extract_markdown()
|
||||||
|
|
||||||
|
assert result == "# Title"
|
||||||
|
mock_send.assert_called_once_with("extract.markdown", {"selector": None}, profile=None)
|
||||||
|
|
||||||
|
def test_extract_markdown_selector(self, b, mock_send):
|
||||||
|
b.extract_markdown("article")
|
||||||
|
|
||||||
|
mock_send.assert_called_once_with("extract.markdown", {"selector": "article"}, profile=None)
|
||||||
|
|
||||||
|
|
||||||
# ── Tabs ──────────────────────────────────────────────────────────────────────
|
# ── Tabs ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
class TestTabs:
|
class TestTabs:
|
||||||
|
|||||||
@@ -54,3 +54,19 @@ def test_clients_exits_cleanly_when_registry_is_missing():
|
|||||||
|
|
||||||
assert result.exit_code == 1
|
assert result.exit_code == 1
|
||||||
assert "No browser clients found" in result.output
|
assert "No browser clients found" in result.output
|
||||||
|
|
||||||
|
def test_extract_markdown_command():
|
||||||
|
with patch("browser_cli.commands.extract.send_command", return_value="# Title\n") as send_command:
|
||||||
|
result = CliRunner().invoke(main, ["extract", "markdown"])
|
||||||
|
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert result.output == "# Title\n"
|
||||||
|
send_command.assert_called_once_with("extract.markdown", {"selector": None})
|
||||||
|
|
||||||
|
def test_extract_markdown_command_with_selector():
|
||||||
|
with patch("browser_cli.commands.extract.send_command", return_value="## Post\n") as send_command:
|
||||||
|
result = CliRunner().invoke(main, ["extract", "markdown", "--selector", "article"])
|
||||||
|
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert result.output == "## Post\n"
|
||||||
|
send_command.assert_called_once_with("extract.markdown", {"selector": "article"})
|
||||||
|
|||||||
@@ -7,18 +7,24 @@ def test_extract_links(browser, http_tab):
|
|||||||
browser("tabs.active", {"tabId": http_tab["id"]})
|
browser("tabs.active", {"tabId": http_tab["id"]})
|
||||||
links = browser("extract.links")
|
links = browser("extract.links")
|
||||||
assert isinstance(links, list)
|
assert isinstance(links, list)
|
||||||
|
hrefs = []
|
||||||
for lnk in links:
|
for lnk in links:
|
||||||
assert "href" in lnk
|
assert "href" in lnk
|
||||||
assert "text" in lnk
|
assert "text" in lnk
|
||||||
|
hrefs.append(lnk["href"])
|
||||||
|
assert len(hrefs) == len(set(hrefs))
|
||||||
|
|
||||||
|
|
||||||
def test_extract_images(browser, http_tab):
|
def test_extract_images(browser, http_tab):
|
||||||
browser("tabs.active", {"tabId": http_tab["id"]})
|
browser("tabs.active", {"tabId": http_tab["id"]})
|
||||||
images = browser("extract.images")
|
images = browser("extract.images")
|
||||||
assert isinstance(images, list)
|
assert isinstance(images, list)
|
||||||
|
sources = []
|
||||||
for img in images:
|
for img in images:
|
||||||
assert "src" in img
|
assert "src" in img
|
||||||
assert img["src"] != ""
|
assert img["src"] != ""
|
||||||
|
sources.append(img["src"])
|
||||||
|
assert len(sources) == len(set(sources))
|
||||||
|
|
||||||
|
|
||||||
def test_extract_text(browser, http_tab):
|
def test_extract_text(browser, http_tab):
|
||||||
@@ -35,6 +41,19 @@ def test_extract_html(browser, http_tab):
|
|||||||
assert "<" in html
|
assert "<" in html
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_markdown(browser, http_tab):
|
||||||
|
browser("tabs.active", {"tabId": http_tab["id"]})
|
||||||
|
markdown = browser("extract.markdown")
|
||||||
|
assert isinstance(markdown, str)
|
||||||
|
assert len(markdown.strip()) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_markdown_missing_selector_errors(browser, http_tab):
|
||||||
|
browser("tabs.active", {"tabId": http_tab["id"]})
|
||||||
|
with pytest.raises(RuntimeError, match="No element"):
|
||||||
|
browser("extract.markdown", {"selector": ".browser-cli-definitely-missing"})
|
||||||
|
|
||||||
|
|
||||||
def test_dom_exists(browser, http_tab):
|
def test_dom_exists(browser, http_tab):
|
||||||
browser("tabs.active", {"tabId": http_tab["id"]})
|
browser("tabs.active", {"tabId": http_tab["id"]})
|
||||||
result = browser("dom.exists", {"selector": "body"})
|
result = browser("dom.exists", {"selector": "body"})
|
||||||
|
|||||||
Reference in New Issue
Block a user