Add a new extract command that extracts a selector or the page's main content as Markdown; update the version to 0.5.0
This commit is contained in:
+222
-1
@@ -123,6 +123,7 @@ async function dispatch(command, args) {
|
||||
case "extract.images": return domOp("extractImages", args);
|
||||
case "extract.text": return domOp("extractText", args);
|
||||
case "extract.json": return domOp("extractJson", args);
|
||||
case "extract.markdown": return domOp("extractMarkdown", args);
|
||||
case "extract.html": return tabsHtml({});
|
||||
|
||||
// ── Session ───────────────────────────────────────────────────────────
|
||||
@@ -605,9 +606,229 @@ function contentDispatch(funcName, args) {
|
||||
if (!el) throw new Error(`No element: ${selector}`);
|
||||
return JSON.parse(el.textContent);
|
||||
}
|
||||
function extractMarkdown({ selector }) {
|
||||
const BLOCKS = new Set([
|
||||
"article", "aside", "blockquote", "body", "div", "dl", "fieldset", "figcaption",
|
||||
"figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
|
||||
"li", "main", "nav", "ol", "p", "pre", "section", "table", "tbody", "td", "tfoot",
|
||||
"th", "thead", "tr", "ul"
|
||||
]);
|
||||
|
||||
function normalizeText(value) {
|
||||
return value.replace(/\s+/g, " ").trim();
|
||||
}
|
||||
|
||||
function normalizeInline(value) {
|
||||
return value
|
||||
.replace(/[ \t]+\n/g, "\n")
|
||||
.replace(/\n[ \t]+/g, "\n")
|
||||
.replace(/\n{3,}/g, "\n\n")
|
||||
.replace(/[ \t]{2,}/g, " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
function collapseBlankLines(value) {
|
||||
return value
|
||||
.replace(/[ \t]+\n/g, "\n")
|
||||
.replace(/\n{3,}/g, "\n\n")
|
||||
.trim();
|
||||
}
|
||||
|
||||
function escapeMarkdown(text) {
|
||||
return text.replace(/([\\`*_{}\[\]()#+\-!|>])/g, "\\$1");
|
||||
}
|
||||
|
||||
function escapeTableCell(text) {
|
||||
return text.replace(/\|/g, "\\|").replace(/\n+/g, " ").trim();
|
||||
}
|
||||
|
||||
function absoluteUrl(attr, fallback) {
|
||||
return attr || fallback || "";
|
||||
}
|
||||
|
||||
function stripNoise(root) {
|
||||
const clone = root.cloneNode(true);
|
||||
clone.querySelectorAll("script, style, noscript, template").forEach(node => node.remove());
|
||||
return clone;
|
||||
}
|
||||
|
||||
function pickRoot() {
|
||||
if (selector) {
|
||||
const matched = document.querySelector(selector);
|
||||
if (!matched) throw new Error(`No element: ${selector}`);
|
||||
return matched;
|
||||
}
|
||||
|
||||
const candidates = Array.from(document.querySelectorAll("main, article, [role='main']"))
|
||||
.filter(node => normalizeText(node.innerText || "").length > 0);
|
||||
if (!candidates.length) return document.body;
|
||||
candidates.sort((a, b) => (b.innerText || "").length - (a.innerText || "").length);
|
||||
return candidates[0];
|
||||
}
|
||||
|
||||
function inlineText(node) {
|
||||
if (node.nodeType === Node.TEXT_NODE) {
|
||||
return escapeMarkdown(node.textContent || "");
|
||||
}
|
||||
if (node.nodeType !== Node.ELEMENT_NODE) return "";
|
||||
|
||||
const tag = node.tagName.toLowerCase();
|
||||
if (tag === "script" || tag === "style" || tag === "noscript" || tag === "template") return "";
|
||||
if (tag === "br") return "\n";
|
||||
if (tag === "img") {
|
||||
const src = absoluteUrl(node.getAttribute("src"), node.src);
|
||||
if (!src) return "";
|
||||
const alt = normalizeText(node.getAttribute("alt") || "");
|
||||
return alt ? `` : ``;
|
||||
}
|
||||
if (tag === "a") {
|
||||
const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
|
||||
const href = absoluteUrl(node.getAttribute("href"), node.href);
|
||||
if (!href) return text;
|
||||
return `[${text || href}](${href})`;
|
||||
}
|
||||
if (tag === "code") {
|
||||
const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
|
||||
return text ? `\`${text.replace(/`/g, "\\`")}\`` : "";
|
||||
}
|
||||
if (tag === "strong" || tag === "b") {
|
||||
const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
|
||||
return text ? `**${text}**` : "";
|
||||
}
|
||||
if (tag === "em" || tag === "i") {
|
||||
const text = normalizeInline(Array.from(node.childNodes).map(inlineText).join(""));
|
||||
return text ? `*${text}*` : "";
|
||||
}
|
||||
|
||||
const chunks = [];
|
||||
for (const child of node.childNodes) {
|
||||
const rendered = inlineText(child);
|
||||
if (!rendered) continue;
|
||||
chunks.push(rendered);
|
||||
if (child.nodeType === Node.ELEMENT_NODE && BLOCKS.has(child.tagName.toLowerCase())) {
|
||||
chunks.push("\n");
|
||||
}
|
||||
}
|
||||
return chunks.join("");
|
||||
}
|
||||
|
||||
function textBlock(node) {
|
||||
return collapseBlankLines(normalizeInline(Array.from(node.childNodes).map(inlineText).join("")));
|
||||
}
|
||||
|
||||
function tableToMarkdown(table) {
|
||||
const rows = Array.from(table.querySelectorAll("tr"))
|
||||
.map(row => Array.from(row.children)
|
||||
.filter(cell => cell.tagName === "TD" || cell.tagName === "TH")
|
||||
.map(cell => escapeTableCell(textBlock(cell)))
|
||||
)
|
||||
.filter(cells => cells.length > 0);
|
||||
if (!rows.length) return "";
|
||||
|
||||
const widths = rows.reduce((max, row) => Math.max(max, row.length), 0);
|
||||
const normalizedRows = rows.map(row => {
|
||||
const next = row.slice();
|
||||
while (next.length < widths) next.push("");
|
||||
return next;
|
||||
});
|
||||
|
||||
let headers = normalizedRows[0];
|
||||
let bodyRows = normalizedRows.slice(1);
|
||||
const firstRow = table.querySelector("tr");
|
||||
const thead = table.querySelector("thead");
|
||||
const firstRowHasTh = firstRow && Array.from(firstRow.children).some(cell => cell.tagName === "TH");
|
||||
if (!(thead || firstRowHasTh)) {
|
||||
headers = new Array(widths).fill("");
|
||||
bodyRows = normalizedRows;
|
||||
}
|
||||
|
||||
const separator = new Array(widths).fill("---");
|
||||
const lines = [
|
||||
`| ${headers.join(" | ")} |`,
|
||||
`| ${separator.join(" | ")} |`,
|
||||
];
|
||||
for (const row of bodyRows) {
|
||||
lines.push(`| ${row.join(" | ")} |`);
|
||||
}
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
function listToMarkdown(list, depth = 0) {
|
||||
const ordered = list.tagName.toLowerCase() === "ol";
|
||||
const items = [];
|
||||
const children = Array.from(list.children).filter(child => child.tagName === "LI");
|
||||
children.forEach((item, index) => {
|
||||
const marker = ordered ? `${index + 1}. ` : "- ";
|
||||
const indent = " ".repeat(depth);
|
||||
const nested = [];
|
||||
const content = [];
|
||||
|
||||
for (const child of item.childNodes) {
|
||||
if (child.nodeType === Node.ELEMENT_NODE && (child.tagName === "UL" || child.tagName === "OL")) {
|
||||
nested.push(listToMarkdown(child, depth + 1));
|
||||
} else {
|
||||
content.push(inlineText(child));
|
||||
}
|
||||
}
|
||||
|
||||
const line = collapseBlankLines(normalizeInline(content.join("")));
|
||||
if (line) items.push(`${indent}${marker}${line}`);
|
||||
nested.filter(Boolean).forEach(block => items.push(block));
|
||||
});
|
||||
return items.join("\n");
|
||||
}
|
||||
|
||||
function blockToMarkdown(node) {
|
||||
if (node.nodeType === Node.TEXT_NODE) {
|
||||
return normalizeText(node.textContent || "");
|
||||
}
|
||||
if (node.nodeType !== Node.ELEMENT_NODE) return "";
|
||||
|
||||
const tag = node.tagName.toLowerCase();
|
||||
if (tag === "script" || tag === "style" || tag === "noscript" || tag === "template") return "";
|
||||
if (tag === "table") return tableToMarkdown(node);
|
||||
if (tag === "ul" || tag === "ol") return listToMarkdown(node);
|
||||
if (tag === "pre") {
|
||||
const code = node.innerText.replace(/\n$/, "");
|
||||
return code ? `\`\`\`\n${code}\n\`\`\`` : "";
|
||||
}
|
||||
if (tag === "blockquote") {
|
||||
const content = collapseBlankLines(Array.from(node.childNodes).map(blockToMarkdown).join("\n\n"));
|
||||
return content
|
||||
.split("\n")
|
||||
.map(line => line ? `> ${line}` : ">")
|
||||
.join("\n");
|
||||
}
|
||||
if (/^h[1-6]$/.test(tag)) {
|
||||
const level = Number(tag.slice(1));
|
||||
const text = textBlock(node);
|
||||
return text ? `${"#".repeat(level)} ${text}` : "";
|
||||
}
|
||||
if (tag === "p" || tag === "figcaption") {
|
||||
return textBlock(node);
|
||||
}
|
||||
if (tag === "hr") {
|
||||
return "---";
|
||||
}
|
||||
if (tag === "img") {
|
||||
return inlineText(node);
|
||||
}
|
||||
|
||||
const childBlocks = Array.from(node.childNodes)
|
||||
.map(child => blockToMarkdown(child))
|
||||
.filter(Boolean);
|
||||
if (childBlocks.length) return collapseBlankLines(childBlocks.join("\n\n"));
|
||||
|
||||
return textBlock(node);
|
||||
}
|
||||
|
||||
const root = stripNoise(pickRoot());
|
||||
const markdown = blockToMarkdown(root);
|
||||
return collapseBlankLines(markdown);
|
||||
}
|
||||
|
||||
const fns = { domQuery, domClick, domType, domAttr, domText, domExists,
|
||||
extractLinks, extractImages, extractText, extractJson };
|
||||
extractLinks, extractImages, extractText, extractJson, extractMarkdown };
|
||||
const fn = fns[funcName];
|
||||
if (!fn) throw new Error(`Unknown content function: ${funcName}`);
|
||||
return fn(args);
|
||||
|
||||
Reference in New Issue
Block a user