import type { ContentArgs } from '../types';

/**
 * Converts the main content of the current document into Markdown.
 *
 * The root element is either the first match of `selector` or, when no
 * selector is given, the highest-scoring content container found by a
 * heuristic. The root is cloned, UI "noise" (scripts, form controls, hidden
 * nodes, ...) is removed from the clone, and the remaining tree is rendered
 * block by block.
 *
 * Note: children of a generic container are each rendered as their own
 * block, so inline markup placed directly inside e.g. a `div` (rather than a
 * `p`, heading, list item or table cell) is emitted as separate paragraphs of
 * plain text.
 *
 * @param args - Extraction arguments; only `selector` is read. When set, it
 *   is passed to `document.querySelector` to choose the root element.
 * @returns The Markdown rendering of the chosen root (may be empty).
 * @throws Error when `selector` is given but matches no element.
 */
export function extractMarkdown({ selector }: ContentArgs): string {
  /** Tags that force a line break after themselves when flattened inline. */
  const BLOCKS = new Set([
    "article", "aside", "blockquote", "body", "div", "dl", "fieldset", "figcaption", "figure",
    "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main",
    "nav", "ol", "p", "pre", "section", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "ul"
  ]);

  /** Tags that never contribute content (non-text resources and form controls). */
  const NOISE_TAGS = new Set([
    "script", "style", "noscript", "template", "svg", "canvas", "iframe", "dialog",
    "button", "input", "textarea", "select", "option", "form",
  ]);

  /**
   * Selector for everything removed from the cloned root before rendering.
   * The class / data-attribute entries presumably target one specific chat
   * UI — NOTE(review): confirm against the pages this runs on.
   */
  const NOISE_SELECTOR = [
    "script", "style", "noscript", "template", "svg", "canvas", "iframe", "dialog",
    "button", "input", "textarea", "select", "option", "form",
    "[hidden]", "[aria-hidden='true']", ".sr-only", "[class*='sr-only']",
    "[class*='file-tile']", "form[data-type='unified-composer']", ".composer-btn",
    "[data-composer-surface='true']", "#thread-bottom-container",
    "[data-testid*='action-button']",
  ].join(", ");

  /** Matches a `- item` line, capturing its leading whitespace and its text. */
  const DASH_ITEM = /^(\s*)-\s+(.*)$/;

  /** Collapses every whitespace run to one space and trims the ends. */
  function normalizeText(value: string) {
    return value.replace(/\s+/g, " ").trim();
  }

  /**
   * Tidies inline text while keeping line breaks: strips spaces around
   * newlines, limits blank-line runs to one, and squeezes repeated spaces.
   */
  function normalizeInline(value: string) {
    return value
      .replace(/[ \t]+\n/g, "\n")
      .replace(/\n[ \t]+/g, "\n")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/[ \t]{2,}/g, " ")
      .trim();
  }

  /** Removes trailing spaces on lines and limits consecutive blank lines to one. */
  function collapseBlankLines(value: string) {
    return value
      .replace(/[ \t]+\n/g, "\n")
      .replace(/\n{3,}/g, "\n\n")
      .trim();
  }

  /** Backslash-escapes backslashes, backticks and square brackets. */
  function escapeMarkdown(text: string) {
    return text.replace(/([\\`[\]])/g, "\\$1");
  }

  /** Makes text safe for a table cell: escapes pipes and joins lines with spaces. */
  function escapeTableCell(text: string) {
    return text.replace(/\|/g, "\\|").replace(/\n+/g, " ").trim();
  }

  /**
   * Returns the raw attribute value when it is non-empty, otherwise the
   * fallback (callers pass the element's resolved URL property), otherwise "".
   */
  function absoluteUrl(attr: string | null | undefined, fallback?: string) {
    return attr || fallback || "";
  }

  /** Length of the longest run of consecutive backticks in `text` (0 if none). */
  function longestBacktickRun(text: string): number {
    const runs: string[] = text.match(/`+/g) || [];
    let longest = 0;
    for (const run of runs) longest = Math.max(longest, run.length);
    return longest;
  }

  /**
   * Wraps `text` in an inline code span. Backslash escapes are literal inside
   * code spans, so the delimiter is made longer than any backtick run in the
   * content instead; a space pads content that starts or ends with a backtick.
   */
  function codeSpan(text: string): string {
    const delimiter = "`".repeat(longestBacktickRun(text) + 1);
    const pad = text.startsWith("`") || text.endsWith("`") ? " " : "";
    return `${delimiter}${pad}${text}${pad}${delimiter}`;
  }

  /**
   * Wraps `code` in a fenced block whose fence is at least three backticks
   * and longer than any backtick run in the code, so the code cannot close it.
   */
  function fenceCode(code: string): string {
    const fence = "`".repeat(Math.max(3, longestBacktickRun(code) + 1));
    return `${fence}\n${code}\n${fence}`;
  }

  /** True when `node` is an element that should be dropped from the output. */
  function isNoiseElement(node: Node | null): boolean {
    if (!node || node.nodeType !== Node.ELEMENT_NODE) return false;
    const el = node as Element;
    if (NOISE_TAGS.has(el.tagName.toLowerCase())) return true;
    if (el.hasAttribute("hidden")) return true;
    // Unlike the selector used by stripNoise, this comparison ignores case.
    if ((el.getAttribute("aria-hidden") || "").toLowerCase() === "true") return true;
    if (el.matches(".sr-only, [class*='sr-only']")) return true;
    if (el.matches("[class*='file-tile'], form[data-type='unified-composer'], .composer-btn, [data-composer-surface='true'], #thread-bottom-container")) return true;
    if (el.matches("[data-testid*='action-button']")) return true;
    return false;
  }

  /** Deep-clones `root` and removes every NOISE_SELECTOR match from the clone. */
  function stripNoise(root: Element): Element {
    const clone = root.cloneNode(true) as Element;
    clone.querySelectorAll(NOISE_SELECTOR).forEach(node => node.remove());
    return clone;
  }

  /**
   * Heuristic content score: text length plus bonuses for main/prose
   * containers and content elements, minus penalties for controls, forms and
   * graphics. Returns -Infinity when the element has no `innerText`.
   */
  function candidateScore(node: Element) {
    const text = normalizeText((node as HTMLElement).innerText || "");
    if (!text) return -Infinity;
    const headings = node.querySelectorAll("h1, h2, h3, h4, h5, h6").length;
    const paragraphs = node.querySelectorAll("p").length;
    const listItems = node.querySelectorAll("li").length;
    const tables = node.querySelectorAll("table").length;
    const codeBlocks = node.querySelectorAll("pre, code").length;
    const images = node.querySelectorAll("img, figure").length;
    const mainLike = node.matches("main, article, [role='main']") ? 1 : 0;
    const proseBlocks = node.matches(".markdown, .prose, [data-message-author-role='assistant']") ? 1 : 0;
    const buttons = node.querySelectorAll("button, input, textarea, select").length;
    const forms = node.querySelectorAll("form").length;
    const svgs = node.querySelectorAll("svg, canvas").length;
    return text.length
      + (mainLike * 4000)
      + (proseBlocks * 5000)
      + (headings * 250)
      + (paragraphs * 60)
      + (listItems * 35)
      + (tables * 80)
      + (codeBlocks * 60)
      + (images * 25)
      - (buttons * 120)
      - (forms * 200)
      - (svgs * 40);
  }

  /**
   * Chooses the element to convert: the `selector` match when a selector was
   * given, else the best-scoring candidate container, else `document.body`.
   *
   * @throws Error when `selector` is set but nothing matches it.
   */
  function pickRoot(): Element {
    if (selector) {
      const matched = document.querySelector(selector);
      if (!matched) throw new Error(`No element: ${selector}`);
      return matched;
    }
    const candidates = Array.from(document.querySelectorAll(
      "main, article, [role='main'], section, .markdown, .prose, [data-message-author-role]"
    ));
    // Score each candidate exactly once: candidateScore reads innerText and
    // runs many queries, which is wasteful inside a sort comparator.
    let best: Element | null = null;
    let bestScore = -Infinity;
    for (const candidate of candidates) {
      const score = candidateScore(candidate);
      if (score === -Infinity) continue; // no text at all
      // Strict comparison keeps the earliest candidate on ties.
      if (best === null || score > bestScore) {
        best = candidate;
        bestScore = score;
      }
    }
    return best === null ? document.body : best;
  }

  /**
   * Collects the unescaped text under `node` for use inside a code span:
   * noise elements are skipped and `br` becomes a newline.
   */
  function rawInlineText(node: Node): string {
    if (node.nodeType === Node.TEXT_NODE) return node.textContent || "";
    if (node.nodeType !== Node.ELEMENT_NODE) return "";
    if (isNoiseElement(node)) return "";
    if ((node as Element).tagName.toLowerCase() === "br") return "\n";
    return Array.from(node.childNodes).map(rawInlineText).join("");
  }

  /** Renders all children of `el` inline and normalizes the joined result. */
  function inlineChildren(el: Node): string {
    return normalizeInline(Array.from(el.childNodes).map(child => inlineText(child)).join(""));
  }

  /**
   * Renders `node` as inline Markdown: escaped text, images, links, code
   * spans, bold and italic. Any other element is flattened to its children,
   * with a newline appended after each child whose tag is in BLOCKS.
   */
  function inlineText(node: Node): string {
    if (node.nodeType === Node.TEXT_NODE) {
      return escapeMarkdown(node.textContent || "");
    }
    if (node.nodeType !== Node.ELEMENT_NODE) return "";
    if (isNoiseElement(node)) return "";

    const el = node as HTMLElement;
    const tag = el.tagName.toLowerCase();
    if (tag === "br") return "\n";

    if (tag === "img") {
      const img = el as HTMLImageElement;
      const src = absoluteUrl(img.getAttribute("src"), img.src);
      if (!src) return "";
      const alt = normalizeText(img.getAttribute("alt") || "");
      return alt ? `![${escapeMarkdown(alt)}](${src})` : `![](${src})`;
    }

    if (tag === "a") {
      const text = inlineChildren(el);
      const href = absoluteUrl(el.getAttribute("href"), (el as HTMLAnchorElement).href);
      if (!href) return text;
      return `[${text || href}](${href})`;
    }

    if (tag === "code") {
      // Code spans take their content literally, so use the raw text rather
      // than the escaped inline rendering.
      const text = normalizeInline(Array.from(el.childNodes).map(rawInlineText).join(""));
      return text ? codeSpan(text) : "";
    }

    if (tag === "strong" || tag === "b") {
      const text = inlineChildren(el);
      return text ? `**${text}**` : "";
    }

    if (tag === "em" || tag === "i") {
      const text = inlineChildren(el);
      return text ? `*${text}*` : "";
    }

    const chunks: string[] = [];
    for (const child of Array.from(el.childNodes)) {
      const rendered = inlineText(child);
      if (!rendered) continue;
      chunks.push(rendered);
      if (child.nodeType === Node.ELEMENT_NODE && BLOCKS.has((child as Element).tagName.toLowerCase())) {
        chunks.push("\n");
      }
    }
    return chunks.join("");
  }

  /** Renders the children of `node` as one inline text block. */
  function textBlock(node: Node): string {
    return collapseBlankLines(inlineChildren(node));
  }

  /**
   * Extracts text for code blocks without escaping or whitespace cleanup:
   * `br` becomes a newline and `div`, `p` and `li` end with a newline.
   */
  function preserveNodeText(node: Node): string {
    if (node.nodeType === Node.TEXT_NODE) {
      return node.textContent || "";
    }
    if (node.nodeType !== Node.ELEMENT_NODE) return "";

    const el = node as HTMLElement;
    const tag = el.tagName.toLowerCase();
    if (tag === "br") return "\n";

    const parts: string[] = [];
    for (const child of Array.from(el.childNodes)) {
      const rendered = preserveNodeText(child);
      if (!rendered) continue;
      parts.push(rendered);
    }

    if (["div", "p", "li"].includes(tag)) {
      return `${parts.join("")}\n`;
    }
    return parts.join("");
  }

  /**
   * Re-inserts line breaks into a box-drawing diagram that arrived as a
   * single line. Text that already has newlines, or fewer than two diagram
   * markers, is returned unchanged.
   */
  function repairFlattenedDiagram(text: string): string {
    if (text.includes("\n")) return text;
    const markerCount = (text.match(/[│▼├└]/g) || []).length;
    if (markerCount < 2) return text;

    let repaired = text;
    repaired = repaired.replace(/\s{2,}([│▼])/g, "\n $1");
    repaired = repaired.replace(/([│▼])\s{2,}/g, "$1\n");
    repaired = repaired.replace(/([│▼])(?=[^\s\n│▼├└])/g, "$1\n");
    repaired = repaired.replace(/(?<=[^\s\n])([├└])/g, "\n$1");
    repaired = repaired.replace(/([^\s\n])(\()/g, "$1\n$2");

    return repaired
      .split("\n")
      .map(line => line.replace(/\s+$/, ""))
      .filter(line => line.trim())
      .join("\n");
  }

  /**
   * Rewrites each run of consecutive `- item` lines sharing the same
   * indentation as tree branches: `├` for every item but the last, `└` for
   * the last. Other lines pass through untouched.
   */
  function convertDashListsToBranches(lines: string[]): string[] {
    const converted: string[] = [];
    let index = 0;

    while (index < lines.length) {
      const first = DASH_ITEM.exec(lines[index]);
      if (!first) {
        converted.push(lines[index]);
        index += 1;
        continue;
      }

      const indent = first[1];
      const items: string[] = [];
      while (index < lines.length) {
        const next = DASH_ITEM.exec(lines[index]);
        // The run ends at the first non-item or differently indented item.
        if (!next || next[1] !== indent) break;
        items.push(next[2]);
        index += 1;
      }

      items.forEach((item, itemIndex) => {
        const branch = itemIndex === items.length - 1 ? "└" : "├";
        converted.push(`${indent}${branch} ${item}`);
      });
    }

    return converted;
  }

  /**
   * Cleans code-block text: unifies line endings, strips trailing whitespace
   * and blank edge lines, repairs flattened diagrams, indents bare `│` / `▼`
   * lines, and converts dash lists into tree branches.
   */
  function normalizeCodeBlock(text: string): string {
    let lines = text.replace(/\r\n?/g, "\n").split("\n").map(line => line.replace(/\s+$/, ""));
    while (lines.length && !lines[0].trim()) lines.shift();
    while (lines.length && !lines[lines.length - 1].trim()) lines.pop();

    const flattened = repairFlattenedDiagram(lines.join("\n"));
    lines = flattened ? flattened.split("\n") : [];

    lines = lines.map(line => {
      const trimmed = line.trim();
      if ((trimmed === "│" || trimmed === "▼") && !/^\s+[│▼]\s*$/.test(line)) {
        return ` ${trimmed}`;
      }
      return line;
    });

    lines = convertDashListsToBranches(lines);
    return lines.join("\n");
  }

  /**
   * Renders `table` as a pipe table. Only rows owned by this table are used;
   * rows of nested tables stay inside their cell's text. Short rows are
   * padded. The first row is the header when the table has a `thead`, the
   * first row contains a `th`, or the first row is blank (then the second
   * row is used); otherwise an empty header row is emitted.
   */
  function tableToMarkdown(table: Element): string {
    // querySelectorAll also returns descendants of nested tables; keep only
    // the elements whose nearest table ancestor is this table.
    const belongsToTable = (el: Element) => el.closest("table") === table;
    const ownRows = Array.from(table.querySelectorAll("tr")).filter(belongsToTable);

    const rows = ownRows
      .map(row => Array.from(row.children)
        .filter(cell => cell.tagName === "TD" || cell.tagName === "TH")
        .map(cell => escapeTableCell(textBlock(cell)))
      )
      .filter(cells => cells.length > 0);

    if (!rows.length) return "";

    const width = rows.reduce((max, row) => Math.max(max, row.length), 0);
    const normalizedRows = rows.map(row => {
      const next = row.slice();
      while (next.length < width) next.push("");
      return next;
    });

    let headers = normalizedRows[0];
    let bodyRows = normalizedRows.slice(1);

    const firstRowIsBlank = headers.every(cell => !cell.trim());
    if (firstRowIsBlank && normalizedRows.length > 1) {
      headers = normalizedRows[1];
      bodyRows = normalizedRows.slice(2);
    }

    const firstRow = ownRows[0];
    const hasOwnThead = Array.from(table.querySelectorAll("thead")).some(belongsToTable);
    const firstRowHasTh = Array.from(firstRow.children).some(cell => cell.tagName === "TH");

    if (!(hasOwnThead || firstRowHasTh || firstRowIsBlank)) {
      headers = new Array<string>(width).fill("");
      bodyRows = normalizedRows;
    }

    const separator = new Array<string>(width).fill("---");
    const lines = [
      `| ${headers.join(" | ")} |`,
      `| ${separator.join(" | ")} |`,
    ];

    for (const row of bodyRows) {
      lines.push(`| ${row.join(" | ")} |`);
    }

    return lines.join("\n");
  }

  /**
   * Renders a `ul` / `ol` as a Markdown list.
   *
   * @param list - The list element; only its direct `li` children are items.
   * @param indent - Whitespace prefixed to every line of this list. Nested
   *   lists receive the parent item's content column (indent plus marker
   *   width), which Markdown requires for them to parse as children.
   */
  function listToMarkdown(list: Element, indent = ""): string {
    const ordered = list.tagName.toLowerCase() === "ol";
    const items: string[] = [];
    const children = Array.from(list.children).filter(child => child.tagName === "LI");

    children.forEach((item, index) => {
      const marker = ordered ? `${index + 1}. ` : "- ";
      const continuationIndent = `${indent}${" ".repeat(marker.length)}`;
      const nested: string[] = [];
      const content: string[] = [];

      for (const child of Array.from(item.childNodes)) {
        const childEl = child as Element;
        if (child.nodeType === Node.ELEMENT_NODE && (childEl.tagName === "UL" || childEl.tagName === "OL")) {
          nested.push(listToMarkdown(childEl, continuationIndent));
        } else {
          content.push(inlineText(child));
        }
      }

      const line = collapseBlankLines(normalizeInline(content.join("")));
      if (line) {
        const lineParts = line.split("\n");
        items.push(`${indent}${marker}${lineParts[0]}`);
        lineParts.slice(1).forEach(part => items.push(`${continuationIndent}${part}`));
      }

      nested.filter(Boolean).forEach(block => items.push(block));
    });

    return items.join("\n");
  }

  /**
   * Renders `node` as block-level Markdown. Tables, lists, code blocks,
   * blockquotes, headings, paragraphs, rules and images have dedicated
   * handling; any other element is rendered as its children's blocks joined
   * by blank lines.
   */
  function blockToMarkdown(node: Node): string {
    if (node.nodeType === Node.TEXT_NODE) {
      return normalizeText(node.textContent || "");
    }

    if (node.nodeType !== Node.ELEMENT_NODE) return "";
    if (isNoiseElement(node)) return "";

    const el = node as HTMLElement;
    const tag = el.tagName.toLowerCase();

    if (tag === "table") return tableToMarkdown(el);
    if (tag === "ul" || tag === "ol") return listToMarkdown(el);

    if (el.matches(".cm-editor[data-is-code-block-view='true']")) {
      // Editor-style code view: each `.cm-line` element is one source line.
      const lines = Array.from(el.querySelectorAll(".cm-line")).map(line => {
        const text = preserveNodeText(line);
        return text === "\n" ? "" : text.replace(/\n$/, "");
      });
      const code = normalizeCodeBlock(lines.join("\n"));
      return code ? fenceCode(code) : "";
    }

    if (tag === "pre") {
      const code = normalizeCodeBlock(preserveNodeText(el));
      return code ? fenceCode(code) : "";
    }

    if (tag === "blockquote") {
      const content = collapseBlankLines(Array.from(el.childNodes).map(child => blockToMarkdown(child)).join("\n\n"));
      // An empty quote would otherwise be emitted as a lone ">".
      if (!content) return "";
      return content
        .split("\n")
        .map(line => line ? `> ${line}` : ">")
        .join("\n");
    }

    if (/^h[1-6]$/.test(tag)) {
      const level = Number(tag.slice(1));
      // A "#" heading must fit on one line, so join any line breaks.
      const text = textBlock(el).replace(/\s*\n\s*/g, " ");
      return text ? `${"#".repeat(level)} ${text}` : "";
    }

    if (tag === "p" || tag === "figcaption") {
      return textBlock(el);
    }

    if (tag === "hr") {
      return "---";
    }

    if (tag === "img") {
      return inlineText(el);
    }

    const childBlocks = Array.from(el.childNodes)
      .map(child => blockToMarkdown(child))
      .filter(Boolean);

    if (childBlocks.length) return collapseBlankLines(childBlocks.join("\n\n"));
    return textBlock(node);
  }

  const root = stripNoise(pickRoot());
  const markdown = blockToMarkdown(root);
  return collapseBlankLines(markdown);
}