# browser-cli/browser_cli/commands/extract.py

import json
import re
from html.parser import HTMLParser

import click
from browser_cli.commands import _handle
from rich.console import Console
from rich.table import Table

# Shared Rich console used by the commands that print tables, text and JSON.
console = Console()
# A complete fenced code block: opening ``` with an optional info string (no
# newline or backtick), a newline, the lazily matched body, a newline, and the
# closing ```. DOTALL lets the body span several lines.
_FENCE_RE = re.compile(r"```(?:[^\n`]*)\n.*?\n```", re.DOTALL)
# A backslash-escaped underscore or hyphen; group 1 is the bare character.
_ESCAPED_MARKDOWN_RE = re.compile(r"\\([_-])")
# A Markdown table separator row such as "| --- | :---: |": one or more
# pipe-terminated cells of at least three dashes with optional alignment colons.
_TABLE_SEPARATOR_RE = re.compile(r"^\|(?:\s*:?-{3,}:?\s*\|)+\s*$")


class _HtmlNode:
    """Minimal tree node used by the HTML-to-Markdown converter.

    Element nodes are created with ``tag`` (and optionally ``attrs``); text
    nodes are created with ``text`` only. ``children`` always starts empty.
    """

    def __init__(self, tag=None, attrs=None, text=None):
        # Every instance gets its own child list.
        self.children = []
        self.text = text
        # A falsy ``attrs`` (None or an empty mapping) becomes a fresh dict.
        self.attrs = attrs if attrs else {}
        self.tag = tag


class _HtmlTreeBuilder(HTMLParser):
    """Build a lightweight ``_HtmlNode`` tree from an HTML string.

    The tree hangs off ``self.root`` (a synthetic ``document`` node).
    ``self._stack`` holds the chain of currently open elements; new nodes are
    appended to the element on top of the stack.
    """

    # HTML void elements: they never have content or an end tag, so they must
    # not be pushed onto the open-element stack. Pushing one (e.g. ``<input>``
    # or ``<col>``) would make every following sibling its child.
    _VOID_TAGS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "param",
            "source",
            "track",
            "wbr",
        }
    )
    # Elements whose raw text is code/CSS rather than page content; their text
    # is dropped so it does not leak into the Markdown output.
    _RAW_TEXT_TAGS = frozenset({"script", "style"})

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.root = _HtmlNode(tag="document")
        self._stack = [self.root]

    def handle_starttag(self, tag, attrs):
        """Attach a new element; keep it open unless it is a void element."""
        node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
        self._stack[-1].children.append(node)
        if node.tag not in self._VOID_TAGS:
            self._stack.append(node)

    def handle_startendtag(self, tag, attrs):
        """Attach a self-closing element (``<x/>``) without opening it."""
        node = _HtmlNode(tag=tag.lower(), attrs=dict(attrs))
        self._stack[-1].children.append(node)

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*, if any.

        Everything opened after that element is closed with it, which
        tolerates missing end tags. The root (index 0) is never popped, and
        an end tag with no matching open element is ignored.
        """
        lowered = tag.lower()
        for index in range(len(self._stack) - 1, 0, -1):
            if self._stack[index].tag == lowered:
                del self._stack[index:]
                break

    def handle_data(self, data):
        """Attach a text node to the current element, skipping script/style."""
        if data and self._stack[-1].tag not in self._RAW_TEXT_TAGS:
            self._stack[-1].children.append(_HtmlNode(text=data))


def _normalize_text(value):
    """Collapse every whitespace run in *value* to one space and trim the ends.

    A falsy *value* (such as ``None``) is treated as the empty string.
    """
    source = value if value else ""
    collapsed = re.sub(r"\s+", " ", source)
    return collapsed.strip()


def _normalize_inline(value):
    """Tidy inline text while keeping its line breaks.

    Non-breaking spaces become plain spaces, runs of horizontal whitespace
    collapse to one space, spaces around newlines are removed, and the result
    is stripped.
    """
    text = value.replace("\xa0", " ")
    rewrites = (
        (r"[ \t\r\f\v]+", " "),
        (r" *\n *", "\n"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def _collapse_blank_lines(value):
    """Drop trailing spaces/tabs on each line and cap blank-line runs at one.

    Three or more consecutive newlines become exactly two; the result is
    stripped of leading and trailing whitespace.
    """
    without_trailing = re.sub(r"[ \t]+\n", "\n", value)
    squeezed = re.sub(r"\n{3,}", "\n\n", without_trailing)
    return squeezed.strip()


def _escape_markdown(text):
    """Backslash-escape backslashes, backticks and square brackets in *text*."""
    special = re.compile(r"([\\`[\]])")
    return special.sub(r"\\\1", text)


def _escape_table_cell(text):
    """Make *text* safe for a single Markdown table cell.

    Pipes are backslash-escaped, newlines become spaces, and the result is
    stripped.
    """
    escaped = text.replace("|", r"\|")
    single_line = escaped.replace("\n", " ")
    return single_line.strip()


def _iter_descendants(node):
    """Yield every descendant of *node* in document (pre-order) order.

    Objects without a ``children`` attribute are treated as having none.
    """
    children = getattr(node, "children", [])
    for child in children:
        yield child
        for deeper in _iter_descendants(child):
            yield deeper


def _has_class(node, class_name):
    """Return True when *class_name* is one of the node's ``class`` tokens.

    A missing or empty ``class`` attribute yields False.
    """
    raw = node.attrs.get("class")
    tokens = raw.split() if raw else []
    return class_name in tokens


def _is_code_block_node(node):
    """Tell whether *node* should be rendered as a fenced code block.

    True for any element flagged ``data-is-code-block-view="true"`` and for
    ``<pre>``; False for ``None``, text nodes and every other element.
    """
    if node and node.tag:
        flagged = node.attrs.get("data-is-code-block-view") == "true"
        return flagged or node.tag == "pre"
    return False


def _inline_text(node):
    """Render *node* and its descendants as inline Markdown.

    Text nodes are escaped with ``_escape_markdown``. ``<br>`` becomes a
    newline, ``<img>`` an image reference (dropped when it has no ``src``),
    ``<a>`` a link, ``<code>`` a code span, ``<strong>/<b>`` bold and
    ``<em>/<i>`` italic. Any other element is rendered as the concatenation
    of its children, with a newline after each block-like child so adjacent
    blocks do not run together.

    Returns:
        The inline Markdown string, or ``""`` when nothing is rendered.
    """
    if node.text is not None:
        return _escape_markdown(node.text)
    if not node.tag:
        return ""

    def rendered_children():
        # Shared by the link and emphasis branches.
        return _normalize_inline("".join(_inline_text(child) for child in node.children))

    tag = node.tag
    if tag == "br":
        return "\n"
    if tag == "img":
        src = node.attrs.get("src") or ""
        alt = _normalize_text(node.attrs.get("alt") or "")
        if not src:
            return ""
        return f"![{_escape_markdown(alt)}]({src})" if alt else f"![]({src})"
    if tag == "a":
        text = rendered_children()
        href = node.attrs.get("href") or ""
        return f"[{text or href}]({href})" if href else text
    if tag == "code":
        # Backslash escapes are not processed inside Markdown code spans, so
        # the content is taken verbatim (no _escape_markdown). Backticks in
        # the content are handled by using a delimiter one backtick longer
        # than the longest run inside it.
        raw = _normalize_inline(_inner_text_preserve(node))
        if not raw:
            return ""
        longest_run = max((len(run) for run in re.findall(r"`+", raw)), default=0)
        delimiter = "`" * (longest_run + 1)
        # A span that begins or ends with a backtick needs a padding space so
        # the content is not read as part of the delimiter.
        pad = " " if raw.startswith("`") or raw.endswith("`") else ""
        return f"{delimiter}{pad}{raw}{pad}{delimiter}"
    if tag in {"strong", "b"}:
        text = rendered_children()
        return f"**{text}**" if text else ""
    if tag in {"em", "i"}:
        text = rendered_children()
        return f"*{text}*" if text else ""

    chunks = []
    for child in node.children:
        rendered = _inline_text(child)
        if rendered:
            chunks.append(rendered)
            if child.tag in {"p", "div", "table", "ul", "ol", "pre"}:
                chunks.append("\n")
    return "".join(chunks)


def _text_block(node):
    """Render the children of *node* as one normalized block of inline Markdown."""
    rendered = [_inline_text(child) for child in node.children]
    inline = _normalize_inline("".join(rendered))
    return _collapse_blank_lines(inline)


def _inner_text_preserve(node):
    """Return the raw text under *node* with no escaping or whitespace cleanup.

    Text nodes return their text unchanged. ``<br>`` elements and nodes with
    no tag contribute nothing. Every other element returns the concatenation
    of its children's raw text.
    """
    literal = node.text
    if literal is not None:
        return literal
    if not node.tag or node.tag == "br":
        return ""
    pieces = [_inner_text_preserve(child) for child in node.children]
    return "".join(pieces)


def _table_to_markdown(node):
    """Render a table element as a Markdown pipe table.

    Every descendant ``<tr>`` with at least one ``<td>``/``<th>`` child
    becomes a row, and short rows are padded with empty cells. The first row
    is the header when the table has a direct ``<thead>`` child, when the
    first ``<tr>`` contains a ``<th>``, or when the first row is blank. In
    the blank case the second row is promoted to header if one exists.
    Otherwise an empty header row is emitted and all rows form the body.

    Returns:
        The Markdown table, or ``""`` when no row has any cells.
    """
    table_rows = []
    for candidate in _iter_descendants(node):
        if candidate.tag == "tr":
            cells = [
                _escape_table_cell(_text_block(cell))
                for cell in candidate.children
                if cell.tag in {"td", "th"}
            ]
            if cells:
                table_rows.append(cells)
    if not table_rows:
        return ""

    column_count = max(map(len, table_rows))
    padded = [cells + [""] * (column_count - len(cells)) for cells in table_rows]

    leading_blank = all(not cell.strip() for cell in padded[0])
    has_thead = any(child.tag == "thead" for child in node.children)
    # The first <tr> in the tree, which may differ from padded[0] when that
    # <tr> had no cells and was skipped above.
    first_tr = next((item for item in _iter_descendants(node) if item.tag == "tr"), None)
    leading_has_th = bool(first_tr and any(cell.tag == "th" for cell in first_tr.children))

    if has_thead or leading_has_th or leading_blank:
        header_at = 1 if leading_blank and len(padded) > 1 else 0
        header_cells = padded[header_at]
        body = padded[header_at + 1:]
    else:
        header_cells = [""] * column_count
        body = padded

    def render(cells):
        return f"| {' | '.join(cells)} |"

    rendered = [render(header_cells), render(["---"] * column_count)]
    rendered.extend(render(cells) for cells in body)
    return "\n".join(rendered)


def _list_to_markdown(node, depth=0):
    """Render a ``<ul>``/``<ol>`` element as a Markdown list.

    Args:
        node: The list element; ``<ol>`` produces ``1.``-style markers and
            anything else ``-`` bullets.
        depth: Nesting level; each level indents by two spaces.

    Only ``<li>`` children are rendered. Nested lists inside an item are
    rendered recursively and placed after the item's own text. Extra lines of
    an item's text are indented to line up under the text of its first line.
    """
    is_ordered = node.tag == "ol"
    indent = "  " * depth
    output = []
    list_items = [child for child in node.children if child.tag == "li"]
    for position, item in enumerate(list_items, start=1):
        marker = f"{position}. " if is_ordered else "- "
        inline_parts = []
        sublists = []
        for part in item.children:
            if part.tag in {"ul", "ol"}:
                sublists.append(_list_to_markdown(part, depth + 1))
            else:
                inline_parts.append(_inline_text(part))
        text = _collapse_blank_lines(_normalize_inline("".join(inline_parts)))
        if text:
            first, *rest = text.splitlines()
            output.append(f"{indent}{marker}{first}")
            hanging = indent + " " * len(marker)
            output.extend(hanging + extra for extra in rest)
        output.extend(sub for sub in sublists if sub)
    return "\n".join(output)


def _code_block_to_markdown(node):
    """Render a code-block element as a fenced Markdown block.

    ``<pre>`` uses its raw inner text. Any other element is treated as an
    editor view: each descendant element carrying the ``cm-line`` class
    supplies one line. Trailing newlines are trimmed, and an empty result
    yields ``""``.
    """
    if node.tag == "pre":
        body = _inner_text_preserve(node)
    else:
        body = "\n".join(
            _inner_text_preserve(item)
            for item in _iter_descendants(node)
            if item.tag and _has_class(item, "cm-line")
        )
    body = body.rstrip("\n")
    if not body:
        return ""
    return f"```\n{body}\n```"


def _block_to_markdown(node):
    """Render *node* as block-level Markdown.

    Dispatch order: text node, code block, table, list, heading, paragraph or
    figcaption, blockquote, horizontal rule, image. Any other element is
    rendered as its non-empty child blocks separated by blank lines. If no
    child yields a block, it falls back to the element's inline text.
    """
    if node.text is not None:
        return _normalize_text(node.text)
    tag = node.tag
    if not tag:
        return ""
    # Checked before the tag-specific branches because the code-block flag
    # can sit on any element.
    if _is_code_block_node(node):
        return _code_block_to_markdown(node)
    if tag == "table":
        return _table_to_markdown(node)
    if tag in {"ul", "ol"}:
        return _list_to_markdown(node)
    if re.fullmatch(r"h[1-6]", tag):
        title = _text_block(node)
        if not title:
            return ""
        level = int(tag[1])
        return f"{'#' * level} {title}"
    if tag in {"p", "figcaption"}:
        return _text_block(node)
    if tag == "blockquote":
        rendered = [_block_to_markdown(child) for child in node.children]
        quoted = _collapse_blank_lines("\n\n".join(part for part in rendered if part))
        if not quoted:
            return ""
        return "\n".join(f"> {line}" if line else ">" for line in quoted.splitlines())
    if tag == "hr":
        return "---"
    if tag == "img":
        return _inline_text(node)

    blocks = []
    for child in node.children:
        rendered_child = _block_to_markdown(child)
        if rendered_child:
            blocks.append(rendered_child)
    if not blocks:
        return _text_block(node)
    return _collapse_blank_lines("\n\n".join(blocks))


def _parse_table_row(line):
    """Split a Markdown table row into stripped cell strings.

    Returns ``None`` unless the stripped line both starts and ends with a
    pipe. The split is on every ``|`` character.
    """
    candidate = line.strip()
    is_row = candidate.startswith("|") and candidate.endswith("|")
    if not is_row:
        return None
    cells = candidate.strip("|").split("|")
    return [cell.strip() for cell in cells]


def _repair_table_headers(lines):
    """Promote the first data row of a table whose header row is blank.

    When a row of empty cells is followed by a separator row and a row with
    at least one non-empty cell, the blank row is dropped and the data row
    is emitted above the separator. Every emitted line is stripped.
    """
    output = []
    total = len(lines)
    cursor = 0
    while cursor < total:
        if cursor + 2 < total:
            header_cells = _parse_table_row(lines[cursor])
            data_cells = _parse_table_row(lines[cursor + 2])
            separator = _TABLE_SEPARATOR_RE.match(lines[cursor + 1].strip())
            if (
                header_cells is not None
                and separator
                and data_cells is not None
                and header_cells
                and all(not cell for cell in header_cells)
                and any(cell for cell in data_cells)
            ):
                # Three input lines collapse to two: data row, then separator.
                output.append(lines[cursor + 2].strip())
                output.append(lines[cursor + 1].strip())
                cursor += 3
                continue
        output.append(lines[cursor].strip())
        cursor += 1
    return output


def _repair_list_continuations(lines):
    """Re-indent the continuation lines that follow a list item.

    Every line is stripped. A non-blank line that directly follows a list
    item (or another continuation of it) and does not start a new block is
    indented to line up under the item's text. A blank line, fence, heading,
    blockquote, table row or new list item ends the continuation run.

    Args:
        lines: Markdown lines; leading/trailing whitespace is ignored.

    Returns:
        A new list of lines with continuation lines indented.
    """
    repaired = []
    # Indent for continuation lines of the current list item, or None when
    # the previous line was not part of a list item.
    continuation_indent = None

    for line in lines:
        stripped = line.strip()
        list_match = re.match(r"^([-*+]|\d+\.)\s+.+$", stripped)
        # Table separator rows need no separate check: they always begin
        # with "|" and are covered by the startswith test.
        starts_block = (
            not stripped
            or stripped.startswith(("```", "#", ">", "|"))
            or re.match(r"^([-*+]|\d+\.)\s+", stripped) is not None
        )

        if continuation_indent is not None and not starts_block:
            # Keep the indent active so multi-line items stay aligned on
            # every continuation line, not just the first one.
            repaired.append(f"{continuation_indent}{stripped}")
            continue

        repaired.append(stripped)
        if list_match:
            # Marker width plus one space, e.g. "- " -> 2, "10. " -> 4.
            continuation_indent = " " * (len(list_match.group(1)) + 1)
        else:
            continuation_indent = None

    return repaired


def _repair_flattened_diagram(text):
    """Re-insert line breaks into a box-drawing diagram collapsed onto one line.

    Text that already contains a newline, or that has fewer than two of the
    characters ``│ ▼ ├ └``, is returned unchanged. Otherwise a fixed sequence
    of rewrites is applied and blank lines are dropped.
    """
    glyph_count = sum(text.count(glyph) for glyph in "│▼├└")
    if "\n" in text or glyph_count < 2:
        return text

    # Order matters: each rewrite runs on the output of the previous one.
    rewrites = (
        (r"\s{2,}([│▼])", r"\n   \1"),
        (r"([│▼])\s{2,}", r"\1\n"),
        (r"([│▼])(?=[^\s\n│▼├└])", r"\1\n"),
        (r"(?<=[^\s\n])([├└])", r"\n\1"),
        (r"([^\s\n])(\()", r"\1\n\2"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    kept = [row.rstrip() for row in text.splitlines() if row.strip()]
    return "\n".join(kept)


def _convert_dash_lists_to_branches(lines):
    """Rewrite runs of ``- item`` lines as tree branches.

    Consecutive dash items sharing exactly the same leading whitespace form
    one run. Every item in a run is prefixed with ``├`` except the last,
    which gets ``└``. Lines that are not dash items pass through unchanged.
    """
    output = []
    total = len(lines)
    cursor = 0
    while cursor < total:
        opener = re.match(r"^(\s*)-\s+(.*)$", lines[cursor])
        if opener is None:
            output.append(lines[cursor])
            cursor += 1
            continue

        indent = opener.group(1)
        sibling_re = re.compile(rf"^{re.escape(indent)}-\s+(.*)$")
        labels = []
        # The opening line itself matches sibling_re, so the run always has
        # at least one entry and the cursor always advances.
        while cursor < total:
            sibling = sibling_re.match(lines[cursor])
            if sibling is None:
                break
            labels.append(sibling.group(1))
            cursor += 1

        last = len(labels) - 1
        output.extend(
            f"{indent}{'└' if position == last else '├'} {label}"
            for position, label in enumerate(labels)
        )
    return output


def _clean_code_block(code):
    """Tidy the body of a fenced code block.

    Steps, in order: right-strip each line, drop blank lines at both ends,
    repair a diagram flattened onto one line, give bare ``│``/``▼`` lines a
    three-space indent, and convert dash lists into tree branches.
    """
    trimmed = [raw.rstrip() for raw in code.splitlines()]
    start, end = 0, len(trimmed)
    while start < end and not trimmed[start].strip():
        start += 1
    while end > start and not trimmed[end - 1].strip():
        end -= 1
    trimmed = trimmed[start:end]

    repaired = _repair_flattened_diagram("\n".join(trimmed))
    rows = repaired.splitlines() if repaired else []

    aligned = []
    for row in rows:
        bare = row.strip()
        # Only connector lines with no leading whitespace get the indent;
        # already-indented ones keep their spacing.
        if bare in {"│", "▼"} and not re.match(r"^\s+[│▼]\s*$", row):
            aligned.append(f"   {bare}")
        else:
            aligned.append(row)

    return "\n".join(_convert_dash_lists_to_branches(aligned))


def _clean_prose_segment(prose):
    """Normalize one run of Markdown that lies outside fenced code blocks.

    Unescapes ``\\_`` and ``\\-``, strips every line, repairs blank table
    headers and list continuation lines, then collapses blank-line runs.
    Returns ``""`` when nothing but whitespace remains.
    """
    unescaped = _ESCAPED_MARKDOWN_RE.sub(r"\1", prose)
    lines = [line.strip() for line in unescaped.splitlines()]
    lines = _repair_table_headers(lines)
    lines = _repair_list_continuations(lines)
    return _collapse_blank_lines("\n".join(lines))


def _clean_markdown_output(markdown):
    """Clean a Markdown document, treating prose and fenced code separately.

    The text is split on fenced code blocks. Prose between fences goes
    through ``_clean_prose_segment``; each fence keeps its opening line and
    has its body cleaned by ``_clean_code_block``. Non-empty pieces are
    joined with a blank line.

    Args:
        markdown: The Markdown text; a falsy value yields ``""``.

    Returns:
        The cleaned Markdown with no leading or trailing blank lines.
    """
    if not markdown:
        return ""

    pieces = []
    last_index = 0
    for match in _FENCE_RE.finditer(markdown):
        prose = markdown[last_index:match.start()]
        if prose:
            pieces.append(_clean_prose_segment(prose))

        fence = match.group(0)
        # header is the opening fence line; body is everything up to the
        # newline before the closing fence.
        header, _, tail = fence.partition("\n")
        body, _, _ = tail.rpartition("\n")
        cleaned_body = _clean_code_block(body)
        pieces.append(f"{header}\n{cleaned_body}\n```" if cleaned_body else f"{header}\n```")
        last_index = match.end()

    trailing = markdown[last_index:]
    if trailing:
        pieces.append(_clean_prose_segment(trailing))

    # Prose segments that cleaned down to nothing are dropped here.
    return "\n\n".join(piece for piece in pieces if piece)


def _convert_html_to_markdown(html):
    """Convert an HTML string to cleaned Markdown.

    Args:
        html: The HTML source; a falsy value is treated as empty input.

    Returns:
        The Markdown produced by rendering the parsed tree and running it
        through ``_clean_markdown_output``.
    """
    parser = _HtmlTreeBuilder()
    parser.feed(html or "")
    # feed() may hold back trailing data it cannot classify yet (for example
    # text ending in an unterminated "&..."); close() flushes that buffer so
    # the tail of the document is not lost.
    parser.close()
    markdown = _block_to_markdown(parser.root)
    return _clean_markdown_output(markdown)


# Parent command group: the subcommands below attach themselves to it through
# @extract_group.command(...).
@click.group("extract")
def extract_group():
    """Extract content from the active tab."""


@extract_group.command("links")
def extract_links():
    """Extract all links from the active tab."""
    found = _handle("extract.links")
    if found:
        table = Table(show_header=True, header_style="bold cyan")
        table.add_column("Text", width=40)
        table.add_column("URL")
        for entry in found:
            # Link text is capped at 60 characters; missing fields render empty.
            label = entry.get("text") or ""
            target = entry.get("href") or ""
            table.add_row(label[:60], target)
        console.print(table)
    else:
        console.print("[yellow]No links found[/yellow]")


@extract_group.command("images")
def extract_images():
    """Extract all images from the active tab."""
    found = _handle("extract.images")
    if found:
        table = Table(show_header=True, header_style="bold cyan")
        table.add_column("Alt", width=30)
        table.add_column("Src")
        for entry in found:
            # Alt text is capped at 40 characters; missing fields render empty.
            alt_text = entry.get("alt") or ""
            source = entry.get("src") or ""
            table.add_row(alt_text[:40], source)
        console.print(table)
    else:
        console.print("[yellow]No images found[/yellow]")


@extract_group.command("text")
def extract_text():
    """Extract all visible text from the active tab."""
    content = _handle("extract.text")
    # A falsy result is printed as an empty line.
    console.print(content if content else "")


@extract_group.command("json")
@click.argument("selector")
def extract_json(selector):
    """Parse and pretty-print JSON content inside SELECTOR."""
    request = {"selector": selector}
    # The handler result is serialized and handed to Rich for pretty-printing.
    serialized = json.dumps(_handle("extract.json", request))
    console.print_json(serialized)


@extract_group.command("html")
def extract_html():
    """Print the full HTML of the active tab to stdout."""
    page_source = _handle("extract.html")
    # click.echo rather than the Rich console is used for this output.
    click.echo(page_source if page_source else "")


@extract_group.command("markdown")
@click.option("--selector", help="Extract only the DOM subtree matching this CSS selector.")
def extract_markdown(selector):
    """Extract the page's main content as Markdown."""
    raw = _handle("extract.markdown", {"selector": selector}) or ""
    # A payload whose first non-blank character is "<" is treated as HTML and
    # converted; anything else is assumed to be Markdown and only cleaned.
    if raw.lstrip().startswith("<"):
        rendered = _convert_html_to_markdown(raw)
    else:
        rendered = _clean_markdown_output(raw)
    rendered = rendered or ""
    # Add a newline only when the text does not already end with one.
    click.echo(rendered, nl=not rendered.endswith("\n"))