# browser-cli/browser_cli/commands/extract.py

import json

import click
from browser_cli.commands import client_from_ctx, handle_errors
# Re-exported for backward compatibility: the HTML→Markdown engine now lives in
# browser_cli.markdown and is applied by the SDK (ExtractNS.markdown).
from browser_cli.markdown import _clean_markdown_output, _convert_html_to_markdown  # noqa: F401
from rich.console import Console
from rich.table import Table
from rich.text import Text

# Module-level Rich console shared by the commands in this file for tables,
# status messages, and pretty-printed JSON.
console = Console()

# Parent click group named "extract". It has no behaviour of its own; the
# commands defined in this module attach to it via @extract_group.command(...).
@click.group("extract")
def extract_group() -> None:
    """Extract content from the active tab."""

@extract_group.command("links")
@handle_errors
def extract_links():
    """Extract all links from the active tab."""
    # Fetches the link records through the client's `extract.links()` call and
    # prints them as a two-column table (link text, URL). Each record is read
    # with `.get("text")` / `.get("href")`; missing or None values become "".
    links = client_from_ctx().extract.links()
    if not links:
        console.print("[yellow]No links found[/yellow]")
        return
    table = Table(show_header=True, header_style="bold cyan")
    table.add_column("Text", width=40)
    table.add_column("URL")
    for lnk in links:
        # Link text is capped at 60 characters before being placed in the cell.
        label = (lnk.get("text") or "")[:60]
        target = lnk.get("href") or ""
        # Page-supplied strings are wrapped in Text so Rich renders them
        # literally. Passed as plain str they would be parsed as console
        # markup: "[name]" in a URL would vanish and a stray "[/x]" would
        # raise MarkupError.
        table.add_row(Text(label), Text(target))
    console.print(table)

@extract_group.command("images")
@handle_errors
def extract_images():
    """Extract all images from the active tab."""
    # Fetches the image records through the client's `extract.images()` call
    # and prints them as a two-column table (alt text, source URL). Each record
    # is read with `.get("alt")` / `.get("src")`; missing or None values
    # become "".
    images = client_from_ctx().extract.images()
    if not images:
        console.print("[yellow]No images found[/yellow]")
        return
    table = Table(show_header=True, header_style="bold cyan")
    table.add_column("Alt", width=30)
    table.add_column("Src")
    for img in images:
        # Alt text is capped at 40 characters before being placed in the cell.
        alt_text = (img.get("alt") or "")[:40]
        source = img.get("src") or ""
        # Page-supplied strings are wrapped in Text so Rich renders them
        # literally. Passed as plain str they would be parsed as console
        # markup: "[tag]"-like fragments would vanish and a stray "[/x]"
        # would raise MarkupError.
        table.add_row(Text(alt_text), Text(source))
    console.print(table)

@extract_group.command("text")
@handle_errors
def extract_text():
    """Extract all visible text from the active tab."""
    page_text = client_from_ctx().extract.text()
    # The text comes from an arbitrary web page, so print it verbatim:
    # markup=False stops "[...]" sequences being parsed as Rich style tags
    # (which would drop them or raise MarkupError on an unmatched closing tag),
    # and emoji=False stops ":name:" sequences being replaced with emoji.
    console.print(page_text, markup=False, emoji=False)

@extract_group.command("json")
@click.argument("selector")
@handle_errors
def extract_json(selector):
    """Parse and pretty-print JSON content inside SELECTOR."""
    # Ask the client for the data found under SELECTOR, serialise it with
    # json.dumps, and hand the resulting string to Rich for pretty-printing.
    client = client_from_ctx()
    serialized = json.dumps(client.extract.json(selector))
    console.print_json(serialized)

@extract_group.command("html")
@handle_errors
def extract_html():
    """Print the full HTML of the active tab to stdout."""
    # Written with click.echo rather than the Rich console, so the HTML is
    # emitted as-is without Rich styling applied.
    page_html = client_from_ctx().extract.html()
    click.echo(page_html)

@extract_group.command("markdown")
@click.option("--selector", help="Extract only the DOM subtree matching this CSS selector.")
@handle_errors
def extract_markdown(selector):
    """Extract the page's main content as Markdown."""
    # A falsy result (None or empty string) is printed as an empty string.
    rendered = client_from_ctx().extract.markdown(selector) or ""
    # Let click append the trailing newline only when the Markdown does not
    # already end with one, so the output never gains a doubled newline.
    needs_newline = not rendered.endswith("\n")
    click.echo(rendered, nl=needs_newline)