"""Security/robustness tests for the HTML→Markdown converter on hostile page content."""
from browser_cli.markdown.html import _MAX_TREE_DEPTH, convert_html_to_markdown
from browser_cli.markdown.render import render_markdown
def _identity(markdown):
return markdown
# ── depth-bounded recursion (Finding 4: HIGH/DoS) ─────────────────────────────────
def test_deeply_nested_html_does_not_crash():
    """Thousands of nested elements must not raise RecursionError."""
    depth = 5000
    # NOTE(review): the nesting element was reconstructed as <div>; the tags
    # were missing from the literal. Confirm against the intended fixture.
    html = "<div>" * depth + "deep content" + "</div>" * depth
    out = convert_html_to_markdown(html, _identity)
    assert "deep content" in out  # text preserved despite flattening
def test_deeply_nested_via_render_markdown_entrypoint():
    """Deep nesting passed to ``render_markdown`` must still yield the inner text."""
    depth = 3000
    # NOTE(review): the nesting element was reconstructed as <div>; the tags
    # were missing from the literal. Confirm against the intended fixture.
    html = "<div>" * depth + "x" + "</div>" * depth
    out = render_markdown(html)  # routes HTML through the converter
    assert "x" in out
def test_nesting_within_cap_is_preserved_structurally():
    """A shallow nested list must still be rendered as a Markdown list."""
    # A modest list nesting (well under the cap) still renders as a list.
    # NOTE(review): fixture reconstructed as a two-level <ul> from the
    # assertions below ("- a" and "b"); confirm against the intended markup.
    html = "<ul><li>a<ul><li>b</li></ul></li></ul>"
    out = convert_html_to_markdown(html, _identity)
    assert "- a" in out
    assert "b" in out
def test_max_tree_depth_is_sane():
    """``_MAX_TREE_DEPTH`` must lie in the inclusive range 50..400."""
    # Cap must be high enough for real documents, low enough to stay under the
    # interpreter recursion limit with a few frames per level.
    assert 50 <= _MAX_TREE_DEPTH <= 400, (
        f"_MAX_TREE_DEPTH={_MAX_TREE_DEPTH!r} is outside the expected 50..400 range"
    )
# ── unsafe URL schemes (Finding 5: LOW) ───────────────────────────────────────────
def test_javascript_url_in_link_is_neutralised():
    """A ``javascript:`` href must not appear in the output; the link text must."""
    # Anchors render their href only in inline context (inside a block like <p>).
    # NOTE(review): fixture markup reconstructed from the assertions and the
    # comment above; confirm the exact href payload against the intended input.
    out = convert_html_to_markdown(
        '<p><a href="javascript:alert(1)">click</a></p>', _identity
    )
    assert "javascript:" not in out
    assert "click" in out  # link text kept, dangerous href dropped
def test_data_and_vbscript_urls_dropped():
    """``vbscript:`` and ``data:`` URLs must not appear in the converted output."""
    # NOTE(review): both fixtures were reconstructed. The first kept only its
    # link text ("y"); the second kept no text at all, so it is assumed to be
    # an <img>. Confirm element types and payloads against the intended input.
    vbscript_out = convert_html_to_markdown(
        '<p><a href="vbscript:msgbox(1)">y</a></p>', _identity
    )
    assert "vbscript:" not in vbscript_out

    data_out = convert_html_to_markdown(
        '<p><img src="data:text/html;base64,PHNjcmlwdD4=" alt="i"></p>', _identity
    )
    assert "data:" not in data_out
def test_normal_urls_pass_through():
    """Ordinary ``https:`` link and image URLs must be kept in the output."""
    # NOTE(review): fixture markup reconstructed from the asserted URLs and the
    # surviving link text ("site"); confirm against the intended input.
    out = convert_html_to_markdown(
        '<p><a href="https://example.com">site</a></p>', _identity
    )
    assert "(https://example.com)" in out

    img = convert_html_to_markdown(
        '<p><img src="https://example.com/x.png" alt="x"></p>', _identity
    )
    assert "(https://example.com/x.png)" in img