"""Security/robustness tests for the HTML→Markdown converter on hostile page content."""

from browser_cli.markdown.html import _MAX_TREE_DEPTH, convert_html_to_markdown
from browser_cli.markdown.render import render_markdown

# NOTE(review): the HTML fixture literals in this module were reconstructed
# after their markup had been stripped from the file. Tag names and URL
# payloads are inferred from each test's assertions and comments — confirm
# them against version control before relying on exact fixture contents.


def _identity(markdown):
    """Return *markdown* unchanged.

    Handed to ``convert_html_to_markdown`` as its second argument so the
    tests see the converter's output without further processing.
    """
    return markdown


# ── depth-bounded recursion (Finding 4: HIGH/DoS) ─────────────────────────────────


def test_deeply_nested_html_does_not_crash():
    """Thousands of nested elements must not raise RecursionError."""
    depth = 5000
    # NOTE(review): <div> is the assumed wrapper element — confirm.
    html = "<div>" * depth + "deep content" + "</div>" * depth
    out = convert_html_to_markdown(html, _identity)
    assert "deep content" in out  # text preserved despite flattening


def test_deeply_nested_via_render_markdown_entrypoint():
    """The same deep-nesting input must survive the ``render_markdown`` entry point."""
    # NOTE(review): <div> is the assumed wrapper element — confirm.
    html = "<div>" * 3000 + "x" + "</div>" * 3000
    out = render_markdown(html)  # routes HTML through the converter
    assert "x" in out


def test_nesting_within_cap_is_preserved_structurally():
    """A shallow nested list is still emitted as a list, not flattened."""
    # A modest list nesting (well under the cap) still renders as a list.
    # NOTE(review): list markup reconstructed from the "- a" / "b" assertions.
    html = "<ul><li>a<ul><li>b</li></ul></li></ul>"
    out = convert_html_to_markdown(html, _identity)
    assert "- a" in out
    assert "b" in out


def test_max_tree_depth_is_sane():
    """The depth cap stays inside the range the recursion guard relies on."""
    # Cap must be high enough for real documents, low enough to stay under the
    # interpreter recursion limit with a few frames per level.
    assert 50 <= _MAX_TREE_DEPTH <= 400


# ── unsafe URL schemes (Finding 5: LOW) ───────────────────────────────────────────


def test_javascript_url_in_link_is_neutralised():
    """A ``javascript:`` href is dropped while the link text is kept."""
    # Anchors render their href only in inline context (inside a block like <p>).
    # NOTE(review): exact href payload is assumed — confirm.
    html = '<p><a href="javascript:alert(1)">click</a></p>'
    out = convert_html_to_markdown(html, _identity)
    assert "javascript:" not in out
    assert "click" in out  # link text kept, dangerous href dropped


def test_data_and_vbscript_urls_dropped():
    """``vbscript:`` and ``data:`` URLs never reach the Markdown output."""
    # NOTE(review): both payloads and the <img> carrier are assumed — confirm.
    vbscript_link = '<p><a href="vbscript:msgbox(1)">y</a></p>'
    data_image = '<img src="data:text/html;base64,AAAA" alt="z">'
    assert "vbscript:" not in convert_html_to_markdown(vbscript_link, _identity)
    assert "data:" not in convert_html_to_markdown(data_image, _identity)


def test_normal_urls_pass_through():
    """Ordinary ``https:`` URLs are emitted unchanged for links and images."""
    # NOTE(review): attribute layout of both fixtures is assumed — confirm.
    out = convert_html_to_markdown(
        '<p><a href="https://example.com">site</a></p>', _identity
    )
    assert "(https://example.com)" in out

    img = convert_html_to_markdown(
        '<img src="https://example.com/x.png" alt="pic">', _identity
    )
    assert "(https://example.com/x.png)" in img