Files
bigwo/tests/test_html_parser.py

146 lines
5.4 KiB
Python
Raw Permalink Normal View History

"""HtmlParser 单元测试"""
import pytest
from exceptions import ParseError
from parsers.html_parser import HtmlParser
@pytest.fixture
def parser():
    """Build the ``HtmlParser`` instance that the tests receive as ``parser``."""
    html_parser = HtmlParser()
    return html_parser
class TestSupportedExtensions:
    """Checks on the value returned by ``HtmlParser.supported_extensions()``."""

    def test_supports_html(self, parser):
        """``.html`` is among the supported extensions."""
        extensions = parser.supported_extensions()
        assert ".html" in extensions

    def test_supports_htm(self, parser):
        """``.htm`` is among the supported extensions."""
        extensions = parser.supported_extensions()
        assert ".htm" in extensions

    def test_only_two_extensions(self, parser):
        """Exactly two extensions are reported."""
        extension_count = len(parser.supported_extensions())
        assert extension_count == 2
class TestParse:
    """Behavioural tests for ``HtmlParser.parse``.

    Most tests write an HTML document into pytest's ``tmp_path`` directory,
    pass its path (as ``str``) to ``parse`` and inspect the returned text.
    The remaining tests pass a path that does not exist and inspect the
    raised ``ParseError``.
    """

    def test_parse_simple_html(self, parser, tmp_path):
        """Paragraph text of a minimal ``.html`` document appears in the output."""
        html_file = tmp_path / "test.html"
        html_file.write_text("<html><body><p>Hello, world!</p></body></html>", encoding="utf-8")
        result = parser.parse(str(html_file))
        assert "Hello, world!" in result

    def test_parse_htm_extension(self, parser, tmp_path):
        """A file with the ``.htm`` suffix is parsed like ``.html``."""
        html_file = tmp_path / "test.htm"
        html_file.write_text("<html><body><p>HTM file</p></body></html>", encoding="utf-8")
        result = parser.parse(str(html_file))
        assert "HTM file" in result

    def test_parse_empty_file(self, parser, tmp_path):
        """A zero-byte file parses to the empty string."""
        html_file = tmp_path / "empty.html"
        html_file.write_bytes(b"")
        assert parser.parse(str(html_file)) == ""

    def test_removes_script_tags(self, parser, tmp_path):
        """Neither the ``<script>`` element nor its body survives parsing."""
        html_file = tmp_path / "script.html"
        html = "<html><body><script>alert('xss');</script><p>Content</p></body></html>"
        html_file.write_text(html, encoding="utf-8")
        result = parser.parse(str(html_file))
        assert "alert" not in result
        # The previous form of this check was OR-ed with `"Content" in result`,
        # which is asserted unconditionally below, so it could never fail.
        # Check for leaked script markup directly instead.
        assert "<script" not in result.lower()
        assert "Content" in result

    def test_removes_style_tags(self, parser, tmp_path):
        """CSS inside ``<style>`` is dropped while body text is kept."""
        html_file = tmp_path / "style.html"
        html = "<html><head><style>body { color: red; }</style></head><body><p>Styled</p></body></html>"
        html_file.write_text(html, encoding="utf-8")
        result = parser.parse(str(html_file))
        assert "color: red" not in result
        assert "Styled" in result

    def test_converts_headings_to_markdown(self, parser, tmp_path):
        """``<h1>``/``<h2>`` are rendered as ``#``/``##`` Markdown headings."""
        html_file = tmp_path / "headings.html"
        html = "<html><body><h1>Title</h1><h2>Subtitle</h2><p>Text</p></body></html>"
        html_file.write_text(html, encoding="utf-8")
        result = parser.parse(str(html_file))
        assert "# Title" in result
        assert "## Subtitle" in result

    def test_converts_links_to_markdown(self, parser, tmp_path):
        """Both the link text and its ``href`` target are present in the output."""
        html_file = tmp_path / "links.html"
        html = '<html><body><a href="https://example.com">Example</a></body></html>'
        html_file.write_text(html, encoding="utf-8")
        result = parser.parse(str(html_file))
        assert "Example" in result
        assert "https://example.com" in result

    def test_converts_lists_to_markdown(self, parser, tmp_path):
        """Every ``<li>`` item's text is present in the output."""
        html_file = tmp_path / "lists.html"
        html = "<html><body><ul><li>Item 1</li><li>Item 2</li></ul></body></html>"
        html_file.write_text(html, encoding="utf-8")
        result = parser.parse(str(html_file))
        assert "Item 1" in result
        assert "Item 2" in result

    def test_meta_charset_detection(self, parser, tmp_path):
        """A UTF-8 file that declares ``<meta charset="utf-8">`` is decoded."""
        html_file = tmp_path / "charset.html"
        html = '<html><head><meta charset="utf-8"></head><body><p>UTF-8 content</p></body></html>'
        html_file.write_text(html, encoding="utf-8")
        result = parser.parse(str(html_file))
        assert "UTF-8 content" in result

    def test_gbk_encoded_html_with_meta_charset(self, parser, tmp_path):
        """GBK-encoded bytes declaring ``<meta charset="gbk">`` decode to the Chinese text."""
        html_file = tmp_path / "gbk.html"
        html = '<html><head><meta charset="gbk"></head><body><p>你好世界,这是中文内容测试</p></body></html>'
        # Written as raw bytes so the on-disk encoding really is GBK.
        html_file.write_bytes(html.encode("gbk"))
        result = parser.parse(str(html_file))
        assert "你好世界" in result

    def test_encoding_fallback_to_charset_normalizer(self, parser, tmp_path):
        """A file with no ``<meta charset>`` declaration is still decoded.

        NOTE(review): the test name implies the parser falls back to
        charset_normalizer here; that is not visible from this file - confirm
        against ``HtmlParser``.
        """
        html_file = tmp_path / "no_meta.html"
        html = "<html><body><p>Hello, this is a test with enough text for encoding detection to work properly.</p></body></html>"
        html_file.write_bytes(html.encode("utf-8"))
        result = parser.parse(str(html_file))
        assert "Hello" in result

    def test_nonexistent_file_raises(self, parser):
        """Parsing a missing path raises ``ParseError`` with a file name and a reason."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/nonexistent/path/file.html")
        # These checks must stay outside the ``with`` block: anything placed
        # after the raising call inside it would never execute.
        assert "file.html" in exc_info.value.file_name
        assert exc_info.value.reason != ""

    def test_parse_error_contains_filename(self, parser):
        """``ParseError.file_name`` is the base name of the path, not the full path."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/no/such/mypage.html")
        assert exc_info.value.file_name == "mypage.html"

    def test_complex_html_removes_all_tags(self, parser, tmp_path):
        """A full document keeps visible text and drops script, style and div markup."""
        html_file = tmp_path / "complex.html"
        html = """<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<style>.hidden { display: none; }</style>
<script>var x = 1;</script>
</head>
<body>
<div class="container">
<h1>Main Title</h1>
<p>Paragraph with <strong>bold</strong> and <em>italic</em> text.</p>
<script>console.log('inline script');</script>
<table>
<tr><th>Name</th><th>Value</th></tr>
<tr><td>A</td><td>1</td></tr>
</table>
</div>
</body>
</html>"""
        html_file.write_text(html, encoding="utf-8")
        result = parser.parse(str(html_file))
        assert "Main Title" in result
        # Case-insensitive so both plain "bold" and Markdown "**bold**" match;
        # a separate "**bold**" check would be implied by this one.
        assert "bold" in result.lower()
        assert "<script>" not in result
        assert "<style>" not in result
        assert "<div" not in result
        assert "console.log" not in result
        assert "var x" not in result