"""HtmlParser 单元测试""" import pytest from exceptions import ParseError from parsers.html_parser import HtmlParser @pytest.fixture def parser(): return HtmlParser() class TestSupportedExtensions: def test_supports_html(self, parser): assert ".html" in parser.supported_extensions() def test_supports_htm(self, parser): assert ".htm" in parser.supported_extensions() def test_only_two_extensions(self, parser): assert len(parser.supported_extensions()) == 2 class TestParse: def test_parse_simple_html(self, parser, tmp_path): f = tmp_path / "test.html" f.write_text("
Hello, world!
", encoding="utf-8") result = parser.parse(str(f)) assert "Hello, world!" in result def test_parse_htm_extension(self, parser, tmp_path): f = tmp_path / "test.htm" f.write_text("HTM file
", encoding="utf-8") result = parser.parse(str(f)) assert "HTM file" in result def test_parse_empty_file(self, parser, tmp_path): f = tmp_path / "empty.html" f.write_bytes(b"") assert parser.parse(str(f)) == "" def test_removes_script_tags(self, parser, tmp_path): f = tmp_path / "script.html" html = "Content
" f.write_text(html, encoding="utf-8") result = parser.parse(str(f)) assert "alert" not in result assert "script" not in result.lower() or "Content" in result assert "Content" in result def test_removes_style_tags(self, parser, tmp_path): f = tmp_path / "style.html" html = "Styled
" f.write_text(html, encoding="utf-8") result = parser.parse(str(f)) assert "color: red" not in result assert "Styled" in result def test_converts_headings_to_markdown(self, parser, tmp_path): f = tmp_path / "headings.html" html = "Text
" f.write_text(html, encoding="utf-8") result = parser.parse(str(f)) assert "# Title" in result assert "## Subtitle" in result def test_converts_links_to_markdown(self, parser, tmp_path): f = tmp_path / "links.html" html = 'Example' f.write_text(html, encoding="utf-8") result = parser.parse(str(f)) assert "Example" in result assert "https://example.com" in result def test_converts_lists_to_markdown(self, parser, tmp_path): f = tmp_path / "lists.html" html = "UTF-8 content
' f.write_text(html, encoding="utf-8") result = parser.parse(str(f)) assert "UTF-8 content" in result def test_gbk_encoded_html_with_meta_charset(self, parser, tmp_path): f = tmp_path / "gbk.html" html = '你好世界,这是中文内容测试
' f.write_bytes(html.encode("gbk")) result = parser.parse(str(f)) assert "你好世界" in result def test_encoding_fallback_to_charset_normalizer(self, parser, tmp_path): f = tmp_path / "no_meta.html" html = "Hello, this is a test with enough text for encoding detection to work properly.
" f.write_bytes(html.encode("utf-8")) result = parser.parse(str(f)) assert "Hello" in result def test_nonexistent_file_raises(self, parser): with pytest.raises(ParseError) as exc_info: parser.parse("/nonexistent/path/file.html") assert "file.html" in exc_info.value.file_name assert exc_info.value.reason != "" def test_parse_error_contains_filename(self, parser): with pytest.raises(ParseError) as exc_info: parser.parse("/no/such/mypage.html") assert exc_info.value.file_name == "mypage.html" def test_complex_html_removes_all_tags(self, parser, tmp_path): f = tmp_path / "complex.html" html = """Paragraph with bold and italic text.
| Name | Value |
|---|---|
| A | 1 |