"""Unit tests for HtmlParser."""
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from exceptions import ParseError
|
||
|
|
from parsers.html_parser import HtmlParser
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def parser() -> HtmlParser:
    """Build the ``HtmlParser`` instance injected into tests that request ``parser``."""
    html_parser = HtmlParser()
    return html_parser
|
||
|
|
|
||
|
|
|
||
|
|
class TestSupportedExtensions:
    """Checks on the collection returned by ``parser.supported_extensions()``."""

    def test_supports_html(self, parser):
        """The ``.html`` extension is reported as supported."""
        extensions = parser.supported_extensions()
        assert ".html" in extensions

    def test_supports_htm(self, parser):
        """The ``.htm`` extension is reported as supported."""
        extensions = parser.supported_extensions()
        assert ".htm" in extensions

    def test_only_two_extensions(self, parser):
        """Exactly two extensions are reported."""
        extensions = parser.supported_extensions()
        assert len(extensions) == 2
|
||
|
|
|
||
|
|
|
||
|
|
class TestParse:
    """Tests for ``parser.parse(path)``.

    Each test writes a small HTML document under pytest's ``tmp_path``,
    parses it by file path, and checks the returned text with substring
    assertions.
    """

    def test_parse_simple_html(self, parser, tmp_path):
        """Paragraph text of a minimal ``.html`` document appears in the output."""
        f = tmp_path / "test.html"
        f.write_text("<html><body><p>Hello, world!</p></body></html>", encoding="utf-8")
        result = parser.parse(str(f))
        assert "Hello, world!" in result

    def test_parse_htm_extension(self, parser, tmp_path):
        """A file with the ``.htm`` extension is parsed like ``.html``."""
        f = tmp_path / "test.htm"
        f.write_text("<html><body><p>HTM file</p></body></html>", encoding="utf-8")
        result = parser.parse(str(f))
        assert "HTM file" in result

    def test_parse_empty_file(self, parser, tmp_path):
        """A zero-byte file parses to the empty string."""
        f = tmp_path / "empty.html"
        f.write_bytes(b"")
        assert parser.parse(str(f)) == ""

    def test_removes_script_tags(self, parser, tmp_path):
        """Script markup and its body are dropped; sibling content is kept."""
        f = tmp_path / "script.html"
        html = "<html><body><script>alert('xss');</script><p>Content</p></body></html>"
        f.write_text(html, encoding="utf-8")
        result = parser.parse(str(f))
        assert "alert" not in result
        # The previous form of this check was OR-ed with '"Content" in result',
        # which the next assertion requires anyway, so it could never fail.
        assert "<script" not in result.lower()
        assert "Content" in result

    def test_removes_style_tags(self, parser, tmp_path):
        """CSS inside ``<style>`` is dropped; body text is kept."""
        f = tmp_path / "style.html"
        html = "<html><head><style>body { color: red; }</style></head><body><p>Styled</p></body></html>"
        f.write_text(html, encoding="utf-8")
        result = parser.parse(str(f))
        assert "color: red" not in result
        assert "Styled" in result

    def test_converts_headings_to_markdown(self, parser, tmp_path):
        """``<h1>``/``<h2>`` become ``#``/``##`` Markdown headings."""
        f = tmp_path / "headings.html"
        html = "<html><body><h1>Title</h1><h2>Subtitle</h2><p>Text</p></body></html>"
        f.write_text(html, encoding="utf-8")
        result = parser.parse(str(f))
        assert "# Title" in result
        assert "## Subtitle" in result

    def test_converts_links_to_markdown(self, parser, tmp_path):
        """Both the link text and its target URL survive conversion."""
        f = tmp_path / "links.html"
        html = '<html><body><a href="https://example.com">Example</a></body></html>'
        f.write_text(html, encoding="utf-8")
        result = parser.parse(str(f))
        # Only presence is checked, not the exact Markdown link syntax.
        assert "Example" in result
        assert "https://example.com" in result

    def test_converts_lists_to_markdown(self, parser, tmp_path):
        """The text of every list item survives conversion."""
        f = tmp_path / "lists.html"
        html = "<html><body><ul><li>Item 1</li><li>Item 2</li></ul></body></html>"
        f.write_text(html, encoding="utf-8")
        result = parser.parse(str(f))
        # Only presence is checked, not the exact Markdown bullet syntax.
        assert "Item 1" in result
        assert "Item 2" in result

    def test_meta_charset_detection(self, parser, tmp_path):
        """A document declaring ``<meta charset="utf-8">`` parses correctly."""
        f = tmp_path / "charset.html"
        html = '<html><head><meta charset="utf-8"></head><body><p>UTF-8 content</p></body></html>'
        f.write_text(html, encoding="utf-8")
        result = parser.parse(str(f))
        # NOTE: the payload is pure ASCII, so this passes under any
        # ASCII-compatible decoding; the GBK test below is the one that
        # exercises a declared non-UTF-8 charset.
        assert "UTF-8 content" in result

    def test_gbk_encoded_html_with_meta_charset(self, parser, tmp_path):
        """GBK-encoded bytes with a matching ``<meta charset>`` decode correctly."""
        f = tmp_path / "gbk.html"
        html = '<html><head><meta charset="gbk"></head><body><p>你好世界,这是中文内容测试</p></body></html>'
        # Written as raw bytes so the on-disk encoding really is GBK.
        f.write_bytes(html.encode("gbk"))
        result = parser.parse(str(f))
        assert "你好世界" in result

    def test_encoding_fallback_to_charset_normalizer(self, parser, tmp_path):
        """A document without any ``<meta charset>`` declaration still parses."""
        f = tmp_path / "no_meta.html"
        html = "<html><body><p>Hello, this is a test with enough text for encoding detection to work properly.</p></body></html>"
        # NOTE: the payload is pure ASCII, so this only verifies that the
        # no-declaration path yields text, not that detection picked a
        # particular encoding.
        f.write_bytes(html.encode("utf-8"))
        result = parser.parse(str(f))
        assert "Hello" in result

    def test_nonexistent_file_raises(self, parser):
        """A missing path raises ``ParseError`` carrying a file name and a reason."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/nonexistent/path/file.html")
        assert "file.html" in exc_info.value.file_name
        assert exc_info.value.reason != ""

    def test_parse_error_contains_filename(self, parser):
        """``ParseError.file_name`` is the base name, not the full path."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/no/such/mypage.html")
        assert exc_info.value.file_name == "mypage.html"

    def test_complex_html_removes_all_tags(self, parser, tmp_path):
        """A realistic page keeps its text and loses scripts, styles and raw tags."""
        f = tmp_path / "complex.html"
        html = (
            "<!DOCTYPE html>\n"
            "<html>\n"
            "<head>\n"
            "<title>Test Page</title>\n"
            "<style>.hidden { display: none; }</style>\n"
            "<script>var x = 1;</script>\n"
            "</head>\n"
            "<body>\n"
            '<div class="container">\n'
            "<h1>Main Title</h1>\n"
            "<p>Paragraph with <strong>bold</strong> and <em>italic</em> text.</p>\n"
            "<script>console.log('inline script');</script>\n"
            "<table>\n"
            "<tr><th>Name</th><th>Value</th></tr>\n"
            "<tr><td>A</td><td>1</td></tr>\n"
            "</table>\n"
            "</div>\n"
            "</body>\n"
            "</html>"
        )
        f.write_text(html, encoding="utf-8")
        result = parser.parse(str(f))
        assert "Main Title" in result
        # Covers both plain "bold" and Markdown "**bold**"; the former
        # '... or "**bold**" in result' alternative was implied by this check.
        assert "bold" in result.lower()
        assert "<script>" not in result
        assert "<style>" not in result
        assert "<div" not in result
        assert "console.log" not in result
        assert "var x" not in result
|