爬虫实现,修改class转样式,前端渲染

This commit is contained in:
2025-11-21 16:49:37 +08:00
parent 7eab82c6af
commit a492b68fa2
11 changed files with 406 additions and 45 deletions

View File

@@ -250,6 +250,56 @@ class XxqgCrawler(BaseCrawler):
# 相对路径,补全域名
return self.config.base_url + url
def _extract_inline_style(self, element) -> str:
    """Build an inline ``style`` string from an element's computed CSS.

    A fixed list of typography and spacing properties is read through
    ``element.value_of_css_property``; values that carry no styling
    information (empty, keyword defaults, zero lengths, transparent
    colours) are dropped.

    Args:
        element: Object exposing ``value_of_css_property(name)``; the
            callers pass a Selenium WebElement.

    Returns:
        The kept declarations formatted as ``"prop: value"`` and joined
        with ``"; "``, or an empty string when nothing was kept.
    """
    # CSS properties to copy, in the order they appear in the output.
    css_properties = (
        'text-align',
        'text-indent',
        'margin',
        'margin-top',
        'margin-bottom',
        'margin-left',
        'margin-right',
        'padding',
        'padding-top',
        'padding-bottom',
        'padding-left',
        'padding-right',
        'font-size',
        'font-weight',
        'font-style',
        'color',
        'background-color',
        'line-height',
        'letter-spacing',
        'word-spacing',
    )
    # Values treated as "default / nothing to emit" for every property.
    ignored_values = (
        'none', 'normal', 'auto', '0px', 'rgba(0, 0, 0, 0)', 'transparent',
    )
    zero_lengths = ('0px', '0')

    styles = []
    for prop in css_properties:
        # Only the driver call is guarded: a failing lookup for one
        # property must not abort the whole extraction. BaseException
        # (KeyboardInterrupt, SystemExit) is deliberately NOT caught.
        try:
            value = element.value_of_css_property(prop)
        except Exception:
            continue

        if not value or value in ignored_values:
            continue

        # For margin/padding, skip the declaration when every component
        # is zero; this also covers shorthand forms such as "0px 0px".
        if 'margin' in prop or 'padding' in prop:
            if all(part in zero_lengths for part in value.split()):
                continue

        styles.append(f"{prop}: {value}")

    return "; ".join(styles)
def parse_news_detail(self, url: str) -> NewsItem:
news_item = NewsItem(title='', contentRows=[], url=url)
@@ -355,11 +405,21 @@ class XxqgCrawler(BaseCrawler):
text_content = child.text.strip()
# 过滤空内容
if text_content:
# 提取计算样式并转换为inline style
inline_style = self._extract_inline_style(child)
tag_name = child.tag_name
# 构建新的HTML标签用inline style替代class
if inline_style:
content_html = f'<{tag_name} style="{inline_style}">{child.get_attribute("innerHTML")}</{tag_name}>'
else:
content_html = f'<{tag_name}>{child.get_attribute("innerHTML")}</{tag_name}>'
news_item.contentRows.append({
"type": "text",
"content": text_content
"content": content_html
})
# logger.debug(f"提取文字: {text_content[:50]}...")
logger.debug(f"提取文字(转换样式): {text_content[:50]}...")
except Exception as e:
logger.warning(f"处理内容元素失败: {str(e)}")