爬虫实现,修改class转样式,前端渲染
This commit is contained in:
@@ -250,6 +250,56 @@ class XxqgCrawler(BaseCrawler):
|
||||
|
||||
# 相对路径,补全域名
|
||||
return self.config.base_url + url
|
||||
|
||||
def _extract_inline_style(self, element) -> str:
|
||||
"""
|
||||
提取元素的计算样式并转换为inline style
|
||||
|
||||
Args:
|
||||
element: Selenium WebElement
|
||||
|
||||
Returns:
|
||||
inline style 字符串
|
||||
"""
|
||||
# 需要提取的CSS属性列表
|
||||
css_properties = [
|
||||
'text-align',
|
||||
'text-indent',
|
||||
'margin',
|
||||
'margin-top',
|
||||
'margin-bottom',
|
||||
'margin-left',
|
||||
'margin-right',
|
||||
'padding',
|
||||
'padding-top',
|
||||
'padding-bottom',
|
||||
'padding-left',
|
||||
'padding-right',
|
||||
'font-size',
|
||||
'font-weight',
|
||||
'font-style',
|
||||
'color',
|
||||
'background-color',
|
||||
'line-height',
|
||||
'letter-spacing',
|
||||
'word-spacing'
|
||||
]
|
||||
|
||||
styles = []
|
||||
for prop in css_properties:
|
||||
try:
|
||||
value = element.value_of_css_property(prop)
|
||||
# 过滤默认值和空值
|
||||
if value and value not in ['none', 'normal', 'auto', '0px', 'rgba(0, 0, 0, 0)', 'transparent']:
|
||||
# 对于 margin/padding,如果都是 0px 就跳过
|
||||
if 'margin' in prop or 'padding' in prop:
|
||||
if value == '0px' or value == '0':
|
||||
continue
|
||||
styles.append(f"{prop}: {value}")
|
||||
except:
|
||||
continue
|
||||
|
||||
return "; ".join(styles) if styles else ""
|
||||
|
||||
def parse_news_detail(self, url: str) -> NewsItem:
|
||||
news_item = NewsItem(title='', contentRows=[], url=url)
|
||||
@@ -355,11 +405,21 @@ class XxqgCrawler(BaseCrawler):
|
||||
text_content = child.text.strip()
|
||||
# 过滤空内容
|
||||
if text_content:
|
||||
# 提取计算样式并转换为inline style
|
||||
inline_style = self._extract_inline_style(child)
|
||||
tag_name = child.tag_name
|
||||
|
||||
# 构建新的HTML标签(用inline style替代class)
|
||||
if inline_style:
|
||||
content_html = f'<{tag_name} style="{inline_style}">{child.get_attribute("innerHTML")}</{tag_name}>'
|
||||
else:
|
||||
content_html = f'<{tag_name}>{child.get_attribute("innerHTML")}</{tag_name}>'
|
||||
|
||||
news_item.contentRows.append({
|
||||
"type": "text",
|
||||
"content": text_content
|
||||
"content": content_html
|
||||
})
|
||||
# logger.debug(f"提取文字: {text_content[:50]}...")
|
||||
logger.debug(f"提取文字(转换样式): {text_content[:50]}...")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"处理内容元素失败: {str(e)}")
|
||||
|
||||
Reference in New Issue
Block a user