爬虫实现，修改class转样式，前端渲染

2025-11-21 16:49:37 +08:00
parent 7eab82c6af
commit a492b68fa2
11 changed files with 406 additions and 45 deletions
--- a/schoolNewsCrawler/crawler/xxqg/XxqgCrawler.py
+++ b/schoolNewsCrawler/crawler/xxqg/XxqgCrawler.py
@@ -250,6 +250,56 @@ class XxqgCrawler(BaseCrawler):
        
        # 相对路径，补全域名
        return self.config.base_url + url
+    
+    def _extract_inline_style(self, element) -> str:
+        """
+        Build an inline style string from an element's computed CSS values.
+
+        Args:
+            element: Selenium WebElement
+
+        Returns:
+            "prop: value" pairs joined by "; ", or "" when nothing is kept
+        """
+        # CSS properties to copy, in output order
+        css_properties = [
+            'text-align',
+            'text-indent',
+            'margin',
+            'margin-top',
+            'margin-bottom',
+            'margin-left',
+            'margin-right',
+            'padding',
+            'padding-top',
+            'padding-bottom',
+            'padding-left',
+            'padding-right',
+            'font-size',
+            'font-weight',
+            'font-style',
+            'color',
+            'background-color',
+            'line-height',
+            'letter-spacing',
+            'word-spacing',
+        ]
+        # Computed values treated as unset and left out of the result
+        skipped = ('none', 'normal', 'auto', '0px', 'rgba(0, 0, 0, 0)', 'transparent')
+        styles = []
+        for prop in css_properties:
+            try:
+                value = element.value_of_css_property(prop)
+            except Exception:  # best effort per property; Ctrl-C / SystemExit still propagate
+                continue
+            if not value or value in skipped:
+                continue
+            # margin/padding: also skip when every listed side is zero (e.g. "0px 0px")
+            if prop.startswith(('margin', 'padding')) and set(value.split()) <= {'0', '0px'}:
+                continue
+            styles.append(f"{prop}: {value}")
+
+        return "; ".join(styles)
  
    def parse_news_detail(self, url: str) -> NewsItem:
        news_item = NewsItem(title='', contentRows=[], url=url)
@@ -355,11 +405,21 @@ class XxqgCrawler(BaseCrawler):
                    text_content = child.text.strip()
                    # 过滤空内容
                    if text_content:
+                        # 提取计算样式并转换为inline style
+                        inline_style = self._extract_inline_style(child)
+                        tag_name = child.tag_name
+                        
+                        # 构建新的HTML标签（用inline style替代class）
+                        if inline_style:
+                            content_html = f'<{tag_name} style="{inline_style}">{child.get_attribute("innerHTML")}</{tag_name}>'
+                        else:
+                            content_html = f'<{tag_name}>{child.get_attribute("innerHTML")}</{tag_name}>'
+                        
                        news_item.contentRows.append({
                            "type": "text",
-                            "content": text_content
+                            "content": content_html
                        })
-                        # logger.debug(f"提取文字: {text_content[:50]}...")
+                        logger.debug(f"提取文字（转换样式）: {text_content[:50]}...")
                
                except Exception as e:
                    logger.warning(f"处理内容元素失败: {str(e)}")