Initial commit: 招标信息爬虫与分析系统
This commit is contained in:
68
process_csv.py
Normal file
68
process_csv.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import csv
|
||||
import re
|
||||
|
||||
# Crawler output to inspect, and how many data rows to sample from it.
CSV_PATH = 'data/浙江省公共资源交易中心_20260213_172414.csv'
MAX_ROWS = 20

# Zero-based positions of the columns this script looks at.
TITLE_COL = 0
PROJECT_ID_COL = 6
PROJECT_NAME_COL = 7

# A "[...]" group at the very end of a string. Brackets are excluded from the
# capture so that a title containing an earlier bracket pair only yields its
# final group instead of everything from the first "[" onwards.
TRAILING_ID_RE = re.compile(r'\[([^\[\]]*)\]$')

RULE = '-' * 100


def split_title(title):
    """Split a title of the form ``name[approval number]``.

    Args:
        title: Raw value of the title column.

    Returns:
        A ``(name, approval_id)`` tuple. When the title does not end with a
        bracketed group, ``approval_id`` is ``''`` and ``name`` is the
        stripped title.
    """
    match = TRAILING_ID_RE.search(title)
    if match is None:
        return title.strip(), ''
    return title[:match.start()].strip(), match.group(1)


def cell(row, index):
    """Return ``row[index]``, or ``''`` when the row has no such column."""
    return row[index] if index < len(row) else ''


def load_rows(path, limit=MAX_ROWS):
    """Read the header and at most ``limit`` data rows from a CSV file.

    Args:
        path: Path of the CSV file.
        limit: Maximum number of data rows to read after the header.

    Returns:
        A ``(headers, rows)`` tuple of lists; both are empty for an empty file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module requires for correct handling of
    # quoted line breaks; utf-8-sig reads plain UTF-8 unchanged and also
    # drops a leading BOM if one is present.
    with open(path, 'r', encoding='utf-8-sig', newline='') as file:
        reader = csv.reader(file)
        headers = next(reader, [])
        # zip() stops after `limit` rows, so the rest of the file is never parsed.
        rows = [row for _, row in zip(range(limit), reader)]
    return headers, rows


def main(csv_path=CSV_PATH):
    """Print a consistency report for the title / approval-number / name columns.

    For each sampled row the approval number and project name embedded in the
    title are compared with the dedicated columns, then the project-name
    column is checked for a left-over bracketed approval number. The closing
    summary is derived from the counts collected during those checks.

    Args:
        csv_path: Path of the CSV file to inspect.
    """
    headers, rows = load_rows(csv_path)

    # Original header row.
    print('\n原始表头:')
    for i, header in enumerate(headers):
        print(f'{i+1}. {header}')

    # Per-row comparison of the title against the dedicated columns.
    print(f'\n前{MAX_ROWS}条数据分析:')
    print(RULE)
    print(f'| {"序号":<4} | {"标题":<80} | {"项目批准文号":<30} | {"项目名称":<80} |')
    print(RULE)

    id_mismatches = 0
    name_mismatches = 0
    for i, row in enumerate(rows):
        # Short or blank rows are treated as having empty cells, not as a crash.
        title = cell(row, TITLE_COL)
        project_id = cell(row, PROJECT_ID_COL)
        project_name = cell(row, PROJECT_NAME_COL)

        extracted_name, extracted_id = split_title(title)

        print(f'| {i+1:<4} | {title} | {project_id} | {project_name} |')

        if project_id != extracted_id:
            id_mismatches += 1
            print(f' 警告: 项目批准文号不一致 - 标题中提取: {extracted_id}, 列中值: {project_id}')
        if project_name != extracted_name:
            name_mismatches += 1
            print(f' 警告: 项目名称不一致 - 标题中提取: {extracted_name}, 列中值: {project_name}')

    print(RULE)

    # The project-name column must not still carry a trailing "[approval number]".
    print('\n项目名称列检查:')
    print(RULE)
    print(f'| {"序号":<4} | {"项目名称":<80} | {"是否包含批准文号":<15} |')
    print(RULE)

    names_with_id = 0
    for i, row in enumerate(rows):
        project_name = cell(row, PROJECT_NAME_COL)
        has_id = TRAILING_ID_RE.search(project_name) is not None
        if has_id:
            names_with_id += 1
        print(f'| {i+1:<4} | {project_name} | {"是" if has_id else "否":<15} |')

    print(RULE)

    # Summary: only claim success when the checks above actually passed.
    print('\n总结:')
    if not rows:
        print('1. 未读取到任何数据行,无法验证列分离情况')
        return
    if id_mismatches or name_mismatches or names_with_id:
        print(f'1. 共检查前{len(rows)}条数据,发现以下问题:')
        print(f'2. 项目批准文号不一致: {id_mismatches} 条')
        print(f'3. 项目名称不一致: {name_mismatches} 条')
        print(f'4. 项目名称列仍包含批准文号: {names_with_id} 条')
        return
    print('1. 从CSV文件中可以看到,项目批准文号和项目名称已经正确分离到不同列中')
    print('2. 标题列包含完整信息:项目名称[项目批准文号]')
    print('3. 项目批准文号列(第7列)只包含批准文号')
    print('4. 项目名称列(第8列)只包含纯项目名称,不包含批准文号')
    print(f'5. 前{len(rows)}条数据的项目名称和项目批准文号分离正确')


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user