Files
ztb/process_csv.py
2026-02-13 18:15:20 +08:00

69 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import csv
import re
from itertools import islice
# Sanity-check an exported CSV: every title is expected to look like
# "project name[approval number]", and the dedicated approval-number and
# project-name columns should contain exactly those two parts.

# CSV export to inspect.
CSV_PATH = 'data/浙江省公共资源交易中心_20260213_172414.csv'
# How many data rows (after the header) are analysed.
SAMPLE_SIZE = 20
# 0-based positions of the columns this script reads.
TITLE_COL = 0
PROJECT_ID_COL = 6
PROJECT_NAME_COL = 7
# Horizontal rule printed between table sections.
RULE = '-' * 100
# A bracketed group at the very end of a string, e.g. "name[approval no.]".
# The search is leftmost, so an approval number that itself contains
# brackets ("name[abc[2024]1]") is captured whole.
TRAILING_ID_RE = re.compile(r'\[(.*?)\]$')


def _cell(row, index):
    """Return ``row[index]``, or '' when the row is too short (e.g. blank line)."""
    return row[index] if index < len(row) else ''


def _read_sample(csv_path, limit):
    """Read the header and at most *limit* data rows from *csv_path*.

    Returns:
        A ``(headers, rows)`` tuple; ``headers`` is ``[]`` for an empty file.
    """
    # utf-8-sig also accepts plain UTF-8 but strips a leading BOM so it does
    # not leak into the first header cell; newline='' is what the csv module
    # requires for correct handling of embedded line breaks.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as file:
        reader = csv.reader(file)
        headers = next(reader, [])  # header row
        # Stop after `limit` rows instead of loading the whole file.
        rows = list(islice(reader, limit))
    return headers, rows


def _print_headers(headers):
    """Print the original header cells as a 1-based numbered list."""
    print('\n原始表头:')
    for number, header in enumerate(headers, start=1):
        print(f'{number}. {header}')


def _report_title_consistency(rows, limit):
    """Print each row and warn when its title disagrees with its columns.

    Returns:
        ``(id_mismatches, name_mismatches)`` - how many rows had an approval
        number / project name that differs from the one parsed from the title.
    """
    print(f'\n前{limit}条数据分析:')
    print(RULE)
    print(f'| {"序号":<4} | {"标题":<80} | {"项目批准文号":<30} | {"项目名称":<80} |')
    print(RULE)
    id_mismatches = 0
    name_mismatches = 0
    for number, row in enumerate(rows, start=1):
        title = _cell(row, TITLE_COL)
        project_id = _cell(row, PROJECT_ID_COL)
        project_name = _cell(row, PROJECT_NAME_COL)
        # Approval number taken from the title's trailing "[...]", if any.
        found = TRAILING_ID_RE.search(title)
        extracted_id = found.group(1) if found else ''
        # Title with the trailing "[...]" removed, i.e. the bare project name.
        extracted_name = TRAILING_ID_RE.sub('', title).strip()
        print(f'| {number:<4} | {title} | {project_id} | {project_name} |')
        # Print details for every inconsistency.
        if project_id != extracted_id:
            id_mismatches += 1
            print(f' 警告: 项目批准文号不一致 - 标题中提取: {extracted_id}, 列中值: {project_id}')
        if project_name != extracted_name:
            name_mismatches += 1
            print(f' 警告: 项目名称不一致 - 标题中提取: {extracted_name}, 列中值: {project_name}')
    print(RULE)
    return id_mismatches, name_mismatches


def _report_name_column(rows):
    """Print whether each project-name cell still ends with a "[...]" group.

    Returns:
        The number of rows whose project name ends with a bracketed group.
    """
    print('\n项目名称列检查:')
    print(RULE)
    print(f'| {"序号":<4} | {"项目名称":<80} | {"是否包含批准文号":<15} |')
    print(RULE)
    names_with_id = 0
    for number, row in enumerate(rows, start=1):
        project_name = _cell(row, PROJECT_NAME_COL)
        has_id = bool(TRAILING_ID_RE.search(project_name))
        names_with_id += has_id
        # Both branches used to be empty strings, leaving the column blank.
        print(f'| {number:<4} | {project_name} | {"是" if has_id else "否":<15} |')
    print(RULE)
    return names_with_id


def _print_summary(checked, id_mismatches, name_mismatches, names_with_id):
    """Print a summary that reflects the checks that were actually run."""
    print('\n总结:')
    if not checked:
        print('未读取到任何数据行,无法验证')
        return
    if id_mismatches or name_mismatches or names_with_id:
        # Never claim success when warnings were printed above.
        print(f'1. 共检查 {checked} 条数据')
        print(f'2. 项目批准文号与标题不一致: {id_mismatches} 条')
        print(f'3. 项目名称与标题不一致: {name_mismatches} 条')
        print(f'4. 项目名称列仍包含批准文号: {names_with_id} 条')
        return
    print('1. 从CSV文件中可以看到项目批准文号和项目名称已经正确分离到不同列中')
    print('2. 标题列包含完整信息:项目名称[项目批准文号]')
    print('3. 项目批准文号列第7列只包含批准文号')
    print('4. 项目名称列第8列只包含纯项目名称不包含批准文号')
    print(f'5. 前{checked}条数据的项目名称和项目批准文号分离正确')


def main(csv_path=CSV_PATH, limit=SAMPLE_SIZE):
    """Run all checks on the first *limit* data rows of *csv_path*.

    Args:
        csv_path: Path of the CSV file to inspect.
        limit: Maximum number of data rows (after the header) to analyse.
    """
    headers, rows = _read_sample(csv_path, limit)
    _print_headers(headers)
    id_mismatches, name_mismatches = _report_title_consistency(rows, limit)
    names_with_id = _report_name_column(rows)
    _print_summary(len(rows), id_mismatches, name_mismatches, names_with_id)


if __name__ == '__main__':
    main()