车辆在线市场统计,爬虫实现
This commit is contained in:
148
merge_results.py
Normal file
148
merge_results.py
Normal file
@@ -0,0 +1,148 @@
|
||||
import os
|
||||
import csv
|
||||
import glob
|
||||
import pandas as pd
|
||||
import json
|
||||
|
||||
def merge_csv_files(output_file='merged_results.csv'):
|
||||
"""合并当前目录下的所有 temp_results_*.csv 和 query_results.csv 文件"""
|
||||
print("🚀 开始合并 CSV 文件...")
|
||||
|
||||
# 查找所有相关 CSV 文件
|
||||
csv_files = glob.glob('temp_results_*.csv')
|
||||
if os.path.exists('query_results.csv'):
|
||||
csv_files.append('query_results.csv')
|
||||
|
||||
if not csv_files:
|
||||
print("⚠️ 未找到任何结果文件")
|
||||
return
|
||||
|
||||
print(f"📋 找到 {len(csv_files)} 个文件: {csv_files}")
|
||||
|
||||
all_data = []
|
||||
seen_vins = set()
|
||||
|
||||
for file in csv_files:
|
||||
try:
|
||||
# 尝试使用 pandas 读取,处理编码问题
|
||||
try:
|
||||
df = pd.read_csv(file, encoding='utf-8-sig')
|
||||
except:
|
||||
df = pd.read_csv(file, encoding='gbk')
|
||||
|
||||
# 标准化列名(处理之前版本可能存在的列名差异)
|
||||
# 假设现在的列是 ['VIN', '结果', '查询时间', '状态']
|
||||
# 旧的可能是 ['VIN', '单车在线时长', ...]
|
||||
|
||||
for index, row in df.iterrows():
|
||||
vin = str(row.get('VIN', '')).strip().upper()
|
||||
if not vin or vin in seen_vins:
|
||||
continue
|
||||
|
||||
# 获取结果数据
|
||||
result = row.get('结果', row.get('单车在线时长', ''))
|
||||
|
||||
# 尝试解析 JSON 结果
|
||||
parsed_data = {}
|
||||
is_json = False
|
||||
try:
|
||||
if result and result.strip().startswith('{'):
|
||||
parsed_data = json.loads(result)
|
||||
is_json = True
|
||||
except:
|
||||
pass
|
||||
|
||||
# 基础信息
|
||||
entry = {
|
||||
'VIN': vin,
|
||||
'查询时间': row.get('查询时间', ''),
|
||||
'状态': row.get('状态', ''),
|
||||
'原始结果': result
|
||||
}
|
||||
|
||||
# 如果是 JSON 数据,提取详细字段
|
||||
if is_json:
|
||||
# 提取 intervalList 和 daysList
|
||||
intervals = parsed_data.get('intervalList', [])
|
||||
days = parsed_data.get('daysList', [])
|
||||
|
||||
if len(intervals) == len(days):
|
||||
for i, interval in enumerate(intervals):
|
||||
entry[f'时长区间_{interval}'] = days[i]
|
||||
|
||||
# 也可以提取 startTime 和 endTime
|
||||
# entry['startTime'] = parsed_data.get('startTime', '')
|
||||
# entry['endTime'] = parsed_data.get('endTime', '')
|
||||
|
||||
all_data.append(entry)
|
||||
seen_vins.add(vin)
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ 处理文件 {file} 出错: {e}")
|
||||
|
||||
if not all_data:
|
||||
print("⚠️ 没有有效数据")
|
||||
return
|
||||
|
||||
# 转换为 DataFrame
|
||||
result_df = pd.DataFrame(all_data)
|
||||
|
||||
# 计算估算在线总时长
|
||||
# 映射规则:取区间下限作为估算值('≤0.5'按0.5算,'>6'按6算,区间按平均值或下限算)
|
||||
# 用户需求是“至少在线时长”,所以我们统一使用区间下限
|
||||
# 区间列表: ['≤0.5', '(0.5,1]', '(1,2]', '(2,3]', '(3,4]', '(4,6]', '>6']
|
||||
# 对应权重: [0, 0.5, 1, 2, 3, 4, 6] (≤0.5 算作 0,确保是“至少”)
|
||||
|
||||
weight_map = {
|
||||
'时长区间_≤0.5': 0,
|
||||
'时长区间_(0.5,1]': 0.5,
|
||||
'时长区间_(1,2]': 1,
|
||||
'时长区间_(2,3]': 2,
|
||||
'时长区间_(3,4]': 3,
|
||||
'时长区间_(4,6]': 4,
|
||||
'时长区间_>6': 6
|
||||
}
|
||||
|
||||
def calculate_min_duration(row):
|
||||
total_hours = 0
|
||||
for col, weight in weight_map.items():
|
||||
if col in row:
|
||||
try:
|
||||
days = float(row[col])
|
||||
if pd.notnull(days):
|
||||
total_hours += days * weight
|
||||
except:
|
||||
pass
|
||||
return total_hours
|
||||
|
||||
# 只有当包含相关列时才计算
|
||||
if any(col in result_df.columns for col in weight_map.keys()):
|
||||
result_df['估算至少在线总时长(小时)'] = result_df.apply(calculate_min_duration, axis=1)
|
||||
print("✅ 已计算‘至少在线总时长’")
|
||||
|
||||
# 调整列顺序
|
||||
# 优先显示的列
|
||||
first_cols = ['VIN', '估算至少在线总时长(小时)', '状态', '查询时间']
|
||||
# 动态生成的区间列
|
||||
interval_cols = [c for c in result_df.columns if c.startswith('时长区间_')]
|
||||
# 其他列
|
||||
other_cols = [c for c in result_df.columns if c not in first_cols and c not in interval_cols]
|
||||
|
||||
final_cols = first_cols + interval_cols + other_cols
|
||||
result_df = result_df.reindex(columns=final_cols)
|
||||
|
||||
# 导出
|
||||
result_df.to_csv(output_file, index=False, encoding='utf-8-sig')
|
||||
print(f"\n✅ 合并完成!结果已保存至: {output_file}")
|
||||
print(f"📊 总计合并数据: {len(result_df)} 条")
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 检查 pandas 是否安装
|
||||
try:
|
||||
import pandas
|
||||
merge_csv_files()
|
||||
except ImportError:
|
||||
print("❌ 需要安装 pandas 库: pip install pandas")
|
||||
# 降级处理:使用标准库简单的合并(不解析JSON)
|
||||
print("⚠️ 降级模式:仅简单合并文件")
|
||||
# ... (简易合并逻辑可以省略,通常用户环境已有 pandas)
|
||||
Reference in New Issue
Block a user