Files
iov_data_analysis_agent/data_preprocessing/sorter.py
2026-02-02 09:44:07 +08:00

83 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
数据排序模块
按时间列对 CSV 文件进行排序
"""
import os
import pandas as pd
from typing import Optional
from .config import default_config
def sort_by_time(
    input_path: str,
    output_path: Optional[str] = None,
    time_column: Optional[str] = None,
    inplace: bool = False
) -> str:
    """
    Sort a CSV file by its time column and write the result to disk.

    The time column is parsed with ``pd.to_datetime(errors='coerce')`` and the
    parsed values replace the original column, so the output holds the
    timestamps as pandas renders them. Values that cannot be parsed become
    NaT, are placed after all valid rows, and are therefore not written back
    in their original textual form.

    Rows that share the same timestamp keep their original relative order
    (the sort is stable).

    Args:
        input_path: Path of the CSV file to read.
        output_path: Destination path. Ignored when ``inplace`` is True.
            When None and ``inplace`` is False, the file is written to
            ``default_config.cleaned_data_dir`` as ``<name>_sorted<ext>``.
            Missing parent directories are created before saving.
        time_column: Name of the column to sort by. Falls back to
            ``default_config.default_time_column`` when None or empty.
        inplace: When True, overwrite ``input_path`` with the sorted data.

    Returns:
        Absolute path of the written file.

    Raises:
        FileNotFoundError: ``input_path`` does not exist.
        KeyError: ``time_column`` is not a column of the CSV file.
    """
    # `or` (not `is None`) so that an empty string also falls back to the
    # configured default column.
    time_column = time_column or default_config.default_time_column

    if not os.path.exists(input_path):
        raise FileNotFoundError(f"文件不存在: {input_path}")

    # Resolve the destination; `inplace` takes precedence over `output_path`.
    if inplace:
        output_path = input_path
    elif output_path is None:
        # presumably creates the configured output directories -- defined in
        # .config, not visible from this file
        default_config.ensure_dirs()
        basename = os.path.basename(input_path)
        name, ext = os.path.splitext(basename)
        output_path = os.path.join(
            default_config.cleaned_data_dir,
            f"{name}_sorted{ext}"
        )

    print(f"[READ] 正在读取: {input_path}")
    df = pd.read_csv(input_path, low_memory=False)
    print(f" 数据行数: {len(df)}")

    # Fail early, listing the available columns, instead of letting pandas
    # raise a bare KeyError further down.
    if time_column not in df.columns:
        available_cols = list(df.columns)
        raise KeyError(
            f"未找到时间列 '{time_column}'。可用列: {available_cols}"
        )

    print(f"[PARSE] 正在解析时间列 '{time_column}'...")
    # NOTE(review): this overwrites the column, so unparseable values are
    # written out as missing rather than as their original text.
    df[time_column] = pd.to_datetime(df[time_column], errors='coerce')

    # Report how many values could not be parsed.
    nat_count = df[time_column].isna().sum()
    if nat_count > 0:
        print(f"[WARN] 发现 {nat_count} 行无效时间数据,排序时将排在最后")

    print("[SORT] 正在按时间排序...")
    # mergesort is stable: rows with identical timestamps keep their input
    # order, which the default quicksort does not guarantee.
    df_sorted = df.sort_values(
        by=time_column,
        na_position='last',
        kind='mergesort'
    )

    # Create the destination directory only now, so a failed run (missing
    # file or column) leaves no stray directories behind.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"[SAVE] 正在保存: {output_path}")
    df_sorted.to_csv(output_path, index=False, encoding=default_config.csv_encoding)

    abs_output = os.path.abspath(output_path)
    print(f"[OK] 排序完成!输出文件: {abs_output}")
    return abs_output