大更新:架构调整,数据分析能力提升。
This commit is contained in:
@@ -154,6 +154,111 @@ def load_data_chunked(file_path: str, chunksize: Optional[int] = None) -> Iterat
|
||||
print(f"[ERROR] 读取Excel文件失败: {e}")
|
||||
|
||||
|
||||
def _profile_chunked(file_path: str) -> str:
    """Profile a large file by reading the first chunk plus sampled subsequent chunks.

    Uses ``load_data_chunked()`` to stream the file. The first chunk is kept
    in full; every 5th subsequent chunk contributes up to 100 sampled rows.
    A markdown profile is generated from the combined sample.

    Args:
        file_path: Path to the data file.

    Returns:
        A markdown string containing the sampled profile for this file.
    """
    file_name = os.path.basename(file_path)

    chunks_iter = load_data_chunked(file_path)
    first_chunk = next(chunks_iter, None)
    if first_chunk is None:
        # load_data_chunked yielded nothing: the file could not be read.
        return f"## 文件: {file_name}\n\n[ERROR] 无法读取文件: {file_path}\n\n"

    sample_parts = [first_chunk]
    for i, chunk in enumerate(chunks_iter):
        if i % 5 == 0:  # sample every 5th subsequent chunk
            # head(100) already caps shorter chunks, so min(100, len(chunk))
            # was redundant.
            sample_parts.append(chunk.head(100))

    combined = pd.concat(sample_parts, ignore_index=True)

    # Accumulate fragments and join once at the end to avoid quadratic
    # string concatenation on frames with many columns.
    parts = [f"## 文件: {file_name}\n\n"]
    parts.append("- **注意**: 此画像基于抽样数据生成(首块 + 每5块采样100行)\n")
    rows, cols = combined.shape
    parts.append(f"- **样本维度**: {rows} 行 x {cols} 列\n")
    parts.append(f"- **列名**: `{', '.join(combined.columns)}`\n\n")
    parts.append("### 列详细分布:\n")

    for col in combined.columns:
        dtype = combined[col].dtype
        null_count = combined[col].isnull().sum()
        # Ratio is relative to the sampled rows, not the full file.
        null_ratio = (null_count / rows) * 100 if rows > 0 else 0

        parts.append(f"#### {col} ({dtype})\n")
        if null_count > 0:
            parts.append(f"- [WARN] 空值: {null_count} ({null_ratio:.1f}%)\n")

        if pd.api.types.is_numeric_dtype(dtype):
            desc = combined[col].describe()
            parts.append(
                f"- 统计: Min={desc['min']:.2f}, Max={desc['max']:.2f}, Mean={desc['mean']:.2f}\n"
            )
        elif pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.CategoricalDtype):
            # isinstance check replaces pd.api.types.is_categorical_dtype,
            # deprecated since pandas 2.1.
            unique_count = combined[col].nunique()
            parts.append(f"- 唯一值数量: {unique_count}\n")
            if unique_count > 0:
                top_n = combined[col].value_counts().head(5)
                top_items_str = ", ".join([f"{k}({v})" for k, v in top_n.items()])
                parts.append(f"- **TOP 5 高频值**: {top_items_str}\n")
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            parts.append(f"- 范围: {combined[col].min()} 至 {combined[col].max()}\n")

        parts.append("\n")

    return "".join(parts)
|
||||
|
||||
|
||||
def load_and_profile_data_smart(file_paths: list, max_file_size_mb: Optional[int] = None) -> str:
    """Smart data loader: selects chunked profiling for large files and full
    profiling for small files based on a size threshold.

    Args:
        file_paths: List of file paths to profile.
        max_file_size_mb: Size threshold in MB. Files larger than this use
            chunked profiling. Defaults to ``app_config.max_file_size_mb``.
            (Annotation fixed from the invalid implicit-Optional ``int = None``.)

    Returns:
        A markdown string containing the combined data profile.
    """
    if max_file_size_mb is None:
        max_file_size_mb = app_config.max_file_size_mb

    profile_summary = "# 数据画像报告 (Data Profile)\n\n"

    if not file_paths:
        return profile_summary + "未提供数据文件。"

    for file_path in file_paths:
        if not os.path.exists(file_path):
            profile_summary += f"## 文件: {os.path.basename(file_path)}\n\n"
            profile_summary += f"[WARN] 文件不存在: {file_path}\n\n"
            continue

        try:
            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
            if file_size_mb > max_file_size_mb:
                profile_summary += _profile_chunked(file_path)
            else:
                # Reuse the existing full-load profiler for this single file,
                # stripping its report header so the section nests under ours.
                profile_summary += load_and_profile_data([file_path]).replace(
                    "# 数据画像报告 (Data Profile)\n\n", ""
                )
        except Exception as e:
            # Best-effort: record the failure for this file and keep going
            # so one bad file does not abort the whole report.
            profile_summary += f"## 文件: {os.path.basename(file_path)}\n\n"
            profile_summary += f"[ERROR] 读取或分析文件失败: {str(e)}\n\n"

    return profile_summary
|
||||
|
||||
|
||||
def load_data_with_cache(file_path: str, force_reload: bool = False) -> Optional[pd.DataFrame]:
|
||||
"""
|
||||
带缓存的数据加载
|
||||
|
||||
Reference in New Issue
Block a user