大更新:架构调整,数据分析能力提升。
This commit is contained in:
@@ -154,6 +154,111 @@ def load_data_chunked(file_path: str, chunksize: Optional[int] = None) -> Iterat
|
||||
print(f"[ERROR] 读取Excel文件失败: {e}")
|
||||
|
||||
|
||||
def _profile_chunked(file_path: str) -> str:
    """Profile a large file by reading the first chunk plus sampled subsequent chunks.

    Uses ``load_data_chunked()`` to stream the file. The first chunk is kept
    in full; every 5th subsequent chunk contributes up to 100 sampled rows.
    A markdown profile is generated from the combined sample.

    Args:
        file_path: Path to the data file.

    Returns:
        A markdown string containing the sampled profile for this file.
    """
    file_name = os.path.basename(file_path)

    chunks_iter = load_data_chunked(file_path)
    first_chunk = next(chunks_iter, None)
    if first_chunk is None:
        # load_data_chunked yielded nothing: the file could not be read.
        return f"## 文件: {file_name}\n\n[ERROR] 无法读取文件: {file_path}\n\n"

    sample_parts = [first_chunk]
    for i, chunk in enumerate(chunks_iter):
        if i % 5 == 0:  # sample every 5th subsequent chunk
            # head(100) already caps shorter chunks, so min(100, len(chunk))
            # was redundant.
            sample_parts.append(chunk.head(100))

    combined = pd.concat(sample_parts, ignore_index=True)

    # Accumulate fragments and join once at the end to avoid quadratic
    # string concatenation on frames with many columns.
    parts = [f"## 文件: {file_name}\n\n"]
    parts.append("- **注意**: 此画像基于抽样数据生成(首块 + 每5块采样100行)\n")
    rows, cols = combined.shape
    parts.append(f"- **样本维度**: {rows} 行 x {cols} 列\n")
    parts.append(f"- **列名**: `{', '.join(combined.columns)}`\n\n")
    parts.append("### 列详细分布:\n")

    for col in combined.columns:
        dtype = combined[col].dtype
        null_count = combined[col].isnull().sum()
        # Ratio is relative to the sampled rows, not the full file.
        null_ratio = (null_count / rows) * 100 if rows > 0 else 0

        parts.append(f"#### {col} ({dtype})\n")
        if null_count > 0:
            parts.append(f"- [WARN] 空值: {null_count} ({null_ratio:.1f}%)\n")

        if pd.api.types.is_numeric_dtype(dtype):
            desc = combined[col].describe()
            parts.append(
                f"- 统计: Min={desc['min']:.2f}, Max={desc['max']:.2f}, Mean={desc['mean']:.2f}\n"
            )
        elif pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.CategoricalDtype):
            # isinstance check replaces pd.api.types.is_categorical_dtype,
            # deprecated since pandas 2.1.
            unique_count = combined[col].nunique()
            parts.append(f"- 唯一值数量: {unique_count}\n")
            if unique_count > 0:
                top_n = combined[col].value_counts().head(5)
                top_items_str = ", ".join([f"{k}({v})" for k, v in top_n.items()])
                parts.append(f"- **TOP 5 高频值**: {top_items_str}\n")
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            parts.append(f"- 范围: {combined[col].min()} 至 {combined[col].max()}\n")

        parts.append("\n")

    return "".join(parts)
|
||||
|
||||
|
||||
def load_and_profile_data_smart(file_paths: list, max_file_size_mb: Optional[int] = None) -> str:
    """Smart data loader: selects chunked profiling for large files and full
    profiling for small files based on a size threshold.

    Args:
        file_paths: List of file paths to profile.
        max_file_size_mb: Size threshold in MB. Files larger than this use
            chunked profiling. Defaults to ``app_config.max_file_size_mb``.
            (Annotation fixed from the invalid implicit-Optional ``int = None``.)

    Returns:
        A markdown string containing the combined data profile.
    """
    if max_file_size_mb is None:
        max_file_size_mb = app_config.max_file_size_mb

    profile_summary = "# 数据画像报告 (Data Profile)\n\n"

    if not file_paths:
        return profile_summary + "未提供数据文件。"

    for file_path in file_paths:
        if not os.path.exists(file_path):
            profile_summary += f"## 文件: {os.path.basename(file_path)}\n\n"
            profile_summary += f"[WARN] 文件不存在: {file_path}\n\n"
            continue

        try:
            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
            if file_size_mb > max_file_size_mb:
                profile_summary += _profile_chunked(file_path)
            else:
                # Reuse the existing full-load profiler for this single file,
                # stripping its report header so the section nests under ours.
                profile_summary += load_and_profile_data([file_path]).replace(
                    "# 数据画像报告 (Data Profile)\n\n", ""
                )
        except Exception as e:
            # Best-effort: record the failure for this file and keep going
            # so one bad file does not abort the whole report.
            profile_summary += f"## 文件: {os.path.basename(file_path)}\n\n"
            profile_summary += f"[ERROR] 读取或分析文件失败: {str(e)}\n\n"

    return profile_summary
|
||||
|
||||
|
||||
def load_data_with_cache(file_path: str, force_reload: bool = False) -> Optional[pd.DataFrame]:
|
||||
"""
|
||||
带缓存的数据加载
|
||||
|
||||
Reference in New Issue
Block a user