Compare commits

15 commits:

- 9d01f004d4
- b256aa27d9
- c5083736e2
- b033eb61cc
- c8fe5e6d6f
- 3585ba6932
- ad90cd29d3
- e9644360ce
- 5eb13324c2
- 674f48c74b
- fbbb5a2470
- 162f5c4da4
- b1d0cc5462
- e51cdfea6f
- 621e546b43
.env.example — 22 changed lines

@@ -1,8 +1,18 @@
# LLM Provider 配置
# 支持 openai / gemini
LLM_PROVIDER=openai

# 火山引擎配置
OPENAI_API_KEY=sk-c44i1hy64xgzwox6x08o4zug93frq6rgn84oqugf2pje1tg4
OPENAI_BASE_URL=https://api.xiaomimimo.com/v1/chat/completions
# 文本模型
OPENAI_MODEL=mimo-v2-flash
# OPENAI_MODEL=deepseek-r1-250528
# OpenAI 兼容接口配置
OPENAI_API_KEY=your-api-key-here
OPENAI_BASE_URL=http://127.0.0.1:9999/v1
OPENAI_MODEL=your-model-name

# Gemini 配置(当 LLM_PROVIDER=gemini 时生效)
# GEMINI_API_KEY=your-gemini-api-key
# GEMINI_BASE_URL=https://generativelanguage.googleapis.com
# GEMINI_MODEL=gemini-2.5-flash

# 应用配置(可选)
# APP_MAX_ROUNDS=20
# APP_CHUNK_SIZE=100000
# APP_CACHE_ENABLED=true
.gitignore (vendored) — 2 changed lines

@@ -6,6 +6,8 @@ __pycache__/
# C extensions
*.so

# Distribution / packaging
.Python
build/
.kiro/specs/agent-robustness-optimization/.config.kiro — new file, 1 line

@@ -0,0 +1 @@
{"specId": "ea41aaef-0737-4255-bcad-90f156a5b2d5", "workflowType": "requirements-first", "specType": "feature"}
.kiro/specs/agent-robustness-optimization/design.md — new file, 515 lines

@@ -0,0 +1,515 @@
# Design Document: Agent Robustness Optimization

## Overview

This design addresses five areas of improvement for the AI Data Analysis Agent: data privacy fallback recovery, conversation history trimming, analysis template integration, frontend progress display, and multi-file chunked/parallel loading. The changes span the Python backend (`data_analysis_agent.py`, `config/app_config.py`, `utils/data_privacy.py`, `utils/data_loader.py`, `web/main.py`) and the vanilla JS frontend (`web/static/script.js`, `web/static/index.html`, `web/static/clean_style.css`).

The core design principle is **minimal invasiveness**: each feature is implemented as a composable module or method that plugs into the existing agent loop, avoiding large-scale refactors of the `DataAnalysisAgent.analyze()` main loop.

## Architecture

The system follows a layered architecture where the `DataAnalysisAgent` orchestrates LLM calls and code execution, the FastAPI server manages sessions and exposes APIs, and the frontend polls for status updates.

```mermaid
graph TD
    subgraph Frontend
        UI[script.js + index.html]
    end

    subgraph FastAPI Server
        API[web/main.py]
        SM[SessionManager]
    end

    subgraph Agent Core
        DA[DataAnalysisAgent]
        EC[ErrorClassifier]
        HG[HintGenerator]
        HT[HistoryTrimmer]
        TI[TemplateIntegration]
    end

    subgraph Utilities
        DP[data_privacy.py]
        DL[data_loader.py]
        AT[analysis_templates.py]
        CE[code_executor.py]
    end

    subgraph Config
        AC[app_config.py]
    end

    UI -->|POST /api/start, GET /api/status, GET /api/templates| API
    API --> SM
    API --> DA
    DA --> EC
    DA --> HG
    DA --> HT
    DA --> TI
    DA --> CE
    HG --> DP
    DL --> AC
    DA --> DL
    TI --> AT
    EC --> AC
    HT --> AC
```

### Change Impact Summary

| Area | Files Modified | New Files |
|------|---------------|-----------|
| Data Privacy Fallback | `data_analysis_agent.py`, `utils/data_privacy.py`, `config/app_config.py` | None |
| Conversation Trimming | `data_analysis_agent.py`, `config/app_config.py` | None |
| Template System | `data_analysis_agent.py`, `web/main.py`, `web/static/script.js`, `web/static/index.html`, `web/static/clean_style.css` | None |
| Progress Bar | `web/main.py`, `web/static/script.js`, `web/static/index.html`, `web/static/clean_style.css` | None |
| Multi-File Loading | `utils/data_loader.py`, `data_analysis_agent.py`, `config/app_config.py` | None |

## Components and Interfaces

### 1. Error Classifier (`data_analysis_agent.py`)

A new method `_classify_error(error_message: str) -> str` on `DataAnalysisAgent` that inspects error messages and returns `"data_context"` or `"other"`.

```python
DATA_CONTEXT_PATTERNS = [
    r"KeyError:\s*['\"](.+?)['\"]",
    r"ValueError.*(?:column|col|field)",
    r"NameError.*(?:df|data|frame)",
    r"(?:empty|no\s+data|0\s+rows)",
    r"IndexError.*(?:out of range|out of bounds)",
]

def _classify_error(self, error_message: str) -> str:
    """Classify execution error as data-context or other."""
    for pattern in DATA_CONTEXT_PATTERNS:
        if re.search(pattern, error_message, re.IGNORECASE):
            return "data_context"
    return "other"
```
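As a quick illustration of the intended behaviour (the error strings below are made up for this example, and `agent` is any `DataAnalysisAgent` instance):

```python
# Hypothetical inputs; only the classification result matters here.
agent._classify_error("KeyError: 'revenue'")              # -> "data_context"
agent._classify_error("TimeoutError: request timed out")  # -> "other"
```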
### 2. Enriched Hint Generator (`utils/data_privacy.py`)

A new function `generate_enriched_hint(error_message: str, safe_profile: str) -> str` that extracts the referenced column name from the error, looks it up in the safe profile, and returns a hint string containing only schema-level metadata.

```python
def generate_enriched_hint(error_message: str, safe_profile: str) -> str:
    """
    Generate an enriched hint from the safe profile for a data-context error.
    Returns schema-level metadata only — no real data values.
    """
    column_name = _extract_column_from_error(error_message)
    column_meta = _lookup_column_in_profile(column_name, safe_profile)

    hint = "[RETRY CONTEXT] 上一次代码执行因数据上下文错误失败。\n"
    hint += f"错误信息: {error_message}\n"
    if column_meta:
        hint += f"相关列 '{column_name}' 的结构信息:\n"
        hint += f" - 数据类型: {column_meta['dtype']}\n"
        hint += f" - 唯一值数量: {column_meta['unique_count']}\n"
        hint += f" - 空值率: {column_meta['null_rate']}\n"
        hint += f" - 特征描述: {column_meta['description']}\n"
    hint += "请根据以上结构信息修正代码,不要假设具体的数据值。"
    return hint


def _extract_column_from_error(error_message: str) -> Optional[str]:
    """Extract column name from error message patterns like KeyError: 'col_name'."""
    match = re.search(r"KeyError:\s*['\"](.+?)['\"]", error_message)
    if match:
        return match.group(1)
    match = re.search(r"column\s+['\"](.+?)['\"]", error_message, re.IGNORECASE)
    if match:
        return match.group(1)
    return None


def _lookup_column_in_profile(column_name: Optional[str], safe_profile: str) -> Optional[dict]:
    """Look up column metadata in the safe profile markdown table."""
    if not column_name:
        return None
    # Parse the markdown table rows for the matching column
    for line in safe_profile.split("\n"):
        if line.startswith("|") and column_name in line:
            parts = [p.strip() for p in line.split("|") if p.strip()]
            if len(parts) >= 5 and parts[0] == column_name:
                return {
                    "dtype": parts[1],
                    "null_rate": parts[2],
                    "unique_count": parts[3],
                    "description": parts[4],
                }
    return None
```
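A minimal usage sketch, with a made-up error message and a Safe_Profile row in the markdown-table layout that `_lookup_column_in_profile` expects (column name, dtype, null rate, unique count, description — the exact header wording is an assumption):

```python
# Hypothetical inputs for illustration only.
safe_profile = (
    "| 列名 | 数据类型 | 空值率 | 唯一值数量 | 特征描述 |\n"
    "|------|----------|--------|------------|----------|\n"
    "| region | object | 0.0% | 5 | low-cardinality category with 5 classes |\n"
)
hint = generate_enriched_hint("KeyError: 'region'", safe_profile)
# hint mentions the dtype, unique count and null rate of 'region',
# but never any real data values from the Local_Profile.
```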
### 3. Conversation History Trimmer (`data_analysis_agent.py`)

A new method `_trim_conversation_history()` on `DataAnalysisAgent` that implements sliding window trimming with summary compression.

```python
def _trim_conversation_history(self):
    """Apply sliding window trimming to conversation history."""
    window_size = app_config.conversation_window_size
    max_messages = window_size * 2  # pairs of user+assistant messages

    if len(self.conversation_history) <= max_messages:
        return  # No trimming needed

    first_message = self.conversation_history[0]  # Always retain

    # Determine trim boundary: skip first message + possible existing summary
    start_idx = 1
    has_existing_summary = (
        len(self.conversation_history) > 1
        and self.conversation_history[1]["role"] == "user"
        and self.conversation_history[1]["content"].startswith("[分析摘要]")
    )
    if has_existing_summary:
        start_idx = 2

    # Messages to trim vs keep
    messages_to_consider = self.conversation_history[start_idx:]
    messages_to_trim = messages_to_consider[:-max_messages]
    messages_to_keep = messages_to_consider[-max_messages:]

    if not messages_to_trim:
        return

    # Generate summary of trimmed messages
    summary = self._compress_trimmed_messages(messages_to_trim)

    # Rebuild history: first_message + summary + recent messages
    self.conversation_history = [first_message]
    if summary:
        self.conversation_history.append({"role": "user", "content": summary})
    self.conversation_history.extend(messages_to_keep)


def _compress_trimmed_messages(self, messages: list) -> str:
    """Compress trimmed messages into a summary string."""
    summary_parts = ["[分析摘要] 以下是之前分析轮次的概要:"]
    round_num = 0

    for msg in messages:
        content = msg["content"]
        if msg["role"] == "assistant":
            round_num += 1
            # Extract action type from YAML-like content
            action = "generate_code"
            if "action: \"collect_figures\"" in content or "action: collect_figures" in content:
                action = "collect_figures"
            elif "action: \"analysis_complete\"" in content or "action: analysis_complete" in content:
                action = "analysis_complete"
            summary_parts.append(f"- 轮次{round_num}: 动作={action}")
        elif msg["role"] == "user" and "代码执行反馈" in content:
            success = "失败" if "[ERROR]" in content or "执行错误" in content else "成功"
            summary_parts[-1] += f", 执行结果={success}"

    return "\n".join(summary_parts)
```
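A small worked example of the intended behaviour (the numbers are chosen only for illustration): with `conversation_window_size = 2`, `max_messages` is 4. A history of 8 messages (the first user message plus 7 later ones, no existing summary) exceeds 4, so the 3 oldest post-initial messages are compressed and the history is rebuilt as the first message, one `[分析摘要]` summary message, and the 4 most recent messages — 6 messages in total.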
### 4. Template Integration (`data_analysis_agent.py` + `web/main.py`)

The `analyze()` method gains an optional `template_name` parameter. When provided, the template prompt is prepended to the user requirement.

**Agent side:**

```python
def analyze(self, user_input: str, files=None, session_output_dir=None,
            reset_session=True, max_rounds=None, template_name=None):
    # ... existing init code ...
    if template_name:
        from utils.analysis_templates import get_template
        template = get_template(template_name)  # Raises ValueError if invalid
        template_prompt = template.get_full_prompt()
        user_input = f"{template_prompt}\n\n{user_input}"
    # ... rest of analyze ...
```

**API side (`web/main.py`):**

```python
# New endpoint
@app.get("/api/templates")
async def list_available_templates():
    from utils.analysis_templates import list_templates
    return {"templates": list_templates()}


# Modified StartRequest
class StartRequest(BaseModel):
    requirement: str
    template: Optional[str] = None
```
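How the optional `template` field travels from the request into `analyze()` is left implicit above; a minimal sketch, assuming the helper name and the example template name (neither is part of the design):

```python
from typing import List, Optional

def analyze_kwargs_from_request(requirement: str, template: Optional[str],
                                files: List[str]) -> dict:
    """Map StartRequest fields onto DataAnalysisAgent.analyze() keyword arguments.
    An absent or empty template means the default, template-free flow (Req. 7.4)."""
    kwargs: dict = {"user_input": requirement, "files": files}
    if template:  # only forward a non-empty template name
        kwargs["template_name"] = template
    return kwargs

# analyze_kwargs_from_request("分析销售数据", "financial_report", ["sales.csv"])
# -> {'user_input': '分析销售数据', 'files': ['sales.csv'], 'template_name': 'financial_report'}
```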
### 5. Progress Bar Integration

**Backend (`web/main.py`):** Update `run_analysis_task` to set progress fields on `SessionData` via a callback or by polling the agent's `current_round`. The simplest approach is to add a progress callback to the agent.

```python
# In DataAnalysisAgent
def set_progress_callback(self, callback):
    """Set a callback function(current_round, max_rounds, message) for progress updates."""
    self._progress_callback = callback


# Called at the start of each round in the analyze() loop:
if hasattr(self, '_progress_callback') and self._progress_callback:
    self._progress_callback(self.current_round, self.max_rounds, f"第{self.current_round}轮分析中...")
```

**Backend (`web/main.py`):** In `run_analysis_task`, wire the callback:

```python
def progress_cb(current, total, message):
    session.current_round = current
    session.max_rounds = total
    session.progress_percentage = round((current / total) * 100, 1) if total > 0 else 0
    session.status_message = message

agent.set_progress_callback(progress_cb)
```

**API response:** Add progress fields to `GET /api/status`:

```python
return {
    "is_running": session.is_running,
    "log": log_content,
    "has_report": ...,
    "current_round": session.current_round,
    "max_rounds": session.max_rounds,
    "progress_percentage": session.progress_percentage,
    "status_message": session.status_message,
    ...
}
```

**Frontend (`script.js`):** During polling, render a progress bar when `is_running` is true:

```javascript
// In the polling callback:
if (data.is_running) {
    updateProgressBar(data.progress_percentage, data.status_message);
}
```

### 6. Multi-File Chunked & Parallel Loading

**Chunked loading enhancement (`utils/data_loader.py`):**

```python
def load_and_profile_data_smart(file_paths: list, max_file_size_mb: int = None) -> str:
    """Smart loader: uses chunked reading for large files, regular for small."""
    if max_file_size_mb is None:
        max_file_size_mb = app_config.max_file_size_mb

    profile_summary = "# 数据画像报告 (Data Profile)\n\n"
    for file_path in file_paths:
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        if file_size_mb > max_file_size_mb:
            profile_summary += _profile_chunked(file_path)
        else:
            profile_summary += _profile_full(file_path)
    return profile_summary


def _profile_chunked(file_path: str) -> str:
    """Profile a large file by reading first chunk + sampling subsequent chunks."""
    chunks = load_data_chunked(file_path)
    first_chunk = next(chunks, None)
    if first_chunk is None:
        return f"[ERROR] 无法读取文件: {file_path}\n"

    # Sample from subsequent chunks
    sample_rows = [first_chunk]
    for i, chunk in enumerate(chunks):
        if i % 5 == 0:  # Sample every 5th chunk
            sample_rows.append(chunk.sample(min(100, len(chunk))))

    combined = pd.concat(sample_rows, ignore_index=True)
    # Generate profile from combined sample
    return _generate_profile_for_df(combined, file_path, sampled=True)
```
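`_profile_full` and `_generate_profile_for_df` are referenced above but not defined in this design. A minimal sketch of the latter, under the assumption that the profile is a per-file markdown section with schema-level statistics (the real data_loader format may differ):

```python
import pandas as pd

def _generate_profile_for_df(df: pd.DataFrame, file_path: str, sampled: bool = False) -> str:
    """Sketch of the assumed profile helper: one markdown section per file,
    listing only schema-level metadata (no raw values)."""
    note = "(基于分块抽样)" if sampled else ""
    lines = [f"## 文件: {file_path} {note}", ""]
    lines.append("| 列名 | 数据类型 | 空值率 | 唯一值数量 |")
    lines.append("|------|----------|--------|------------|")
    for col in df.columns:
        null_rate = f"{df[col].isna().mean():.1%}"
        lines.append(f"| {col} | {df[col].dtype} | {null_rate} | {df[col].nunique()} |")
    return "\n".join(lines) + "\n\n"
```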
**Parallel profiling (`data_analysis_agent.py`):**

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def _profile_files_parallel(self, file_paths: list) -> tuple[str, str]:
    """Profile multiple files concurrently."""
    max_workers = app_config.max_parallel_profiles
    safe_profiles = []
    local_profiles = []

    def profile_single(path):
        safe = build_safe_profile([path])
        local = build_local_profile([path])
        return path, safe, local

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(profile_single, p): p for p in file_paths}
        for future in as_completed(futures):
            path = futures[future]
            try:
                _, safe, local = future.result()
                safe_profiles.append(safe)
                local_profiles.append(local)
            except Exception as e:
                error_entry = f"## 文件: {os.path.basename(path)}\n[ERROR] 分析失败: {e}\n\n"
                safe_profiles.append(error_entry)
                local_profiles.append(error_entry)

    return "\n".join(safe_profiles), "\n".join(local_profiles)
```
## Data Models

### AppConfig Extensions (`config/app_config.py`)

```python
@dataclass
class AppConfig:
    # ... existing fields ...

    # New fields
    max_data_context_retries: int = field(default=2)
    conversation_window_size: int = field(default=10)
    max_parallel_profiles: int = field(default=4)

    @classmethod
    def from_env(cls) -> 'AppConfig':
        config = cls()
        # ... existing env overrides ...
        if val := os.getenv("APP_MAX_DATA_CONTEXT_RETRIES"):
            config.max_data_context_retries = int(val)
        if val := os.getenv("APP_CONVERSATION_WINDOW_SIZE"):
            config.conversation_window_size = int(val)
        if val := os.getenv("APP_MAX_PARALLEL_PROFILES"):
            config.max_parallel_profiles = int(val)
        return config
```

### StartRequest Extension (`web/main.py`)

```python
class StartRequest(BaseModel):
    requirement: str
    template: Optional[str] = None  # New field
```

### SessionData Progress Fields (already exist, just need wiring)

The `SessionData` class already has `current_round`, `max_rounds`, `progress_percentage`, and `status_message` fields. These just need to be updated during analysis and included in the `/api/status` response.
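For reference, the relevant slice of `SessionData` is assumed to look roughly like this (field names come from the requirements; the dataclass form and defaults are illustrative):

```python
from dataclasses import dataclass

@dataclass
class SessionData:
    # ... existing session fields (upload paths, agent handle, log file, ...) ...
    is_running: bool = False
    current_round: int = 0
    max_rounds: int = 0
    progress_percentage: float = 0.0
    status_message: str = ""
```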
## Correctness Properties

*A property is a characteristic or behavior that should hold true across all valid executions of a system — essentially, a formal statement about what the system should do. Properties serve as the bridge between human-readable specifications and machine-verifiable correctness guarantees.*

### Property 1: Error Classification Correctness

*For any* error message string, if it contains a data-context pattern (KeyError on a column name, ValueError on column values, NameError for data variables, or empty DataFrame conditions), `_classify_error` SHALL return `"data_context"`; otherwise it SHALL return `"other"`.

**Validates: Requirements 1.1**

### Property 2: Retry Below Limit Produces Enriched Hint

*For any* `max_data_context_retries` value and any current retry count strictly less than that value, when a data-context error is detected, the agent SHALL produce an enriched hint message rather than forwarding the raw error.

**Validates: Requirements 1.3**

### Property 3: Enriched Hint Contains Correct Column Metadata Without Real Data

*For any* error message referencing a column name present in the Safe_Profile, the generated enriched hint SHALL contain that column's data type, unique value count, null rate, and categorical description, and SHALL NOT contain any real data values (min, max, mean, sample rows) from the Local_Profile.

**Validates: Requirements 2.1, 2.2, 2.4**

### Property 4: Environment Variable Override for Config Fields

*For any* positive integer value set as the `APP_MAX_DATA_CONTEXT_RETRIES` environment variable, `AppConfig.from_env()` SHALL produce a config where `max_data_context_retries` equals that integer value.

**Validates: Requirements 3.2**

### Property 5: Sliding Window Trimming Preserves First Message and Retains Recent Pairs

*For any* conversation history whose length exceeds `2 * conversation_window_size` and any `conversation_window_size >= 1`, after trimming: (a) the first user message is always retained at index 0, and (b) the most recent `conversation_window_size` message pairs are retained in full.

**Validates: Requirements 4.2, 4.3**

### Property 6: Trimming Summary Contains Round Info and Excludes Code/Raw Output

*For any* set of trimmed conversation messages, the generated summary SHALL list each trimmed round's action type and execution success/failure, and SHALL NOT contain any code blocks (``` markers) or raw execution output.

**Validates: Requirements 4.4, 5.1, 5.2**

### Property 7: Template Prompt Integration

*For any* valid template name in `TEMPLATE_REGISTRY` and any user requirement string, the initial conversation message SHALL contain the template's `get_full_prompt()` output prepended to the user requirement.

**Validates: Requirements 6.1, 6.2**

### Property 8: Invalid Template Name Raises Descriptive Error

*For any* string that is not a key in `TEMPLATE_REGISTRY`, calling `get_template()` SHALL raise a `ValueError` whose message contains the list of available template names.

**Validates: Requirements 6.3**

### Property 9: Chunked Loading Threshold

*For any* file path and `max_file_size_mb` threshold, if the file's size in MB exceeds the threshold, the smart loader SHALL use chunked loading; otherwise it SHALL use full loading.

**Validates: Requirements 10.1**

### Property 10: Chunked Profiling Uses First Chunk Plus Samples

*For any* file loaded in chunked mode, the generated profile SHALL be based on the first chunk plus sampled rows from subsequent chunks, not from the entire file loaded into memory.

**Validates: Requirements 10.3**

### Property 11: Parallel Profile Merge With Error Resilience

*For any* set of file paths where some are valid and some are invalid/corrupted, the merged profile output SHALL contain valid profile entries for successful files and error entries for failed files, with no files missing from the output.

**Validates: Requirements 11.2, 11.3**

## Error Handling

| Scenario | Handling Strategy |
|----------|------------------|
| Data-context error below retry limit | Generate enriched hint, retry with LLM |
| Data-context error at retry limit | Fall back to normal sanitized error forwarding |
| Invalid template name | Raise `ValueError` with available template list |
| File too large for memory | Automatically switch to chunked loading |
| Chunked loading fails | Return descriptive error, continue with other files |
| Single file profiling fails in parallel | Include error entry, continue profiling remaining files |
| Conversation history exceeds window | Trim old messages, generate compressed summary |
| Summary generation fails | Log warning, proceed without summary (graceful degradation) |
| Progress callback fails | Log warning, analysis continues without progress updates |

## Testing Strategy

### Property-Based Tests (using `hypothesis`)

Each correctness property maps to a property-based test with a minimum of 100 iterations. The test library is `hypothesis` (Python).

- **Property 1**: Generate random error strings with/without data-context patterns → verify classification
- **Property 2**: Generate random retry counts and limits → verify hint vs raw error behavior
- **Property 3**: Generate random Safe_Profile tables and error messages → verify hint content and absence of real data
- **Property 4**: Generate random positive integers → set env var → verify config
- **Property 5**: Generate random conversation histories and window sizes → verify trimming invariants
- **Property 6**: Generate random trimmed message sets → verify summary content and absence of code blocks
- **Property 7**: Pick random valid template names and requirement strings → verify prompt construction
- **Property 8**: Generate random strings not in registry → verify ValueError
- **Property 9**: Generate random file sizes and thresholds → verify loading method selection
- **Property 10**: Generate random chunked data → verify profile source
- **Property 11**: Generate random file sets with failures → verify merged output

Tag format: `Feature: agent-robustness-optimization, Property {N}: {title}`
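As a concrete illustration of the intended shape of these tests, a sketch of the Property 1 test follows; the test module path, the import, and the trick of calling the unbound method (valid only because `_classify_error` never touches `self`) are assumptions, not part of the design:

```python
# tests/test_properties.py — sketch only.
from hypothesis import given, settings, strategies as st

from data_analysis_agent import DataAnalysisAgent

DATA_CONTEXT_SNIPPETS = [
    "KeyError: 'revenue'",
    "ValueError: unknown column 'date'",
    "NameError: name 'df' is not defined",
    "query returned 0 rows",
]

@settings(max_examples=100)
@given(snippet=st.sampled_from(DATA_CONTEXT_SNIPPETS),
       prefix=st.text(alphabet="abcdefghijklmnopqrstuvwxyz ", max_size=40))
def test_property_1_data_context_errors(snippet, prefix):
    # Feature: agent-robustness-optimization, Property 1: Error Classification Correctness
    # Any message containing a data-context pattern must classify as "data_context".
    assert DataAnalysisAgent._classify_error(None, prefix + snippet) == "data_context"
```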
### Unit Tests

- Error classifier with specific known error messages (KeyError, ValueError, NameError, generic errors)
- Enriched hint generation with known column profiles
- Conversation trimming with exact message counts at boundary conditions
- Template retrieval for each registered template
- Progress callback wiring
- API endpoint response shapes (`GET /api/templates`, `GET /api/status` with progress fields)

### Integration Tests

- `GET /api/templates` returns all registered templates
- `POST /api/start` with `template` field passes template to agent
- `GET /api/status` includes progress fields during analysis
- Multi-file parallel profiling with real CSV files
- End-to-end: start analysis with template → verify template prompt in conversation history
.kiro/specs/agent-robustness-optimization/requirements.md — new file, 142 lines

@@ -0,0 +1,142 @@
# Requirements Document

## Introduction

This document specifies the requirements for improving the robustness, efficiency, and usability of the AI Data Analysis Agent. The improvements span five areas: a data privacy fallback mechanism for recovering from LLM-generated code failures when real data is unavailable, conversation history trimming to reduce token consumption and prevent data leakage, integration of the existing analysis template system, frontend progress bar display, and multi-file parallel/chunked analysis support.

## Glossary

- **Agent**: The `DataAnalysisAgent` class in `data_analysis_agent.py` that orchestrates LLM calls and IPython code execution for data analysis.
- **Safe_Profile**: The schema-only data description generated by `build_safe_profile()` in `utils/data_privacy.py`, containing column names, data types, null rates, and unique value counts — but no real data values.
- **Local_Profile**: The full data profile generated by `build_local_profile()` containing real data values, statistics, and sample rows — used only in the local execution environment.
- **Code_Executor**: The `CodeExecutor` class in `utils/code_executor.py` that runs Python code in an IPython sandbox and returns execution results.
- **Conversation_History**: The list of `{"role": ..., "content": ...}` message dictionaries maintained by the Agent across analysis rounds.
- **Feedback_Sanitizer**: The `sanitize_execution_feedback()` function in `utils/data_privacy.py` that removes real data values from execution output before sending to the LLM.
- **Template_Registry**: The `TEMPLATE_REGISTRY` dictionary in `utils/analysis_templates.py` mapping template names to template classes.
- **Session_Data**: The `SessionData` class in `web/main.py` that tracks session state including `progress_percentage`, `current_round`, `max_rounds`, and `status_message`.
- **Polling_Loop**: The `setInterval`-based polling mechanism in `web/static/script.js` that fetches `/api/status` every 2 seconds.
- **Data_Loader**: The module `utils/data_loader.py` providing `load_and_profile_data`, `load_data_chunked`, and `load_data_with_cache` functions.
- **AppConfig**: The `AppConfig` dataclass in `config/app_config.py` holding configuration values such as `max_rounds`, `chunk_size`, and `max_file_size_mb`.

## Requirements

### Requirement 1: Data Privacy Fallback — Error Detection

**User Story:** As a system operator, I want the Agent to detect when LLM-generated code fails due to missing real data context, so that the system can attempt intelligent recovery instead of wasting an analysis round.

#### Acceptance Criteria

1. WHEN the Code_Executor returns a failed execution result, THE Agent SHALL classify the error as either a data-context error or a non-data error by inspecting the error message for patterns such as `KeyError`, `ValueError` on column values, `NameError` for undefined data variables, or empty DataFrame conditions.
2. WHEN a data-context error is detected, THE Agent SHALL increment a per-round retry counter for the current analysis round.
3. WHILE the retry counter for a given round is below the configured maximum retry limit, THE Agent SHALL attempt recovery by generating an enriched hint prompt rather than forwarding the raw error to the LLM as a normal failure.
4. IF the retry counter reaches the configured maximum retry limit, THEN THE Agent SHALL fall back to normal error handling by forwarding the sanitized error feedback to the LLM and proceeding to the next round.

### Requirement 2: Data Privacy Fallback — Enriched Hint Generation

**User Story:** As a system operator, I want the Agent to provide the LLM with enriched schema hints when data-context errors occur, so that the LLM can generate corrected code without receiving raw data values.

#### Acceptance Criteria

1. WHEN a data-context error is detected and retry is permitted, THE Agent SHALL generate an enriched hint containing the relevant column's data type, unique value count, null rate, and a categorical description (e.g., "low-cardinality category with 5 classes") extracted from the Safe_Profile.
2. WHEN the error involves a specific column name referenced in the error message, THE Agent SHALL include that column's schema metadata in the enriched hint.
3. THE Agent SHALL append the enriched hint to the conversation history as a user message with a prefix indicating it is a retry context, before requesting a new LLM response.
4. THE Agent SHALL NOT include any real data values, sample rows, or statistical values (min, max, mean) from the Local_Profile in the enriched hint sent to the LLM.

### Requirement 3: Data Privacy Fallback — Configuration

**User Story:** As a system operator, I want to configure the maximum number of data-context retries, so that I can balance between recovery attempts and analysis throughput.

#### Acceptance Criteria

1. THE AppConfig SHALL include a `max_data_context_retries` field with a default value of 2.
2. WHEN the `APP_MAX_DATA_CONTEXT_RETRIES` environment variable is set, THE AppConfig SHALL use its integer value to override the default.
3. THE Agent SHALL read the `max_data_context_retries` value from AppConfig during initialization.

### Requirement 4: Conversation History Trimming — Sliding Window

**User Story:** As a system operator, I want the conversation history to be trimmed using a sliding window, so that token consumption stays bounded and early execution results containing potential data leakage are removed.

#### Acceptance Criteria

1. THE AppConfig SHALL include a `conversation_window_size` field with a default value of 10, representing the maximum number of recent message pairs to retain in full.
2. WHEN the Conversation_History length exceeds twice the `conversation_window_size` (counting individual messages), THE Agent SHALL retain only the most recent `conversation_window_size` pairs of messages in full detail.
3. THE Agent SHALL always retain the first user message (containing the original requirement and Safe_Profile) regardless of window trimming.
4. WHEN messages are trimmed from the Conversation_History, THE Agent SHALL generate a compressed summary of the trimmed messages and prepend it after the first user message.

### Requirement 5: Conversation History Trimming — Summary Compression

**User Story:** As a system operator, I want trimmed conversation rounds to be compressed into a summary, so that the LLM retains awareness of prior analysis steps without consuming excessive tokens.

#### Acceptance Criteria

1. WHEN conversation messages are trimmed, THE Agent SHALL produce a summary string that lists each trimmed round's action type (generate_code, collect_figures), a one-line description of what was done, and whether execution succeeded or failed.
2. THE summary SHALL NOT contain any code blocks, raw execution output, or data values from prior rounds.
3. THE summary SHALL be inserted into the Conversation_History as a single user message immediately after the first user message, replacing any previous summary message.
4. IF no messages have been trimmed, THEN THE Agent SHALL NOT insert a summary message.

### Requirement 6: Analysis Template System — Backend Integration

**User Story:** As a user, I want to select a predefined analysis template when starting an analysis, so that the Agent follows a structured analysis plan tailored to my scenario.

#### Acceptance Criteria

1. WHEN a template name is provided in the analysis request, THE Agent SHALL retrieve the corresponding template from the Template_Registry using the `get_template()` function.
2. WHEN a valid template is retrieved, THE Agent SHALL call `get_full_prompt()` on the template and prepend the resulting structured prompt to the user's requirement in the initial conversation message.
3. IF an invalid template name is provided, THEN THE Agent SHALL raise a descriptive error listing available template names.
4. WHEN no template name is provided, THE Agent SHALL proceed with the default unstructured analysis flow.

### Requirement 7: Analysis Template System — API Endpoint

**User Story:** As a frontend developer, I want API endpoints to list available templates and to accept a template selection when starting analysis, so that the frontend can offer template choices to users.

#### Acceptance Criteria

1. THE FastAPI server SHALL expose a `GET /api/templates` endpoint that returns the list of available templates by calling `list_templates()`, with each entry containing `name`, `display_name`, and `description`.
2. THE `POST /api/start` request body SHALL accept an optional `template` field containing the template name string.
3. WHEN the `template` field is present in the start request, THE FastAPI server SHALL pass the template name to the Agent's `analyze()` method.
4. WHEN the `template` field is absent or empty, THE FastAPI server SHALL start analysis without a template.

### Requirement 8: Analysis Template System — Frontend Template Selector

**User Story:** As a user, I want to see and select analysis templates in the web interface before starting analysis, so that I can choose a structured analysis approach.

#### Acceptance Criteria

1. WHEN the web page loads, THE frontend SHALL fetch the template list from `GET /api/templates` and render selectable template cards above the requirement input area.
2. WHEN a user selects a template card, THE frontend SHALL visually highlight the selected template and store the template name.
3. WHEN the user clicks "Start Analysis" with a template selected, THE frontend SHALL include the template name in the `POST /api/start` request body.
4. THE frontend SHALL provide a "No Template (Free Analysis)" option that is selected by default, allowing users to proceed without a template.

### Requirement 9: Frontend Progress Bar Display

**User Story:** As a user, I want to see a real-time progress bar during analysis, so that I can understand how far the analysis has progressed.

#### Acceptance Criteria

1. THE FastAPI server SHALL update the Session_Data's `current_round`, `max_rounds`, `progress_percentage`, and `status_message` fields during each analysis round in the `run_analysis_task` function.
2. THE `GET /api/status` response SHALL include `current_round`, `max_rounds`, `progress_percentage`, and `status_message` fields.
3. WHEN the Polling_Loop receives status data with `is_running` equal to true, THE frontend SHALL render a progress bar element showing the `progress_percentage` value and the `status_message` text.
4. WHEN `progress_percentage` changes between polls, THE frontend SHALL animate the progress bar width transition smoothly.
5. WHEN `is_running` becomes false, THE frontend SHALL set the progress bar to 100% and display a completion message.

### Requirement 10: Multi-File Chunked Loading

**User Story:** As a user, I want large data files to be loaded in chunks, so that the system can handle files that exceed available memory.

#### Acceptance Criteria

1. WHEN a data file's size exceeds the `max_file_size_mb` threshold in AppConfig, THE Data_Loader SHALL use `load_data_chunked()` to stream the file in chunks of `chunk_size` rows instead of loading the entire file into memory.
2. WHEN chunked loading is used, THE Agent SHALL instruct the Code_Executor to make the chunked iterator available in the notebook environment as a variable, so that LLM-generated code can process data in chunks.
3. WHEN chunked loading is used for profiling, THE Agent SHALL generate the Safe_Profile by reading only the first chunk plus sampling from subsequent chunks, rather than loading the entire file.
4. IF a file cannot be loaded even in chunked mode, THEN THE Data_Loader SHALL return a descriptive error message indicating the failure reason.

### Requirement 11: Multi-File Parallel Profiling

**User Story:** As a user, I want multiple data files to be profiled concurrently, so that the initial data exploration phase completes faster when multiple files are uploaded.

#### Acceptance Criteria

1. WHEN multiple files are provided for analysis, THE Agent SHALL profile each file concurrently using thread-based parallelism rather than sequentially.
2. THE Agent SHALL collect all profiling results and merge them into a single Safe_Profile string and a single Local_Profile string, maintaining the same format as the current sequential output.
3. IF any individual file profiling fails, THEN THE Agent SHALL include an error entry for that file in the profile output and continue profiling the remaining files.
4. THE AppConfig SHALL include a `max_parallel_profiles` field with a default value of 4, controlling the maximum number of concurrent profiling threads.
.kiro/specs/agent-robustness-optimization/tasks.md — new file, 74 lines

@@ -0,0 +1,74 @@
# Tasks — Agent Robustness Optimization

## Priority 1: Configuration Foundation

- [x] 1. Add new config fields to AppConfig
  - [x] 1.1 Add `max_data_context_retries` field (default=2) with `APP_MAX_DATA_CONTEXT_RETRIES` env override to `config/app_config.py`
  - [x] 1.2 Add `conversation_window_size` field (default=10) with `APP_CONVERSATION_WINDOW_SIZE` env override to `config/app_config.py`
  - [x] 1.3 Add `max_parallel_profiles` field (default=4) with `APP_MAX_PARALLEL_PROFILES` env override to `config/app_config.py`

## Priority 2: Data Privacy Fallback (R1–R3)

- [ ] 2. Implement error classification
  - [-] 2.1 Add `_classify_error(error_message: str) -> str` method to `DataAnalysisAgent` in `data_analysis_agent.py` with regex patterns for KeyError, ValueError, NameError, empty DataFrame
  - [-] 2.2 Add `_extract_column_from_error(error_message: str) -> Optional[str]` function to `utils/data_privacy.py`
  - [-] 2.3 Add `_lookup_column_in_profile(column_name, safe_profile) -> Optional[dict]` function to `utils/data_privacy.py`
- [ ] 3. Implement enriched hint generation
  - [-] 3.1 Add `generate_enriched_hint(error_message: str, safe_profile: str) -> str` function to `utils/data_privacy.py`
  - [-] 3.2 Integrate retry logic into the `analyze()` loop in `data_analysis_agent.py`: add per-round retry counter, call `_classify_error` on failures, generate enriched hint when below retry limit, fall back to normal error handling at limit

## Priority 3: Conversation History Trimming (R4–R5)

- [ ] 4. Implement conversation trimming
  - [~] 4.1 Add `_trim_conversation_history()` method to `DataAnalysisAgent` implementing sliding window with first-message preservation
  - [~] 4.2 Add `_compress_trimmed_messages(messages: list) -> str` method to `DataAnalysisAgent` that generates summary with action types and success/failure, excluding code blocks and raw output
  - [~] 4.3 Call `_trim_conversation_history()` at the start of each round in the `analyze()` loop, after the first round

## Priority 4: Analysis Template System (R6–R8)

- [ ] 5. Backend template integration
  - [~] 5.1 Add optional `template_name` parameter to `DataAnalysisAgent.analyze()` method; retrieve template via `get_template()`, prepend `get_full_prompt()` to user requirement
  - [~] 5.2 Add `GET /api/templates` endpoint to `web/main.py` returning `list_templates()` result
  - [~] 5.3 Add optional `template` field to `StartRequest` model in `web/main.py`; pass template name to agent in `run_analysis_task`
- [ ] 6. Frontend template selector
  - [~] 6.1 Add template selector HTML section (cards above requirement input) to `web/static/index.html`
  - [~] 6.2 Add template fetching, selection logic, and "No Template" default to `web/static/script.js`
  - [~] 6.3 Add template card styles (`.template-card`, `.template-card.selected`) to `web/static/clean_style.css`

## Priority 5: Frontend Progress Bar (R9)

- [ ] 7. Backend progress updates
  - [~] 7.1 Add `set_progress_callback(callback)` method to `DataAnalysisAgent`; call callback at start of each round in `analyze()` loop
  - [~] 7.2 Wire progress callback in `run_analysis_task` in `web/main.py` to update `SessionData` progress fields
  - [~] 7.3 Add `current_round`, `max_rounds`, `progress_percentage`, `status_message` to `GET /api/status` response in `web/main.py`
- [ ] 8. Frontend progress bar
  - [~] 8.1 Add progress bar HTML element below the status bar area in `web/static/index.html`
  - [~] 8.2 Add `updateProgressBar(percentage, message)` function to `web/static/script.js`; call it during polling when `is_running` is true; set to 100% on completion
  - [~] 8.3 Add progress bar styles with CSS transition animation to `web/static/clean_style.css`

## Priority 6: Multi-File Chunked & Parallel Loading (R10–R11)

- [ ] 9. Chunked loading enhancement
  - [~] 9.1 Add `_profile_chunked(file_path: str) -> str` function to `utils/data_loader.py` that profiles using first chunk + sampled subsequent chunks
  - [~] 9.2 Add `load_and_profile_data_smart(file_paths, max_file_size_mb) -> str` function to `utils/data_loader.py` that selects chunked vs full loading based on file size threshold
  - [~] 9.3 Update `DataAnalysisAgent.analyze()` to use smart loader and expose chunked iterator in Code_Executor namespace for large files
- [ ] 10. Parallel profiling
  - [~] 10.1 Add `_profile_files_parallel(file_paths: list) -> tuple[str, str]` method to `DataAnalysisAgent` using `ThreadPoolExecutor` with `max_parallel_profiles` workers
  - [~] 10.2 Update `DataAnalysisAgent.analyze()` to call `_profile_files_parallel` when multiple files are provided, replacing sequential `build_safe_profile` + `build_local_profile` calls

## Priority 7: Testing

- [ ] 11. Write property-based tests
  - [ ] 11.1 ~PBT~ Property test for error classification correctness (Property 1) using `hypothesis`
  - [ ] 11.2 ~PBT~ Property test for enriched hint content and privacy (Property 3) using `hypothesis`
  - [ ] 11.3 ~PBT~ Property test for env var config override (Property 4) using `hypothesis`
  - [ ] 11.4 ~PBT~ Property test for sliding window trimming invariants (Property 5) using `hypothesis`
  - [ ] 11.5 ~PBT~ Property test for trimming summary content (Property 6) using `hypothesis`
  - [ ] 11.6 ~PBT~ Property test for template prompt integration (Property 7) using `hypothesis`
  - [ ] 11.7 ~PBT~ Property test for invalid template error (Property 8) using `hypothesis`
  - [ ] 11.8 ~PBT~ Property test for parallel profile merge with error resilience (Property 11) using `hypothesis`
- [ ] 12. Write unit and integration tests
  - [ ] 12.1 Unit tests for error classifier with known error messages
  - [ ] 12.2 Unit tests for conversation trimming at boundary conditions
  - [ ] 12.3 Integration tests for `GET /api/templates` and `POST /api/start` with template field
  - [ ] 12.4 Integration tests for `GET /api/status` progress fields
LICENSE — deleted, 21 lines removed

@@ -1,21 +0,0 @@
MIT License

Copyright (c) 2025 Data Analysis Agent Team

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md — 50 changed lines

@@ -31,7 +31,9 @@ data_analysis_agent/
│   ├── fallback_openai_client.py  # 支持故障转移的OpenAI客户端
│   ├── extract_code.py            # 代码提取工具
│   ├── format_execution_result.py # 执行结果格式化
│   └── create_session_dir.py      # 会话目录管理
│   ├── create_session_dir.py      # 会话目录管理
│   ├── data_loader.py             # 数据加载与画像生成
│   └── script_generator.py        # 可复用脚本生成器
├── 📄 data_analysis_agent.py      # 主智能体类
├── 📄 prompts.py                  # 系统提示词模板
├── 📄 main.py                     # 使用示例

@@ -160,7 +162,7 @@ agent = DataAnalysisAgent(llm_config)
# 开始分析
files = ["your_data.csv"]
report = agent.analyze(
    user_input="分析销售数据,生成趋势图表和关键指标",
    user_input="分析XXXXXXXXX数据,生成趋势图表和关键指标",
    files=files
)

@@ -191,9 +193,9 @@ report = quick_analysis(

```python
# 示例:茅台财务分析
files = ["贵州茅台利润表.csv"]
files = ["XXXXXXXXx.csv"]
report = agent.analyze(
    user_input="基于贵州茅台的数据,输出五个重要的统计指标,并绘制相关图表。最后生成汇报给我。",
    user_input="基于数据,输出五个重要的统计指标,并绘制相关图表。最后生成汇报给我。",
    files=files
)
```

@@ -207,6 +209,33 @@ report = agent.analyze(
- 📋 营业成本占比分析
- 📄 综合分析报告

## 🌐 Web界面可视化

本项目提供了现代化的Web界面,支持零代码交互。

### 启动方式

**macOS/Linux:**
```bash
./start_web.sh
```

**Windows:**
```bash
start_web.bat
```

访问地址: `http://localhost:8000`

### 核心功能 (Web)

- **🖼️ 图表画廊 (Gallery)**: 网格化展示所有生成图表,每张图表附带AI生成的分析解读。
- **📜 实时日志**: 像黑客帝国一样实时查看后台分析过程和Agent的思考逻辑。
- **📦 一键导出**: 支持一键下载包含 Markdown 报告和所有高清原图的 ZIP 压缩包。
- **🛠️ 数据工具箱**:
  - **Excel合并**: 将多个同构 Excel 文件快速合并为分析可用的 CSV。
  - **时间排序**: 自动修复 CSV 数据的乱序问题,确保时序分析准确。

## 🎨 流程可视化

### 📊 分析过程状态图

@@ -239,12 +268,15 @@ stateDiagram-v2
```python
@dataclass
class LLMConfig:
    provider: str = "openai"
    provider: str = os.environ.get("LLM_PROVIDER", "openai")
    api_key: str = os.environ.get("OPENAI_API_KEY", "")
    base_url: str = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
    model: str = os.environ.get("OPENAI_MODEL", "gpt-4")
    max_tokens: int = 4000
    temperature: float = 0.1
    temperature: float = 0.5
    max_tokens: int = 8192

    # 支持 gemini 等其他 provider 配置
    # ...
```

### 执行器配置

@@ -254,7 +286,9 @@ class LLMConfig:
ALLOWED_IMPORTS = {
    'pandas', 'numpy', 'matplotlib', 'duckdb',
    'scipy', 'sklearn', 'plotly', 'requests',
    'os', 'json', 'datetime', 're', 'pathlib'
    'os', 'json', 'datetime', 're', 'pathlib',
    'seaborn', 'statsmodels', 'networkx', 'jieba',
    'wordcloud', 'PIL', 'sqlite3', 'yaml'
}
```
bootstrap.py — new file, 62 lines

@@ -0,0 +1,62 @@
import sys
import subprocess
import importlib.metadata
import os

def check_dependencies():
    """Checks if dependencies in requirements.txt are installed."""
    requirements_file = "requirements.txt"
    if not os.path.exists(requirements_file):
        print(f"Warning: {requirements_file} not found. Skipping dependency check.")
        return

    print("Checking dependencies...")
    missing_packages = []

    with open(requirements_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            # Simple parsing for package name.
            # This handles 'package>=version', 'package==version', 'package'
            # It does NOT handle complex markers perfectly, but suffices for basic checking.
            package_name = line.split("=")[0].split(">")[0].split("<")[0].strip()

            try:
                importlib.metadata.version(package_name)
            except importlib.metadata.PackageNotFoundError:
                missing_packages.append(line)

    if missing_packages:
        print(f"Missing dependencies: {', '.join(missing_packages)}")
        print("Installing missing dependencies...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-r", requirements_file])
            print("Dependencies installed successfully.")
        except subprocess.CalledProcessError as e:
            print(f"Error installing dependencies: {e}")
            sys.exit(1)
    else:
        print("All dependencies checked.")

def main():
    check_dependencies()

    print("Starting application...")
    try:
        # Run the main application
        # Using sys.executable ensures we use the same python interpreter
        subprocess.run([sys.executable, "main.py"], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Application exited with error: {e}")
        sys.exit(e.returncode)
    except KeyboardInterrupt:
        print("\nApplication stopped by user.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
cleaned_data/.gitkeep — new file, 0 lines

config/__init__.py
@@ -4,5 +4,6 @@
"""

from .llm_config import LLMConfig
from .app_config import AppConfig, app_config

__all__ = ['LLMConfig']
__all__ = ['LLMConfig', 'AppConfig', 'app_config']
config/app_config.py — new file, 93 lines

@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
"""
应用配置中心 - 集中管理所有配置项
"""

import os
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class AppConfig:
    """应用配置中心"""

    # 分析配置
    max_rounds: int = field(default=20)
    force_max_rounds: bool = field(default=False)
    default_output_dir: str = field(default="outputs")

    # 数据处理配置
    max_file_size_mb: int = field(default=500)  # 最大文件大小(MB)
    chunk_size: int = field(default=100000)  # 分块读取大小
    data_cache_enabled: bool = field(default=True)
    cache_dir: str = field(default=".cache/data")

    # LLM配置
    llm_cache_enabled: bool = field(default=True)
    llm_cache_dir: str = field(default=".cache/llm")
    llm_stream_enabled: bool = field(default=False)

    # 代码执行配置
    code_timeout: int = field(default=300)  # 代码执行超时(秒)
    allowed_imports: List[str] = field(default_factory=lambda: [
        'pandas', 'numpy', 'matplotlib', 'seaborn', 'plotly',
        'scipy', 'sklearn', 'duckdb', 'datetime', 'json',
        'os', 're', 'pathlib', 'glob', 'typing', 'collections',
        'itertools', 'functools', 'warnings'
    ])

    # Web配置
    web_host: str = field(default="0.0.0.0")
    web_port: int = field(default=8000)
    upload_dir: str = field(default="uploads")

    # 日志配置
    log_filename: str = field(default="log.txt")
    enable_code_logging: bool = field(default=False)  # 是否记录生成的代码

    # 健壮性配置
    max_data_context_retries: int = field(default=2)  # 数据上下文错误最大重试次数
    conversation_window_size: int = field(default=10)  # 对话历史滑动窗口大小(消息对数)
    max_parallel_profiles: int = field(default=4)  # 并行数据画像最大线程数

    @classmethod
    def from_env(cls) -> 'AppConfig':
        """从环境变量创建配置"""
        config = cls()

        # 从环境变量覆盖配置
        if max_rounds := os.getenv("APP_MAX_ROUNDS"):
            config.max_rounds = int(max_rounds)

        if chunk_size := os.getenv("APP_CHUNK_SIZE"):
            config.chunk_size = int(chunk_size)

        if cache_enabled := os.getenv("APP_CACHE_ENABLED"):
            config.data_cache_enabled = cache_enabled.lower() == "true"

        if val := os.getenv("APP_MAX_DATA_CONTEXT_RETRIES"):
            config.max_data_context_retries = int(val)
        if val := os.getenv("APP_CONVERSATION_WINDOW_SIZE"):
            config.conversation_window_size = int(val)
        if val := os.getenv("APP_MAX_PARALLEL_PROFILES"):
            config.max_parallel_profiles = int(val)

        return config

    def validate(self) -> bool:
        """验证配置"""
        if self.max_rounds <= 0:
            raise ValueError("max_rounds must be positive")

        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be positive")

        if self.code_timeout <= 0:
            raise ValueError("code_timeout must be positive")

        return True


# 全局配置实例
app_config = AppConfig.from_env()
@@ -17,12 +17,25 @@ load_dotenv()
class LLMConfig:
    """LLM配置"""

    provider: str = "openai"  # openai, anthropic, etc.
    api_key: str = os.environ.get("OPENAI_API_KEY", "sk-c44i1hy64xgzwox6x08o4zug93frq6rgn84oqugf2pje1tg4")
    base_url: str = os.environ.get("OPENAI_BASE_URL", "https://api.xiaomimimo.com/v1")
    model: str = os.environ.get("OPENAI_MODEL", "mimo-v2-flash")
    provider: str = os.environ.get("LLM_PROVIDER", "openai")  # openai, gemini, etc.
    api_key: str = os.environ.get("OPENAI_API_KEY", "")
    base_url: str = os.environ.get("OPENAI_BASE_URL", "http://127.0.0.1:9999/v1")
    model: str = os.environ.get("OPENAI_MODEL", "gemini-3-flash")
    temperature: float = 0.5
    max_tokens: int = 131072
    max_tokens: int = 8192  # 降低默认值,避免某些API不支持过大的值

    def __post_init__(self):
        """配置初始化后的处理"""
        if self.provider == "gemini":
            # 如果使用 Gemini,尝试从环境变量加载 Gemini 配置,或者使用默认的 Gemini 配置
            # 注意:如果 OPENAI_API_KEY 已设置且 GEMINI_API_KEY 未设置,可能会沿用 OpenAI 的 Key,
            # 但既然用户切换了 provider,通常会有配套的 Key。
            self.api_key = os.environ.get("GEMINI_API_KEY", "")
            # Gemini 的 OpenAI 兼容接口地址
            self.base_url = os.environ.get("GEMINI_BASE_URL", "https://gemini.jeason.online")
            self.model = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")
            # Gemini 有更严格的 token 限制
            self.max_tokens = 8192

    def to_dict(self) -> Dict[str, Any]:
        """转换为字典"""
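A hedged usage sketch of the provider switch shown above. It assumes `LLMConfig` is a `@dataclass` (so `__post_init__` runs) and that the environment variables are set before the module is imported, since the class attributes read `os.environ` at import time; the key value is a placeholder:

```python
import os

# Set before importing config.llm_config, because defaults are read at import time.
os.environ["LLM_PROVIDER"] = "gemini"
os.environ["GEMINI_API_KEY"] = "your-gemini-api-key"  # placeholder

from config.llm_config import LLMConfig

cfg = LLMConfig()
print(cfg.provider)    # "gemini"
print(cfg.base_url)    # GEMINI_BASE_URL if set, else the default Gemini-compatible endpoint
print(cfg.max_tokens)  # 8192 - tightened for Gemini in __post_init__
```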
@@ -18,8 +18,10 @@ from utils.extract_code import extract_code_from_response
|
||||
from utils.data_loader import load_and_profile_data
|
||||
from utils.llm_helper import LLMHelper
|
||||
from utils.code_executor import CodeExecutor
|
||||
from utils.script_generator import generate_reusable_script
|
||||
from utils.data_privacy import build_safe_profile, build_local_profile, sanitize_execution_feedback
|
||||
from config.llm_config import LLMConfig
|
||||
from prompts import data_analysis_system_prompt, final_report_system_prompt
|
||||
from prompts import data_analysis_system_prompt, final_report_system_prompt, data_analysis_followup_prompt
|
||||
|
||||
|
||||
class DataAnalysisAgent:
|
||||
@@ -60,7 +62,10 @@ class DataAnalysisAgent:
|
||||
self.current_round = 0
|
||||
self.session_output_dir = None
|
||||
self.executor = None
|
||||
self.data_profile = "" # 存储数据画像
|
||||
self.data_profile = "" # 存储数据画像(完整版,本地使用)
|
||||
self.data_profile_safe = "" # 存储安全画像(发给LLM)
|
||||
self.data_files = [] # 存储数据文件列表
|
||||
self.user_requirement = "" # 存储用户需求
|
||||
|
||||
def _process_response(self, response: str) -> Dict[str, Any]:
|
||||
"""
|
||||
@@ -76,7 +81,7 @@ class DataAnalysisAgent:
|
||||
yaml_data = self.llm.parse_yaml_response(response)
|
||||
action = yaml_data.get("action", "generate_code")
|
||||
|
||||
print(f"🎯 检测到动作: {action}")
|
||||
print(f"[TARGET] 检测到动作: {action}")
|
||||
|
||||
if action == "analysis_complete":
|
||||
return self._handle_analysis_complete(response, yaml_data)
|
||||
@@ -85,18 +90,22 @@ class DataAnalysisAgent:
|
||||
elif action == "generate_code":
|
||||
return self._handle_generate_code(response, yaml_data)
|
||||
else:
|
||||
print(f"⚠️ 未知动作类型: {action},按generate_code处理")
|
||||
print(f"[WARN] 未知动作类型: {action},按generate_code处理")
|
||||
return self._handle_generate_code(response, yaml_data)
|
||||
|
||||
except Exception as e:
|
||||
print(f"⚠️ 解析响应失败: {str(e)},按generate_code处理")
|
||||
print(f"[WARN] 解析响应失败: {str(e)},尝试提取代码并按generate_code处理")
|
||||
# 即使YAML解析失败,也尝试提取代码
|
||||
extracted_code = extract_code_from_response(response)
|
||||
if extracted_code:
|
||||
return self._handle_generate_code(response, {"code": extracted_code})
|
||||
return self._handle_generate_code(response, {})
|
||||
|
||||
def _handle_analysis_complete(
|
||||
self, response: str, yaml_data: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
"""处理分析完成动作"""
|
||||
print("✅ 分析任务完成")
|
||||
print("[OK] 分析任务完成")
|
||||
final_report = yaml_data.get("final_report", "分析完成,无最终报告")
|
||||
return {
|
||||
"action": "analysis_complete",
|
||||
@@ -109,10 +118,12 @@ class DataAnalysisAgent:
|
||||
self, response: str, yaml_data: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
"""处理图片收集动作"""
|
||||
print("📊 开始收集图片")
|
||||
print("[CHART] 开始收集图片")
|
||||
figures_to_collect = yaml_data.get("figures_to_collect", [])
|
||||
|
||||
collected_figures = []
|
||||
# 使用seen_paths集合来去重,防止重复收集
|
||||
seen_paths = set()
|
||||
|
||||
for figure_info in figures_to_collect:
|
||||
figure_number = figure_info.get("figure_number", "未知")
|
||||
@@ -126,41 +137,36 @@ class DataAnalysisAgent:
|
||||
description = figure_info.get("description", "")
|
||||
analysis = figure_info.get("analysis", "")
|
||||
|
||||
print(f"📈 收集图片 {figure_number}: {filename}")
|
||||
print(f" 📂 路径: {file_path}")
|
||||
print(f" 📝 描述: {description}")
|
||||
print(f" 🔍 分析: {analysis}")
|
||||
print(f"[GRAPH] 收集图片 {figure_number}: {filename}")
|
||||
print(f" [DIR] 路径: {file_path}")
|
||||
print(f" [NOTE] 描述: {description}")
|
||||
print(f" [SEARCH] 分析: {analysis}")
|
||||
|
||||
|
||||
# 记录图片信息
|
||||
collected_figures.append(
|
||||
{
|
||||
"figure_number": figure_number,
|
||||
"filename": filename,
|
||||
"file_path": file_path,
|
||||
"description": description,
|
||||
"analysis": analysis,
|
||||
}
|
||||
)
|
||||
# 验证文件是否存在
|
||||
# 只有文件真正存在时才加入列表,防止报告出现裂图
|
||||
if file_path and os.path.exists(file_path):
|
||||
print(f" ✅ 文件存在: {file_path}")
|
||||
# 记录图片信息
|
||||
collected_figures.append(
|
||||
{
|
||||
"figure_number": figure_number,
|
||||
"filename": filename,
|
||||
"file_path": file_path,
|
||||
"description": description,
|
||||
"analysis": analysis,
|
||||
}
|
||||
)
|
||||
# 检查是否已经收集过该路径
|
||||
abs_path = os.path.abspath(file_path)
|
||||
if abs_path not in seen_paths:
|
||||
print(f" [OK] 文件存在: {file_path}")
|
||||
# 记录图片信息
|
||||
collected_figures.append(
|
||||
{
|
||||
"figure_number": figure_number,
|
||||
"filename": filename,
|
||||
"file_path": file_path,
|
||||
"description": description,
|
||||
"analysis": analysis,
|
||||
}
|
||||
)
|
||||
seen_paths.add(abs_path)
|
||||
else:
|
||||
print(f" [WARN] 跳过重复图片: {file_path}")
|
||||
else:
|
||||
if file_path:
|
||||
print(f" ⚠️ 文件不存在: {file_path}")
|
||||
print(f" [WARN] 文件不存在: {file_path}")
|
||||
else:
|
||||
print(f" ⚠️ 未提供文件路径")
|
||||
print(f" [WARN] 未提供文件路径")
|
||||
|
||||
return {
|
||||
"action": "collect_figures",
|
||||
@@ -192,7 +198,7 @@ class DataAnalysisAgent:
|
||||
code = code.strip()
|
||||
|
||||
if code:
|
||||
print(f"🔧 执行代码:\n{code}")
|
||||
print(f"[TOOL] 执行代码:\n{code}")
|
||||
print("-" * 40)
|
||||
|
||||
# 执行代码
|
||||
@@ -200,7 +206,7 @@ class DataAnalysisAgent:
|
||||
|
||||
# 格式化执行结果
|
||||
feedback = format_execution_result(result)
|
||||
print(f"📋 执行反馈:\n{feedback}")
|
||||
print(f"[LIST] 执行反馈:\n{feedback}")
|
||||
|
||||
return {
|
||||
"action": "generate_code",
|
||||
@@ -212,7 +218,7 @@ class DataAnalysisAgent:
|
||||
}
|
||||
else:
|
||||
# 如果没有代码,说明LLM响应格式有问题,需要重新生成
|
||||
print("⚠️ 未从响应中提取到可执行代码,要求LLM重新生成")
|
||||
print("[WARN] 未从响应中提取到可执行代码,要求LLM重新生成")
|
||||
return {
|
||||
"action": "invalid_response",
|
||||
"error": "响应中缺少可执行代码",
|
||||
@@ -220,7 +226,7 @@ class DataAnalysisAgent:
|
||||
"continue": True,
|
||||
}
|
||||
|
||||
def analyze(self, user_input: str, files: List[str] = None, session_output_dir: str = None) -> Dict[str, Any]:
|
||||
def analyze(self, user_input: str, files: List[str] = None, session_output_dir: str = None, reset_session: bool = True, max_rounds: int = None) -> Dict[str, Any]:
|
||||
"""
|
||||
开始分析流程
|
||||
|
||||
@@ -228,89 +234,150 @@ class DataAnalysisAgent:
|
||||
user_input: 用户的自然语言需求
|
||||
files: 数据文件路径列表
|
||||
session_output_dir: 指定的会话输出目录(可选)
|
||||
reset_session: 是否重置会话 (True: 新开启分析; False: 在现有上下文中继续)
|
||||
max_rounds: 本次分析的最大轮数 (可选,如果不填则使用默认值)
|
||||
|
||||
Returns:
|
||||
分析结果字典
|
||||
"""
|
||||
# 重置状态
|
||||
self.conversation_history = []
|
||||
self.analysis_results = []
|
||||
self.current_round = 0
|
||||
|
||||
# 创建本次分析的专用输出目录
|
||||
if session_output_dir:
|
||||
self.session_output_dir = session_output_dir
|
||||
# 确定本次运行的轮数限制
|
||||
current_max_rounds = max_rounds if max_rounds is not None else self.max_rounds
|
||||
|
||||
if reset_session:
|
||||
# --- 初始化新会话 ---
|
||||
self.conversation_history = []
|
||||
self.analysis_results = []
|
||||
self.current_round = 0
|
||||
self.data_files = files or [] # 保存数据文件列表
|
||||
self.user_requirement = user_input # 保存用户需求
|
||||
|
||||
# 创建本次分析的专用输出目录
|
||||
if session_output_dir:
|
||||
self.session_output_dir = session_output_dir
|
||||
else:
|
||||
self.session_output_dir = create_session_output_dir(
|
||||
self.base_output_dir, user_input
|
||||
)
|
||||
|
||||
# 初始化代码执行器,使用会话目录
|
||||
self.executor = CodeExecutor(self.session_output_dir)
|
||||
|
||||
# 设置会话目录变量到执行环境中
|
||||
self.executor.set_variable("session_output_dir", self.session_output_dir)
|
||||
|
||||
# 生成数据画像(分级:安全级发给LLM,完整级留本地)
|
||||
data_profile_safe = ""
|
||||
data_profile_local = ""
|
||||
if files:
|
||||
print("[SEARCH] 正在生成数据画像...")
|
||||
try:
|
||||
data_profile_safe = build_safe_profile(files)
|
||||
data_profile_local = build_local_profile(files)
|
||||
print("[OK] 数据画像生成完毕(安全级 + 本地级)")
|
||||
except Exception as e:
|
||||
print(f"[WARN] 数据画像生成失败: {e}")
|
||||
|
||||
# 安全画像发给LLM,完整画像留给最终报告生成
|
||||
self.data_profile = data_profile_local # 本地完整版用于最终报告
|
||||
self.data_profile_safe = data_profile_safe # 安全版用于LLM对话
|
||||
|
||||
# 构建初始prompt(只发送安全级画像给LLM)
|
||||
initial_prompt = f"""用户需求: {user_input}"""
|
||||
if files:
|
||||
initial_prompt += f"\n数据文件: {', '.join(files)}"
|
||||
|
||||
if data_profile_safe:
|
||||
initial_prompt += f"\n\n{data_profile_safe}\n\n请根据上述【数据结构概览】中的列名、数据类型和特征描述来制定分析策略。先通过代码探索数据的实际分布,再进行深度分析。"
|
||||
|
||||
print(f"[START] 开始数据分析任务")
|
||||
print(f"[NOTE] 用户需求: {user_input}")
|
||||
if files:
|
||||
print(f"[FOLDER] 数据文件: {', '.join(files)}")
|
||||
print(f"[DIR] 输出目录: {self.session_output_dir}")
|
||||
|
||||
# 添加到对话历史
|
||||
self.conversation_history.append({"role": "user", "content": initial_prompt})
|
||||
|
||||
else:
|
||||
self.session_output_dir = create_session_output_dir(
|
||||
self.base_output_dir, user_input
|
||||
)
|
||||
# --- 继续现有会话 ---
|
||||
# 如果是追问,且没有指定轮数,默认减少轮数,避免过度分析
|
||||
if max_rounds is None:
|
||||
current_max_rounds = 10 # 追问通常不需要那么长的思考链,10轮足够
|
||||
|
||||
print(f"\n[START] 继续分析任务 (追问模式)")
|
||||
print(f"[NOTE] 后续需求: {user_input}")
|
||||
|
||||
# 初始化代码执行器,使用会话目录
|
||||
self.executor = CodeExecutor(self.session_output_dir)
|
||||
# 重置当前轮数计数器,以便给新任务足够的轮次
|
||||
self.current_round = 0
|
||||
|
||||
# 设置会话目录变量到执行环境中
|
||||
self.executor.set_variable("session_output_dir", self.session_output_dir)
|
||||
# 添加到对话历史
|
||||
# 提示Agent这是后续追问,可以简化步骤
|
||||
follow_up_prompt = f"后续需求: {user_input}\n(注意:这是后续追问,请直接针对该问题进行分析,无需从头开始执行完整SOP。)"
|
||||
self.conversation_history.append({"role": "user", "content": follow_up_prompt})
|
||||
|
||||
# 设用工具生成数据画像
|
||||
data_profile = ""
|
||||
if files:
|
||||
print("🔍 正在生成数据画像...")
|
||||
data_profile = load_and_profile_data(files)
|
||||
print("✅ 数据画像生成完毕")
|
||||
|
||||
# 保存到实例变量供最终报告使用
|
||||
self.data_profile = data_profile
|
||||
|
||||
# 构建初始prompt
|
||||
initial_prompt = f"""用户需求: {user_input}"""
|
||||
if files:
|
||||
initial_prompt += f"\n数据文件: {', '.join(files)}"
|
||||
|
||||
if data_profile:
|
||||
initial_prompt += f"\n\n{data_profile}\n\n请根据上述【数据画像】中的统计信息(如高频值、缺失率、数据范围)来制定分析策略。如果发现明显的高频问题或异常分布,请优先进行深度分析。"
|
||||
|
||||
print(f"🚀 开始数据分析任务")
|
||||
print(f"📝 用户需求: {user_input}")
|
||||
if files:
|
||||
print(f"📁 数据文件: {', '.join(files)}")
|
||||
print(f"📂 输出目录: {self.session_output_dir}")
|
||||
print(f"🔢 最大轮数: {self.max_rounds}")
|
||||
print(f"[NUM] 本次最大轮数: {current_max_rounds}")
|
||||
if self.force_max_rounds:
|
||||
print(f"⚡ 强制模式: 将运行满 {self.max_rounds} 轮(忽略AI完成信号)")
|
||||
print(f"[FAST] 强制模式: 将运行满 {current_max_rounds} 轮(忽略AI完成信号)")
|
||||
print("=" * 60)
|
||||
# 添加到对话历史
|
||||
self.conversation_history.append({"role": "user", "content": initial_prompt})
|
||||
|
||||
# 保存原始 max_rounds 以便恢复(虽然 analyze 结束后不需要恢复,但为了逻辑严谨)
|
||||
original_max_rounds = self.max_rounds
|
||||
self.max_rounds = current_max_rounds
|
||||
|
||||
# 初始化连续失败计数器
|
||||
consecutive_failures = 0
|
||||
|
||||
while self.current_round < self.max_rounds:
|
||||
self.current_round += 1
|
||||
print(f"\n🔄 第 {self.current_round} 轮分析")
|
||||
print(f"\n[LOOP] 第 {self.current_round} 轮分析")
|
||||
# 调用LLM生成响应
|
||||
try: # 获取当前执行环境的变量信息
|
||||
notebook_variables = self.executor.get_environment_info()
|
||||
|
||||
# Select prompt based on mode
|
||||
if self.current_round == 1 and not reset_session:
|
||||
# For the first round of a follow-up session, use the specialized prompt
|
||||
base_system_prompt = data_analysis_followup_prompt
|
||||
elif not reset_session and self.current_round > 1:
|
||||
# For subsequent rounds in follow-up, continue using the follow-up context
|
||||
# or maybe just the standard one is fine as long as SOP isn't fully enforced?
|
||||
# Let's stick to the follow-up prompt to prevent SOP regression
|
||||
base_system_prompt = data_analysis_followup_prompt
|
||||
else:
|
||||
base_system_prompt = data_analysis_system_prompt
|
||||
|
||||
# 格式化系统提示词,填入动态的notebook变量信息
|
||||
formatted_system_prompt = data_analysis_system_prompt.format(
|
||||
formatted_system_prompt = base_system_prompt.format(
|
||||
notebook_variables=notebook_variables
|
||||
)
|
||||
print(f"🐛 [DEBUG] System Prompt Head:\n{formatted_system_prompt[:500]}...\n[...]")
|
||||
print(f"🐛 [DEBUG] System Prompt Rules Check: 'stop_words' in prompt? {'stop_words' in formatted_system_prompt}")
|
||||
print(f"[DEBUG] [DEBUG] System Prompt Head:\n{formatted_system_prompt[:500]}...\n[...]")
|
||||
print(f"[DEBUG] [DEBUG] System Prompt Rules Check: 'stop_words' in prompt? {'stop_words' in formatted_system_prompt}")
|
||||
|
||||
response = self.llm.call(
|
||||
prompt=self._build_conversation_prompt(),
|
||||
system_prompt=formatted_system_prompt,
|
||||
)
|
||||
|
||||
print(f"🤖 助手响应:\n{response}")
|
||||
print(f"[AI] 助手响应:\n{response}")
|
||||
|
||||
# 使用统一的响应处理方法
|
||||
process_result = self._process_response(response)
|
||||
|
||||
# 根据处理结果决定是否继续(仅在非强制模式下)
|
||||
if process_result.get("action") == "invalid_response":
|
||||
consecutive_failures += 1
|
||||
print(f"[WARN] 连续失败次数: {consecutive_failures}/3")
|
||||
if consecutive_failures >= 3:
|
||||
print(f"[ERROR] 连续3次无法获取有效响应,分析终止。请检查网络或配置。")
|
||||
break
|
||||
else:
|
||||
consecutive_failures = 0 # 重置计数器
|
||||
|
||||
if not self.force_max_rounds and not process_result.get(
|
||||
"continue", True
|
||||
):
|
||||
print(f"\n✅ 分析完成!")
|
||||
print(f"\n[OK] 分析完成!")
|
||||
break
|
||||
|
||||
# 添加到对话历史
|
||||
@@ -321,8 +388,10 @@ class DataAnalysisAgent:
|
||||
# 根据动作类型添加不同的反馈
|
||||
if process_result["action"] == "generate_code":
|
||||
feedback = process_result.get("feedback", "")
|
||||
# 对执行反馈进行脱敏,移除真实数据值后再发给LLM
|
||||
safe_feedback = sanitize_execution_feedback(feedback)
|
||||
self.conversation_history.append(
|
||||
{"role": "user", "content": f"代码执行反馈:\n{feedback}"}
|
||||
{"role": "user", "content": f"代码执行反馈:\n{safe_feedback}"}
|
||||
)
|
||||
|
||||
# 记录分析结果
|
||||
@@ -342,7 +411,7 @@ class DataAnalysisAgent:
|
||||
|
||||
feedback = f"已收集 {len(collected_figures)} 个有效图片及其分析。"
|
||||
if missing_figures:
|
||||
feedback += f"\n⚠️ 以下图片未找到,请检查代码是否成功保存了这些图片: {missing_figures}"
|
||||
feedback += f"\n[WARN] 以下图片未找到,请检查代码是否成功保存了这些图片: {missing_figures}"
|
||||
|
||||
self.conversation_history.append(
|
||||
{
|
||||
@@ -365,7 +434,7 @@ class DataAnalysisAgent:
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"LLM调用错误: {str(e)}"
|
||||
print(f"❌ {error_msg}")
|
||||
print(f"[ERROR] {error_msg}")
|
||||
self.conversation_history.append(
|
||||
{
|
||||
"role": "user",
|
||||
@@ -374,7 +443,7 @@ class DataAnalysisAgent:
|
||||
)
|
||||
# 生成最终总结
|
||||
if self.current_round >= self.max_rounds:
|
||||
print(f"\n⚠️ 已达到最大轮数 ({self.max_rounds}),分析结束")
|
||||
print(f"\n[WARN] 已达到最大轮数 ({self.max_rounds}),分析结束")
|
||||
|
||||
return self._generate_final_report()
|
||||
|
||||
@@ -400,10 +469,39 @@ class DataAnalysisAgent:
|
||||
if result.get("action") == "collect_figures":
|
||||
all_figures.extend(result.get("collected_figures", []))
|
||||
|
||||
print(f"\n📊 开始生成最终分析报告...")
|
||||
print(f"📂 输出目录: {self.session_output_dir}")
|
||||
print(f"🔢 总轮数: {self.current_round}")
|
||||
print(f"📈 收集图片: {len(all_figures)} 个")
|
||||
print(f"\n[CHART] 开始生成最终分析报告...")
|
||||
print(f"[DIR] 输出目录: {self.session_output_dir}")
|
||||
|
||||
# --- 自动补全/发现图片机制 ---
|
||||
# 扫描目录下所有的png文件
|
||||
try:
|
||||
import glob
|
||||
existing_pngs = glob.glob(os.path.join(self.session_output_dir, "*.png"))
|
||||
|
||||
# 获取已收集的图片路径集合
|
||||
collected_paths = set()
|
||||
for fig in all_figures:
|
||||
if fig.get("file_path"):
|
||||
collected_paths.add(os.path.abspath(fig.get("file_path")))
|
||||
|
||||
# 检查是否有漏网之鱼
|
||||
for png_path in existing_pngs:
|
||||
abs_png_path = os.path.abspath(png_path)
|
||||
if abs_png_path not in collected_paths:
|
||||
print(f"[SEARCH] [自动发现] 补充未显式收集的图片: {os.path.basename(png_path)}")
|
||||
all_figures.append({
|
||||
"figure_number": "Auto",
|
||||
"filename": os.path.basename(png_path),
|
||||
"file_path": abs_png_path,
|
||||
"description": f"自动发现的分析图表: {os.path.basename(png_path)}",
|
||||
"analysis": "(该图表由系统自动捕获,Agent未提供具体分析文本,请结合图表标题理解)"
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"[WARN] 自动发现图片失败: {e}")
|
||||
# ---------------------------
|
||||
|
||||
print(f"[NUM] 总轮数: {self.current_round}")
|
||||
print(f"[GRAPH] 收集图片: {len(all_figures)} 个")
|
||||
|
||||
# 构建用于生成最终报告的提示词
|
||||
final_report_prompt = self._build_final_report_prompt(all_figures)
|
||||
@@ -415,33 +513,22 @@ class DataAnalysisAgent:
|
||||
max_tokens=16384, # 设置较大的token限制以容纳完整报告
|
||||
)
|
||||
|
||||
# 解析响应,提取最终报告
|
||||
try:
|
||||
# 尝试解析YAML
|
||||
yaml_data = self.llm.parse_yaml_response(response)
|
||||
# 直接使用LLM响应作为最终报告(因为我们在prompt中要求直接输出Markdown)
|
||||
final_report_content = response
|
||||
|
||||
# 情况1: 标准YAML格式,包含 action: analysis_complete
|
||||
if yaml_data.get("action") == "analysis_complete":
|
||||
final_report_content = yaml_data.get("final_report", response)
|
||||
# 兼容旧逻辑:如果意外返回了YAML,尝试解析
|
||||
if response.strip().startswith("action:") or "final_report:" in response:
|
||||
try:
|
||||
yaml_data = self.llm.parse_yaml_response(response)
|
||||
if yaml_data.get("action") == "analysis_complete":
|
||||
final_report_content = yaml_data.get("final_report", response)
|
||||
except:
|
||||
pass # 解析失败则保持原样
|
||||
|
||||
# 情况2: 解析成功但没字段,或者解析失败
|
||||
else:
|
||||
# 如果内容看起来像Markdown报告(包含标题),直接使用
|
||||
if "# " in response or "## " in response:
|
||||
print("⚠️ 未检测到标准YAML动作,但内容疑似Markdown报告,直接采纳")
|
||||
final_report_content = response
|
||||
else:
|
||||
final_report_content = "LLM未返回有效报告内容"
|
||||
|
||||
except Exception as e:
|
||||
# 解析完全失败,直接使用原始响应
|
||||
print(f"⚠️ YAML解析失败 ({e}),直接使用原始响应作为报告")
|
||||
final_report_content = response
|
||||
|
||||
print("✅ 最终报告生成完成")
|
||||
print("[OK] 最终报告生成完成")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ 生成最终报告时出错: {str(e)}")
|
||||
print(f"[ERROR] 生成最终报告时出错: {str(e)}")
|
||||
final_report_content = f"报告生成失败: {str(e)}"
|
||||
|
||||
# 保存最终报告到文件
|
||||
@@ -449,9 +536,21 @@ class DataAnalysisAgent:
|
||||
try:
|
||||
with open(report_file_path, "w", encoding="utf-8") as f:
|
||||
f.write(final_report_content)
|
||||
print(f"📄 最终报告已保存至: {report_file_path}")
|
||||
print(f"[DOC] 最终报告已保存至: {report_file_path}")
|
||||
except Exception as e:
|
||||
print(f"❌ 保存报告文件失败: {str(e)}")
|
||||
print(f"[ERROR] 保存报告文件失败: {str(e)}")
|
||||
|
||||
# 生成可复用脚本
|
||||
script_path = ""
|
||||
try:
|
||||
script_path = generate_reusable_script(
|
||||
analysis_results=self.analysis_results,
|
||||
data_files=self.data_files,
|
||||
session_output_dir=self.session_output_dir,
|
||||
user_requirement=self.user_requirement
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"[WARN] 脚本生成失败: {e}")
|
||||
|
||||
# 返回完整的分析结果
|
||||
return {
|
||||
@@ -462,6 +561,7 @@ class DataAnalysisAgent:
|
||||
"conversation_history": self.conversation_history,
|
||||
"final_report": final_report_content,
|
||||
"report_file_path": report_file_path,
|
||||
"reusable_script_path": script_path,
|
||||
}
|
||||
|
||||
def _build_final_report_prompt(self, all_figures: List[Dict[str, Any]]) -> str:
|
||||
@@ -508,7 +608,7 @@ class DataAnalysisAgent:
|
||||
# 在提示词中明确要求使用相对路径
|
||||
prompt += """
|
||||
|
||||
📁 **图片路径使用说明**:
|
||||
[FOLDER] **图片路径使用说明**:
|
||||
报告和图片都在同一目录下,请在报告中使用相对路径引用图片:
|
||||
- 格式:
|
||||
- 示例:
|
||||
|
||||
89  data_preprocessing/README.md (new file)
@@ -0,0 +1,89 @@
# 数据预处理模块

独立的数据清洗工具,用于在正式分析前准备数据。

## 功能

- **数据合并**:将多个 Excel/CSV 文件合并为单一 CSV
- **时间排序**:按时间列对数据进行排序
- **目录管理**:标准化的原始数据和输出数据目录

## 目录结构

```
project/
├── raw_data/              # 原始数据存放目录
│   ├── remotecontrol/     # 按数据来源分类
│   └── ...
├── cleaned_data/          # 清洗后数据输出目录
│   ├── xxx_merged.csv
│   └── xxx_sorted.csv
└── data_preprocessing/    # 本模块
```

## 使用方法

### 命令行

```bash
# 初始化目录结构
python -m data_preprocessing.cli init

# 合并 Excel 文件
python -m data_preprocessing.cli merge --source raw_data/remotecontrol

# 合并并按时间排序
python -m data_preprocessing.cli merge --source raw_data/remotecontrol --sort-by SendTime

# 指定输出路径
python -m data_preprocessing.cli merge -s raw_data/remotecontrol -o cleaned_data/my_output.csv

# 排序已有 CSV
python -m data_preprocessing.cli sort --input some_file.csv --time-col SendTime

# 原地排序(覆盖原文件)
python -m data_preprocessing.cli sort --input data.csv --inplace
```

### Python API

```python
from data_preprocessing import merge_files, sort_by_time, Config

# 合并文件
output_path = merge_files(
    source_dir="raw_data/remotecontrol",
    output_file="cleaned_data/merged.csv",
    pattern="*.xlsx",
    time_column="SendTime"  # 可选:合并后排序
)

# 排序 CSV
sorted_path = sort_by_time(
    input_path="data.csv",
    output_path="sorted_data.csv",
    time_column="CreateTime"
)

# 自定义配置
config = Config()
config.raw_data_dir = "/path/to/raw"
config.cleaned_data_dir = "/path/to/cleaned"
config.ensure_dirs()
```

## 配置项

| 配置项 | 默认值 | 说明 |
|--------|--------|------|
| `raw_data_dir` | `raw_data/` | 原始数据目录 |
| `cleaned_data_dir` | `cleaned_data/` | 清洗输出目录 |
| `default_time_column` | `SendTime` | 默认时间列名 |
| `csv_encoding` | `utf-8-sig` | CSV 编码格式 |

## 注意事项

1. 本模块与 `DataAnalysisAgent` 完全独立,不会相互调用
2. 合并时会自动添加 `_source_file` 列标记数据来源(可用 `--no-source-col` 禁用)
3. Excel 文件会自动合并所有 Sheet
4. 无效时间值在排序时会被放到最后
14  data_preprocessing/__init__.py (new file)
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
"""
数据预处理模块

提供独立的数据清洗功能:
- 按时间排序
- 同类数据合并
"""

from .sorter import sort_by_time
from .merger import merge_files
from .config import Config

__all__ = ["sort_by_time", "merge_files", "Config"]
140  data_preprocessing/cli.py (new file)
@@ -0,0 +1,140 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
数据预处理命令行接口
|
||||
|
||||
使用示例:
|
||||
# 合并 Excel 文件
|
||||
python -m data_preprocessing.cli merge --source raw_data/remotecontrol --output cleaned_data/merged.csv
|
||||
|
||||
# 合并并排序
|
||||
python -m data_preprocessing.cli merge --source raw_data/remotecontrol --sort-by SendTime
|
||||
|
||||
# 排序已有 CSV
|
||||
python -m data_preprocessing.cli sort --input data.csv --output sorted.csv --time-col SendTime
|
||||
|
||||
# 初始化目录结构
|
||||
python -m data_preprocessing.cli init
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from .config import default_config
|
||||
from .sorter import sort_by_time
|
||||
from .merger import merge_files
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
prog="data_preprocessing",
|
||||
description="数据预处理工具:排序、合并",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
示例:
|
||||
%(prog)s merge --source raw_data/remotecontrol --sort-by SendTime
|
||||
%(prog)s sort --input data.csv --time-col CreateTime
|
||||
%(prog)s init
|
||||
"""
|
||||
)
|
||||
|
||||
subparsers = parser.add_subparsers(dest="command", help="可用命令")
|
||||
|
||||
# ========== merge 命令 ==========
|
||||
merge_parser = subparsers.add_parser("merge", help="合并同类文件")
|
||||
merge_parser.add_argument(
|
||||
"--source", "-s",
|
||||
required=True,
|
||||
help="源数据目录路径"
|
||||
)
|
||||
merge_parser.add_argument(
|
||||
"--output", "-o",
|
||||
default=None,
|
||||
help="输出文件路径 (默认: cleaned_data/<目录名>_merged.csv)"
|
||||
)
|
||||
merge_parser.add_argument(
|
||||
"--pattern", "-p",
|
||||
default="*.xlsx",
|
||||
help="文件匹配模式 (默认: *.xlsx)"
|
||||
)
|
||||
merge_parser.add_argument(
|
||||
"--sort-by",
|
||||
default=None,
|
||||
dest="time_column",
|
||||
help="合并后按此时间列排序"
|
||||
)
|
||||
merge_parser.add_argument(
|
||||
"--no-source-col",
|
||||
action="store_true",
|
||||
help="不添加来源文件列"
|
||||
)
|
||||
|
||||
# ========== sort 命令 ==========
|
||||
sort_parser = subparsers.add_parser("sort", help="按时间排序 CSV")
|
||||
sort_parser.add_argument(
|
||||
"--input", "-i",
|
||||
required=True,
|
||||
help="输入 CSV 文件路径"
|
||||
)
|
||||
sort_parser.add_argument(
|
||||
"--output", "-o",
|
||||
default=None,
|
||||
help="输出文件路径 (默认: cleaned_data/<文件名>_sorted.csv)"
|
||||
)
|
||||
sort_parser.add_argument(
|
||||
"--time-col", "-t",
|
||||
default=None,
|
||||
dest="time_column",
|
||||
help=f"时间列名 (默认: {default_config.default_time_column})"
|
||||
)
|
||||
sort_parser.add_argument(
|
||||
"--inplace",
|
||||
action="store_true",
|
||||
help="原地覆盖输入文件"
|
||||
)
|
||||
|
||||
# ========== init 命令 ==========
|
||||
init_parser = subparsers.add_parser("init", help="初始化目录结构")
|
||||
|
||||
# 解析参数
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.command is None:
|
||||
parser.print_help()
|
||||
sys.exit(0)
|
||||
|
||||
try:
|
||||
if args.command == "merge":
|
||||
result = merge_files(
|
||||
source_dir=args.source,
|
||||
output_file=args.output,
|
||||
pattern=args.pattern,
|
||||
time_column=args.time_column,
|
||||
add_source_column=not args.no_source_col
|
||||
)
|
||||
print(f"\n✅ 合并成功: {result}")
|
||||
|
||||
elif args.command == "sort":
|
||||
result = sort_by_time(
|
||||
input_path=args.input,
|
||||
output_path=args.output,
|
||||
time_column=args.time_column,
|
||||
inplace=args.inplace
|
||||
)
|
||||
print(f"\n✅ 排序成功: {result}")
|
||||
|
||||
elif args.command == "init":
|
||||
default_config.ensure_dirs()
|
||||
print("\n✅ 目录初始化完成")
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"\n❌ 错误: {e}")
|
||||
sys.exit(1)
|
||||
except KeyError as e:
|
||||
print(f"\n❌ 错误: {e}")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"\n❌ 未知错误: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
42  data_preprocessing/config.py (new file)
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
"""
数据预处理模块配置
"""

import os
from dataclasses import dataclass

# 获取项目根目录
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


@dataclass
class Config:
    """预处理模块配置"""

    # 原始数据存放目录
    raw_data_dir: str = os.path.join(PROJECT_ROOT, "raw_data")

    # 清洗后数据输出目录
    cleaned_data_dir: str = os.path.join(PROJECT_ROOT, "cleaned_data")

    # 默认时间列名
    default_time_column: str = "SendTime"

    # 支持的文件扩展名
    supported_extensions: tuple = (".csv", ".xlsx", ".xls")

    # CSV 编码
    csv_encoding: str = "utf-8-sig"

    def ensure_dirs(self):
        """确保目录存在"""
        os.makedirs(self.raw_data_dir, exist_ok=True)
        os.makedirs(self.cleaned_data_dir, exist_ok=True)
        print(f"[OK] 目录已就绪:")
        print(f"   原始数据: {self.raw_data_dir}")
        print(f"   清洗输出: {self.cleaned_data_dir}")


# 默认配置实例
default_config = Config()
83  data_preprocessing/merge_excel.py (new file)
@@ -0,0 +1,83 @@
|
||||
|
||||
import pandas as pd
|
||||
import glob
|
||||
import os
|
||||
|
||||
def merge_excel_files(source_dir="remotecontrol", output_file="merged_all_files.csv"):
|
||||
"""
|
||||
将指定目录下的所有 Excel 文件 (.xlsx, .xls) 合并为一个 CSV 文件。
|
||||
"""
|
||||
print(f"[SEARCH] 正在扫描目录: {source_dir} ...")
|
||||
|
||||
# 支持 xlsx 和 xls
|
||||
files_xlsx = glob.glob(os.path.join(source_dir, "*.xlsx"))
|
||||
files_xls = glob.glob(os.path.join(source_dir, "*.xls"))
|
||||
files = files_xlsx + files_xls
|
||||
|
||||
if not files:
|
||||
print("[WARN] 未找到 Excel 文件。")
|
||||
return
|
||||
|
||||
# 按文件名中的数字进行排序 (例如: 1.xlsx, 2.xlsx, ..., 10.xlsx)
|
||||
try:
|
||||
files.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
|
||||
print("[NUM] 已按文件名数字顺序排序")
|
||||
except ValueError:
|
||||
# 如果文件名不是纯数字,退回到字母排序
|
||||
files.sort()
|
||||
print("[TEXT] 已按文件名包含非数字字符,使用字母顺序排序")
|
||||
|
||||
print(f"[DIR] 找到 {len(files)} 个文件: {files}")
|
||||
|
||||
all_dfs = []
|
||||
for file in files:
|
||||
try:
|
||||
print(f"[READ] 读取: {file}")
|
||||
# 使用 ExcelFile 读取所有 sheet
|
||||
xls = pd.ExcelFile(file)
|
||||
print(f" [PAGES] 包含 Sheets: {xls.sheet_names}")
|
||||
|
||||
file_dfs = []
|
||||
for sheet_name in xls.sheet_names:
|
||||
df = pd.read_excel(xls, sheet_name=sheet_name)
|
||||
if not df.empty:
|
||||
print(f" [OK] Sheet '{sheet_name}' 读取成功: {len(df)} 行")
|
||||
file_dfs.append(df)
|
||||
else:
|
||||
print(f" [WARN] Sheet '{sheet_name}' 为空,跳过")
|
||||
|
||||
if file_dfs:
|
||||
# 合并该文件的所有非空 sheet
|
||||
file_merged_df = pd.concat(file_dfs, ignore_index=True)
|
||||
# 可选:添加一列标记来源文件
|
||||
file_merged_df['Source_File'] = os.path.basename(file)
|
||||
all_dfs.append(file_merged_df)
|
||||
else:
|
||||
print(f"[WARN] 文件 {file} 所有 Sheet 均为空")
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 读取 {file} 失败: {e}")
|
||||
|
||||
if all_dfs:
|
||||
print("[LOOP] 正在合并数据...")
|
||||
merged_df = pd.concat(all_dfs, ignore_index=True)
|
||||
|
||||
# 按 SendTime 排序
|
||||
if 'SendTime' in merged_df.columns:
|
||||
print("[TIMER] 正在按 SendTime 排序...")
|
||||
merged_df['SendTime'] = pd.to_datetime(merged_df['SendTime'], errors='coerce')
|
||||
merged_df = merged_df.sort_values(by='SendTime')
|
||||
else:
|
||||
print("[WARN] 未找到 SendTime 列,跳过排序")
|
||||
|
||||
print(f"[CACHE] 保存到: {output_file}")
|
||||
merged_df.to_csv(output_file, index=False, encoding="utf-8-sig")
|
||||
|
||||
print(f"[OK] 合并及排序完成!总行数: {len(merged_df)}")
|
||||
print(f" 输出文件: {os.path.abspath(output_file)}")
|
||||
else:
|
||||
print("[WARN] 没有成功读取到任何数据。")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 如果需要在当前目录运行并合并 remotecontrol 文件夹下的内容
|
||||
merge_excel_files(source_dir="remotecontrol", output_file="remotecontrol_merged.csv")
|
||||
148  data_preprocessing/merger.py (new file)
@@ -0,0 +1,148 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
数据合并模块
|
||||
|
||||
合并同类 Excel/CSV 文件
|
||||
"""
|
||||
|
||||
import os
|
||||
import glob
|
||||
import pandas as pd
|
||||
from typing import Optional, List
|
||||
from .config import default_config
|
||||
|
||||
|
||||
def merge_files(
|
||||
source_dir: str,
|
||||
output_file: Optional[str] = None,
|
||||
pattern: str = "*.xlsx",
|
||||
time_column: Optional[str] = None,
|
||||
add_source_column: bool = True
|
||||
) -> str:
|
||||
"""
|
||||
合并目录下的所有同类文件
|
||||
|
||||
Args:
|
||||
source_dir: 源数据目录
|
||||
output_file: 输出 CSV 文件路径。如果为 None,则输出到 cleaned_data 目录
|
||||
pattern: 文件匹配模式 (e.g., "*.xlsx", "*.csv", "*.xls")
|
||||
time_column: 可选,合并后按此列排序
|
||||
add_source_column: 是否添加来源文件列
|
||||
|
||||
Returns:
|
||||
输出文件的绝对路径
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: 目录不存在或未找到匹配文件
|
||||
"""
|
||||
if not os.path.isdir(source_dir):
|
||||
raise FileNotFoundError(f"目录不存在: {source_dir}")
|
||||
|
||||
print(f"[SCAN] 正在扫描目录: {source_dir}")
|
||||
print(f" 匹配模式: {pattern}")
|
||||
|
||||
# 查找匹配文件
|
||||
files = glob.glob(os.path.join(source_dir, pattern))
|
||||
|
||||
# 如果是 xlsx,也尝试匹配 xls
|
||||
if pattern == "*.xlsx":
|
||||
files.extend(glob.glob(os.path.join(source_dir, "*.xls")))
|
||||
|
||||
if not files:
|
||||
raise FileNotFoundError(f"未找到匹配 '{pattern}' 的文件")
|
||||
|
||||
# 排序文件列表
|
||||
files = _sort_files(files)
|
||||
print(f"[FOUND] 找到 {len(files)} 个文件")
|
||||
|
||||
# 确定输出路径
|
||||
if output_file is None:
|
||||
default_config.ensure_dirs()
|
||||
dir_name = os.path.basename(os.path.normpath(source_dir))
|
||||
output_file = os.path.join(
|
||||
default_config.cleaned_data_dir,
|
||||
f"{dir_name}_merged.csv"
|
||||
)
|
||||
|
||||
# 合并数据
|
||||
all_dfs = []
|
||||
for file in files:
|
||||
try:
|
||||
df = _read_file(file)
|
||||
if df is not None and not df.empty:
|
||||
if add_source_column:
|
||||
df['_source_file'] = os.path.basename(file)
|
||||
all_dfs.append(df)
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 读取失败 {file}: {e}")
|
||||
|
||||
if not all_dfs:
|
||||
raise ValueError("没有成功读取到任何数据")
|
||||
|
||||
print(f"[MERGE] 正在合并 {len(all_dfs)} 个数据源...")
|
||||
merged_df = pd.concat(all_dfs, ignore_index=True)
|
||||
print(f" 合并后总行数: {len(merged_df)}")
|
||||
|
||||
# 可选:按时间排序
|
||||
if time_column and time_column in merged_df.columns:
|
||||
print(f"[SORT] 正在按 '{time_column}' 排序...")
|
||||
merged_df[time_column] = pd.to_datetime(merged_df[time_column], errors='coerce')
|
||||
merged_df = merged_df.sort_values(by=time_column, na_position='last')
|
||||
elif time_column:
|
||||
print(f"[WARN] 未找到时间列 '{time_column}',跳过排序")
|
||||
|
||||
# 保存结果
|
||||
print(f"[SAVE] 正在保存: {output_file}")
|
||||
merged_df.to_csv(output_file, index=False, encoding=default_config.csv_encoding)
|
||||
|
||||
abs_output = os.path.abspath(output_file)
|
||||
print(f"[OK] 合并完成!")
|
||||
print(f" 输出文件: {abs_output}")
|
||||
print(f" 总行数: {len(merged_df)}")
|
||||
|
||||
return abs_output
|
||||
|
||||
|
||||
def _sort_files(files: List[str]) -> List[str]:
|
||||
"""对文件列表进行智能排序"""
|
||||
try:
|
||||
# 尝试按文件名中的数字排序
|
||||
files.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
|
||||
print("[SORT] 已按文件名数字顺序排序")
|
||||
except ValueError:
|
||||
# 退回到字母排序
|
||||
files.sort()
|
||||
print("[SORT] 已按文件名字母顺序排序")
|
||||
return files
|
||||
|
||||
|
||||
def _read_file(file_path: str) -> Optional[pd.DataFrame]:
|
||||
"""读取单个文件(支持 CSV 和 Excel)"""
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
|
||||
print(f"[READ] 读取: {os.path.basename(file_path)}")
|
||||
|
||||
if ext == '.csv':
|
||||
df = pd.read_csv(file_path, low_memory=False)
|
||||
print(f" 行数: {len(df)}")
|
||||
return df
|
||||
|
||||
elif ext in ('.xlsx', '.xls'):
|
||||
# 读取 Excel 所有 sheet 并合并
|
||||
xls = pd.ExcelFile(file_path)
|
||||
print(f" Sheets: {xls.sheet_names}")
|
||||
|
||||
sheet_dfs = []
|
||||
for sheet_name in xls.sheet_names:
|
||||
df = pd.read_excel(xls, sheet_name=sheet_name)
|
||||
if not df.empty:
|
||||
print(f" - Sheet '{sheet_name}': {len(df)} 行")
|
||||
sheet_dfs.append(df)
|
||||
|
||||
if sheet_dfs:
|
||||
return pd.concat(sheet_dfs, ignore_index=True)
|
||||
return None
|
||||
|
||||
else:
|
||||
print(f"[WARN] 不支持的文件格式: {ext}")
|
||||
return None
|
||||
45  data_preprocessing/sort_csv.py (new file)
@@ -0,0 +1,45 @@
|
||||
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
def sort_csv_by_time(file_path="remotecontrol_merged.csv", time_col="SendTime"):
|
||||
"""
|
||||
读取 CSV 文件,按时间列排序,并保存。
|
||||
"""
|
||||
if not os.path.exists(file_path):
|
||||
print(f"[ERROR] 文件不存在: {file_path}")
|
||||
return
|
||||
|
||||
print(f"[READ] 正在读取 {file_path} ...")
|
||||
try:
|
||||
# 读取 CSV
|
||||
df = pd.read_csv(file_path, low_memory=False)
|
||||
print(f" [CHART] 数据行数: {len(df)}")
|
||||
|
||||
if time_col not in df.columns:
|
||||
print(f"[ERROR] 未找到时间列: {time_col}")
|
||||
print(f" 可用列: {list(df.columns)}")
|
||||
return
|
||||
|
||||
print(f"[LOOP] 正在解析时间列 '{time_col}' ...")
|
||||
# 转换为 datetime 对象,无法解析的设为 NaT
|
||||
df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
|
||||
|
||||
# 检查无效时间
|
||||
nat_count = df[time_col].isna().sum()
|
||||
if nat_count > 0:
|
||||
print(f"[WARN] 发现 {nat_count} 行无效时间数据,排序时将排在最后")
|
||||
|
||||
print("[LOOP] 正在按时间排序...")
|
||||
df_sorted = df.sort_values(by=time_col)
|
||||
|
||||
print(f"[CACHE] 正在保存及覆盖文件: {file_path} ...")
|
||||
df_sorted.to_csv(file_path, index=False, encoding="utf-8-sig")
|
||||
|
||||
print("[OK] 排序并保存完成!")
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR]处理失败: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
sort_csv_by_time()
|
||||
82  data_preprocessing/sorter.py (new file)
@@ -0,0 +1,82 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
数据排序模块
|
||||
|
||||
按时间列对 CSV 文件进行排序
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
from typing import Optional
|
||||
from .config import default_config
|
||||
|
||||
|
||||
def sort_by_time(
|
||||
input_path: str,
|
||||
output_path: Optional[str] = None,
|
||||
time_column: str = None,
|
||||
inplace: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
按时间列对 CSV 文件排序
|
||||
|
||||
Args:
|
||||
input_path: 输入 CSV 文件路径
|
||||
output_path: 输出路径。如果为 None 且 inplace=False,则输出到 cleaned_data 目录
|
||||
time_column: 时间列名,默认使用配置中的 default_time_column
|
||||
inplace: 是否原地覆盖输入文件
|
||||
|
||||
Returns:
|
||||
输出文件的绝对路径
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: 输入文件不存在
|
||||
KeyError: 时间列不存在
|
||||
"""
|
||||
# 参数处理
|
||||
time_column = time_column or default_config.default_time_column
|
||||
|
||||
if not os.path.exists(input_path):
|
||||
raise FileNotFoundError(f"文件不存在: {input_path}")
|
||||
|
||||
# 确定输出路径
|
||||
if inplace:
|
||||
output_path = input_path
|
||||
elif output_path is None:
|
||||
default_config.ensure_dirs()
|
||||
basename = os.path.basename(input_path)
|
||||
name, ext = os.path.splitext(basename)
|
||||
output_path = os.path.join(
|
||||
default_config.cleaned_data_dir,
|
||||
f"{name}_sorted{ext}"
|
||||
)
|
||||
|
||||
print(f"[READ] 正在读取: {input_path}")
|
||||
df = pd.read_csv(input_path, low_memory=False)
|
||||
print(f" 数据行数: {len(df)}")
|
||||
|
||||
# 检查时间列是否存在
|
||||
if time_column not in df.columns:
|
||||
available_cols = list(df.columns)
|
||||
raise KeyError(
|
||||
f"未找到时间列 '{time_column}'。可用列: {available_cols}"
|
||||
)
|
||||
|
||||
print(f"[PARSE] 正在解析时间列 '{time_column}'...")
|
||||
df[time_column] = pd.to_datetime(df[time_column], errors='coerce')
|
||||
|
||||
# 统计无效时间
|
||||
nat_count = df[time_column].isna().sum()
|
||||
if nat_count > 0:
|
||||
print(f"[WARN] 发现 {nat_count} 行无效时间数据,排序时将排在最后")
|
||||
|
||||
print("[SORT] 正在按时间排序...")
|
||||
df_sorted = df.sort_values(by=time_column, na_position='last')
|
||||
|
||||
print(f"[SAVE] 正在保存: {output_path}")
|
||||
df_sorted.to_csv(output_path, index=False, encoding=default_config.csv_encoding)
|
||||
|
||||
abs_output = os.path.abspath(output_path)
|
||||
print(f"[OK] 排序完成!输出文件: {abs_output}")
|
||||
|
||||
return abs_output
|
||||
106  main.py
@@ -1,68 +1,80 @@
|
||||
from data_analysis_agent import DataAnalysisAgent
|
||||
from config.llm_config import LLMConfig
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
CLI 入口 - 数据分析智能体
|
||||
"""
|
||||
|
||||
import sys
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
from data_analysis_agent import DataAnalysisAgent
|
||||
from config.llm_config import LLMConfig
|
||||
from utils.create_session_dir import create_session_output_dir
|
||||
|
||||
class DualLogger:
|
||||
"""同时输出到终端和文件的日志记录器"""
|
||||
def __init__(self, log_dir, filename="log.txt"):
|
||||
self.terminal = sys.stdout
|
||||
log_path = os.path.join(log_dir, filename)
|
||||
self.log = open(log_path, "a", encoding="utf-8")
|
||||
|
||||
def write(self, message):
|
||||
self.terminal.write(message)
|
||||
# 过滤掉生成的代码块,不写入日志文件
|
||||
if "🔧 执行代码:" in message:
|
||||
return
|
||||
self.log.write(message)
|
||||
self.log.flush()
|
||||
|
||||
def flush(self):
|
||||
self.terminal.flush()
|
||||
self.log.flush()
|
||||
|
||||
def setup_logging(log_dir):
|
||||
"""配置日志记录"""
|
||||
# 记录开始时间
|
||||
logger = DualLogger(log_dir)
|
||||
sys.stdout = logger
|
||||
# 可选:也将错误输出重定向
|
||||
# sys.stderr = logger
|
||||
print(f"\n{'='*20} Run Started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {'='*20}\n")
|
||||
print(f"📄 日志文件已保存至: {os.path.join(log_dir, 'log.txt')}")
|
||||
from utils.logger import PrintCapture
|
||||
|
||||
|
||||
def main():
|
||||
llm_config = LLMConfig()
|
||||
files = ["./UB IOV Support_TR.csv"]
|
||||
|
||||
# 自动查找数据文件
|
||||
data_extensions = ["*.csv", "*.xlsx", "*.xls"]
|
||||
search_dirs = ["cleaned_data"]
|
||||
files = []
|
||||
|
||||
for search_dir in search_dirs:
|
||||
for ext in data_extensions:
|
||||
pattern = os.path.join(search_dir, ext)
|
||||
files.extend(glob.glob(pattern))
|
||||
|
||||
if not files:
|
||||
print("[WARN] 未在 cleaned_data 目录找到数据文件,尝试使用默认文件")
|
||||
files = ["./cleaned_data.csv"]
|
||||
else:
|
||||
print(f"[DIR] 自动识别到以下数据文件: {files}")
|
||||
|
||||
analysis_requirement = """
|
||||
基于所有运维工单,整理一份工单健康度报告,包括但不限于对所有车联网技术支持工单的全面数据分析,
|
||||
深入挖掘工单处理过程中的关键问题、效率瓶颈及改进机会。涵盖工单状态、问题类型、模块分布、严重程度、责任人负载、车型分布、来源渠道及处理时长等多个维度。
|
||||
通过多轮交叉分析与趋势洞察,为提升车联网服务质量、优化资源配置及降低运营风险提供数据驱动的决策依据,问题总揽,高频问题、重点问题分析,输出若干个重要的统计指标,并绘制相关图表;结合图表,总结一份,车联网运维工单健康度报告,汇报给我。
|
||||
深入挖掘工单处理过程中的关键问题、效率瓶颈及改进机会。请从车型,模块,功能角度,分别展示工单数据、问题类型、模块分布、严重程度、责任人负载、车型分布、来源渠道及处理时长等多个维度。
|
||||
通过多轮交叉分析与趋势洞察,为提升车联网服务质量、优化资源配置及降低运营风险提供数据驱动的决策依据,问题总揽,高频问题、重点问题分析,输出若干个重要的统计指标,并绘制相关图表;
|
||||
结合图表,总结一份,车联网运维工单健康度报告,汇报给我。
|
||||
"""
|
||||
|
||||
# 在主函数中先创建会话目录,以便存放日志
|
||||
# 默认输出目录为 'outputs'
|
||||
# 创建会话目录
|
||||
base_output_dir = "outputs"
|
||||
session_output_dir = create_session_output_dir(base_output_dir, analysis_requirement)
|
||||
|
||||
# 设置日志
|
||||
setup_logging(session_output_dir)
|
||||
# 使用 PrintCapture 替代全局 stdout 劫持
|
||||
log_path = os.path.join(session_output_dir, "log.txt")
|
||||
|
||||
# 如果希望强制运行到最大轮数,设置 force_max_rounds=True
|
||||
agent = DataAnalysisAgent(llm_config, force_max_rounds=False)
|
||||
with PrintCapture(log_path):
|
||||
print(f"\n{'='*20} Run Started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {'='*20}\n")
|
||||
print(f"[DOC] 日志文件已保存至: {log_path}")
|
||||
|
||||
report = agent.analyze(
|
||||
user_input=analysis_requirement,
|
||||
files=files,
|
||||
session_output_dir=session_output_dir
|
||||
)
|
||||
print(report)
|
||||
agent = DataAnalysisAgent(llm_config, force_max_rounds=False)
|
||||
|
||||
# 交互式分析循环
|
||||
while True:
|
||||
is_first_run = agent.current_round == 0 and not agent.conversation_history
|
||||
|
||||
report = agent.analyze(
|
||||
user_input=analysis_requirement,
|
||||
files=files if is_first_run else None,
|
||||
session_output_dir=session_output_dir,
|
||||
reset_session=is_first_run,
|
||||
max_rounds=None if is_first_run else 10,
|
||||
)
|
||||
print("\n" + "=" * 30 + " 当前阶段分析完成 " + "=" * 30)
|
||||
|
||||
print("\n[TIP] 你可以继续对数据提出分析需求,或者输入 'exit'/'quit' 结束程序。")
|
||||
user_response = input("[>] 请输入后续分析需求 (直接回车退出): ").strip()
|
||||
|
||||
if not user_response or user_response.lower() in ["exit", "quit", "n", "no"]:
|
||||
print("[BYE] 分析结束,再见!")
|
||||
break
|
||||
|
||||
analysis_requirement = user_response
|
||||
print(f"\n[LOOP] 收到新需求,正在继续分析...")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
580  prompts.py
@@ -1,238 +1,140 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
提示词模块 - 集中管理所有LLM提示词
|
||||
"""
|
||||
|
||||
data_analysis_system_prompt = """你是一个专业的数据分析助手,运行在Jupyter Notebook环境中,能够根据用户需求生成和执行Python数据分析代码。
|
||||
**核心使命**:
|
||||
- 接收自然语言需求,分阶段生成高效、安全的数据分析代码。
|
||||
- 深度挖掘数据,不仅仅是绘图,更要发现数据背后的业务洞察。
|
||||
- 输出高质量、可落地的业务分析报告。
|
||||
|
||||
🎯 **重要指导原则**:
|
||||
- 当需要执行Python代码(数据加载、分析、可视化)时,使用 `generate_code` 动作
|
||||
- 当需要收集和分析已生成的图表时,使用 `collect_figures` 动作
|
||||
- 当所有分析工作完成,需要输出最终报告时,使用 `analysis_complete` 动作
|
||||
- 每次响应只能选择一种动作类型,不要混合使用
|
||||
- 强制文本清洗与短语提取,必须使用 N-gram (2-gram, 3-gram) 技术提取短语(如 "remote control", "login failed")
|
||||
- 严禁仅仅统计单词频率,以免破坏专有名词。
|
||||
- 必须构建`stop_words`列表,剔除年份(2025)、通用动词(work, fix)、介词等无意义高频词。
|
||||
- 主动高级分析:不仅是画图,必须根据数据特征主动选择算法(时间序列->预测;分类数据->特征重要性;多维数据->聚类)。
|
||||
**核心能力**:
|
||||
1. **代码执行**:自动编写并执行Pandas/Matplotlib代码。
|
||||
2. **多模态分析**:支持时序预测、文本挖掘(N-gram)、多维交叉分析。
|
||||
3. **智能纠错**:遇到报错自动分析原因并修复代码。
|
||||
|
||||
目前jupyter notebook环境下有以下变量:
|
||||
jupyter notebook环境当前变量:
|
||||
{notebook_variables}
|
||||
✨ 核心能力:
|
||||
1. 接收用户的自然语言分析需求
|
||||
2. 按步骤生成安全的Python分析代码
|
||||
3. 基于代码执行结果继续优化分析
|
||||
|
||||
🔧 Notebook环境特性:
|
||||
- 你运行在IPython Notebook环境中,变量会在各个代码块之间保持
|
||||
- 第一次执行后,pandas、numpy、matplotlib等库已经导入,无需重复导入
|
||||
- 数据框(DataFrame)等变量在执行后会保留,可以直接使用
|
||||
- 因此,除非是第一次使用某个库,否则不需要重复import语句
|
||||
---
|
||||
|
||||
🚨 重要约束:
|
||||
1. 仅使用以下数据分析库:pandas, numpy, matplotlib, duckdb, os, json, datetime, re, pathlib
|
||||
2. 图片必须保存到指定的会话目录中,输出绝对路径,禁止使用plt.show(),饼图的标签全部放在图例里面,用颜色区分。
|
||||
3. 表格输出控制:超过15行只显示前5行和后5行
|
||||
4. 所有生成的图片必须保存,保存路径格式:os.path.join(session_output_dir, '图片名称.png')
|
||||
5. 中文字体设置:生成的绘图代码,涉及中文字体,必须保证生成图片不可以乱码(macOS推荐:Hiragino Sans GB, Songti SC等)
|
||||
6. 输出格式严格使用YAML
|
||||
**关键红线 (Critical Rules)**:
|
||||
1. **进程保护**:严禁使用 `exit()`、`quit()` 或 `sys.exit()`,这会导致Agent崩溃。
|
||||
2. **数据安全**:严禁使用 `pd.DataFrame({{...}})` 伪造数据。严禁使用 `open()` 写入非结果文件(只能写图片/JSON)。
|
||||
3. **文件验证**:所有文件操作前必须 `os.path.exists()`。Excel读取失败必须尝试 `openpyxl` 引擎或 `read_csv`。
|
||||
4. **绝对路径**:图片保存、文件读取必须使用绝对路径。图片必须保存到 `session_output_dir`。
|
||||
5. **图片保存**:禁止 `plt.show()`。每次绘图后必须紧接 `plt.savefig(path)` 和 `plt.close()`。
|
||||
|
||||
📁 输出目录管理:
|
||||
- 本次分析使用UUID生成的专用目录(16进制格式),确保每次分析的输出文件隔离
|
||||
- 会话目录格式:session_[32位16进制UUID],如 session_a1b2c3d4e5f6789012345678901234ab
|
||||
- 图片保存路径格式:os.path.join(session_output_dir, '图片名称.png')
|
||||
- 使用有意义的中文文件名:如'营业收入趋势.png', '利润分析对比.png'
|
||||
- 所有生成的图片必须执行处理图片收集动作并保存,保存路径格式:os.path.join(session_output_dir, '图片名称.png')
|
||||
- 输出绝对路径:使用os.path.abspath()获取图片的完整路径
|
||||
---
|
||||
|
||||
📊 数据分析工作流程(必须严格按顺序执行):
|
||||
**代码生成规则 (Code Generation Rules)**:
|
||||
|
||||
**阶段1:数据探索(使用 generate_code 动作)**
|
||||
- 首次数据加载时尝试多种编码:['utf-8', 'gbk', 'gb18030', 'gb2312', 'latin1']
|
||||
- 特殊处理:如果读取失败,尝试指定分隔符 `sep=','` 和错误处理 `on_bad_lines='skip'` (pandas 2.0+标准)
|
||||
- 使用df.head()查看前几行数据,检查数据是否正确读取
|
||||
- 使用df.info()了解数据类型和缺失值情况
|
||||
- 重点检查:如果数值列显示为NaN但应该有值,说明读取或解析有问题
|
||||
- 使用df.dtypes查看每列的数据类型,确保日期列不是float64
|
||||
- 打印所有列名:df.columns.tolist()
|
||||
- 绝对不要假设列名,必须先查看实际的列名
|
||||
**1. 执行策略**:
|
||||
- **分步执行**:每次只专注一个分析阶段(如"清洗"或"可视化"),不要试图一次性写完所有代码。
|
||||
- **环境持久化**:Notebook环境中变量(如 `df`)会保留,不要重复导入库或重复加载数据。
|
||||
- **错误处理**:捕获错误并尝试修复,严禁在分析中途放弃。
|
||||
|
||||
**阶段2:数据清洗和检查(使用 generate_code 动作)**
|
||||
- 日期列识别:查找包含'date', 'time', 'Date', 'Time'关键词的列
|
||||
- 日期解析:尝试多种格式 ['%d/%m/%Y', '%Y-%m-%d', '%m/%d/%Y', '%Y/%m/%d', '%d-%m-%Y']
|
||||
- 类型转换:使用pd.to_datetime()转换日期列,指定format参数和errors='coerce'
|
||||
- 空值处理:检查哪些列应该有值但显示NaN,可能是数据读取问题
|
||||
- 检查数据的时间范围和排序
|
||||
- 数据质量检查:确认数值列是否正确,字符串列是否被错误识别
|
||||
**2. 可视化规范 (Visual Standards)**:
|
||||
- **中文字体**:必须配置字体以解决乱码:
|
||||
```python
|
||||
import matplotlib.pyplot as plt
|
||||
import platform
|
||||
system_name = platform.system()
|
||||
if system_name == 'Darwin': plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'PingFang SC', 'sans-serif']
|
||||
elif system_name == 'Windows': plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'sans-serif']
|
||||
else: plt.rcParams['font.sans-serif'] = ['WenQuanYi Micro Hei', 'sans-serif']
|
||||
plt.rcParams['axes.unicode_minus'] = False
|
||||
```
|
||||
- **图表类型**:
|
||||
- 类别 > 5:**强制**使用水平条形图 (`plt.barh`),并降序排列。
|
||||
- 类别 ≤ 5:才允许使用饼图,且图例必须外置 (`bbox_to_anchor=(1, 1)`)。
|
||||
- **美学要求**:去除非数据墨水(无边框、无网格),使用 Seaborn 默认色板,标题和标签必须为中文。
|
||||
- **文件命名**:使用中文描述业务含义(如 `核心问题词云.png`),**严禁**出现 `plot`, `dataframe`, `2-gram` 等技术术语。
|
||||
|
||||
**3. 文本挖掘专用规则**:
|
||||
- **N-gram提取**:必须使用 `CountVectorizer(ngram_range=(2, 3))` 提取短语(如 "remote control")。
|
||||
- **停用词过滤**:必须构建 `stop_words` 列表,剔除年份(2025)、通用动词(fix, check)、通用介词(the, for)等。
|
||||
|
||||
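A minimal sketch of the N-gram and stop-word rules above (the stop-word list, the `问题描述` column name, and the pre-loaded `df` are illustrative assumptions; `get_feature_names_out` requires scikit-learn >= 1.0):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Illustrative stop-word list: years, generic verbs, generic prepositions
stop_words = ["2023", "2024", "2025", "issue", "problem", "work", "fix",
              "check", "test", "the", "is", "at", "for", "on", "this", "that"]

descriptions = df["问题描述"].dropna().astype(str).tolist()  # assumes df is already loaded

vec = CountVectorizer(ngram_range=(2, 3), stop_words=stop_words)
X = vec.fit_transform(descriptions)

freqs = X.sum(axis=0).A1  # total count of each extracted phrase
top_phrases = sorted(zip(vec.get_feature_names_out(), freqs),
                     key=lambda t: t[1], reverse=True)[:20]
for phrase, count in top_phrases:
    print(phrase, count)
```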
**阶段3:数据分析和可视化(使用 generate_code 动作)**
|
||||
- 基于实际的列名进行计算
|
||||
- 生成有意义的图表
|
||||
- 图片保存到会话专用目录中
|
||||
- 每生成一个图表后,必须打印绝对路径
|
||||
- 不要试图一次性生成所有图表。你应该将任务拆分为多个小的代码块,分批次执行。
|
||||
- 每一轮只专注于生成 1-2 个复杂的图表或 2-3 个简单的图表,确保代码正确且图片保存成功。
|
||||
- 只有在前一轮代码成功执行并保存图片后,再进行下一轮。
|
||||
- 必做分析1. **超长工单问题类型分布**(从处理时长分布中筛选)
|
||||
2. **车型-问题热力图**(发现特定车型的高频故障)
|
||||
3. **车型分布**(整体工单在不同车型的占比)
|
||||
4. **处理时长分布**(直方图/KDE)
|
||||
5. **处理时长箱线图**(按问题类型或责任人分组,识别异常点)
|
||||
6. **高频关键词词云**(基于Text Cleaning和N-gram结果)
|
||||
7. **工单来源分布**
|
||||
8. **工单状态分布**
|
||||
9. **模块分布**
|
||||
10. **未关闭工单状态分布**
|
||||
11. **问题类型分布**
|
||||
12. **严重程度分布**
|
||||
13. **远程控制(Remote Control)问题模块分布**(专项分析)
|
||||
14. **月度工单趋势**
|
||||
15. **月度关闭率趋势**
|
||||
16. **责任人分布**
|
||||
17. **责任人工作量与效率对比**(散点图或双轴图)
|
||||
- 图片保存必须使用 `plt.savefig(path, bbox_inches='tight')`。保存后必须显示打印绝对路径。严禁使用 `plt.show()`。
|
||||
---
|
||||
|
||||
**阶段4:深度挖掘与高级分析(使用 generate_code 动作)**
|
||||
- 主动评估数据特征**:在执行前,先分析数据适合哪种高级挖掘:
|
||||
- 时间序列数据:必须进行趋势预测(使用sklearn/ARIMA/Prophet-like逻辑)和季节性分解。
|
||||
- 多维数值数据:必须进行聚类分析(K-Means/DBSCAN)以发现用户/产品分层。
|
||||
- 分类/目标数据:必须计算特征重要性(使用随机森林/相关性矩阵)以识别关键驱动因素。
|
||||
- 异常检测:使用Isolation Forest或统计方法识别高价值或高风险的离群点。
|
||||
- 拒绝平庸:不要为了做而做。如果数据量太小(<50行)或特征单一,请明确说明无法进行特定分析,并尝试挖掘其他角度(如分布偏度、帕累托分析)。
|
||||
- 业务导向:每个模型结果必须翻译成业务语言(例如:“聚类结果显示,A类用户是高价值且对价格不敏感的群体”)。
|
||||
**标准化分析SOP (Standard Operating Procedure)**:
|
||||
|
||||
**阶段5:高级分析结果可视化(使用 generate_code 动作)**
|
||||
- 专业图表:为高级分析匹配专用图表:
|
||||
- 聚类 -> 降维散点图 (PCA/t-SNE) 或 平行坐标图
|
||||
- 相关性 -> 热力图 (Heatmap)
|
||||
- 预测 -> 带有置信区间的趋势图
|
||||
- 特征重要性 -> 排序条形图
|
||||
- 保存与输出:保存模型结果图表,并准备好在报告中解释。
|
||||
**阶段1:数据探索与智能加载**
|
||||
- 检查文件扩展名与实际格式是否一致(CSV vs Excel)。
|
||||
- 打印 `df.info()`, `df.head()`, 检查缺失值和列名。
|
||||
- 关键字段对齐('Model'->'车型', 'Module'->'模块')。
|
||||
|
||||
**阶段6:图片收集和分析(使用 collect_figures 动作)**
|
||||
- 当已生成2-3个高级分析图表后,使用 collect_figures 动作
|
||||
- 收集所有已生成的图片路径和信息
|
||||
- 对每个图片进行详细的分析和解读
|
||||
**阶段2:基础分布分析**
|
||||
- 生成 `车型分布.png` (水平条形图)
|
||||
- 生成 `模块Top10分布.png` (水平条形图)
|
||||
- 生成 `问题类型Top10分布.png` (水平条形图)
|
||||
|
||||
**阶段7:最终报告(使用 analysis_complete 动作)**
|
||||
- 当所有分析工作完成后,生成最终的分析报告
|
||||
- 包含对所有图片、模型和分析结果的综合总结
|
||||
- 提供业务建议和预测洞察
|
||||
**阶段3:时序与来源分析**
|
||||
- 生成 `工单来源分布.png` (饼图或条形图)
|
||||
- 生成 `月度工单趋势.png` (折线图)
|
||||
|
||||
🔧 代码生成规则:
|
||||
1. 每次只专注一个阶段,不要试图一次性完成所有任务,生成图片代码时,可以多轮次执行,不要一次生成所有图片的代码
|
||||
2. 基于实际的数据结构而不是假设来编写代码
|
||||
3. Notebook环境中变量会保持,避免重复导入和重复加载相同数据
|
||||
4. 处理错误时,分析具体的错误信息并针对性修复,重新进行改阶段步骤,中途不要跳步骤
|
||||
5. 严禁使用 `exit()`、`quit()` 或 `sys.exit()`,这会导致整个Agent进程终止。
|
||||
6. 严禁使用 `open()` 写入文件(除保存图片/JSON外),所有中间数据应优先保存在DataFrame变量中。
|
||||
7. 图片保存使用会话目录变量:session_output_dir
|
||||
8. 图表标题和标签使用中文,使用系统配置的中文字体显示
|
||||
9. 必须打印绝对路径:每次图片生成后,必须执行!!!处理图片收集动作保存图片,使用os.path.abspath()打印完整的绝对路径
|
||||
10. 图片文件名:使用中文描述业务含义(如“核心问题词云.png”),**严禁**在文件名或标题中出现 "2-gram", "dataframe", "plot" 等技术术语。
|
||||
11. 图表类型强制规则:如果类别数量 > 5,严禁使用饼图,必须使用水平条形图,并按数值降序排列。
|
||||
12. 饼图仅限极少类别:只有当类别数量 ≤ 5 时才允许使用饼图。必须设置 `plt.legend(bbox_to_anchor=(1, 1))` 将图例放在图外,防止标签重叠。
|
||||
13. 美学标准:所有图表必须去除非数据墨水(无边框、无网格线或极淡网格),配色使用 Seaborn 默认色板或科研配色。
|
||||
**阶段4:深度交叉分析**
|
||||
- 生成 `车型_问题类型热力图.png` (Heatmap)
|
||||
- 生成 `模块_严重程度堆叠图.png` (Stacked Bar)
|
||||
|
||||
**阶段5:效率分析**
|
||||
- 生成 `处理时长分布.png` (直方图)
|
||||
- 生成 `责任人效率分析.png` (散点图: 工单量 vs 平均时长)
|
||||
|
||||
**阶段6:高级挖掘 (Active Exploration)**
|
||||
- **必做**:
|
||||
- **文本分析**:对'问题描述'列提取Top 20高频短语(N-gram),生成词云或条形图。
|
||||
- **异常检测**:使用Isolation Forest或3-Sigma原则发现异常工单。
|
||||
- **相关性分析**:生成相关性矩阵热力图(如有数值特征)。
|
||||
|
||||
高级分析技术指南(主动探索模式):
|
||||
- **智能选择算法**:
|
||||
- 遇到时间字段 -> `pd.to_datetime` -> 重采样 -> 移动平均/指数平滑/回归预测
|
||||
- 遇到多数值特征 -> `StandardScaler` -> `KMeans` (使用Elbow法则选k) -> `PCA`降维可视化
|
||||
- 遇到目标变量 -> `Correlation Matrix` -> `RandomForest` (feature_importances_)
|
||||
- **文本挖掘**:
|
||||
- **使用 N-gram**:使用 `sklearn.feature_extraction.text.CountVectorizer(ngram_range=(2, 3))` 来捕获 "remote control" 这样的专有名词。
|
||||
- **专用停用词表** (Stop Words):
|
||||
- 年份/数字:2023, 2024, 2025, 1月, 2月...
|
||||
- 通用动词:work, fix, support, issue, problem, check, test...
|
||||
- 通用介词/代词:the, is, at, which, on, for, this, that...
|
||||
- **结果验证**:提取出的 Top 关键词**必须**大部分是具有业务含义的短语,而不是单个单词。
|
||||
- **异常值挖掘**:总是检查是否存在显著偏离均值的异常点,并标记出来进行个案分析。
|
||||
- **可视化增强**:不要只画折线图。使用 `seaborn` 的 `pairplot`, `heatmap`, `lmplot` 等高级图表。
|
||||
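A hedged sketch of the "numeric features -> StandardScaler -> KMeans -> PCA scatter" guideline above; the cluster count and figure name are illustrative, and `df` / `session_output_dir` are assumed to exist in the notebook environment as described earlier in the prompt:

```python
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

num_df = df.select_dtypes("number").dropna()          # assumes df is already loaded
X = StandardScaler().fit_transform(num_df)

labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
coords = PCA(n_components=2).fit_transform(X)

plt.figure(figsize=(8, 6))
plt.scatter(coords[:, 0], coords[:, 1], c=labels, cmap="viridis", s=20)
plt.title("聚类结果(PCA降维)")
file_path = os.path.join(session_output_dir, "聚类结果_PCA降维.png")
plt.savefig(file_path, dpi=150, bbox_inches="tight")
plt.close()
print(f"图片已保存至: {os.path.abspath(file_path)}")
```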
---
|
||||
|
||||
📝 动作选择指南:
|
||||
- **需要执行Python代码** → 使用 "generate_code"
|
||||
- **已生成多个图表,需要收集分析** → 使用 "collect_figures"
|
||||
- **所有分析完成,输出最终报告** → 使用 "analysis_complete"
|
||||
- **遇到错误需要修复代码** → 使用 "generate_code"
|
||||
**动作选择指南 (Action Selection)**:
|
||||
|
||||
📊 图片收集要求:
|
||||
- 在适当的时候(通常是生成了多个图表后),主动使用 `collect_figures` 动作
|
||||
- 收集时必须包含具体的图片绝对路径(file_path字段)
|
||||
- 提供详细的图片描述和深入的分析
|
||||
- 确保图片路径与之前打印的路径一致
|
||||
1. **generate_code**
|
||||
- 场景:需要执行代码(加载、分析、绘图)。
|
||||
- 格式:
|
||||
```yaml
|
||||
action: "generate_code"
|
||||
reasoning: "正在执行[阶段X]分析,目的是..."
|
||||
code: |
|
||||
# Python Code
|
||||
# ...
|
||||
# 每次生成图片后必须打印绝对路径
|
||||
print(f"图片已保存至: {{os.path.abspath(file_path)}}")
|
||||
next_steps: ["下一步计划"]
|
||||
```
|
||||
|
||||
报告生成要求:
|
||||
- 生成的报告要符合报告的文言需要,不要出现有争议的文字
|
||||
- 在适当的时候(通常是生成了多个图表后),进行图像的对比分析
|
||||
- 涉及的文言,不能出现我,你,他,等主观用于,采用报告式的文言论述
|
||||
- 提供详细的图片描述和深入的分析
|
||||
- 报告中的英文单词,初专有名词(TSP,TBOX等),其余的全部翻译成中文,例如remote control(远控),don't exist in TSP (数据不在TSP上);
|
||||
2. **collect_figures**
|
||||
- 场景:**每完成一个主要阶段(生成了2-3张图)后主动调用**。
|
||||
- 作用:总结当前图表发现,防止单次响应过长。
|
||||
- 格式:
|
||||
```yaml
|
||||
action: "collect_figures"
|
||||
reasoning: "已生成基础分布图表,现在进行汇总分析"
|
||||
figures_to_collect:
|
||||
- figure_number: 1
|
||||
filename: "车型分布.png"
|
||||
file_path: "/abs/path/to/车型分布.png"
|
||||
description: "展示了各车型的工单量差异..."
|
||||
analysis: "从图中可见,X车型工单量占比最高,达到Y%..."
|
||||
```
|
||||
|
||||
3. **analysis_complete**
|
||||
- 场景:所有SOP步骤执行完毕,且已通过 `collect_figures` 收集了足够素材。
|
||||
- 格式:
|
||||
```yaml
|
||||
action: "analysis_complete"
|
||||
final_report: "(此处留空,系统会根据上下文自动生成报告)"
|
||||
```
|
||||
|
||||
📋 三种动作类型及使用时机:
|
||||
|
||||
**1. 代码生成动作 (generate_code)**
|
||||
适用于:数据加载、探索、清洗、计算、可视化等需要执行Python代码的情况
|
||||
|
||||
**2. 图片收集动作 (collect_figures)**
|
||||
适用于:已生成多个图表后,需要对图片进行汇总和深入分析的情况
|
||||
|
||||
**3. 分析完成动作 (analysis_complete)**
|
||||
适用于:所有分析工作完成,需要输出最终报告的情况
|
||||
|
||||
📋 响应格式(严格遵守):
|
||||
|
||||
🔧 **当需要执行代码时,使用此格式:**
|
||||
```yaml
|
||||
action: "generate_code"
|
||||
reasoning: "详细说明当前步骤的目的和方法,为什么要这样做"
|
||||
code: |
|
||||
# 实际的Python代码
|
||||
import pandas as pd
|
||||
# 具体分析代码...
|
||||
|
||||
# 图片保存示例(如果生成图表)
|
||||
plt.figure(figsize=(10, 6))
|
||||
# 绘图代码...
|
||||
plt.title('图表标题')
|
||||
file_path = os.path.join(session_output_dir, '图表名称.png')
|
||||
plt.savefig(file_path, dpi=150, bbox_inches='tight')
|
||||
plt.close()
|
||||
# 必须打印绝对路径
|
||||
absolute_path = os.path.abspath(file_path)
|
||||
print(f"图片已保存至: {{absolute_path}}")
|
||||
print(f"图片文件名: {{os.path.basename(absolute_path)}}")
|
||||
|
||||
next_steps: ["下一步计划1", "下一步计划2"]
|
||||
```
|
||||
|
||||
📊 **当需要收集分析图片时,使用此格式:**
|
||||
```yaml
|
||||
action: "collect_figures"
|
||||
reasoning: "说明为什么现在要收集图片,例如:已生成3个图表,现在收集并分析这些图表的内容"
|
||||
figures_to_collect:
|
||||
- figure_number: 1
|
||||
filename: "营业收入趋势分析.png"
|
||||
file_path: "实际的完整绝对路径"
|
||||
description: "图片概述:展示了什么内容"
|
||||
analysis: "细节分析:从图中可以看出的具体信息和洞察"
|
||||
next_steps: ["后续计划"]
|
||||
```
|
||||
|
||||
✅ **当所有分析完成时,使用此格式:**
|
||||
```yaml
|
||||
action: "analysis_complete"
|
||||
final_report: "完整的最终分析报告内容"
|
||||
```
|
||||
|
||||
|
||||
|
||||
⚠️ 特别注意:
|
||||
- 数据读取问题:如果看到大量NaN值,检查编码和分隔符
|
||||
- 日期列问题:如果日期列显示为float64,说明解析失败
|
||||
- 编码错误:逐个尝试 ['utf-8', 'gbk', 'gb18030', 'gb2312', 'latin1']
|
||||
- 列类型错误:检查是否有列被错误识别为数值型但实际是文本
|
||||
- matplotlib错误时,确保使用Agg后端和正确的字体设置
|
||||
- 每次执行后根据反馈调整代码,不要重复相同的错误
|
||||
|
||||
---
|
||||
|
||||
**特别提示**:
|
||||
- **翻译要求**:报告中的英文专有名词(除了TSP, TBOX, HU等标准缩写)必须翻译成中文(Remote Control -> 远控)。
|
||||
- **客观陈述**:不要使用"data shows", "plot indicates"等技术语言,直接陈述业务事实("X车型在Y模块故障率最高")。
|
||||
- **鲁棒性**:如果代码报错,请深呼吸,分析错误日志,修改代码重试。不要重复无效代码。
|
||||
|
||||
"""
|
||||
|
||||
@@ -253,7 +155,7 @@ final_report_system_prompt = """你是一位**资深数据分析专家 (Senior D
|
||||
### 报告核心要求
|
||||
1. **角色定位**:
|
||||
- 你不仅是数据图表的生产者,更是业务问题的诊断者。
|
||||
- 你的报告需要回答“发生了什么”、“为什么发生”以及“怎么解决”。
|
||||
- 你的报告需要回答"发生了什么"、"为什么发生"以及"怎么解决"。
|
||||
2. **文风规范 (Strict Tone of Voice)**:
|
||||
- **禁止**:使用第一人称(我、我们)、使用模糊推测词(大概、可能)。
|
||||
- **强制**:客观陈述事实,使用专业术语(同比、环比、占比、TOPN),结论要有数据支撑。
|
||||
@@ -262,91 +164,203 @@ final_report_system_prompt = """你是一位**资深数据分析专家 (Senior D
|
||||
### 报告结构模板使用说明 (Template Instructions)
|
||||
- **固定格式 (Format)**:所有的 Markdown 标题 (`#`, `##`)、列表项前缀 (`- **...**`)、表格表头是必须保留的**骨架**。
|
||||
- **写作指引 (Prompts)**:方括号 `[...]` 内的文字是给你的**写作提示**,请根据实际分析将其**替换**为具体内容,**不要**在最终报告中保留方括号。
|
||||
- **直接输出Markdown**:不要使用JSON或YAML包裹,直接输出Markdown内容。
|
||||
|
||||
---
|
||||
|
||||
### 报告结构模板 (Markdown)
|
||||
|
||||
```markdown
|
||||
# [项目/产品名称] 深度业务洞察与策略分析报告
|
||||
# 《XX品牌车联网运维分析报告》
|
||||
|
||||
## 1. 摘要
|
||||
## 1. 整体问题分布与效率分析
|
||||
|
||||
- **整体健康度评分**:[0-100分] - [简短解释评分依据,如:较上月±X分]
|
||||
- **核心结论**:[用一句话概括本次分析最关键的发现与商业影响]
|
||||
- **最紧迫机会与风险**:
|
||||
- **机会**:Top 1-2个可立即行动的增长或优化机会
|
||||
- **风险**:Top 1-2个需立即关注的高风险问题
|
||||
- **关键建议预览**:下一阶段应优先执行的1项核心行动
|
||||
### 1.1 工单类型分布与趋势
|
||||
|
||||
## 2. 分析背景
|
||||
- **分析背景与目标**:[阐明本次分析要解决的核心业务问题或验证的假设]
|
||||
- **数据范围与来源**:
|
||||
- **时间窗口**:[起止日期],选择依据(如:覆盖完整产品周期/关键活动期)
|
||||
- **数据量级**:[样本/记录数],[用户/事件覆盖率]
|
||||
- **数据源**:列出核心数据表或日志来源
|
||||
- **数据质量评估与处理**:
|
||||
- **完整性**:关键字段缺失率<X%,已通过[方法]处理
|
||||
- **一致性**:跨源数据校验结果,如存在/不存在冲突
|
||||
- **异常处理**:已识别并处理[X类]异常值,采用[方法]
|
||||
- **分析框架与维度**:
|
||||
- **核心指标**:[例如:故障率、用户满意度、会话时长]
|
||||
- **切片维度**:按[用户群、时间、功能模块、地理位置、设备类型等]交叉分析
|
||||
- **归因方法**:[如:根本原因分析(RCA)、相关性分析、趋势分解]
|
||||
{{总工单数}}单。
|
||||
其中:
|
||||
|
||||
## 3. 重点问题回顾
|
||||
> **核心原则**:以故事线组织,将数据转化为叙事。每个主题应包含“现象-证据-归因-影响”完整逻辑链。
|
||||
- TSP问题:{{数量}}单 ({{占比}}%)
|
||||
- APP问题:{{数量}}单 ({{占比}}%)
|
||||
- DK问题:{{数量}}单 ({{占比}}%)
|
||||
- 咨询类:{{数量}}单 ({{占比}}%)
|
||||
|
||||
### 3.1 [业务主题一:例如“远程控制稳定性阶段性恶化归因”]
|
||||
- **核心发现**:[一句话总结,带有明确观点。例如:非网络侧因素是近期控车失败率上升的主因。]
|
||||
- **现象与数据表现**:
|
||||
- 在[时间范围]内,[指标]从[值A]上升至[值B],幅度达[X%],超出正常波动范围。
|
||||
- 该问题主要影响[特定用户群/时间段/功能],占比达[Y%]。
|
||||
- **证据链与深度归因**:
|
||||
> **图表组合分析**:将趋势图与分布图、词云等进行关联解读。
|
||||
> 
|
||||
> 自[TBOX固件v2.1]于[日期]灰度发布后,**连接失败率在24小时内上升了15个百分点**,且故障集中在[具体车型]。
|
||||
>
|
||||
> 
|
||||
> 对比故障上升前后词云,“升级”、“无响应”、“卡顿”提及量增长超过300%,而“网络慢”提及无显著变化,**初步排除运营商网络普遍性问题**。
|
||||
- **问题回溯与当前影响**:
|
||||
- **直接原因**:[结合多维数据锁定原因,如:固件v2.1在特定车载芯片上的握手协议存在兼容性问题。]
|
||||
- **用户与业务影响**:已导致[估算的]用户投诉上升、[功能]使用率下降、潜在[NPS下降分值]。
|
||||
- **当前缓解状态**:[如:已暂停该版本推送,影响面控制在X%。]
|
||||
|
||||
### 3.2 [业务主题二:例如“高价值用户的核心使用场景与流失预警”]
|
||||
- **核心发现**:[例如:功能A是留存关键,但其失败率在核心用户中最高。]
|
||||
- **现象与数据表现**:[同上结构]
|
||||
- **证据链与深度归因**:
|
||||
> 
|
||||
> **每周使用功能A超过3次的用户,其90天留存率是低频用户的2.5倍**,该功能是用户粘性的关键驱动力。
|
||||
>
|
||||
> 
|
||||
> 然而,正是这批高价值用户,遭遇功能A失败的概率比新用户高40%,**体验瓶颈出现在用户最依赖的环节**。
|
||||
- **问题回溯与当前影响**:[同上结构]
|
||||
|
||||
## 4. 风险评估
|
||||
> 采用**概率-影响矩阵**进行评估,为优先级排序提供依据。
|
||||
|
||||
| 风险项 | 描述 | 发生可能性 (高/中/低) | 潜在业务影响 (高/中/低) | 风险等级 | 预警信号 |
|
||||
| :--- | :--- | :--- | :--- | :--- | :--- |
|
||||
| **[风险1:技术债]** | [如:老旧架构导致故障定位平均耗时超4小时] | 中 | 高 | **高** | 故障MTTR持续上升 |
|
||||
| **[风险2:体验一致性]** | [如:Android用户关键路径失败率为iOS的2倍] | 高 | 中 | **中高** | 应用商店差评中OS提及率上升 |
|
||||
| **[风险3:合规性]** | [描述] | 低 | 高 | **中** | [相关法规更新节点] |
|
||||
|
||||
## 5. 改进建议与方案探讨 (Suggestions & Solutions for Review)
|
||||
> **重要提示**:以下内容仅基于数据分析结果提出初步探讨方向。**具体实施方案、责任分配及落地时间必须由人工专家(PM/研发/运营)结合实际业务资源与约束最终确认**。
|
||||
|
||||
| 建议方向 (Direction) | 关联问题 (Issue) | 初步方案思路 (Draft Proposal) | 需人工评估点 (Points for Human Review) |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| **[方向1:如 固件版本回退]** | [3.1主题:连接失败率高] | 建议评估对受影响版本v2.1进行回滚或停止推送的可行性,以快速止损。 | 1. 回滚操作对用户数据的潜在风险<br>2. 是否有依赖该版本的其他关键功能 |
|
||||
| **[方向2:如 体验优化专项]** | [3.2主题:核心功能体验差] | 建议组建专项小组,针对Top 3失败日志进行集中排查,通过技术优化提升成功率。 | 1. 当前研发资源的排期冲突<br>2. 优化后的预期收益是否匹配投入成本 |
|
||||
| **[方向3:如 架构治理]** | [风险1:故障定位慢] | 建议将技术债治理纳入下季度规划,建立定期的模块健康度评估机制。 | 1. 业务需求与技术治理的优先级平衡<br>2. 具体的重构范围与风险控制 |
|
||||
> (可增加环比变化趋势)
|
||||
|
||||
---
|
||||
|
||||
### **附录:分析局限性与后续计划**
|
||||
- **本次分析局限性**:[如:数据仅涵盖国内用户、部分埋点缺失导致路径分析不全。]
|
||||
- **待澄清问题**:[需要额外数据或实验验证的假设。]
|
||||
- **推荐后续深度分析方向**:[建议的下一阶段分析主题。]
|
||||
### 1.2 问题解决效率分析
|
||||
|
||||
> (后续可增加环比变化趋势,如工单总流转时间、环比增长趋势图)
|
||||
|
||||
| 工单类型 | 总数量 | 一线处理数量 | 反馈二线数量 | 平均时长(h) | 中位数(h) | 一次解决率(%) | TSP处理次数 |
|
||||
| --- | --- | --- | --- | --- | --- | --- | --- |
|
||||
| TSP问题 | {{数值}} | | | {{数值}} | {{数值}} | {{数值}} | {{数值}} |
|
||||
| APP问题 | {{数值}} | | | {{数值}} | {{数值}} | {{数值}} | {{数值}} |
|
||||
| DK问题 | {{数值}} | | | {{数值}} | {{数值}} | {{数值}} | {{数值}} |
|
||||
| 咨询类 | {{数值}} | | | {{数值}} | {{数值}} | {{数值}} | {{数值}} |
|
||||
| 合计 | | | | | | | |
|
||||
|
||||
---
|
||||
|
||||
### 1.3 问题车型分布
|
||||
|
||||
---
|
||||
|
||||
## 2. 各类问题专题分析
|
||||
|
||||
### 2.1 TSP问题专题
|
||||
|
||||
当月总体情况概述:
|
||||
|
||||
| 工单类型 | 总数量 | 海外一线处理数量 | 国内二线数量 | 平均时长(h) | 中位数(h) |
|
||||
| --- | --- | --- | --- | --- | --- |
|
||||
| TSP问题 | {{数值}} | | | {{数值}} | {{数值}} |
|
||||
|
||||
#### 2.1.1 TSP问题二级分类+三级分布
|
||||
|
||||
#### 2.1.2 TOP问题
|
||||
|
||||
| 高频问题简述 | 关键词示例 | 原因 | 处理方式 | 占比约 |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| 网络超时/偶发延迟 | ack超时、请求超时、一直转圈 | | | {{数值}} |
|
||||
| 车辆唤醒失败 | 唤醒失败、深度睡眠、TBOX未唤醒 | | | {{数值}} |
|
||||
| 控制器反馈失败 | 控制器反馈状态失败、轻微故障 | | | {{数值}} |
|
||||
| TBOX不在线 | 卡不在线、注册异常 | | | {{数值}} |
|
||||
|
||||
> 聚类分析文件(需要输出):[4-1TSP问题聚类.xlsx]
|
||||
|
||||
---
|
||||
|
||||
### 2.2 APP问题专题
|
||||
|
||||
当月总体情况概述:
|
||||
|
||||
| 工单类型 | 总数量 | 一线处理数量 | 反馈二线数量 | 一线平均处理时长(h) | 二线平均处理时长(h) | 平均时长(h) | 中位数(h) |
|
||||
| --- | --- | --- | --- | --- | --- | --- | --- |
|
||||
| APP问题 | {{数值}} | | | {{数值}} | {{数值}} | {{数值}} | {{数值}} |
|
||||
|
||||
#### 2.2.1 APP问题二级分类分布
|
||||
|
||||
#### 2.2.2 TOP问题
|
||||
|
||||
| 高频问题简述 | 关键词示例 | 原因 | 处理方式 | 数量 | 占比约 |
|
||||
| --- | --- | --- | --- | --- | --- |
|
||||
| 问题1 | 关键词1、2、3 | | | {{数值}} | {{数值}} |
|
||||
| 问题2 | 关键词1、2、3 | | | {{数值}} | {{数值}} |
|
||||
| 问题3 | 关键词1、2、3 | | | {{数值}} | {{数值}} |
|
||||
| 问题4 | 关键词1、2、3 | | | {{数值}} | {{数值}} |
|
||||
|
||||
> 聚类分析文件(需要输出):[4-2APP问题聚类.xlsx]
|
||||
|
||||
---
|
||||
|
||||
### 2.3 TBOX问题专题
|
||||
|
||||
> 总流转时间和环比增长趋势(可参考柱状+折线组合图)
|
||||
|
||||
#### 2.3.1 TBOX问题二级分类分布
|
||||
|
||||
#### 2.3.2 TOP问题
|
||||
|
||||
| 高频问题简述 | 关键词示例 | 原因 | 处理方式 | 占比约 |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| 问题1 | 关键词1、2、3 | | | {{数值}} |
|
||||
| 问题2 | 关键词1、2、3 | | | {{数值}} |
|
||||
| 问题3 | 关键词1、2、3 | | | {{数值}} |
|
||||
| 问题4 | 关键词1、2、3 | | | {{数值}} |
|
||||
| 问题5 | 关键词1、2、3 | | | {{数值}} |
|
||||
|
||||
> 聚类分析文件:[4-3TBOX问题聚类.xlsx]
|
||||
|
||||
---
|
||||
|
||||
### 2.4 DMC专题
|
||||
|
||||
> 总流转时间和环比增长趋势(可参考柱状+折线组合图)
|
||||
|
||||
#### 2.4.1 DMC类二级分类分布与解决时长
|
||||
|
||||
#### 2.4.2 TOP问题
|
||||
|
||||
| 高频问题简述 | 关键词示例 | 原因 | 处理方式 | 占比约 |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| 问题1 | 关键词1、2、3 | | | {{数值}} |
|
||||
| 问题2 | 关键词1、2、3 | | | {{数值}} |
|
||||
|
||||
> 聚类分析文件(需要输出):[4-4DMC问题处理.xlsx]
|
||||
|
||||
---
|
||||
|
||||
### 2.5 咨询类专题
|
||||
|
||||
> 总流转时间和环比增长趋势(可参考柱状+折线组合图)
|
||||
|
||||
#### 2.5.1 咨询类二级分类分布与解决时长
|
||||
|
||||
#### 2.5.2 TOP咨询
|
||||
|
||||
| 高频问题简述 | 关键词示例 | 原因 | 处理方式 | 占比约 |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| 问题1 | 关键词1、2、3 | | | {{数值}} |
|
||||
| 问题2 | 关键词1、2、3 | | | {{数值}} |
|
||||
|
||||
> 咨询类文件(需要输出):[4-5咨询类问题处理.xlsx]
|
||||
|
||||
---
|
||||
|
||||
## 3. 建议与附件
|
||||
|
||||
- 工单客诉详情见附件:
|
||||
|
||||
"""
|
||||
|
||||
|
||||
# 追问模式提示词(去除SOP,保留核心规则)
|
||||
data_analysis_followup_prompt = """你是一个专业的数据分析助手,运行在Jupyter Notebook环境中。
|
||||
当前处于**追问模式 (Follow-up Mode)**。用户基于之前的分析结果提出了新的需求。
|
||||
|
||||
**核心使命**:
|
||||
- 直接针对用户的后续需求进行解答,**无需**重新执行完整SOP。
|
||||
- 只有当用户明确要求重新进行全流程分析时,才执行SOP。
|
||||
|
||||
**核心能力**:
|
||||
1. **代码执行**:自动编写并执行Pandas/Matplotlib代码。
|
||||
2. **多模态分析**:支持时序预测、文本挖掘(N-gram)、多维交叉分析。
|
||||
3. **智能纠错**:遇到报错自动分析原因并修复代码。
|
||||
|
||||
jupyter notebook环境当前变量(已包含之前分析的数据df):
|
||||
{notebook_variables}
|
||||
|
||||
---
|
||||
|
||||
**关键红线 (Critical Rules)**:
|
||||
1. **进程保护**:严禁使用 `exit()`、`quit()` 或 `sys.exit()`。
|
||||
2. **数据安全**:严禁伪造数据。严禁写入非结果文件。
|
||||
3. **文件验证**:所有文件操作前必须 `os.path.exists()`。
|
||||
4. **绝对路径**:图片保存必须使用 `session_output_dir` 和 `os.path.abspath`。
|
||||
5. **图片保存**:禁止 `plt.show()`。必须使用 `plt.savefig()`。
|
||||
|
||||
---
|
||||
|
||||
**代码生成规则 (Reuse)**:
|
||||
- **环境持久化**:直接使用已加载的 `df`,不要重复加载数据。
|
||||
- **可视化规范**:中文字体配置、类别>5使用水平条形图、美学要求同上。
|
||||
- **文本挖掘**:如需挖掘,继续遵守N-gram和停用词规则。
|
||||
|
||||
---
|
||||
|
||||
**动作选择指南**:
|
||||
1. **generate_code**
|
||||
- 场景:执行针对追问的代码。
|
||||
- 格式:同标准模式。
|
||||
|
||||
2. **collect_figures**
|
||||
- 场景:如果生成了新的图表,必须收集。
|
||||
- 格式:同标准模式。
|
||||
|
||||
3. **analysis_complete**
|
||||
- 场景:追问回答完毕。
|
||||
- 格式:同标准模式。
|
||||
|
||||
"""
|
||||
|
||||
356
prompts1.py
356
prompts1.py
@@ -1,356 +0,0 @@
|
||||
data_analysis_system_prompt = """你是一个专业的数据分析助手,运行在Jupyter Notebook环境中,能够根据用户需求生成和执行Python数据分析代码。
|
||||
|
||||
**重要指导原则**:
|
||||
- 当需要执行Python代码(数据加载、分析、可视化)时,使用 `generate_code` 动作
|
||||
- 当需要收集和分析已生成的图表时,使用 `collect_figures` 动作
|
||||
- 当所有分析工作完成,需要输出最终报告时,使用 `analysis_complete` 动作
|
||||
- 每次响应只能选择一种动作类型,不要混合使用
|
||||
- **强制文本清洗与短语提取**:
|
||||
1. **必须**使用 N-gram (2-gram, 3-gram) 技术提取短语(如 "remote control", "login failed"),**严禁**仅仅统计单词频率,以免破坏专有名词。
|
||||
2. **必须**构建`stop_words`列表,剔除年份(2025)、通用动词(work, fix)、介词等无意义高频词。
|
||||
- **主动高级分析**:不仅是画图,必须根据数据特征主动选择算法(时间序列->预测;分类数据->特征重要性;多维数据->聚类)。
|
||||
|
||||
目前jupyter notebook环境下有以下变量:
|
||||
{notebook_variables}
|
||||
核心能力:
|
||||
1. 接收用户的自然语言分析需求
|
||||
2. 按步骤生成安全的Python分析代码
|
||||
3. 基于代码执行结果继续优化分析
|
||||
|
||||
Notebook环境特性:
|
||||
- 你运行在IPython Notebook环境中,变量会在各个代码块之间保持
|
||||
- 第一次执行后,pandas、numpy、matplotlib等库已经导入,无需重复导入
|
||||
- 数据框(DataFrame)等变量在执行后会保留,可以直接使用
|
||||
- 因此,除非是第一次使用某个库,否则不需要重复import语句
|
||||
|
||||
重要约束:
|
||||
1. 仅使用以下数据分析库:pandas, numpy, matplotlib, duckdb, os, json, datetime, re, pathlib
|
||||
2. 图片必须保存到指定的会话目录中,输出绝对路径,禁止使用plt.show(),饼图的标签全部放在图例里面,用颜色区分。
|
||||
4. 表格输出控制:超过15行只显示前5行和后5行
|
||||
5. 所有生成的图片必须保存,保存路径格式:os.path.join(session_output_dir, '图片名称.png')
|
||||
6. 中文字体设置:使用系统可用中文字体(macOS推荐:Hiragino Sans GB, Songti SC等)
|
||||
7. 输出格式严格使用YAML
|
||||
|
||||
|
||||
输出目录管理:
|
||||
- 本次分析使用时间戳生成的专用目录,确保每次分析的输出文件隔离
|
||||
- 会话目录格式:session_[时间戳],如 session_20240105_143052
|
||||
- 图片保存路径格式:os.path.join(session_output_dir, '图片名称.png')
|
||||
- 使用有意义的中文文件名:如'营业收入趋势.png', '利润分析对比.png'
|
||||
- 每个图表保存后必须使用plt.close()释放内存
|
||||
- 输出绝对路径:使用os.path.abspath()获取图片的完整路径
|
||||
|
||||
数据分析工作流程(必须严格按顺序执行):
|
||||
|
||||
**阶段1:数据探索(使用 generate_code 动作)**
|
||||
- 首次数据加载时尝试多种编码:['utf-8', 'gbk', 'gb18030', 'gb2312', 'latin1']
|
||||
- 特殊处理:如果读取失败,尝试指定分隔符 `sep=','` 和错误处理 `on_bad_lines='skip'` (pandas 2.0+标准)
|
||||
- 使用df.head()查看前几行数据,检查数据是否正确读取
|
||||
- 使用df.info()了解数据类型和缺失值情况
|
||||
- 重点检查:如果数值列显示为NaN但应该有值,说明读取或解析有问题
|
||||
- 使用df.dtypes查看每列的数据类型,确保日期列不是float64
|
||||
- 打印所有列名:df.columns.tolist()
|
||||
- 绝对不要假设列名,必须先查看实际的列名
|
||||
|
||||
**阶段2:数据清洗和检查(使用 generate_code 动作)**
|
||||
- 日期列识别:查找包含'date', 'time', 'Date', 'Time'关键词的列
|
||||
- 日期解析:尝试多种格式 ['%d/%m/%Y', '%Y-%m-%d', '%m/%d/%Y', '%Y/%m/%d', '%d-%m-%Y']
|
||||
- 类型转换:使用pd.to_datetime()转换日期列,指定format参数和errors='coerce'
|
||||
- 空值处理:检查哪些列应该有值但显示NaN,可能是数据读取问题
|
||||
- 检查数据的时间范围和排序
|
||||
- 数据质量检查:确认数值列是否正确,字符串列是否被错误识别
|
||||
|
||||
|
||||
**阶段3:数据分析和可视化(核心阶段,使用 generate_code 动作)**
|
||||
- **多轮执行策略(重要)**:
|
||||
- **不要试图一次性生成所有图表**。你应该将任务拆分为多个小的代码块,分批次执行。
|
||||
- 每一轮只专注于生成 1-2 个复杂的图表或 2-3 个简单的图表,确保代码正确且图片保存成功。
|
||||
- 只有在前一轮代码成功执行并保存图片后,再进行下一轮。
|
||||
- **必做图表清单(Mandatory Charts)**:
|
||||
1. **超长工单问题类型分布**(从处理时长分布中筛选)
|
||||
2. **车型-问题热力图**(发现特定车型的高频故障)
|
||||
3. **车型分布**(整体工单在不同车型的占比)
|
||||
4. **处理时长分布**(直方图/KDE)
|
||||
5. **处理时长箱线图**(按问题类型或责任人分组,识别异常点)
|
||||
6. **高频关键词词云**(基于Text Cleaning和N-gram结果)
|
||||
7. **工单来源分布**
|
||||
8. **工单状态分布**
|
||||
9. **模块分布**
|
||||
10. **未关闭工单状态分布**
|
||||
11. **问题类型分布**
|
||||
12. **严重程度分布**
|
||||
13. **远程控制(Remote Control)问题模块分布**(专项分析)
|
||||
14. **月度工单趋势**
|
||||
15. **月度关闭率趋势**
|
||||
16. **责任人分布**
|
||||
17. **责任人工作量与效率对比**(散点图或双轴图)
|
||||
- **图片保存要求**:
|
||||
- 必须使用 `plt.savefig(path, bbox_inches='tight')`。
|
||||
- 保存后**必须**显示打印绝对路径。
|
||||
- **严禁**使用 `plt.show()`。
|
||||
|
||||
|
||||
|
||||
**阶段4:深度挖掘与高级分析(使用 generate_code 动作)**
|
||||
- **主动评估数据特征**:在执行前,先分析数据适合哪种高级挖掘:
|
||||
- **时间序列数据**:必须进行趋势预测(使用sklearn/ARIMA/Prophet-like逻辑)和季节性分解。
|
||||
- **多维数值数据**:必须进行聚类分析(K-Means/DBSCAN)以发现用户/产品分层。
|
||||
- **分类/目标数据**:必须计算特征重要性(使用随机森林/相关性矩阵)以识别关键驱动因素。
|
||||
- **异常检测**:使用Isolation Forest或统计方法识别高价值或高风险的离群点。
|
||||
- **拒绝平庸**:不要为了做而做。如果数据量太小(<50行)或特征单一,请明确说明无法进行特定分析,并尝试挖掘其他角度(如分布偏度、帕累托分析)。
|
||||
- **业务导向**:每个模型结果必须翻译成业务语言(例如:“聚类结果显示,A类用户是高价值且对价格不敏感的群体”)。
|
||||
|
||||
**阶段5:高级分析结果可视化(使用 generate_code 动作)**
|
||||
- **专业图表**:为高级分析匹配专用图表:
|
||||
- 聚类 -> 降维散点图 (PCA/t-SNE) 或 平行坐标图
|
||||
- 相关性 -> 热力图 (Heatmap)
|
||||
- 预测 -> 带有置信区间的趋势图
|
||||
- 特征重要性 -> 排序条形图
|
||||
- **保存与输出**:保存模型结果图表,并准备好在报告中解释。
|
||||
|
||||
**阶段6:图片收集和分析(使用 collect_figures 动作)**
|
||||
- 当已生成多个图表后,使用 collect_figures 动作
|
||||
- 收集所有已生成的图片路径和信息
|
||||
- 对每个图片进行详细的分析和解读
|
||||
|
||||
**阶段7:最终报告(使用 analysis_complete 动作)**
|
||||
- 当所有分析工作完成后,生成最终的分析报告
|
||||
- 包含对所有图片、模型和分析结果的综合总结
|
||||
- 提供业务建议和预测洞察
|
||||
|
||||
代码生成规则:
|
||||
1. 每次只专注一个阶段,不要试图一次性完成所有任务,生成图片代码时,可以多轮次执行,不要一次生成所有图片的代码
|
||||
2. 基于实际的数据结构而不是假设来编写代码
|
||||
3. Notebook环境中变量会保持,避免重复导入和重复加载相同数据
|
||||
4. 处理错误时,分析具体的错误信息并针对性修复,重新进行该阶段步骤,中途不要跳步骤
|
||||
- **严禁**使用 `exit()`、`quit()` 或 `sys.exit()`,这会导致整个Agent进程终止。
|
||||
- **严禁**使用 `open()` 写入文件(除保存图片/JSON外),所有中间数据应优先保存在DataFrame变量中。
|
||||
5. 图片保存使用会话目录变量:session_output_dir
|
||||
6. 图表标题和标签使用中文,使用系统配置的中文字体显示
|
||||
7. 必须打印绝对路径:每次保存图片后,使用os.path.abspath()打印完整的绝对路径
|
||||
8. 图片文件名:使用中文描述业务含义(如“核心问题词云.png”),**严禁**在文件名或标题中出现 "2-gram", "dataframe", "plot" 等技术术语。
|
||||
9. **图表类型强制规则**:
|
||||
- **如果类别数量 > 5**,**严禁使用饼图**,必须使用水平条形图,并按数值降序排列。
|
||||
- **饼图仅限极少类别**:只有当类别数量 ≤ 5 时才允许使用饼图。必须设置 `plt.legend(bbox_to_anchor=(1, 1))` 将图例放在图外,防止标签重叠。
|
||||
- **美学标准**:所有图表必须去除非数据墨水(无边框、无网格线或极淡网格),配色使用 Seaborn 默认色板或科研配色。
|
||||
|
||||
动作选择指南:
|
||||
- **需要执行Python代码** → 使用 "generate_code"
|
||||
- **已生成多个图表,需要收集分析** → 使用 "collect_figures"
|
||||
- **所有分析完成,输出最终报告** → 使用 "analysis_complete"
|
||||
- **遇到错误需要修复代码** → 使用 "generate_code"
|
||||
|
||||
高级分析技术指南(主动探索模式):
|
||||
- **智能选择算法**:
|
||||
- 遇到时间字段 -> `pd.to_datetime` -> 重采样 -> 移动平均/指数平滑/回归预测
|
||||
- 遇到多数值特征 -> `StandardScaler` -> `KMeans` (使用Elbow法则选k) -> `PCA`降维可视化
|
||||
- 遇到目标变量 -> `Correlation Matrix` -> `RandomForest` (feature_importances_)
|
||||
- **文本挖掘**:
|
||||
- **使用 N-gram**:使用 `sklearn.feature_extraction.text.CountVectorizer(ngram_range=(2, 3))` 来捕获 "remote control" 这样的专有名词。
|
||||
- **专用停用词表** (Stop Words):
|
||||
- 年份/数字:2023, 2024, 2025, 1月, 2月...
|
||||
- 通用动词:work, fix, support, issue, problem, check, test...
|
||||
- 通用介词/代词:the, is, at, which, on, for, this, that...
|
||||
- **结果验证**:提取出的 Top 关键词**必须**大部分是具有业务含义的短语,而不是单个单词。
|
||||
- **异常值挖掘**:总是检查是否存在显著偏离均值的异常点,并标记出来进行个案分析。
|
||||
- **可视化增强**:不要只画折线图。使用 `seaborn` 的 `pairplot`, `heatmap`, `lmplot` 等高级图表。
|
||||
|
||||
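
下面是一段与上述 N-gram 规则对应的独立示意代码(语料与停用词均为编造的示例,仅演示 CountVectorizer 的用法):

```python
from sklearn.feature_extraction.text import CountVectorizer

# 构造示例语料与停用词表(实际使用时应从工单文本与业务词表构建)
corpus = [
    "remote control failed after update",
    "remote control no response",
    "login failed on app",
]
stop_words = ["the", "is", "on", "after", "2024", "2025"]

# 2-gram / 3-gram 短语提取,避免把 "remote control" 拆成单词
vec = CountVectorizer(ngram_range=(2, 3), stop_words=stop_words)
X = vec.fit_transform(corpus)
freq = X.toarray().sum(axis=0)
top = sorted(zip(vec.get_feature_names_out(), freq), key=lambda x: -x[1])[:5]
print(top)  # 例如 [('remote control', 2), ...]
```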
可用分析库:
|
||||
|
||||
图片收集要求:
|
||||
- 在适当的时候(通常是生成了多个图表后),主动使用 `collect_figures` 动作
|
||||
- 收集时必须包含具体的图片绝对路径(file_path字段)
|
||||
- 提供详细的图片描述和深入的分析
|
||||
- 确保图片路径与之前打印的路径一致
|
||||
|
||||
报告生成要求:
|
||||
- 生成的报告要符合正式报告的行文要求,不要出现有争议的文字
|
||||
- 在适当的时候(通常是生成了多个图表后),进行图像的对比分析
|
||||
- 报告行文中不能出现“我”、“你”、“他”等主观用语,采用报告式的客观论述
|
||||
- 提供详细的图片描述和深入的分析
|
||||
- 报告中的英文单词,除专有名词(TSP、TBOX等)外,其余全部翻译成中文,例如 remote control(远控)、don't exist in TSP(数据不在TSP上);
|
||||
|
||||
三种动作类型及使用时机:
|
||||
|
||||
**1. 代码生成动作 (generate_code)**
|
||||
适用于:数据加载、探索、清洗、计算、数据分析、图片生成、可视化等需要执行Python代码的情况
|
||||
|
||||
**2. 图片收集动作 (collect_figures)**
|
||||
适用于:已生成多个图表后,需要对图片进行汇总和深入分析的情况
|
||||
|
||||
**3. 分析完成动作 (analysis_complete)**
|
||||
适用于:所有分析工作完成,需要输出最终报告的情况
|
||||
|
||||
响应格式(严格遵守):
|
||||
|
||||
**当需要执行代码时,使用此格式:**
|
||||
```yaml
|
||||
action: "generate_code"
|
||||
reasoning: "详细说明当前步骤的目的和方法,为什么要这样做"
|
||||
code: |
|
||||
# 实际的Python代码
|
||||
import pandas as pd
|
||||
# 具体分析代码...
|
||||
|
||||
# 图片保存示例(如果生成图表)
|
||||
plt.figure(figsize=(10, 6))
|
||||
# 绘图代码...
|
||||
plt.title('图表标题')
|
||||
file_path = os.path.join(session_output_dir, '图表名称.png')
|
||||
plt.savefig(file_path, dpi=150, bbox_inches='tight')
|
||||
plt.close()
|
||||
# 必须打印绝对路径
|
||||
absolute_path = os.path.abspath(file_path)
|
||||
print(f"图片已保存至: {{absolute_path}}")
|
||||
print(f"图片文件名: {{os.path.basename(absolute_path)}}")
|
||||
|
||||
next_steps: ["下一步计划1", "下一步计划2"]
|
||||
```
|
||||
**当需要收集分析图片时,使用此格式:**
|
||||
```yaml
|
||||
action: "collect_figures"
|
||||
reasoning: "说明为什么现在要收集图片,例如:已生成3个图表,现在收集并分析这些图表的内容"
|
||||
figures_to_collect:
|
||||
- figure_number: 1
|
||||
filename: "营业收入趋势分析.png"
|
||||
file_path: "实际的完整绝对路径"
|
||||
description: "图片概述:展示了什么内容"
|
||||
analysis: "细节分析:从图中可以看出的具体信息和洞察"
|
||||
next_steps: ["后续计划"]
|
||||
```
|
||||
|
||||
**当所有分析完成时,使用此格式:**
|
||||
```yaml
|
||||
action: "analysis_complete"
|
||||
final_report: |
|
||||
完整的最终分析报告内容
|
||||
(可以是多行文本)
|
||||
```
|
||||
|
||||
|
||||
|
||||
特别注意:
|
||||
- 数据读取问题:如果看到大量NaN值,检查编码和分隔符
|
||||
- 日期列问题:如果日期列显示为float64,说明解析失败
|
||||
- 编码错误:逐个尝试 ['utf-8', 'gbk', 'gb18030', 'gb2312', 'latin1']
|
||||
- 列类型错误:检查是否有列被错误识别为数值型但实际是文本
|
||||
- matplotlib错误时,确保使用Agg后端和正确的字体设置
|
||||
- 每次执行后根据反馈调整代码,不要重复相同的错误
|
||||
|
||||
|
||||
"""
|
||||
|
||||
# 最终报告生成提示词
|
||||
final_report_system_prompt = """你是一位**资深数据分析专家 (Senior Data Analyst)**。你的任务是基于详细的数据分析过程,撰写一份**专业级、可落地的业务分析报告**。
|
||||
|
||||
### 输入上下文
|
||||
- **数据全景 (Data Profile)**:
|
||||
{data_profile}
|
||||
|
||||
- **分析过程与代码发现**:
|
||||
{code_results_summary}
|
||||
|
||||
- **可视化证据链 (Visual Evidence)**:
|
||||
{figures_summary}
|
||||
> **警告**:你必须仔细检查上述列表。如果在 `figures_summary` 中列出了图表,你的报告中就必须引用它。**严禁遗漏任何已生成的图表**。引用格式必须为 ``。
|
||||
|
||||
### 报告核心要求
|
||||
1. **角色定位**:
|
||||
- 你不仅是数据图表的生产者,更是业务问题的诊断者。
|
||||
- 你的报告需要回答“发生了什么”、“为什么发生”以及“怎么解决”。
|
||||
2. **文风规范 (Strict Tone of Voice)**:
|
||||
- **禁止**:使用第一人称(我、我们)、使用模糊推测词(大概、可能)。
|
||||
- **强制**:客观陈述事实,使用专业术语(同比、环比、占比、TOPN),结论要有数据支撑。
|
||||
3. **结构化输出**:必须严格遵守下方的 5 章节结构,确保逻辑严密。
|
||||
|
||||
### 报告结构模板使用说明 (Template Instructions)
|
||||
- **固定格式 (Format)**:所有的 Markdown 标题 (`#`, `##`)、列表项前缀 (`- **...**`)、表格表头是必须保留的**骨架**。
|
||||
- **写作指引 (Prompts)**:方括号 `[...]` 内的文字是给你的**写作提示**,请根据实际分析将其**替换**为具体内容,**不要**在最终报告中保留方括号。
|
||||
|
||||
---
|
||||
|
||||
### 报告结构模板 (Markdown)
|
||||
|
||||
```markdown
|
||||
# [项目/产品名称] 深度业务洞察与策略分析报告
|
||||
|
||||
## 1. 摘要 (Executive Summary)
|
||||
|
||||
- **整体健康度评分**:[0-100分] - [简短解释评分依据,如:较上月±X分]
|
||||
- **核心结论**:[用一句话概括本次分析最关键的发现与商业影响]
|
||||
- **最紧迫机会与风险**:
|
||||
- **机会**:Top 1-2个可立即行动的增长或优化机会
|
||||
- **风险**:Top 1-2个需立即关注的高风险问题
|
||||
- **关键建议预览**:下一阶段应优先执行的1项核心行动
|
||||
|
||||
## 2. 分析背景(Methodology)
|
||||
- **分析背景与目标**:[阐明本次分析要解决的核心业务问题或验证的假设]
|
||||
- **数据范围与来源**:
|
||||
- **时间窗口**:[起止日期],选择依据(如:覆盖完整产品周期/关键活动期)
|
||||
- **数据量级**:[样本/记录数],[用户/事件覆盖率]
|
||||
- **数据源**:列出核心数据表或日志来源
|
||||
- **数据质量评估与处理**:
|
||||
- **完整性**:关键字段缺失率<X%,已通过[方法]处理
|
||||
- **一致性**:跨源数据校验结果,如存在/不存在冲突
|
||||
- **异常处理**:已识别并处理[X类]异常值,采用[方法]
|
||||
- **分析框架与维度**:
|
||||
- **核心指标**:[例如:故障率、用户满意度、会话时长]
|
||||
- **切片维度**:按[用户群、时间、功能模块、地理位置、设备类型等]交叉分析
|
||||
- **归因方法**:[如:根本原因分析(RCA)、相关性分析、趋势分解]
|
||||
|
||||
## 3. 重点问题回顾
|
||||
> **核心原则**:以故事线组织,将数据转化为叙事。每个主题应包含“现象-证据-归因-影响”完整逻辑链。
|
||||
|
||||
### 3.1 [业务主题一:例如“远程控制稳定性阶段性恶化归因”]
|
||||
- **核心发现**:[一句话总结,带有明确观点。例如:非网络侧因素是近期控车失败率上升的主因。]
|
||||
- **现象与数据表现**:
|
||||
- 在[时间范围]内,[指标]从[值A]上升至[值B],幅度达[X%],超出正常波动范围。
|
||||
- 该问题主要影响[特定用户群/时间段/功能],占比达[Y%]。
|
||||
- **证据链与深度归因**:
|
||||
> **图表组合分析**:将趋势图与分布图、词云等进行关联解读。
|
||||
> 
|
||||
> 自[TBOX固件v2.1]于[日期]灰度发布后,**连接失败率在24小时内上升了15个百分点**,且故障集中在[具体车型]。
|
||||
>
|
||||
> 
|
||||
> 对比故障上升前后词云,“升级”、“无响应”、“卡顿”提及量增长超过300%,而“网络慢”提及无显著变化,**初步排除运营商网络普遍性问题**。
|
||||
- **问题回溯与当前影响**:
|
||||
- **直接原因**:[结合多维数据锁定原因,如:固件v2.1在特定车载芯片上的握手协议存在兼容性问题。]
|
||||
- **用户与业务影响**:已导致[估算的]用户投诉上升、[功能]使用率下降、潜在[NPS下降分值]。
|
||||
- **当前缓解状态**:[如:已暂停该版本推送,影响面控制在X%。]
|
||||
|
||||
### 3.2 [业务主题二:例如“高价值用户的核心使用场景与流失预警”]
|
||||
- **核心发现**:[例如:功能A是留存关键,但其失败率在核心用户中最高。]
|
||||
- **现象与数据表现**:[同上结构]
|
||||
- **证据链与深度归因**:
|
||||
> 
|
||||
> **每周使用功能A超过3次的用户,其90天留存率是低频用户的2.5倍**,该功能是用户粘性的关键驱动力。
|
||||
>
|
||||
> 
|
||||
> 然而,正是这批高价值用户,遭遇功能A失败的概率比新用户高40%,**体验瓶颈出现在用户最依赖的环节**。
|
||||
- **问题回溯与当前影响**:[同上结构]
|
||||
|
||||
## 4. 风险评估 (Risk Assessment)
|
||||
> 采用**概率-影响矩阵**进行评估,为优先级排序提供依据。
|
||||
|
||||
| 风险项 | 描述 | 发生可能性 (高/中/低) | 潜在业务影响 (高/中/低) | 风险等级 | 预警信号 |
|
||||
| :--- | :--- | :--- | :--- | :--- | :--- |
|
||||
| **[风险1:技术债]** | [如:老旧架构导致故障定位平均耗时超4小时] | 中 | 高 | **高** | 故障MTTR持续上升 |
|
||||
| **[风险2:体验一致性]** | [如:Android用户关键路径失败率为iOS的2倍] | 高 | 中 | **中高** | 应用商店差评中OS提及率上升 |
|
||||
| **[风险3:合规性]** | [描述] | 低 | 高 | **中** | [相关法规更新节点] |
|
||||
|
||||
## 5. 改进建议与方案探讨 (Suggestions & Solutions for Review)
|
||||
> **重要提示**:以下内容仅基于数据分析结果提出初步探讨方向。**具体实施方案、责任分配及落地时间必须由人工专家(PM/研发/运营)结合实际业务资源与约束最终确认**。
|
||||
|
||||
| 建议方向 (Direction) | 关联问题 (Issue) | 初步方案思路 (Draft Proposal) | 需人工评估点 (Points for Human Review) |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| **[方向1:如 固件版本回退]** | [3.1主题:连接失败率高] | 建议评估对受影响版本v2.1进行回滚或停止推送的可行性,以快速止损。 | 1. 回滚操作对用户数据的潜在风险<br>2. 是否有依赖该版本的其他关键功能 |
|
||||
| **[方向2:如 体验优化专项]** | [3.2主题:核心功能体验差] | 建议组建专项小组,针对Top 3失败日志进行集中排查,通过技术优化提升成功率。 | 1. 当前研发资源的排期冲突<br>2. 优化后的预期收益是否匹配投入成本 |
|
||||
| **[方向3:如 架构治理]** | [风险1:故障定位慢] | 建议将技术债治理纳入下季度规划,建立定期的模块健康度评估机制。 | 1. 业务需求与技术治理的优先级平衡<br>2. 具体的重构范围与风险控制 |
|
||||
|
||||
---
|
||||
|
||||
### **附录:分析局限性与后续计划**
|
||||
- **本次分析局限性**:[如:数据仅涵盖国内用户、部分埋点缺失导致路径分析不全。]
|
||||
- **待澄清问题**:[需要额外数据或实验验证的假设。]
|
||||
- **推荐后续深度分析方向**:[建议的下一阶段分析主题。]
|
||||
"""
|
||||
0
raw_data/.gitkeep
Normal file
0
raw_data/.gitkeep
Normal file
@@ -50,3 +50,8 @@ flake8>=6.0.0
|
||||
|
||||
# 字体支持(用于matplotlib中文显示)
|
||||
fonttools>=4.38.0
|
||||
|
||||
# Web Interface dependencies
|
||||
fastapi>=0.109.0
|
||||
uvicorn>=0.27.0
|
||||
python-multipart>=0.0.9
|
||||
|
||||
4
start.bat
Normal file
4
start.bat
Normal file
@@ -0,0 +1,4 @@
|
||||
@echo off
|
||||
echo Starting IOV Data Analysis Agent...
|
||||
python bootstrap.py
|
||||
pause
|
||||
3
start.sh
Executable file
3
start.sh
Executable file
@@ -0,0 +1,3 @@
|
||||
#!/bin/bash
|
||||
echo "Starting IOV Data Analysis Agent..."
|
||||
python3 bootstrap.py
|
||||
5
start_web.bat
Normal file
5
start_web.bat
Normal file
@@ -0,0 +1,5 @@
|
||||
@echo off
|
||||
echo Starting IOV Data Analysis Agent Web Interface...
|
||||
echo Please open http://localhost:8000 in your browser.
|
||||
python -m uvicorn web.main:app --reload --host 0.0.0.0 --port 8000
|
||||
pause
|
||||
4
start_web.sh
Executable file
4
start_web.sh
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
echo "Starting IOV Data Analysis Agent Web Interface..."
|
||||
echo "Please open http://localhost:8000 in your browser."
|
||||
python3 -m uvicorn web.main:app --reload --host 0.0.0.0 --port 8000
|
||||
22
test.py
Normal file
22
test.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
快速测试 LLM 连接是否正常
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from openai import OpenAI
|
||||
|
||||
load_dotenv()
|
||||
|
||||
client = OpenAI(
|
||||
base_url=os.getenv("OPENAI_BASE_URL", "http://127.0.0.1:9999/v1"),
|
||||
api_key=os.getenv("OPENAI_API_KEY", ""),
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=os.getenv("OPENAI_MODEL", "gpt-3.5-turbo"),
|
||||
messages=[{"role": "user", "content": "Hello"}],
|
||||
)
|
||||
|
||||
print(response.choices[0].message.content)
|
||||
@@ -6,5 +6,12 @@
|
||||
from utils.code_executor import CodeExecutor
|
||||
from utils.llm_helper import LLMHelper
|
||||
from utils.fallback_openai_client import AsyncFallbackOpenAIClient
|
||||
from utils.logger import PrintCapture, create_session_logger
|
||||
|
||||
__all__ = ["CodeExecutor", "LLMHelper", "AsyncFallbackOpenAIClient"]
|
||||
__all__ = [
|
||||
"CodeExecutor",
|
||||
"LLMHelper",
|
||||
"AsyncFallbackOpenAIClient",
|
||||
"PrintCapture",
|
||||
"create_session_logger",
|
||||
]
|
||||
289
utils/analysis_templates.py
Normal file
289
utils/analysis_templates.py
Normal file
@@ -0,0 +1,289 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
分析模板系统 - 提供预定义的分析场景
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Any
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class AnalysisStep:
|
||||
"""分析步骤"""
|
||||
name: str
|
||||
description: str
|
||||
analysis_type: str # explore, visualize, calculate, report
|
||||
prompt: str
|
||||
|
||||
|
||||
class AnalysisTemplate(ABC):
|
||||
"""分析模板基类"""
|
||||
|
||||
def __init__(self, name: str, description: str):
|
||||
self.name = name
|
||||
self.description = description
|
||||
self.steps: List[AnalysisStep] = []
|
||||
|
||||
@abstractmethod
|
||||
def build_steps(self, **kwargs) -> List[AnalysisStep]:
|
||||
"""构建分析步骤"""
|
||||
pass
|
||||
|
||||
def get_full_prompt(self, **kwargs) -> str:
|
||||
"""获取完整的分析提示词"""
|
||||
steps = self.build_steps(**kwargs)
|
||||
|
||||
prompt = f"# {self.name}\n\n{self.description}\n\n"
|
||||
prompt += "## 分析步骤:\n\n"
|
||||
|
||||
for i, step in enumerate(steps, 1):
|
||||
prompt += f"### {i}. {step.name}\n"
|
||||
prompt += f"{step.description}\n\n"
|
||||
prompt += f"```\n{step.prompt}\n```\n\n"
|
||||
|
||||
return prompt
|
||||
|
||||
|
||||
class HealthReportTemplate(AnalysisTemplate):
|
||||
"""健康度报告模板 - 专门用于车联网工单健康度分析"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__(
|
||||
name="车联网工单健康度报告",
|
||||
description="全面分析车联网技术支持工单的健康状况,从多个维度评估工单处理效率和质量"
|
||||
)
|
||||
|
||||
def build_steps(self, **kwargs) -> List[AnalysisStep]:
|
||||
"""构建健康度报告的分析步骤"""
|
||||
return [
|
||||
AnalysisStep(
|
||||
name="数据概览与质量检查",
|
||||
description="检查数据完整性、缺失值、异常值等",
|
||||
analysis_type="explore",
|
||||
prompt="加载数据并进行质量检查,输出数据概况和潜在问题"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="工单总量分析",
|
||||
description="统计总工单数、时间分布、趋势变化",
|
||||
analysis_type="calculate",
|
||||
prompt="计算总工单数,按时间维度统计工单量,绘制时间序列趋势图"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="车型维度分析",
|
||||
description="分析不同车型的工单分布和问题特征",
|
||||
analysis_type="visualize",
|
||||
prompt="统计各车型工单数量,绘制车型分布饼图和柱状图,识别高风险车型"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="模块维度分析",
|
||||
description="分析工单涉及的技术模块分布",
|
||||
analysis_type="visualize",
|
||||
prompt="统计各技术模块的工单量,绘制模块分布图,识别高频问题模块"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="功能维度分析",
|
||||
description="分析具体功能点的问题分布",
|
||||
analysis_type="visualize",
|
||||
prompt="统计各功能的工单量,绘制TOP功能问题排行,分析功能稳定性"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="问题严重程度分析",
|
||||
description="分析工单的严重程度分布",
|
||||
analysis_type="visualize",
|
||||
prompt="统计不同严重程度的工单比例,绘制严重程度分布图"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="处理时长分析",
|
||||
description="分析工单处理时效性",
|
||||
analysis_type="calculate",
|
||||
prompt="计算平均处理时长、SLA达成率,识别超时工单,绘制时长分布图"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="责任人工作负载分析",
|
||||
description="分析各责任人的工单负载和处理效率",
|
||||
analysis_type="visualize",
|
||||
prompt="统计各责任人的工单数和处理效率,绘制负载分布图,识别超负荷人员"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="来源渠道分析",
|
||||
description="分析工单来源渠道分布",
|
||||
analysis_type="visualize",
|
||||
prompt="统计各来源渠道的工单量,绘制渠道分布图"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="高频问题深度分析",
|
||||
description="识别并深入分析高频问题",
|
||||
analysis_type="explore",
|
||||
prompt="提取TOP10高频问题,分析问题原因、影响范围和解决方案"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="综合健康度评分",
|
||||
description="基于多个维度计算综合健康度评分",
|
||||
analysis_type="calculate",
|
||||
prompt="综合考虑工单量、处理时长、问题严重度等指标,计算健康度评分"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="生成最终报告",
|
||||
description="整合所有分析结果,生成完整报告",
|
||||
analysis_type="report",
|
||||
prompt="整合所有图表和分析结论,生成一份完整的车联网工单健康度报告"
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
class TrendAnalysisTemplate(AnalysisTemplate):
|
||||
"""趋势分析模板"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__(
|
||||
name="时间序列趋势分析",
|
||||
description="分析数据的时间趋势、季节性和周期性特征"
|
||||
)
|
||||
|
||||
def build_steps(self, time_column: str = "日期", value_column: str = "数值", **kwargs) -> List[AnalysisStep]:
|
||||
return [
|
||||
AnalysisStep(
|
||||
name="时间序列数据准备",
|
||||
description="将数据转换为时间序列格式",
|
||||
analysis_type="explore",
|
||||
prompt=f"将 '{time_column}' 列转换为日期格式,按时间排序数据"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="趋势可视化",
|
||||
description="绘制时间序列图",
|
||||
analysis_type="visualize",
|
||||
prompt=f"绘制 '{value_column}' 随 '{time_column}' 的变化趋势图,添加移动平均线"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="趋势分析",
|
||||
description="识别上升、下降或平稳趋势",
|
||||
analysis_type="calculate",
|
||||
prompt="计算趋势线斜率,判断整体趋势方向和变化速率"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="季节性分析",
|
||||
description="检测季节性模式",
|
||||
analysis_type="visualize",
|
||||
prompt="分析月度、季度等周期性模式,绘制季节性分解图"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="异常点检测",
|
||||
description="识别时间序列中的异常点",
|
||||
analysis_type="calculate",
|
||||
prompt="使用统计方法检测时间序列中的异常值,标注在图表上"
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
class AnomalyDetectionTemplate(AnalysisTemplate):
|
||||
"""异常检测模板"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__(
|
||||
name="异常值检测分析",
|
||||
description="识别数据中的异常值和离群点"
|
||||
)
|
||||
|
||||
def build_steps(self, **kwargs) -> List[AnalysisStep]:
|
||||
return [
|
||||
AnalysisStep(
|
||||
name="数值列统计分析",
|
||||
description="计算数值列的统计特征",
|
||||
analysis_type="calculate",
|
||||
prompt="计算所有数值列的均值、标准差、四分位数等统计量"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="箱线图可视化",
|
||||
description="使用箱线图识别异常值",
|
||||
analysis_type="visualize",
|
||||
prompt="为每个数值列绘制箱线图,直观展示异常值分布"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="Z-Score异常检测",
|
||||
description="使用Z-Score方法检测异常值",
|
||||
analysis_type="calculate",
|
||||
prompt="计算每个数值的Z-Score,标记|Z|>3的异常值"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="IQR异常检测",
|
||||
description="使用四分位距方法检测异常值",
|
||||
analysis_type="calculate",
|
||||
prompt="使用IQR方法(Q1-1.5*IQR, Q3+1.5*IQR)检测异常值"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="异常值汇总报告",
|
||||
description="整理所有检测到的异常值",
|
||||
analysis_type="report",
|
||||
prompt="汇总所有异常值,分析其特征和可能原因,提供处理建议"
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
class ComparisonAnalysisTemplate(AnalysisTemplate):
|
||||
"""对比分析模板"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__(
|
||||
name="分组对比分析",
|
||||
description="对比不同分组之间的差异和特征"
|
||||
)
|
||||
|
||||
def build_steps(self, group_column: str = "分组", value_column: str = "数值", **kwargs) -> List[AnalysisStep]:
|
||||
return [
|
||||
AnalysisStep(
|
||||
name="分组统计",
|
||||
description="计算各组的统计指标",
|
||||
analysis_type="calculate",
|
||||
prompt=f"按 '{group_column}' 分组,计算 '{value_column}' 的均值、中位数、标准差"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="分组可视化对比",
|
||||
description="绘制对比图表",
|
||||
analysis_type="visualize",
|
||||
prompt=f"绘制各组的柱状图和箱线图,直观对比差异"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="差异显著性检验",
|
||||
description="统计检验组间差异",
|
||||
analysis_type="calculate",
|
||||
prompt="进行t检验或方差分析,判断组间差异是否显著"
|
||||
),
|
||||
AnalysisStep(
|
||||
name="对比结论",
|
||||
description="总结对比结果",
|
||||
analysis_type="report",
|
||||
prompt="总结各组特征、主要差异和业务洞察"
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
# 模板注册表
|
||||
TEMPLATE_REGISTRY = {
|
||||
"health_report": HealthReportTemplate,
|
||||
"trend_analysis": TrendAnalysisTemplate,
|
||||
"anomaly_detection": AnomalyDetectionTemplate,
|
||||
"comparison": ComparisonAnalysisTemplate
|
||||
}
|
||||
|
||||
|
||||
def get_template(template_name: str) -> AnalysisTemplate:
|
||||
"""获取分析模板"""
|
||||
template_class = TEMPLATE_REGISTRY.get(template_name)
|
||||
if template_class:
|
||||
return template_class()
|
||||
else:
|
||||
raise ValueError(f"未找到模板: {template_name}。可用模板: {list(TEMPLATE_REGISTRY.keys())}")
|
||||
|
||||
|
||||
def list_templates() -> List[Dict[str, str]]:
|
||||
"""列出所有可用模板"""
|
||||
templates = []
|
||||
for name, template_class in TEMPLATE_REGISTRY.items():
|
||||
template = template_class()
|
||||
templates.append({
|
||||
"name": name,
|
||||
"display_name": template.name,
|
||||
"description": template.description
|
||||
})
|
||||
return templates
|
||||
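
上面模板注册表的一个最小使用示意(假设从 utils.analysis_templates 导入,输出内容取决于各模板的步骤定义):

```python
from utils.analysis_templates import get_template, list_templates

# 列出所有已注册模板
for t in list_templates():
    print(f"{t['name']}: {t['display_name']} - {t['description']}")

# 取健康度报告模板,生成完整的分析提示词
template = get_template("health_report")
prompt = template.get_full_prompt()
print(prompt[:300])

# 未注册的名字会抛出 ValueError
try:
    get_template("not_exist")
except ValueError as e:
    print(e)
```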
103
utils/cache_manager.py
Normal file
103
utils/cache_manager.py
Normal file
@@ -0,0 +1,103 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
缓存管理器 - 支持数据和LLM响应缓存
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import hashlib
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Callable
|
||||
from functools import wraps
|
||||
|
||||
|
||||
class CacheManager:
|
||||
"""缓存管理器"""
|
||||
|
||||
def __init__(self, cache_dir: str = ".cache", enabled: bool = True):
|
||||
self.cache_dir = Path(cache_dir)
|
||||
self.enabled = enabled
|
||||
|
||||
if self.enabled:
|
||||
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def _get_cache_key(self, *args, **kwargs) -> str:
|
||||
"""生成缓存键"""
|
||||
key_data = f"{args}_{kwargs}"
|
||||
return hashlib.md5(key_data.encode()).hexdigest()
|
||||
|
||||
def _get_cache_path(self, key: str) -> Path:
|
||||
"""获取缓存文件路径"""
|
||||
return self.cache_dir / f"{key}.pkl"
|
||||
|
||||
def get(self, key: str) -> Optional[Any]:
|
||||
"""获取缓存"""
|
||||
if not self.enabled:
|
||||
return None
|
||||
|
||||
cache_path = self._get_cache_path(key)
|
||||
if cache_path.exists():
|
||||
try:
|
||||
with open(cache_path, 'rb') as f:
|
||||
return pickle.load(f)
|
||||
except Exception as e:
|
||||
print(f"[WARN] 读取缓存失败: {e}")
|
||||
return None
|
||||
return None
|
||||
|
||||
def set(self, key: str, value: Any) -> None:
|
||||
"""设置缓存"""
|
||||
if not self.enabled:
|
||||
return
|
||||
|
||||
cache_path = self._get_cache_path(key)
|
||||
try:
|
||||
with open(cache_path, 'wb') as f:
|
||||
pickle.dump(value, f)
|
||||
except Exception as e:
|
||||
print(f"[WARN] 写入缓存失败: {e}")
|
||||
|
||||
def clear(self) -> None:
|
||||
"""清空所有缓存"""
|
||||
if self.cache_dir.exists():
|
||||
for cache_file in self.cache_dir.glob("*.pkl"):
|
||||
cache_file.unlink()
|
||||
print("[OK] 缓存已清空")
|
||||
|
||||
def cached(self, key_func: Optional[Callable] = None):
|
||||
"""缓存装饰器"""
|
||||
def decorator(func):
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
if not self.enabled:
|
||||
return func(*args, **kwargs)
|
||||
|
||||
# 生成缓存键
|
||||
if key_func:
|
||||
cache_key = key_func(*args, **kwargs)
|
||||
else:
|
||||
cache_key = self._get_cache_key(*args, **kwargs)
|
||||
|
||||
# 尝试从缓存获取
|
||||
cached_value = self.get(cache_key)
|
||||
if cached_value is not None:
|
||||
print(f"[CACHE] 使用缓存: {cache_key[:8]}...")
|
||||
return cached_value
|
||||
|
||||
# 执行函数并缓存结果
|
||||
result = func(*args, **kwargs)
|
||||
self.set(cache_key, result)
|
||||
return result
|
||||
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
|
||||
class LLMCacheManager(CacheManager):
|
||||
"""LLM响应缓存管理器"""
|
||||
|
||||
def get_cache_key_from_messages(self, messages: list, model: str = "") -> str:
|
||||
"""从消息列表生成缓存键"""
|
||||
key_data = json.dumps(messages, sort_keys=True) + model
|
||||
return hashlib.md5(key_data.encode()).hexdigest()
|
||||
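
上面缓存管理器的一个最小使用示意(演示 cached 装饰器与手动 get/set,目录与函数均为示例):

```python
from utils.cache_manager import CacheManager, LLMCacheManager

cache = CacheManager(cache_dir=".cache", enabled=True)

@cache.cached()
def slow_square(x):
    print("实际执行计算...")
    return x * x

slow_square(12)  # 第一次:执行并写入缓存
slow_square(12)  # 第二次:直接命中缓存,打印 [CACHE] 提示

# LLM 响应缓存:用消息列表 + 模型名生成缓存键
llm_cache = LLMCacheManager(cache_dir=".cache/llm")
key = llm_cache.get_cache_key_from_messages(
    [{"role": "user", "content": "你好"}], model="your-model-name"
)
if llm_cache.get(key) is None:
    llm_cache.set(key, "(此处存放 LLM 的响应文本)")
```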
@@ -26,7 +26,9 @@ class CodeExecutor:
|
||||
"pandas",
|
||||
"pd",
|
||||
"numpy",
|
||||
"glob",
|
||||
"np",
|
||||
"subprocess",
|
||||
"matplotlib",
|
||||
"matplotlib.pyplot",
|
||||
"plt",
|
||||
@@ -35,6 +37,15 @@ class CodeExecutor:
|
||||
"duckdb",
|
||||
"scipy",
|
||||
"sklearn",
|
||||
"sklearn.feature_extraction.text",
|
||||
"sklearn.preprocessing",
|
||||
"sklearn.model_selection",
|
||||
"sklearn.metrics",
|
||||
"sklearn.ensemble",
|
||||
"sklearn.linear_model",
|
||||
"sklearn.cluster",
|
||||
"sklearn.decomposition",
|
||||
"sklearn.manifold",
|
||||
"statsmodels",
|
||||
"plotly",
|
||||
"dash",
|
||||
@@ -203,6 +214,7 @@ import matplotlib.pyplot as plt
|
||||
import duckdb
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
from IPython.display import display
|
||||
"""
|
||||
try:
|
||||
@@ -229,12 +241,16 @@ from IPython.display import display
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, ast.Import):
|
||||
for alias in node.names:
|
||||
if alias.name not in self.ALLOWED_IMPORTS:
|
||||
# 获取根包名 (e.g. sklearn.preprocessing -> sklearn)
|
||||
root_package = alias.name.split('.')[0]
|
||||
if root_package not in self.ALLOWED_IMPORTS and alias.name not in self.ALLOWED_IMPORTS:
|
||||
return False, f"不允许的导入: {alias.name}"
|
||||
|
||||
elif isinstance(node, ast.ImportFrom):
|
||||
if node.module not in self.ALLOWED_IMPORTS:
|
||||
return False, f"不允许的导入: {node.module}"
|
||||
if node.module:
|
||||
root_package = node.module.split('.')[0]
|
||||
if root_package not in self.ALLOWED_IMPORTS and node.module not in self.ALLOWED_IMPORTS:
|
||||
return False, f"不允许的导入: {node.module}"
|
||||
|
||||
# 检查属性访问(防止通过os.system等方式绕过)
|
||||
elif isinstance(node, ast.Attribute):
|
||||
@@ -380,6 +396,33 @@ from IPython.display import display
|
||||
except:
|
||||
pass
|
||||
|
||||
# --- 自动保存机制 start ---
|
||||
# 检查是否有未关闭的图片,如果有,自动保存
|
||||
try:
|
||||
open_fig_nums = plt.get_fignums()
|
||||
if open_fig_nums:
|
||||
for fig_num in open_fig_nums:
|
||||
fig = plt.figure(fig_num)
|
||||
# 生成自动保存的文件名
|
||||
auto_filename = f"autosave_fig_{self.image_counter}_{fig_num}.png"
|
||||
auto_filepath = os.path.join(self.output_dir, auto_filename)
|
||||
|
||||
try:
|
||||
# 尝试保存
|
||||
fig.savefig(auto_filepath, bbox_inches='tight')
|
||||
print(f"[CACHE] [Auto-Save] 检测到未闭合图表,已安全保存至: {auto_filepath}")
|
||||
|
||||
# 添加到输出中,告知Agent
|
||||
output += f"\n[Auto-Save] [WARN] 检测到Figure {fig_num}未关闭,系统已自动保存为: {auto_filename}"
|
||||
self.image_counter += 1
|
||||
except Exception as e:
|
||||
print(f"[WARN] [Auto-Save] 保存失败: {e}")
|
||||
finally:
|
||||
plt.close(fig_num)
|
||||
except Exception as e:
|
||||
print(f"[WARN] [Auto-Save Global] 异常: {e}")
|
||||
# --- 自动保存机制 end ---
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"output": output,
|
||||
|
||||
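
上述 CodeExecutor 自动保存逻辑的独立示意(脱离执行器环境,演示 plt.get_fignums 的兜底用法,输出目录为假设):

```python
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import os

output_dir = "output"  # 假设的输出目录
os.makedirs(output_dir, exist_ok=True)

plt.figure()
plt.plot([1, 2, 3])
# 此处故意不调用 savefig/close,模拟 Agent 忘记保存图表的情况

for fig_num in plt.get_fignums():
    fig = plt.figure(fig_num)
    path = os.path.join(output_dir, f"autosave_fig_{fig_num}.png")
    fig.savefig(path, bbox_inches="tight")
    plt.close(fig_num)
    print(f"[Auto-Save] 已保存: {path}")
```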
@@ -2,6 +2,17 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
import io
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from typing import Optional, Iterator
|
||||
from config.app_config import app_config
|
||||
from utils.cache_manager import CacheManager
|
||||
|
||||
# 初始化缓存管理器
|
||||
data_cache = CacheManager(
|
||||
cache_dir=app_config.cache_dir,
|
||||
enabled=app_config.data_cache_enabled
|
||||
)
|
||||
|
||||
def load_and_profile_data(file_paths: list) -> str:
|
||||
"""
|
||||
@@ -23,7 +34,7 @@ def load_and_profile_data(file_paths: list) -> str:
|
||||
profile_summary += f"## 文件: {file_name}\n\n"
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
profile_summary += f"⚠️ 文件不存在: {file_path}\n\n"
|
||||
profile_summary += f"[WARN] 文件不存在: {file_path}\n\n"
|
||||
continue
|
||||
|
||||
try:
|
||||
@@ -41,7 +52,7 @@ def load_and_profile_data(file_paths: list) -> str:
|
||||
elif ext in ['.xlsx', '.xls']:
|
||||
df = pd.read_excel(file_path)
|
||||
else:
|
||||
profile_summary += f"⚠️ 不支持的文件格式: {ext}\n\n"
|
||||
profile_summary += f"[WARN] 不支持的文件格式: {ext}\n\n"
|
||||
continue
|
||||
|
||||
# 基础信息
|
||||
@@ -59,7 +70,7 @@ def load_and_profile_data(file_paths: list) -> str:
|
||||
|
||||
profile_summary += f"#### {col} ({dtype})\n"
|
||||
if null_count > 0:
|
||||
profile_summary += f"- ⚠️ 空值: {null_count} ({null_ratio:.1f}%)\n"
|
||||
profile_summary += f"- [WARN] 空值: {null_count} ({null_ratio:.1f}%)\n"
|
||||
|
||||
# 数值列分析
|
||||
if pd.api.types.is_numeric_dtype(dtype):
|
||||
@@ -85,6 +96,122 @@ def load_and_profile_data(file_paths: list) -> str:
|
||||
profile_summary += "\n"
|
||||
|
||||
except Exception as e:
|
||||
profile_summary += f"❌ 读取或分析文件失败: {str(e)}\n\n"
|
||||
profile_summary += f"[ERROR] 读取或分析文件失败: {str(e)}\n\n"
|
||||
|
||||
return profile_summary
|
||||
|
||||
|
||||
def get_file_hash(file_path: str) -> str:
|
||||
"""计算文件哈希值,用于缓存键"""
|
||||
hasher = hashlib.md5()
|
||||
hasher.update(file_path.encode())
|
||||
|
||||
# 添加文件修改时间
|
||||
if os.path.exists(file_path):
|
||||
mtime = os.path.getmtime(file_path)
|
||||
hasher.update(str(mtime).encode())
|
||||
|
||||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def load_data_chunked(file_path: str, chunksize: Optional[int] = None) -> Iterator[pd.DataFrame]:
|
||||
"""
|
||||
流式读取大文件,分块返回DataFrame
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
chunksize: 每块行数,默认使用配置值
|
||||
|
||||
Yields:
|
||||
DataFrame块
|
||||
"""
|
||||
if chunksize is None:
|
||||
chunksize = app_config.chunk_size
|
||||
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
|
||||
if ext == '.csv':
|
||||
# 尝试多种编码
|
||||
for encoding in ['utf-8', 'gbk', 'latin1']:
|
||||
try:
|
||||
chunks = pd.read_csv(file_path, encoding=encoding, chunksize=chunksize)
|
||||
for chunk in chunks:
|
||||
yield chunk
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 读取CSV文件失败: {e}")
|
||||
break
|
||||
elif ext in ['.xlsx', '.xls']:
|
||||
# Excel文件不支持chunksize,直接读取
|
||||
try:
|
||||
df = pd.read_excel(file_path)
|
||||
# 手动分块
|
||||
for i in range(0, len(df), chunksize):
|
||||
yield df.iloc[i:i+chunksize]
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 读取Excel文件失败: {e}")
|
||||
|
||||
|
||||
def load_data_with_cache(file_path: str, force_reload: bool = False) -> Optional[pd.DataFrame]:
|
||||
"""
|
||||
带缓存的数据加载
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
force_reload: 是否强制重新加载
|
||||
|
||||
Returns:
|
||||
DataFrame或None
|
||||
"""
|
||||
if not os.path.exists(file_path):
|
||||
print(f"[WARN] 文件不存在: {file_path}")
|
||||
return None
|
||||
|
||||
# 检查文件大小
|
||||
file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
|
||||
|
||||
# 对于大文件,建议使用流式处理
|
||||
if file_size_mb > app_config.max_file_size_mb:
|
||||
print(f"[WARN] 文件过大 ({file_size_mb:.1f}MB),建议使用 load_data_chunked() 流式处理")
|
||||
|
||||
# 生成缓存键
|
||||
cache_key = get_file_hash(file_path)
|
||||
|
||||
# 尝试从缓存加载
|
||||
if not force_reload and app_config.data_cache_enabled:
|
||||
cached_data = data_cache.get(cache_key)
|
||||
if cached_data is not None:
|
||||
print(f"[CACHE] 从缓存加载数据: {os.path.basename(file_path)}")
|
||||
return cached_data
|
||||
|
||||
# 加载数据
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
df = None
|
||||
|
||||
try:
|
||||
if ext == '.csv':
|
||||
# 尝试多种编码
|
||||
for encoding in ['utf-8', 'gbk', 'latin1']:
|
||||
try:
|
||||
df = pd.read_csv(file_path, encoding=encoding)
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
elif ext in ['.xlsx', '.xls']:
|
||||
df = pd.read_excel(file_path)
|
||||
else:
|
||||
print(f"[WARN] 不支持的文件格式: {ext}")
|
||||
return None
|
||||
|
||||
# 缓存数据
|
||||
if df is not None and app_config.data_cache_enabled:
|
||||
data_cache.set(cache_key, df)
|
||||
print(f"[OK] 数据已缓存: {os.path.basename(file_path)}")
|
||||
|
||||
return df
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 加载数据失败: {e}")
|
||||
return None
|
||||
|
||||
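
上面 data_loader 新增接口的一个最小使用示意(文件路径为假设,仅演示缓存加载与分块读取两种入口):

```python
from utils.data_loader import load_data_with_cache, load_data_chunked

csv_path = "raw_data/工单明细.csv"  # 假设的数据文件

# 常规文件:带缓存加载,第二次调用直接命中缓存
df = load_data_with_cache(csv_path)
if df is not None:
    print(df.shape, list(df.columns)[:5])

# 超大文件:流式分块,逐块累计统计,避免一次性载入内存
total_rows = 0
for chunk in load_data_chunked(csv_path, chunksize=50_000):
    total_rows += len(chunk)
print(f"分块读取总行数: {total_rows}")
```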
225
utils/data_privacy.py
Normal file
225
utils/data_privacy.py
Normal file
@@ -0,0 +1,225 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
数据隐私保护层
|
||||
|
||||
核心原则:发给外部 LLM 的信息只包含 schema 级别的元数据,
|
||||
绝不包含真实数据值。所有真实数据仅在本地代码执行环境中使用。
|
||||
|
||||
分级策略:
|
||||
- SAFE(安全级): 可发送给 LLM — 列名、数据类型、行列数、空值率、唯一值数量
|
||||
- LOCAL(本地级): 仅本地使用 — 真实数据值、TOP N 高频值、统计数值、样本行
|
||||
"""
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
def build_safe_profile(file_paths: list) -> str:
|
||||
"""
|
||||
生成可安全发送给外部 LLM 的数据画像。
|
||||
只包含 schema 信息,不包含任何真实数据值。
|
||||
|
||||
Args:
|
||||
file_paths: 数据文件路径列表
|
||||
|
||||
Returns:
|
||||
安全的 Markdown 格式数据画像
|
||||
"""
|
||||
import os
|
||||
|
||||
profile = "# 数据结构概览 (Schema Profile)\n\n"
|
||||
|
||||
if not file_paths:
|
||||
return profile + "未提供数据文件。"
|
||||
|
||||
for file_path in file_paths:
|
||||
file_name = os.path.basename(file_path)
|
||||
profile += f"## 文件: {file_name}\n\n"
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
profile += f"[WARN] 文件不存在: {file_path}\n\n"
|
||||
continue
|
||||
|
||||
try:
|
||||
df = _load_dataframe(file_path)
|
||||
if df is None:
|
||||
continue
|
||||
|
||||
rows, cols = df.shape
|
||||
profile += f"- **维度**: {rows} 行 x {cols} 列\n"
|
||||
profile += f"- **列名**: `{', '.join(df.columns)}`\n\n"
|
||||
profile += "### 列结构:\n\n"
|
||||
profile += "| 列名 | 数据类型 | 空值率 | 唯一值数 | 特征描述 |\n"
|
||||
profile += "|------|---------|--------|---------|----------|\n"
|
||||
|
||||
for col in df.columns:
|
||||
dtype = str(df[col].dtype)
|
||||
null_count = df[col].isnull().sum()
|
||||
null_pct = f"{(null_count / rows) * 100:.1f}%" if rows > 0 else "0%"
|
||||
unique_count = df[col].nunique()
|
||||
|
||||
# 特征描述:只描述数据特征,不暴露具体值
|
||||
feature_desc = _describe_column_safe(df[col], unique_count, rows)
|
||||
|
||||
profile += f"| {col} | {dtype} | {null_pct} | {unique_count} | {feature_desc} |\n"
|
||||
|
||||
profile += "\n"
|
||||
|
||||
except Exception as e:
|
||||
profile += f"[ERROR] 读取文件失败: {str(e)}\n\n"
|
||||
|
||||
return profile
|
||||
|
||||
|
||||
def build_local_profile(file_paths: list) -> str:
|
||||
"""
|
||||
生成完整的本地数据画像(包含真实数据值)。
|
||||
仅用于本地代码执行环境,不发送给 LLM。
|
||||
|
||||
这是原来 load_and_profile_data 的功能,保留完整信息。
|
||||
"""
|
||||
from utils.data_loader import load_and_profile_data
|
||||
return load_and_profile_data(file_paths)
|
||||
|
||||
|
||||
def sanitize_execution_feedback(feedback: str, max_lines: int = 30) -> str:
|
||||
"""
|
||||
对代码执行反馈进行脱敏处理,移除可能包含真实数据的内容。
|
||||
|
||||
保留:
|
||||
- 执行状态(成功/失败)
|
||||
- 错误信息
|
||||
- DataFrame 的 shape 信息
|
||||
- 图片保存路径
|
||||
- 列名信息
|
||||
|
||||
移除/截断:
|
||||
- 具体的数据行(DataFrame 输出)
|
||||
- 大段的数值输出
|
||||
|
||||
Args:
|
||||
feedback: 原始执行反馈
|
||||
max_lines: 最大保留行数
|
||||
|
||||
Returns:
|
||||
脱敏后的反馈
|
||||
"""
|
||||
if not feedback:
|
||||
return feedback
|
||||
|
||||
lines = feedback.split("\n")
|
||||
safe_lines = []
|
||||
in_dataframe_output = False
|
||||
df_line_count = 0
|
||||
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
|
||||
# 始终保留的关键信息
|
||||
if any(kw in stripped for kw in [
|
||||
"图片已保存", "保存至", "[OK]", "[WARN]", "[ERROR]",
|
||||
"[Auto-Save]", "数据表形状", "列名:", ".png",
|
||||
"shape", "columns", "dtype", "info()", "describe()",
|
||||
]):
|
||||
safe_lines.append(line)
|
||||
in_dataframe_output = False
|
||||
continue
|
||||
|
||||
# 检测 DataFrame 输出的开始(通常有列头行)
|
||||
if _looks_like_dataframe_row(stripped):
|
||||
if not in_dataframe_output:
|
||||
in_dataframe_output = True
|
||||
df_line_count = 0
|
||||
safe_lines.append("[数据输出已省略 - 数据仅在本地执行环境中可见]")
|
||||
df_line_count += 1
|
||||
continue
|
||||
|
||||
# 检测纯数值行
|
||||
if _is_numeric_heavy_line(stripped):
|
||||
if not in_dataframe_output:
|
||||
in_dataframe_output = True
|
||||
safe_lines.append("[数值输出已省略]")
|
||||
continue
|
||||
|
||||
# 普通文本行
|
||||
in_dataframe_output = False
|
||||
safe_lines.append(line)
|
||||
|
||||
# 限制总行数
|
||||
if len(safe_lines) > max_lines:
|
||||
safe_lines = safe_lines[:max_lines]
|
||||
safe_lines.append(f"[... 输出已截断,共 {len(lines)} 行]")
|
||||
|
||||
return "\n".join(safe_lines)
|
||||
|
||||
|
||||
def _load_dataframe(file_path: str):
|
||||
"""加载 DataFrame,支持多种格式和编码"""
|
||||
import os
|
||||
|
||||
ext = os.path.splitext(file_path)[1].lower()
|
||||
if ext == ".csv":
|
||||
for encoding in ["utf-8", "gbk", "gb18030", "latin1"]:
|
||||
try:
|
||||
return pd.read_csv(file_path, encoding=encoding)
|
||||
except (UnicodeDecodeError, Exception):
|
||||
continue
|
||||
elif ext in [".xlsx", ".xls"]:
|
||||
try:
|
||||
return pd.read_excel(file_path)
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _describe_column_safe(series: pd.Series, unique_count: int, total_rows: int) -> str:
|
||||
"""安全地描述列特征,不暴露具体值"""
|
||||
dtype = series.dtype
|
||||
|
||||
if pd.api.types.is_numeric_dtype(dtype):
|
||||
if unique_count <= 5:
|
||||
return "低基数数值(可能是分类编码)"
|
||||
elif unique_count < total_rows * 0.05:
|
||||
return "离散数值"
|
||||
else:
|
||||
return "连续数值"
|
||||
|
||||
if pd.api.types.is_datetime64_any_dtype(dtype):
|
||||
return "时间序列"
|
||||
|
||||
# 文本/分类列
|
||||
if unique_count == 1:
|
||||
return "单一值(常量列)"
|
||||
elif unique_count <= 10:
|
||||
return f"低基数分类({unique_count}类)"
|
||||
elif unique_count <= 50:
|
||||
return f"中基数分类({unique_count}类)"
|
||||
elif unique_count > total_rows * 0.8:
|
||||
return "高基数文本(可能是ID或描述)"
|
||||
else:
|
||||
return f"文本分类({unique_count}类)"
|
||||
|
||||
|
||||
def _looks_like_dataframe_row(line: str) -> bool:
|
||||
"""判断一行是否看起来像 DataFrame 输出"""
|
||||
if not line:
|
||||
return False
|
||||
# DataFrame 输出通常有多个空格分隔的列
|
||||
parts = line.split()
|
||||
if len(parts) >= 3:
|
||||
# 第一个元素是索引(数字)
|
||||
try:
|
||||
int(parts[0])
|
||||
return True
|
||||
except ValueError:
|
||||
pass
|
||||
return False
|
||||
|
||||
|
||||
def _is_numeric_heavy_line(line: str) -> bool:
|
||||
"""判断一行是否主要由数值组成"""
|
||||
if not line or len(line) < 5:
|
||||
return False
|
||||
digits_and_dots = sum(1 for c in line if c.isdigit() or c in ".,-+eE ")
|
||||
return digits_and_dots / len(line) > 0.7
|
||||
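
上面隐私保护层的一个最小使用示意(文件路径与反馈文本为编造示例,演示“schema 画像给 LLM、真实数据留在本地”的调用方式):

```python
from utils.data_privacy import build_safe_profile, sanitize_execution_feedback

# 仅含 schema 的画像,可放入发给外部 LLM 的提示词
safe_profile = build_safe_profile(["raw_data/工单明细.csv"])
print(safe_profile)

# 代码执行反馈在进入对话历史前先脱敏
raw_feedback = (
    "图片已保存至: /abs/path/车型分布.png\n"
    "0  A车型  123\n"
    "1  B车型   98\n"
)
print(sanitize_execution_feedback(raw_feedback, max_lines=30))
```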
224
utils/data_quality.py
Normal file
224
utils/data_quality.py
Normal file
@@ -0,0 +1,224 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
数据质量检查模块 - 自动评估数据质量并提供改进建议
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import Dict, List, Tuple, Any
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class QualityIssue:
|
||||
"""数据质量问题"""
|
||||
column: str
|
||||
issue_type: str # missing, duplicate, outlier, type_mismatch等
|
||||
severity: str # high, medium, low
|
||||
description: str
|
||||
suggestion: str
|
||||
|
||||
|
||||
class DataQualityChecker:
|
||||
"""数据质量检查器"""
|
||||
|
||||
def __init__(self, df: pd.DataFrame):
|
||||
self.df = df
|
||||
self.issues: List[QualityIssue] = []
|
||||
self.quality_score: float = 100.0
|
||||
|
||||
def check_all(self) -> Dict[str, Any]:
|
||||
"""执行所有质量检查"""
|
||||
self.check_missing_values()
|
||||
self.check_duplicates()
|
||||
self.check_data_types()
|
||||
self.check_outliers()
|
||||
self.check_consistency()
|
||||
|
||||
return self.generate_report()
|
||||
|
||||
def check_missing_values(self) -> None:
|
||||
"""检查缺失值"""
|
||||
for col in self.df.columns:
|
||||
missing_count = self.df[col].isnull().sum()
|
||||
missing_ratio = (missing_count / len(self.df)) * 100
|
||||
|
||||
if missing_ratio > 50:
|
||||
severity = "high"
|
||||
self.quality_score -= 10
|
||||
elif missing_ratio > 20:
|
||||
severity = "medium"
|
||||
self.quality_score -= 5
|
||||
elif missing_ratio > 0:
|
||||
severity = "low"
|
||||
self.quality_score -= 2
|
||||
else:
|
||||
continue
|
||||
|
||||
issue = QualityIssue(
|
||||
column=col,
|
||||
issue_type="missing",
|
||||
severity=severity,
|
||||
description=f"列 '{col}' 存在 {missing_count} 个缺失值 ({missing_ratio:.1f}%)",
|
||||
suggestion=self._suggest_missing_handling(col, missing_ratio)
|
||||
)
|
||||
self.issues.append(issue)
|
||||
|
||||
def check_duplicates(self) -> None:
|
||||
"""检查重复数据"""
|
||||
duplicate_count = self.df.duplicated().sum()
|
||||
if duplicate_count > 0:
|
||||
duplicate_ratio = (duplicate_count / len(self.df)) * 100
|
||||
|
||||
severity = "high" if duplicate_ratio > 10 else "medium"
|
||||
self.quality_score -= 5 if severity == "high" else 3
|
||||
|
||||
issue = QualityIssue(
|
||||
column="全表",
|
||||
issue_type="duplicate",
|
||||
severity=severity,
|
||||
description=f"发现 {duplicate_count} 行重复数据 ({duplicate_ratio:.1f}%)",
|
||||
suggestion="建议使用 df.drop_duplicates() 删除重复行,或检查是否为合理的重复记录"
|
||||
)
|
||||
self.issues.append(issue)
|
||||
|
||||
def check_data_types(self) -> None:
|
||||
"""检查数据类型一致性"""
|
||||
for col in self.df.columns:
|
||||
# 检查是否有数值列被识别为object
|
||||
if self.df[col].dtype == 'object':
|
||||
try:
|
||||
# 尝试转换为数值
|
||||
pd.to_numeric(self.df[col].dropna(), errors='raise')
|
||||
|
||||
issue = QualityIssue(
|
||||
column=col,
|
||||
issue_type="type_mismatch",
|
||||
severity="medium",
|
||||
description=f"列 '{col}' 当前为文本类型,但可以转换为数值类型",
|
||||
suggestion=f"建议使用 df['{col}'] = pd.to_numeric(df['{col}']) 转换类型"
|
||||
)
|
||||
self.issues.append(issue)
|
||||
self.quality_score -= 3
|
||||
except:
|
||||
pass
|
||||
|
||||
def check_outliers(self) -> None:
|
||||
"""检查数值列的异常值"""
|
||||
numeric_cols = self.df.select_dtypes(include=[np.number]).columns
|
||||
|
||||
for col in numeric_cols:
|
||||
q1 = self.df[col].quantile(0.25)
|
||||
q3 = self.df[col].quantile(0.75)
|
||||
iqr = q3 - q1
|
||||
|
||||
lower_bound = q1 - 3 * iqr
|
||||
upper_bound = q3 + 3 * iqr
|
||||
|
||||
outliers = self.df[(self.df[col] < lower_bound) | (self.df[col] > upper_bound)]
|
||||
outlier_count = len(outliers)
|
||||
|
||||
if outlier_count > 0:
|
||||
outlier_ratio = (outlier_count / len(self.df)) * 100
|
||||
|
||||
if outlier_ratio > 5:
|
||||
severity = "medium"
|
||||
self.quality_score -= 3
|
||||
else:
|
||||
severity = "low"
|
||||
self.quality_score -= 1
|
||||
|
||||
issue = QualityIssue(
|
||||
column=col,
|
||||
issue_type="outlier",
|
||||
severity=severity,
|
||||
description=f"列 '{col}' 存在 {outlier_count} 个异常值 ({outlier_ratio:.1f}%)",
|
||||
suggestion=f"建议检查 {lower_bound:.2f} 以下和 {upper_bound:.2f} 以上的值是否合理"
|
||||
)
|
||||
self.issues.append(issue)
|
||||
|
||||
def check_consistency(self) -> None:
|
||||
"""检查数据一致性"""
|
||||
# 检查时间列的时序性
|
||||
datetime_cols = self.df.select_dtypes(include=['datetime64']).columns
|
||||
|
||||
for col in datetime_cols:
|
||||
if not self.df[col].is_monotonic_increasing:
|
||||
issue = QualityIssue(
|
||||
column=col,
|
||||
issue_type="consistency",
|
||||
severity="medium",
|
||||
description=f"时间列 '{col}' 不是单调递增的,可能存在乱序",
|
||||
suggestion=f"建议使用 df.sort_values('{col}') 进行排序"
|
||||
)
|
||||
self.issues.append(issue)
|
||||
self.quality_score -= 3
|
||||
|
||||
def _suggest_missing_handling(self, col: str, missing_ratio: float) -> str:
|
||||
"""建议缺失值处理方法"""
|
||||
if missing_ratio > 70:
|
||||
return f"缺失比例过高,建议删除列 '{col}'"
|
||||
elif missing_ratio > 30:
|
||||
return f"建议填充或删除缺失值:使用中位数/众数填充或删除含缺失值的行"
|
||||
else:
|
||||
if pd.api.types.is_numeric_dtype(self.df[col]):
|
||||
return f"建议使用均值/中位数填充:df['{col}'].fillna(df['{col}'].median())"
|
||||
else:
|
||||
return f"建议使用众数填充:df['{col}'].fillna(df['{col}'].mode()[0])"
|
||||
|
||||
def generate_report(self) -> Dict[str, Any]:
|
||||
"""生成质量报告"""
|
||||
# 确保质量分数在0-100之间
|
||||
self.quality_score = max(0, min(100, self.quality_score))
|
||||
|
||||
# 按严重程度分类
|
||||
high_issues = [i for i in self.issues if i.severity == "high"]
|
||||
medium_issues = [i for i in self.issues if i.severity == "medium"]
|
||||
low_issues = [i for i in self.issues if i.severity == "low"]
|
||||
|
||||
return {
|
||||
"quality_score": round(self.quality_score, 2),
|
||||
"total_issues": len(self.issues),
|
||||
"high_severity": len(high_issues),
|
||||
"medium_severity": len(medium_issues),
|
||||
"low_severity": len(low_issues),
|
||||
"issues": self.issues,
|
||||
"summary": self._generate_summary()
|
||||
}
|
||||
|
||||
def _generate_summary(self) -> str:
|
||||
"""生成可读的摘要"""
|
||||
summary = f"## 数据质量报告\n\n"
|
||||
summary += f"**质量评分**: {self.quality_score:.1f}/100\n\n"
|
||||
|
||||
if self.quality_score >= 90:
|
||||
summary += "[OK] **评级**: 优秀 - 数据质量很好\n\n"
|
||||
elif self.quality_score >= 75:
|
||||
summary += "[WARN] **评级**: 良好 - 存在一些小问题\n\n"
|
||||
elif self.quality_score >= 60:
|
||||
summary += "[WARN] **评级**: 一般 - 需要处理多个问题\n\n"
|
||||
else:
|
||||
summary += "[ERROR] **评级**: 差 - 数据质量问题严重\n\n"
|
||||
|
||||
summary += f"**问题统计**: 共 {len(self.issues)} 个质量问题\n"
|
||||
summary += f"- [RED] 高严重性: {len([i for i in self.issues if i.severity == 'high'])} 个\n"
|
||||
summary += f"- [YELLOW] 中严重性: {len([i for i in self.issues if i.severity == 'medium'])} 个\n"
|
||||
summary += f"- [GREEN] 低严重性: {len([i for i in self.issues if i.severity == 'low'])} 个\n\n"
|
||||
|
||||
if self.issues:
|
||||
summary += "### 主要问题:\n\n"
|
||||
# 只显示高和中严重性的问题
|
||||
for issue in self.issues:
|
||||
if issue.severity in ["high", "medium"]:
|
||||
emoji = "[RED]" if issue.severity == "high" else "[YELLOW]"
|
||||
summary += f"{emoji} **{issue.column}** - {issue.description}\n"
|
||||
summary += f" [TIP] {issue.suggestion}\n\n"
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
def quick_quality_check(df: pd.DataFrame) -> str:
|
||||
"""快速数据质量检查"""
|
||||
checker = DataQualityChecker(df)
|
||||
report = checker.check_all()
|
||||
return report['summary']
|
||||
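
上面数据质量检查模块的一个最小使用示意(DataFrame 为编造的示例数据):

```python
import pandas as pd
from utils.data_quality import DataQualityChecker, quick_quality_check

df = pd.DataFrame({
    "处理时长": [1.5, 2.0, None, 300.0],   # 含缺失值
    "工单类型": ["TSP", "APP", "TSP", "TSP"],
    "数量文本": ["1", "2", "3", "4"],       # 文本型数值,应触发 type_mismatch
})

# 一行式摘要
print(quick_quality_check(df))

# 结构化结果,便于程序化处理
report = DataQualityChecker(df).check_all()
print(report["quality_score"], report["total_issues"])
for issue in report["issues"]:
    print(issue.column, issue.issue_type, issue.severity)
```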
@@ -29,6 +29,22 @@ def extract_code_from_response(response: str) -> Optional[str]:
|
||||
end = response.find('```', start)
|
||||
if end != -1:
|
||||
return response[start:end].strip()
|
||||
|
||||
# 尝试提取 code: | 形式的代码块(针对YAML格式错误但结构清晰的情况)
|
||||
import re
|
||||
# 匹配 code: | 后面的内容,直到遇到下一个键(next_key:)或结尾
|
||||
# 假设代码块至少缩进2个空格
|
||||
pattern = r'code:\s*\|\s*\n((?: {2,}.*\n?)+)'
|
||||
match = re.search(pattern, response)
|
||||
if match:
|
||||
code_block = match.group(1)
|
||||
# 尝试去除公共缩进
|
||||
try:
|
||||
import textwrap
|
||||
return textwrap.dedent(code_block).strip()
|
||||
except:
|
||||
return code_block.strip()
|
||||
|
||||
elif '```' in response:
|
||||
start = response.find('```') + 3
|
||||
end = response.find('```', start)
|
||||
|
||||
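
上面新增的 `code: |` 正则回退提取逻辑的一个独立示意(response 文本为编造示例,演示 YAML 解析失败时如何兜底取出代码块):

```python
import re
import textwrap

response = '''action: "generate_code"
reasoning: "示例"
code: |
  import pandas as pd
  print(pd.__version__)
next_steps: ["继续分析"]
'''

pattern = r'code:\s*\|\s*\n((?: {2,}.*\n?)+)'
match = re.search(pattern, response)
if match:
    code = textwrap.dedent(match.group(1)).strip()
    print(code)
    # 输出:
    # import pandas as pd
    # print(pd.__version__)
```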
@@ -57,7 +57,7 @@ class AsyncFallbackOpenAIClient:
|
||||
self.fallback_client = AsyncOpenAI(api_key=fallback_api_key, base_url=fallback_base_url, **_fallback_args)
|
||||
self.fallback_model_name = fallback_model_name
|
||||
else:
|
||||
print("⚠️ 警告: 未完全配置备用 API 客户端。如果主 API 失败,将无法进行回退。")
|
||||
print("[WARN] 警告: 未完全配置备用 API 客户端。如果主 API 失败,将无法进行回退。")
|
||||
|
||||
self.content_filter_error_code = content_filter_error_code
|
||||
self.content_filter_error_field = content_filter_error_field
|
||||
@@ -90,35 +90,60 @@ class AsyncFallbackOpenAIClient:
|
||||
return completion
|
||||
except (APIConnectionError, APITimeoutError) as e: # 通常可以重试的网络错误
|
||||
last_exception = e
|
||||
print(f"⚠️ {api_name} API 调用时发生可重试错误 ({type(e).__name__}): {e}. 尝试次数 {attempt + 1}/{max_retries + 1}")
|
||||
print(f"[WARN] {api_name} API 调用时发生可重试错误 ({type(e).__name__}): {e}. 尝试次数 {attempt + 1}/{max_retries + 1}")
|
||||
if attempt < max_retries:
|
||||
await asyncio.sleep(self.retry_delay_seconds * (attempt + 1)) # 增加延迟
|
||||
else:
|
||||
print(f"❌ {api_name} API 在达到最大重试次数后仍然失败。")
|
||||
print(f"[ERROR] {api_name} API 在达到最大重试次数后仍然失败。")
|
||||
except APIStatusError as e: # API 返回的特定状态码错误
|
||||
is_content_filter_error = False
|
||||
if e.status_code == 400:
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
error_details = error_json.get("error", {})
|
||||
if (error_details.get("code") == self.content_filter_error_code and
|
||||
self.content_filter_error_field in error_json):
|
||||
is_content_filter_error = True
|
||||
except Exception:
|
||||
pass # 解析错误响应失败,不认为是内容过滤错误
|
||||
retry_after = None
|
||||
|
||||
# 尝试解析错误详情以获取更多信息(如 Google RPC RetryInfo)
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
error_details = error_json.get("error", {})
|
||||
|
||||
# 检查内容过滤错误(针对特定服务商)
|
||||
if (error_details.get("code") == self.content_filter_error_code and
|
||||
self.content_filter_error_field in error_json):
|
||||
is_content_filter_error = True
|
||||
|
||||
# 检查 Google RPC RetryInfo
|
||||
# 格式示例: {'error': {'details': [{'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '38s'}]}}
|
||||
if "details" in error_details:
|
||||
for detail in error_details["details"]:
|
||||
if detail.get("@type") == "type.googleapis.com/google.rpc.RetryInfo":
|
||||
delay_str = detail.get("retryDelay", "")
|
||||
if delay_str.endswith("s"):
|
||||
try:
|
||||
retry_after = float(delay_str[:-1])
|
||||
print(f"[TIMER] 收到服务器 RetryInfo,等待时间: {retry_after}秒")
|
||||
except ValueError:
|
||||
pass
|
||||
except Exception:
|
||||
pass # 解析错误响应失败,忽略
|
||||
|
||||
if is_content_filter_error and api_name == "主": # 如果是主 API 的内容过滤错误,则直接抛出以便回退
|
||||
raise e
|
||||
|
||||
last_exception = e
|
||||
print(f"⚠️ {api_name} API 调用时发生 APIStatusError ({e.status_code}): {e}. 尝试次数 {attempt + 1}/{max_retries + 1}")
|
||||
print(f"[WARN] {api_name} API 调用时发生 APIStatusError ({e.status_code}): {e}. 尝试次数 {attempt + 1}/{max_retries + 1}")
|
||||
|
||||
if attempt < max_retries:
|
||||
await asyncio.sleep(self.retry_delay_seconds * (attempt + 1))
|
||||
# 如果获取到了明确的 retry_after,则使用它;否则使用默认的指数退避
|
||||
wait_time = retry_after if retry_after is not None else (self.retry_delay_seconds * (attempt + 1))
|
||||
# 如果是 429 Too Many Requests 且没有解析出 retry_after,建议加大等待时间
|
||||
if e.status_code == 429 and retry_after is None:
|
||||
wait_time = max(wait_time, 5.0 * (attempt + 1)) # 429 默认至少等 5 秒
|
||||
|
||||
print(f"[WAIT] 将等待 {wait_time:.2f} 秒后重试...")
|
||||
await asyncio.sleep(wait_time)
|
||||
else:
|
||||
print(f"❌ {api_name} API 在达到最大重试次数后仍然失败 (APIStatusError)。")
|
||||
print(f"[ERROR] {api_name} API 在达到最大重试次数后仍然失败 (APIStatusError)。")
|
||||
except APIError as e: # 其他不可轻易重试的 OpenAI 错误
|
||||
last_exception = e
|
||||
print(f"❌ {api_name} API 调用时发生不可重试错误 ({type(e).__name__}): {e}")
|
||||
print(f"[ERROR] {api_name} API 调用时发生不可重试错误 ({type(e).__name__}): {e}")
|
||||
break # 不再重试此类错误
|
||||
|
||||
if last_exception:
|
||||
@@ -171,7 +196,7 @@ class AsyncFallbackOpenAIClient:
|
||||
pass
|
||||
|
||||
if is_content_filter_error and self.fallback_client and self.fallback_model_name:
|
||||
print(f"ℹ️ 主 API 内容过滤错误 ({e_primary.status_code})。尝试切换到备用 API ({self.fallback_client.base_url})...")
|
||||
print(f"[INFO] 主 API 内容过滤错误 ({e_primary.status_code})。尝试切换到备用 API ({self.fallback_client.base_url})...")
|
||||
try:
|
||||
fallback_completion = await self._attempt_api_call(
|
||||
client=self.fallback_client,
|
||||
@@ -181,20 +206,20 @@ class AsyncFallbackOpenAIClient:
|
||||
api_name="备用",
|
||||
**kwargs.copy()
|
||||
)
|
||||
print(f"✅ 备用 API 调用成功。")
|
||||
print(f"[OK] 备用 API 调用成功。")
|
||||
return fallback_completion
|
||||
except APIError as e_fallback:
|
||||
print(f"❌ 备用 API 调用最终失败: {type(e_fallback).__name__} - {e_fallback}")
|
||||
print(f"[ERROR] 备用 API 调用最终失败: {type(e_fallback).__name__} - {e_fallback}")
|
||||
raise e_fallback
|
||||
else:
|
||||
if not (self.fallback_client and self.fallback_model_name and is_content_filter_error):
|
||||
# 如果不是内容过滤错误,或者没有可用的备用API,则记录主API的原始错误
|
||||
print(f"ℹ️ 主 API 错误 ({type(e_primary).__name__}: {e_primary}), 且不满足备用条件或备用API未配置。")
|
||||
print(f"[INFO] 主 API 错误 ({type(e_primary).__name__}: {e_primary}), 且不满足备用条件或备用API未配置。")
|
||||
raise e_primary
|
||||
except APIError as e_primary_other:
|
||||
print(f"❌ 主 API 调用最终失败 (非内容过滤,错误类型: {type(e_primary_other).__name__}): {e_primary_other}")
|
||||
print(f"[ERROR] 主 API 调用最终失败 (非内容过滤,错误类型: {type(e_primary_other).__name__}): {e_primary_other}")
|
||||
if self.fallback_client and self.fallback_model_name:
|
||||
print(f"ℹ️ 主 API 失败,尝试切换到备用 API ({self.fallback_client.base_url})...")
|
||||
print(f"[INFO] 主 API 失败,尝试切换到备用 API ({self.fallback_client.base_url})...")
|
||||
try:
|
||||
fallback_completion = await self._attempt_api_call(
|
||||
client=self.fallback_client,
|
||||
@@ -204,10 +229,10 @@ class AsyncFallbackOpenAIClient:
|
||||
api_name="备用",
|
||||
**kwargs.copy()
|
||||
)
|
||||
print(f"✅ 备用 API 调用成功。")
|
||||
print(f"[OK] 备用 API 调用成功。")
|
||||
return fallback_completion
|
||||
except APIError as e_fallback_after_primary_fail:
|
||||
print(f"❌ 备用 API 在主 API 失败后也调用失败: {type(e_fallback_after_primary_fail).__name__} - {e_fallback_after_primary_fail}")
|
||||
print(f"[ERROR] 备用 API 在主 API 失败后也调用失败: {type(e_fallback_after_primary_fail).__name__} - {e_fallback_after_primary_fail}")
|
||||
raise e_fallback_after_primary_fail
|
||||
else:
|
||||
raise e_primary_other
|
||||
|
||||
@@ -7,17 +7,17 @@ def format_execution_result(result: Dict[str, Any]) -> str:
|
||||
feedback = []
|
||||
|
||||
if result['success']:
|
||||
feedback.append("✅ 代码执行成功")
|
||||
feedback.append("[OK] 代码执行成功")
|
||||
|
||||
if result['output']:
|
||||
feedback.append(f"📊 输出结果:\n{result['output']}")
|
||||
feedback.append(f"[CHART] 输出结果:\n{result['output']}")
|
||||
|
||||
if result.get('variables'):
|
||||
feedback.append("📋 新生成的变量:")
|
||||
feedback.append("[LIST] 新生成的变量:")
|
||||
for var_name, var_info in result['variables'].items():
|
||||
feedback.append(f" - {var_name}: {var_info}")
|
||||
else:
|
||||
feedback.append("❌ 代码执行失败")
|
||||
feedback.append("[ERROR] 代码执行失败")
|
||||
feedback.append(f"错误信息: {result['error']}")
|
||||
if result['output']:
|
||||
feedback.append(f"部分输出: {result['output']}")
|
||||
|
||||
@@ -5,8 +5,17 @@ LLM调用辅助模块
|
||||
|
||||
import asyncio
|
||||
import yaml
|
||||
from typing import Optional, Callable, AsyncIterator
|
||||
from config.llm_config import LLMConfig
|
||||
from config.app_config import app_config
|
||||
from utils.fallback_openai_client import AsyncFallbackOpenAIClient
|
||||
from utils.cache_manager import LLMCacheManager
|
||||
|
||||
# 初始化LLM缓存管理器
|
||||
llm_cache = LLMCacheManager(
|
||||
cache_dir=app_config.llm_cache_dir,
|
||||
enabled=app_config.llm_cache_enabled
|
||||
)
|
||||
|
||||
class LLMHelper:
|
||||
"""LLM调用辅助类,支持同步和异步调用"""
|
||||
@@ -75,12 +84,111 @@ class LLMHelper:
|
||||
else:
|
||||
yaml_content = response.strip()
|
||||
|
||||
return yaml.safe_load(yaml_content)
|
||||
parsed = yaml.safe_load(yaml_content)
|
||||
return parsed if parsed is not None else {}
|
||||
except Exception as e:
|
||||
print(f"YAML解析失败: {e}")
|
||||
print(f"原始响应: {response}")
|
||||
return {}
|
||||
|
||||
|
||||
async def close(self):
|
||||
"""关闭客户端"""
|
||||
await self.client.close()
|
||||
|
||||
async def async_call_with_cache(
|
||||
self,
|
||||
prompt: str,
|
||||
system_prompt: str = None,
|
||||
max_tokens: int = None,
|
||||
temperature: float = None,
|
||||
use_cache: bool = True
|
||||
) -> str:
|
||||
"""带缓存的异步LLM调用"""
|
||||
messages = []
|
||||
if system_prompt:
|
||||
messages.append({"role": "system", "content": system_prompt})
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
# 生成缓存键
|
||||
cache_key = llm_cache.get_cache_key_from_messages(messages, self.config.model)
|
||||
|
||||
# 尝试从缓存获取
|
||||
if use_cache and app_config.llm_cache_enabled:
|
||||
cached_response = llm_cache.get(cache_key)
|
||||
if cached_response:
|
||||
print("[CACHE] 使用LLM缓存响应")
|
||||
return cached_response
|
||||
|
||||
# 调用LLM
|
||||
response = await self.async_call(prompt, system_prompt, max_tokens, temperature)
|
||||
|
||||
# 缓存响应
|
||||
if use_cache and app_config.llm_cache_enabled and response:
|
||||
llm_cache.set(cache_key, response)
|
||||
|
||||
return response
|
||||
|
||||
def call_with_cache(
|
||||
self,
|
||||
prompt: str,
|
||||
system_prompt: str = None,
|
||||
max_tokens: int = None,
|
||||
temperature: float = None,
|
||||
use_cache: bool = True
|
||||
) -> str:
|
||||
"""带缓存的同步LLM调用"""
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
except RuntimeError:
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
||||
import nest_asyncio
|
||||
nest_asyncio.apply()
|
||||
|
||||
return loop.run_until_complete(
|
||||
self.async_call_with_cache(prompt, system_prompt, max_tokens, temperature, use_cache)
|
||||
)
|
||||
|
||||
async def async_call_stream(
|
||||
self,
|
||||
prompt: str,
|
||||
system_prompt: str = None,
|
||||
max_tokens: int = None,
|
||||
temperature: float = None,
|
||||
callback: Optional[Callable[[str], None]] = None
|
||||
) -> AsyncIterator[str]:
|
||||
"""流式异步LLM调用"""
|
||||
messages = []
|
||||
if system_prompt:
|
||||
messages.append({"role": "system", "content": system_prompt})
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
kwargs = {
|
||||
'stream': True,
|
||||
'max_tokens': max_tokens or self.config.max_tokens,
|
||||
'temperature': temperature or self.config.temperature
|
||||
}
|
||||
|
||||
try:
|
||||
response = await self.client.chat_completions_create(
|
||||
messages=messages,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
full_response = ""
|
||||
async for chunk in response:
|
||||
if chunk.choices[0].delta.content:
|
||||
content = chunk.choices[0].delta.content
|
||||
full_response += content
|
||||
|
||||
# 调用回调函数
|
||||
if callback:
|
||||
callback(content)
|
||||
|
||||
yield content
|
||||
|
||||
except Exception as e:
|
||||
print(f"流式LLM调用失败: {e}")
|
||||
yield ""
|
||||
113
utils/logger.py
Normal file
113
utils/logger.py
Normal file
@@ -0,0 +1,113 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
统一日志模块 - 替代全局 sys.stdout 劫持
|
||||
|
||||
提供线程安全的日志记录,支持同时输出到终端和文件。
|
||||
每个会话拥有独立的日志文件,不会互相干扰。
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def create_session_logger(
|
||||
session_id: str,
|
||||
log_dir: str,
|
||||
log_filename: str = "process.log",
|
||||
level: int = logging.INFO,
|
||||
) -> logging.Logger:
|
||||
"""
|
||||
为指定会话创建独立的 Logger 实例。
|
||||
|
||||
Args:
|
||||
session_id: 会话唯一标识
|
||||
log_dir: 日志文件所在目录
|
||||
log_filename: 日志文件名
|
||||
level: 日志级别
|
||||
|
||||
Returns:
|
||||
配置好的 Logger 实例
|
||||
"""
|
||||
logger = logging.getLogger(f"session.{session_id}")
|
||||
logger.setLevel(level)
|
||||
|
||||
# 避免重复添加 handler
|
||||
if logger.handlers:
|
||||
return logger
|
||||
|
||||
formatter = logging.Formatter(
|
||||
fmt="%(asctime)s %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
|
||||
# 文件 handler — 写入会话专属日志
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
log_path = os.path.join(log_dir, log_filename)
|
||||
file_handler = logging.FileHandler(log_path, encoding="utf-8", mode="a")
|
||||
file_handler.setFormatter(formatter)
|
||||
logger.addHandler(file_handler)
|
||||
|
||||
# 终端 handler — 输出到 stderr(不干扰 stdout)
|
||||
console_handler = logging.StreamHandler(sys.stderr)
|
||||
console_handler.setFormatter(formatter)
|
||||
logger.addHandler(console_handler)
|
||||
|
||||
# 不向父 logger 传播
|
||||
logger.propagate = False
|
||||
|
||||
return logger
|
||||
|
||||
|
||||
class PrintCapture:
|
||||
"""
|
||||
轻量级 print 捕获器,将 print 输出同时写入日志文件。
|
||||
用于兼容现有大量使用 print() 的代码,无需逐行改造。
|
||||
|
||||
用法:
|
||||
with PrintCapture(log_path) as cap:
|
||||
print("hello") # 同时输出到终端和文件
|
||||
# 退出后 sys.stdout 自动恢复
|
||||
"""
|
||||
|
||||
def __init__(self, log_path: str, filter_patterns: Optional[list] = None):
|
||||
self.log_path = log_path
|
||||
self.filter_patterns = filter_patterns or ["[TOOL] 执行代码:"]
|
||||
self._original_stdout = None
|
||||
self._log_file = None
|
||||
|
||||
def __enter__(self):
|
||||
os.makedirs(os.path.dirname(self.log_path), exist_ok=True)
|
||||
self._original_stdout = sys.stdout
|
||||
self._log_file = open(self.log_path, "a", encoding="utf-8", buffering=1)
|
||||
sys.stdout = self._DualWriter(
|
||||
self._original_stdout, self._log_file, self.filter_patterns
|
||||
)
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
sys.stdout = self._original_stdout
|
||||
if self._log_file:
|
||||
self._log_file.close()
|
||||
return False
|
||||
|
||||
class _DualWriter:
|
||||
"""同时写入两个流,支持过滤"""
|
||||
|
||||
def __init__(self, terminal, log_file, filter_patterns):
|
||||
self.terminal = terminal
|
||||
self.log_file = log_file
|
||||
self.filter_patterns = filter_patterns
|
||||
|
||||
def write(self, message):
|
||||
self.terminal.write(message)
|
||||
# 过滤不需要写入日志的内容
|
||||
if any(p in message for p in self.filter_patterns):
|
||||
return
|
||||
self.log_file.write(message)
|
||||
|
||||
def flush(self):
|
||||
self.terminal.flush()
|
||||
self.log_file.flush()
|
||||
215
utils/script_generator.py
Normal file
215
utils/script_generator.py
Normal file
@@ -0,0 +1,215 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
可复用脚本生成器
|
||||
|
||||
从分析会话的执行历史中提取成功执行的代码,
|
||||
合并去重后生成可独立运行的 .py 脚本文件。
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Any, Set
|
||||
|
||||
|
||||
def extract_imports(code: str) -> Set[str]:
|
||||
"""从代码中提取所有 import 语句"""
|
||||
imports = set()
|
||||
lines = code.split('\n')
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
if stripped.startswith('import ') or stripped.startswith('from '):
|
||||
# 标准化 import 语句
|
||||
imports.add(stripped)
|
||||
return imports
|
||||
|
||||
|
||||
def remove_imports(code: str) -> str:
|
||||
"""从代码中移除所有 import 语句"""
|
||||
lines = code.split('\n')
|
||||
result_lines = []
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
if not stripped.startswith('import ') and not stripped.startswith('from '):
|
||||
result_lines.append(line)
|
||||
return '\n'.join(result_lines)
|
||||
|
||||
|
||||
def clean_code_block(code: str) -> str:
|
||||
"""清理代码块,移除不必要的内容"""
|
||||
# 移除可能的重复配置代码
|
||||
patterns_to_skip = [
|
||||
r"plt\.rcParams\['font\.sans-serif'\]", # 字体配置在模板中统一处理
|
||||
r"plt\.rcParams\['axes\.unicode_minus'\]",
|
||||
]
|
||||
|
||||
lines = code.split('\n')
|
||||
result_lines = []
|
||||
skip_until_empty = False
|
||||
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
|
||||
# 跳过空行连续的情况
|
||||
if not stripped:
|
||||
if skip_until_empty:
|
||||
skip_until_empty = False
|
||||
continue
|
||||
result_lines.append(line)
|
||||
continue
|
||||
|
||||
# 检查是否需要跳过的模式
|
||||
should_skip = False
|
||||
for pattern in patterns_to_skip:
|
||||
if re.search(pattern, stripped):
|
||||
should_skip = True
|
||||
break
|
||||
|
||||
if not should_skip:
|
||||
result_lines.append(line)
|
||||
|
||||
return '\n'.join(result_lines)
|
||||
|
||||
|
||||
def generate_reusable_script(
|
||||
analysis_results: List[Dict[str, Any]],
|
||||
data_files: List[str],
|
||||
session_output_dir: str,
|
||||
user_requirement: str = ""
|
||||
) -> str:
|
||||
"""
|
||||
从分析结果中生成可复用的 Python 脚本
|
||||
|
||||
Args:
|
||||
analysis_results: 分析过程中记录的结果列表,每个元素包含 'code', 'result' 等
|
||||
data_files: 原始数据文件路径列表
|
||||
session_output_dir: 会话输出目录
|
||||
user_requirement: 用户的原始需求描述
|
||||
|
||||
Returns:
|
||||
生成的脚本文件路径
|
||||
"""
|
||||
# 收集所有成功执行的代码
|
||||
all_imports = set()
|
||||
code_blocks = []
|
||||
|
||||
for result in analysis_results:
|
||||
# 只处理 generate_code 类型的结果
|
||||
if result.get("action") == "collect_figures":
|
||||
continue
|
||||
|
||||
code = result.get("code", "")
|
||||
exec_result = result.get("result", {})
|
||||
|
||||
# 只收集成功执行的代码
|
||||
if code and exec_result.get("success", False):
|
||||
# 提取 imports
|
||||
imports = extract_imports(code)
|
||||
all_imports.update(imports)
|
||||
|
||||
# 清理代码块
|
||||
cleaned_code = remove_imports(code)
|
||||
cleaned_code = clean_code_block(cleaned_code)
|
||||
|
||||
# 只添加非空的代码块
|
||||
if cleaned_code.strip():
|
||||
code_blocks.append({
|
||||
"round": result.get("round", 0),
|
||||
"code": cleaned_code.strip()
|
||||
})
|
||||
|
||||
if not code_blocks:
|
||||
print("[WARN] 没有成功执行的代码块,跳过脚本生成")
|
||||
return ""
|
||||
|
||||
# 生成脚本内容
|
||||
now = datetime.now()
|
||||
timestamp = now.strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
# 构建脚本头部
|
||||
script_header = f'''#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
数据分析脚本 - 自动生成
|
||||
=====================================
|
||||
原始数据文件: {', '.join(data_files)}
|
||||
生成时间: {now.strftime("%Y-%m-%d %H:%M:%S")}
|
||||
原始需求: {user_requirement[:200] + '...' if len(user_requirement) > 200 else user_requirement}
|
||||
=====================================
|
||||
|
||||
使用方法:
|
||||
1. 修改下方 DATA_FILES 列表中的文件路径
|
||||
2. 修改 OUTPUT_DIR 指定输出目录
|
||||
3. 运行: python {os.path.basename(session_output_dir)}_分析脚本.py
|
||||
"""
|
||||
|
||||
import os
|
||||
'''
|
||||
|
||||
# 添加标准 imports(去重后排序)
|
||||
standard_imports = sorted([imp for imp in all_imports if imp.startswith('import ')])
|
||||
from_imports = sorted([imp for imp in all_imports if imp.startswith('from ')])
|
||||
|
||||
imports_section = '\n'.join(standard_imports + from_imports)
|
||||
|
||||
# 配置区域
|
||||
config_section = f'''
|
||||
# ========== 配置区域 (可修改) ==========
|
||||
|
||||
# 数据文件路径 - 修改此处以分析不同的数据
|
||||
DATA_FILES = {repr(data_files)}
|
||||
|
||||
# 输出目录 - 图片和报告将保存在此目录
|
||||
OUTPUT_DIR = "./analysis_output"
|
||||
|
||||
# 创建输出目录
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
# ========== 字体配置 (中文显示) ==========
|
||||
import platform
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
system_name = platform.system()
|
||||
if system_name == 'Darwin':
|
||||
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'PingFang SC', 'sans-serif']
|
||||
elif system_name == 'Windows':
|
||||
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'sans-serif']
|
||||
else:
|
||||
plt.rcParams['font.sans-serif'] = ['WenQuanYi Micro Hei', 'sans-serif']
|
||||
plt.rcParams['axes.unicode_minus'] = False
|
||||
|
||||
# 设置 session_output_dir 变量(兼容原始代码)
|
||||
session_output_dir = OUTPUT_DIR
|
||||
'''
|
||||
|
||||
# 合并代码块
|
||||
code_section = "\n# ========== 分析代码 ==========\n\n"
|
||||
|
||||
for i, block in enumerate(code_blocks, 1):
|
||||
code_section += f"# --- 第 {block['round']} 轮分析 ---\n"
|
||||
code_section += block['code'] + "\n\n"
|
||||
|
||||
# 脚本尾部
|
||||
script_footer = '''
|
||||
# ========== 完成 ==========
|
||||
print("\\n" + "=" * 50)
|
||||
print("[OK] 分析完成!")
|
||||
print(f"[OUTPUT] 输出目录: {os.path.abspath(OUTPUT_DIR)}")
|
||||
print("=" * 50)
|
||||
'''
|
||||
|
||||
# 组装完整脚本
|
||||
full_script = script_header + imports_section + config_section + code_section + script_footer
|
||||
|
||||
# 保存脚本文件
|
||||
script_filename = f"分析脚本_{timestamp}.py"
|
||||
script_path = os.path.join(session_output_dir, script_filename)
|
||||
|
||||
try:
|
||||
with open(script_path, 'w', encoding='utf-8') as f:
|
||||
f.write(full_script)
|
||||
print(f"[OK] 可复用脚本已生成: {script_path}")
|
||||
return script_path
|
||||
except Exception as e:
|
||||
print(f"[ERROR] 保存脚本失败: {e}")
|
||||
return ""
|
||||
857
web/main.py
Normal file
857
web/main.py
Normal file
@@ -0,0 +1,857 @@
|
||||
|
||||
import sys
|
||||
import os
|
||||
import threading
|
||||
import glob
|
||||
import uuid
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, List
|
||||
from fastapi import FastAPI, UploadFile, File, BackgroundTasks, HTTPException, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import FileResponse, JSONResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Add parent directory to path to import agent modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from data_analysis_agent import DataAnalysisAgent
|
||||
from config.llm_config import LLMConfig
|
||||
from utils.create_session_dir import create_session_output_dir
|
||||
from utils.logger import PrintCapture
|
||||
|
||||
app = FastAPI(title="IOV Data Analysis Agent")
|
||||
|
||||
|
||||
def _to_web_path(fs_path: str) -> str:
|
||||
"""将文件系统路径转为 URL 安全的正斜杠路径(修复 Windows 反斜杠问题)"""
|
||||
return fs_path.replace("\\", "/")
|
||||
|
||||
|
||||
# CORS
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# --- Session Management ---
|
||||
|
||||
class SessionData:
|
||||
def __init__(self, session_id: str):
|
||||
self.session_id = session_id
|
||||
self.is_running = False
|
||||
self.output_dir: Optional[str] = None
|
||||
self.generated_report: Optional[str] = None
|
||||
self.log_file: Optional[str] = None
|
||||
self.analysis_results: List[Dict] = [] # Store analysis results for gallery
|
||||
self.agent: Optional[DataAnalysisAgent] = None # Store the agent instance for follow-up
|
||||
|
||||
# 新增:进度跟踪
|
||||
self.current_round: int = 0
|
||||
self.max_rounds: int = 20
|
||||
self.progress_percentage: float = 0.0
|
||||
self.status_message: str = "等待开始"
|
||||
|
||||
# 新增:历史记录
|
||||
self.created_at: str = ""
|
||||
self.last_updated: str = ""
|
||||
self.user_requirement: str = ""
|
||||
self.file_list: List[str] = []
|
||||
self.reusable_script: Optional[str] = None # 新增:可复用脚本路径
|
||||
|
||||
|
||||
class SessionManager:
|
||||
def __init__(self):
|
||||
self.sessions: Dict[str, SessionData] = {}
|
||||
self.lock = threading.Lock()
|
||||
|
||||
def create_session(self) -> str:
|
||||
with self.lock:
|
||||
session_id = str(uuid.uuid4())
|
||||
self.sessions[session_id] = SessionData(session_id)
|
||||
return session_id
|
||||
|
||||
|
||||
def get_session(self, session_id: str) -> Optional[SessionData]:
|
||||
if session_id in self.sessions:
|
||||
return self.sessions[session_id]
|
||||
|
||||
# Fallback: Try to reconstruct from disk for history sessions
|
||||
output_dir = os.path.join("outputs", f"session_{session_id}")
|
||||
if os.path.exists(output_dir) and os.path.isdir(output_dir):
|
||||
return self._reconstruct_session(session_id, output_dir)
|
||||
|
||||
return None
|
||||
|
||||
def _reconstruct_session(self, session_id: str, output_dir: str) -> SessionData:
|
||||
"""从磁盘目录重建会话对象"""
|
||||
session = SessionData(session_id)
|
||||
session.output_dir = output_dir
|
||||
session.is_running = False
|
||||
session.current_round = session.max_rounds
|
||||
session.progress_percentage = 100.0
|
||||
session.status_message = "已完成 (历史记录)"
|
||||
|
||||
# Recover Log
|
||||
log_path = os.path.join(output_dir, "process.log")
|
||||
if os.path.exists(log_path):
|
||||
session.log_file = log_path
|
||||
|
||||
# Recover Report
|
||||
# 宽容查找:扫描所有 .md 文件,优先取包含 "report" 或 "报告" 的文件
|
||||
md_files = glob.glob(os.path.join(output_dir, "*.md"))
|
||||
if md_files:
|
||||
# 默认取第一个
|
||||
chosen = md_files[0]
|
||||
# 尝试找更好的匹配
|
||||
for md in md_files:
|
||||
fname = os.path.basename(md).lower()
|
||||
if "report" in fname or "报告" in fname:
|
||||
chosen = md
|
||||
break
|
||||
session.generated_report = chosen
|
||||
|
||||
# Recover Script (查找可能的脚本文件)
|
||||
possible_scripts = ["data_analysis_script.py", "script.py", "analysis_script.py"]
|
||||
for s in possible_scripts:
|
||||
p = os.path.join(output_dir, s)
|
||||
if os.path.exists(p):
|
||||
session.reusable_script = p
|
||||
break
|
||||
|
||||
# Recover Results (images etc)
|
||||
results_json = os.path.join(output_dir, "results.json")
|
||||
if os.path.exists(results_json):
|
||||
try:
|
||||
with open(results_json, "r") as f:
|
||||
session.analysis_results = json.load(f)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Recover Metadata
|
||||
try:
|
||||
stat = os.stat(output_dir)
|
||||
dt = datetime.fromtimestamp(stat.st_ctime)
|
||||
session.created_at = dt.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Cache it
|
||||
with self.lock:
|
||||
self.sessions[session_id] = session
|
||||
|
||||
return session
|
||||
|
||||
def list_sessions(self):
|
||||
return list(self.sessions.keys())
|
||||
|
||||
def delete_session(self, session_id: str) -> bool:
|
||||
"""删除指定会话"""
|
||||
with self.lock:
|
||||
if session_id in self.sessions:
|
||||
session = self.sessions[session_id]
|
||||
if session.agent:
|
||||
session.agent.reset()
|
||||
del self.sessions[session_id]
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_session_info(self, session_id: str) -> Optional[Dict]:
|
||||
"""获取会话详细信息"""
|
||||
session = self.get_session(session_id)
|
||||
if session:
|
||||
return {
|
||||
"session_id": session.session_id,
|
||||
"is_running": session.is_running,
|
||||
"progress": session.progress_percentage,
|
||||
"status": session.status_message,
|
||||
"current_round": session.current_round,
|
||||
"max_rounds": session.max_rounds,
|
||||
"created_at": session.created_at,
|
||||
"last_updated": session.last_updated,
|
||||
"user_requirement": session.user_requirement[:100] + "..." if len(session.user_requirement) > 100 else session.user_requirement,
|
||||
"script_path": session.reusable_script # 新增:返回脚本路径
|
||||
}
|
||||
return None
|
||||
|
||||
session_manager = SessionManager()
|
||||
|
||||
# Mount static files
|
||||
os.makedirs("web/static", exist_ok=True)
|
||||
os.makedirs("uploads", exist_ok=True)
|
||||
os.makedirs("outputs", exist_ok=True)
|
||||
|
||||
app.mount("/static", StaticFiles(directory="web/static"), name="static")
|
||||
app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs")
|
||||
|
||||
# --- Helper Functions ---
|
||||
|
||||
def run_analysis_task(session_id: str, files: list, user_requirement: str, is_followup: bool = False):
|
||||
"""在后台线程中运行分析任务"""
|
||||
session = session_manager.get_session(session_id)
|
||||
if not session:
|
||||
print(f"Error: Session {session_id} not found in background task.")
|
||||
return
|
||||
|
||||
session.is_running = True
|
||||
try:
|
||||
base_output_dir = "outputs"
|
||||
|
||||
if not session.output_dir:
|
||||
session.output_dir = create_session_output_dir(base_output_dir, user_requirement)
|
||||
|
||||
session_output_dir = session.output_dir
|
||||
session.log_file = os.path.join(session_output_dir, "process.log")
|
||||
|
||||
# 使用 PrintCapture 替代全局 FileLogger,退出 with 块后自动恢复 stdout
|
||||
with PrintCapture(session.log_file):
|
||||
if is_followup:
|
||||
print(f"\n--- Follow-up Session {session_id} Continued ---")
|
||||
else:
|
||||
print(f"--- Session {session_id} Started ---")
|
||||
|
||||
try:
|
||||
if not is_followup:
|
||||
llm_config = LLMConfig()
|
||||
agent = DataAnalysisAgent(llm_config, force_max_rounds=False, output_dir=base_output_dir)
|
||||
session.agent = agent
|
||||
|
||||
result = agent.analyze(
|
||||
user_input=user_requirement,
|
||||
files=files,
|
||||
session_output_dir=session_output_dir,
|
||||
reset_session=True,
|
||||
)
|
||||
else:
|
||||
agent = session.agent
|
||||
if not agent:
|
||||
print("Error: Agent not initialized for follow-up.")
|
||||
return
|
||||
|
||||
result = agent.analyze(
|
||||
user_input=user_requirement,
|
||||
files=None,
|
||||
session_output_dir=session_output_dir,
|
||||
reset_session=False,
|
||||
max_rounds=10,
|
||||
)
|
||||
|
||||
session.generated_report = result.get("report_file_path", None)
|
||||
session.analysis_results = result.get("analysis_results", [])
|
||||
session.reusable_script = result.get("reusable_script_path", None)
|
||||
|
||||
# 持久化结果
|
||||
with open(os.path.join(session_output_dir, "results.json"), "w") as f:
|
||||
json.dump(session.analysis_results, f, default=str)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during analysis: {e}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"System Error: {e}")
|
||||
finally:
|
||||
session.is_running = False
|
||||
|
||||
# --- Pydantic Models ---
|
||||
|
||||
class StartRequest(BaseModel):
|
||||
requirement: str
|
||||
|
||||
class ChatRequest(BaseModel):
|
||||
session_id: str
|
||||
message: str
|
||||
|
||||
# --- API Endpoints ---
|
||||
|
||||
@app.get("/")
|
||||
async def read_root():
|
||||
return FileResponse("web/static/index.html")
|
||||
|
||||
@app.post("/api/upload")
|
||||
async def upload_files(files: list[UploadFile] = File(...)):
|
||||
saved_files = []
|
||||
for file in files:
|
||||
file_location = f"uploads/{file.filename}"
|
||||
with open(file_location, "wb+") as file_object:
|
||||
file_object.write(file.file.read())
|
||||
saved_files.append(file_location)
|
||||
return {"info": f"Saved {len(saved_files)} files", "paths": saved_files}
|
||||
|
||||
@app.post("/api/start")
|
||||
async def start_analysis(request: StartRequest, background_tasks: BackgroundTasks):
|
||||
session_id = session_manager.create_session()
|
||||
|
||||
files = glob.glob("uploads/*.csv")
|
||||
if not files:
|
||||
if os.path.exists("cleaned_data.csv"):
|
||||
files = ["cleaned_data.csv"]
|
||||
else:
|
||||
raise HTTPException(status_code=400, detail="No CSV files found")
|
||||
|
||||
files = [os.path.abspath(f) for f in files] # Only use absolute paths
|
||||
|
||||
background_tasks.add_task(run_analysis_task, session_id, files, request.requirement, is_followup=False)
|
||||
return {"status": "started", "session_id": session_id}
|
||||
|
||||
@app.post("/api/chat")
|
||||
async def chat_analysis(request: ChatRequest, background_tasks: BackgroundTasks):
|
||||
session = session_manager.get_session(request.session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
if session.is_running:
|
||||
raise HTTPException(status_code=400, detail="Analysis already in progress")
|
||||
|
||||
background_tasks.add_task(run_analysis_task, request.session_id, [], request.message, is_followup=True)
|
||||
return {"status": "started"}
|
||||
|
||||
@app.get("/api/status")
|
||||
async def get_status(session_id: str = Query(..., description="Session ID")):
|
||||
session = session_manager.get_session(session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
log_content = ""
|
||||
if session.log_file and os.path.exists(session.log_file):
|
||||
with open(session.log_file, "r", encoding="utf-8") as f:
|
||||
log_content = f.read()
|
||||
|
||||
return {
|
||||
"is_running": session.is_running,
|
||||
"log": log_content,
|
||||
"has_report": session.generated_report is not None,
|
||||
"report_path": session.generated_report,
|
||||
"script_path": session.reusable_script # 新增:返回脚本路径
|
||||
}
|
||||
|
||||
@app.get("/api/export")
|
||||
async def export_session(session_id: str = Query(..., description="Session ID")):
|
||||
"""导出会话数据为ZIP"""
|
||||
session = session_manager.get_session(session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
if not session.output_dir or not os.path.exists(session.output_dir):
|
||||
raise HTTPException(status_code=404, detail="No data available for export")
|
||||
|
||||
import zipfile
|
||||
from datetime import datetime as dt
|
||||
|
||||
timestamp = dt.now().strftime("%Y%m%d_%H%M%S")
|
||||
zip_filename = f"report_{timestamp}.zip"
|
||||
|
||||
export_dir = "outputs"
|
||||
os.makedirs(export_dir, exist_ok=True)
|
||||
temp_zip_path = os.path.join(export_dir, zip_filename)
|
||||
|
||||
with zipfile.ZipFile(temp_zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
||||
for root, dirs, files in os.walk(session.output_dir):
|
||||
for file in files:
|
||||
if file.endswith(('.md', '.png', '.csv', '.log', '.json', '.yaml')):
|
||||
abs_path = os.path.join(root, file)
|
||||
rel_path = os.path.relpath(abs_path, session.output_dir)
|
||||
zf.write(abs_path, arcname=rel_path)
|
||||
|
||||
return FileResponse(
|
||||
path=temp_zip_path,
|
||||
filename=zip_filename,
|
||||
media_type='application/zip'
|
||||
)
|
||||
|
||||
@app.get("/api/report")
|
||||
async def get_report(session_id: str = Query(..., description="Session ID")):
|
||||
session = session_manager.get_session(session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
if not session.generated_report or not os.path.exists(session.generated_report):
|
||||
return {"content": "Report not ready.", "paragraphs": []}
|
||||
|
||||
with open(session.generated_report, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
# Fix image paths
|
||||
relative_session_path = _to_web_path(os.path.relpath(session.output_dir, os.getcwd()))
|
||||
web_base_path = f"/{relative_session_path}"
|
||||
|
||||
# Robust image path replacement
|
||||
content = content.replace("](./", f"]({web_base_path}/")
|
||||
|
||||
import re
|
||||
def replace_link(match):
|
||||
alt = match.group(1)
|
||||
url = match.group(2)
|
||||
if url.startswith("http") or url.startswith("/") or url.startswith("data:"):
|
||||
return match.group(0)
|
||||
clean_url = url.lstrip("./")
|
||||
return f""
|
||||
|
||||
content = re.sub(r'!\[(.*?)\]\((.*?)\)', replace_link, content)
|
||||
|
||||
# 将报告按段落拆分,为前端润色功能提供结构化数据
|
||||
paragraphs = _split_report_to_paragraphs(content)
|
||||
|
||||
return {"content": content, "base_path": web_base_path, "paragraphs": paragraphs}
|
||||
|
||||
@app.get("/api/figures")
|
||||
async def get_figures(session_id: str = Query(..., description="Session ID")):
|
||||
session = session_manager.get_session(session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
# We can try to get from memory first
|
||||
results = session.analysis_results
|
||||
|
||||
# If empty in memory (maybe server restarted but files exist?), try load json
|
||||
if not results and session.output_dir:
|
||||
json_path = os.path.join(session.output_dir, "results.json")
|
||||
if os.path.exists(json_path):
|
||||
with open(json_path, 'r') as f:
|
||||
results = json.load(f)
|
||||
|
||||
# Extract collected figures
|
||||
figures = []
|
||||
|
||||
# We iterate over analysis results to find 'collect_figures' actions
|
||||
if results:
|
||||
for item in results:
|
||||
if item.get("action") == "collect_figures":
|
||||
collected = item.get("collected_figures", [])
|
||||
for fig in collected:
|
||||
# Enrich with web path
|
||||
if session.output_dir:
|
||||
# Assume filename is present
|
||||
fname = fig.get("filename")
|
||||
relative_session_path = _to_web_path(os.path.relpath(session.output_dir, os.getcwd()))
|
||||
fig["web_url"] = f"/{relative_session_path}/{fname}"
|
||||
figures.append(fig)
|
||||
|
||||
# Also check for 'generate_code' results that might have implicit figures if we parse them
|
||||
# But the 'collect_figures' action is the reliable source as per agent design
|
||||
|
||||
# Auto-discovery fallback if list is empty but pngs exist?
|
||||
if not figures and session.output_dir:
|
||||
# Simple scan
|
||||
pngs = glob.glob(os.path.join(session.output_dir, "*.png"))
|
||||
for p in pngs:
|
||||
fname = os.path.basename(p)
|
||||
relative_session_path = _to_web_path(os.path.relpath(session.output_dir, os.getcwd()))
|
||||
figures.append({
|
||||
"filename": fname,
|
||||
"description": "Auto-discovered image",
|
||||
"analysis": "No analysis available",
|
||||
"web_url": f"/{relative_session_path}/{fname}"
|
||||
})
|
||||
|
||||
return {"figures": figures}
|
||||
|
||||
@app.get("/api/download_script")
|
||||
async def download_script(session_id: str = Query(..., description="Session ID")):
|
||||
"""下载生成的Python脚本"""
|
||||
session = session_manager.get_session(session_id)
|
||||
if not session or not session.reusable_script:
|
||||
raise HTTPException(status_code=404, detail="Script not found")
|
||||
|
||||
if not os.path.exists(session.reusable_script):
|
||||
raise HTTPException(status_code=404, detail="Script file missing on server")
|
||||
|
||||
return FileResponse(
|
||||
path=session.reusable_script,
|
||||
filename=os.path.basename(session.reusable_script),
|
||||
media_type='text/x-python'
|
||||
)
|
||||
|
||||
# --- Tools API ---
|
||||
|
||||
|
||||
|
||||
# --- 新增API端点 ---
|
||||
|
||||
@app.get("/api/sessions/progress")
|
||||
async def get_session_progress(session_id: str = Query(..., description="Session ID")):
|
||||
"""获取会话分析进度"""
|
||||
session_info = session_manager.get_session_info(session_id)
|
||||
if not session_info:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
return session_info
|
||||
|
||||
|
||||
@app.get("/api/sessions/list")
|
||||
async def list_all_sessions():
|
||||
"""获取所有会话列表"""
|
||||
session_ids = session_manager.list_sessions()
|
||||
sessions_info = []
|
||||
|
||||
for sid in session_ids:
|
||||
info = session_manager.get_session_info(sid)
|
||||
if info:
|
||||
sessions_info.append(info)
|
||||
|
||||
return {"sessions": sessions_info, "total": len(sessions_info)}
|
||||
|
||||
|
||||
@app.delete("/api/sessions/{session_id}")
|
||||
async def delete_specific_session(session_id: str):
|
||||
"""删除指定会话"""
|
||||
success = session_manager.delete_session(session_id)
|
||||
if not success:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
return {"status": "deleted", "session_id": session_id}
|
||||
|
||||
|
||||
# --- Report Polishing API ---
|
||||
|
||||
import re as _re
|
||||
|
||||
def _split_report_to_paragraphs(markdown_content: str) -> list:
|
||||
"""
|
||||
将 Markdown 报告按语义段落拆分。
|
||||
每个段落包含 id、类型(heading/text/table/image)、原始内容。
|
||||
前端可据此实现段落级选择与润色。
|
||||
"""
|
||||
lines = markdown_content.split("\n")
|
||||
paragraphs = []
|
||||
current_block = []
|
||||
current_type = "text"
|
||||
para_id = 0
|
||||
|
||||
def flush_block():
|
||||
nonlocal para_id, current_block, current_type
|
||||
text = "\n".join(current_block).strip()
|
||||
if text:
|
||||
paragraphs.append({
|
||||
"id": f"p-{para_id}",
|
||||
"type": current_type,
|
||||
"content": text,
|
||||
})
|
||||
para_id += 1
|
||||
current_block = []
|
||||
current_type = "text"
|
||||
|
||||
in_table = False
|
||||
in_code = False
|
||||
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
|
||||
# 代码块边界
|
||||
if stripped.startswith("```"):
|
||||
if in_code:
|
||||
current_block.append(line)
|
||||
flush_block()
|
||||
in_code = False
|
||||
continue
|
||||
else:
|
||||
flush_block()
|
||||
current_block.append(line)
|
||||
current_type = "code"
|
||||
in_code = True
|
||||
continue
|
||||
|
||||
if in_code:
|
||||
current_block.append(line)
|
||||
continue
|
||||
|
||||
# 标题行 — 独立成段
|
||||
if _re.match(r"^#{1,6}\s", stripped):
|
||||
flush_block()
|
||||
current_block.append(line)
|
||||
current_type = "heading"
|
||||
flush_block()
|
||||
continue
|
||||
|
||||
# 图片行
|
||||
if _re.match(r"^!\[.*\]\(.*\)", stripped):
|
||||
flush_block()
|
||||
current_block.append(line)
|
||||
current_type = "image"
|
||||
flush_block()
|
||||
continue
|
||||
|
||||
# 表格行
|
||||
if stripped.startswith("|"):
|
||||
if not in_table:
|
||||
flush_block()
|
||||
in_table = True
|
||||
current_type = "table"
|
||||
current_block.append(line)
|
||||
continue
|
||||
else:
|
||||
if in_table:
|
||||
flush_block()
|
||||
in_table = False
|
||||
|
||||
# 空行 — 段落分隔
|
||||
if not stripped:
|
||||
flush_block()
|
||||
continue
|
||||
|
||||
# 普通文本
|
||||
current_block.append(line)
|
||||
|
||||
flush_block()
|
||||
return paragraphs
|
||||
|
||||
|
||||
class PolishRequest(BaseModel):
|
||||
session_id: str
|
||||
paragraph_id: str
|
||||
mode: str = "context" # "context" | "data" | "custom"
|
||||
custom_instruction: str = ""
|
||||
|
||||
|
||||
@app.post("/api/report/polish")
|
||||
async def polish_paragraph(request: PolishRequest):
|
||||
"""
|
||||
对报告中指定段落进行 AI 润色。
|
||||
|
||||
mode:
|
||||
- context: 根据上下文和图表信息润色,使表述更专业、更有洞察
|
||||
- data: 结合原始分析数据重新生成该段落内容
|
||||
- custom: 用户自定义润色指令
|
||||
"""
|
||||
session = session_manager.get_session(request.session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
if not session.generated_report or not os.path.exists(session.generated_report):
|
||||
raise HTTPException(status_code=404, detail="Report not found")
|
||||
|
||||
# 读取报告并拆分段落
|
||||
with open(session.generated_report, "r", encoding="utf-8") as f:
|
||||
report_content = f.read()
|
||||
|
||||
paragraphs = _split_report_to_paragraphs(report_content)
|
||||
|
||||
# 找到目标段落
|
||||
target = None
|
||||
target_idx = -1
|
||||
for i, p in enumerate(paragraphs):
|
||||
if p["id"] == request.paragraph_id:
|
||||
target = p
|
||||
target_idx = i
|
||||
break
|
||||
|
||||
if not target:
|
||||
raise HTTPException(status_code=404, detail=f"Paragraph {request.paragraph_id} not found")
|
||||
|
||||
# 构建上下文窗口(前后各2个段落)
|
||||
context_window = []
|
||||
for j in range(max(0, target_idx - 2), min(len(paragraphs), target_idx + 3)):
|
||||
if j != target_idx:
|
||||
context_window.append(paragraphs[j]["content"])
|
||||
context_text = "\n\n".join(context_window)
|
||||
|
||||
# 收集图表信息
|
||||
figures_info = ""
|
||||
if session.analysis_results:
|
||||
fig_parts = []
|
||||
for item in session.analysis_results:
|
||||
if item.get("action") == "collect_figures":
|
||||
for fig in item.get("collected_figures", []):
|
||||
fig_parts.append(f"- {fig.get('filename', '?')}: {fig.get('description', '')} / {fig.get('analysis', '')}")
|
||||
if fig_parts:
|
||||
figures_info = "\n".join(fig_parts)
|
||||
|
||||
# 构建润色 prompt
|
||||
if request.mode == "data":
|
||||
# 收集代码执行结果摘要
|
||||
data_summary_parts = []
|
||||
for item in session.analysis_results:
|
||||
result = item.get("result", {})
|
||||
if result.get("success") and result.get("output"):
|
||||
output_text = result["output"][:2000]
|
||||
data_summary_parts.append(output_text)
|
||||
data_summary = "\n---\n".join(data_summary_parts[:5])
|
||||
|
||||
polish_prompt = f"""你是一位资深数据分析专家。请基于以下分析数据,重写下方段落,使其包含更精确的数据引用和更深入的业务洞察。
|
||||
|
||||
## 分析数据摘要
|
||||
{data_summary}
|
||||
|
||||
## 图表信息
|
||||
{figures_info}
|
||||
|
||||
## 需要润色的段落
|
||||
{target['content']}
|
||||
|
||||
## 要求
|
||||
- 保持原有的 Markdown 格式(标题级别、表格结构等)
|
||||
- 用具体数据替换模糊描述
|
||||
- 增加业务洞察和趋势判断
|
||||
- 禁止使用第一人称
|
||||
- 直接输出润色后的 Markdown 内容,不要包裹在代码块中"""
|
||||
|
||||
elif request.mode == "custom":
|
||||
polish_prompt = f"""你是一位资深数据分析专家。请根据用户的指令润色以下段落。
|
||||
|
||||
## 用户指令
|
||||
{request.custom_instruction}
|
||||
|
||||
## 上下文
|
||||
{context_text}
|
||||
|
||||
## 图表信息
|
||||
{figures_info}
|
||||
|
||||
## 需要润色的段落
|
||||
{target['content']}
|
||||
|
||||
## 要求
|
||||
- 保持原有的 Markdown 格式
|
||||
- 严格遵循用户指令
|
||||
- 禁止使用第一人称
|
||||
- 直接输出润色后的 Markdown 内容,不要包裹在代码块中"""
|
||||
|
||||
else: # context mode
|
||||
polish_prompt = f"""你是一位资深数据分析专家。请润色以下段落,使其表述更专业、更有洞察力。
|
||||
|
||||
## 上下文(前后段落)
|
||||
{context_text}
|
||||
|
||||
## 图表信息
|
||||
{figures_info}
|
||||
|
||||
## 需要润色的段落
|
||||
{target['content']}
|
||||
|
||||
## 要求
|
||||
- 保持原有的 Markdown 格式(标题级别、表格结构等)
|
||||
- 提升专业性:使用同比、环比、占比等术语
|
||||
- 增加洞察:不仅描述现象,还要分析原因和影响
|
||||
- 禁止使用第一人称
|
||||
- 直接输出润色后的 Markdown 内容,不要包裹在代码块中"""
|
||||
|
||||
# 调用 LLM 润色
|
||||
try:
|
||||
from utils.llm_helper import LLMHelper
|
||||
llm = LLMHelper(LLMConfig())
|
||||
polished_content = llm.call(
|
||||
prompt=polish_prompt,
|
||||
system_prompt="你是一位专业的数据分析报告润色专家。直接输出润色后的内容,不要添加任何解释或包裹。",
|
||||
max_tokens=4096,
|
||||
)
|
||||
|
||||
# 清理可能的代码块包裹
|
||||
polished_content = polished_content.strip()
|
||||
if polished_content.startswith("```markdown"):
|
||||
polished_content = polished_content[len("```markdown"):].strip()
|
||||
if polished_content.startswith("```"):
|
||||
polished_content = polished_content[3:].strip()
|
||||
if polished_content.endswith("```"):
|
||||
polished_content = polished_content[:-3].strip()
|
||||
|
||||
return {
|
||||
"paragraph_id": request.paragraph_id,
|
||||
"original": target["content"],
|
||||
"polished": polished_content,
|
||||
"mode": request.mode,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Polish failed: {str(e)}")
|
||||
|
||||
|
||||
class ApplyPolishRequest(BaseModel):
|
||||
session_id: str
|
||||
paragraph_id: str
|
||||
new_content: str
|
||||
|
||||
|
||||
@app.post("/api/report/apply")
|
||||
async def apply_polish(request: ApplyPolishRequest):
|
||||
"""
|
||||
将润色后的内容应用到报告文件中,替换指定段落。
|
||||
"""
|
||||
session = session_manager.get_session(request.session_id)
|
||||
if not session:
|
||||
raise HTTPException(status_code=404, detail="Session not found")
|
||||
|
||||
if not session.generated_report or not os.path.exists(session.generated_report):
|
||||
raise HTTPException(status_code=404, detail="Report not found")
|
||||
|
||||
with open(session.generated_report, "r", encoding="utf-8") as f:
|
||||
report_content = f.read()
|
||||
|
||||
paragraphs = _split_report_to_paragraphs(report_content)
|
||||
|
||||
# 找到目标段落并替换
|
||||
target = None
|
||||
for p in paragraphs:
|
||||
if p["id"] == request.paragraph_id:
|
||||
target = p
|
||||
break
|
||||
|
||||
if not target:
|
||||
raise HTTPException(status_code=404, detail=f"Paragraph {request.paragraph_id} not found")
|
||||
|
||||
# 在原文中替换
|
||||
new_report = report_content.replace(target["content"], request.new_content, 1)
|
||||
|
||||
# 写回文件
|
||||
with open(session.generated_report, "w", encoding="utf-8") as f:
|
||||
f.write(new_report)
|
||||
|
||||
return {"status": "applied", "paragraph_id": request.paragraph_id}
|
||||
|
||||
|
||||
# --- History API ---
|
||||
|
||||
@app.get("/api/history")
|
||||
async def get_history():
|
||||
"""
|
||||
Get list of past analysis sessions from outputs directory
|
||||
"""
|
||||
history = []
|
||||
output_base = "outputs"
|
||||
|
||||
if not os.path.exists(output_base):
|
||||
return {"history": []}
|
||||
|
||||
try:
|
||||
# Scan for session_* directories
|
||||
for entry in os.scandir(output_base):
|
||||
if entry.is_dir() and entry.name.startswith("session_"):
|
||||
# Extract timestamp from folder name: session_20250101_120000
|
||||
session_id = entry.name.replace("session_", "")
|
||||
|
||||
# Check creation time or extract from name
|
||||
try:
|
||||
# Try to parse timestamp from ID if it matches format
|
||||
# Format: YYYYMMDD_HHMMSS
|
||||
timestamp_str = session_id
|
||||
dt = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
|
||||
display_time = dt.strftime("%Y-%m-%d %H:%M:%S")
|
||||
sort_key = dt.timestamp()
|
||||
except ValueError:
|
||||
# Fallback to file creation time
|
||||
sort_key = entry.stat().st_ctime
|
||||
display_time = datetime.fromtimestamp(sort_key).strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
history.append({
|
||||
"id": session_id,
|
||||
"timestamp": display_time,
|
||||
"sort_key": sort_key,
|
||||
"name": f"Session {display_time}"
|
||||
})
|
||||
|
||||
# Sort by latest first
|
||||
history.sort(key=lambda x: x["sort_key"], reverse=True)
|
||||
|
||||
# Cleanup internal sort key
|
||||
for item in history:
|
||||
del item["sort_key"]
|
||||
|
||||
return {"history": history}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error scanning history: {e}")
|
||||
return {"history": []}
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
767
web/static/clean_style.css
Normal file
767
web/static/clean_style.css
Normal file
@@ -0,0 +1,767 @@
|
||||
/* Clean Style - IOV Data Analysis Agent */
|
||||
|
||||
:root {
|
||||
--primary-color: #2563EB;
|
||||
/* Tech Blue */
|
||||
--primary-hover: #1D4ED8;
|
||||
--bg-color: #FFFFFF;
|
||||
--sidebar-bg: #F9FAFB;
|
||||
--text-primary: #111827;
|
||||
--text-secondary: #6B7280;
|
||||
--border-color: #E5E7EB;
|
||||
--card-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1), 0 1px 2px 0 rgba(0, 0, 0, 0.06);
|
||||
--font-family: 'Inter', -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: var(--font-family);
|
||||
color: var(--text-primary);
|
||||
background-color: var(--bg-color);
|
||||
line-height: 1.5;
|
||||
height: 100vh;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.app-container {
|
||||
display: flex;
|
||||
height: 100vh;
|
||||
}
|
||||
|
||||
/* Sidebar */
|
||||
.sidebar {
|
||||
width: 240px;
|
||||
/* Compact width */
|
||||
background-color: var(--sidebar-bg);
|
||||
border-right: 1px solid var(--border-color);
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
padding: 1rem;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.brand {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
margin-bottom: 1.5rem;
|
||||
font-weight: 600;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.brand i {
|
||||
color: var(--primary-color);
|
||||
font-size: 1.5rem;
|
||||
}
|
||||
|
||||
.nav-menu {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.5rem;
|
||||
flex: 1;
|
||||
overflow-y: hidden;
|
||||
/* Let history list handle scroll */
|
||||
}
|
||||
|
||||
.nav-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
padding: 0.75rem 1rem;
|
||||
border-radius: 0.375rem;
|
||||
color: var(--text-secondary);
|
||||
text-decoration: none;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
font-size: 0.95rem;
|
||||
border: none;
|
||||
background: none;
|
||||
width: 100%;
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
.nav-item:hover {
|
||||
background-color: #F3F4F6;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.nav-item.active {
|
||||
background-color: #EFF6FF;
|
||||
color: var(--primary-color);
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.nav-item i {
|
||||
width: 1.25rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.nav-divider {
|
||||
height: 1px;
|
||||
background-color: var(--border-color);
|
||||
margin: 1rem 0 0.5rem 0;
|
||||
}
|
||||
|
||||
.nav-section-title {
|
||||
font-size: 0.75rem;
|
||||
text-transform: uppercase;
|
||||
color: var(--text-secondary);
|
||||
font-weight: 600;
|
||||
letter-spacing: 0.05em;
|
||||
margin-bottom: 0.5rem;
|
||||
padding-left: 0.5rem;
|
||||
}
|
||||
|
||||
/* History List */
|
||||
.history-list {
|
||||
flex: 1;
|
||||
overflow-y: auto;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.25rem;
|
||||
padding-right: 5px;
|
||||
}
|
||||
|
||||
.history-item {
|
||||
font-size: 0.85rem;
|
||||
color: var(--text-secondary);
|
||||
padding: 0.5rem 0.75rem;
|
||||
border-radius: 0.375rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
.history-item:hover {
|
||||
background-color: #F3F4F6;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.history-item.active {
|
||||
background-color: #EFF6FF;
|
||||
color: var(--primary-color);
|
||||
}
|
||||
|
||||
|
||||
.status-bar {
|
||||
margin-top: auto;
|
||||
padding-top: 1rem;
|
||||
border-top: 1px solid var(--border-color);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.status-dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
border-radius: 50%;
|
||||
background-color: #D1D5DB;
|
||||
}
|
||||
|
||||
.status-dot.running {
|
||||
background-color: var(--primary-color);
|
||||
box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.2);
|
||||
}
|
||||
|
||||
/* Main Content */
|
||||
.main-content {
|
||||
flex: 1;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
height: 100vh;
|
||||
overflow: hidden;
|
||||
background-color: #FFFFFF;
|
||||
}
|
||||
|
||||
.header {
|
||||
height: 64px;
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
padding: 0 2rem;
|
||||
background-color: #FFFFFF;
|
||||
}
|
||||
|
||||
.header h2 {
|
||||
font-size: 1.25rem;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.content-area {
|
||||
flex: 1;
|
||||
overflow-y: auto;
|
||||
padding: 2rem;
|
||||
background-color: #ffffff;
|
||||
}
|
||||
|
||||
/* Sections & Panel */
|
||||
.section {
|
||||
display: none;
|
||||
max-width: 1000px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
.section.active {
|
||||
    display: block;
}

.analysis-grid {
    display: grid;
    grid-template-columns: 350px 1fr;
    gap: 2rem;
    height: calc(100vh - 64px - 4rem);
}

.panel {
    background: #FFFFFF;
    border: 1px solid var(--border-color);
    border-radius: 0.5rem;
    padding: 1.5rem;
    display: flex;
    flex-direction: column;
    gap: 1.5rem;
}

.panel-title {
    font-size: 1rem;
    font-weight: 600;
    color: var(--text-primary);
    margin-bottom: 0.5rem;
    display: flex;
    align-items: center;
    justify-content: space-between;
}

/* Forms */
.form-group {
    display: flex;
    flex-direction: column;
    gap: 0.5rem;
}

.form-label {
    font-size: 0.875rem;
    font-weight: 500;
    color: var(--text-secondary);
}

.form-input,
.form-textarea {
    padding: 0.625rem 0.875rem;
    border: 1px solid var(--border-color);
    border-radius: 0.375rem;
    font-family: inherit;
    font-size: 0.9rem;
    color: var(--text-primary);
    outline: none;
    transition: border-color 0.2s;
    width: 100%;
}

.form-input:focus,
.form-textarea:focus {
    border-color: var(--primary-color);
    box-shadow: 0 0 0 2px rgba(37, 99, 235, 0.1);
}

.form-textarea {
    resize: vertical;
    min-height: 100px;
}

/* Buttons */
.btn {
    display: inline-flex;
    align-items: center;
    justify-content: center;
    gap: 0.5rem;
    padding: 0.625rem 1.25rem;
    border-radius: 0.375rem;
    font-weight: 500;
    font-size: 0.9rem;
    cursor: pointer;
    transition: all 0.2s;
    border: 1px solid transparent;
}

.btn-primary {
    background-color: var(--primary-color);
    color: white;
}

.btn-primary:hover {
    background-color: var(--primary-hover);
}

.btn-secondary {
    background-color: white;
    border-color: var(--border-color);
    color: var(--text-primary);
}

.btn-secondary:hover {
    background-color: #F9FAFB;
    border-color: #D1D5DB;
}

.btn-sm {
    padding: 0.375rem 0.75rem;
    font-size: 0.875rem;
}

/* Upload Area */
.upload-area {
    border: 2px dashed var(--border-color);
    border-radius: 0.5rem;
    padding: 2rem;
    text-align: center;
    cursor: pointer;
    transition: all 0.2s;
    background-color: #F9FAFB;
}

.upload-area:hover,
.upload-area.dragover {
    border-color: var(--primary-color);
    background-color: #EFF6FF;
}

.upload-icon {
    font-size: 1.5rem;
    color: var(--text-secondary);
    margin-bottom: 0.75rem;
}

.file-list {
    margin-top: 1rem;
    display: flex;
    flex-direction: column;
    gap: 0.5rem;
}

.file-item {
    display: flex;
    align-items: center;
    gap: 0.5rem;
    font-size: 0.85rem;
    color: var(--text-primary);
    background: #FFFFFF;
    padding: 0.5rem;
    border: 1px solid var(--border-color);
    border-radius: 0.25rem;
}

/* Tabs */
.tabs {
    display: flex;
    gap: 1rem;
    margin-left: 1rem;
}

.tab {
    padding: 0.25rem 0.5rem;
    font-size: 0.9rem;
    color: var(--text-secondary);
    cursor: pointer;
    border-bottom: 2px solid transparent;
    transition: all 0.2s;
}

.tab:hover {
    color: var(--text-primary);
}

.tab.active {
    color: var(--primary-color);
    border-bottom-color: var(--primary-color);
    font-weight: 500;
}

/* Log & Report Content */
.output-container {
    flex: 1;
    overflow-y: hidden;
    /* Individual tabs scroll */
    background: #F9FAFB;
    border: 1px solid var(--border-color);
    border-radius: 0.375rem;
    padding: 1rem;
    position: relative;
    display: flex;
    flex-direction: column;
}

#logsTab {
    background-color: #1a1b26;
    color: #a9b1d6;
    font-family: 'JetBrains Mono', 'Menlo', 'Monaco', 'Courier New', monospace;
    padding: 1.5rem;
}

.log-content {
    font-family: inherit;
    font-size: 0.85rem;
    white-space: pre-wrap;
    line-height: 1.6;
    margin: 0;
}

.report-content {
    font-size: 0.95rem;
    line-height: 1.7;
    color: #1F2937;
}

.report-content img {
    max-width: 100%;
    border-radius: 0.375rem;
    margin: 1rem 0;
    box-shadow: var(--card-shadow);
}

/* Empty State */
.empty-state {
    text-align: center;
    padding: 4rem 2rem;
    color: var(--text-secondary);
}

/* Utilities */
.hidden {
    display: none !important;
}

/* Gallery Carousel */
.carousel-container {
    position: relative;
    width: 100%;
    flex: 1;
    display: flex;
    align-items: center;
    justify-content: center;
    background: #F3F4F6;
    border-radius: 0.5rem;
    overflow: hidden;
    margin-bottom: 1rem;
}

.carousel-slide {
    width: 100%;
    height: 100%;
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    padding: 2rem;
}

.carousel-slide img {
    max-width: 100%;
    max-height: 500px;
    object-fit: contain;
    border-radius: 0.25rem;
    box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1);
    transition: transform 0.2s;
    background: white;
}

.carousel-btn {
    position: absolute;
    top: 50%;
    transform: translateY(-50%);
    background: rgba(255, 255, 255, 0.9);
    border: 1px solid var(--border-color);
    border-radius: 50%;
    width: 44px;
    height: 44px;
    display: flex;
    align-items: center;
    justify-content: center;
    cursor: pointer;
    z-index: 10;
    color: var(--text-primary);
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
    transition: all 0.2s;
}

.carousel-btn:hover {
    background: var(--primary-color);
    color: white;
    border-color: var(--primary-color);
    transform: translateY(-50%) scale(1.1);
}

.carousel-btn.prev {
    left: 1rem;
}

.carousel-btn.next {
    right: 1rem;
}

.image-info {
    width: 100%;
    text-align: center;
    color: var(--text-primary);
    background: white;
    padding: 1rem;
    border-radius: 0.5rem;
    border: 1px solid var(--border-color);
}

.image-title {
    font-weight: 600;
    font-size: 1.1rem;
    margin-bottom: 0.5rem;
    color: var(--primary-color);
}

.image-desc {
    font-size: 0.9rem;
    color: var(--text-secondary);
}

/* ===== Report Paragraph Polishing ===== */

.report-paragraph {
    position: relative;
    padding: 0.5rem 0.75rem;
    margin: 0.125rem 0;
    border-left: 3px solid transparent;
    border-radius: 0 0.25rem 0.25rem 0;
    transition: all 0.2s ease;
    cursor: pointer;
}

.report-paragraph:hover {
    background-color: #F0F7FF;
    border-left-color: #93C5FD;
}

.report-paragraph.selected {
    background-color: #EFF6FF;
    border-left-color: var(--primary-color);
    box-shadow: 0 1px 4px rgba(37, 99, 235, 0.1);
}

/* Per-paragraph-type tweaks */
.report-paragraph.para-heading {
    border-left-color: transparent;
}

.report-paragraph.para-heading:hover {
    border-left-color: #60A5FA;
}

.report-paragraph.para-image {
    text-align: center;
}

.report-paragraph.para-table .para-content {
    overflow-x: auto;
}

/* Polish action buttons */
.para-actions {
    display: flex;
    gap: 0.5rem;
    margin-top: 0.5rem;
    padding-top: 0.5rem;
    border-top: 1px dashed var(--border-color);
    flex-wrap: wrap;
    align-items: center;
}

.polish-btn {
    display: inline-flex;
    align-items: center;
    gap: 0.35rem;
    padding: 0.3rem 0.65rem;
    font-size: 0.8rem;
    border: 1px solid var(--border-color);
    border-radius: 1rem;
    background: white;
    color: var(--text-secondary);
    cursor: pointer;
    transition: all 0.2s;
    white-space: nowrap;
}

.polish-btn:hover {
    background: var(--primary-color);
    color: white;
    border-color: var(--primary-color);
    transform: translateY(-1px);
    box-shadow: 0 2px 6px rgba(37, 99, 235, 0.25);
}

.polish-loading {
    font-size: 0.85rem;
    color: var(--primary-color);
    display: flex;
    align-items: center;
    gap: 0.5rem;
    padding: 0.25rem 0;
}

/* Custom polish instruction input */
.custom-polish-input {
    display: flex;
    gap: 0.5rem;
    width: 100%;
    align-items: center;
}

.custom-polish-input .form-input {
    flex: 1;
    padding: 0.4rem 0.75rem;
    font-size: 0.85rem;
}

/* Polish comparison view */
.polish-diff {
    border: 1px solid #BFDBFE;
    border-radius: 0.5rem;
    overflow: hidden;
    background: white;
}

.diff-header {
    background: linear-gradient(135deg, #EFF6FF, #DBEAFE);
    padding: 0.6rem 1rem;
    border-bottom: 1px solid #BFDBFE;
}

.diff-title {
    font-size: 0.9rem;
    font-weight: 600;
    color: var(--primary-color);
    display: flex;
    align-items: center;
    gap: 0.5rem;
}

.diff-panels {
    display: grid;
    grid-template-columns: 1fr 1fr;
    gap: 0;
}

.diff-panel {
    padding: 0.75rem 1rem;
    font-size: 0.9rem;
    line-height: 1.6;
    min-height: 80px;
    overflow-y: auto;
    max-height: 400px;
}

.diff-original {
    background: #FEF2F2;
    border-right: 1px solid #BFDBFE;
}

.diff-polished {
    background: #F0FDF4;
}

.diff-label {
    font-size: 0.75rem;
    font-weight: 600;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    margin-bottom: 0.5rem;
    padding-bottom: 0.25rem;
    border-bottom: 1px solid rgba(0,0,0,0.08);
}

.diff-original .diff-label {
    color: #DC2626;
}

.diff-polished .diff-label {
    color: #16A34A;
}

.diff-body {
    color: var(--text-primary);
}

.diff-body img {
    max-width: 100%;
}

.diff-body table {
    width: 100%;
    border-collapse: collapse;
    font-size: 0.85rem;
}

.diff-body th, .diff-body td {
    border: 1px solid var(--border-color);
    padding: 0.3rem 0.5rem;
    text-align: left;
}

.diff-actions {
    display: flex;
    gap: 0.75rem;
    padding: 0.75rem 1rem;
    border-top: 1px solid #BFDBFE;
    background: #F9FAFB;
    justify-content: flex-end;
}

/* Tables inside report content */
.para-content table {
    width: 100%;
    border-collapse: collapse;
    margin: 0.5rem 0;
    font-size: 0.9rem;
}

.para-content th, .para-content td {
    border: 1px solid var(--border-color);
    padding: 0.4rem 0.6rem;
    text-align: left;
}

.para-content th {
    background: #F3F4F6;
    font-weight: 600;
}

.para-content img {
    max-width: 100%;
    border-radius: 0.375rem;
    margin: 0.5rem 0;
    box-shadow: var(--card-shadow);
}

/* Responsive: stack the comparison panels vertically on small screens */
@media (max-width: 900px) {
    .diff-panels {
        grid-template-columns: 1fr;
    }
    .diff-original {
        border-right: none;
        border-bottom: 1px solid #BFDBFE;
    }
    .analysis-grid {
        grid-template-columns: 1fr;
    }
}
168 web/static/index.html Normal file
@@ -0,0 +1,168 @@
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>IOV Data Analysis Agent</title>
    <link rel="stylesheet" href="/static/clean_style.css">

    <!-- Fonts -->
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link
        href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&family=JetBrains+Mono:wght@400;500&display=swap"
        rel="stylesheet">

    <!-- Icons -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">

    <!-- Markdown -->
    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
</head>

<body>
    <div class="app-container">
        <!-- Sidebar -->
        <aside class="sidebar">
            <div class="brand">
                <i class="fa-solid fa-cube"></i>
                <span>IOV Agent</span>
            </div>

            <nav class="nav-menu">
                <button class="nav-item active" onclick="switchView('analysis')">
                    <i class="fa-solid fa-chart-line"></i> Analysis
                </button>

                <div class="nav-divider"></div>
                <div class="nav-section-title">History</div>
                <div id="historyList" class="history-list">
                    <!-- History items loaded via JS -->
                    <div style="padding:0.5rem; font-size:0.8rem; color:#9CA3AF;">Loading...</div>
                </div>
            </nav>

            <div class="status-bar">
                <div id="statusDot" class="status-dot"></div>
                <span id="statusText">Ready</span>
            </div>
        </aside>

        <!-- Main Content -->
        <main class="main-content">
            <header class="header">
                <h2 id="pageTitle">Analysis Dashboard</h2>
            </header>

            <div class="content-area">
                <!-- VIEW: ANALYSIS -->
                <div id="viewAnalysis" class="section active">
                    <div class="analysis-grid">

                        <!-- Configuration Panel -->
                        <div class="panel">
                            <div class="panel-title">
                                <span>Configuration</span>
                            </div>

                            <div class="form-group">
                                <label class="form-label">1. Data Upload</label>
                                <div id="uploadZone" class="upload-area">
                                    <i class="fa-solid fa-cloud-arrow-up upload-icon"></i>
                                    <p>Click or Drag CSV/Excel Files</p>
                                    <div id="fileList" class="file-list"></div>
                                </div>
                                <input type="file" id="fileInput" multiple accept=".csv,.xlsx,.xls" hidden>
                            </div>

                            <div class="form-group">
                                <label class="form-label">2. Requirement</label>
                                <textarea id="requirementInput" class="form-textarea"
                                    placeholder="Describe what you want to analyze..."></textarea>
                            </div>

                            <button id="startBtn" class="btn btn-primary" style="margin-top: 1rem; width: 100%;">
                                <i class="fa-solid fa-play"></i> Start Analysis
                            </button>
                        </div>

                        <!-- Output Panel -->
                        <div class="panel" style="overflow:hidden; display:flex; flex-direction:column;">
                            <div class="panel-title" style="margin-bottom:0.5rem;">
                                <span>Output</span>
                                <div class="tabs">
                                    <div class="tab active" onclick="switchTab('logs')">Live Log</div>
                                    <div class="tab" onclick="switchTab('report')">Report</div>
                                    <div class="tab" onclick="switchTab('gallery')">Gallery</div>
                                </div>
                                <button id="downloadScriptBtn" class="btn btn-sm btn-secondary hidden"
                                    onclick="downloadScript()" style="margin-left:auto;">
                                    <i class="fa-solid fa-code"></i> Script
                                </button>
                            </div>

                            <div class="output-container" id="outputContainer">
                                <!-- Logs Tab -->
                                <div id="logsTab" class="tab-content active" style="height:100%; overflow-y:auto;">
                                    <pre id="logOutput" class="log-content">Waiting to start...</pre>
                                </div>

                                <!-- Report Tab -->
                                <div id="reportTab" class="tab-content hidden" style="height:100%; overflow-y:auto;">
                                    <div id="reportContainer" class="report-content markdown-body">
                                        <div class="empty-state">
                                            <p>Report will appear here after analysis.</p>
                                        </div>
                                    </div>
                                    <div id="followUpSection" class="hidden"
                                        style="margin-top:2rem; border-top:1px solid var(--border-color); padding-top:1rem;">
                                        <div class="form-group">
                                            <label class="form-label">Follow-up Analysis</label>
                                            <div style="display:flex; gap:0.5rem;">
                                                <input type="text" id="followUpInput" class="form-input"
                                                    placeholder="Ask a follow-up question...">
                                                <button class="btn btn-primary btn-sm"
                                                    onclick="sendFollowUp()">Send</button>
                                            </div>
                                        </div>
                                    </div>
                                    <div style="margin-top:1rem; text-align:right">
                                        <button id="exportBtn" class="btn btn-secondary btn-sm"
                                            onclick="triggerExport()">
                                            <i class="fa-solid fa-download"></i> Export ZIP
                                        </button>
                                    </div>
                                </div>

                                <!-- Gallery Tab -->
                                <div id="galleryTab" class="tab-content hidden"
                                    style="height:100%; display:flex; flex-direction:column; align-items:center; justify-content:center;">
                                    <div class="carousel-container">
                                        <button class="carousel-btn prev" onclick="prevImage()"><i
                                                class="fa-solid fa-chevron-left"></i></button>
                                        <div class="carousel-slide" id="carouselSlide">
                                            <p class="placeholder-text" style="color:var(--text-secondary);">No images
                                                generated.</p>
                                        </div>
                                        <button class="carousel-btn next" onclick="nextImage()"><i
                                                class="fa-solid fa-chevron-right"></i></button>
                                    </div>
                                    <div class="image-info" id="imageInfo" style="margin-top:1rem; text-align:center;">
                                        <!-- Title/Desc -->
                                    </div>
                                </div>
                            </div>
                        </div>

                    </div>
                </div>

            </div>
        </main>
    </div>

    <script src="/static/script.js"></script>
</body>

</html>
587 web/static/script.js Normal file
@@ -0,0 +1,587 @@
// DOM Elements
const uploadZone = document.getElementById('uploadZone');
const fileInput = document.getElementById('fileInput');
const fileList = document.getElementById('fileList');
const startBtn = document.getElementById('startBtn');
const requirementInput = document.getElementById('requirementInput');
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const logOutput = document.getElementById('logOutput');
const reportContainer = document.getElementById('reportContainer');
const downloadScriptBtn = document.getElementById('downloadScriptBtn');

let isRunning = false;
let pollingInterval = null;
let currentSessionId = null;

// Report paragraph data (used by the polishing feature)
let reportParagraphs = [];

// --- Upload Logic ---
if (uploadZone) {
    uploadZone.addEventListener('dragover', (e) => {
        e.preventDefault();
        uploadZone.classList.add('dragover');
    });
    uploadZone.addEventListener('dragleave', () => uploadZone.classList.remove('dragover'));
    uploadZone.addEventListener('drop', (e) => {
        e.preventDefault();
        uploadZone.classList.remove('dragover');
        handleFiles(e.dataTransfer.files);
    });
    uploadZone.addEventListener('click', () => fileInput.click());
}

if (fileInput) {
    fileInput.addEventListener('change', (e) => handleFiles(e.target.files));
    fileInput.addEventListener('click', (e) => e.stopPropagation());
}

async function handleFiles(files) {
    if (files.length === 0) return;

    fileList.innerHTML = '';
    const formData = new FormData();

    for (const file of files) {
        formData.append('files', file);
        const fileItem = document.createElement('div');
        fileItem.className = 'file-item';
        fileItem.innerHTML = `<i class="fa-regular fa-file-excel"></i> ${file.name}`;
        fileList.appendChild(fileItem);
    }

    try {
        const res = await fetch('/api/upload', { method: 'POST', body: formData });
        if (!res.ok) alert('Upload failed');
    } catch (e) {
        console.error(e);
        alert('Upload failed');
    }
}

// --- Analysis Logic ---
if (startBtn) {
    startBtn.addEventListener('click', startAnalysis);
}

async function startAnalysis() {
    if (isRunning) return;

    const requirement = requirementInput.value.trim();
    if (!requirement) {
        alert('Please enter analysis requirement');
        return;
    }

    setRunningState(true);

    try {
        const res = await fetch('/api/start', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ requirement })
        });

        if (res.ok) {
            const data = await res.json();
            currentSessionId = data.session_id;
            startPolling();
            switchTab('logs');
        } else {
            const err = await res.json();
            alert('Failed to start: ' + err.detail);
            setRunningState(false);
        }
    } catch (e) {
        console.error(e);
        alert('Error starting analysis');
        setRunningState(false);
    }
}

function setRunningState(running) {
    isRunning = running;
    startBtn.disabled = running;

    if (running) {
        startBtn.innerHTML = '<i class="fa-solid fa-spinner fa-spin"></i> Analysis in Progress...';
        statusDot.className = 'status-dot running';
        statusText.innerText = 'Analyzing';
        statusText.style.color = 'var(--primary-color)';
        const followUpSection = document.getElementById('followUpSection');
        if (followUpSection) followUpSection.classList.add('hidden');
        if (downloadScriptBtn) downloadScriptBtn.classList.add('hidden');
    } else {
        startBtn.innerHTML = '<i class="fa-solid fa-play"></i> Start Analysis';
        statusDot.className = 'status-dot';
        statusText.innerText = 'Completed';
        statusText.style.color = 'var(--text-secondary)';
        const followUpSection = document.getElementById('followUpSection');
        if (currentSessionId && followUpSection) followUpSection.classList.remove('hidden');
    }
}

function startPolling() {
    if (pollingInterval) clearInterval(pollingInterval);
    if (!currentSessionId) return;

    pollingInterval = setInterval(async () => {
        try {
            const res = await fetch(`/api/status?session_id=${currentSessionId}`);
            if (!res.ok) return;
            const data = await res.json();

            logOutput.innerText = data.log || "Waiting for output...";
            const logTab = document.getElementById('logsTab');
            if (logTab) logTab.scrollTop = logTab.scrollHeight;

            if (!data.is_running && isRunning) {
                setRunningState(false);
                clearInterval(pollingInterval);

                if (data.has_report) {
                    await loadReport();
                    switchTab('report');
                }
                if (data.script_path && downloadScriptBtn) {
                    downloadScriptBtn.classList.remove('hidden');
                    downloadScriptBtn.style.display = 'inline-flex';
                }
            }
        } catch (e) {
            console.error('Polling error', e);
        }
    }, 2000);
}

// --- Report Logic (with paragraph-level polishing) ---

async function loadReport() {
    if (!currentSessionId) return;
    try {
        const res = await fetch(`/api/report?session_id=${currentSessionId}`);
        const data = await res.json();

        if (!data.content || data.content === "Report not ready.") {
            reportContainer.innerHTML = '<div class="empty-state"><p>Analysis in progress or no report generated yet.</p></div>';
            reportParagraphs = [];
            return;
        }

        // Keep the paragraph data for the polishing feature
        reportParagraphs = data.paragraphs || [];

        // Render the paragraph-level report (click a paragraph to polish it)
        renderParagraphReport(reportParagraphs);

    } catch (e) {
        reportContainer.innerHTML = '<p class="error">Failed to load report.</p>';
    }
}

function renderParagraphReport(paragraphs) {
    if (!paragraphs || paragraphs.length === 0) {
        reportContainer.innerHTML = '<div class="empty-state"><p>No report content.</p></div>';
        return;
    }

    let html = '';
    for (const p of paragraphs) {
        const renderedContent = marked.parse(p.content);
        const typeClass = `para-${p.type}`;
        html += `
            <div class="report-paragraph ${typeClass}" data-para-id="${p.id}" onclick="selectParagraph('${p.id}')">
                <div class="para-content">${renderedContent}</div>
                <div class="para-actions hidden">
                    <button class="polish-btn" onclick="event.stopPropagation(); polishParagraph('${p.id}', 'context')" title="根据上下文润色">
                        <i class="fa-solid fa-wand-magic-sparkles"></i> 上下文润色
                    </button>
                    <button class="polish-btn" onclick="event.stopPropagation(); polishParagraph('${p.id}', 'data')" title="结合分析数据润色">
                        <i class="fa-solid fa-database"></i> 数据润色
                    </button>
                    <button class="polish-btn" onclick="event.stopPropagation(); showCustomPolish('${p.id}')" title="自定义润色指令">
                        <i class="fa-solid fa-pen"></i> 自定义
                    </button>
                </div>
            </div>
        `;
    }
    reportContainer.innerHTML = html;
}

window.selectParagraph = function(paraId) {
    // Deselect every paragraph first
    document.querySelectorAll('.report-paragraph').forEach(el => {
        el.classList.remove('selected');
        el.querySelector('.para-actions')?.classList.add('hidden');
    });

    // Select the clicked paragraph
    const target = document.querySelector(`[data-para-id="${paraId}"]`);
    if (target) {
        target.classList.add('selected');
        target.querySelector('.para-actions')?.classList.remove('hidden');
    }
}

window.polishParagraph = async function(paraId, mode, customInstruction = '') {
    if (!currentSessionId) return;

    const target = document.querySelector(`[data-para-id="${paraId}"]`);
    if (!target) return;

    // Show a loading indicator while the paragraph is rewritten
    const actionsEl = target.querySelector('.para-actions');
    const originalActions = actionsEl.innerHTML;
    actionsEl.innerHTML = '<span class="polish-loading"><i class="fa-solid fa-spinner fa-spin"></i> AI 润色中...</span>';

    try {
        const res = await fetch('/api/report/polish', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                session_id: currentSessionId,
                paragraph_id: paraId,
                mode: mode,
                custom_instruction: customInstruction,
            })
        });

        if (!res.ok) {
            const err = await res.json();
            alert('润色失败: ' + (err.detail || 'Unknown error'));
            actionsEl.innerHTML = originalActions;
            return;
        }

        const data = await res.json();

        // Show the before/after comparison view
        showPolishDiff(target, paraId, data.original, data.polished);

    } catch (e) {
        console.error(e);
        alert('润色请求失败');
        actionsEl.innerHTML = originalActions;
    }
}

function showPolishDiff(targetEl, paraId, original, polished) {
    const polishedHtml = marked.parse(polished);

    targetEl.innerHTML = `
        <div class="polish-diff">
            <div class="diff-header">
                <span class="diff-title"><i class="fa-solid fa-wand-magic-sparkles"></i> 润色结果预览</span>
            </div>
            <div class="diff-panels">
                <div class="diff-panel diff-original">
                    <div class="diff-label">原文</div>
                    <div class="diff-body">${marked.parse(original)}</div>
                </div>
                <div class="diff-panel diff-polished">
                    <div class="diff-label">润色后</div>
                    <div class="diff-body">${polishedHtml}</div>
                </div>
            </div>
            <div class="diff-actions">
                <button class="btn btn-primary btn-sm" id="acceptBtn-${paraId}">
                    <i class="fa-solid fa-check"></i> 采纳
                </button>
                <button class="btn btn-secondary btn-sm" id="rejectBtn-${paraId}">
                    <i class="fa-solid fa-xmark"></i> 放弃
                </button>
            </div>
        </div>
    `;

    // Bind with addEventListener so special characters in the content cannot break an inline onclick attribute
    document.getElementById(`acceptBtn-${paraId}`).addEventListener('click', (e) => {
        e.stopPropagation();
        applyPolish(paraId, polished);
    });
    document.getElementById(`rejectBtn-${paraId}`).addEventListener('click', (e) => {
        e.stopPropagation();
        rejectPolish(paraId);
    });
}

window.applyPolish = async function(paraId, newContent) {
    if (!currentSessionId) return;

    try {
        const res = await fetch('/api/report/apply', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                session_id: currentSessionId,
                paragraph_id: paraId,
                new_content: newContent,
            })
        });

        if (res.ok) {
            // Reload the report with the accepted text
            await loadReport();
        } else {
            alert('应用失败');
        }
    } catch (e) {
        console.error(e);
        alert('应用失败');
    }
}

window.rejectPolish = function(paraId) {
    // Reload the report to restore the original paragraph
    loadReport();
}

window.showCustomPolish = function(paraId) {
    const target = document.querySelector(`[data-para-id="${paraId}"]`);
    if (!target) return;

    const actionsEl = target.querySelector('.para-actions');
    if (!actionsEl) return;

    actionsEl.innerHTML = `
        <div class="custom-polish-input">
            <input type="text" class="form-input" id="customInput-${paraId}" placeholder="输入润色指令,如:增加数据对比、语气更正式..." style="flex:1;">
            <button class="btn btn-primary btn-sm" onclick="event.stopPropagation(); submitCustomPolish('${paraId}')">
                <i class="fa-solid fa-paper-plane"></i>
            </button>
            <button class="btn btn-secondary btn-sm" onclick="event.stopPropagation(); loadReport()">
                <i class="fa-solid fa-xmark"></i>
            </button>
        </div>
    `;

    document.getElementById(`customInput-${paraId}`)?.focus();
}

window.submitCustomPolish = function(paraId) {
    const input = document.getElementById(`customInput-${paraId}`);
    if (!input) return;
    const instruction = input.value.trim();
    if (!instruction) {
        alert('请输入润色指令');
        return;
    }
    polishParagraph(paraId, 'custom', instruction);
}

// --- Gallery Logic ---
let galleryImages = [];
let currentImageIndex = 0;

async function loadGallery() {
    if (!currentSessionId) return;
    try {
        const res = await fetch(`/api/figures?session_id=${currentSessionId}`);
        const data = await res.json();
        galleryImages = data.figures || [];
        currentImageIndex = 0;
        renderGalleryImage();
    } catch (e) {
        console.error("Gallery load failed", e);
        document.getElementById('carouselSlide').innerHTML = '<p class="error">Failed to load images.</p>';
    }
}

function renderGalleryImage() {
    const slide = document.getElementById('carouselSlide');
    const info = document.getElementById('imageInfo');

    if (galleryImages.length === 0) {
        slide.innerHTML = '<p class="placeholder-text" style="color:var(--text-secondary);">No images generated in this session.</p>';
        info.innerHTML = '';
        return;
    }

    const img = galleryImages[currentImageIndex];
    slide.innerHTML = `<img src="${img.web_url}" alt="${img.filename}" onclick="window.open('${img.web_url}', '_blank')">`;
    info.innerHTML = `
        <div class="image-title">${img.filename} (${currentImageIndex + 1}/${galleryImages.length})</div>
        <div class="image-desc">${img.description || 'No description available.'}</div>
        ${img.analysis ? `<div style="font-size:0.8rem; margin-top:0.5rem; color:#4B5563; background:#F3F4F6; padding:0.5rem; border-radius:4px;">${img.analysis}</div>` : ''}
    `;
}

window.prevImage = function () {
    if (galleryImages.length === 0) return;
    currentImageIndex = (currentImageIndex - 1 + galleryImages.length) % galleryImages.length;
    renderGalleryImage();
}

window.nextImage = function () {
    if (galleryImages.length === 0) return;
    currentImageIndex = (currentImageIndex + 1) % galleryImages.length;
    renderGalleryImage();
}

// --- Download / Export ---
window.downloadScript = async function () {
    if (!currentSessionId) return;
    const link = document.createElement('a');
    link.href = `/api/download_script?session_id=${currentSessionId}`;
    link.download = '';
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
}

window.triggerExport = async function () {
    if (!currentSessionId) {
        alert("No active session to export.");
        return;
    }
    const btn = document.getElementById('exportBtn');
    const originalContent = btn.innerHTML;
    btn.innerHTML = '<i class="fa-solid fa-spinner fa-spin"></i> Zipping...';
    btn.disabled = true;

    try {
        window.open(`/api/export?session_id=${currentSessionId}`, '_blank');
    } catch (e) {
        alert("Export failed: " + e.message);
    } finally {
        setTimeout(() => {
            btn.innerHTML = originalContent;
            btn.disabled = false;
        }, 2000);
    }
}

// --- Follow-up Chat ---
window.sendFollowUp = async function () {
    if (!currentSessionId || isRunning) return;
    const input = document.getElementById('followUpInput');
    const message = input.value.trim();
    if (!message) return;

    input.disabled = true;
    try {
        const res = await fetch('/api/chat', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ session_id: currentSessionId, message: message })
        });

        if (res.ok) {
            input.value = '';
            setRunningState(true);
            startPolling();
            switchTab('logs');
        } else {
            alert('Failed to send request');
        }
    } catch (e) {
        console.error(e);
    } finally {
        input.disabled = false;
    }
}

// --- History Logic ---
async function loadHistory() {
    const list = document.getElementById('historyList');
    if (!list) return;

    try {
        const res = await fetch('/api/history');
        const data = await res.json();

        if (data.history.length === 0) {
            list.innerHTML = '<div style="padding:0.5rem; font-size:0.8rem; color:#9CA3AF;">No history yet</div>';
            return;
        }

        let html = '';
        data.history.forEach(item => {
            html += `
                <div class="history-item" onclick="loadSession('${item.id}')" id="hist-${item.id}">
                    <i class="fa-regular fa-clock"></i>
                    <span>${item.id}</span>
                </div>
            `;
        });
        list.innerHTML = html;
    } catch (e) {
        console.error("Failed to load history", e);
    }
}

window.loadSession = async function (sessionId) {
    if (isRunning) {
        alert("Analysis in progress, please wait.");
        return;
    }

    currentSessionId = sessionId;

    document.querySelectorAll('.history-item').forEach(el => el.classList.remove('active'));
    const activeItem = document.getElementById(`hist-${sessionId}`);
    if (activeItem) activeItem.classList.add('active');

    logOutput.innerText = "Loading session data...";
    reportContainer.innerHTML = "";
    if (downloadScriptBtn) downloadScriptBtn.classList.add('hidden');

    try {
        const res = await fetch(`/api/status?session_id=${sessionId}`);
        if (res.ok) {
            const data = await res.json();
            logOutput.innerText = data.log || "No logs available.";

            const logTab = document.getElementById('logsTab');
            if (logTab) logTab.scrollTop = logTab.scrollHeight;

            if (data.has_report) {
                await loadReport();
                if (data.script_path && downloadScriptBtn) {
                    downloadScriptBtn.classList.remove('hidden');
                    downloadScriptBtn.style.display = 'inline-flex';
                }
                switchTab('report');
            } else {
                switchTab('logs');
            }
        }
    } catch (e) {
        logOutput.innerText = "Error loading session.";
    }
}

// --- Init & Navigation ---
document.addEventListener('DOMContentLoaded', () => {
    loadHistory();
});

window.switchView = function (viewName) {
    console.log("View switch requested:", viewName);
}

window.switchTab = function (tabName) {
    document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));

    ['logs', 'report', 'gallery'].forEach(name => {
        const content = document.getElementById(`${name}Tab`);
        if (content) content.classList.add('hidden');
    });

    document.querySelectorAll('.tab').forEach(btn => {
        if (btn.getAttribute('onclick') && btn.getAttribute('onclick').includes(`'${tabName}'`)) {
            btn.classList.add('active');
        }
    });

    if (tabName === 'logs') {
        document.getElementById('logsTab').classList.remove('hidden');
    } else if (tabName === 'report') {
        document.getElementById('reportTab').classList.remove('hidden');
    } else if (tabName === 'gallery') {
        document.getElementById('galleryTab').classList.remove('hidden');
        if (currentSessionId) loadGallery();
    }
}
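The fetch calls in script.js assume a specific response shape from the backend: the 2-second polling loop reads `log`, `is_running`, `has_report`, and `script_path` from `/api/status`, and the polishing flow expects `/api/report/polish` to return the `original` and `polished` paragraph text. The backend handlers themselves are not part of this diff, so the following is only a minimal sketch of that contract, assuming FastAPI and Pydantic as in the design document; the model and handler names are illustrative and inferred from the frontend code, not the actual contents of web/main.py.

```python
# Minimal sketch of the API contract assumed by script.js (illustrative only;
# the real handlers live in web/main.py and are not shown in this diff).
from typing import Optional

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()


class StatusResponse(BaseModel):
    # Fields read by the polling loop in startPolling()
    log: str = ""
    is_running: bool = False
    has_report: bool = False
    script_path: Optional[str] = None


class PolishRequest(BaseModel):
    # Payload sent by polishParagraph(); mode is 'context', 'data', or 'custom'
    session_id: str
    paragraph_id: str
    mode: str
    custom_instruction: str = ""


class PolishResponse(BaseModel):
    # showPolishDiff() renders both sides of this pair
    original: str
    polished: str


@app.get("/api/status", response_model=StatusResponse)
async def get_status(session_id: str) -> StatusResponse:
    ...  # look up the session and report its current state


@app.post("/api/report/polish", response_model=PolishResponse)
async def polish_paragraph(req: PolishRequest) -> PolishResponse:
    ...  # rewrite the selected paragraph in the requested mode and return both versions
```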