feat: 重大功能更新 v1.4.0 - 飞书集成、AI语义相似度、前端优化

主要更新内容:
- 🚀 飞书多维表格集成,支持工单数据同步
- 🤖 AI建议与人工描述语义相似度计算
- 🎨 前端UI全面优化,现代化设计
- 📊 智能知识库入库策略(AI准确率<90%使用人工描述)
- 🔧 代码重构,模块化架构优化
- 📚 完整文档整合和更新
- 🐛 修复配置导入和数据库字段问题

技术特性:
- 使用sentence-transformers进行语义相似度计算
- 快速模式结合TF-IDF和语义方法
- 响应式设计,支持移动端
- 加载状态和动画效果
- 配置化AI准确率阈值
This commit is contained in:
赵杰 Jie Zhao (雄狮汽车科技)
2025-09-19 19:32:42 +01:00
parent 79cf316c63
commit da4736c323
30 changed files with 4778 additions and 1406 deletions

View File

@@ -76,18 +76,24 @@ def extract_keywords(text: str, max_keywords: int = 10) -> List[str]:
return [word for word, count in sorted_words[:max_keywords]]
def calculate_similarity(text1: str, text2: str) -> float:
    """Compute the similarity between two texts (semantic similarity preferred).

    Tries the project's semantic-similarity service first; if that import
    or computation fails, falls back to TF-IDF cosine similarity, and
    finally to 0.0 when neither method succeeds.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        Similarity score (expected in [0, 1]); 0.0 when both methods fail.
    """
    try:
        from src.utils.semantic_similarity import calculate_semantic_similarity
        return calculate_semantic_similarity(text1, text2)
    except Exception as e:
        logging.error(f"计算语义相似度失败: {e}")
        # Fall back to the traditional TF-IDF method.
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity
            vectorizer = TfidfVectorizer()
            vectors = vectorizer.fit_transform([text1, text2])
            similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
            return float(similarity)
        except Exception as e2:
            logging.error(f"计算TF-IDF相似度失败: {e2}")
            return 0.0
def format_time_duration(seconds: float) -> str:
"""格式化时间持续时间"""

View File

@@ -0,0 +1,256 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
语义相似度计算服务
使用sentence-transformers进行更准确的语义相似度计算
"""
import logging
import numpy as np
from typing import List, Tuple, Optional
from sentence_transformers import SentenceTransformer
import torch
logger = logging.getLogger(__name__)
class SemanticSimilarityCalculator:
    """Semantic text-similarity calculator.

    Scores text pairs with a sentence-transformers model and falls back to
    TF-IDF cosine similarity whenever the model is unavailable or a
    computation fails.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Create the calculator and eagerly load the pretrained model.

        Args:
            model_name: Name of the pretrained model to use.
                - all-MiniLM-L6-v2: English model, fast; recommended for production.
                - paraphrase-multilingual-MiniLM-L12-v2: multilingual, supports Chinese.
                - paraphrase-multilingual-mpnet-base-v2: multilingual, higher accuracy.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Load the pretrained model; on failure leave ``self.model`` as None."""
        try:
            logger.info(f"正在加载语义相似度模型: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            logger.info("语义相似度模型加载成功")
        except Exception as e:
            logger.error(f"加载语义相似度模型失败: {e}")
            # A None model routes every public method through the TF-IDF fallback.
            self.model = None

    def calculate_similarity(self, text1: str, text2: str, fast_mode: bool = True) -> float:
        """Compute the semantic similarity of two texts.

        Args:
            text1: First text.
            text2: Second text.
            fast_mode: When True, run a cheap TF-IDF pre-screen and only invoke
                the semantic model for mid-range scores.

        Returns:
            Similarity score in [0, 1]; 0.0 when either text is empty.
        """
        if not text1 or not text2:
            return 0.0
        try:
            if fast_mode:
                screen = self._calculate_tfidf_similarity(text1, text2)
                # Clear-cut high/low scores skip the expensive semantic pass.
                if screen >= 0.9 or screen <= 0.3:
                    return screen
                if self.model is None:
                    return screen
                # Mid-range: refine with the semantic model and blend both scores.
                refined = self._calculate_semantic_similarity(text1, text2)
                return screen * 0.3 + refined * 0.7
            # Full mode: semantic score directly, TF-IDF only when no model.
            if self.model is None:
                return self._calculate_tfidf_similarity(text1, text2)
            return self._calculate_semantic_similarity(text1, text2)
        except Exception as e:
            logger.error(f"计算语义相似度失败: {e}")
            return self._calculate_tfidf_similarity(text1, text2)

    def _calculate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Score two texts via sentence embeddings; fall back to TF-IDF on error."""
        try:
            vectors = self.model.encode([text1, text2])
            score = self._cosine_similarity(vectors[0], vectors[1])
            # Clamp into [0, 1]; raw cosine may be negative or slightly above 1.
            score = max(0.0, min(1.0, score))
            logger.debug(f"语义相似度计算: {score:.4f}")
            return float(score)
        except Exception as e:
            logger.error(f"语义相似度计算失败: {e}")
            return self._calculate_tfidf_similarity(text1, text2)

    def _calculate_tfidf_similarity(self, text1: str, text2: str) -> float:
        """Fallback scorer: TF-IDF vectors plus cosine similarity; 0.0 on error."""
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity

            tfidf = TfidfVectorizer(max_features=1000, stop_words=None)
            matrix = tfidf.fit_transform([text1, text2])
            score = cosine_similarity(matrix[0:1], matrix[1:2])[0][0]
            logger.debug(f"TF-IDF相似度计算: {score:.4f}")
            return float(score)
        except Exception as e:
            logger.error(f"TF-IDF相似度计算失败: {e}")
            return 0.0

    def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Cosine similarity of two vectors; 0.0 for zero vectors or on error."""
        try:
            norm_a = np.linalg.norm(vec1)
            norm_b = np.linalg.norm(vec2)
            # Guard against division by zero for degenerate vectors.
            if norm_a == 0 or norm_b == 0:
                return 0.0
            return float(np.dot(vec1, vec2) / (norm_a * norm_b))
        except Exception as e:
            logger.error(f"余弦相似度计算失败: {e}")
            return 0.0

    def batch_calculate_similarity(self, text_pairs: List[Tuple[str, str]]) -> List[float]:
        """Score many text pairs at once.

        Args:
            text_pairs: List of (text1, text2) pairs.

        Returns:
            One similarity score per pair; all zeros on total failure.
        """
        if not text_pairs:
            return []
        try:
            if self.model is None:
                return [self._calculate_tfidf_similarity(a, b) for a, b in text_pairs]
            return self._batch_semantic_similarity(text_pairs)
        except Exception as e:
            logger.error(f"批量相似度计算失败: {e}")
            return [0.0] * len(text_pairs)

    def _batch_semantic_similarity(self, text_pairs: List[Tuple[str, str]]) -> List[float]:
        """Encode all texts in one batch and score consecutive embedding pairs."""
        try:
            flat_texts = [text for pair in text_pairs for text in pair]
            embeddings = self.model.encode(flat_texts)
            # Embeddings are interleaved: (pair0_a, pair0_b, pair1_a, pair1_b, ...).
            return [
                float(self._cosine_similarity(embeddings[i], embeddings[i + 1]))
                for i in range(0, len(embeddings), 2)
            ]
        except Exception as e:
            logger.error(f"批量语义相似度计算失败: {e}")
            return [self._calculate_tfidf_similarity(a, b) for a, b in text_pairs]

    def get_similarity_explanation(self, text1: str, text2: str, similarity: float) -> str:
        """Map a similarity score to a human-readable recommendation.

        Args:
            text1: First text (not inspected; kept for interface compatibility).
            text2: Second text (not inspected; kept for interface compatibility).
            similarity: Similarity score.

        Returns:
            Explanation/recommendation text.
        """
        bands = (
            (0.95, "语义高度相似,建议自动审批"),
            (0.8, "语义较为相似,建议人工审核"),
            (0.6, "语义部分相似,需要人工判断"),
            (0.4, "语义相似度较低,建议重新生成"),
        )
        for threshold, verdict in bands:
            if similarity >= threshold:
                return verdict
        return "语义差异较大,建议重新生成"

    def is_model_available(self) -> bool:
        """Report whether the semantic model loaded successfully."""
        return self.model is not None
# Module-wide singleton, created lazily on first use.
_similarity_calculator: Optional["SemanticSimilarityCalculator"] = None


def get_similarity_calculator() -> SemanticSimilarityCalculator:
    """Return the shared calculator instance, instantiating it on first call."""
    global _similarity_calculator
    if _similarity_calculator is None:
        _similarity_calculator = SemanticSimilarityCalculator()
    return _similarity_calculator
def calculate_semantic_similarity(text1: str, text2: str, fast_mode: bool = True) -> float:
    """Convenience function: score two texts with the shared calculator.

    Args:
        text1: First text.
        text2: Second text.
        fast_mode: Whether to use the fast (TF-IDF pre-screened) mode.

    Returns:
        Similarity score in [0, 1].
    """
    return get_similarity_calculator().calculate_similarity(text1, text2, fast_mode)
def batch_calculate_semantic_similarity(text_pairs: List[Tuple[str, str]]) -> List[float]:
    """Convenience function: batch-score text pairs with the shared calculator.

    Args:
        text_pairs: List of (text1, text2) pairs.

    Returns:
        One similarity score per input pair.
    """
    return get_similarity_calculator().batch_calculate_similarity(text_pairs)