本文通过通俗易懂的方式介绍支持向量机(SVM)如何处理高维和复杂数据集,包括核函数技巧、特征工程和优化方法。
如果想更加全面清晰地了解金融资产组合模型进化论的体系架构,可参考:
0. 金融资产组合模型进化全图鉴
理解SVM处理高维数据,需要从以下几个维度进行分析:
import numpy as np
from sklearn.preprocessing import StandardScaler
class SVMKernels:
    """Collection of standard SVM kernel functions.

    Each kernel takes two sample matrices X1 (m, d) and X2 (n, d) and
    returns the (m, n) Gram matrix K with K[i, j] = k(X1[i], X2[j]).
    """

    def __init__(self):
        # Kept for interface compatibility; not used by the kernel methods.
        self.scaler = StandardScaler()

    def linear_kernel(self, X1, X2):
        """Linear kernel: K(x, y) = x^T y."""
        return np.dot(X1, X2.T)

    def polynomial_kernel(self, X1, X2, degree=3, coef0=1):
        """Polynomial kernel: K(x, y) = (x^T y + coef0) ** degree."""
        return (np.dot(X1, X2.T) + coef0) ** degree

    def rbf_kernel(self, X1, X2, gamma=0.1):
        """RBF (Gaussian) kernel: K(x, y) = exp(-gamma * ||x - y||^2).

        Squared distances are computed via the expansion
        ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x^T y, avoiding the
        (m, n, d) difference tensor.
        """
        X1_norm = np.sum(X1 ** 2, axis=1).reshape(-1, 1)
        X2_norm = np.sum(X2 ** 2, axis=1).reshape(1, -1)
        K = np.dot(X1, X2.T)
        K *= -2
        K += X1_norm + X2_norm
        # Fix: rounding in the expansion can yield tiny negative squared
        # distances, which would make exp(...) exceed 1; clamp at zero.
        np.maximum(K, 0, out=K)
        return np.exp(-gamma * K)
class CustomKernels:
    """Additional kernels for special-purpose feature spaces."""

    def __init__(self):
        pass

    def chi2_kernel(self, X1, X2, gamma=1.0):
        """Chi-squared kernel for non-negative features.

        K(x, y) = exp(-gamma * sum_k (x_k - y_k)^2 / (x_k + y_k))

        Vectorized over both sample axes (was an interpreted Python
        double loop); results are identical, just computed in NumPy.
        """
        # Broadcast to (m, n, d) difference and sum tensors.
        diff = X1[:, None, :] - X2[None, :, :]
        denom = X1[:, None, :] + X2[None, :, :]
        # Same zero-division guard as the loop version: a zero
        # denominator becomes 1e-10 (the numerator is 0 there too).
        denom = np.where(denom == 0, 1e-10, denom)
        K = np.sum(diff ** 2 / denom, axis=2)
        return np.exp(-gamma * K)

    def spectral_kernel(self, X1, X2, gamma=1.0, freq=1.0):
        """Spectral (Gaussian x cosine) kernel for periodic data."""
        diff = X1.reshape(-1, 1, X1.shape[1]) - X2.reshape(1, -1, X2.shape[1])
        return np.exp(-gamma * np.sum(diff ** 2, axis=2)) * np.cos(freq * np.pi * np.sum(diff, axis=2))
class FeatureProcessor:
    """Feature preprocessing utilities: scaling, PCA, imputation."""

    def __init__(self):
        self.scaler = StandardScaler()

    def process_features(self, X, method='standard'):
        """Scale features with one of 'standard', 'minmax' or 'robust'.

        Raises:
            ValueError: if `method` is not a supported name (the
                original silently returned None here).

        Constant columns (zero range / zero IQR) map to 0 instead of
        producing division-by-zero NaN/inf values.
        """
        if method == 'standard':
            return self.scaler.fit_transform(X)
        if method == 'minmax':
            col_min = X.min(axis=0)
            span = X.max(axis=0) - col_min
            span = np.where(span == 0, 1, span)  # constant column -> 0 output
            return (X - col_min) / span
        if method == 'robust':
            q1 = np.percentile(X, 25, axis=0)
            q3 = np.percentile(X, 75, axis=0)
            iqr = q3 - q1
            iqr = np.where(iqr == 0, 1, iqr)  # constant column -> 0 output
            return (X - q1) / iqr
        raise ValueError(f"unknown scaling method: {method!r}")

    def reduce_dimensions(self, X, n_components=0.95):
        """Project X onto principal components (default keeps 95% variance)."""
        from sklearn.decomposition import PCA
        pca = PCA(n_components=n_components)
        return pca.fit_transform(X)

    def handle_missing_values(self, X):
        """Impute missing values with a 5-nearest-neighbour imputer."""
        from sklearn.impute import KNNImputer
        imputer = KNNImputer(n_neighbors=5)
        return imputer.fit_transform(X)
class SVMOptimizer:
    """Hyper-parameter search helpers wrapped around a base SVC model."""

    def __init__(self):
        from sklearn.svm import SVC
        self.base_model = SVC()

    def grid_search_cv(self, X, y, param_grid):
        """Exhaustive grid search (5-fold CV, accuracy scoring, all cores)."""
        from sklearn.model_selection import GridSearchCV
        searcher = GridSearchCV(
            self.base_model,
            param_grid,
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
        )
        searcher.fit(X, y)
        return self._summarize(searcher)

    def bayesian_optimization(self, X, y, param_space):
        """Bayesian search over `param_space` (50 iterations, 5-fold CV)."""
        from skopt import BayesSearchCV
        searcher = BayesSearchCV(
            self.base_model,
            param_space,
            n_iter=50,
            cv=5,
            n_jobs=-1,
        )
        searcher.fit(X, y)
        return self._summarize(searcher)

    @staticmethod
    def _summarize(searcher):
        """Package a fitted search object into the common result dict."""
        return {
            'best_params': searcher.best_params_,
            'best_score': searcher.best_score_,
            'best_model': searcher.best_estimator_
        }
class KernelSelector:
    """Heuristic kernel recommendation from dataset shape and feature sign."""

    def __init__(self):
        pass

    def recommend_kernel(self, X, y):
        """Suggest a kernel name plus a short justification.

        Heuristics, checked in order: very high dimensionality -> linear;
        small sample count -> rbf; all-non-negative features -> chi2;
        otherwise -> poly. Returns {'kernel': ..., 'reason': ...}.
        """
        n_samples, n_features = X.shape
        if n_features > 1000:
            kernel, reason = 'linear', '高维数据,线性核函数计算效率高'
        elif n_samples < 1000:
            kernel, reason = 'rbf', '样本量适中,RBF核函数可以处理非线性关系'
        elif (X >= 0).all():
            kernel, reason = 'chi2', '适用于非负特征的数据'
        else:
            kernel, reason = 'poly', '默认选择,可以处理复杂的非线性关系'
        return {'kernel': kernel, 'reason': reason}
class SVMOptimizationStrategy:
    """Textual optimization playbook for training SVMs on large datasets."""

    def __init__(self):
        pass

    def optimize_for_large_dataset(self, X, y):
        """Return a phase -> advice-list mapping for large-scale training.

        X and y are accepted for interface symmetry but not inspected.
        """
        preprocessing = [
            '使用增量学习处理大规模数据',
            '特征选择减少维度',
            '数据采样平衡类别'
        ]
        training = [
            '使用线性核函数',
            '调整惩罚参数C',
            '使用SGD优化器'
        ]
        evaluation = [
            '使用交叉验证',
            '监控训练时间',
            '评估模型复杂度'
        ]
        return {
            'preprocessing': preprocessing,
            'training': training,
            'evaluation': evaluation
        }
支持向量机(SVM)处理高维复杂数据集主要通过核函数技巧和特征工程两大方向。可以把核函数想象成一个"数据变形器",它能够将复杂的数据转换到一个更容易分类的空间中。
关键技术点:核函数技巧(线性、多项式、RBF、卡方、谱核)、特征预处理与降维、以及网格搜索和贝叶斯优化调参。
实践建议:高维数据优先尝试线性核;样本量较小且关系非线性时选用RBF核;非负特征数据可考虑卡方核;训练前务必对特征做标准化处理。
通过合理运用这些技术,SVM能够有效处理高维复杂数据,在保证模型性能的同时兼顾计算效率。关键是要根据具体问题选择合适的策略组合。