## 二分类问题
* 使用sklearn自带的逻辑回归、支持向量机、决策树API进行二分类的任务
* 使用sklearn的iris数据集,将iris数据集变成一个二分类的数据集,删除类别为2的数据
* 使用准确率对模型进行评价
### 准备数据
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn import tree
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn import metrics
# Load iris and turn it into a binary dataset by dropping every class-2 sample.
iris = datasets.load_iris()
feature_columns = iris.feature_names
target_column = ['res']
iris.target  # notebook display: the raw label vector
np.where(iris.target == 2)  # notebook display: positions of the class-2 rows to drop
features = pd.DataFrame(iris.data, columns=feature_columns)
labels = pd.DataFrame(iris.target, columns=target_column)
# Filter on the label itself instead of a hard-coded 0:100 slice, so the
# selection does not depend on how the rows happen to be ordered.
features = features[iris.target != 2]
labels = labels[iris.target != 2]
train_test_data = pd.concat([features, labels], axis=1)
# Split into training and test sets (10% held out).
# sklearn.cross_validation has been superseded by sklearn.model_selection,
# which the regression section of this file already relies on.
from sklearn import model_selection
train, test = model_selection.train_test_split(train_test_data, test_size=0.1)
train_X = train[feature_columns].values
# reshape(-1) flattens the (n, 1) label column to (n,). The original passed
# train_y.size / test_y.size here, i.e. it read each name before assigning it,
# which raises NameError on a fresh run.
train_y = train[target_column].values.reshape(-1)
test_X = test[feature_columns].values
test_y = test[target_column].values.reshape(-1)
### 训练模型
# Each model below is fitted on (train_X, train_y) and scored by accuracy on
# the held-out (test_X, test_y); the bare accuracy_score calls are there for
# notebook display.

# Logistic regression with an L2 penalty.
linear = linear_model.LogisticRegression(C=1.0, penalty='l2')
linear.fit(train_X, train_y)
preL = linear.predict(test_X)
metrics.accuracy_score(y_true=test_y, y_pred=preL)

# Decision tree limited to depth 3.
DT = tree.DecisionTreeClassifier(max_depth=3).fit(train_X, train_y)
preT = DT.predict(test_X)
metrics.accuracy_score(y_true=test_y, y_pred=preT)

# Linear support vector machine.
SVM = LinearSVC().fit(train_X, train_y)
preS = SVM.predict(test_X)
metrics.accuracy_score(y_true=test_y, y_pred=preS)
## 多分类问题
* 使用SVM进行一对一(one-vs-one)和一对多(one-vs-rest)的多分类
* 使用决策树进行多分类
* 使用随机森林进行多分类
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn import tree
from sklearn.svm import LinearSVC,SVC
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn import metrics
### 准备数据
# Load the full three-class iris dataset into a single DataFrame
# (feature columns followed by the 'res' label column).
iris = datasets.load_iris()
feature_columns = iris.feature_names
target_column = ['res']
features = pd.DataFrame(iris.data, columns=feature_columns)
labels = pd.DataFrame(iris.target, columns=target_column)
train_test_data = pd.concat([features, labels], axis=1)
# Hold out 10% of the rows for testing.
# sklearn.cross_validation has been superseded by sklearn.model_selection,
# which the regression section of this file already relies on.
from sklearn import model_selection
train, test = model_selection.train_test_split(train_test_data, test_size=0.1)
train_X = train[feature_columns].values
# Flatten the (n, 1) label column to the 1-D shape the estimators expect.
train_y = train[target_column].values.reshape(-1)
test_X = test[feature_columns].values
test_y = test[target_column].values.reshape(-1)
### 训练模型
# Each model below is fitted on (train_X, train_y) and scored by accuracy on
# the held-out (test_X, test_y); the bare accuracy_score calls are there for
# notebook display.

# LinearSVC handles multi-class with a one-vs-the-rest scheme.
Linear_SVM = LinearSVC()
Linear_SVM = Linear_SVM.fit(train_X, train_y)
preLS = Linear_SVM.predict(test_X)
metrics.accuracy_score(test_y, preLS)

# SVC (C-Support Vector Classification) handles multi-class with a one-vs-one scheme.
C_SVM = SVC()
C_SVM = C_SVM.fit(train_X, train_y)
# Predict with C_SVM itself. The original called Linear_SVM.predict here, so
# the "SVC" accuracy was silently the LinearSVC accuracy.
preCS = C_SVM.predict(test_X)
metrics.accuracy_score(test_y, preCS)

# Multi-class decision tree limited to depth 3.
DT = tree.DecisionTreeClassifier(max_depth=3)
DT = DT.fit(train_X, train_y)
preT = DT.predict(test_X)
metrics.accuracy_score(test_y, preT)

# Multi-class random forest with default settings.
RF = RandomForestClassifier()
RF = RF.fit(train_X, train_y)
preRF = RF.predict(test_X)
metrics.accuracy_score(test_y, preRF)
## 回归问题
* 使用boston数据集
* 使用线性回归
* 使用树回归
* 使用支持向量机进行回归
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import tree
from sklearn import linear_model
from sklearn import svm
from sklearn import model_selection
from sklearn import metrics
# Build one DataFrame from the boston housing data (feature columns followed
# by the 'target' column), then hold out 10% of the rows for testing.
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases — confirm the scikit-learn version this notebook targets.
boston = datasets.load_boston()
feature_columns = boston.feature_names
target_column = ['target']
features = pd.DataFrame(data=boston.data, columns=feature_columns)
labels = pd.DataFrame(data=boston.target, columns=target_column)
train_test_data = pd.concat([features, labels], axis=1)
train, test = model_selection.train_test_split(train_test_data, test_size=0.1)

# Feature matrices as plain NumPy arrays.
train_X = train[feature_columns].values
test_X = test[feature_columns].values

# Targets: flatten the (n, 1) column to 1-D.
train_y = train[target_column].values.reshape(-1)
test_y = test[target_column].values.reshape(-1)
### 训练模型
# Each regressor is fitted on (train_X, train_y); the bare expressions report
# the RMSE (square root of the mean squared error) on the held-out set.

# Ordinary least-squares linear regression.
linear = linear_model.LinearRegression()
linear.fit(train_X, train_y)
preL = linear.predict(test_X)
metrics.mean_squared_error(test_y, preL) ** 0.5

# Decision-tree regression.
DT = tree.DecisionTreeRegressor().fit(train_X, train_y)
preT = DT.predict(test_X)
metrics.mean_squared_error(test_y, preT) ** 0.5

# Linear support-vector regression.
SVM = svm.LinearSVR().fit(train_X, train_y)
preS = SVM.predict(test_X)
metrics.mean_squared_error(test_y, preS) ** 0.5
## 特征清洗
* 通过pandas了解数据
* 通过pandas填充缺失的数据
# Template: `path` is not defined in this snippet — set it to the CSV location
# before running. read_csv returns a pd.DataFrame.
data = pd.read_csv(path)
data.head()      # first 5 rows
data.shape       # size of the data
data.info()      # per-feature information, including missing-value counts
data.describe()  # summary statistics (mean, standard deviation, ...) per feature
# Fill the missing values of feature1 with the column mean.
# Two fixes over the original: `.mean` is now actually called (the original
# passed the bound method object as the fill value), and the filled Series is
# written back, because fillna returns a new object instead of modifying `data`.
data['feature1'] = data['feature1'].fillna(data['feature1'].mean())
## 特征工程
### 数值型数据
* 幅度变换
* 计算统计值
* 特征之间进行算术和逻辑运算以产生新特征
* 产生高次特征和交叉特征
* 进行离散化
* One-hot 编码
# Numeric feature transforms.
# Assumes `data` is a DataFrame with numeric 'feature1' and 'feature2'
# columns — TODO confirm against the loading step.
import numpy as np
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler

# 1. Amplitude transforms.
# Log transform: rescales the values, which helps the distribution look more
# normal. Done element-wise with NumPy, then stored as a new column.
log_feature1 = data['feature1'].map(np.log)
data.loc[:, 'log_feature1'] = log_feature1

# The same kind of rescaling is available from sklearn's preprocessing module.
# Min-max scaling.
mm_scaler = MinMaxScaler()
feature1_mms = mm_scaler.fit_transform(data[['feature1']])
# Standardisation.
std_scaler = StandardScaler()
feature1_ss = std_scaler.fit_transform(data[['feature1']])
# ... and so on for the other scalers.

# 2. Summary statistics.
data['feature1'].max()
data['feature1'].min()
# Quantile (here the 0.25 quantile).
data['feature1'].quantile(0.25)

# 3. New features from arithmetic / logical combinations of existing ones.
data.loc[:, 'new_feature1'] = data['feature1'] + data['feature2'] * 4 + 1
data.loc[:, 'new_feature2'] = data['feature1'].eq(0) & data['feature2'].eq(0)

# 4. Higher-order and interaction features.
poly = PolynomialFeatures(degree=2)
poly_fea = poly.fit_transform(data[['feature1', 'feature2']])
# 5. Discretisation with pandas cut / qcut, 5 bins each.
# The original read from `df_train`, a name that is never defined in this
# file; every other step operates on `data`, so bin that frame's column.
data.loc[:, 'feature1_cut'] = pd.cut(data['feature1'], 5)
data.loc[:, 'feature1_qcut'] = pd.qcut(data['feature1'], 5)
# 6. One-hot encoding.
# NOTE(review): by default get_dummies only expands object/string/category
# columns; if feature1 is numeric this call returns it unchanged. One of the
# binned columns above (e.g. 'feature1_cut') is probably the intended input —
# confirm before relying on feature1_oht.
feature1_oht = pd.get_dummies(data[['feature1']])
### 日期处理
# Convert the raw 'date_t' column to pandas datetime values.
# The original passed format="", an empty pattern that cannot match any
# non-empty date string; leaving `format` out lets pandas infer the layout.
# Pass an explicit pattern (e.g. format="%Y-%m-%d") if the layout is known.
data.loc[:, 'date'] = pd.to_datetime(data['date_t'])
# Month
data.loc[:, 'month'] = data['date'].dt.month
# Day of the month
data.loc[:, 'dom'] = data['date'].dt.day
# Day of the year
data.loc[:, 'doy'] = data['date'].dt.dayofyear
# Day of the week
data.loc[:, 'dow'] = data['date'].dt.dayofweek
### 文本处理
* 词袋模型
* TF-IDF
# Text features: turn a small corpus into a document-term matrix, first with
# raw counts (bag of words), then with TF-IDF weights.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# 1. Bag-of-words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# NOTE(review): newer scikit-learn releases expose this as
# get_feature_names_out() — confirm against the installed version.
vectorizer.get_feature_names()  # feature names
X.toarray()  # X as a NumPy array

# 2. TF-IDF model
tfidf_vectorizer = TfidfVectorizer()
tfidf_X = tfidf_vectorizer.fit_transform(corpus)
tfidf_vectorizer.get_feature_names()
tfidf_X.toarray()
## 特征选择
* Filter
* Wrapper
* Embedded
# NOTE(review): `y` is not defined in this section, and `X` is whatever was
# last bound to that name — supply a feature matrix and label vector first.

# Filter: score every feature independently and keep the k best.
from sklearn.feature_selection import SelectKBest, f_classif
# SelectKBest's default score_func is the ANOVA F-test (f_classif), not the
# chi-square test the original comment claimed. It is spelled out here so the
# code documents itself; pass score_func=chi2 instead if chi-square is wanted
# (chi2 requires non-negative features).
X_new = SelectKBest(score_func=f_classif, k=2).fit_transform(X, y)

# Wrapper: recursive feature elimination driven by a random forest,
# stopping when 2 features remain.
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rfe = RFE(estimator=rf, n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)

# Embedded: fit an L1-penalised linear SVM, then let SelectFromModel pick
# features from the already-fitted model (prefit=True).
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_embed = model.transform(X)
## 模型融合
* 投票器
* Bagging
* Adaboost
# Voting ensemble
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
# NOTE(review): `array` is not defined in this snippet — presumably a 2-D
# array whose first 8 columns are features and whose 9th column is the
# label; confirm against the data-loading step.
X = array[:, 0:8]
Y = array[:, 8]
# shuffle=True is needed for random_state to have any effect: without it the
# seed is ignored by older scikit-learn and rejected with a ValueError by
# recent releases.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)
# Sub-models that take part in the vote.
estimators = []
model_1 = LogisticRegression()
estimators.append(('logistic', model_1))
model_2 = DecisionTreeClassifier()
estimators.append(('dt', model_2))
model_3 = SVC()
estimators.append(('svm', model_3))
# Combine them in a voting classifier and report the mean cross-validated score.
ensemble = VotingClassifier(estimators)
result = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(result.mean())
# Bagging: 100 decision trees, evaluated with 5-fold cross-validation on the
# X / Y arrays prepared for the voting example.
from sklearn.ensemble import BaggingClassifier
dt = DecisionTreeClassifier()
num = 100
# shuffle=True is needed for random_state to have any effect: without it the
# seed is ignored by older scikit-learn and rejected with a ValueError by
# recent releases.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn versions; the
# positional form works with both.
model = BaggingClassifier(dt, n_estimators=num, random_state=2018)
result = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
# AdaBoost: 25 boosting rounds, evaluated with 5-fold cross-validation on the
# X / Y arrays prepared for the voting example.
from sklearn.ensemble import AdaBoostClassifier
num_trees = 25
# shuffle=True is needed for random_state to have any effect: without it the
# seed is ignored by older scikit-learn and rejected with a ValueError by
# recent releases.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=2018)
result = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(result.mean())
## xgboost
import pickle
import xgboost as xgb
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston
# Regression example.
# Grid search with cross-validation, for illustration only: there is no
# separate test set — the final model is evaluated on the very data it was
# trained on, which is not how a real workflow should be set up.
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases — confirm the scikit-learn version this notebook targets.
boston = load_boston()
X, y = boston['data'], boston['target']
kf = KFold(n_splits=5, shuffle=True)
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid={
        'max_depth': [2, 4, 6],
        'n_estimators': [50, 100, 200],
    },
    cv=kf,
    verbose=0,
)
clf.fit(X, y)
# No `scoring` is passed, so GridSearchCV falls back to the estimator's own
# score method — presumably R^2 for a regressor; confirm in the docs.
print(clf.best_score_)
print(clf.best_params_)

# Refit one model with fixed hyper-parameters and score it on the training data.
xgb1_model = xgb.XGBRegressor(n_estimators=200, max_depth=4)
xgb1_model.fit(X, y)
predictions = xgb1_model.predict(X)
actuals = y
print("MSE:", mean_squared_error(actuals, predictions))
## lightGBM
# Grid-search learning_rate / n_estimators for an LGBMRegressor, reusing the
# KFold splitter `kf` and the X / y arrays from the xgboost section.
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40],
}
gbm = GridSearchCV(estimator, param_grid, cv=kf)
gbm.fit(X, y)
print('用网格搜索找到的最优超参数为:')
print(gbm.best_params_)

# Train a single model with fixed hyper-parameters.
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.1,
                        n_estimators=40)
# Fit with early stopping after 5 rounds without improvement of the L1 metric.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of fit() was removed in LightGBM 4, while
# the callback is accepted by older releases as well.
# Note that the evaluation set here is the training data itself.
gbm.fit(X, y,
        eval_set=[(X, y)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])
# Predict with the best iteration found during training.
print('开始预测...')
y_pred = gbm.predict(X, num_iteration=gbm.best_iteration_)
# Report the RMSE of the predictions.
print('预测结果的rmse是:')
print(mean_squared_error(y, y_pred) ** 0.5)