本文主要记录使用 Sklearn 做机器学习分类和回归的例子,适合初学者理解和扩展。
Sklearn 包含了很多种机器学习的方式,如下表所示:
模块英文名 | 模块中文名 |
---|---|
Classification | 分类 |
Regression | 回归 |
Clustering | 聚类(非监督学习) |
Dimensionality reduction | 数据降维 |
Model Selection | 模型选择 |
Preprocessing | 数据预处理 |
多层感知机分类
from sklearn.neural_network import MLPClassifier

# Training features: one sample per row, one feature per column.
x = [[0., 0.], [1., 1.]]
# Class label for each training sample.
y = [1, 2]

# Multi-layer perceptron with two hidden layers (5 and 2 units).
# random_state is pinned so this tiny demo is reproducible.
clf = MLPClassifier(solver='adam', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

# Fit the network on the toy data set.
clf.fit(x, y)

# Predict the class of a previously unseen sample.
results = clf.predict([[-2., -2.]])
print(results)
随机森林分类
from sklearn.ensemble import RandomForestClassifier  # classification

# Training features: one sample per row.
x = [[0., 0.], [1., 1.]]
# Class label for each training sample.
y = [1, 2]

# Random forest with 20 trees.
# max_features='sqrt' replaces 'auto', which was deprecated and then
# removed in scikit-learn 1.3 (for classifiers 'auto' meant 'sqrt').
# oob_score=True keeps an out-of-bag accuracy estimate after fitting.
model_RF = RandomForestClassifier(n_estimators=20, max_features="sqrt",
                                  criterion='gini', n_jobs=4,
                                  oob_score=True)

# Fit the forest.
model_RF.fit(x, y)

# Predict the class of an unseen sample.
results = model_RF.predict([[-2., -2.]])
print(results)
支持向量机(SVM)分类
import numpy as np
from sklearn.svm import SVC  # support-vector classifier

# Training features: one sample per row.
X = np.array([[-1, -1],
              [-2, -1],
              [1, 1],
              [2, 1]])
# Class label for each training sample.
y = np.array([1, 1, 2, 2])

# SVC with the default RBF kernel.
clf = SVC()

# Fit the classifier.
clf.fit(X, y)

# Predict the class of an unseen sample.
results = clf.predict([[-0.8, -1]])
print(results)
梯度提升树分类
# Gradient-boosted decision trees
from sklearn.ensemble import GradientBoostingClassifier
# Training features: one sample per row.
x = [[0., 0.], [1., 1.]]
# Class label for each training sample.
y = [1, 2]
# Classifier with library defaults (in current scikit-learn:
# n_estimators=100, learning_rate=0.1, random_state=None).
clf = GradientBoostingClassifier()
# Fit the classifier.
clf.fit(x, y)
# Predict the class of an unseen sample.
results = clf.predict([[-2., -2.]])
print(results)
K-近邻(KNN)分类
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix: one training sample per row.
features = [[0., 0.], [1., 1.]]
# Target label for each training sample.
labels = [1, 2]

# Nearest-neighbour classifier that votes with the single closest sample (k=1).
clf = KNeighborsClassifier(n_neighbors=1)
# "Training" here just stores the samples — KNN is a lazy learner.
clf.fit(features, labels)

# Classify an unseen point.
results = clf.predict([[-2., -2.]])
print(results)
# Pick any model from the list in this article; here: a decision-tree regressor.
from sklearn import tree

# Feature matrix: one sample per row, one feature per column.
x = [[0., 0.], [1., 1.]]
# Regression target for each sample.
y = [1, 2]

regressor = tree.DecisionTreeRegressor()
# Fit the model on the toy data.
regressor.fit(x, y)

# Regress an unseen sample and show the result.
print(regressor.predict([[-2., -2.]]))
# Commonly used scikit-learn regressors, each ready for fit(X, y).
# All imports are consolidated here — the original repeated
# `from sklearn import ensemble` three times and re-imported `tree`.
from sklearn import ensemble, linear_model, neighbors, svm, tree
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor

# Decision-tree regression
model_DecisionTreeRegressor = tree.DecisionTreeRegressor()
# Linear regression
model_LinearRegression = linear_model.LinearRegression()
# Support-vector regression
model_SVR = svm.SVR()
# K-nearest-neighbours regression
model_KNeighborsRegressor = neighbors.KNeighborsRegressor()
# Random-forest regression with 20 trees
model_RandomForestRegressor = ensemble.RandomForestRegressor(n_estimators=20)
# AdaBoost regression with 50 boosting stages
model_AdaBoostRegressor = ensemble.AdaBoostRegressor(n_estimators=50)
# Gradient-boosted regression trees with 100 boosting stages
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(n_estimators=100)
# Bagging regression
model_BaggingRegressor = BaggingRegressor()
# Extremely randomised tree regression
model_ExtraTreeRegressor = ExtraTreeRegressor()
from sklearn.metrics import confusion_matrix

# NOTE(review): `litoNp` (sample array with labels), `litoScale`
# (standardised features) and the trained classifier `clf` are assumed
# to be defined earlier in the full script — confirm.
# True labels are taken from column 3 of litoNp.
expected = litoNp[:, 3]

# Predict all rows in one call instead of looping sample by sample
# (the original loop body had lost its indentation and built a list of
# one-element arrays); this yields a flat 1-D array of predictions.
predicted = clf.predict(litoScale)

# Confusion matrix: rows are true classes, columns are predicted classes.
results = confusion_matrix(expected, predicted)
print(results)
# Metric helpers for model evaluation (accuracy and friends).
from sklearn import metrics
# Print the model accuracy: y holds the true labels, results the predictions.
# NOTE(review): in this article `results` was last assigned the confusion
# matrix above, but accuracy_score expects raw predictions — verify which
# array is actually passed in the full script.
print("RF准确率为:{0:%}".format(metrics.accuracy_score(y, results)))
from sklearn import preprocessing
# Fit a standardiser (learns per-feature mean and variance).
# NOTE(review): assumes `litoTrans` (the raw feature matrix) is defined
# earlier in the full script — confirm.
scaler = preprocessing.StandardScaler().fit(litoTrans)
# Standardise: subtract the mean and divide by the standard deviation.
litoScale = scaler.transform(litoTrans)
# Sanity check: each column should now have mean ~0 and std ~1.
print(litoScale.mean(axis=0))
print(litoScale.std(axis=0))
# Persist and reload the trained classifier with joblib.
import joblib
# Serialise the fitted classifier `clf` to model.pkl.
joblib.dump(clf,'model.pkl')
# Load it back; `model` can predict without retraining.
model = joblib.load('model.pkl')