3.1 机器学习 - 机器学习项目案例

机器学习 - 机器学习项目案例

案例1:利用岭回归研究波士顿房价

读取数据

# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still provides it.
from sklearn.datasets import load_boston
boston = load_boston()
# Show the feature names and the shape of the feature matrix.
print('feature_names:', boston.feature_names)
print('data (shape) :', boston.data.shape)

在这里插入图片描述

线性回归模型
from sklearn.linear_model import LinearRegression

# Plain linear regression, fitted and scored on the full dataset.
lin_reg = LinearRegression()

lin_reg.fit(boston.data, boston.target) # Fit
pre = lin_reg.predict(boston.data) # Predict
# The score is computed on the same data the model was fitted on.
lin_reg.score(boston.data, boston.target) #Score

在这里插入图片描述

岭回归模型
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=0.5) # the larger alpha is, the more weight the regularization term carries

# Fitted and scored on the full dataset.
ridge_reg.fit(boston.data, boston.target) # Fit
ridge_reg.score(boston.data, boston.target) # Score

在这里插入图片描述

test_Ridge_alpha函数
探究alpha不同值,得到的回归结果

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
def test_Ridge_alpha(*data):
    """Plot the test-set score of Ridge regression over a range of alphas.

    For every alpha in a fixed list, a fresh ``Ridge`` model is fitted on the
    training split and scored on the test split. The scores are then drawn as
    a line plot against alpha.

    Args:
        *data: Exactly four positional values, in the order
            ``X_train, X_test, y_train, y_test``.

    Raises:
        ValueError: If ``data`` does not unpack into exactly four values.
    """
    X_train, X_test, y_train, y_test = data
    alphas = [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 5, 10, 20, 50, 100, 200, 500, 1000]
    # The loop index was never used, so iterate over the alphas directly.
    scores = []
    for alpha in alphas:
        ridge_reg = Ridge(alpha=alpha)
        ridge_reg.fit(X_train, y_train)
        scores.append(ridge_reg.score(X_test, y_test))
    plt.xlabel('Alphas')
    # This line used to be tab-indented inside a space-indented body, which
    # Python 3 rejects with TabError; it is now indented with spaces.
    plt.ylabel('Scores')
    sns.lineplot(x=alphas, y=scores)
    
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test split; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.3, random_state=31)
# Plot the test score for each alpha tried by test_Ridge_alpha.
test_Ridge_alpha(X_train, X_test, y_train, y_test)

3.1 机器学习 - 机器学习项目案例_第1张图片

invalid value encountered in true_divide # Remove the CWD from sys.path while we load stuff.

在这里插入图片描述

import numpy as np 
# Tell NumPy to ignore divide-by-zero and invalid-value floating-point errors.
np.seterr(divide='ignore', invalid='ignore')

在这里插入图片描述
在这里插入图片描述

案例2:利用决策树回归预测波士顿房价

import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.tree import DecisionTreeRegressor

# Load DataSet
boston = load_boston()
X, y = boston.data, boston.target
features = boston.feature_names

# Fit
regression_tree = DecisionTreeRegressor(min_samples_split=30, min_samples_leaf=10, random_state=0)  # decision tree
regression_tree.fit(X, y)

# Score
# Without an explicit `scoring`, cross_val_score uses the estimator's own
# score method (R^2 for a regressor), which does not match the label printed
# below. Ask for the negated mean squared error instead; scikit-learn negates
# it so that "greater is better", which is why abs() is applied before printing.
score = np.mean(cross_val_score(regression_tree, X, y, cv=3, scoring='neg_mean_squared_error'))  # cv=3
print('Mean squared error: {0}'.format(round(abs(score),2)))

在这里插入图片描述

案例3:Logistic回归实现对鸢尾花数据分类

import matplotlib.pyplot as plt
from sklearn import datasets

iris = datasets.load_iris()  # load the iris dataset

# The first two feature columns: sepal length and sepal width.
sepal_length_list = iris.data[:, 0]
sepal_width_list = iris.data[:, 1]

# Boolean masks selecting the samples of each species.
setosa_index_list = iris.target == 0
versicolor_index_list = iris.target == 1
virginica_index_list = iris.target == 2

# One scatter series per species, drawn in the order
# setosa, versicolor, virginica.
species_styles = (
    (setosa_index_list, "red", "o", "setosa"),
    (versicolor_index_list, "blue", "x", "versicolor"),
    (virginica_index_list, "green", "+", "virginica"),
)
for mask, colour, symbol, species in species_styles:
    plt.scatter(sepal_length_list[mask], sepal_width_list[mask],
                color=colour, marker=symbol, label=species)

plt.legend(loc="best", title="iris type")  # legend
plt.xlabel("sepal_length (cm)")  # x-axis label
plt.ylabel("sepal_width (cm)")  # y-axis label

3.1 机器学习 - 机器学习项目案例_第2张图片

逻辑回归
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Load the iris dataset
iris = datasets.load_iris() 

# Split into a training set and a test set (half of the samples each)
X_train, X_test , y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.5, random_state=1)

# Create a logistic regression classifier
logr = LogisticRegression(penalty='l2', random_state=0)

# Train the classifier
logr.fit(X_train, y_train)

# Predict the class of every test sample
category = logr.predict(X_test)
category

在这里插入图片描述

模型可视化
import numpy as np
import matplotlib.pyplot as plt

# Use only the first two features: sepal length and sepal width
X = iris.data[:, 0:2]
y = iris.target

logreg = LogisticRegression(C=1e5)    # C: inverse of the penalty coefficient; the smaller it is, the stronger the regularization
logreg.fit(X, y)

# Grid step size
h = 0.02

x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5    # x axis: first column of X (sepal length), padded by 0.5 on both sides
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5    # y axis: second column of X (sepal width), padded by 0.5 on both sides

xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Flatten xx and yy with ravel(), then join them column-wise with np.c_
# to build the feature matrix to predict on: one row per sample, one column per feature
pre_data = np.c_[xx.ravel(), yy.ravel()]
Z = logreg.predict(pre_data)

# Back to the grid's shape so the predictions line up with xx / yy.
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(8, 6))

# Colour the grid by predicted class
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Overlay the samples, coloured by their true class.
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)

# Axis labels
plt.xlabel("sepal length")
plt.ylabel("sepal width")

# Axis limits
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

# Axis ticks
plt.xticks(np.arange(x_min, x_max, h * 10))
plt.yticks(np.arange(y_min, y_max, h * 10))

plt.show()

3.1 机器学习 - 机器学习项目案例_第3张图片

案例4:利用贝叶斯分类实现手写数字识别

加载数据集

import matplotlib.pyplot as plt
from sklearn.datasets import load_digits

digits = load_digits()

# Draw the first 25 digit images on a 5x5 grid of subplots.
fig = plt.figure()
for position, image in enumerate(digits.images[:25], start=1):
    ax = fig.add_subplot(5, 5, position)
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')

3.1 机器学习 - 机器学习项目案例_第4张图片

测试集的样本数

# Split the dataset (30% held out for testing, fixed random_state)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3, random_state=0)
# Number of samples in the test set
print("y_test(shape):", y_test.shape)

在这里插入图片描述

import numpy as np 
# Tell NumPy to ignore divide-by-zero and invalid-value floating-point errors.
np.seterr(divide='ignore', invalid='ignore')

在这里插入图片描述

GaussianNB

高斯贝叶斯分类器,特征的条件概率符合高斯分布

from sklearn.naive_bayes import GaussianNB

gau_nb = GaussianNB()
gau_nb.fit(X_train, y_train)
gy_pre = gau_nb.predict(X_test)

# Model score on the test set
print("Score:", gau_nb.score(X_test, y_test))
# Number of correctly predicted digits.
# This used to index an undefined name (y_pre) and test
# `y_test / gy_pre == 1`; for a correctly predicted digit 0 that is 0/0 = nan,
# so those hits were never counted. Comparing the labels directly counts
# every match and needs no division.
print("Right:", int((y_test == gy_pre).sum()))

在这里插入图片描述

MultinomialNB

多项式贝叶斯分类器,特征的条件概率符合多项式分布

from sklearn.naive_bayes import MultinomialNB

mul_nb = MultinomialNB()
mul_nb.fit(X_train, y_train)
my_pre = mul_nb.predict(X_test)

print("Score:", mul_nb.score(X_test, y_test))
# Number of correctly predicted digits. The former `y_test / my_pre == 1`
# test evaluates 0/0 = nan for a correctly predicted digit 0 and therefore
# missed those hits; comparing the labels directly counts every match.
print('Right:', int((y_test == my_pre).sum()))

在这里插入图片描述

BernoulliNB

伯努利贝叶斯分类器,特征的条件概率符合伯努利分布(即 n=1 的二项分布)

from sklearn.naive_bayes import BernoulliNB

ber_nb = BernoulliNB()
ber_nb.fit(X_train, y_train)
by_pre = ber_nb.predict(X_test)

print("Score:", ber_nb.score(X_test, y_test))
# Number of correctly predicted digits. The former `y_test / by_pre == 1`
# test evaluates 0/0 = nan for a correctly predicted digit 0 and therefore
# missed those hits; comparing the labels directly counts every match.
print('Right:', int((y_test == by_pre).sum()))

在这里插入图片描述

模型可视化#1
import pandas as pd

# Build a three-column comparison table: classifier name, test score and
# number of correct test predictions.
naive_bayes = pd.DataFrame(['GaussianNB', 'MultinomialNB', 'BernoulliNB'])

score = pd.DataFrame([gau_nb.score(X_test, y_test), mul_nb.score(X_test, y_test), ber_nb.score(X_test, y_test)])

# Correct predictions per classifier, counted by comparing labels directly.
# The earlier form referenced an undefined name (y_pre) and relied on
# `y_test / prediction == 1`, which is nan (0/0) for a correctly predicted
# digit 0 and so under-counted.
right = pd.DataFrame([int((y_test == y_hat).sum()) for y_hat in (gy_pre, my_pre, by_pre)])

vs = pd.concat([naive_bayes, score, right], axis=1)

vs.columns = ['NaiveBayes', 'Score', 'Right']
vs

3.1 机器学习 - 机器学习项目案例_第5张图片

# Horizontal bar chart of the comparison table.
vs.plot.barh()

3.1 机器学习 - 机器学习项目案例_第6张图片

模型可视化#2
# The same comparison table, built in one step from a dict of columns.
# Correct predictions are counted by comparing labels directly: the earlier
# form referenced an undefined name (y_pre) and relied on
# `y_test / prediction == 1`, which is nan (0/0) for a correctly predicted
# digit 0 and so under-counted.
vs_naive_bayes = pd.DataFrame({'NaiveBayes': pd.Series(['GaussianNB', 'MultinomialNB', 'BernoulliNB']),
                               'Score': pd.Series([gau_nb.score(X_test, y_test), mul_nb.score(X_test, y_test), ber_nb.score(X_test, y_test)]),
                               'Right': pd.Series([int((y_test == y_hat).sum()) for y_hat in (gy_pre, my_pre, by_pre)])})
vs_naive_bayes

3.1 机器学习 - 机器学习项目案例_第7张图片

import seaborn as sns
# Bar chart of correct-prediction counts per classifier.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the positional form fails there.
sns.barplot(x=vs_naive_bayes.NaiveBayes, y=vs_naive_bayes.Right)

3.1 机器学习 - 机器学习项目案例_第8张图片

# Bar chart of test scores per classifier.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the positional form fails there.
sns.barplot(x=vs_naive_bayes.NaiveBayes, y=vs_naive_bayes.Score)

3.1 机器学习 - 机器学习项目案例_第9张图片

案例5:利用随机森林分类筛查乳腺癌

from sklearn.datasets import load_breast_cancer
# Load scikit-learn's bundled breast cancer dataset and show its class names.
dataset = load_breast_cancer()
dataset.target_names

在这里插入图片描述

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Column names to assign, since the file is read with header=None.
NAMES = ["CodeNumber", "ClumpThickness", "UniformityCellSize", "UniformityCellShape", "MarginalAdhesion", "SingleEpithelialCellSize", "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses", "CancerType"]
# Read the breast-cancer-wisconsin data file straight from the UCI repository URL.
breast_cancer_data =pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', 
                                header=None,
                               names=NAMES)

breast_cancer_data

3.1 机器学习 - 机器学习项目案例_第10张图片

# Summary statistics of the loaded table.
breast_cancer_data.describe()

3.1 机器学习 - 机器学习项目案例_第11张图片

# 70/30 split. Features are every column between the first (CodeNumber)
# and the last (CancerType); the last column is the label.
feature_columns = NAMES[1:-1]
label_column = NAMES[-1]
train_x, test_x, train_y, test_y = train_test_split(
    breast_cancer_data[feature_columns],
    breast_cancer_data[label_column],
    train_size=0.7,
)

# Report the shape of each part of the split.
for caption, part in (
    ("Train_x Shape :: ", train_x),
    ("Train_y Shape :: ", train_y),
    ("Test_x Shape :: ", test_x),
    ("Test_y Shape :: ", test_y),
):
    print(caption, part.shape)

在这里插入图片描述

RandomForestClassifier #1
# Screen with a random forest classifier
# NOTE(review): at this point the BareNuclei column has not been cleaned yet
# (its '?' entries are only replaced further down) — confirm whether this
# cell is expected to run cleanly.
clf = RandomForestClassifier()
clf.fit(train_x, train_y)

predictions = clf.predict(test_x)

# Print the first five actual labels next to their predictions.
for i in range(0, 5):
    print("Actual outcome :: {} and Predicted outcome :: {}".format(list(test_y)[i], predictions[i]))

# Accuracy on both splits, then the confusion matrix for the test split.
print("Train Accuracy :: ", accuracy_score(train_y, clf.predict(train_x)))
print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
print(" Confusion matrix ", confusion_matrix(test_y, predictions))

3.1 机器学习 - 机器学习项目案例_第12张图片
# Print a summary of the table's columns.
breast_cancer_data.info()
3.1 机器学习 - 机器学习项目案例_第13张图片

# Select the rows whose BareNuclei value is the string '?'.
breast_cancer_data.iloc[np.where(breast_cancer_data['BareNuclei'] == '?')]

3.1 机器学习 - 机器学习项目案例_第14张图片

# Mean of the BareNuclei column over the rows that are not '?'.
# np.int was only an alias of the builtin int and was removed in NumPy 1.24,
# so np.int64 is used here; it is also the dtype this column is converted to
# afterwards.
mean_value = breast_cancer_data[breast_cancer_data["BareNuclei"] != "?"]["BareNuclei"].astype(np.int64).mean()
mean_value

breast_cancer_data['BareNuclei'] = breast_cancer_data['BareNuclei'].replace('?', mean_value)  # replace '?' with mean_value

# Re-run the '?' lookup to check that no such rows remain.
breast_cancer_data.iloc[np.where(breast_cancer_data['BareNuclei'] == '?')]

在这里插入图片描述

# Convert the BareNuclei column to 64-bit integers.
breast_cancer_data["BareNuclei"] = breast_cancer_data["BareNuclei"].astype(np.int64)

# Print the column summary again to check the new dtype.
breast_cancer_data.info()

3.1 机器学习 - 机器学习项目案例_第15张图片

RandomForestClassifier #2
# Fresh 70/30 split: features are the columns between the first and the
# last, the label is the last column.
feature_frame = breast_cancer_data[NAMES[1:-1]]
label_series = breast_cancer_data[NAMES[-1]]
train_x, test_x, train_y, test_y = train_test_split(feature_frame, label_series, train_size=0.7)

# Screen with a random forest classifier
clf = RandomForestClassifier()
clf.fit(train_x, train_y)

predictions = clf.predict(test_x)

# Print the first five actual labels next to their predictions.
actual_outcomes = list(test_y)
outcome_template = "Actual outcome :: {} and Predicted outcome :: {}"
for row in range(5):
    print(outcome_template.format(actual_outcomes[row], predictions[row]))

# Accuracy on both splits, then the confusion matrix for the test split.
train_predictions = clf.predict(train_x)
print("Train Accuracy :: ", accuracy_score(train_y, train_predictions))
print("Test Accuracy  :: ", accuracy_score(test_y, predictions))
print("Confusion matrix :: \n", confusion_matrix(test_y, predictions))

3.1 机器学习 - 机器学习项目案例_第16张图片

参考资料

DataFrame

Matplotlib

help(plt.pcolormesh)

Create a pseudocolor plot with a non-regular rectangular grid.

Numpy

help(np.meshgrid)

Return coordinate matrices from coordinate vectors.

help(np.ravel)

Return a contiguous flattened array.

help(np.c_)

Translates slice objects to concatenation along the second axis.

help(np.seterr)

Set how floating-point errors are handled.

你可能感兴趣的:(#,苏大应用统计数据科学)