Python Machine Learning: Classic Models in Practice

Naive Bayes example: text classification

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

def naive_bayes():
    """
    Text classification with naive Bayes
    :return: None
    """
    # Fetch the 20 newsgroups dataset
    news = fetch_20newsgroups(subset='all')

    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.2)

    # Extract features from the dataset
    tf = TfidfVectorizer()

    # Score the importance of each word in every article against the vocabulary
    # learned from the training set, e.g. ['a', 'b', 'c', 'd']. The input must be
    # pre-tokenized text, with the tokens of each article joined by spaces.
    x_train = tf.fit_transform(x_train)

    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    print(tf.get_feature_names_out())

    x_test = tf.transform(x_test)

    # Predict with the naive Bayes algorithm
    mlt = MultinomialNB(alpha=1.0)
    print(x_train)

    # Fit on the training set
    mlt.fit(x_train, y_train)

    y_predict = mlt.predict(x_test)
    print("Predicted article categories: ", y_predict)

    # Compute the accuracy
    print("Accuracy:", mlt.score(x_test, y_test))

    print("Precision and recall per class:\n", classification_report(y_test, y_predict, target_names=news.target_names))

    return None

if __name__ == "__main__":
    naive_bayes()
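
To make the fit_transform / transform split above concrete, here is a minimal sketch with a made-up two-document corpus (documents and tokens are purely illustrative): the test set is scored against the vocabulary learned from the training set, and unseen words are dropped.

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical, already-tokenized corpus: tokens joined by spaces
train_docs = ["machine learning is fun", "deep learning uses neural networks"]
test_docs = ["learning neural machine translation"]  # 'translation' is unseen

tf = TfidfVectorizer()
train_matrix = tf.fit_transform(train_docs)  # learn vocabulary and idf weights from training data
test_matrix = tf.transform(test_docs)        # reuse that vocabulary; 'translation' gets no column

print(tf.get_feature_names_out())
print(train_matrix.shape, test_matrix.shape)  # both have len(vocabulary) columns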

Decision tree example: predicting Titanic passenger survival

import pandas as pd
from sklearn.feature_extraction import DictVectorizer  # feature extraction
from sklearn.model_selection import train_test_split  # data splitting
from sklearn.tree import DecisionTreeClassifier  # decision tree model
from sklearn.tree import export_graphviz  # exports the tree structure for rendering

def decision():
    """
    Predict Titanic passenger survival with a decision tree
    :return: None
    """
    # Load the data (the Vanderbilt URL below may no longer be reachable;
    # fall back to a local copy of the file if it is not)
    # titan = pd.read_csv('titan.csv')
    titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")
    # Pick out the feature columns and the target column
    x = titan[['pclass', 'age', 'sex']]

    y = titan['survived']

    print(x)
    # Handle missing values (copy first to avoid pandas' SettingWithCopyWarning)
    x = x.copy()
    x['age'] = x['age'].fillna(x['age'].mean())


    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: categorical features -> one-hot encoding
    dv = DictVectorizer(sparse=False)

    # Convert each row into a dict; DictVectorizer needs dicts for feature extraction
    x_train = dv.fit_transform(x_train.to_dict(orient="records"))

    print(dv.get_feature_names_out())

    x_test = dv.transform(x_test.to_dict(orient="records"))
    # print(x_train)
    # Predict with a decision tree
    dec = DecisionTreeClassifier(max_depth=8)  # cap the maximum depth

    dec.fit(x_train, y_train)

    # Accuracy on the test set
    print("Prediction accuracy: ", dec.score(x_test, y_test))

    # Export the structure of the decision tree
    export_graphviz(dec, out_file="./tree.dot", feature_names=['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male'])


    return None

if __name__ == "__main__":
    decision()
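
The exported tree.dot file is plain Graphviz source. Assuming the Graphviz tools are installed, it can be rendered to an image from the command line:

dot -Tpng tree.dot -o tree.png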


Random forest example: Titanic passenger survival prediction

import pandas as pd
from sklearn.feature_extraction import DictVectorizer  # feature extraction
from sklearn.model_selection import train_test_split  # data splitting
from sklearn.tree import DecisionTreeClassifier  # decision tree model
from sklearn.tree import export_graphviz  # exports a tree structure for rendering
from sklearn.ensemble import RandomForestClassifier  # random forest
from sklearn.model_selection import GridSearchCV  # grid search with cross-validation

def decision():
    """
    Predict Titanic passenger survival with a random forest
    (the earlier decision tree code is kept below, commented out)
    :return: None
    """
    # Load the data (the Vanderbilt URL below may no longer be reachable;
    # fall back to a local copy of the file if it is not)
    # titan = pd.read_csv('titan.csv')
    titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")
    # Pick out the feature columns and the target column
    x = titan[['pclass', 'age', 'sex']]

    y = titan['survived']

    print(x)
    # Handle missing values (copy first to avoid pandas' SettingWithCopyWarning)
    x = x.copy()
    x['age'] = x['age'].fillna(x['age'].mean())


    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: categorical features -> one-hot encoding
    dv = DictVectorizer(sparse=False)

    # Convert each row into a dict; DictVectorizer needs dicts for feature extraction
    x_train = dv.fit_transform(x_train.to_dict(orient="records"))

    print(dv.get_feature_names_out())

    x_test = dv.transform(x_test.to_dict(orient="records"))
    # print(x_train)

    # Predict with a decision tree
    # dec = DecisionTreeClassifier(max_depth=8)  # cap the maximum depth
    #
    # dec.fit(x_train, y_train)
    #
    # # Accuracy on the test set
    # print("Prediction accuracy: ", dec.score(x_test, y_test))
    #
    # # Export the structure of the decision tree
    # export_graphviz(dec, out_file="./tree.dot", feature_names=['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male'])

    # Predict with a random forest (hyperparameter tuning)
    rf = RandomForestClassifier()
    # Parameter grid: the first list holds candidate numbers of trees, the second
    # candidate maximum depths; grid search tries every pairwise combination,
    # i.e. 6 x 5 = 30 combinations here
    param = {"n_estimators": [120, 200, 300, 500, 800, 1200], "max_depth": [5, 8, 15, 25, 30]}
    # Grid search with 2-fold cross-validation
    gc = GridSearchCV(rf, param_grid=param, cv=2)

    # Run the grid search (fits one model per parameter combination and fold)
    gc.fit(x_train, y_train)

    print("Accuracy:", gc.score(x_test, y_test))

    print("Best parameters found:", gc.best_params_)
    # Note: the structure of a whole random forest cannot be exported;
    # only a single decision tree can be rendered as an image
    return None

if __name__ == "__main__":
    decision()
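
After fitting, GridSearchCV exposes more than best_params_. A short sketch of what else can be inspected (these are standard GridSearchCV attributes; the snippet continues from gc.fit above):

# Mean cross-validation accuracy of the best parameter combination
print("Best CV score:", gc.best_score_)
# The winning model, refitted on the whole training set
print("Best estimator:", gc.best_estimator_)

# cv_results_ holds per-combination details, e.g. one row per 30 combinations
import pandas as pd
results = pd.DataFrame(gc.cv_results_)
print(results[["params", "mean_test_score", "rank_test_score"]])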


Linear regression, gradient descent, and ridge regression on Boston housing prices

from sklearn.datasets import load_boston  # Boston housing data (removed in scikit-learn 1.2)
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge  # linear regression, SGD, ridge regression
from sklearn.model_selection import train_test_split  # data splitting
from sklearn.preprocessing import StandardScaler  # standardization
from sklearn.metrics import mean_squared_error  # regression evaluation API

def mylinear():
    """
    Predict house prices directly with linear regression
    :return: None
    """
    # Load the data (note: load_boston was removed in scikit-learn 1.2;
    # this example requires an older version)
    lb = load_boston()

    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25)

    print(y_train, y_test)

    # Standardize both the features and the targets,
    # using a separate scaler instance for each
    std_x = StandardScaler()

    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Target values
    std_y = StandardScaler()

    y_train = std_y.fit_transform(y_train.reshape(-1, 1))

    y_test = std_y.transform(y_test.reshape(-1, 1))

    # Estimator predictions
    # 1) Normal-equation solution
    lr = LinearRegression()
    lr.fit(x_train, y_train)

    print(lr.coef_)  # print the learned weights

    # Predict test-set house prices (undo the target scaling)
    y_lr_predict = std_y.inverse_transform(lr.predict(x_test))

    print("Normal equation: predicted price of each test-set house:\n", y_lr_predict)

    print("Normal equation MSE:", mean_squared_error(std_y.inverse_transform(y_test), y_lr_predict))

    # 2) Gradient descent
    sgd = SGDRegressor()

    sgd.fit(x_train, y_train.ravel())  # SGDRegressor expects a 1-D target array

    print(sgd.coef_)  # print the learned weights

    # Predict test-set house prices (reshape: inverse_transform expects a 2-D array)
    y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))

    print("Gradient descent: predicted price of each test-set house:\n", y_sgd_predict)

    print("Gradient descent MSE:", mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))

    # 3) Ridge regression
    rd = Ridge(alpha=1.0)

    rd.fit(x_train, y_train)

    print(rd.coef_)  # print the learned weights

    # Predict test-set house prices
    y_rd_predict = std_y.inverse_transform(rd.predict(x_test))

    print("Ridge regression: predicted price of each test-set house:\n", y_rd_predict)

    print("Ridge regression MSE:", mean_squared_error(std_y.inverse_transform(y_test), y_rd_predict))

    return None

if __name__ == "__main__":
    mylinear()
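
Scaling the target by hand and remembering to call inverse_transform on every prediction is easy to get wrong. scikit-learn's TransformedTargetRegressor (from sklearn.compose) can bundle the target scaler with the regressor instead; a minimal sketch, assuming x_train/x_test are the standardized features and y_train the raw, unscaled prices:

from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler

# Scales y internally before fitting and inverse-transforms predictions automatically
model = TransformedTargetRegressor(regressor=SGDRegressor(), transformer=StandardScaler())
model.fit(x_train, y_train)
y_pred = model.predict(x_test)  # already back on the original price scale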


Saving and loading models

from sklearn.datasets import load_boston  # Boston housing data (removed in scikit-learn 1.2)
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge  # linear regression, SGD, ridge regression
from sklearn.model_selection import train_test_split  # data splitting
from sklearn.preprocessing import StandardScaler  # standardization
import joblib  # model persistence (sklearn.externals.joblib was removed in scikit-learn 0.23)

def mylinear():
    """
    Predict house prices directly with linear regression
    :return: None
    """
    # Load the data
    lb = load_boston()

    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25)

    print(y_train, y_test)

    # Standardize both the features and the targets,
    # using a separate scaler instance for each
    std_x = StandardScaler()

    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Target values
    std_y = StandardScaler()

    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))

    # Predict with a previously saved model
    # model = joblib.load("./test.pkl")  # test.pkl in the current folder is the model saved earlier
    #
    # y_predict = std_y.inverse_transform(model.predict(x_test))
    # print("Predictions from the saved model:\n", y_predict)

    # Estimator prediction
    # Normal-equation solution
    lr = LinearRegression()
    lr.fit(x_train, y_train)

    print(lr.coef_)  # print the learned weights

    # Save the trained model to test.pkl in the current folder
    joblib.dump(lr, "./test.pkl")

    return None

if __name__ == "__main__":
    mylinear()
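
One pitfall with saving just the regressor: it expects standardized input, so the scalers must be persisted too. A minimal sketch that bundles everything in one dict (the file name and x_new are hypothetical):

import joblib

# Persist the model together with the scalers it depends on
joblib.dump({"model": lr, "std_x": std_x, "std_y": std_y}, "./model_bundle.pkl")

# Later, in a fresh process:
bundle = joblib.load("./model_bundle.pkl")
x_new_scaled = bundle["std_x"].transform(x_new)  # x_new: raw feature rows (hypothetical)
price = bundle["std_y"].inverse_transform(bundle["model"].predict(x_new_scaled))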


Logistic regression binary classification example: cancer prediction

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split  # data splitting
from sklearn.preprocessing import StandardScaler  # standardization
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.metrics import classification_report  # classification evaluation API

def logistic():
    """
    Binary cancer prediction with logistic regression (from cell attributes)
    :return: None
    """

    # Build the column names
    column = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
              'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei',
              'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']

    # Read the data
    data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data", names=column)

    print(data)

    # Handle missing values (missing entries are marked with '?')
    data = data.replace(to_replace='?', value=np.nan)

    data = data.dropna()

    # The '?' markers made pandas read 'Bare Nuclei' as strings; cast the features to numeric
    data[column[1:10]] = data[column[1:10]].astype(float)

    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(data[column[1:10]], data[column[10]], test_size=0.25)
    print(y_train, y_test)

    # Standardize the features
    std = StandardScaler()

    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # Logistic regression
    lg = LogisticRegression(C=1.0)
    lg.fit(x_train, y_train)

    print(lg.coef_)

    y_predict = lg.predict(x_test)

    print("Accuracy:", lg.score(x_test, y_test))

    # In this dataset class 2 = benign, class 4 = malignant
    print("Precision and recall:\n", classification_report(y_test, y_predict, labels=[2, 4], target_names=["benign", "malignant"]))

    return None

if __name__ == "__main__":
    logistic()
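
For a screening task the raw counts behind those recall numbers matter. A short sketch with scikit-learn's confusion_matrix, continuing from y_test and y_predict above:

from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes, in the order given by labels
cm = confusion_matrix(y_test, y_predict, labels=[2, 4])
print(cm)
# cm[1, 0] counts malignant tumors predicted as benign -- the costliest error here
print("Missed malignant cases:", cm[1, 0])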


K-Means clustering: market basket case study

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans  # clustering API


# Read the four tables (Instacart market basket data)
prior = pd.read_csv("order_products__prior.csv")
products = pd.read_csv("products.csv")
orders = pd.read_csv("orders.csv")
aisles = pd.read_csv("aisles.csv")

# Merge the four tables into one (user - product category)
_mg = pd.merge(prior, products, on='product_id')
_mg = pd.merge(_mg, orders, on='order_id')
mt = pd.merge(_mg, aisles, on='aisle_id')
print(mt.head(10))

# Cross table (a special grouping tool):
# group by user_id and count the frequency of each aisle
cross = pd.crosstab(mt['user_id'], mt['aisle'])
print(cross.head(10))

# Principal component analysis, keeping 90% of the variance
pca = PCA(n_components=0.9)
data = pca.fit_transform(cross)

# Reduce the number of samples
x = data[:500]
print(x.shape)

km = KMeans(n_clusters=4)
km.fit(x)

predict = km.predict(x)
print("Predicted clusters:\n", predict)

# Visualize the clustering result
plt.figure(figsize=(6, 6))  # set the figure size

# One color per cluster
colored = ['orange', 'green', 'blue', 'purple']
colr = [colored[i] for i in predict]
# Scatter plot of two of the principal components
plt.scatter(x[:, 1], x[:, 20], color=colr)
plt.xlabel("component 1")
plt.ylabel("component 20")
plt.show()

# print(data.shape)
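
n_clusters=4 is picked by hand above. The silhouette score is one standard way to compare candidate cluster counts; a minimal sketch reusing the reduced matrix x from above:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Scores near 1 mean tight, well-separated clusters; near 0, overlapping ones
for k in range(2, 7):
    labels = KMeans(n_clusters=k).fit_predict(x)
    print(k, silhouette_score(x, labels))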

