训练数据的保存和加载

代码

from sklearn.datasets import load_boston # 波士顿房价数据集使用API
from sklearn.linear_model import LogisticRegression ##回归预测时使用的API  Ridge岭回归  LogisticRegression逻辑回归
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler ## 标准化API
from sklearn.metrics import mean_squared_error,classification_report
from sklearn.externals import joblib
import pandas as pd
import numpy as np
from sklearn.externals import joblib

def charge_data():
    """Train and evaluate a logistic-regression classifier on the breast-cancer data.

    Reads ``./breast-cancer-wisconsin.data``, drops every row that contains a
    ``"?"`` placeholder, splits the rows 75/25 into train and test sets,
    standardizes the nine feature columns and fits ``LogisticRegression``.

    Prints the learned coefficients, the predictions for the training split,
    the accuracy on the test split and a classification report for the
    training split.

    Returns:
        None
    """
    # Column labels: an id column, nine feature columns and the class label.
    colums = ["colun1", "colum2", "colum3", "colum4", "colum5", "colum6",
              "colum7", "colum8", "colum9", "colum10", "TARGET"]
    data = pd.read_csv("./breast-cancer-wisconsin.data", names=colums)

    # Missing values are encoded as "?"; turn them into NaN so dropna() removes the rows.
    data = data.replace(to_replace="?", value=np.nan)
    data = data.dropna()

    # colums[1:10] are the features (the id column is skipped), colums[10] is the label.
    x_train, x_text, y_train, y_text = train_test_split(
        data[colums[1:10]], data[colums[10]], test_size=0.25)

    # Standardize the features. The transformed arrays must be assigned back:
    # fit_transform/transform return new arrays and leave their input untouched.
    # The scaler is fitted on the training split only and reused for the test split.
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_text = std.transform(x_text)

    lg = LogisticRegression(C=1.0)
    lg.fit(x_train, y_train)
    print("回归参数:", lg.coef_)
    pre = lg.predict(x_train)
    print("预测值", pre)
    print("准确率:", lg.score(x_text, y_text))
    # The report maps label 2 to "良性" and label 4 to "恶性".
    print("召回率:\n", classification_report(y_train, pre, labels=[2, 4], target_names=["良性", "恶性"]))
    return None

# 数据集下载地址: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data

def get_train():
    """Fit a standardize-then-classify model and save it to ``text.pkl``.

    Reads ``./breast-cancer-wisconsin.data``, drops every row that contains a
    ``"?"`` placeholder, splits the rows 75/25 and fits a pipeline made of
    ``StandardScaler`` followed by ``LogisticRegression`` on the training
    split. The fitted pipeline is written with ``joblib.dump``.

    Because the scaler is stored together with the classifier, code that
    loads ``text.pkl`` can call ``predict``/``score`` directly on raw feature
    columns. The classifier itself is the last pipeline step, so its
    coefficients are available as ``model[-1].coef_`` (or
    ``model.steps[-1][1].coef_`` on older scikit-learn releases).

    Returns:
        None
    """
    # Local import: the pipeline helper is only needed by this function.
    from sklearn.pipeline import make_pipeline

    # Column labels: an id column, nine feature columns and the class label.
    colums = ["colun1", "colum2", "colum3", "colum4", "colum5", "colum6",
              "colum7", "colum8", "colum9", "colum10", "TARGET"]
    data = pd.read_csv("./breast-cancer-wisconsin.data", names=colums)

    # Missing values are encoded as "?"; turn them into NaN so dropna() removes the rows.
    data = data.replace(to_replace="?", value=np.nan)
    data = data.dropna()

    # colums[1:10] are the features (the id column is skipped), colums[10] is the label.
    # Only the training split is used here; the test split is discarded.
    x_train, x_text, y_train, y_text = train_test_split(
        data[colums[1:10]], data[colums[10]], test_size=0.25)

    # Previously the scaler output was thrown away and the scaler was never
    # saved, so standardization had no effect. Bundling both steps makes the
    # scaling actually happen and keeps it attached to the persisted model.
    lg = make_pipeline(StandardScaler(), LogisticRegression(C=1.0))
    lg.fit(x_train, y_train)
    joblib.dump(lg, filename="text.pkl")
    print("保存成功")
    return None

if __name__ == '__main__':
    # Script entry: load the model stored in "text.pkl" and try it on a fresh
    # random split of the data set. get_train() is the function in this file
    # that writes "text.pkl"; it has to have been run beforehand.

    # Column labels: an id column, nine feature columns and the class label.
    column_names = [
        "colun1", "colum2", "colum3", "colum4", "colum5",
        "colum6", "colum7", "colum8", "colum9", "colum10",
        "TARGET",
    ]
    frame = pd.read_csv("./breast-cancer-wisconsin.data", names=column_names)

    # "?" marks a missing value; convert to NaN and drop the affected rows.
    frame = frame.replace(to_replace="?", value=np.nan)
    frame = frame.dropna()

    feature_frame = frame[column_names[1:10]]
    label_series = frame[column_names[10]]
    train_x, test_x, train_y, test_y = train_test_split(
        feature_frame, label_series, test_size=0.25)

    model = joblib.load("text.pkl")
    predicted = model.predict(test_x)
    print("预测数据是", predicted)
    # NOTE(review): predictions above use the test split, while this accuracy
    # is computed on the training split of the new random split.
    print("预测准确率", model.score(train_x, train_y))



# 明天计划:
# 1.保存训练结果,并用训练结果预测测试集,判断准确率是否为100%
# 2.利用训练集训练结果预测训练集 看看是不是100%
# 3.回顾以前不懂的地方

训练数据的保存和加载_第1张图片

你可能感兴趣的:(训练数据的保存和加载)