# B站账号@狼群里的小杨,记得点赞收藏加关注,一键三连哦!
# 朴素贝叶斯算法实例文本分类 (Naive Bayes text-classification example)
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
def naviebayes():
    """Classify 20-newsgroups posts with a multinomial Naive Bayes model.

    Downloads the full 20-newsgroups corpus, TF-IDF vectorizes the text,
    trains MultinomialNB and prints accuracy plus a per-class
    precision/recall report.

    :return: None
    """
    news = fetch_20newsgroups(subset='all')
    # Hold out 20% of the posts for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        news.data, news.target, test_size=0.2)
    tf = TfidfVectorizer()
    # Fit the vocabulary on the training split only, then reuse it on test.
    x_train = tf.fit_transform(x_train)
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is the supported replacement.
    print(tf.get_feature_names_out())
    x_test = tf.transform(x_test)
    mlt = MultinomialNB(alpha=1.0)  # alpha=1.0: Laplace smoothing
    print(x_train)
    mlt.fit(x_train, y_train)
    y_predict = mlt.predict(x_test)
    print("预测的文章类别为: ", y_predict)
    print("准确率:", mlt.score(x_test, y_test))
    print("每个类别的精确率和召回率:\n",
          classification_report(y_test, y_predict,
                                target_names=news.target_names))
    return None


if __name__ == "__main__":
    naviebayes()
# 决策树实例-预测泰坦尼克号乘客生存率 (Decision tree: Titanic survival prediction)
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
def decision():
    """Predict Titanic passenger survival with a decision tree.

    One-hot encodes the categorical features via DictVectorizer, fits a
    depth-limited tree, prints test accuracy and exports the tree to
    Graphviz dot format.

    :return: None
    """
    # NOTE(review): the biostat.mc.vanderbilt.edu mirror has been retired;
    # the same file is served from https://hbiostat.org/data/repo/titanic.txt
    # — confirm before relying on this URL.
    titan = pd.read_csv(
        "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")
    # .copy() so the imputation below writes to an owned frame, not a view.
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']
    print(x)
    # Impute missing ages with the column mean; explicit assignment avoids
    # the chained-assignment/SettingWithCopyWarning that inplace=True on a
    # sliced frame triggers.
    x['age'] = x['age'].fillna(x['age'].mean())
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    vec = DictVectorizer(sparse=False)  # renamed from `dict` (shadowed builtin)
    x_train = vec.fit_transform(x_train.to_dict(orient="records"))
    # get_feature_names() was removed in scikit-learn 1.2.
    print(vec.get_feature_names_out())
    x_test = vec.transform(x_test.to_dict(orient="records"))
    dec = DecisionTreeClassifier(max_depth=8)
    dec.fit(x_train, y_train)
    print("预测的准确率: ", dec.score(x_test, y_test))
    # Labels mirror DictVectorizer's sorted feature order:
    # age, pclass=1st, pclass=2nd, pclass=3rd, sex=female, sex=male.
    export_graphviz(dec, out_file="./tree.dot",
                    feature_names=['年龄', 'pclass=1st', 'pclass=2nd',
                                   'pclass=3rd', '女性', '男性'])
    return None


if __name__ == "__main__":
    decision()
# 随机森林实例-泰坦尼克号乘客生存预测 (Random forest: Titanic survival prediction)
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
def decision():
    """Predict Titanic passenger survival with a grid-searched random forest.

    One-hot encodes the categorical features via DictVectorizer, then runs
    GridSearchCV over forest size and tree depth and reports the best model.

    :return: None
    """
    # NOTE(review): the biostat.mc.vanderbilt.edu mirror has been retired;
    # the same file is served from https://hbiostat.org/data/repo/titanic.txt
    # — confirm before relying on this URL.
    titan = pd.read_csv(
        "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")
    # .copy() so the imputation below writes to an owned frame, not a view.
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']
    print(x)
    # Explicit assignment instead of inplace=True on a sliced frame, which
    # triggers SettingWithCopyWarning.
    x['age'] = x['age'].fillna(x['age'].mean())
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    vec = DictVectorizer(sparse=False)  # renamed from `dict` (shadowed builtin)
    x_train = vec.fit_transform(x_train.to_dict(orient="records"))
    # get_feature_names() was removed in scikit-learn 1.2.
    print(vec.get_feature_names_out())
    x_test = vec.transform(x_test.to_dict(orient="records"))
    rf = RandomForestClassifier()
    # Search ensemble size and depth; cv=2 keeps the demo reasonably fast.
    param = {"n_estimators": [120, 200, 300, 500, 800, 1200],
             "max_depth": [5, 8, 15, 25, 30]}
    gc = GridSearchCV(rf, param_grid=param, cv=2)
    gc.fit(x_train, y_train)
    print("准确率:", gc.score(x_test, y_test))
    print("查看选择的参数模型:", gc.best_params_)
    return None


if __name__ == "__main__":
    decision()
# 线性回归、梯度下降、岭回归预测波士顿房价 (Linear / SGD / Ridge regression: Boston housing prices)
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
def mylinear():
    """Compare normal-equation, SGD and ridge regression on Boston housing.

    Standardizes both features and target, fits the three regressors and
    prints predictions plus mean squared error (in original price units)
    for each.

    :return: None
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2; this
    # function requires scikit-learn < 1.2 — confirm the pinned version.
    lb = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        lb.data, lb.target, test_size=0.25)
    print(y_train, y_test)
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)
    # The target is standardized too; StandardScaler requires 2-D input,
    # hence the reshape to a column vector.
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))
    # ravel(): scikit-learn estimators expect a 1-D target; passing the
    # (n, 1) column raises a DataConversionWarning (and SGDRegressor
    # rejects it outright in recent versions).
    lr = LinearRegression()
    lr.fit(x_train, y_train.ravel())
    print(lr.coef_)
    # predict() returns 1-D; inverse_transform expects 2-D -> reshape.
    y_lr_predict = std_y.inverse_transform(lr.predict(x_test).reshape(-1, 1))
    print("正规方程测试集里面每个房子的预测价格:\n", y_lr_predict)
    print("正规方程的均方误差:",
          mean_squared_error(std_y.inverse_transform(y_test), y_lr_predict))
    sgd = SGDRegressor()
    sgd.fit(x_train, y_train.ravel())
    print(sgd.coef_)
    y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))
    print("梯度下降测试集里面每个房子的预测价格:\n", y_sgd_predict)
    print("梯度下降的均方误差:",
          mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))
    rd = Ridge(alpha=1.0)  # alpha: L2 regularization strength
    rd.fit(x_train, y_train.ravel())
    print(rd.coef_)
    y_rd_predict = std_y.inverse_transform(rd.predict(x_test).reshape(-1, 1))
    print("岭回归测试集里面每个房子的预测价格:\n", y_rd_predict)
    print("岭回归的均方误差:",
          mean_squared_error(std_y.inverse_transform(y_test), y_rd_predict))
    return None


if __name__ == "__main__":
    mylinear()
# 保存和加载API (Model save / load API)
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
def mylinear():
    """Train a LinearRegression on Boston housing data and persist it.

    Standardizes features and target, fits the model, prints the learned
    coefficients and dumps the estimator to ./test.pkl with joblib.

    :return: None
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2; this
    # function requires scikit-learn < 1.2 — confirm the pinned version.
    lb = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        lb.data, lb.target, test_size=0.25)
    print(y_train, y_test)
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)
    # StandardScaler requires 2-D input, hence the column-vector reshape.
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))
    lr = LinearRegression()
    # ravel(): estimators expect a 1-D target, not an (n, 1) column.
    lr.fit(x_train, y_train.ravel())
    print(lr.coef_)
    # sklearn.externals.joblib was removed in scikit-learn 0.23; the
    # standalone joblib package is the supported replacement (the stale
    # top-of-file import should be updated to `import joblib` as well).
    import joblib
    joblib.dump(lr, "./test.pkl")
    return None


if __name__ == "__main__":
    mylinear()
# 逻辑回归二分类实例癌症预测 (Logistic-regression binary classification: cancer prediction)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
def logistic():
    """Predict breast-cancer malignancy with logistic regression.

    Loads the Wisconsin breast-cancer data from the UCI repository, drops
    rows with missing measurements, standardizes the nine cell features and
    reports accuracy plus precision/recall for the benign (2) and
    malignant (4) classes.

    :return: None
    """
    cols = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
            'Uniformity of Cell Shape', 'Marginal Adhesion',
            'Single Epithelial Cell Size', 'Bare Nuclei',
            'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
    data = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
        "breast-cancer-wisconsin/breast-cancer-wisconsin.data",
        names=cols)
    print(data)
    # '?' marks a missing measurement; map it to NaN and drop those rows.
    data = data.replace(to_replace='?', value=np.nan).dropna()
    # Columns 1-9 are the cell features; column 10 ('Class') is the label.
    features = data[cols[1:10]]
    target = data[cols[10]]
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25)
    print(y_train, y_test)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    model = LogisticRegression(C=1.0)
    model.fit(x_train, y_train)
    print(model.coef_)
    y_predict = model.predict(x_test)
    print("准确率:", model.score(x_test, y_test))
    print("召回率:",
          classification_report(y_test, y_predict, labels=[2, 4],
                                target_names=["良性", "恶性"]))
    return None


if __name__ == "__main__":
    logistic()
# 聚类Kmeans算法实现市场菜篮子案例 (K-Means clustering: market-basket example)
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Market-basket segmentation: join the Instacart tables into one
# user x aisle purchase table, reduce it with PCA and cluster users.
prior = pd.read_csv("order_products__prior.csv")
products = pd.read_csv("products.csv")
orders = pd.read_csv("orders.csv")
aisles = pd.read_csv("aisles.csv")
# A single shared key per join is enough; on=['k', 'k'] was redundant.
_mg = pd.merge(prior, products, on='product_id')
_mg = pd.merge(_mg, orders, on='order_id')
mt = pd.merge(_mg, aisles, on='aisle_id')
print(mt.head(10))
# Rows: users, columns: aisles, cells: purchase counts.
cross = pd.crosstab(mt['user_id'], mt['aisle'])
print(cross.head(10))
# 0 < n_components < 1 keeps enough components for 90% explained variance.
pca = PCA(n_components=0.9)
data = pca.fit_transform(cross)
x = data[:500]  # first 500 users keeps the demo fast
print(x.shape)
km = KMeans(n_clusters=4)
km.fit(x)
predict = km.predict(x)
print("预测结果:\n", predict)
plt.figure(figsize=(6, 6))
colored = ['orange', 'green', 'blue', 'purple']
colr = [colored[i] for i in predict]
# Components 1 and 20 were hard-coded, but PCA at 90% variance may retain
# fewer than 21 components — guard before indexing to avoid IndexError.
if x.shape[1] > 20:
    plt.scatter(x[:, 1], x[:, 20], color=colr)
else:
    plt.scatter(x[:, 0], x[:, 1], color=colr)
plt.xlabel("1")
plt.ylabel("20")
plt.show()