Using Common Machine Learning Algorithms

Data Processing Algorithms

k-means Clustering Algorithm

# k-means clustering
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
li = load_iris()
k = 5  # number of clusters
# note: the n_jobs argument was removed from KMeans in scikit-learn 1.0
kmeans_model = KMeans(n_clusters=k, random_state=123)
fit_kmeans = kmeans_model.fit(li.data)  # train the model
kmeans_cc = kmeans_model.cluster_centers_  # cluster centers
print('Cluster centers:\n', kmeans_cc)
kmeans_labels = kmeans_model.labels_
print('Cluster label of each sample:\n', kmeans_labels)
r1 = pd.Series(kmeans_model.labels_).value_counts()
print('Number of samples per cluster:\n', r1)
# evaluation: silhouette coefficient from sklearn.metrics
from sklearn.metrics import silhouette_score
print('Silhouette score:\n', silhouette_score(li.data, kmeans_model.labels_))
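
The k above is fixed at 5, although the iris data actually contains 3 species. A minimal sketch of picking k by comparing silhouette scores over a range, reusing the li dataset loaded above:

# compare silhouette scores for several candidate cluster counts (higher is better)
for k in range(2, 8):
    model = KMeans(n_clusters=k, random_state=123).fit(li.data)
    print(k, silhouette_score(li.data, model.labels_))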

Logistic Regression Algorithm

# logistic regression: a classification algorithm
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
ld = load_digits()  # load the dataset
# split into training and test sets
train_data,test_data,train_target,test_target = train_test_split(ld.data,ld.target,test_size=0.2)
# build the model (raise max_iter so the solver converges on the digits data)
lr = LogisticRegression(max_iter=1000)
lr.fit(train_data,train_target)
# predictions
print("Predictions on the test set:",lr.predict(test_data))
print("Ground-truth labels of the test set:",test_target)
# class probabilities
print("Predicted class probabilities for the first digit image:",lr.predict_proba(np.array([ld.data[0]])))
# accuracy of the predictions: f1_score
from sklearn.metrics import f1_score
print(f1_score(test_target,lr.predict(test_data),average='macro'))
from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_target,lr.predict(test_data)))
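
For a per-class breakdown beyond the single macro F1 number, classification_report prints precision, recall, and F1 for each digit; a short sketch reusing the variables above:

from sklearn.metrics import classification_report
print(classification_report(test_target, lr.predict(test_data)))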

KNN Classification Algorithm

# KNN: a non-parametric classification model
import numpy as np
# toy movie feature data, two features per movie with labels 1 and 2 (exact feature meaning assumed)
datas  = np.array([[80,10],[10,50],[70,15],[20,65],[18,70],[76,19]])
labels = np.array([1,2,1,2,2,1])
movie_new = np.array([50,20])  # a new movie to classify
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=3)
knc.fit(datas,labels)
print(knc.predict(np.array([movie_new])))
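
To see why the classifier chose that label, kneighbors returns the distances and indices of the 3 nearest training samples; a sketch reusing knc from above:

distances, indices = knc.kneighbors(np.array([movie_new]))
print("nearest-neighbor rows in datas:", indices[0])
print("their labels:", labels[indices[0]])  # the majority label decides the prediction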

Naive Bayes Classification

# naive Bayes classification
import numpy as np
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
datas = np.array(["Chinese Beijing Chinese","Chinese Chinese Shanghai","Chinese Macao","Tokyo Japan Chinese"])
target = np.array([1,1,1,0])
# TF-IDF = TF * IDF
# TF = occurrences of a word in a document / total number of words in the document
# IDF = log_10(total documents in the corpus / documents containing the word);
#       it measures how well the word discriminates between classes - the larger, the more discriminative
# e.g. a word appearing in 10 documents of a 1000-document corpus has idf = log_10(1000/10) = 2
tfidf_vec = TfidfVectorizer()
tfidf_data = tfidf_vec.fit_transform(datas)  # tf-idf vectorization of the corpus
# inspect the result
# print(tfidf_data.toarray())
# print(tfidf_vec.get_feature_names_out())
# train the model
multi_nb = MultinomialNB()
multi_nb.fit(tfidf_data,target)
# predict on new data
sample_new = np.array(["Chinese Chinese Chinese Tokyo Japan Japan Japan Japan Japan Japan Japan"])
sample_new_tfidf = tfidf_vec.transform(sample_new)
print("Prediction:",multi_nb.predict(sample_new_tfidf))
# a regular tabular dataset
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
lw = load_wine()  # load the dataset
train_data,test_data,train_target,test_target = train_test_split(lw.data,lw.target,test_size=0.2)  # split the data
from sklearn.naive_bayes import MultinomialNB,GaussianNB
multi_nb = MultinomialNB()
multi_nb.fit(train_data,train_target)
print("Predictions on the test set:",multi_nb.predict(test_data))
print("Ground-truth labels of the test set:",test_target)

Decision Tree

# decision tree
# one-hot encoding example:
# Beijing, Shanghai, Shenzhen
# [1,0,0], [0,1,0], [0,0,1]
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
# splits are chosen by Gini impurity (the DecisionTreeClassifier default)
df = pd.read_csv('dtree.csv')
# turn the rows into dicts: [{'Outlook':'sunny', ...}, {...}, ...]
# convert the features
datas = df.loc[:,['Outlook','Temperature','Humidity','Windy']].to_dict(orient='records')
# convert the training target (label)
target = df.loc[:,['Play']].to_dict(orient='records')
# print("raw data:",df.loc[:,['Outlook','Temperature','Humidity','Windy']])
# print("dict data:",datas)
# build the dict vectorizer; sparse=False returns a dense array instead of a sparse matrix
dv_train = DictVectorizer(sparse=False)
# vectorize the features
train_data = dv_train.fit_transform(datas)
dv_target = DictVectorizer(sparse=False)
target_data = dv_target.fit_transform(target)
# print("feature names:",dv_train.feature_names_)
# print(train_data)
# load and train the model
dt = DecisionTreeClassifier()
dt.fit(train_data,target_data)
# predict a new record
record_new = {'Outlook': 'overcast', 'Temperature': 60, 'Humidity': 60, 'Windy': True}
record_vector = dv_train.transform(record_new)
print("Prediction for this record:",dv_target.inverse_transform(dt.predict(record_vector)))
# a standard dataset
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
lw = load_wine()  # load the dataset
train_data,test_data,train_target,test_target = train_test_split(lw.data,lw.target,test_size=0.4)  # split the data
dt = DecisionTreeClassifier()
dt.fit(train_data,train_target)
dt.predict(test_data)
from sklearn.metrics import f1_score
print(f1_score(test_target,dt.predict(test_data),average='macro'))
'''
Finding the optimal pruning parameter with a learning curve
How do we decide what value each pruning parameter should take? We use a hyperparameter learning curve, continuing with the decision tree model clf we already trained. A hyperparameter learning curve puts the hyperparameter's value on the x-axis and a model evaluation metric on the y-axis, and is used to compare the model's performance across hyperparameter values. For the decision tree built here, that metric is score.
'''
import matplotlib.pyplot as plt
test = []
for i in range(10):
    clf = DecisionTreeClassifier(max_depth=i+1
                                      ,criterion="entropy"
                                      ,random_state=30
                                      ,splitter="random"
                                     )
    clf = clf.fit(train_data,train_target)
    score = clf.score(test_data, test_target)
    test.append(score)
plt.plot(range(1,11),test,color="red",label="max_depth")
plt.legend()
plt.show()
# visualize the decision tree (requires the graphviz package and the Graphviz binaries)
import os
import graphviz
from sklearn import tree
os.environ["PATH"] += os.pathsep + 'G:/graphviz/bin'  # add the Graphviz binaries to PATH (adjust to your install)
dot_data = tree.export_graphviz(dt, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("lw")
dot_data = tree.export_graphviz(clf, out_file=None,
                         feature_names=lw.feature_names,
                         class_names=lw.target_names,
                         filled=True, rounded=True,
                         special_characters=True)

graph = graphviz.Source(dot_data)
graph
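
If Graphviz is not available, scikit-learn's own plot_tree (added in 0.21) draws the same tree with matplotlib alone; a minimal sketch for the wine model above:

import matplotlib.pyplot as plt
from sklearn import tree
plt.figure(figsize=(12,8))
tree.plot_tree(clf, feature_names=lw.feature_names, class_names=list(lw.target_names), filled=True)
plt.show()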

Support Vector Machine

# support vector machine: classification
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
lw = load_wine()  # load the dataset
train_data,test_data,train_target,test_target = train_test_split(lw.data,lw.target,test_size=0.2)  # split the data
from sklearn.svm import NuSVC,SVC  # import the support vector classifiers
svc=SVC()
svc.fit(train_data,train_target)
svc.predict(test_data)

# support vector machine: regression
# note: load_boston was removed in scikit-learn 1.2; fetch_california_housing is a drop-in alternative
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
data = load_boston()
train_data,test_data,train_target,test_target = train_test_split(data.data,data.target,test_size=0.2)
from sklearn.svm import LinearSVR
li=LinearSVR()
li.fit(train_data,train_target)
y_predict = li.predict(test_data)
print("Predicted values: ", y_predict)
print(test_target)
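
SVMs are sensitive to feature scale, and the regression above feeds raw features to LinearSVR. A sketch of the usual remedy, a pipeline that standardizes before fitting:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
pipe = make_pipeline(StandardScaler(), LinearSVR())
pipe.fit(train_data,train_target)
print("R^2 with scaling:", pipe.score(test_data,test_target))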

Linear Regression Algorithm

# linear regression
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
# 1. load the data
data = load_boston()
# 2. split the dataset
train_data,test_data,train_target,test_target = train_test_split(data.data,data.target,test_size=0.2)
# 3. feature engineering: standardization
transfer = StandardScaler()
train_data = transfer.fit_transform(train_data)
test_data = transfer.transform(test_data)  # fit on the training set only, then transform the test set
# 4. machine learning: linear regression (normal equation)
estimator = LinearRegression()
estimator.fit(train_data,train_target)
# 5. model evaluation
# 5.1 inspect coefficients and predictions
y_predict = estimator.predict(test_data)
print("Predicted values: ", y_predict)
print("Ground-truth values: ", test_target)
print("Model coefficients: ", estimator.coef_)
print("Model intercept: ", estimator.intercept_)
error = mean_squared_error(test_target, y_predict)
print("Mean squared error: ", error)
# use the historical data to predict the coming months
import pandas as pd
import numpy as np
data=pd.read_csv('11.csv')
del data['Unnamed: 0']
# append empty rows for the two months to be predicted
# (DataFrame.append was removed in pandas 2.0, so use pd.concat instead)
new=pd.DataFrame({'时间':['2018-11-01','2018-12-01']})
data=pd.concat([data,new],ignore_index=True)
data=data.fillna(0)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
data['时间']=pd.to_datetime(data['时间'])
# datetimes must be numeric before scaling: convert to integer nanosecond timestamps
data['时间']=data['时间'].astype('int64')
from sklearn.preprocessing import StandardScaler
standScaler = StandardScaler()
cols = ['时间']
data[cols] = standScaler.fit_transform(data[cols])
data=data.sort_values(by="时间",ascending=False)
data.index = range(len(data))
X=np.array(data['时间'][2:38]).reshape(-1,1)  # positional slice: rows 2..37
X=np.concatenate([X**3,X**2,X],axis=-1)  # hand-built polynomial features: x^3, x^2, x
# .loc slicing is end-inclusive, so 2:37 yields the same 36 rows as X
y=data.loc[2:37,['灭鼠杀虫剂','电蚊香套装','盘香灭蟑香蚊香盘','蚊香加热器','蚊香液','蚊香片','防霉防蛀片']]
y=np.array(y)
lr = LinearRegression()
lr.fit(X, y)
print("Model coefficients: ", lr.coef_)
print("Model intercepts: ", lr.intercept_)

f(x) = w1*x^3 + w2*x^2 + w3*x + b
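
Stacking the x^3, x^2, x columns by hand, as done above, is what sklearn's PolynomialFeatures automates. A sketch of the equivalent preprocessing (note it emits the columns in ascending order x, x^2, x^3):

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(np.array(data['时间'][2:38]).reshape(-1,1))  # columns: x, x^2, x^3
lr2 = LinearRegression().fit(X_poly, y)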

Ridge Regression

from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
data = load_boston()
train_data,test_data,train_target,test_target = train_test_split(data.data,data.target,test_size=0.2)
transfer = StandardScaler()
train_data = transfer.fit_transform(train_data)
test_data = transfer.transform(test_data)  # reuse the scaler fitted on the training set
ri=Ridge(alpha=1)
ri.fit(train_data,train_target)
y_predict = ri.predict(test_data)
print("Predicted values: ", y_predict)
print("Ground-truth values: ", test_target)
print("Model coefficients: ", ri.coef_)
print("Model intercept: ", ri.intercept_)
error = mean_squared_error(test_target, y_predict)
print("Mean squared error: ", error)
from sklearn.metrics import r2_score  # goodness of fit
print(r2_score(test_target,ri.predict(test_data)))
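
The alpha=1 above is an arbitrary choice; RidgeCV selects the regularization strength by cross-validation. A minimal sketch on the same standardized split:

from sklearn.linear_model import RidgeCV
ri_cv = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(train_data,train_target)
print("best alpha:", ri_cv.alpha_)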

Lasso Regression

from sklearn.datasets import load_boston
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
data = load_boston()
train_data,test_data,train_target,test_target = train_test_split(data.data,data.target,test_size=0.2)
transfer = StandardScaler()
train_data = transfer.fit_transform(train_data)
test_data = transfer.transform(test_data)  # reuse the scaler fitted on the training set
la=Lasso()
la.fit(train_data,train_target)
y_predict = la.predict(test_data)
print("Predicted values: ", y_predict)
print("Ground-truth values: ", test_target)
print("Model coefficients: ", la.coef_)
print("Model intercept: ", la.intercept_)
error = mean_squared_error(test_target, y_predict)
print("Mean squared error: ", error)
from sklearn.metrics import r2_score  # goodness of fit
print(r2_score(test_target,la.predict(test_data)))
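
Unlike Ridge, Lasso drives some coefficients exactly to zero, which doubles as feature selection; a quick check on the model fitted above:

import numpy as np
print("non-zero coefficients:", np.sum(la.coef_ != 0), "of", la.coef_.size)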

Gradient Descent

# gradient descent
from sklearn.datasets import load_boston
data = load_boston()
# split into training and test sets
from sklearn.model_selection import train_test_split
train_data,test_data,train_target,test_target = train_test_split(data.data,data.target,test_size=0.5)
from sklearn.preprocessing import StandardScaler
transfer = StandardScaler()
train_data = transfer.fit_transform(train_data)
test_data = transfer.transform(test_data)  # reuse the scaler fitted on the training set
from sklearn.linear_model import SGDRegressor
sdg=SGDRegressor(n_iter_no_change=15)  # stop once the score has not improved for 15 epochs
sdg.fit(train_data,train_target)
y_predict = sdg.predict(test_data)
print("Predicted values: ", y_predict)
print("Ground-truth values: ", test_target)
print("Model coefficients: ", sdg.coef_)
print("Model intercept: ", sdg.intercept_)
from sklearn.metrics import mean_squared_error
error = mean_squared_error(test_target, y_predict)
print("Mean squared error: ", error)

Grid Search with Cross-Validation

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# 1. load the data and split into train/test sets
ld = load_iris()
train_data,test_data,train_target,test_target = train_test_split(ld.data,ld.target,test_size=0.2)
# 2. feature engineering: standardization
from sklearn.preprocessing import StandardScaler
transfer = StandardScaler()
train_data = transfer.fit_transform(train_data)
test_data = transfer.transform(test_data)
# 3. the KNN estimator pipeline
# 3.1 instantiate the estimator
from sklearn.model_selection import GridSearchCV
estimator = KNeighborsClassifier()
# 3.2 model selection and tuning: grid search with cross-validation
# candidate hyperparameter values
param_dict = {"n_neighbors": [3, 4, 5, 6]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=4)
# 3.3 fit on the training data
estimator.fit(train_data, train_target)
# 4. evaluate the model
# method a: compare predictions with the ground truth
y_predict = estimator.predict(test_data)
print("Predictions vs ground truth:\n", y_predict == test_target)
print("Best score found in cross-validation:\n", estimator.best_score_)
print("Best estimator:\n", estimator.best_estimator_)
print("Cross-validation results per candidate:\n", estimator.cv_results_)

Oversampling and Undersampling

# oversampling (reuses a train/test split from the earlier sections)
from imblearn.over_sampling import SMOTE
from pandas import Series,DataFrame
smo = SMOTE(random_state=123)
# wrap the labels in a Series to call value_counts(),
# since numpy.ndarray has no value_counts attribute:
# Series(train_target).value_counts()
# note: fit_sample was renamed to fit_resample in imbalanced-learn 0.4
X_train_new,y_train_new = smo.fit_resample(train_data,train_target)
Series(y_train_new).value_counts()
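
The heading also promises undersampling, which the snippet above does not show. A minimal sketch with imblearn's RandomUnderSampler, which exposes the same fit_resample interface:

from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=123)
X_train_under,y_train_under = rus.fit_resample(train_data,train_target)
print(Series(y_train_under).value_counts())  # every class downsampled to the minority count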

Random Forest

# reuses the train/test split from the earlier sections
from sklearn.ensemble import RandomForestClassifier
clfs = RandomForestClassifier()
clfs.fit(train_data,train_target)
clfs.predict(test_data)
from sklearn.metrics import f1_score
print(f1_score(test_target,clfs.predict(test_data),average='macro'))
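
Random forests also expose per-feature importances, useful for interpreting the model; a sketch assuming clfs was fitted on the wine split from the earlier sections:

import numpy as np
importances = clfs.feature_importances_
order = np.argsort(importances)[::-1]
for i in order[:5]:  # the five most informative wine features
    print(lw.feature_names[i], round(importances[i], 3))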

GBDT

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier()
# credit2 is a DataFrame loaded elsewhere: feature columns plus a 'Class' label column
X_train = credit2.iloc[:,:-1]
y_train = credit2['Class']
clf.fit(X_train,y_train)
# per-feature importances learned by the ensemble
feature_importances_ = clf.feature_importances_
cols = X_train.columns
# sort the features by importance, largest first
index = feature_importances_.argsort()[::-1]
plt.figure(figsize=(12,9))
plt.bar(np.arange(len(index)),feature_importances_[index])
_ = plt.xticks(np.arange(len(index)),cols[index])

Time Series Model

import pandas as pd
import numpy as np
data=pd.read_excel('a.xls',index_col='日期')
data['销量']=data['销量'].astype(float)
from matplotlib import style
style.use('ggplot')
# time-series plot
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly
data.plot()
plt.show()
# autocorrelation plot
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data,lags=35).show()
# stationarity test
from statsmodels.tsa.stattools import adfuller as ADF
print('ADF test on the raw series:', ADF(data['销量']))
# first-order differencing
D_data = data.diff().dropna()
D_data.columns = ['销量差分']
D_data.plot()  # time-series plot of the differenced series
plt.show()
plot_acf(D_data).show()  # autocorrelation plot
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show()  # partial autocorrelation plot
print('ADF test on the differenced series:', ADF(D_data['销量差分']))  # stationarity test
# white-noise test
from statsmodels.stats.diagnostic import acorr_ljungbox
print('Ljung-Box white-noise test on the differenced series:', acorr_ljungbox(D_data, lags=1))  # test statistic and p-value
pmax = int(len(D_data)/10)  # the order rarely needs to exceed length/10
qmax = int(len(D_data)/10)  # the order rarely needs to exceed length/10
# note: statsmodels.tsa.arima_model.ARIMA was removed in statsmodels 0.13;
# use statsmodels.tsa.arima.model.ARIMA with the order= keyword instead
from statsmodels.tsa.arima.model import ARIMA
bic_matrix = []  # matrix of BIC values
for p in range(pmax+1):
    tmp = []
    for q in range(qmax+1):
        try:
            tmp.append(ARIMA(data, order=(p,1,q)).fit().bic)
        except Exception:
            tmp.append(None)
    bic_matrix.append(tmp)
bic_matrix = pd.DataFrame(bic_matrix)  # search it for the minimum
p,q = bic_matrix.stack().astype(float).idxmin()
# stack() flattens the matrix, then idxmin() locates the minimum
print('p and q with the smallest BIC: %s, %s' %(p,q))
model = ARIMA(data, order=(p,1,q)).fit()  # fit the ARIMA(p, 1, q) model with the optimal orders
model.summary2()  # model report
model.forecast(5)  # 5-step-ahead forecast
resid = model.resid  # residual diagnostics
acorr_ljungbox(resid, lags=1)
