Preface: With final exams mostly over, I have put together the applications of several machine-learning algorithms (called through sklearn) as a more complete knowledge base for the MCM/ICM in February. Even without understanding the underlying theory, knowing each algorithm's typical use cases and how to tune its parameters is enough to be genuinely useful on Problem C of the contest.
Prerequisites: the numpy, pandas, and sklearn libraries; ideally also take a look at scipy and seaborn.
Note: most of the example code below was distilled from real project code, so the data-preprocessing steps are not shown, and detailed parameter tuning should be looked up in the official documentation at https://scikit-learn.org/stable/. This article simply applies the methods provided by sklearn and is for reference only. Also, remember to standardize your features before feeding them into a model!
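Since standardization matters so much, here is a minimal sketch of that preprocessing step with sklearn's StandardScaler; the DataFrame pd_df and its column names are placeholders for your own data.
import pandas as pd
from sklearn.preprocessing import StandardScaler
# pd_df and its columns are placeholders; substitute your own feature columns
features = pd_df[['feature_a', 'feature_b']].values
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)  # zero mean and unit variance per feature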
K-means is the most commonly used clustering method; whenever clustering is needed it is worth trying first and inspecting the resulting clusters. Ideally, choose the best number of clusters with the elbow method (elbow-method code is given later in this article).
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
# K-means clustering with 4 clusters
model = KMeans(n_clusters=4)
model.fit(new_pd_shutdown_df)
# Print the cluster labels and the cluster centers
print(model.labels_)
print(model.cluster_centers_)
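One quick way to check how good the clustering looks, as mentioned above, is the silhouette coefficient; a minimal sketch reusing the fitted model and the same data:
from sklearn.metrics import silhouette_score
# Values close to 1 indicate well-separated clusters; values near 0 indicate overlapping clusters
print(silhouette_score(new_pd_shutdown_df, model.labels_))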
Hierarchical clustering builds the clusters level by level, and the final result is presented as a dendrogram, which is quite intuitive: you simply cut the dendrogram to split the data into the desired number of classes.
This can be done directly in SPSS (a Python sketch with scipy is also given below).
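If you prefer to stay in Python, here is a minimal sketch using scipy's hierarchical-clustering utilities; the feature matrix X is a placeholder and ward linkage is just one common choice.
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
# X is a placeholder feature matrix (n_samples x n_features), already standardized
Z = linkage(X, method='ward')   # build the hierarchical tree
dendrogram(Z)                   # visualize it as a dendrogram
plt.show()
labels = fcluster(Z, t=4, criterion='maxclust')  # cut the tree into 4 clusters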
DBSCAN handles density-based clustering. For example, in Problem C of the 2020 小美赛 contest, the latitude/longitude of fire locations together with the time can be clustered with DBSCAN, so that multiple measurements of the same fire spot are grouped into a single fire event.
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
# DBSCAN clustering
model = DBSCAN(eps=0.0008, min_samples=8)
model.fit(db_test_data)
# Get the clustering result: result is a 1-D array such as array([0, 0, -1, ..., n]); each element is the cluster index of the corresponding row of the dataset, -1 means the point was treated as noise (not assigned to any cluster), 0 is the first cluster, and label n is the (n+1)-th cluster
# (fit_predict refits the model, so the fit call above could be dropped; the same labels are also available via model.labels_)
result = model.fit_predict(db_test_data)
Sometimes it is hard to settle on a single well-suited model; in that case fuzzy C-means (FCM) clustering, which is natural and non-probabilistic, can be used. It incorporates the essence of fuzzy-set theory and yields more flexible clustering results, since each sample receives a degree of membership in every cluster instead of a single hard label.
from skfuzzy.cluster import cmeans
import pandas as pd
import numpy as np
# Prepare the data: select the required features from the full dataset and convert them into the format that cmeans expects (features as rows, samples as columns, hence the transpose below)
for_kmean_pd=pd_dataframe_all[['number','called_city_dispersion','called_dispersion','calling_count_standard','calling_roaming_city_count_standard']]
data=for_kmean_pd[['called_city_dispersion','called_dispersion','calling_count_standard','calling_roaming_city_count_standard']].T
# Run FCM clustering; it returns the cluster centers, the membership matrix u, and several diagnostics (including the fuzzy partition coefficient fpc)
center, u, u0, d, jm, p, fpc = cmeans(data, m=2, c=4, error=0.005, maxiter=1000)
# Get the hard label of each sample (the cluster with the highest membership) and count the size of each cluster
label = np.argmax(u, axis=0)
dict_label = {}
for i in label:
    if dict_label.get(i) is None:
        dict_label[i] = 1
    else:
        dict_label[i] = dict_label.get(i) + 1
print(dict_label)
# Collect the row indices belonging to each cluster
index_0 = []
index_1 = []
index_2 = []
index_3 = []
for i in range(len(label)):
    if label[i] == 0:
        index_0.append(i)
    elif label[i] == 1:
        index_1.append(i)
    elif label[i] == 2:
        index_2.append(i)
    elif label[i] == 3:
        index_3.append(i)
# Inspect the feature distribution of each cluster (cluster 0 shown here)
nums_pd_0 = pd_dataframe_all.iloc[index_0, :]
nums_pd_0.describe()
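The last value returned by cmeans, fpc (the fuzzy partition coefficient, closer to 1 meaning a crisper partition), is not used above; a minimal sketch of using it to compare different cluster counts c:
for c_try in range(2, 8):
    # re-run FCM with a different number of clusters and compare the partition coefficients
    _, _, _, _, _, _, fpc_try = cmeans(data, m=2, c=c_try, error=0.005, maxiter=1000)
    print(c_try, fpc_try)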
Spectral clustering: for the theory, see https://www.cnblogs.com/pinard/p/6235920.html; a short sklearn sketch follows.
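A minimal sketch with sklearn's SpectralClustering; X is a placeholder standardized feature matrix and the parameter values are purely illustrative.
from sklearn.cluster import SpectralClustering
# X is a placeholder feature matrix; the RBF kernel builds the similarity graph
model = SpectralClustering(n_clusters=4, affinity='rbf', gamma=1.0)
labels = model.fit_predict(X)
print(labels)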
Weighted k-means (W-k-means) clustering [an algorithm proposed by Prof. Zhexue Huang (黄哲学) of the big-data lab at Shenzhen University]
import numpy as np
import random
import math
from sklearn.datasets import make_blobs
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import KMeans
def InitCentroids(X, K):
    n = np.size(X, 0)
    rands_index = np.array(random.sample(range(n), K))  # pick K distinct samples as the initial centroids
    centroid = X[rands_index, :]
    return centroid

def findClosestCentroids(X, w, centroids):
    K = np.size(centroids, 0)
    idx = np.zeros((np.size(X, 0)), dtype=int)
    n = X.shape[0]  # n is the number of samples
    for i in range(n):
        subs = centroids - X[i, :]
        dimension2 = np.power(subs, 2)
        w_dimension2 = np.multiply(w, dimension2)  # weight each squared difference by the feature weight
        w_distance2 = np.sum(w_dimension2, axis=1)
        if math.isnan(w_distance2.sum()) or math.isinf(w_distance2.sum()):
            w_distance2 = np.zeros(K)  # guard against nan/inf weighted distances
        idx[i] = np.where(w_distance2 == w_distance2.min())[0][0]
    return idx
def computeCentroids(X, idx, K):
    n, m = X.shape
    centroid = np.zeros((K, m), dtype=float)
    for k in range(K):
        index = np.where(idx == k)[0]  # handle one cluster at a time
        temp = X[index, :]             # all samples of cluster k (? by m)
        s = np.sum(temp, axis=0)
        centroid[k, :] = s / np.size(index)
    return centroid

def computeWeight(X, centroid, idx, K, belta):
    n, m = X.shape
    weight = np.zeros((1, m), dtype=float)
    D = np.zeros((1, m), dtype=float)
    for k in range(K):
        index = np.where(idx == k)[0]
        temp = X[index, :]  # all samples of cluster k
        distance2 = np.power((temp - centroid[k, :]), 2)  # ? by m
        D = D + np.sum(distance2, axis=0)  # per-feature within-cluster dispersion
    e = 1 / float(belta - 1)
    for j in range(m):
        temp = D[0][j] / D[0]
        weight[0][j] = 1 / np.sum((np.power(temp, e)), axis=0)
    print(weight)
    return weight
def wkmeans(X, K, belta=7, max_iter=10):
    n, m = X.shape
    r = np.random.rand(1, m)
    w = np.divide(r, r.sum())  # random initial feature weights that sum to 1
    centroids = InitCentroids(X, K)
    idx = None
    for i in range(max_iter):
        idx = findClosestCentroids(X, w, centroids)
        centroids = computeCentroids(X, idx, K)
        w = computeWeight(X, centroids, idx, K, belta)
    print(centroids)
    return idx
def make_data():
    from sklearn.preprocessing import StandardScaler
    np.random.seed(100)
    centers = [[2.5, 1], [3.8, 1], [3, 2.5]]  # specify the cluster centers
    x, y = make_blobs(n_samples=900, centers=centers, cluster_std=0.35, random_state=200)
    noise = np.reshape(np.sin(2 * x[:, 0] * x[:, 1]), [-1, 1])  # append a nonlinear noise feature
    x_noise = np.hstack([x, noise])
    ss = StandardScaler()
    x_noise = ss.fit_transform(x_noise)
    return x, y, x_noise

if __name__ == '__main__':
    x, y, x_noise = make_data()
    y_pred = wkmeans(x, 3, belta=3)
    nmi = normalized_mutual_info_score(y, y_pred)
    print("NMI without noise: ", nmi)
    y_pred = wkmeans(x_noise, 3, belta=3)
    nmi = normalized_mutual_info_score(y, y_pred)
    print("NMI with noise : ", nmi)
# Elbow method, using k-means as an example
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
def zhoubu(x):  # x is the data matrix to be clustered
    # Sum stores the elbow-method criterion (within-cluster sum of squared errors);
    # 10 is the number of cluster counts tested, namely 7 to 16
    Sum = np.zeros(10)
    for i in range(7, 17):  # run k-means with 7 to 16 clusters
        kmeans = KMeans(n_clusters=i).fit(x)
        m = kmeans.labels_            # label of each sample
        c = kmeans.cluster_centers_   # center of each cluster
        for j in range(len(x)):
            c1 = c[m[j]]  # center of the cluster that sample j belongs to
            x1 = x[j]     # coordinates of sample j
            Sum[i - 7] = Sum[i - 7] + sum((x1 - c1) ** 2)  # accumulate the squared error
    plt.plot(np.arange(7, 17), Sum)  # plot error versus number of clusters and look for the "elbow"
    plt.xticks(np.arange(7, 17))
    return Sum
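A minimal usage sketch, assuming X is a placeholder standardized feature matrix: call the function, show the curve, and pick the number of clusters where the error stops dropping sharply (the "elbow").
sse = zhoubu(X)  # X is a placeholder for your own data
plt.show()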
The classifiers below are illustrated with a binary-classification experiment.
# Gaussian naive Bayes as an example; the other two variants are MultinomialNB and BernoulliNB
import pandas as pd
from sklearn.naive_bayes import GaussianNB  # Gaussian naive Bayes
from sklearn.model_selection import train_test_split
x = pd_df[["calling_count","called_dispersion","avg_call_time","short_call_time_rate_5s","shortcall_rate_10s","shortcall_rate_30s" ,"long_call_rate_200s","long_call_rate_600s","user_answer_rate","max_call_time","ring_one_time_rate"]].values
y = pd_df['label'].values  # 1-D label vector
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Create the classifier
clf = GaussianNB()
clf.fit(X_train, y_train)
# Predict
prediction = clf.predict(X_test)
pre = pd.DataFrame(prediction)
print(pre.describe())
y_test_df = pd.DataFrame(y_test)
print(y_test_df.describe())
acc = clf.score(X_test, y_test)
print(acc)
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
x = pd_df[["calling_count","called_dispersion","avg_call_time","short_call_time_rate_5s","shortcall_rate_10s","shortcall_rate_30s" ,"long_call_rate_200s","long_call_rate_600s","user_answer_rate","max_call_time","ring_one_time_rate"]].values
y = pd_df['label'].values  # 1-D label vector
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Create the classifier
clf = KNeighborsClassifier(n_neighbors=2)  # n_neighbors is the number of neighbors voted over, not the number of classes
clf.fit(X_train, y_train)
# Predict
prediction = clf.predict(X_test)
pre = pd.DataFrame(prediction)
print(pre.describe())
y_test_df = pd.DataFrame(y_test)
print(y_test_df.describe())
acc = clf.score(X_test, y_test)
print(acc)
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
x = pd_df[["calling_count","called_dispersion","avg_call_time","short_call_time_rate_5s","shortcall_rate_10s","shortcall_rate_30s" ,"long_call_rate_200s","long_call_rate_600s","user_answer_rate","max_call_time","ring_one_time_rate"]].values
y = pd_df['label'].values  # 1-D label vector
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Create the classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Predict
prediction = clf.predict(X_test)
pre = pd.DataFrame(prediction)
print(pre.describe())
y_test_df = pd.DataFrame(y_test)
print(y_test_df.describe())
acc = clf.score(X_test, y_test)
print(acc)
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
x = pd_df[["calling_count","called_dispersion","avg_call_time","short_call_time_rate_5s","shortcall_rate_10s","shortcall_rate_30s" ,"long_call_rate_200s","long_call_rate_600s","user_answer_rate","max_call_time","ring_one_time_rate"]].values
y = pd_df['label'].values  # 1-D label vector
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Create the classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
# Predict
prediction = clf.predict(X_test)
pre = pd.DataFrame(prediction)
print(pre.describe())
y_test_df = pd.DataFrame(y_test)
print(y_test_df.describe())
acc = clf.score(X_test, y_test)
print(acc)
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
x = pd_df[["calling_count","called_dispersion","avg_call_time","short_call_time_rate_5s","shortcall_rate_10s","shortcall_rate_30s" ,"long_call_rate_200s","long_call_rate_600s","user_answer_rate","max_call_time","ring_one_time_rate"]].values
y = pd_df['label'].values  # 1-D label vector
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# Create the classifier
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
# Predict
prediction = clf.predict(X_test)
pre = pd.DataFrame(prediction)
print(pre.describe())
y_test_df = pd.DataFrame(y_test)
print(y_test_df.describe())
acc = clf.score(X_test, y_test)
print(acc)
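Accuracy alone can be misleading when the two classes are imbalanced; a minimal sketch of a fuller evaluation, reusing the test labels and predictions from the block above:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision, recall and F1, plus the confusion matrix
print(classification_report(y_test, prediction))
print(confusion_matrix(y_test, prediction))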
import numpy as np
import pandas as pd
import random
from sklearn.tree import DecisionTreeClassifier
data = pd_df
# Train a random forest by hand
M = []  # list of fitted decision-tree models
R = []  # feature-index combination used by each tree
n_trees = 100  # number of trees in the forest
# Train the trees one by one
for i in range(n_trees):
    # Randomly sample rows so that each tree sees a different subset of the data
    sample = data.sample(frac=0.3)
    # Randomly pick k feature columns (excluding the last column, which holds the label)
    k = np.random.randint(1, sample.shape[1])
    r = np.random.choice(range(sample.shape[1] - 1), k, replace=False).tolist()  # replace=False: no repeated features
    X = sample.iloc[:, r]
    Y = sample.iloc[:, -1]  # the label is assumed to be the last column
    # Fit a decision tree on this row/feature subset
    model = DecisionTreeClassifier()
    model.fit(X, Y)
    # Store the model and its feature combination
    M.append(model)
    R.append(r)
    # Evaluate each base model on a fresh random sample
    sample2 = data.sample(frac=0.3)
    X2 = sample2.iloc[:, r]
    Y2 = sample2.iloc[:, -1]
    print('Tree ' + str(i) + ' score =', model.score(X2, Y2))  # print the score of each base model
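The loop above only stores the trees; to classify a new sample, each tree votes on its own feature subset and the majority label wins. In practice sklearn's RandomForestClassifier handles the row sampling, feature sampling, and voting internally; a minimal sketch reusing the same data DataFrame:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
X_all = data.iloc[:, :-1]  # features (all columns except the label)
y_all = data.iloc[:, -1]   # label (assumed to be the last column, as above)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3)
rf = RandomForestClassifier(n_estimators=100)  # 100 trees, matching n_trees above
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))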
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Prepare the data
X1 = np.array(input_data[['spatial_index']])
X2 = np.array(input_data[['time_index']])
y = np.array(input_data['BSTC'])
# Simple (two-variable) linear regression
X = np.hstack((X1, X2))
lin_reg = LinearRegression()
lin_reg.fit(X, y)
y_predict = lin_reg.predict(X)
# Parameters of the fitted linear regression
print(lin_reg.coef_)
print(lin_reg.intercept_)
print(lin_reg.score(X, y))
# Plot the histogram of the prediction errors
y_bias = y - y_predict
y_bias = pd.DataFrame(y_bias)
y_bias.hist(bins=100)
plt.xlabel('prediction error')
plt.ylabel('counts')
plt.title('Binary Linear Regression')
foo_fig = plt.gcf() # 'get current figure'
foo_fig.savefig('Binary Linear Regression.png', format='png', dpi=1000)
# Polynomial regression
from sklearn.linear_model import LinearRegression
X = np.hstack((X1, X2))
X = np.hstack((X,X1**2))
X = np.hstack((X,X1**3))
X = np.hstack((X,X2**2))
X = np.hstack((X,X2**3))
X = np.hstack((X,X2*X1))
X = np.hstack((X,X2**2*X1))
X = np.hstack((X,X1**2*X2))
# Polynomial regression is just ordinary linear regression on the expanded feature matrix
lin_reg2 = LinearRegression()
lin_reg2.fit(X, y)
y_predict2 = lin_reg2.predict(X)
# Parameters and fit quality of the polynomial regression
print(lin_reg2.coef_)
print(lin_reg2.intercept_)
print(lin_reg2.score(X,y))
Building the polynomial feature matrix X by hand like this is a bit tedious; don't worry about it (a less manual alternative is sketched below).
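A less manual alternative is sklearn's PolynomialFeatures, which generates all power and interaction terms up to a chosen degree; with degree=3 it reproduces exactly the terms built by hand above. A minimal sketch (lin_reg3 is a new name introduced here):
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(np.hstack((X1, X2)))  # x1, x2, x1^2, x1*x2, x2^2, x1^3, x1^2*x2, x1*x2^2, x2^3
lin_reg3 = LinearRegression()
lin_reg3.fit(X_poly, y)
print(lin_reg3.score(X_poly, y))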
Reference article: https://www.cnblogs.com/pinard/p/6243025.html
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
from sklearn.datasets import make_blobs
# X holds the sample features and y the cluster labels: 10,000 samples, 3 features each, generated from 4 clusters
X, y = make_blobs(n_samples=10000, n_features=3, centers=[[3,3, 3], [0,0,0], [1,1,1], [2,2,2]], cluster_std=[0.2, 0.1, 0.2, 0.2], random_state=9)
fig = plt.figure()
ax = fig.add_subplot(projection='3d')  # 3-D axes
ax.view_init(elev=30, azim=20)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], marker='o')
# Before reducing dimensionality, project the data onto all three components and look at how the variance is distributed across them
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
# Reduce from 3 dimensions to 2
pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
# Plot the distribution of the transformed (2-D) data
X_new = pca.transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1],marker='o')
plt.show()
# Instead of fixing the target dimensionality directly, specify the proportion of variance the retained principal components must explain
pca = PCA(n_components=0.95)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
print(pca.n_components_)
# Raise the variance threshold to 99%
pca = PCA(n_components=0.99)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
print(pca.n_components_)
# Let PCA pick the dimensionality automatically with the MLE algorithm
pca = PCA(n_components='mle')
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
print(pca.n_components_)
This article is for reference only; corrections and suggestions are very welcome!