1. Graphical Representations for Data Visualization
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
plt.rcParams['font.family'] = 'Microsoft YaHei'  # font capable of rendering CJK characters in plots
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
data = pd.read_csv("../data/肺活量与其他指标的数据.txt", sep='\t')
# (1) Scatter plots of OXY vs. time and OXY vs. age
plt.scatter(data["OXY"], data["time"], label="OXY vs. time")
plt.scatter(data["OXY"], data["age"], label="OXY vs. age")
plt.legend()
plt.grid(True)
plt.show()
# (2) Scatter-plot matrix of the seven indicators
pd.plotting.scatter_matrix(data)
plt.show()
# (3) Parallel-coordinates (profile) plot
plt.figure()
data1 = data.loc[[0, 1, 20, 21]]  # pick four observations
parallel_coordinates(data1, "No")
plt.show()
# (4) Radar chart
data1 = data1.set_index("No")  # use the observation ID as the index
labels = data1.columns.values  # feature names
kinds = list(data1.index)  # observation IDs
data1 = pd.concat([data1, data1[[labels[0]]]], axis=1)  # repeat the first column so the radar closes
contents = np.array(data1)
nAttr = len(labels)
angle = np.linspace(0, 2 * np.pi, nAttr, endpoint=False)  # evenly spaced spokes
angle = np.concatenate((angle, [angle[0]]))  # close the polygon
labels = np.concatenate((labels, [labels[0]]))  # close the label ring
fig = plt.figure()
ax = fig.add_subplot(111, polar=True)
for i in range(len(kinds)):
    ax.plot(angle, contents[i], linewidth=1, label=kinds[i])
    ax.fill(angle, contents[i], alpha=0.2)
ax.set_thetagrids(angle * 180 / np.pi, labels)
plt.legend()
plt.show()
# (5) Andrews (harmonic) curves
pd.plotting.andrews_curves(data.loc[[0, 1, 20, 21]], "No")
plt.show()
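For reference, andrews_curves maps each observation x = (x1, ..., xp) to the finite Fourier series f(t) = x1/sqrt(2) + x2*sin(t) + x3*cos(t) + x4*sin(2t) + ... on [-pi, pi]. A minimal sketch of that mapping for a single observation (the helper andrews_curve below is illustrative, not part of pandas):
def andrews_curve(x, t):
    # f(t) = x1/sqrt(2) + x2*sin(t) + x3*cos(t) + x4*sin(2t) + ...
    result = np.full_like(t, x[0] / np.sqrt(2))
    for k, xk in enumerate(x[1:], start=1):
        harmonic = (k + 1) // 2  # sin and cos alternate over increasing frequencies
        result += xk * (np.sin(harmonic * t) if k % 2 == 1 else np.cos(harmonic * t))
    return result

t = np.linspace(-np.pi, np.pi, 200)
row = data.drop(columns="No").iloc[0].to_numpy()  # features of the first observation
plt.plot(t, andrews_curve(row, t))
plt.show()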
2. Multiple Linear Regression
The model yields regression coefficients, the coefficient of determination, predicted values, residuals, and so on.
Problem: predicting house prices with the BostonHousing data
The file also records location variables such as lon (longitude of the census tract) and lat (latitude of the census tract).
We take cmedv (the corrected median value of owner-occupied homes) as the response and crim, zn, indus, nox, rm, age, dis, rad, tax, ptratio, b, lstat as the twelve predictors (see the BostonHousing2.csv file for the data).
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
"""------------------------------------------------------------------------------------------------------------------"""
dicts = {"x1": [0, 1, 2, 3, 4],
         "x2": [-1, -1, 2, 3, 2],
         "y": [1, 4, 3, 8, 9]}
data = pd.DataFrame.from_dict(dicts)
x_exam = data[["x1", "x2"]]
y_exam = data["y"]
model = LinearRegression()
model.fit(x_exam, y_exam)
print(model.coef_)  # regression coefficients
print(model.intercept_)  # intercept
# print(model.score(x_exam, y_exam))  # coefficient of determination (R^2)
print(model.predict(x_exam))  # fitted values of y
print(y_exam - model.predict(x_exam))  # residuals
print(sum((y_exam - model.predict(x_exam)) ** 2))  # residual sum of squares
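As a sanity check, the same estimates can be recovered from the least-squares problem directly with numpy; a minimal sketch (X_design is a hypothetical name for the design matrix with an intercept column prepended):
X_design = np.column_stack([np.ones(len(x_exam)), x_exam])  # prepend a column of ones for the intercept
beta, *_ = np.linalg.lstsq(X_design, y_exam, rcond=None)  # minimize ||X_design @ beta - y||^2
print(beta)  # [intercept, coef of x1, coef of x2] -- should match model.intercept_ and model.coef_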
"""------------------------------------------------------------------------------------------------------------------"""
data_BostonHousing = pd.read_csv("../data/BostonHousing.csv")
print(data_BostonHousing)
data_handle = data_BostonHousing[
["cmedv", "crim", "zn", "indus", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "b", "lstat"]]
pd.plotting.scatter_matrix(data_handle) # 2-1
plt.show()
"""------------------------------------------------------------------------------------------------------------------"""
examDf = pd.DataFrame(data_handle)
exam_X = examDf[["crim", "zn", "indus", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "b", "lstat"]]
exam_Y = examDf[["cmedv"]]
# model = Ridge(alpha=0.5, fit_intercept=True)
# Use RidgeCV to pick the best regularization strength by cross-validation
model = RidgeCV(alphas=[0.1, 1.0, 10.0])
model.fit(exam_X, exam_Y)
print(model.score(exam_X, exam_Y)) # 2-2
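After fitting, RidgeCV exposes the regularization strength it selected as alpha_, which is worth printing next to the score:
print(model.alpha_)  # the alpha chosen from [0.1, 1.0, 10.0] by cross-validation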
"""------------------------------------------------------------------------------------------------------------------"""
init = np.random.randint(2, 13)  # random number of predictors to draw
columns = np.random.choice(["crim", "zn", "indus", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "b", "lstat"],
                           size=init, replace=False)
random_X = examDf[columns]
# random_model = Ridge(alpha=0.5, fit_intercept=True)
# Draw n predictors at random and compute the R^2 of the model selected by
# ten-fold cross-validation (hence cv=10).
random_model = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=10)
random_model.fit(random_X, exam_Y)
print(random_model.score(random_X, exam_Y)) # 2-3
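Since the predictor subset is drawn at random, the R^2 changes from run to run. A small loop (illustrative only; it reuses the predictor list above) makes that variability visible:
predictors = ["crim", "zn", "indus", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "b", "lstat"]
for _ in range(5):
    k = np.random.randint(2, 13)  # random subset size
    cols = list(np.random.choice(predictors, size=k, replace=False))
    m = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=10).fit(examDf[cols], exam_Y)
    print(k, round(m.score(examDf[cols], exam_Y), 4))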
3. Fisher Linear Discriminant Analysis
Problems: basic LDA modeling, modeling high-dimensional data, and checking when the model is appropriate
1. Split the dataset (pendigits) into a training set and a test set in a given proportion and estimate the classification accuracy of the fitted model.
2. Model the LDA-sparse_data dataset with ten-fold cross-validation and the Fisher linear discriminant criterion, and compare the model obtained when the shrinkage parameter is set explicitly with the one obtained when it is left unspecified.
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import discriminant_analysis
import matplotlib.pyplot as plt
import matplotlib
"""------------------------------------------------------------------------------------------------------------------"""
pd_data = pd.read_csv("../data/pendigits.csv")
# print(pd_data)
exam_columns = pd_data.columns
# print(exam_columns)
x_exam = pd_data[exam_columns[:16]]
y_exam = pd_data[exam_columns[-1]]
# print(x_exam)
# print(y_exam)
x_train, x_test, y_train, y_test = train_test_split(x_exam, y_exam, train_size=1 / 6)  # split into training and test sets
# print(y_train)
lda = discriminant_analysis.LinearDiscriminantAnalysis()
lda.fit(x_train, y_train)
# print('Coefficients:%s, intercept %s' % (lda.coef_, lda.intercept_))  # weight vectors and intercepts
print('Score: %.2f' % lda.score(x_test, y_test))  # classification accuracy on the test set
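Beyond the overall accuracy, a confusion matrix shows which digits get confused with which; a short sketch using sklearn.metrics on the fitted model above:
from sklearn.metrics import confusion_matrix
y_pred = lda.predict(x_test)
print(confusion_matrix(y_test, y_pred))  # rows: true digit, columns: predicted digit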
"""------------------------------------------------------------------------------------------------------------------"""
data_lda = pd.read_excel("../data/LDA-sparse_data.xlsx", sheet_name="Sheet1")
# print(data_lda)
lda_columns = data_lda.columns
X_lda = data_lda[lda_columns[:698]]  # the 698 predictor columns
y_lda = data_lda[lda_columns[-1]]  # the class label
# Fisher linear discriminant with the shrinkage parameter specified ...
lda_2 = discriminant_analysis.LinearDiscriminantAnalysis(solver="eigen", shrinkage=0.3)
# ... and the same model with shrinkage left unspecified; on sparse
# high-dimensional data the within-class covariance estimate may be
# ill-conditioned without shrinkage.
lda_2_plain = discriminant_analysis.LinearDiscriminantAnalysis(solver="eigen")
# Ten-fold cross-validated accuracy for both variants.
print(cross_val_score(lda_2, X_lda, y_lda, cv=10).mean())
print(cross_val_score(lda_2_plain, X_lda, y_lda, cv=10).mean())
# lda_2.fit(X_lda, y_lda); print(lda_2.coef_)  # weight vectors, if needed
"""------------------------------------------------------------------------------------------------------------------"""
matplotlib.rcParams['font.family'] = 'Microsoft YaHei'
matplotlib.rcParams['font.sans-serif'] = ['Microsoft YaHei']
data_ban = pd.read_table("../data/banana.dat", sep=",")
plt.scatter(data_ban["At1"], data_ban["Class"], label="At1")
plt.scatter(data_ban["At2"], data_ban["Class"], label="At2")
plt.legend()
plt.show()
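Plotting At1 against At2 and coloring by class makes the point more directly: the two classes interlock in banana-shaped regions that no single linear boundary can separate, which is why Fisher LDA is a poor fit here. A minimal sketch:
for cls, group in data_ban.groupby("Class"):
    plt.scatter(group["At1"], group["At2"], s=5, label=f"Class {cls}")
plt.xlabel("At1")
plt.ylabel("At2")
plt.legend()
plt.show()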
4. Hierarchical Clustering
Problem: World Bank sample dataset
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from matplotlib import pyplot as plt
def data_norm(df, *cols):
    # Min-max normalize each of the given columns, returning a copy.
    df_n = df.copy()
    for col in cols:
        ma = df[col].max()
        mi = df[col].min()
        df_n[col] = (df[col] - mi) / (ma - mi)
    return df_n
data = pd.read_csv("../data/WBClust2013.csv", index_col=0)
X = data_norm(data, *data.columns)  # unpack so each column is normalized separately
Z = linkage(X, "ward")
f = fcluster(Z, 4, criterion="maxclust")  # cut the dendrogram into 4 flat clusters
fig = plt.figure()
dn = dendrogram(Z, labels=data.index)
plt.show()
data_NASA = pd.read_csv("../data/NASAUnderstory.csv", index_col=0)
X_NASA = data_norm(data_NASA, *data_NASA.columns)
Z_NASA = linkage(X_NASA, "ward")
f_NASA = fcluster(Z_NASA, 4, criterion="maxclust")
fig_NASA = plt.figure()
dn_NASA = dendrogram(Z_NASA, labels=data_NASA.index)
plt.show()
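The flat labels from fcluster can be lined up with the row index to get a readable membership table; a minimal sketch for the World Bank data (the same pattern works for f_NASA):
membership = pd.Series(f, index=data.index, name="cluster")
for cluster_id, members in membership.groupby(membership):
    print(cluster_id, list(members.index))  # cluster label followed by its members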
5. Principal Component Analysis (PCA)
Problem: dimensionality reduction on semiconductor data, with a scree plot
Reduce the dimensionality of the features in the secom.data dataset.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.decomposition as dp
from sklearn import decomposition
from sklearn import preprocessing
secom = pd.read_table("../data/secom.data", header=None, sep=" ")
for column in list(secom.columns[secom.isnull().sum() > 0]):
    mean_val = secom[column].mean()
    secom[column] = secom[column].fillna(mean_val)  # impute missing values with the column mean
pca = decomposition.PCA(n_components=0.9)  # keep enough components for 90% of the variance
reduced_x = pca.fit_transform(secom)  # fit and project in one step
# Plot the scores of the first three principal components, each sorted in descending order.
a = sorted(reduced_x[:, 0], reverse=True)
b = sorted(reduced_x[:, 1], reverse=True)
c = sorted(reduced_x[:, 2], reverse=True)
plt.scatter(range(len(reduced_x)), a)
plt.scatter(range(len(reduced_x)), b)
plt.scatter(range(len(reduced_x)), c)
plt.show()
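The problem statement also asks for a scree plot; for the fitted PCA above that is just the per-component explained-variance ratios in order, e.g.:
plt.plot(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_, marker="o")
plt.xlabel("Principal component")
plt.ylabel("Explained variance ratio")
plt.show()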
"""------------------------------------------------------------------------------------------------------------------"""
raw_data = pd.read_csv("../data/AAUP.csv")
data = preprocessing.scale(raw_data)  # standardize each variable to mean 0, variance 1
cov_matrix = np.cov(data.T)  # np.cov treats rows as variables, so transpose first
# For standardized data the covariance matrix equals the correlation matrix.
eig_val, eig_vec = np.linalg.eigh(cov_matrix)  # eigenvalues and eigenvectors
p = data.shape[1]  # dimensionality of the original data
# Sort the eigenvalues (and their eigenvectors) in descending order.
idx = np.argsort(eig_val)  # ascending order of eigenvalues
idx = idx[::-1]  # reverse to descending order
eig_vec = eig_vec[:, idx]  # eigenvectors: the weight vectors of the principal components
eig_val = eig_val[idx]  # eigenvalues: the variance carried by each component
contribution = np.cumsum(eig_val) / np.sum(eig_val)  # cumulative contribution ratio
############ Scree plot (cumulative) ############
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.plot(range(1, p + 1), contribution)
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative contribution ratio")
plt.show()
##################################################
columns = ["PCA" + str(i) for i in range(1, len(eig_val) + 1)]
pca_vector = pd.DataFrame(eig_vec, columns=columns)
New_data = np.dot(data, pca_vector.iloc[:, :3])  # project onto the first three components
pca = dp.PCA(n_components=0.9)  # sklearn equivalent, for cross-checking
reduced_x = pca.fit_transform(data)
# critical_value = 0.8
# pca_number = np.argmax(contribution >= critical_value)  # smallest k reaching the threshold
# pca.components_
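Both routes compute the same spectrum, so their cumulative contribution ratios should agree up to floating-point error (assuming the 0.9 threshold retains at least three components); a quick check:
print(contribution[:3])  # manual eigendecomposition
print(np.cumsum(pca.explained_variance_ratio_)[:3])  # sklearn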
"""------------------------------------------------------------------------------------------------------------------"""
raw_data = pd.read_csv("../data/secom.data", sep=" ", header=None)
def fill_value(x):
    # Replace missing entries of a column with that column's mean.
    idx = x.isnull()
    x[idx] = x[~idx].mean()
    return x
clear_data = raw_data.apply(fill_value, axis=0)
pca = dp.PCA(n_components=0.95)  # keep components up to a 95% cumulative contribution
reduced_x = pca.fit_transform(clear_data)  # reduce the dimensionality
print(pca.explained_variance_ratio_)  # variance ratio explained by each component
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative contribution ratio")
plt.plot(range(1, pca.n_components_ + 1), np.cumsum(pca.explained_variance_ratio_))
plt.show()
condense_ratio = 1 - pca.n_components_ / clear_data.shape[1]  # shape[1] = number of original features
print(f"Compression ratio of the original data: {condense_ratio:.2%}")
6. Canonical Correlation Analysis
Problem: faculty data (AAUP)
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from sklearn.cross_decomposition import CCA
w = pd.read_csv(r"../data/FullAAUP.csv")
u = w[w.columns[4:]]  # drop the identifier columns
X = np.array(u[u.columns[:8]])  # first set of variables
Y = np.array(u[u.columns[8:]])  # second set of variables
cc = CCA(3)  # extract three pairs of canonical variates
cc.fit(X, Y)
X_c, Y_c = cc.transform(X, Y)  # canonical scores on the training data
print(cc.x_weights_)
print(cc.y_weights_)
print(cc.x_loadings_)
print(cc.y_loadings_)
print(X_c)  # canonical scores of X (equivalent to the deprecated cc.x_scores_)
print(Y_c)  # canonical scores of Y
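The canonical correlations themselves are simply the correlations between paired columns of the transformed scores; a minimal sketch:
for i in range(3):
    r = np.corrcoef(X_c[:, i], Y_c[:, i])[0, 1]
    print(f"canonical correlation {i + 1}: {r:.4f}")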