数据挖掘 之 水质分析(决策树分类)

1. 数据提取

1.1思路

从训练图像集的水质样本中,提取 RGB 三个通道的一、二、三阶颜色矩信息(代码中分别用各通道的均值、标准差、方差表示,共 9 个特征)

并写入文件:得到如下图所示的 n×9 数据矩阵(n 为样本数,9 为特征数)

数据挖掘 之 水质分析(决策树分类)_第1张图片

1.2重要函数

文件夹读取:dir = os.listdir(imgpath) for filename in dir:
平均值:r_mean = r.mean()
标准差:r_std = r.std()
方差:r_var = r.var()
洗牌函数:shuffle(data)
数据保存:np.save('data9image.npy', data)

1.3代码


from random import shuffle
import cv2
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  #分类树

def _parse_label(filename):
    """Return the integer label encoded before the first underscore, or None.

    File names are expected to look like ``<label>_<anything>``; names without
    an underscore or with a non-integer prefix yield None.
    """
    prefix, sep, _ = filename.partition("_")
    if not sep:
        return None
    try:
        return int(prefix)
    except ValueError:
        return None


def _center_crop(img, size):
    """Return the central ``size`` x ``size`` window of ``img``.

    The start index is clamped at 0 so that an image smaller than ``size``
    yields the whole image instead of a wrapped-around (negative index) slice.
    """
    half = size // 2
    rows, cols = img.shape[0], img.shape[1]
    top = max(rows // 2 - half, 0)
    left = max(cols // 2 - half, 0)
    return img[top:rows // 2 + half, left:cols // 2 + half]


def _color_features(img):
    """Return the 9 colour statistics of a 3-channel image as a float array.

    Order: mean (R, G, B), standard deviation (R, G, B), variance (R, G, B).
    """
    # cv2.imread delivers the channels in B, G, R order, not R, G, B.
    b, g, r = img[:, :, 0], img[:, :, 1], img[:, :, 2]
    channels = (r, g, b)
    means = [channel.mean() for channel in channels]
    stds = [channel.std() for channel in channels]
    variances = [channel.var() for channel in channels]
    return np.array(means + stds + variances)


def feature_extraction(imgpath=r"E:\data\data9\images",
                       out_path='data9image.npy',
                       crop_size=100):
    """Extract colour statistics from every labelled image in a folder and save them.

    Each file in ``imgpath`` named ``<label>_<anything>`` is read with OpenCV,
    cropped to its central ``crop_size`` x ``crop_size`` window and reduced to
    nine numbers (per-channel mean, standard deviation and variance, in
    R, G, B order). The ``[features, label]`` records are shuffled and written
    to ``out_path`` as an object array of shape ``(n_samples, 2)``; reload it
    with ``np.load(out_path, allow_pickle=True)``.

    Files whose name carries no integer label, and files OpenCV cannot decode,
    are skipped.

    Args:
        imgpath: Folder containing the sample images.
        out_path: Destination ``.npy`` file.
        crop_size: Edge length in pixels of the central window that is analysed.

    Returns:
        None. The result is written to ``out_path``.
    """
    data = []
    for filename in os.listdir(imgpath):
        print("----------------------------")
        print(filename)

        # Parse the label first so stray files never reach the image decoder.
        target = _parse_label(filename)
        if target is None:
            print("skipped, no '<label>_' prefix:", filename)
            continue
        filepath = os.path.join(imgpath, filename)
        print(target, filepath)

        scrimg = cv2.imread(filepath)
        if scrimg is None:
            # cv2.imread signals an unreadable file with None rather than raising.
            continue
        print(scrimg.shape)

        featureimg = _center_crop(scrimg, crop_size)
        print(featureimg.shape)
        feature = _color_features(featureimg)
        print(feature)
        data.append([feature, target])

    # Shuffle so that the later train/test split is not ordered by file name.
    shuffle(data)
    print(data)

    # A list of [ndarray, int] pairs is ragged; recent NumPy refuses to convert
    # it implicitly, so build the (n, 2) object array explicitly before saving.
    records = np.empty((len(data), 2), dtype=object)
    for row, (feature, target) in enumerate(data):
        records[row, 0] = feature
        records[row, 1] = target
    np.save(out_path, records)

# Run the feature extraction, which saves its result to disk
feature_extraction()

2. 数据挖掘建模:实现决策树分类

2.1 思路

预测混淆矩阵和预测准确率

2.2重要函数

数据加载:data = np.load('data9image.npy', allow_pickle=True)
训练集分割:X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
分类树(sklearn.tree):model = DecisionTreeClassifier(random_state=5).fit(X_train,y_train)
预测:predict_y = model.predict(X_test)
混淆矩阵(sklearn.metrics):cm = confusion_matrix(y_test,predict_y)
准确率(sklearn.metrics):accuracy_score(y_test,predict_y)

2.3代码

from random import shuffle
import cv2
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  #分类树

# Load the [feature_vector, label] records (presumably the file written by
# feature_extraction() -- same file name).
data = np.load('data9image.npy', allow_pickle=True)
print(data)

# Separate the feature vectors from the labels, then hold out 33% for testing.
X = [record[0] for record in data]
Y = [record[1] for record in data]
print(X,Y)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

# Fit the classification tree on the training part.
model = DecisionTreeClassifier(random_state=5)
model.fit(X_train, y_train)

# Predict the held-out samples and show, element by element, which were correct.
predict_y = model.predict(X_test)
print(predict_y==y_test)

# Evaluation: confusion matrix followed by the overall accuracy.
from sklearn.metrics import accuracy_score, confusion_matrix

cm = confusion_matrix(y_test, predict_y)
print("---------\n混淆矩阵: \n",cm)

accuracy = accuracy_score(y_test, predict_y)
print(accuracy)

3. 空气质量分析(数据清洗、决策树、支持向量机)

3.1思路

1.分析分类数据,分析六个原始的属性(SO2、NO、NO2、NOx、PM10、PM2-5)(数据预处理:缺失值处理,数据探索:箱型图),然后进行筛选(数据规约:相关性分析)

2.数据可视化、模型建立(决策树)、决策树可视化

数据:environment_data.xls

数据挖掘 之 水质分析(决策树分类)_第2张图片

3.2重要函数

读取:data = pd.read_excel(path)
获取属性:name = data.columns.values.tolist()
缺失值处理:data.replace(0, np.nan, inplace=True)   na = data.isnull().sum()  na = na[na!=0]
缺失值处理:data[arr].fillna(data[arr].mean(),inplace=True )
绘制箱型图(胡须图):data.plot(kind = 'box')
相关性分析:cor = np.corrcoef(data[name[:-1]].values.T)
绘制热力图:sns.heatmap(cor, cbar=False, annot=True, square=True, fmt='0.2f', yticklabels=name[:-1], xticklabels=name[:-1])
数据规约:data3d = data.drop(columns=['NO','PM10',"NOx"])
绘制散点:fig = plt.figure(),   ax = Axes3D(fig)
Ascatter = data.loc[data.iloc[:,-1]=="I"]
A_x,A_y,A_z = Ascatter.loc[:,"SO2"],Ascatter.loc[:,"NO2"],Ascatter.loc[:,"PM2-5"]
ax.scatter(A_x, A_y, A_z, c='lightgreen', label="I")
支持向量机:
model = svm.SVC(kernel='linear')   # 也可以换成 kernel="poly" 或 kernel="rbf"
model.fit(X_train, y_train)

3.3代码

# -*- coding: utf-8 -*-
# 开发时间: 2022/10/28 10:08
import numpy as np
import pandas as pd
from random import shuffle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the air-quality spreadsheet.
path = r"E://data//environment_data.xls"
data = pd.read_excel(path)
print(data,type(data))

# Column names; the script treats the last column as the class label.
name = data.columns.values.tolist()
print("属性: ",name)

# Missing-value handling: a reading of 0 is treated as missing.
data.replace(0, np.nan, inplace=True)
na = data.isnull().sum()
na = na[na!=0]
print("缺失值统计:\n",na)
# Fill each affected column with its mean. The result is assigned back to the
# frame: data[arr].fillna(..., inplace=True) acts on an intermediate object,
# which pandas warns about and which leaves `data` unchanged under Copy-on-Write.
for arr in na.index:
    data[arr] = data[arr].fillna(data[arr].mean())
print(data)
print(data.isnull().sum())  # every count should now be zero

# Attribute analysis
# 1. Box plot of every column.
data.plot(kind = 'box')
plt.show()
# 2. Pairwise correlation coefficients of all columns except the last one.
# Author's note: among SO2, NO, NO2, NOx, PM10 and PM2-5, the PM10/PM2-5
# correlation is 94 and the NO/NOx correlation is 90 (presumably 0.94 and 0.90).
cor = np.corrcoef(data[name[:-1]].values.T)
print(cor)
plt.figure(figsize=(5,5))
sns.set(font_scale=1.0)   # font size of the annotations
sns.heatmap(cor, cbar=False, annot=True, square=True, fmt='0.2f', yticklabels=name[:-1], xticklabels=name[:-1])
plt.show()

# Data reduction: drop NO, PM10 and NOx and keep the remaining columns.
data3d = data.drop(columns=['NO','PM10',"NOx"])
# Print the reduced frame (the original printed the untouched `data` here).
print("丢弃完:",data3d)
name3d = data3d.columns.values.tolist()
cor3d = np.corrcoef(data3d[name3d[:-1]].values.T)
# Start a fresh figure so this heatmap never draws over the previous one when
# that figure is still open (e.g. in interactive mode).
plt.figure()
sns.heatmap(cor3d, cbar=False, annot=True, square=True, fmt='0.2f', yticklabels=name3d[:-1], xticklabels=name3d[:-1])
# No plt.show() here: the figure stays open until the next plt.show() call.

# 3-D scatter plot: x = SO2, y = NO2, z = PM2-5, one colour per class label.
from mpl_toolkits.mplot3d import Axes3D  # kept: older Matplotlib needs it to register the "3d" projection
fig = plt.figure()
# Create the axes through the figure. A bare Axes3D(fig) is not attached to
# the figure on current Matplotlib releases, which leaves the plot empty.
ax = fig.add_subplot(111, projection="3d")
ax.set_xlabel("SO2")
ax.set_ylabel("NO2")
ax.set_zlabel("PM2-5")

# Class label (value in the last column) -> marker colour.
grade_colors = [
    ("I", "lightgreen"),
    ("II", "limegreen"),
    ("III", "forestgreen"),
    ("IV", "orange"),
    ("V", "firebrick"),
    ("VI", "k"),
]
grade_column = data.iloc[:, -1]
for grade, color in grade_colors:
    members = data.loc[grade_column == grade]
    ax.scatter(
        members["SO2"],
        members["NO2"],
        members["PM2-5"],
        c=color,
        label=grade,
    )
ax.legend(loc='best')
plt.show()



# Train/test split on the full frame: every column but the last is a feature,
# the last column is the label.
X = data.iloc[:,:-1]
Y = data.iloc[:,-1]
print(X)
print(Y)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)  # the inputs are DataFrame / Series slices
print(X_train.shape)

# Support vector machines: fit one classifier per kernel and print its test score.
# Scores noted by the author: linear 0.660377358490566,
# poly 0.7830188679245284, rbf 0.8584905660377359.
from sklearn import svm
for kernel, banner in (
    ("linear", "kernel='linear',使用全部属性的分类情况:"),
    ("poly", "kernel=poly,使用全部属性的分类情况:"),
    ("rbf", "kernel=rbf,使用全部属性的分类情况:"),
):
    model = svm.SVC(kernel=kernel)
    model.fit(X_train, y_train)
    print(banner, model.score(X_test, y_test))


# Train/test split on the reduced frame (data3d): every column but the last
# is a feature, the last column is the label.
X3d = data3d.iloc[:,:-1]
Y3d = data3d.iloc[:,-1]
print(X3d)
print(Y3d)

X3d_train, X3d_test, y3d_train, y3d_test = train_test_split(
    X3d, Y3d, test_size=0.33, random_state=42
)  # the inputs are DataFrame / Series slices
print(X3d_train.shape)

# Support vector machines on the reduced attributes, one classifier per kernel.
# Scores noted by the author: linear 0.49056603773584906,
# poly 0.7641509433962265, rbf 0.7924528301886793.
from sklearn import svm
for kernel, banner in (
    ("linear", "kernel='linear',使用3d属性的分类情况:"),
    ("poly", "kernel=poly,使用3d属性的分类情况:"),
    ("rbf", "kernel=rbf,使用3d属性的分类情况:"),
):
    model = svm.SVC(kernel=kernel)
    model.fit(X3d_train, y3d_train)
    print(banner, model.score(X3d_test, y3d_test))

from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Both trees share the same hyper-parameters.
tree_settings = {"random_state": 1, "criterion": "gini", "max_depth": 5}

# Tree trained on all attributes (author's recorded score: 0.9528301886792453).
tree = DecisionTreeClassifier(**tree_settings)
tree.fit(X_train, y_train)
full_score = tree.score(X_test, y_test)
print("criterion=gini,使用全部参数的决策树分类情况:", full_score)

# Tree trained on the reduced attributes (author's recorded score: 0.7358490566037735).
tree2 = DecisionTreeClassifier(**tree_settings)
tree2.fit(X3d_train, y3d_train)
reduced_score = tree2.score(X3d_test, y3d_test)
print("criterion=gini,使用3d参数的决策树分类情况:", reduced_score)

import graphviz
import pydotplus  # Python interface to Graphviz's DOT language (successor of the old pydot project)

# Export the all-attribute tree to DOT text and render it as a PDF.
dot_data = export_graphviz(
    tree,
    feature_names=X.columns,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("./tree1.pdf")

(138条消息) Graphviz安装配置教程(图文详解)_振华OPPO的博客-CSDN博客_graphviz

你可能感兴趣的:(数据挖掘,数据挖掘)