【决策树,异常检测,主成分分析】

决策树

# --- Decision-tree classification on the iris data set ---
import pandas as pd
import numpy as np

# Load the data; expects columns 'target' and 'label' plus four feature columns.
data = pd.read_csv('iris_data.csv')
data.head()
X = data.drop(['target', 'label'], axis=1)
y = data.loc[:, 'label']
print(X.shape, y.shape)

from sklearn import tree
# criterion='entropy': split on information gain;
# min_samples_leaf=5: require 5 samples per leaf to limit overfitting.
dc_tree = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf=5)
dc_tree.fit(X, y)

# Training-set accuracy (note: evaluated on the same data the tree was fit on).
y_predict = dc_tree.predict(X)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y, y_predict)
print(accuracy)

# Visualize the fitted tree.
# NOTE: '%matplotlib inline' is a Jupyter magic, not valid Python source;
# re-add it only when running inside a notebook cell.
from matplotlib import pyplot as plt
fig = plt.figure(figsize=(20, 20))
tree.plot_tree(dc_tree,
               filled=True,  # bug fix: was the string 'True', not the boolean
               feature_names=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'],
               class_names=['setosa', 'versicolor', 'virginica'])

异常检测

# --- Anomaly detection with an elliptic-envelope (Gaussian) model ---
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Load the data; expects two numeric columns 'x1' and 'x2'.
data = pd.read_csv('anomaly_data.csv')
data.head()

# Raw scatter of the two features.
fig1 = plt.figure(figsize=(10, 5))
plt.scatter(data.loc[:, 'x1'], data.loc[:, 'x2'])
plt.title('data')
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

# Per-feature mean and standard deviation.
# Bug fix: x1 and x2 were used here without ever being defined (NameError).
x1 = data.loc[:, 'x1']
x2 = data.loc[:, 'x2']
x1_mean = x1.mean()
x1_sigma = x1.std()
x2_mean = x2.mean()
x2_sigma = x2.std()
print(x1_mean, x1_sigma, x2_mean, x2_sigma)

# Visualize the 1-D normal densities implied by those statistics.
from scipy.stats import norm
x1_range = np.linspace(0, 20, 300)
x1_normal = norm.pdf(x1_range, x1_mean, x1_sigma)
x2_range = np.linspace(0, 20, 300)
x2_normal = norm.pdf(x2_range, x2_mean, x2_sigma)

fig2 = plt.figure(figsize=(20, 5))
plt.subplot(121)
plt.plot(x1_range, x1_normal)
plt.title('normal p(x1)')
plt.subplot(122)
plt.plot(x2_range, x2_normal)
plt.title('normal p(x2)')
plt.show()


def _plot_anomalies(y_pred, title, figsize=(10, 6), axis_limits=None):
    """Scatter all points and circle those the model flagged as anomalies (-1)."""
    plt.figure(figsize=figsize)
    original_pts = plt.scatter(data.loc[:, 'x1'], data.loc[:, 'x2'], marker='x')
    anomaly_pts = plt.scatter(data.loc[:, 'x1'][y_pred == -1],
                              data.loc[:, 'x2'][y_pred == -1],
                              marker='o', facecolor='none', edgecolor='red', s=150)
    plt.title(title)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.legend((original_pts, anomaly_pts), ('original_data', 'anomaly_data'))
    if axis_limits is not None:
        plt.axis(axis_limits)
    plt.show()


# Fit an elliptic envelope assuming 3% of the points are outliers.
from sklearn.covariance import EllipticEnvelope
ad_model = EllipticEnvelope(contamination=0.03)
ad_model.fit(data)
y_predict = ad_model.predict(data)  # +1 = inlier, -1 = anomaly
# Bug fix: pd.value_counts is deprecated; count via a Series instead.
print(pd.Series(y_predict).value_counts())

# Bug fix: the original passed an undefined 'font2' to title/xlabel/ylabel
# and used garbled LaTeX-extraction labels; plain labels are used instead.
_plot_anomalies(y_predict, 'anomaly detection (contamination=0.03)',
                axis_limits=[4.5, 15, 2.5, 15])

# Re-fit with a lower contamination rate and compare the flagged points.
ad_model = EllipticEnvelope(contamination=0.02)
ad_model.fit(data)
y_predict = ad_model.predict(data)
print(pd.Series(y_predict).value_counts())

_plot_anomalies(y_predict, 'anomaly detection (contamination=0.02)',
                figsize=(20, 10))

主成分分析

# --- PCA dimensionality reduction on the iris data set ---
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

data = pd.read_csv('iris_data.csv')
data.head()
print(data)

# Define X (four features) and y (integer class label).
X = data.drop(['target', 'label'], axis=1)
y = data.loc[:, 'label']
y.head()

# Baseline: KNN on the raw 4-D features.
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X, y)
y_predict = KNN.predict(X)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y, y_predict)
print(accuracy)

# Standardize to zero mean / unit variance (PCA is scale-sensitive).
from sklearn.preprocessing import StandardScaler
X_norm = StandardScaler().fit_transform(X)
print(X_norm)

# Compare mean/sigma of 'sepal length' before vs. after standardization.
x1_mean = X.loc[:, 'sepal length'].mean()
x1_norm_mean = X_norm[:, 0].mean()
x1_sigma = X.loc[:, 'sepal length'].std()
x1_norm_sigma = X_norm[:, 0].std()
# Bug fix: this print and the matplotlib import were fused onto one line.
print(x1_mean, x1_sigma, x1_norm_mean, x1_norm_sigma)

fig1 = plt.figure(figsize=(20, 5))
plt.subplot(121)
plt.hist(X.loc[:, 'sepal length'], bins=100)
plt.subplot(122)
plt.hist(X_norm[:, 0], bins=100)
plt.show()

# PCA keeping all 4 components, to inspect the explained-variance spectrum.
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
X_pca = pca.fit_transform(X_norm)
var_ratio = pca.explained_variance_ratio_
print(var_ratio)

fig2 = plt.figure(figsize=(20, 5))
plt.bar([1, 2, 3, 4], var_ratio)
plt.xticks([1, 2, 3, 4], ['PC1', 'PC2', 'PC3', 'PC4'])
plt.ylabel('variance ratio of each PC')
plt.show()

# Keep only the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_norm)
X_pca.shape
type(X_pca)

# Visualize the 2-D projection, one scatter per class.
# Bug fix: the masks were the undefined names 'y0'/'y1'/'y2';
# they must be boolean comparisons against the label Series.
fig3 = plt.figure(figsize=(5, 3))
setosa = plt.scatter(X_pca[:, 0][y == 0], X_pca[:, 1][y == 0])
versicolor = plt.scatter(X_pca[:, 0][y == 1], X_pca[:, 1][y == 1])
virginica = plt.scatter(X_pca[:, 0][y == 2], X_pca[:, 1][y == 2])
plt.legend((setosa, versicolor, virginica), ('setosa', 'versicolor', 'virginica'))
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

fig3.savefig('1.png')

# KNN on the 2-D PCA features — accuracy should stay close to the 4-D baseline.
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_pca, y)
y_predict = KNN.predict(X_pca)
accuracy = accuracy_score(y, y_predict)
print(accuracy)

你可能感兴趣的:(人工智能,决策树,python)