机器学习基本架构

基础不牢,地动山摇。通过下面的代码可以简单了解数据挖掘的基本流程,但目前自己还不能灵活运用;等重新温习数据结构和概率论知识之后,再深入学习机器学习……
转帖:机器学习在数据挖掘方向上的简单架构

# Load libraries
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
# Load the dataset. Column names are supplied explicitly through `names`
# rather than being read from the file.
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'class',
]
dataset = pd.read_csv("iris.data", names=names)

# Dimensions of the table as a (rows, columns) tuple.
dimensions = dataset.shape
print(dimensions)

# Peek at the first 20 rows.
preview = dataset.head(20)
print(preview)

# Summary statistics as computed by DataFrame.describe().
summary_stats = dataset.describe()
print(summary_stats)

# Class distribution: number of rows for each value of the 'class' column.
class_counts = dataset.groupby('class').size()
print(class_counts)
# Box-and-whisker plots: one subplot per column on a 2x2 grid, with
# independent x and y axes for each subplot.
box_plot_options = dict(
    kind='box',
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
)
dataset.plot(**box_plot_options)
plt.show()

# Histograms of the columns.
dataset.hist()
plt.show()

# Scatter plot matrix of the columns against each other.
scatter_matrix(dataset)
plt.show()
# Split out a validation dataset.
array = dataset.values
print(array)

# Features: columns 0..3, i.e. all four measurements (sepal-length,
# sepal-width, petal-length, petal-width). The previous slice `[:, :3]`
# stopped one column short and silently dropped petal-width.
X = array[:, 0:4]
print(X)

# Target: column 4, the 'class' label.
Y = array[:, 4]
print(Y)

# Fraction of the rows held back for validation.
validation_size = 0.20
# Fixed seed so the train/validation split is reproducible.
seed = 7

# Training data:   X_train, Y_train
# Validation data: X_validation, Y_validation
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)
# Test options and evaluation metric.
seed = 7
scoring = 'accuracy'

# Spot-check algorithms: (short label, unfitted estimator) pairs, each
# created with its default hyper-parameters.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluate each model in turn with 10-fold cross-validation on the
# training split.
results = []
# NOTE: this rebinds `names` (previously the CSV column names) to the list of
# model labels, which the comparison plot further down uses for tick labels.
names = []

# `random_state` only has an effect when `shuffle=True`; scikit-learn >= 0.24
# raises ValueError if a seed is given while shuffling is disabled. The
# splitter does not depend on the model, so it is built once outside the loop
# (an integer seed yields the same folds on every split() call).
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)
    # Report "<label>: <mean accuracy> (<standard deviation>)" over the folds.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare algorithms: one box per model, drawn from its cross-validation
# scores collected in `results` and labelled with the entries of `names`.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
# Single axes filling the figure (1 row, 1 column, first cell).
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# Make predictions on the validation dataset with a k-nearest-neighbours
# classifier fitted on the training split (fit() returns the estimator).
knn = KNeighborsClassifier().fit(X_train, Y_train)
predictions = knn.predict(X_validation)

# Print, in order: overall accuracy, the confusion matrix, and the
# classification report, each comparing true labels to predictions.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

你可能感兴趣的:(机器学习基本架构)