# 《Python数据挖掘入门与实践》Robert Layton 人民邮电出版社,第2章
# 估计器Estimator:用于分类、聚类和回归分析
# 转换器Transformer:用于数据预处理和数据转换
# 流水线Pipeline:组合数据挖掘流程,便于再次使用
%matplotlib inline
#jupyter notebook里的命令, 意思是将那些用matplotlib绘制的图显示在页面里而不是弹出一个窗口,参数inline表示将图表嵌入到Notebook中
import os #os模块可以处理文件和目录
# Resolve where the Ionosphere dataset is expected to live:
#   <home>/Data/Ionosphere/ionosphere.data
home_folder = os.path.expanduser("~")  # current user's home directory
data_folder = os.path.join(home_folder, "Data", "Ionosphere")
data_filename = os.path.join(data_folder, "ionosphere.data")

# Echo the home directory and the full dataset path, one per line.
print(home_folder, data_filename, sep="\n")
import csv#处理csv文件的模块
import numpy as np#numpy提供矩阵运算功能
# Load the Ionosphere dataset into a feature matrix X and a label vector y.
# The size is taken from the dataset and is known in advance:
# 351 samples with 34 numeric features each.
X = np.zeros((351, 34), dtype='float')
y = np.zeros((351,), dtype='bool')

# The csv module asks for files to be opened with newline='' so that it can
# handle line endings itself.
with open(data_filename, 'r', newline='') as input_file:
    reader = csv.reader(input_file)
    # csv.reader yields an empty list for a blank line; filter(None, ...)
    # drops those so a stray blank line cannot break the fixed-size fill.
    for i, row in enumerate(filter(None, reader)):
        # Every column except the last is a feature; convert each to float.
        data = [float(datum) for datum in row[:-1]]
        # Store the features in the matching row of the dataset.
        X[i] = data
        # The last column is the class label: True for 'g', False otherwise.
        y[i] = row[-1] == 'g'
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; the same helper now lives in sklearn.model_selection. Fall back to the
# old location only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn older than 0.18
    from sklearn.cross_validation import train_test_split

# Randomly split the samples into a training set and a testing set.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)
print("There are {} samples in the training dataset".format(X_train.shape[0]))
print("There are {} samples in the testing dataset".format(X_test.shape[0]))
print("Each sample has {} features".format(X_train.shape[1]))
# KNeighborsClassifier is provided by the sklearn.neighbors package.
# It is used in three steps:
#   1) create the KNeighborsClassifier object,
#   2) call fit() on the training set to build the model,
#   3) call predict() to classify unseen samples.
from sklearn.neighbors import KNeighborsClassifier

# Steps 1 and 2 chained: fit() hands back the fitted estimator itself.
# Default parameters are used here.
estimator = KNeighborsClassifier().fit(X_train, y_train)

# Step 3: predict labels for the held-out samples.
y_predicted = estimator.predict(X_test)

# Accuracy = share of test samples whose prediction equals the true label,
# expressed as a percentage.
accuracy = np.mean(y_predicted == y_test) * 100
print("The accuracy is {0:.1f}%".format(accuracy))
# 接下来进行算法的交叉检验
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection. Fall back to the
# old location only for installations that predate the move.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn older than 0.18
    from sklearn.cross_validation import cross_val_score

# Evaluate the estimator on several train/test partitions of the full dataset;
# one accuracy value is returned per partition.
# NOTE(review): the number of folds is left at the library default, which
# differs between scikit-learn releases — pass cv=... to pin it if needed.
scores = cross_val_score(estimator, X, y, scoring='accuracy')
average_accuracy = np.mean(scores) * 100
print("The average accuracy is {0:.1f}%".format(average_accuracy))
# 观察不同近邻个数n_neighbors对准确率的影响
# Sweep the number of neighbours and record how accuracy responds.
parameter_values = list(range(1, 21))  # candidate n_neighbors: 1..20 inclusive
avg_scores, all_scores = [], []  # mean accuracy / per-fold accuracies per setting

for n_neighbors in parameter_values:
    # A fresh classifier for each candidate value, scored by cross-validation.
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    scores = cross_val_score(estimator, X, y, scoring='accuracy')
    all_scores.append(scores)
    avg_scores.append(np.mean(scores))
# Import pyplot first: everything below, including the help lookup, needs the
# name `plt` to exist.
from matplotlib import pyplot as plt

# IPython-only introspection: shows the documentation of plot(), including
# the format-string options. Kept alone on its line so the trailing `?` is
# the last thing IPython sees.
plt.plot?

# Create a new figure and make it the current one; figsize gives the width
# and height (in inches, per the matplotlib convention).
plt.figure(figsize=(32, 20))

# Mean accuracy against the number of neighbours: '-' solid line, 'o' markers.
plt.plot(parameter_values, avg_scores, '-o', linewidth=5, markersize=24)
#plt.axis([0, max(parameter_values), 0, 1.0])

# For each setting, plot its individual fold scores at the same x position,
# i.e. x = [n, n, ...] against y = [score1, score2, ...].
for parameter, scores in zip(parameter_values, all_scores):
    n_scores = len(scores)
    plt.plot([parameter] * n_scores, scores, '-o')

# All fold scores at once: 'b' blue, 'x' cross markers.
plt.plot(parameter_values, all_scores, 'bx')
from collections import defaultdict
# Repeat the n_neighbors sweep with 10-fold cross-validation, collecting every
# run's fold scores under its n_neighbors value.
parameter_values = list(range(1, 21))  # 1..20, including 20
all_scores = defaultdict(list)

for n_neighbors in parameter_values:
    # NOTE(review): the original author asked whether these 100 repetitions
    # are needed. Presumably an integer cv yields the same folds every time,
    # which would make the repetitions identical — confirm before removing.
    for repetition in range(100):
        estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
        scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=10)  # cv = number of folds
        all_scores[n_neighbors].append(scores)

# Plot every collected score array at the x position of its setting.
for parameter in parameter_values:
    scores = all_scores[parameter]
    n_scores = len(scores)
    x_positions = [parameter] * n_scores
    plt.plot(x_positions, scores, '-o')
# 利用转换器Transformer对特征值进行预处理
# Build a deliberately "broken" copy of the dataset to demonstrate
# preprocessing. The source matrix is X (upper case); the lower-case `x` used
# previously is not defined anywhere and raised a NameError.
X_broken = np.array(X)
# Divide every second feature by 10 so the features end up on different scales.
X_broken[:, ::2] /= 10

# MinMaxScaler rescales each feature to the range 0..1: the minimum becomes 0,
# the maximum becomes 1, and all other values fall in between.
from sklearn.preprocessing import MinMaxScaler

X_transformed = MinMaxScaler().fit_transform(X_broken)  # normalise the features
estimator = KNeighborsClassifier()  # k-nearest-neighbours classifier
# Cross-validate the classifier on the rescaled data.
transformed_scores = cross_val_score(estimator, X_transformed, y, scoring='accuracy')
# Report the mean accuracy as a percentage.
print('The average accuracy for is {0:.1f}%'.format(np.mean(transformed_scores) * 100))
# 流水线
from sklearn.pipeline import Pipeline

# Chain the preprocessing step and the classifier into one pipeline:
# 'scale' rescales the features, 'predict' is the kNN classifier.
scaling_pipeline = Pipeline([
    ('scale', MinMaxScaler()),
    ('predict', KNeighborsClassifier()),
])

# Cross-validate the whole pipeline on the unscaled ("broken") data.
scores = cross_val_score(scaling_pipeline, X_broken, y, scoring='accuracy')

# Report the pipeline's own scores. The previous code printed
# `transformed_scores`, the result of the earlier manual-scaling step, so the
# pipeline's result was never actually shown.
print('The pipeline scored an average accuracy for is {0:.1f}%'.format(np.mean(scores) * 100))