第一章:简介篇
案例:“良/恶性乳腺癌肿瘤预测”完整代码样例
# coding: utf-8
# In[1]:
import pandas as pd
# In[2]:
# Read the training and test CSV files of the breast-cancer example.
df_train = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-train.csv')
df_test = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-test.csv')
# In[3]:
# Split the test rows by their 'Type' label (0 / 1), keeping only the two
# features that are plotted below.
df_test_negative = df_test.loc[df_test['Type'] == 0, ['Clump Thickness', 'Cell Size']]
df_test_positive = df_test.loc[df_test['Type'] == 1, ['Clump Thickness', 'Cell Size']]
# In[4]:
# Import pyplot from the matplotlib toolkit.
import matplotlib.pyplot as plt
# One scatter call per class, in the original drawing order:
#   benign samples    -> red 'o' markers
#   malignant samples -> black 'x' markers
for samples, marker, size, color in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(samples['Clump Thickness'], samples['Cell Size'],
                marker=marker, s=size, c=color)
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# In[5]:
import numpy as np
# Draw a random intercept and two random coefficients for a straight line
# (intercept is sampled before coef, as in the original cell).
intercept = np.random.random([1])
coef = np.random.random([2])
lx = np.arange(0, 12)
# Solve coef[0] * lx + coef[1] * ly + intercept = 0 for ly.
ly = -(intercept + coef[0] * lx) / coef[1]
# Plot the random line first, then the test samples on top of it.
plt.plot(lx, ly, c='yellow')
for samples, marker, size, color in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(samples['Clump Thickness'], samples['Cell Size'],
                marker=marker, s=size, c=color)
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# In[6]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
# Learn the line's coefficients and intercept from the first 10 training samples.
lr.fit(df_train[['Clump Thickness', 'Cell Size']][:10], df_train['Type'][:10])
# A single-argument print(...) with %-formatting emits the same text on
# Python 2 and Python 3; the former `print '...', x` statement is a
# SyntaxError on Python 3.
print('Testing accuracy(10 training samples): %s'
      % lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))
# In[7]:
intercept = lr.intercept_
coef = lr.coef_[0, :]
# The decision boundary is lx * coef[0] + ly * coef[1] + intercept = 0;
# solved for ly in the 2-D plane this gives:
ly = (-intercept - lx * coef[0]) / coef[1]
# Plot the learned line together with the test samples.
plt.plot(lx, ly, c='green')
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# In[8]:
lr = LogisticRegression()
# Learn the line's coefficients and intercept from all training samples.
lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])
# A single-argument print(...) with %-formatting emits the same text on
# Python 2 and Python 3; the former `print '...', x` statement is a
# SyntaxError on Python 3.
print('Testing accuracy(all training samples): %s'
      % lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))
# In[9]:
intercept = lr.intercept_
coef = lr.coef_[0, :]
# The decision boundary is lx * coef[0] + ly * coef[1] + intercept = 0;
# solved for ly in the 2-D plane this gives:
ly = (-intercept - lx * coef[0]) / coef[1]
# Plot the learned line together with the test samples.
plt.plot(lx, ly, c='blue')
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
第二章:基础篇
案例2.1:“良/恶性乳腺癌肿瘤数据预处理”
# coding: utf-8
# In[16]:
import pandas as pd
import numpy as np
# In[17]:
# Column names assigned to the data when it is read below.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
# In[18]:
# Read the data set from the internet with pandas.read_csv.
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', names=column_names)
# Save the data as read, before any cleaning.
data.to_csv('Original.csv')
# In[19]:
# Replace '?' with the standard missing-value marker.
data = data.replace(to_replace='?', value=np.nan)
# Drop every row that contains at least one missing value.
data = data.dropna(how='any')
# Report the number of rows and columns that remain. A bare `data.shape`
# expression displays nothing here, so print it explicitly.
print(data.shape)
# Save the cleaned data.
data.to_csv('Result.csv')
案例2.2:“准备良/恶性乳腺癌肿瘤训练,测试数据”
# Use train_test_split to partition the data. It lives in
# sklearn.model_selection; the old sklearn.cross_validation module was
# removed in scikit-learn 0.20, so it is only used as a fallback for
# installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Randomly hold out 25% of the rows for testing; the remaining 75% form the
# training set. Columns 1..9 are the features, column 10 ('Class') the label.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33,
)
# In[23]:
# Inspect the number and class distribution of the training samples
# (printed explicitly so the counts also show when run as a script).
print(y_train.value_counts())
# In[24]:
# Inspect the number and class distribution of the test samples.
print(y_test.value_counts())
案例2.3:“使用线性分类模型从事良/恶性肿瘤预测任务”
# Benign/malignant tumor prediction with two linear classifiers.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
# Standardize the features (zero mean, unit variance per dimension) so that
# no feature with large values dominates the prediction. The scaler is
# fitted on the training data and then applied to the test data.
ss = StandardScaler()
X_train, X_test = ss.fit_transform(X_train), ss.transform(X_test)
# Create the LogisticRegression and SGDClassifier models.
lr = LogisticRegression()
sgdc = SGDClassifier()
# Train lr with fit, then store its predictions for X_test in lr_y_predict.
lr_y_predict = lr.fit(X_train, y_train).predict(X_test)
# Train sgdc with fit, then store its predictions for X_test in sgdc_y_predict.
sgdc_y_predict = sgdc.fit(X_train, y_train).predict(X_test)
案例2.4:“使用线性分类模型从事良/恶性肿瘤预测任务的性能分析”
from sklearn.metrics import classification_report
# All output below uses single-argument print(...) calls, which emit the same
# text on Python 2 and Python 3; the former print statements are a
# SyntaxError on Python 3.
# Accuracy of the logistic regression model on the test set, from its own
# score method.
print(' Accuracy of LR Classifier: %s' % lr.score(X_test, y_test))
# classification_report supplies the other three metrics for LogisticRegression.
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
# Accuracy of the stochastic gradient descent model on the test set, from its
# own score method.
print('Accuracy of SGD Classifier: %s' % sgdc.score(X_test, y_test))
# classification_report supplies the other three metrics for SGDClassifier.
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))