# 《Python 机器学习及实践--从零开始通往kaggle竞赛之路》--第一章代码

# -*- coding: utf-8 -*-
# spyder + python3.6.1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
# Matplotlib font setup: use SimHei as the default sans-serif font, and turn off
# unicode_minus so the '-' sign on axes is not rendered as an empty box.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

###############################################################################
### Data analysis ###
# Dataset overview (2 features + 1 label):
#   1. 'Clump Thickness' -- tumour clump thickness
#   2. 'Cell Size'       -- cell size
#   3. 'Type'            -- tumour type: 0 = benign, 1 = malignant

## Load the data
df_train = pd.read_csv(r'breast-cancer-train.csv')
df_test = pd.read_csv(r'breast-cancer-test.csv')

## Inspect the training set
# 'Unnamed: 0' is dropped before use (presumably a saved row index -- confirm
# against the CSV files).
df_train = df_train.drop(['Unnamed: 0'], axis=1)
print("#####训练集#####")
print(df_train.head(20), '\n')      # first 20 samples
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None".
df_train.info()                     # basic dataset information
print('\n')
print(df_train.describe(), '\n')    # summary statistics

## Inspect the test set
df_test = df_test.drop(['Unnamed: 0'], axis=1)
print("#####测试集#####")
print(df_test.head(20), '\n')       # first 20 samples
df_test.info()                      # basic dataset information (prints by itself)
print('\n')
print(df_test.describe(), '\n')     # summary statistics

## Split the test set into its two classes, keeping only the two feature columns.
# NOTE: the names are kept because later sections refer to them, but they are
# counter-intuitive: with the label encoding described in this file
# (0 = benign, 1 = malignant), df_test_positive holds the Type == 0 rows and
# df_test_negative holds the Type == 1 rows.
df_test_positive = df_test.loc[df_test['Type'].eq(0), ['Clump Thickness', 'Cell Size']]
print(df_test_positive.head(20), '\n')

df_test_negative = df_test.loc[df_test['Type'].eq(1), ['Clump Thickness', 'Cell Size']]
print(df_test_negative.head(20), '\n')

###############################################################################
### Random classifier ###
## Scatter the test samples:
##   Type == 0 rows (benign)    -> red 'o', size 200
##   Type == 1 rows (malignant) -> black 'x', size 150
for samples, marker, size, colour in (
        (df_test_positive, 'o', 200, 'red'),
        (df_test_negative, 'x', 150, 'black'),
):
    plt.scatter(samples['Clump Thickness'],   # x axis: clump thickness
                samples['Cell Size'],         # y axis: cell size
                marker=marker, s=size, c=colour)

plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')

## Intercept and coefficients of the line, drawn at random
intercept = np.random.random([1])
print(intercept)
coef = np.random.random([2])
print(coef)

## Draw the random line
# Points on the line satisfy intercept + coef[0]*x + coef[1]*y = 0; solving for
# y gives the expression below, evaluated at x = 0, 1, ..., 11.
lx = np.arange(12)
ly = (-intercept - coef[0] * lx) / coef[1]
plt.plot(lx, ly, c='blue')
plt.show()

###############################################################################
### Logistic regression fitted on the first ten training samples
### (intercept and coefficients of the separating line)

## Scatter the test samples with the same encoding as the random-line figure:
## benign (Type == 0) as red 'o', malignant (Type == 1) as black 'x'.
## NOTE: in this script df_test_positive holds the Type == 0 rows and
## df_test_negative the Type == 1 rows. The two frames were previously swapped
## here, so malignant samples were drawn with the marker documented as benign.
plt.scatter(df_test_positive['Clump Thickness'],   # benign: clump thickness
            df_test_positive['Cell Size'],         # benign: cell size
            marker='o',   # circle marker
            s=200,        # marker size
            c='red'       # red
            )

plt.scatter(df_test_negative['Clump Thickness'],   # malignant: clump thickness
            df_test_negative['Cell Size'],         # malignant: cell size
            marker='x',   # cross marker
            s=150,        # marker size
            c='black'     # black
            )

plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')

## Intercept and coefficients of the line, learned by logistic regression
lr = LogisticRegression()
# .iloc makes the "first ten rows" selection explicitly positional.
lr.fit(df_train[['Clump Thickness', 'Cell Size']].iloc[:10],
       df_train['Type'].iloc[:10])
print('前十条训练样本的准确度',
      lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))

intercept = lr.intercept_
print(intercept)

coef = lr.coef_[0, :]   # the single row of coefficients, one per feature
print(coef)

## Draw the decision boundary
# Points on the boundary satisfy intercept + coef[0]*x + coef[1]*y = 0; solving
# for y gives the expression below, evaluated at x = 0, 1, ..., 11.
lx = np.arange(0, 12)
ly = (-intercept - lx * coef[0]) / coef[1]
plt.plot(lx, ly, c='blue')
plt.show()

###############################################################################
## Logistic regression fitted on all training samples
## (intercept and coefficients of the separating line)

## Scatter the test samples with the same encoding as the random-line figure:
## benign (Type == 0) as red 'o', malignant (Type == 1) as black 'x'.
## NOTE: in this script df_test_positive holds the Type == 0 rows and
## df_test_negative the Type == 1 rows. The two frames were previously swapped
## here, so malignant samples were drawn with the marker documented as benign.
plt.scatter(df_test_positive['Clump Thickness'],   # benign: clump thickness
            df_test_positive['Cell Size'],         # benign: cell size
            marker='o',   # circle marker
            s=200,        # marker size
            c='red'       # red
            )

plt.scatter(df_test_negative['Clump Thickness'],   # malignant: clump thickness
            df_test_negative['Cell Size'],         # malignant: cell size
            marker='x',   # cross marker
            s=150,        # marker size
            c='black'     # black
            )

plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')

## Intercept and coefficients of the line, learned by logistic regression
lr = LogisticRegression()
lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])
print('全部训练数据的准确度',
      lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))

intercept = lr.intercept_
coef = lr.coef_[0, :]   # the single row of coefficients, one per feature
print("截距", intercept)
print("系数", coef)

## Draw the decision boundary
# Points on the boundary satisfy intercept + coef[0]*x + coef[1]*y = 0; solving
# for y gives the expression below, evaluated at x = 0, 1, ..., 11.
lx = np.arange(0, 12)
ly = (-intercept - lx * coef[0]) / coef[1]
plt.plot(lx, ly, c='blue')
plt.show()

# 你可能感兴趣的:(数据竞赛)