# -*- coding: utf-8 -*-
# spyder + python3.6.1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
# Global matplotlib settings:
#   - use the SimHei font as the default sans-serif face (it can render CJK text)
#   - keep the minus sign '-' from being rendered as an empty box
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
###############################################################################
### Data analysis ###
# Dataset overview (2 features + 1 label):
#   1. 'Clump Thickness' - tumour clump thickness
#   2. 'Cell Size'       - cell size
#   3. 'Type'            - tumour type: 0 = benign, 1 = malignant
## Load the data
df_train = pd.read_csv(r'breast-cancer-train.csv')
df_test = pd.read_csv(r'breast-cancer-test.csv')
## Inspect the data
# 'Unnamed: 0' is presumably the row index that was written into the CSV;
# it is not a feature, so it is dropped from both frames.
df_train = df_train.drop(['Unnamed: 0'], axis=1)
print("#####训练集#####")
print(df_train.head(20), '\n')      # first 20 samples
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_train.info()                     # basic information about the data set
print()
print(df_train.describe(), '\n')    # summary statistics
df_test = df_test.drop(['Unnamed: 0'], axis=1)
print("#####测试集#####")
print(df_test.head(20), '\n')       # first 20 samples
df_test.info()                      # basic information about the data set
print()
print(df_test.describe(), '\n')     # summary statistics
## Split the test set by class (feature columns only)
# NOTE: in this script "positive" holds Type == 0 (benign) and "negative"
# holds Type == 1 (malignant).
df_test_positive = df_test.loc[df_test['Type'] == 0, ['Clump Thickness', 'Cell Size']]
print(df_test_positive.head(20), '\n')
df_test_negative = df_test.loc[df_test['Type'] == 1, ['Clump Thickness', 'Cell Size']]
print(df_test_negative.head(20), '\n')
###############################################################################
### Random classifier ###
## Scatter the test samples:
##   benign tumours    (df_test_positive) -> red circles
##   malignant tumours (df_test_negative) -> black crosses
for samples, look in (
        (df_test_positive, {'marker': 'o', 's': 200, 'c': 'red'}),
        (df_test_negative, {'marker': 'x', 's': 150, 'c': 'black'}),
):
    # x axis: clump thickness, y axis: cell size
    plt.scatter(samples['Clump Thickness'], samples['Cell Size'], **look)
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
## Intercept and coefficients of the line, drawn at random
intercept = np.random.random([1])
print(intercept)
coef = np.random.random([2])
print(coef)
## Plot the random line
lx = np.arange(0, 12)  # evenly spaced x values 0..11
# Decision boundary intercept + coef[0]*x + coef[1]*y = 0, solved for y
ly = (-intercept - lx * coef[0]) / coef[1]
plt.plot(lx, ly, c='blue')
plt.show()
###############################################################################
### Logistic regression (first ten training samples -> line intercept and coefficients)
## Scatter the test samples with the same class/marker mapping as the
## random-classifier figure. The two frames were previously swapped here,
## which drew malignant samples as red circles and contradicted the comment.
##   benign tumours    (df_test_positive, Type == 0) -> red circles
##   malignant tumours (df_test_negative, Type == 1) -> black crosses
plt.scatter(df_test_positive['Clump Thickness'],   # benign: clump thickness
            df_test_positive['Cell Size'],         # benign: cell size
            marker='o',                            # circle marker
            s=200,                                 # marker size
            c='red'                                # red
            )
plt.scatter(df_test_negative['Clump Thickness'],   # malignant: clump thickness
            df_test_negative['Cell Size'],         # malignant: cell size
            marker='x',                            # cross marker
            s=150,                                 # marker size
            c='black'                              # black
            )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
## Intercept and coefficients of the line (learned by logistic regression)
lr = LogisticRegression()
# Fit on the first ten training rows only
lr.fit(df_train[['Clump Thickness', 'Cell Size']].iloc[:10],
       df_train['Type'].iloc[:10])
# Mean accuracy on the full test set
print('前十条训练样本的准确度', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))
intercept = lr.intercept_
print(intercept)
coef = lr.coef_[0, :]
print(coef)
## Plot the decision boundary
lx = np.arange(0, 12)  # evenly spaced x values 0..11
# Decision boundary intercept + coef[0]*x + coef[1]*y = 0, solved for y
ly = (-intercept - lx * coef[0]) / coef[1]
plt.plot(lx, ly, c='blue')
plt.show()
###############################################################################
## Logistic regression (all training samples -> line intercept and coefficients)
## Scatter the test samples with the same class/marker mapping as the
## random-classifier figure. The two frames were previously swapped here,
## which drew malignant samples as red circles and contradicted the comment.
##   benign tumours    (df_test_positive, Type == 0) -> red circles
##   malignant tumours (df_test_negative, Type == 1) -> black crosses
plt.scatter(df_test_positive['Clump Thickness'],   # benign: clump thickness
            df_test_positive['Cell Size'],         # benign: cell size
            marker='o',                            # circle marker
            s=200,                                 # marker size
            c='red'                                # red
            )
plt.scatter(df_test_negative['Clump Thickness'],   # malignant: clump thickness
            df_test_negative['Cell Size'],         # malignant: cell size
            marker='x',                            # cross marker
            s=150,                                 # marker size
            c='black'                              # black
            )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
## Intercept and coefficients of the line (learned by logistic regression)
lr = LogisticRegression()
# Fit on the complete training set
lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])
# Mean accuracy on the full test set
print('全部训练数据的准确度', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))
intercept = lr.intercept_
coef = lr.coef_[0, :]
print("截距", intercept)
print("系数", coef)
## Plot the decision boundary
lx = np.arange(0, 12)  # evenly spaced x values 0..11
# Decision boundary intercept + coef[0]*x + coef[1]*y = 0, solved for y
ly = (-intercept - lx * coef[0]) / coef[1]
plt.plot(lx, ly, c='blue')
plt.show()