#coding:utf-8
#导入warnings包,利用过滤器来实现忽略警告语句。
import warnings
warnings.filterwarnings('ignore')
import missingno as msno #缺失值可视化
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Directory that holds the CSV files; it keeps its trailing slash because the
# file names below are appended to it directly.
path = './data/'

# Load the training set and the test set (testA.csv) into DataFrames.
train_data = pd.read_csv(f'{path}train.csv')
test_data = pd.read_csv(f'{path}testA.csv')
观察行列信息
# Report the (rows, columns) shape of each loaded DataFrame.
print(f'Train_data shape: {train_data.shape}')
print(f'Test_data shape: {test_data.shape}')
Train_data shape: (100000, 3)
Test_data shape: (20000, 2)
查看首尾数据
# Preview the first and last five rows in a single frame.
# DataFrame.append was removed in pandas 2.0, so the two slices are stacked
# with pd.concat, which yields the same rows in the same order.
pd.concat([train_data.head(), train_data.tail()])
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_data.describe()
data.describe()
——获取数据的相关统计量
describe中有每列的统计量:个数count、平均值mean、标准差std、最小值min、四分位数25% 50% 75%(其中50%即中位数)、以及最大值max。看这个信息主要是为了迅速掌握数据的大概范围,并辅助判断每列是否存在异常值;比如有的时候会发现999、9999、-1等值,这些其实都是nan的另外一种表达方式,需要注意。
# Print a concise summary of train_data: index range, per-column non-null
# counts and dtypes, and memory usage.
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 100000 non-null int64
1 heartbeat_signals 100000 non-null object
2 label 100000 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.3+ MB
data.info()
——获取数据类型
通过info()可以了解每列的数据类型(Dtype)和非空值个数,有助于发现除nan以外的特殊符号异常(例如本应为数值的列被识别为object)。
# Count the missing (null) values in each column of the training set.
train_data.isnull().sum()
id 0
heartbeat_signals 0
label 0
dtype: int64
# Count the missing (null) values in each column of the test set.
test_data.isnull().sum()
id 0
heartbeat_signals 0
dtype: int64
1> 预测值频数分析
# Number of training rows per distinct value of the 'label' column.
train_data['label'].value_counts()
0.0 64327
3.0 17912
2.0 14199
1.0 3562
Name: label, dtype: int64
# Visualise the distribution of the target variable on two side-by-side axes.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))

# Left axis: bar chart with the number of rows per label.
sns.countplot(data=train_data, x='label', ax=axs[0])
axs[0].set_title('Frequency of each Class')

# Right axis: pie chart with each label's share, annotated as a percentage.
train_data['label'].value_counts().plot(
    kind='pie',
    x=None,
    y=None,
    ax=axs[1],
    autopct='%1.2f%%',
)
axs[1].set_title('Percentage of each Class')
2> 预测值分布
import scipy.stats as st

# Target column whose distribution is drawn in the three figures below.
y = train_data['label']

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here because its `fit=` overlay has no direct replacement — confirm
# the installed seaborn version still provides it.

# Figure 1: 20-bin histogram with seaborn's default density curve plus a rug.
plt.figure(1)
plt.title('Default')
sns.distplot(y, bins=20, rug=True)

# Figure 2: histogram without KDE, overlaid with a fitted normal distribution.
plt.figure(2)
plt.title('Normal')
sns.distplot(y, fit=st.norm, kde=False)

# Figure 3: histogram without KDE, overlaid with a fitted log-normal distribution.
plt.figure(3)
plt.title('Log Normal')
sns.distplot(y, fit=st.lognorm, kde=False)
skewness偏度:衡量分布的不对称程度,正值表示右偏(长尾在右侧),负值表示左偏。
kurtosis峰度:衡量分布尾部的厚重程度;pandas的kurt()返回的是超额峰度(正态分布为0)。
# Print the skewness and kurtosis of the label column with six decimals.
print(f"Skewness: {train_data['label'].skew():f}")
print(f"Kurtosis: {train_data['label'].kurt():f}")
Skewness: 0.871005
Kurtosis: -1.009573
4> 用pandas_profiling生成数据报告
import pandas_profiling

# Build a profiling report for the training DataFrame and save it as HTML.
# The original passed `train`, a name that is never defined in the visible
# code (NameError); the DataFrame loaded above is `train_data`.
pfr = pandas_profiling.ProfileReport(train_data)
pfr.to_file('./example.html')