Kaggle实战(二):数据集特征分析与展示

本文对原始数据集中的特征进行分析与展示

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
#color = sns.color_palette()

%matplotlib inline
dpath = 'F:/Python_demo/XGBoost/data/'
train = pd.read_json(dpath +"RentListingInquries_train.json")
train.head(3)

test = pd.read_json(dpath+"RentListingInquries_test.json")
#test.head().T
print("Train :", train.shape)
print("Test : ", test.shape)
train.info()
Train : (49352, 15)
Test :  (74659, 14)

Int64Index: 49352 entries, 10 to 99994
Data columns (total 15 columns):
bathrooms          49352 non-null float64
bedrooms           49352 non-null int64
building_id        49352 non-null object
created            49352 non-null object
description        49352 non-null object
display_address    49352 non-null object
features           49352 non-null object
latitude           49352 non-null float64
listing_id         49352 non-null int64
longitude          49352 non-null float64
manager_id         49352 non-null object
photos             49352 non-null object
price              49352 non-null int64
street_address     49352 non-null object
interest_level     49352 non-null object
dtypes: float64(3), int64(3), object(9)
memory usage: 6.0+ MB
train.describe()  # 统计信息
train.isnull().sum()  # 缺失值个数

Kaggle实战(二):数据集特征分析与展示_第1张图片

bathrooms          0
bedrooms           0
building_id        0
created            0
description        0
display_address    0
features           0
latitude           0
listing_id         0
longitude          0
manager_id         0
photos             0
price              0
street_address     0
interest_level     0
dtype: int64
sns.countplot(train.interest_level, order=['low', 'medium', 'high']);  # 画出统计直方图
plt.xlabel('Interest Level');
plt.ylabel('Number of occurrences');

Kaggle实战(二):数据集特征分析与展示_第2张图片

### Quantitative substitute of Interest Level  将标签与数字建立映射
train['interest'] = np.where(train.interest_level=='low', 0,
                                  np.where(train.interest_level=='medium', 1, 2))
sns.countplot(train.interest, order=[0, 1, 2]);
plt.xlabel('interest');
plt.ylabel('Number of occurrences');

Kaggle实战(二):数据集特征分析与展示_第3张图片

# 两个变量之间的散点图
order = ['low', 'medium', 'high']
sns.stripplot(train["interest_level"],train["bathrooms"],jitter=True,order=order)  # jitter=True当数据重合较多时,用该参数做一些调整
plt.title("Number of Number of Bathrooms Vs Interest_level");

Kaggle实战(二):数据集特征分析与展示_第4张图片

# 数据预处理,大于95%分位数的都处理成同一值
ulimit = np.percentile(train.bathrooms.values, 95)
train['bathrooms'].loc[train['bathrooms']>ulimit] = ulimit  # bathrooms大于4时都置为4
sns.countplot(x="bathrooms", hue="interest_level",data=train);  # hue用于分类

Kaggle实战(二):数据集特征分析与展示_第5张图片

# 画出柱状图
ulimit = np.percentile(train.price.values, 99)  # 99%分位数
train['price'].loc[train['price']>ulimit] = ulimit

sns.distplot(train.price.values, bins=50, kde=True)
plt.xlabel('price', fontsize=12)
plt.show()

Kaggle实战(二):数据集特征分析与展示_第6张图片

# 地理位置,线性关系图
sns.lmplot(x="longitude", y="latitude", fit_reg=False, hue='interest_level',
           hue_order=['low', 'medium', 'high'], height=9, scatter_kws={'alpha':0.4,'s':30},
           data=train[(train.longitude>train.longitude.quantile(0.005))
                           &(train.longitudetrain.latitude.quantile(0.005))                           
                           &(train.latitude

Kaggle实战(二):数据集特征分析与展示_第7张图片

# 日期类型处理
train['created'] = pd.to_datetime(train['created'])
train['date'] = train['created'].dt.date
train["year"] = train["created"].dt.year
train['month'] = train['created'].dt.month
train['day'] = train['created'].dt.day
train['hour'] = train['created'].dt.hour
train['weekday'] = train['created'].dt.weekday
train['week'] = train['created'].dt.week
train['quarter'] = train['created'].dt.quarter
train['weekend'] = ((train['weekday'] == 5) & (train['weekday'] == 6))
train['wd'] = ((train['weekday'] != 5) & (train['weekday'] != 6))
cnt_srs = train['date'].value_counts()

plt.figure(figsize=(12,4))
ax = plt.subplot(111)
ax.bar(cnt_srs.index, cnt_srs.values)
ax.xaxis_date()
plt.xticks(rotation='vertical')
plt.show()
from wordcloud import WordCloud

text = ''
text_da = ''
text_street = ''
#text_desc = ''
for ind, row in train.iterrows():
    for feature in row['features']:
        text = " ".join([text, "_".join(feature.strip().split(" "))])
    text_da = " ".join([text_da,"_".join(row['display_address'].strip().split(" "))])
    text_street = " ".join([text_street,"_".join(row['street_address'].strip().split(" "))])
    #text_desc = " ".join([text_desc, row['description']])
text = text.strip()
text_da = text_da.strip()
text_street = text_street.strip()
#text_desc = text_desc.strip()

plt.figure(figsize=(12,6))
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for features", fontsize=30)
plt.axis("off")
plt.show()

# wordcloud for display address
plt.figure()
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text_da)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for Display Address", fontsize=30)
plt.axis("off")
plt.show()

# wordcloud for street address
plt.figure()
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(text_street)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for Street Address", fontsize=30)
plt.axis("off")
plt.show()

Kaggle实战(二):数据集特征分析与展示_第8张图片

 

 

你可能感兴趣的:(Kaggle)