import pandas_profiling
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from PIL import Image as im
from wordcloud import WordCloud,STOPWORDS
# Options shared by the three BX-*.csv reads: ';'-separated fields, latin-1
# text, and low_memory=False so each file is type-inferred in one pass.
_csv_options = dict(sep=';', encoding='latin-1', low_memory=False)

# Users table. Because `names` is supplied and `header` is not, pandas treats
# the first line of the file as a data row rather than as a header.
u_cols = ['user_id', 'location', 'age']
users = pd.read_csv(
    'C:/Users/Desktop/推荐系统/第五次实验/BX-Users.csv',
    names=u_cols,
    **_csv_options,
)

# Books table.
i_cols = [
    'isbn', 'book_title', 'book_author', 'year_of_publication',
    'publisher', 'img_s', 'img_m', 'img_l',
]
items = pd.read_csv(
    'C:/Users/Desktop/推荐系统/第五次实验/BX-Books.csv',
    names=i_cols,
    **_csv_options,
)

# Ratings table.
r_cols = ['user_id', 'isbn', 'rating']
ratings = pd.read_csv(
    'C:/Users/Desktop/推荐系统/第五次实验/BX-Book-Ratings.csv',
    names=r_cols,
    **_csv_options,
)
观察前几个数据情况
# Quick look at the freshly loaded tables. These are bare expressions: their
# values are only displayed in an interactive session (notebook / REPL).
users.head()       # first five rows of the users table
users.describe()   # summary statistics for the users table
users.dtypes       # dtype of each users column
ratings.describe() # summary statistics for the ratings table
ratings.dtypes     # dtype of each ratings column
从上面的输出可以看到,每张表的第一行并不是有效数据(读取时自行指定了列名,文件的首行被当作数据读入),我们把它删去:
# Drop the first ROW of each table (the row labelled by index[0]); the
# remaining rows keep their original index labels.
users, items, ratings = (
    frame.drop(index=frame.index[0]) for frame in (users, items, ratings)
)
users
统计空值:
# Number of missing values in each column of the users table.
users.isna().sum()
类型转换:
# Type conversion: cast the numeric columns of each table in one call.
# age becomes float (so it can hold NaN); ids, ratings and years become int.
users = users.astype({'age': float, 'user_id': int})
ratings = ratings.astype({'user_id': int, 'rating': int})
items = items.astype({'year_of_publication': int})
# Re-check missing values in users after the casts.
users.isnull().sum()
age概况:
# Summary statistics (count, mean, std, min, quartiles, max) of user ages.
users.age.describe()
处理明显不合理的年龄:把大于99岁或小于5岁的年龄置为空值,再用平均年龄填充所有空值
import numpy as np

# Ages above 99 or below 5 are treated as implausible and blanked out.
_implausible_age = (users['age'] > 99) | (users['age'] < 5)
users.loc[_implausible_age, 'age'] = np.nan
# Fill every missing age (blanked above or missing in the file) with the
# mean of the remaining ages.
users['age'] = users['age'].fillna(users['age'].mean())
# Missing-value counts for the ratings table.
ratings.isnull().sum()
# Missing-value counts for the books table.
items.isnull().sum()
查看空值情况:
# Show the book rows that have no publisher.
items[items['publisher'].isna()]
# Fill the gaps by ISBN; per the original note these values were looked up
# manually.
_publisher_by_isbn = {
    '193169656X': 'Mundania Press LLC',
    '1931696993': 'Novelbooks Incorporated',
}
for _isbn, _publisher in _publisher_by_isbn.items():
    items.loc[items['isbn'] == _isbn, 'publisher'] = _publisher
同理:
# Show the book rows that have no author.
items[items['book_author'].isna()]
# Fill in the author for the one ISBN handled here.
items.loc[items['isbn'] == '9627982032', 'book_author'] = 'Larissa Anne Downe'
# List every distinct publication year, ascending, to spot implausible values.
_distinct_years = sorted(items['year_of_publication'].unique())
print(_distinct_years)
现在是2021年,出版年份不应超过2021;年份为0的记录同样不合理,下面一并处理:
# Clean implausible publication years: year 0 and any year after 2021 are
# treated as missing, then replaced by the rounded mean of the valid years.
_bad_year = (items['year_of_publication'] == 0) | (items['year_of_publication'] > 2021)
# Series.mask returns a new float column with NaN at the flagged rows. This
# avoids assigning NaN in place into the integer column, which relies on
# implicit int->float upcasting that newer pandas versions deprecate.
_valid_years = items['year_of_publication'].mask(_bad_year)
# mean() skips NaN, so the fill value is computed from valid years only.
items['year_of_publication'] = _valid_years.fillna(round(_valid_years.mean()))
数据合并:
# Merge the data: attach each rating to its user (on user_id), then attach
# the book details (on isbn). Both joins use merge's default inner join.
df = users.merge(ratings, on='user_id').merge(items, on='isbn')
df.head(5)
选取50岁以上的人和25岁以下人喜欢读的书:
# Split the merged records into two age groups.
_age = df['age']
# Records from users older than 50.
user_fit = df.loc[_age > 50]
user_fit
# Records from users younger than 25.
user_fit25 = df.loc[_age < 25]
user_fit25
分别统计两组读者的记录中出现次数最多的10本书:
# Ten most frequent book titles among the over-50 records ...
user_fit['book_title'].value_counts().iloc[:10]
# ... and among the under-25 records.
user_fit25['book_title'].value_counts().iloc[:10]
import pandas_profiling
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from PIL import Image as im
from wordcloud import WordCloud,STOPWORDS
# Histogram of user ages across the merged records.
plt.figure(figsize=(10, 8))
# sns.distplot is deprecated; histplot is its replacement for a plain
# histogram (no KDE curve). Note: histplot picks its default bins with a
# different rule than distplot did, so bar widths may differ slightly.
sns.histplot(df['age'], kde=False)
plt.xlabel('Age')
plt.ylabel('count')
plt.title('Age Distribution', size=20)
plt.show()
# Horizontal bar chart of the 25 publication years with the most records.
df_v = df[['year_of_publication']].copy()
# The year column is float after cleaning; go through int so labels read
# '2002' rather than '2002.0'.
df_v['year_of_publication'] = df_v['year_of_publication'].astype(int).astype(str)
df_v = df_v['year_of_publication'].value_counts().head(25).reset_index()
# Name both columns explicitly: the names reset_index() produces here differ
# between pandas versions.
df_v.columns = ['year', 'count']
df_v['year'] = 'Year ' + df_v['year']

# `customPalette` was used without ever being defined, which raised a
# NameError; define it here with one colour per bar.
customPalette = sns.color_palette('husl', len(df_v))

plt.figure(figsize=(10, 8))
sns.barplot(x='count', y='year', data=df_v, palette=customPalette)
plt.ylabel('Year Of Publication')
plt.yticks(size=12)
plt.title('Years of Publication', size=20)
plt.show()
详细内容关注公众号,一起学习