机器学习图书畅销原因分析
决策树的结果如下。特征的重要性排名是:
1.Discount_price:折扣价
2.Practice_theory_category: 实战还是理论类
3.Price: 当前价格
4.Traditional_deep_category: 传统机器学习还是深度学习。
5.Is_author_foreigner:国外作者还是国内
由评论数最高的前150本畅销书画出的下面柱状图可知,大部分图书是偏实战方面的图书
在前150本畅销图书中,由图可知,传统机器学习与深度学习两类图书的数量基本差不多。
畅销书中国内作者还是多于国外作者,但国外作者的占比也不低,可见国外作者的书成为畅销书的可能性还是很大。
畅销图书红色方框与蓝色三角形都落在了红色圆圈里面,滞销书绿色圆与黄色四边形大部分落在绿色圆圈里面,可以看出畅销书的价格在40-100之间,折扣价在0-30之间。
影响机器学习书畅销的主要因素是:1.折扣价格,2.偏实战还是理论,3.当前价格,4.深度学习还是传统机器学习,5.国外还是国内作者。
偏实战方面,偏深度学习方面,还有就是价格与折扣价适中的图书比较受欢迎。
如果某出版社要出版机器学习的书籍,我的建议是:内容上多出一些偏实战与深度学习方面的图书,尤其是TensorFlow框架相关的;质量方面,最好是精装版、印刷比较好的。
有待改进的地方
特征其实太少,如果可以的话,多抓取一下其它的特征(比如书是否带有视频教程、作者的出名程度、是否配有网课等等)。
import pandas as pd
import numpy as np
import sklearn as sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn import tree
from wordcloud import WordCloud, ImageColorGenerator
import jieba
from sklearn.ensemble import RandomForestClassifier
from PIL import Image
import matplotlib.pyplot as plt
def preprocessing():
    """Build the feature table for the tree-based importance analysis.

    Reads ``data.csv`` from the working directory and overwrites
    ``final_data.csv`` with one row per remaining book and these columns:

    * ``price`` -- current price binned on (0, 20], (20, 40], (40, 500]
      into the labels "0", "1", "2".
    * ``discount_price`` -- ``old_price - price`` binned on (0, 5], (5, 15],
      (15, 500] into the labels "0", "1", "2".
    * ``practice_theory_category`` -- 1 practice, 2 theory, 3 both, decided
      by keywords found in the title.
    * ``traditional_deep_category`` -- 1 if the title contains a deep
      learning keyword, otherwise 0.
    * ``is_author_foreigner`` -- 1 if the author field contains both ``(``
      and ``)``, otherwise 0.
    * ``selling_top`` -- comments per month binned on (0, 10], (10, 100],
      (100, 500], (500, 1000000] into "4", "3", "2", "1"
      (1 hot, 2 best selling, 3 ordinary, 4 unsalable).

    Rows are dropped when the publish date is empty, when the author or the
    title mentions 周志华 or 李航, when the monthly comment count is zero, or
    when any output column ends up missing (value outside every bin, or a
    title with no practice/theory keyword).
    """
    df = pd.read_csv('data.csv', encoding='utf-8')
    print(df.loc[:, 'publish_date'].values)

    # publish_date is stored like "['2019-05']": strip the list punctuation
    # and treat an empty remainder as a missing date.
    cleaned = df['publish_date'].str.replace(r"[\[\]']", '', regex=True)
    df['publish_date'] = cleaned.mask(cleaned == '')
    df = df.dropna(subset=['publish_date'])

    # Exclude the star authors. The names must be joined into ONE pattern:
    # the second positional argument of str.contains is `case`, not a
    # second pattern, so passing two strings only ever matched the first.
    famous = '周志华|李航'
    is_famous = (df['author'].str.contains(famous, na=False)
                 | df['title'].str.contains(famous, na=False))
    df = df[~is_famous]

    df = df[['commentCount', 'old_price', 'price', 'shop', 'publish_date',
             'author', 'title']].copy()

    # Comments per 30-day month between publication and July 2020.
    published = pd.to_datetime(df['publish_date'], format='%Y-%m')
    months_on_sale = (pd.Timestamp('2020-07') - published).dt.days / 30
    df['comment_count_monthly'] = df['commentCount'].div(months_on_sale)

    # Discount, binned: 0 low, 1 medium, 2 high.
    df['discount_price'] = pd.cut(df['old_price'] - df['price'],
                                  [0, 5, 15, 500], labels=['0', '1', '2'])

    # Book type from title keywords: 1 practice, 2 theory, 3 both,
    # NaN when neither kind of keyword is present.
    title = df['title']
    is_practice = title.str.contains('实战|实践|案例|动手|应用|代码', na=False)
    is_theory = title.str.contains('理论|基础|数学|原理|算法', na=False)
    category = pd.Series(np.nan, index=df.index)
    category[is_practice] = 1
    category[is_theory] = 2
    category[is_practice & is_theory] = 3
    df['practice_theory_category'] = category

    # 1 deep learning, 0 traditional machine learning.
    is_deep = title.str.contains('深度学习|GAN|对抗网络|TensorFlow', na=False)
    df['traditional_deep_category'] = is_deep.astype(int)

    # 1 when the author field carries a parenthesised part, else 0.
    author = df['author']
    has_parens = (author.str.contains('(', regex=False, na=False)
                  & author.str.contains(')', regex=False, na=False))
    df['is_author_foreigner'] = has_parens.astype(int)

    df = df[df['comment_count_monthly'] != 0]
    df = df.sort_values(by='comment_count_monthly', ascending=False)

    df['selling_top'] = pd.cut(df['comment_count_monthly'],
                               [0, 10, 100, 500, 1000000],
                               labels=['4', '3', '2', '1'])
    # Price, binned: 0 low, 1 medium, 2 high.
    df['price'] = pd.cut(df['price'], [0, 20, 40, 500],
                         labels=['0', '1', '2'])

    df = df[['price', 'discount_price', 'practice_theory_category',
             'traditional_deep_category', 'is_author_foreigner',
             'selling_top']].dropna()
    df.to_csv('final_data.csv', encoding='utf-8', index=False)
def preprocessing2():
    """Build the feature table with numeric price and discount columns.

    Reads ``data.csv`` from the working directory and overwrites
    ``final_data.csv``. The output columns are ``price`` (unchanged),
    ``discount_price`` (``old_price - price``, not binned),
    ``practice_theory_category`` (1 practice, 2 theory, 3 both),
    ``traditional_deep_category`` (1 deep learning, 0 traditional),
    ``is_author_foreigner`` (1 if the author field contains both ``(`` and
    ``)``, else 0) and ``selling_top`` (comments per month binned on
    (0, 10], (10, 100], (100, 500], (500, 1000000] into "4", "3", "2", "1").

    Rows are dropped when the publish date is empty, when the author or the
    title mentions 周志华 or 李航, when the monthly comment count is zero, or
    when any output column is missing.
    """
    df = pd.read_csv('data.csv', encoding='utf-8')
    print(df.loc[:, 'publish_date'].values)

    # publish_date is stored like "['2019-05']": strip the list punctuation
    # and treat an empty remainder as a missing date.
    cleaned = df['publish_date'].str.replace(r"[\[\]']", '', regex=True)
    df['publish_date'] = cleaned.mask(cleaned == '')
    df = df.dropna(subset=['publish_date'])

    # Exclude the star authors. The names must be joined into ONE pattern:
    # the second positional argument of str.contains is `case`, not a
    # second pattern, so passing two strings only ever matched the first.
    famous = '周志华|李航'
    is_famous = (df['author'].str.contains(famous, na=False)
                 | df['title'].str.contains(famous, na=False))
    df = df[~is_famous]

    df = df[['commentCount', 'old_price', 'price', 'shop', 'publish_date',
             'author', 'title']].copy()

    # Comments per 30-day month between publication and July 2020.
    published = pd.to_datetime(df['publish_date'], format='%Y-%m')
    months_on_sale = (pd.Timestamp('2020-07') - published).dt.days / 30
    df['comment_count_monthly'] = df['commentCount'].div(months_on_sale)

    # Raw discount amount; deliberately left numeric in this variant.
    df['discount_price'] = df['old_price'] - df['price']

    # Book type from title keywords: 1 practice, 2 theory, 3 both,
    # NaN when neither kind of keyword is present.
    title = df['title']
    is_practice = title.str.contains('实战|实践|案例|动手|应用|代码', na=False)
    is_theory = title.str.contains('理论|基础|数学|原理|算法', na=False)
    category = pd.Series(np.nan, index=df.index)
    category[is_practice] = 1
    category[is_theory] = 2
    category[is_practice & is_theory] = 3
    df['practice_theory_category'] = category

    # 1 deep learning, 0 traditional machine learning.
    is_deep = title.str.contains('深度学习|GAN|对抗网络|TensorFlow', na=False)
    df['traditional_deep_category'] = is_deep.astype(int)

    # 1 when the author field carries a parenthesised part, else 0.
    author = df['author']
    has_parens = (author.str.contains('(', regex=False, na=False)
                  & author.str.contains(')', regex=False, na=False))
    df['is_author_foreigner'] = has_parens.astype(int)

    df = df[df['comment_count_monthly'] != 0]
    df = df.sort_values(by='comment_count_monthly', ascending=False)

    df['selling_top'] = pd.cut(df['comment_count_monthly'],
                               [0, 10, 100, 500, 1000000],
                               labels=['4', '3', '2', '1'])

    df = df[['price', 'discount_price', 'practice_theory_category',
             'traditional_deep_category', 'is_author_foreigner',
             'selling_top']].dropna()
    df.to_csv('final_data.csv', encoding='utf-8', index=False)
def preprocessing1():
    """Build the table of the 150 most-commented books for per-column charts.

    Reads ``data.csv`` from the working directory and overwrites
    ``final_data.csv`` with the columns ``url``, ``price``, ``title``,
    ``shop``, ``author``, ``practice_theory_category``,
    ``traditional_deep_category``, ``is_author_foreigner`` and
    ``comment_count_monthly``. Categories are written as readable strings:

    * ``price`` -- "low", "middle", "high" for (0, 40], (40, 80], (80, 500].
    * ``practice_theory_category`` -- "practice", "theory", "both", or an
      empty string when the title has no matching keyword.
    * ``traditional_deep_category`` -- "deep" or "traditional"; titles that
      match neither keyword list are left missing and therefore dropped.
    * ``is_author_foreigner`` -- "foreigner" if the author field contains
      both ``(`` and ``)``, otherwise "native".

    Rows are dropped when the publish date is empty, when the author or the
    title mentions 周志华 or 李航, when the monthly comment count is zero, or
    when any output column is missing. The remaining rows are sorted by
    monthly comment count, descending, and only the first 150 are kept.
    """
    df = pd.read_csv('data.csv', encoding='utf-8')
    print(df.loc[:, 'publish_date'].values)

    # publish_date is stored like "['2019-05']": strip the list punctuation
    # and treat an empty remainder as a missing date.
    cleaned = df['publish_date'].str.replace(r"[\[\]']", '', regex=True)
    df['publish_date'] = cleaned.mask(cleaned == '')
    df = df.dropna(subset=['publish_date'])

    # Exclude the star authors. The names must be joined into ONE pattern:
    # the second positional argument of str.contains is `case`, not a
    # second pattern, so passing two strings only ever matched the first.
    famous = '周志华|李航'
    is_famous = (df['author'].str.contains(famous, na=False)
                 | df['title'].str.contains(famous, na=False))
    df = df[~is_famous]

    df = df[['commentCount', 'old_price', 'price', 'shop', 'publish_date',
             'author', 'title', 'url']].copy()

    # Comments per 30-day month between publication and July 2020.
    published = pd.to_datetime(df['publish_date'], format='%Y-%m')
    months_on_sale = (pd.Timestamp('2020-07') - published).dt.days / 30
    df['comment_count_monthly'] = df['commentCount'].div(months_on_sale)

    # Book type from title keywords; '' when no keyword matches.
    title = df['title']
    is_practice = title.str.contains('实战|实践|案例|实例|动手|应用|代码',
                                     na=False)
    is_theory = title.str.contains('理论|基础|数学|原理|算法', na=False)
    kind = pd.Series('', index=df.index, dtype=object)
    kind[is_practice] = 'practice'
    kind[is_theory] = 'theory'
    kind[is_practice & is_theory] = 'both'
    df['practice_theory_category'] = kind

    # Deep-learning keywords win over the traditional ones; titles matching
    # neither stay NaN so the final dropna() removes them.
    is_traditional = title.str.contains('机器学习|贝叶斯', na=False)
    is_deep = title.str.contains(
        '深度|GAN|对抗|TensorFlow|强化学习|目标检测|神经网络', na=False)
    field = pd.Series(np.nan, index=df.index, dtype=object)
    field[is_traditional] = 'traditional'
    field[is_deep] = 'deep'
    df['traditional_deep_category'] = field

    # 'foreigner' when the author field carries a parenthesised part.
    author = df['author']
    has_parens = (author.str.contains('(', regex=False, na=False)
                  & author.str.contains(')', regex=False, na=False))
    df['is_author_foreigner'] = np.where(has_parens, 'foreigner', 'native')

    df = df[df['comment_count_monthly'] != 0]
    df = df.sort_values(by='comment_count_monthly', ascending=False)

    df['price'] = pd.cut(df['price'], [0, 40, 80, 500],
                         labels=['low', 'middle', 'high'])

    df = df[['url', 'price', 'title', 'shop', 'author',
             'practice_theory_category', 'traditional_deep_category',
             'is_author_foreigner', 'comment_count_monthly']].dropna()
    df = df.head(150)
    df.to_csv('final_data.csv', encoding='utf-8', index=False)
def show_bar_diagram(x_data, y_data):
    """Show a bar chart of ``y_data`` against ``x_data``.

    The chart is titled '特征的重要性' and displayed with ``plt.show()``.

    Args:
        x_data: bar positions/labels, passed to ``plt.bar`` as ``x``.
        y_data: bar heights, passed to ``plt.bar`` as ``height``.
    """
    # SimHei lets matplotlib render the Chinese title; unicode_minus is
    # turned off alongside it.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })
    plt.bar(x=x_data, height=y_data, alpha=0.8, color='steelblue')
    plt.title('特征的重要性')
    plt.legend()
    plt.show()
def DecisionTree_important_feacture():
    """Fit a decision tree on ``final_data.csv`` and chart feature importances.

    The features ``price``, ``discount_price``, ``practice_theory_category``,
    ``traditional_deep_category`` and ``is_author_foreigner`` are vectorised
    with a ``DictVectorizer`` (sparse output). An entropy-criterion
    ``DecisionTreeClassifier`` is fitted against ``selling_top``. The fitted
    ``feature_importances_`` are printed and passed to ``show_bar_diagram``.
    """
    data = pd.read_csv('final_data.csv', encoding='utf-8')
    vec = sklearn.feature_extraction.DictVectorizer(sparse=True)

    feature = data[['price', 'discount_price', 'practice_theory_category',
                    'traditional_deep_category', 'is_author_foreigner']]
    # One dict per row. The orient must be spelled 'records' in full:
    # pandas 2.0 no longer accepts the abbreviation 'record'.
    X_train = vec.fit_transform(feature.to_dict(orient='records'))

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back on older releases.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()

    print('show feature\n', feature)
    print('show vector\n', X_train)
    print('show vector name\n', feature_names)
    print('show vector name\n', vec.vocabulary_)

    Y_train = data['selling_top']
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    clf.fit(X_train, Y_train)
    print('feature_importances_:', clf.feature_importances_)
    show_bar_diagram(feature_names, clf.feature_importances_)
def RandomForestClassifier_important_feacture():
    """Fit a random forest on ``final_data.csv`` and chart feature importances.

    Only the three categorical features ``practice_theory_category``,
    ``traditional_deep_category`` and ``is_author_foreigner`` are used. They
    are vectorised with a dense ``DictVectorizer``, and a
    ``RandomForestClassifier(random_state=42)`` is fitted against
    ``selling_top``. The fitted ``feature_importances_`` are printed and
    passed to ``show_bar_diagram``.
    """
    data = pd.read_csv('final_data.csv', encoding='utf-8')
    vec = sklearn.feature_extraction.DictVectorizer(sparse=False)

    feature = data[['practice_theory_category', 'traditional_deep_category',
                    'is_author_foreigner']]
    # One dict per row. The orient must be spelled 'records' in full:
    # pandas 2.0 no longer accepts the abbreviation 'record'.
    X_train = vec.fit_transform(feature.to_dict(orient='records'))

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back on older releases.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()

    print('show feature\n', feature)
    print('show vector\n', X_train)
    print('show vector name\n', feature_names)
    print('show vector name\n', vec.vocabulary_)

    Y_train = data['selling_top']
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, Y_train)
    print('feature_importances_:', clf.feature_importances_)
    show_bar_diagram(feature_names, clf.feature_importances_)
def compare_in_certain_column(columnName, title, rotation = 0):
    """Show a bar chart of how many rows fall into each value of a column.

    Reads ``final_data.csv``, counts the rows per distinct value of
    ``columnName``, prints the resulting ``{value: count}`` dict and draws
    one bar per value with the count written at the top of the bar.

    Args:
        columnName: column of ``final_data.csv`` to group by.
        title: chart title.
        rotation: rotation passed to ``plt.xticks`` for the tick labels.
    """
    # SimHei lets matplotlib render the Chinese labels.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    table = pd.read_csv('final_data.csv', encoding='utf-8')
    counts = table.groupby([columnName])[columnName].agg('count').to_dict()
    print(counts)

    x_data = list(counts)
    y_data = list(counts.values())
    plt.bar(x=x_data, height=y_data, color='steelblue', alpha=0.8)

    # Write each count at the top of its bar.
    for category, count in counts.items():
        plt.text(category, count, str(count))

    plt.xticks(x_data, x_data, rotation=rotation)
    plt.title(title)
    plt.ylabel("数量")
    plt.legend()
    plt.show()
def title_word_cloud():
    """Show a word cloud built from the ``title`` column of ``final_data.csv``.

    All titles are concatenated, segmented with ``jieba.cut`` and the
    space-separated tokens are fed to ``WordCloud``. The object returned by
    ``jieba.cut`` and the segmented text are both printed before the image
    is displayed.
    """
    frame = pd.read_csv('final_data.csv', encoding='utf-8')
    # Each title is preceded by a single space, exactly as the segmenter
    # input was built before.
    text = ''.join(' ' + title for title in frame['title'].tolist())

    # Chinese word segmentation.
    print(jieba.cut(text))
    text = ' '.join(jieba.cut(text))
    print(text)

    cloud = WordCloud(
        font_path='simsun.ttc',
        width=800,
        height=600,
        mode='RGBA',
        background_color=None,
    )
    wc = cloud.generate(text)

    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()
def title_word_cloud_pic():
    """Show a word cloud of the titles, shaped and coloured by ``3.png``.

    Titles come from the ``title`` column of ``final_data.csv``. They are
    segmented with ``jieba.cut``, and the image ``3.png`` is used both as the
    ``WordCloud`` mask and as the colour source for recolouring.
    """
    frame = pd.read_csv('final_data.csv', encoding='utf-8')
    # Each title is preceded by a single space before segmentation.
    joined = ''.join(' ' + title for title in frame['title'].tolist())
    text = ' '.join(jieba.cut(joined))

    mask = np.array(Image.open("3.png"))
    cloud = WordCloud(
        mask=mask,
        font_path='simsun.ttc',
        mode='RGBA',
        background_color='white',
    )
    wc = cloud.generate(text)

    # Take the word colours from the mask image.
    wc.recolor(color_func=ImageColorGenerator(mask))

    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
def scatter_plot(dim1, dim2):
    """Scatter two numeric columns of ``final_data.csv`` by ``selling_top``.

    Only rows with ``dim1 < 160`` and ``dim2 > 0`` are plotted. Each
    ``selling_top`` level from 1 to 4 gets its own marker, colour and legend
    entry (hot, best_selling, ordinary, unsalable).

    Args:
        dim1: column plotted on the x axis.
        dim2: column plotted on the y axis.
    """
    frame = pd.read_csv('final_data.csv', encoding='utf-8')
    frame = frame[(frame[dim1] < 160) & (frame[dim2] > 0)]

    selling_top_dic = {1: 'hot', 2: 'best_selling', 3: 'ordinary', 4: 'unsalable'}
    print(selling_top_dic[1])

    # selling_top level -> (marker, colour); each level is drawn separately
    # so it gets its own style and legend entry.
    styles = {1: ('s', 'r'), 2: ('>', 'b'), 3: ('o', 'g'), 4: ('d', 'y')}
    for level, (marker, color) in styles.items():
        subset = frame[frame['selling_top'] == level]
        plt.scatter(subset[dim1], subset[dim2],
                    marker=marker, c=color, label=selling_top_dic[level])

    plt.xlabel(dim1)
    plt.ylabel(dim2)
    plt.legend()
    plt.show()
def show_scatter_diagram():
    """Scatter ``price`` against ``discount_price`` from ``final_data.csv``.

    Point colours are taken from the ``selling_top`` column.
    """
    frame = pd.read_csv('final_data.csv', encoding='utf-8')
    plt.scatter(
        x=frame['price'],
        y=frame['discount_price'],
        c=frame['selling_top'],
        marker='o',
    )
    plt.xlabel('price')
    plt.ylabel('discount_price')
    plt.show()
if __name__ == '__main__':
    # Script entry point. The analysis steps are switched on and off by
    # (un)commenting the calls below; every preprocessing variant rewrites
    # final_data.csv, which the later steps then read.
    #preprocessing() # prepare data for the random-forest / decision-tree feature-importance analysis
    #DecisionTree_important_feacture()
    #RandomForestClassifier_important_feacture()
    #preprocessing1() # prepare data for the per-column comparisons below
    #compare_in_certain_column('practice_theory_category','书的内容比较(实战还是理论)')
    #compare_in_certain_column('traditional_deep_category', '传统机器学习与深度学习比较')
    #compare_in_certain_column('shop', '出版社比较', -30)
    #compare_in_certain_column('is_author_foreigner', '国外作者与国内作者比较')
    #title_word_cloud()
    #title_word_cloud_pic()
    #show_scatter_diagram()
    #preprocessing2()
    scatter_plot('price','discount_price')