统计各类商品的发贴量,画出柱状图。
从url中获取商品类别
# Load the crawled item URLs and extract each item's category from its URL.
table = mongoset('ganji', 'itemurls')  # access the `itemurls` data table

# One record per kept URL: {'iclass': <category>, 'url': <item URL>}.
# (The unused `dict = {}` placeholder was dropped: it shadowed the builtin.)
datadict = []
for i in table.find():
    # For a URL such as 'http://sh.ganji.com/shouji/2170757686x.htm',
    # split('/') yields ['http:', '', 'sh.ganji.com', 'shouji', ...], so
    # index 3 is the category segment ('shouji'), not the host name.
    itemclass = i['itemurl'].split('/')[3]
    # Some URLs point at zhuanzhuan items; filter those out. The length
    # check presumably rejects segments that are not real category names
    # -- TODO confirm against the stored URLs.
    if len(itemclass) < 20 and itemclass != 'Mzhuanzhuan':
        data = {
            'iclass': itemclass,
            'url': i['itemurl'],
        }
        datadict.append(data)

# Category of every kept item, one entry per item; duplicates are kept so
# that they can be counted afterwards.
classlist = [record['iclass'] for record in datadict]
# Distinct categories. The order comes from set iteration and is arbitrary.
classidx = list(set(classlist))
print(classidx)
classidx打印如下
['fushixiaobaxuemao', 'jiadian', 'shouji', 'xuniwupin', 'zixingchemaimai', 'ruanjiantushu', 'bangong', 'shoucangpin', 'jiaju', 'xianzhilipin', 'nongyongpin', 'yingyouyunfu', 'meironghuazhuang', 'shuma', 'diannao', 'motuoche', 'baojianpin', 'laonianyongpin', 'rirongbaihuo']
[itemurls]表中的部分数据
{'_id': ObjectId('57786d9084a5fd53c0c0b2a6'), 'itemurl': 'http://sh.ganji.com/shouji/2170757686x.htm'}
{'_id': ObjectId('57786d9084a5fd53c0c0b2a7'), 'itemurl': 'http://sh.ganji.com/shouji/2091775260x.htm'}
{'_id': ObjectId('57786d9084a5fd53c0c0b2a8'), 'itemurl': 'http://sh.ganji.com/shouji/2154806461x.htm'}
{'_id': ObjectId('57786d9084a5fd53c0c0b2ac'), 'itemurl': 'http://sh.ganji.com/shouji/2156209920x.htm'}
统计各类别商品的发贴数量
# Count the posts in each category, aligned index-for-index with classidx.
from collections import Counter

# Tally every category in a single pass over classlist instead of
# rescanning the whole list with list.count() once per category.
_class_counts = Counter(classlist)
classamount = [_class_counts[name] for name in classidx]
print(classamount)
classamount打印如下
[1744, 2858, 1773, 867, 1664, 2729, 4569, 3276, 1656, 3490, 468, 1638, 1377, 907, 2157, 270, 982, 1377, 1770]
将数据转换成用于highcharts 绘图的格式
def chartformat(name, data, typestr):
    """Convert parallel name/value sequences into highcharts series data.

    Each entry of ``name`` is paired with the entry of ``data`` at the same
    position, and one series dict is produced per pair. The value is
    converted with ``int`` and wrapped in a one-element list; ``typestr``
    is stored as the ``'type'`` of every series. The resulting list is
    printed and then returned.
    """
    series = [
        {'name': label, 'data': [int(value)], 'type': typestr}
        for label, value in zip(name, data)
    ]
    print(series)
    return series
# Build one 'column' series per category from the names and their counts.
data = chartformat(name=classidx, data=classamount, typestr='column')
# Draw the chart, passing show='inline' to charts.plot.
charts.plot(data, show='inline')
各类目发贴量统计柱状图
按上面打印的 classidx 与 classamount 对应来看,发贴量最大的是办公(bangong)类目,共 4569 条;家具(jiaju)类目为 1656 条
总结
- 学习jupyter 与highcharts的使用,很赞的绘图工具