Reading notes: "Python for Data Analysis", Chapter 2 — "MovieLens 1M dataset (part 1)"
(Note: despite the title, the code below is the usa.gov bit.ly example from the same chapter.)


Pages: 21-29

#coding=UTF-8
import json
from collections import defaultdict
from collections import Counter
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
from matplotlib import pylab, mlab, pyplot
from pylab import *
from numpy import *
'''
Read the file line by line (each line is one JSON record) and collect the
value of the time-zone field ('tz') from every record that has one.
'''
path = 'C:/pytm/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# Use a context manager so the file handle is closed deterministically
# (the original relied on the garbage collector to close it).
with open(path) as f:
    records = [json.loads(line) for line in f]
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
# Print the FIRST 10 time-zone values ([:10] slices the first ten; the
# original comment claimed "the last 10").
print(time_zones[:10])
# Collect the 't' field of every record that has one. NOTE(review): the
# original labelled this "phone number", but in the usa.gov bit.ly data
# 't' looks like a timestamp — confirm before relying on the name.
telephone = [rec['t'] for rec in records if 't' in rec]

'''
Count how many times each value occurs in a sequence. The book shows two
ways: a plain dict with an explicit membership test (get_counts1), or a
defaultdict(int) so unseen keys start at zero (kept below).
'''

def get_counts2(sequence):
    """Return a mapping {value: number of occurrences} for *sequence*.

    Uses collections.defaultdict(int) so every unseen key starts at 0,
    which removes the explicit ``if x in counts`` test of the dict-based
    version from the book.
    """
    counts = defaultdict(int)
    for x in sequence:
        counts[x] += 1
    return counts

# Count each time zone, then report one zone's count and the total number
# of time-zone values. Single-argument print(...) behaves identically
# under Python 2 and Python 3.
counts = get_counts2(time_zones)
print(counts['America/New_York'])
print(len(time_zones))


'''
Report the 10 most common time-zone values. The book first shows a
hand-rolled top_counts() that sorts (count, tz) pairs and slices the
tail; collections.Counter.most_common() does the same job directly.
'''
counts = Counter(time_zones)
print(counts.most_common(10))

'''
Count the time zones with pandas: load the records into a DataFrame and
use value_counts() on the 'tz' column.
'''
frame = DataFrame(records)
print(frame['tz'][:10])

tz_counts = frame['tz'].value_counts()  # occurrences per time zone, descending
print(tz_counts[:])

'''
Plot with matplotlib.
Step 1: fill unknown/missing time zones with a placeholder via fillna(),
and replace empty strings using boolean-array indexing.
'''
clean_tz = frame['tz'].fillna('Missing')
# Fix: the original used the misspelled placeholder 'Unknow'.
clean_tz[clean_tz == ''] = 'Unknown'
# (The original computed value_counts() twice in a row; once is enough.)
tz_counts = clean_tz.value_counts()
print(tz_counts[:10])

tz_counts[:10].plot(kind='barh', rot=0)
# show()

# First whitespace-separated token of the 'a' (user-agent) field for each
# record where it is present.
results = Series([x.split()[0] for x in frame.a.dropna()])
print(results[:5])
print(results.value_counts()[:8])


'''
Split the records into Windows and non-Windows users. For simplicity,
any agent string containing 'Windows' counts as a Windows user; records
whose agent field ('a') is missing are dropped first.
'''
cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')
print(operating_system[:5])

'''
Group the data by time zone and the derived OS label, count each group
with size() (analogous to value_counts() above), and reshape the counts
with unstack(), filling the gaps with 0.
'''
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts[:5])

'''
Finally, select the most frequently occurring time zones. To do that,
build an indexer from the row totals of agg_counts.
'''
# argsort() gives the row positions in ascending order of the totals.
indexer = agg_counts.sum(1).argsort()
print(indexer[:10])

# take() reorders the rows by that indexer; slice off the LAST 10 rows,
# i.e. the most common time zones. Fix: the original omitted the [-10:]
# slice that its own comment (and the book's code) promised, so it kept
# every time zone.
count_subset = agg_counts.take(indexer)[-10:]
print(count_subset[-1:])

# Stacked horizontal bar chart of the raw counts (stacked=True).
count_subset.plot(kind='barh', stacked=True)
show()

# The smaller groups are hard to read at this scale; normalise each row
# so it sums to 1 and plot again.
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)
show()




You may also be interested in: (data mining)