利用python进行数据分析C02.P21-P29
- 利用python进行数据分析C02.P21-P29
-
- 用纯python代码对时区进行计数
- 用pandas对时区进行计数
- 统计Windows和非Windows用户数
- 根据时区和操作系统进行数据分组
用纯python代码对时区进行计数
import json
# Input file; judging by the per-line json.loads below it holds one JSON
# document per line (presumably the bit.ly/usa.gov sample -- confirm).
path = 'D:/document/Python/ziliao/kebendaima/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'

# Peek at the first raw line. The context manager closes the handle instead
# of leaving it to the garbage collector.
with open(path) as f:
    f.readline()

# Parse every line into a dict, again closing the file once reading is done.
with open(path) as f:
    records = [json.loads(line) for line in f]
# --- begin: scratch demo of json.dumps / json.loads ---
data = [{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}]
# Keep the encoded text under its own name: binding it to ``json`` would
# shadow the module and break every later json.dumps / json.loads call.
json_text = json.dumps(data)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(json_text)
print(json.dumps({'a': 'Runoob', 'b': 7}, sort_keys=True, indent=4, separators=(',', ':')))
jsonData = '{"a":1,"b":2,"c":3,"d":4,"e":5}'
text = json.loads(jsonData)
print(text)
# --- end ---
# Pull out the 'tz' value of each record, skipping records without that key.
time_zones = [entry['tz'] for entry in records if 'tz' in entry]
def get_counts(sequence):
    """Count how many times each item occurs in ``sequence``.

    Args:
        sequence: Iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences. Empty when ``sequence`` is empty.
    """
    counts = {}
    for x in sequence:
        # dict.get supplies 0 the first time an item is seen, which replaces
        # the explicit membership test.
        counts[x] = counts.get(x, 0) + 1
    return counts
from collections import defaultdict
def get_counts2(sequence):
    """Count occurrences of each item in ``sequence`` using a defaultdict.

    Args:
        sequence: Iterable of hashable items.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences. Looking up an unseen key yields 0 and inserts it.
    """
    counts = defaultdict(int)  # unseen keys start at int() == 0
    for x in sequence:
        counts[x] += 1
    return counts
from collections import Counter
def get_counts3(sequence):
    """Count occurrences of each item in ``sequence`` using a Counter.

    Args:
        sequence: Iterable of hashable items.

    Returns:
        A ``collections.Counter`` mapping each distinct item to its number
        of occurrences.
    """
    counts = Counter()
    for x in sequence:
        # Increment the item itself. Counter.update(x) would iterate over x,
        # so a string item would be tallied character by character.
        counts[x] += 1
    return counts
# Tally the time zones twice; each call builds its own dict.
# NOTE(review): ``counts3`` is produced by get_counts, not get_counts3 --
# confirm that is intended.
counts = get_counts(time_zones)
counts3 = get_counts(time_zones)

# Look up a single zone (raises KeyError if it never occurred).
counts['Africa/Cairo']
def top_counts(count_dict, n=10):
    """Return the ``n`` largest entries of ``count_dict`` as (count, key) pairs.

    Args:
        count_dict: Mapping of key -> count.
        n: Maximum number of pairs to return. Defaults to 10.

    Returns:
        A list of ``(count, key)`` tuples in ascending order, so the largest
        count comes last and ties on count are ordered by key. The list is
        shorter than ``n`` when the mapping has fewer entries, and empty
        when ``n`` is not positive.
    """
    if n <= 0:
        # A [-0:] slice would return the whole list rather than nothing.
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]
import operator
from collections import Counter

# Top 10 zones (the default n) as ascending (count, tz) pairs.
top_counts(counts)

# The same ranking through sorted(): first with a lambda key, then with
# operator.itemgetter, ascending and descending by count.
sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
sorted(counts.items(), key=operator.itemgetter(1))
sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

# Counter does the tallying and the top-10 ranking itself.
counts2 = Counter(time_zones)
counts2.most_common(10)
用pandas对时区进行计数
from pandas import DataFrame,Series
import numpy as np
# One row per parsed record, one column per key seen in the records.
frame = DataFrame(records)
frame
frame['tz'][:10]

# Raw tally of the 'tz' column. value_counts skips NaN by default, so
# records without a time zone are not represented here.
tz_counts = frame['tz'].value_counts()
tz_counts[:10]

# Label absent values 'Missing' and empty strings 'Unknown'.
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
# Re-tally from the cleaned column; without this the plot below would still
# use the raw counts and the cleanup above would have no effect.
tz_counts = clean_tz.value_counts()
tz_counts[:10].plot(kind='barh',rot=0)

# First whitespace-separated token of each non-null 'a' value.
results = Series([x.split()[0] for x in frame.a.dropna()])
results.value_counts()[:8]
统计Windows和非Windows用户数
# Keep only the rows whose 'a' column is populated.
cframe = frame[frame['a'].notnull()]

# Label each remaining row by whether its 'a' value contains 'Windows'.
operating_system = np.where(
    cframe['a'].str.contains('Windows'),
    'Windows',
    'Not Windows',
)
根据时区和操作系统进行数据分组
# Group by time zone and by the Windows / Not Windows label array.
by_tz_os = cframe.groupby(['tz', operating_system])

# Group sizes pivoted so each label becomes a column; absent combinations
# are filled with 0.
agg_counts = (
    by_tz_os.size()
    .unstack()
    .fillna(0)
)

# Positions that would sort the per-row totals in ascending order.
indexer = agg_counts.sum(axis=1).argsort()
indexer[:10]
'''
d1 = DataFrame([[1, 2, 3],[4, 5, 6],[7, 8, 9]])
d1.sum(1).argsort() 结果0,1,2
d2 = DataFrame([[2, 3],[1, 0.5],[7, 8]])
d2.sum(1).argsort() 结果1,0,2
总结:sum(1)按行求和(沿列方向相加),d2各行之和为5、1.5、15;升序排列为1.5(原索引1)、5(原索引0)、15(原索引2),argsort返回这些原索引,即1,0,2
'''
# Reorder rows by ascending total and keep the last ten (the largest).
count_subset = agg_counts.take(indexer)[-10:]
count_subset

# Stacked horizontal bars of the absolute counts.
count_subset.plot(kind='barh', stacked=True)

# Divide each row by its own total so every bar sums to 1, then plot again.
normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)
normed_subset.plot(kind='barh', stacked=True)