实例中的所有数据都是在GitHub上下载的,打包下载即可。
地址是:http://github.com/pydata/pydata-book
我使用的是Python2.7,书中的代码有一些有错误,我使用自己的2.7版本调通。
# coding: utf-8
import json
path = 'D:\Source Code\pydata-book-master\ch02\usagov_bitly_data2012-03-16-1331923249.txt'
records = [json.loads(line) for line in open(path)]
records[0]
print records[0]['tz']
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:10]
from collections import defaultdict
def get_count(sequence):
counts = defaultdict(int)
for x in sequence:
counts[x] += 1;
return counts
def top_count(count_dict, n=10):
value_key_pairs = [(count,tz) for tz, count in count_dict.items()]
value_key_pairs.sort()
return value_key_pairs[-n:]
counts = get_count(time_zones)
counts['America/New_York']
len(time_zones)
top_count(counts)
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
frame = DataFrame(records)
frame
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]
tz_counts[:10].plot(kind='barh',rot=0)
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
results.value_counts()[:8]
cframe = frame[frame.a.notnull()]
cframe
operating_system = np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')
operating_system[:5]
by_tz_os = cframe.groupby(['tz',operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
indexer = agg_counts.sum(1).argsort()
indexer[:10]
count_subset = agg_counts.take(indexer)[-10:]
count_subset
count_subset.plot(kind='barh', stacked=True)