利用IPython进行计数和画图

# -*- coding: utf-8 -*-
import json
path = '/home/wsx/文档/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# Each line of the file is parsed as one standalone JSON document. The
# context manager closes the file handle as soon as parsing finishes
# (the original bare open() call never closed it).
with open(path) as data_file:
    records = [json.loads(line) for line in data_file]

# Not every record carries a 'tz' key, so only keep those that do.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
#计数
#1
def get_counts(sequence):
    """Count how many times each item occurs in *sequence*.

    Returns a plain dict mapping each distinct item to its number of
    occurrences, with keys in first-seen order.
    """
    tally = {}
    for item in sequence:
        # .get() supplies 0 the first time an item is encountered.
        tally[item] = tally.get(item, 0) + 1
    return tally


#2
from collections import defaultdict

def get_counts2(sequence):
    """Count how many times each item occurs in *sequence*.

    Returns a ``defaultdict(int)`` mapping each distinct item to its
    number of occurrences.
    """
    tally = defaultdict(int)  # every missing key is initialised to 0
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally
 
    
# Tally the collected time zones with the hand-written dict-based counter.
counts = get_counts(time_zones)

# Get the top 10 time zones together with their counts.
def top_counts(count_dict, n=10):
    """Return the *n* largest ``(count, key)`` pairs from *count_dict*.

    Parameters:
        count_dict: mapping of key -> count.
        n: how many of the largest entries to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples in ascending order, so the most
        frequent entry comes last. Ties on count are ordered by key. An
        empty list is returned when *n* is zero or negative.
    """
    # Guard needed because a slice of [-0:] is [0:], which would return
    # every pair instead of none.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

    
#可以在Python标准库中找到collections.Counter类,它能使这个任务变得更简单
from collections import Counter
# Tally the time zones again with Counter; this rebinds the `counts` name.
counts = Counter(time_zones)
counts.most_common(10)  # bare expression: the result is only shown in an interactive session

#用pandas对时区进行计数
from pandas import DataFrame, Series
import pandas as pd; import numpy as np
# Build a DataFrame from the list of parsed records.
frame = DataFrame(records)
frame  # bare expression: displays the frame only in an interactive session
# Count the occurrences of each value in the 'tz' column.
tz_counts = frame['tz'].value_counts()
tz_counts[:10]  # first ten entries of the counts; displayed only interactively


# Next, produce a plot of this data with the plotting library (matplotlib).
# Before plotting, substitute a placeholder for time zones that are missing
# or unknown: fillna replaces the missing values, and the unknown
# (empty-string) values are replaced through boolean-array indexing.
clean_tz = frame['tz'].fillna('Missing')
# Label fixed from the misspelled 'Unkonwn', which would otherwise appear
# in the counts index and on the plot axis.
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]  # bare expression: displayed only in an interactive session
tz_counts[:10].plot(kind='barh', rot=0)

你可能感兴趣的:(利用IPython进行计数和画图)