Ch02

Talk is cheap. Show me the code.

import json
from pandas import DataFrame,Series
import pandas as pd
#import numpy as np
import pylab 
###########
# Load the bit.ly usage log; each line of the file is parsed as one JSON value.
path = 'C:/Users/zydsb/Desktop/pydata-book-master/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# A context manager closes the file handle as soon as parsing is done; the
# bare open() inside the comprehension left that to the garbage collector.
with open(path) as log_file:
    records = [json.loads(line) for line in log_file]
###########
# Only records that actually carry a 'tz' key contribute a time zone.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
#use Python to count
def get_counts(sequence):
    """Count how many times each item occurs in *sequence*.

    Args:
        sequence: iterable of hashable items.

    Returns:
        A dict mapping every distinct item to its number of occurrences.
        An empty iterable yields an empty dict.
    """
    counts = {}
    for item in sequence:
        # dict.get supplies 0 the first time an item is seen.
        counts[item] = counts.get(item, 0) + 1
    return counts
# get_counts returns a dict keyed by time zone, so indexing it with a zone
# name yields the number of records seen for that zone.
num = get_counts(time_zones)
# Parenthesised print works on both Python 2 and Python 3.
print(num['America/New_York'])
def top_counts(count_dict, n=10):
    """Return the *n* entries of *count_dict* with the highest counts.

    Args:
        count_dict: mapping of key -> count.
        n: maximum number of entries to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        most frequent entry is last. Empty when ``n <= 0`` or the mapping
        is empty.
    """
    if n <= 0:
        # Without this guard the slice [-0:] would return the whole list
        # and a negative n would drop leading items instead.
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]
# Parenthesised print works on both Python 2 and Python 3.
print(top_counts(num))
###########
# use pandas
# Build a DataFrame from the list of parsed records.
frame = DataFrame(records)
# value_counts() tallies each distinct 'tz' value; keep the first ten rows.
tz_counts = frame['tz'].value_counts()
tz_num = tz_counts[:10]
print(tz_num)
###########
#plot
# Give records with no 'tz' value an explicit label.
clean_tz = frame['tz'].fillna("Missing")
# Records whose 'tz' is an empty string get their own label as well.
clean_tz[clean_tz == ""] = "Unknown"
tz_counts = clean_tz.value_counts()
tz_counts01 = tz_counts[:10]
print(tz_counts01)
# Plot the slice computed above rather than slicing a second time.
# NOTE(review): when this runs as a plain script the figure probably needs an
# explicit pylab.show() to appear; confirm how the file is meant to be run.
tz_counts01.plot(kind="barh", rot=0)

有些代码直接在 .py 文件中运行是不支持的!在命令行环境(如 IPython)中逐条输入则可以运行。
示例如下:
%run "c:\users\zydsb\appdata\local\temp\tmpvrjjhh.py"
c:\users\zydsb\appdata\local\temp\tmpvrjjhh.py:8: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators; you can avoid this warning by specifying engine='python'.
  user = pd.read_table('C:/Users/zydsb/Desktop/pydata-book-master/ch02/movielens/users.dat',sep="::",header = None,names = unames)


  1. 在IPython中,ctrl+p可以向前搜索之前键入的命令
  2. 符号_会保存之前的函数结果,以预防出现函数结果没有赋值给变量的情况
  3. %magic魔法命令帮助文档 ,以后应该会用
  4. tab自动完成,防止忘记变量,函数或方法之类
 # -*- coding: utf-8 -*-
#MovieLens
import pandas as pd
import json
from pandas import DataFrame,Series
# Locations of the three MovieLens data files; they share one directory.
# NOTE(review): the ratings file is named 'rating.dat' here -- confirm that
# this matches the file actually present on disk.
path_user, path_movie, path_rating = (
    'C:/Users/zydsb/Desktop/pydata-book-master/ch02/movielens/' + data_file
    for data_file in ('users.dat', 'movies.dat', 'rating.dat')
)
#unames = ["user_id","gender","age","occa","zip"]
#user = pd.read_table(path_user,sep="::",header = None,names = unames,engine="python")
#rname = ["user_id","movie_id","rating","times"]
#rating = pd.read_table(path_rating,sep = "::",header = None, names = rname,engine="python")



#####Baby names
path = 'C:/Users/zydsb/Desktop/pydata-book-master/ch02/names/yob2000.txt'
# Raw lines of the file, read through a context manager so the handle is
# closed; `records` and `frame` are kept for any later code that uses them.
with open(path) as names_file:
    records = list(names_file)
frame = DataFrame(records)
# read_csv expects a path or file-like object, not a DataFrame, so parse the
# file directly from its path.
names_2000 = pd.read_csv(path, names=['name', 'sex', 'birth'])
# Sum of the 'birth' column for each value of 'sex'.
names_2000_group = names_2000.groupby('sex').birth.sum()

你可能感兴趣的:(Ch02)