《利用Python进行数据分析》第二章-学习笔记

注:使用的软件版本为 Anaconda3 4.3.1

#python常用基础包
import matplotlib.pyplot as plt
import pylab as py
import math as m
import scipy.stats as stats
import numpy as np
import pandas as pd
# Load the 1880 data; the column names name, sex and births are supplied
# explicitly through names=.
names1880=pd.read_csv('names/yob1880.txt',names=['name','sex','births'])
# Preview the first rows.
names1880.head()
  name sex births
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
# Total births in 1880 per sex: group the rows by sex and sum the births column.
names1880.groupby(['sex']).births.sum()
# Alternative form that aggregates the whole frame instead of only births:
# names1880.groupby(['sex']).sum()
sex
F     90993
M    110493
Name: births, dtype: int64
# Read one file per year (1880 through 2010), tag every row with its year,
# and stack all of them into a single DataFrame.
columns=['name','sex','births']
pieces=[]
years=range(1880,2011)
for year in years:
    path='names/yob%d.txt' % (year,)
    # assign() returns the frame with the extra 'year' column appended.
    frame=pd.read_csv(path,names=columns).assign(year=year)
    pieces.append(frame)
# ignore_index=True replaces the per-file row labels with one new running index.
names=pd.concat(pieces,ignore_index=True)
names.head()
  name sex births year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
# Total births per year split by sex (rows: year, columns: sex).
# aggfunc is the string 'sum' rather than the builtin sum: recent pandas
# releases emit a FutureWarning when the builtin is passed.
total_births=names.pivot_table('births',index='year',columns='sex',aggfunc='sum')
total_births.plot()
plt.show()
#添加每年每个名字在对应性别中的比例
def add_prop(group):
    """Add a ``prop`` column with each row's share of the group's births.

    The column is written onto ``group`` itself, and that same object is
    returned, so the function can be used with ``groupby(...).apply``.
    """
    total = group.births.sum()
    group['prop'] = group.births / total
    return group
# Attach each name's share of births within its (year, sex) group.
# group_keys=False keeps the original row index on the result; without it,
# newer pandas versions prepend year/sex as index levels, which no longer
# matches the flat index used by the rest of these notes.
names=names.groupby(['year','sex'],group_keys=False).apply(add_prop)
names.head()
  name sex births year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
# Sanity check: the prop values of every (year, sex) group should add up to 1
# (compared with a floating-point tolerance via np.allclose).
np.allclose(names.groupby(['year','sex']).prop.sum(),1)
True
#选取排名前1000的名字
def get_top1000(group):
    """Return the 1000 rows of ``group`` with the highest ``births``.

    The rows come back ordered by ``births`` descending; a group with fewer
    than 1000 rows is returned in full (still sorted).
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked[:1000]
# Apply get_top1000 to every (year, sex) group and combine the results.
grouped=names.groupby(['year','sex'])
top1000=grouped.apply(get_top1000)
# NOTE(review): the result printed below is indexed by (year, sex, original
# row number) while 'year' and 'sex' also remain as columns; newer pandas
# releases may treat that as ambiguous in later groupby/pivot_table calls --
# confirm against the pandas version in use.
top1000.head()
      name sex births year prop
year sex            
1880 F 0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
# Split the top-1000 table into one table per sex.
boys=top1000[top1000.sex=='M']
girls=top1000[top1000.sex=='F']
# Show the last two rows of the boys' table.
boys.tail(2)
      name sex births year prop
year sex            
2010 M 1677642 Jaydan M 194 2010 0.000102
1677645 Maxton M 193 2010 0.000102
# Births per name per year for the top-1000 names (rows: year, columns: name).
# This rebinds total_births, replacing the per-sex table built earlier.
# aggfunc is the string 'sum' rather than the builtin sum: recent pandas
# releases emit a FutureWarning when the builtin is passed.
total_births=top1000.pivot_table('births',index='year',columns='name',aggfunc='sum')
total_births.head(2)
name Aaden Aaliyah Aarav Aaron Aarush Ab Abagail Abb Abbey Abbie ... Zoa Zoe Zoey Zoie Zola Zollie Zona Zora Zula Zuri
year                                          
1880 NaN NaN NaN 102.0 NaN NaN NaN NaN NaN 71.0 ... 8.0 23.0 NaN NaN 7.0 NaN 8.0 28.0 27.0 NaN
1881 NaN NaN NaN 94.0 NaN NaN NaN NaN NaN 81.0 ... NaN 22.0 NaN NaN 10.0 NaN 9.0 21.0 27.0 NaN

2 rows × 6868 columns

# Naming trend for a few chosen names: yearly birth counts, one subplot per name.
subset=total_births[['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True,figsize=(12,10),grid=False,title="Number of births per year")
plt.show()
# Naming diversity over time: the share of all births covered by the
# top-1000 names, per year and sex.
# aggfunc is the string 'sum' rather than the builtin sum: recent pandas
# releases emit a FutureWarning when the builtin is passed.
table=top1000.pivot_table('prop',index='year',columns='sex',aggfunc='sum')
table.plot(
    title='Sum of top1000.prop by year and sex',
    yticks=np.linspace(0,1.2,13),
    xticks=range(1880,2040,10),
    grid=False,
    figsize=(10,5),
)
plt.show()
# Find where the cumulative share of the 2010 boys' names reaches 50%.
df=boys[boys.year==2010]
# Running total of prop in the rows' existing order (boys comes from
# top1000, whose groups get_top1000 sorted by births, descending).
prop_comsum=df.prop.cumsum()
# searchsorted returns a zero-based position, so the number of names needed
# is this value plus one (get_quantile_count adds that 1 explicitly).
prop_comsum.values.searchsorted(0.5)
116
#统计历年前多少个名字占了50%的比例
def get_quantile_count(g, q=0.5):
    """Return how many of the most popular names are needed to reach share ``q``.

    Parameters
    ----------
    g : DataFrame
        One group of rows with a ``prop`` column (each name's share of births).
    q : float, default 0.5
        Cumulative share to reach.

    Returns
    -------
    integer
        Number of names, counted from the largest ``prop`` downwards, whose
        cumulative ``prop`` first reaches ``q``.
    """
    # Sort here instead of trusting the caller's row order: the cumulative
    # sum only answers "top N names" when the largest shares come first.
    ordered = g.sort_values(by='prop', ascending=False)
    # searchsorted yields a zero-based position; +1 turns it into a count.
    return ordered.prop.cumsum().values.searchsorted(q) + 1
# For every (year, sex) group of top1000, count the names needed to reach 50%.
diversity=top1000.groupby(['year','sex']).apply(get_quantile_count)
# Move sex from the row index into the columns so each sex gets its own line.
diversity=diversity.unstack('sex')
diversity.plot(title='Number of popular names in top 50%')
plt.show()
# Births by the last letter of the name, broken down by sex and year.
def get_last_letter(x):
    """Return the final character of the name ``x``."""
    return x[-1]
last_letters=names.name.map(get_last_letter)
# Name the Series so the pivot table's row index is labelled 'last_letter'.
last_letters.name='last_letter'
# aggfunc is the string 'sum' rather than the builtin sum: recent pandas
# releases emit a FutureWarning when the builtin is passed.
table=names.pivot_table('births',index=last_letters,columns=['sex','year'],aggfunc='sum')
# Keep only three representative years from the 'year' column level.
subtable=table.reindex(columns=[1910,1960,2010],level='year')
subtable.head()
sex F M
year 1910 1960 2010 1910 1960 2010
last_letter            
a 108376.0 691247.0 670605.0 977.0 5204.0 28438.0
b NaN 694.0 450.0 411.0 3912.0 38859.0
c 5.0 49.0 946.0 482.0 15476.0 23125.0
d 6750.0 3729.0 2607.0 22111.0 262112.0 44398.0
e 133569.0 435013.0 313833.0 28655.0 178823.0 129012.0
# Turn the counts into proportions: subtable.sum() gives each (sex, year)
# column's total, so the division normalises every column separately.
letter_prop=subtable/subtable.sum()
letter_prop.head()
sex F M
year 1910 1960 2010 1910 1960 2010
last_letter            
a 0.273390 0.341853 0.381240 0.005031 0.002440 0.014980
b NaN 0.000343 0.000256 0.002116 0.001834 0.020470
c 0.000013 0.000024 0.000538 0.002482 0.007257 0.012181
d 0.017028 0.001844 0.001482 0.113858 0.122908 0.023387
e 0.336941 0.215133 0.178415 0.147556 0.083853 0.067959
# Bar charts of the last-letter proportions for the selected years,
# one panel per sex on a 2-row, 1-column grid.
fig,axes=plt.subplots(2,1,figsize=(20,12))
# First panel: boys.
letter_prop['M'].plot(kind='bar',ax=axes[0],rot=0,title='Male')
# Second panel: girls, drawn with legend=False.
letter_prop['F'].plot(kind='bar',rot=0,ax=axes[1],title='Female',legend=False)
plt.show()
# Normalise the full last-letter table per (sex, year) column, then select
# the boys' rows for the letters d, n and y; transposing puts years on rows.
letter_prop2=table/table.sum()
# .loc replaces the original .ix indexer, which pandas deprecated in 0.20
# and removed in 1.0; the selection here is purely label-based.
dny_ts=letter_prop2.loc[['d','n','y'],'M'].T
dny_ts.head()
last_letter d n y
year      
1880 0.083055 0.153213 0.075760
1881 0.083247 0.153214 0.077451
1882 0.085340 0.149560 0.077537
1883 0.084066 0.151646 0.079144
1884 0.086120 0.149915 0.080405
# Plot the table above; no kind= is given, so this is pandas' default line
# plot (one line per letter), not a bar chart.
dny_ts.plot()
plt.show()
# Collect the top-1000 names that contain 'lesl', ignoring case.
all_names= top1000.name.unique()
# Boolean mask: True where the lower-cased name contains 'lesl'.
mask=np.array(['lesl' in x.lower() for x in all_names])
lesley_like=all_names[mask]
lesley_like
array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)
# Total number of births for each of the matching names.
filtered=top1000[top1000.name.isin (lesley_like)]
filtered.groupby('name').births.sum()
name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64
# Yearly births, per sex, for the names containing 'lesl'.
# aggfunc is the string 'sum' rather than the builtin sum: recent pandas
# releases emit a FutureWarning when the builtin is passed.
table2=filtered.pivot_table('births',index='year',columns='sex',aggfunc='sum')
table2.head()
sex F M
year    
1880 8.0 79.0
1881 11.0 92.0
1882 9.0 128.0
1883 7.0 125.0
1884 15.0 125.0
# Convert the counts into per-year shares: divide every row by its row total
# (table2.sum(1)), aligning on the year index (axis=0).
table2=table2.div(table2.sum(1),axis=0)
table2.head()
sex F M
year    
1880 0.091954 0.908046
1881 0.106796 0.893204
1882 0.065693 0.934307
1883 0.053030 0.946970
1884 0.107143 0.892857
# Plot the yearly shares for the 'lesl' names (M: solid black line,
# F: dashed black line). Per the original note, the trend shows the name
# moving from a boys' name to a girls' name.
table2.plot(style={'M':'k-','F':'k--'})
plt.show()

你可能感兴趣的:(Python学习笔记)