from pandas.io.parsers import read_csv
df=read_csv("H:\Python\data\WHO.csv")
print "DataFrame:",df
运行结果(只截取部分):
DataFrame: Country CountryID Continent \
0 Afghanistan 1 1
1 Albania 2 2
2 Algeria 3 3
3 Andorra 4 2
4 Angola 5 3
print "Shape:",df.shape #大小
print "Length:",len(df) #长度
结果:
Shape: (202, 358)
Length: 202
print "Column Headers",df.columns #得到每列的标题
print "Data type",df.dtypes #得到每列数据的类型
结果(截取部分)
Column Headers Index([u'Country', u'CountryID', u'Continent',
u'Adolescent fertility rate (%)', u'Adult literacy rate (%)',
u'Gross national income per capita (PPP international $)',
u'Net primary school enrolment ratio female (%)',
u'Net primary school enrolment ratio male (%)',
u'Population (in thousands) total',
u'Population annual growth rate (%)',
...
u'Total_CO2_emissions', u'Total_income', u'Total_reserves',
u'Trade_balance_goods_and_services', u'Under_five_mortality_from_CME',
u'Under_five_mortality_from_IHME', u'Under_five_mortality_rate',
u'Urban_population', u'Urban_population_growth',
u'Urban_population_pct_of_total'],
dtype='object', length=358)
Data type Country object
CountryID int64
Continent int64
Adolescent fertility rate (%) float64
Adult literacy rate (%) float64
Gross national income per capita (PPP international $) float64
Net primary school enrolment ratio female (%) float64
Net primary school enrolment ratio male (%) float64
print "Index:",df.index
结果:
Index: RangeIndex(start=0, stop=202, step=1)
print "Vales:",df.values
结果
Vales: [['Afghanistan' 1L 1L ..., 5740436.0 5.44 22.9]
['Albania' 2L 2L ..., 1431793.9 2.21 45.4]
['Algeria' 3L 3L ..., 20800000.0 2.61 63.3]
...,
['Yemen' 200L 1L ..., 5759120.5 4.37 27.3]
['Zambia' 201L 3L ..., 4017411.0 1.95 35.0]
['Zimbabwe' 202L 3L ..., 4709965.0 1.9 35.9]]
country_df=df["Country"]
print "Type df:",type(df)
print "Type country_df:",type(country_df)
结果:
Type df: <class 'pandas.core.frame.DataFrame'>
Type country_df: <class 'pandas.core.series.Series'>
print "Series Shape:",country_df.shape #获取列的形状
print "Series index:",country_df.index #获取索引
print "Series values:",country_df.values #获取该列的所有值
print "Series name:",country_df.name #获取列名(标题)
结果:
Series Shape: (202L,)
Series index: RangeIndex(start=0, stop=202, step=1)
Series values: ['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Antigua and Barbuda'
'Argentina' 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas'
'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
'Bermuda' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Botswana' 'Brazil'
'Brunei Darussalam' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia'
'Cameroon' 'Canada' 'Cape Verde' 'Central African Republic' 'Chad' 'Chile'
'China' 'Colombia' 'Comoros' 'Congo, Dem. Rep.' 'Congo, Rep.'
'Cook Islands' 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus'
'Czech Republic' 'Denmark' 'Djibouti' 'Dominica' 'Dominican Republic'
'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia'
'Ethiopia' 'Fiji' 'Finland' 'France' 'French Polynesia' 'Gabon' 'Gambia'
'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea'
'Guinea-Bissau' 'Guyana' 'Haiti' 'Honduras' 'Hong Kong, China' 'Hungary'
'Iceland' 'India' 'Indonesia' 'Iran (Islamic Republic of)' 'Iraq'
'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan' 'Kenya'
'Kiribati' 'Korea, Dem. Rep.' 'Korea, Rep.' 'Kuwait' 'Kyrgyzstan'
"Lao People's Democratic Republic" 'Latvia' 'Lebanon' 'Lesotho' 'Liberia'
'Libyan Arab Jamahiriya' 'Lithuania' 'Luxembourg' 'Macao, China'
'Macedonia' 'Madagascar' 'Malawi' 'Malaysia' 'Maldives' 'Mali' 'Malta'
'Marshall Islands' 'Mauritania' 'Mauritius' 'Mexico'
'Micronesia (Federated States of)' 'Moldova' 'Monaco' 'Mongolia'
'Montenegro' 'Morocco' 'Mozambique' 'Myanmar' 'Namibia' 'Nauru' 'Nepal'
'Netherlands' 'Netherlands Antilles' 'New Caledonia' 'New Zealand'
'Nicaragua' 'Niger' 'Nigeria' 'Niue' 'Norway' 'Oman' 'Pakistan' 'Palau'
'Panama' 'Papua New Guinea' 'Paraguay' 'Peru' 'Philippines' 'Poland'
'Portugal' 'Puerto Rico' 'Qatar' 'Romania' 'Russia' 'Rwanda'
'Saint Kitts and Nevis' 'Saint Lucia' 'Saint Vincent and the Grenadines'
'Samoa' 'San Marino' 'Sao Tome and Principe' 'Saudi Arabia' 'Senegal'
'Serbia' 'Seychelles' 'Sierra Leone' 'Singapore' 'Slovakia' 'Slovenia'
'Solomon Islands' 'Somalia' 'South Africa' 'Spain' 'Sri Lanka' 'Sudan'
'Suriname' 'Swaziland' 'Sweden' 'Switzerland' 'Syria' 'Taiwan'
'Tajikistan' 'Tanzania' 'Thailand' 'Timor-Leste' 'Togo' 'Tonga'
'Trinidad and Tobago' 'Tunisia' 'Turkey' 'Turkmenistan' 'Tuvalu' 'Uganda'
'Ukraine' 'United Arab Emirates' 'United Kingdom'
'United States of America' 'Uruguay' 'Uzbekistan' 'Vanuatu' 'Venezuela'
'Vietnam' 'West Bank and Gaza' 'Yemen' 'Zambia' 'Zimbabwe']
Series name: Country
print "Last 2 countries:",country_df[-2:]
print "Last 2 countries type:",type(country_df[-2:])
结果:
Last 2 countries: 200 Zambia
201 Zimbabwe
Name: Country, dtype: object
Last 2 countries type: <class 'pandas.core.series.Series'>
sunspots=read_csv("H:\Python\data\sunspots.csv")
print "Head 2:",sunspots.head(2) #查看前两行
print "Tail 2:",sunspots.tail(2) #查看后两行
运行结果:
Head 2: Date Yearly Mean Total Sunspot Number
0 2016/12/31 39.8
1 2015/12/31 69.8
Tail 2: Date Yearly Mean Total Sunspot Number
316 1701-12-31 18.3
317 1700-12-31 8.3
last_date=sunspots.index[-1]
print "Last value:\n",sunspots.loc[last_date]
运行结果:
Last value:
Date 1700-12-31
Yearly Mean Total Sunspot Number 8.3
Name: 317, dtype: object
| 方法 | 说明 |
| --- | --- |
| describe | 返回描述性统计信息 |
| count | 返回非NaN数据项的数量 |
| mad | 计算平均绝对偏差,即一种类似于标准差的稳健统计量 |
| median | 返回中位数,等价于第50百分位数的值 |
| min | 返回最小值 |
| max | 返回最大值 |
| mode | 返回众数(mode),即一组数据中出现次数最多的变量值 |
| std | 返回表示离散度的标准差,即方差的平方根 |
| var | 返回方差 |
| skew | 返回偏度系数(skewness),该系数表示的是数据分布的对称程度 |
| kurt | 返回峰态系数(kurtosis),反映数据分布曲线顶端尖峭或扁平的程度 |
print "Describe:\n",sunspots.describe() print "Non NaN observations:\n",sunspots.count() print "MAD:\n",sunspots.mad() print "Median:\n",sunspots.median() print "Min:\n",sunspots.min() print "Max:\n",sunspots.max() print "Mode:\n",sunspots.mode() print "Standard Deviation:\n",sunspots.std() print "Variance:\n",sunspots.var() print "Skewness:\n",sunspots.skew() print "Kurtosis:\n",sunspots.kurt()
Describe:
Yearly Mean Total Sunspot Number
count 318.000000
mean 79.193396
std 61.988788
min 0.000000
25% 24.950000
50% 66.250000
75% 116.025000
max 269.300000
Non NaN observations:
Date 318
Yearly Mean Total Sunspot Number 318
dtype: int64
MAD:
Yearly Mean Total Sunspot Number 50.925104
dtype: float64
Median:
Yearly Mean Total Sunspot Number 66.25
dtype: float64
Min:
Date 1700-12-31
Yearly Mean Total Sunspot Number 0
dtype: object
Max:
Date 2016/12/31
Yearly Mean Total Sunspot Number 269.3
dtype: object
Mode:
Date Yearly Mean Total Sunspot Number
0 1985/12/31 18.3
Standard Deviation:
Yearly Mean Total Sunspot Number 61.988788
dtype: float64
Variance:
Yearly Mean Total Sunspot Number 3842.60983
dtype: float64
Skewness:
Yearly Mean Total Sunspot Number 0.808551
dtype: float64
Kurtosis:
Yearly Mean Total Sunspot Number -0.130045
dtype: float64
import pandas as pd
from numpy.random import seed
from numpy.random import rand
from numpy.random import randint
import numpy as np
seed(42)
#random.rand(n),生成n个0到1间随机数
#random.random_integers(low,high=None,size=None) 生成闭区间[low,high]上离散均匀分布的整数值;若high=None,则取值区间变为[1,low]
df=pd.DataFrame({'Weather':['cold','hot','cold','hot','cold','hot','cold'],'Food':['soup','soup','icecream','chocolate','icecream','icecream','soup'],
'Price':10*rand(7),'Number':randint(1,9,size=(7,))})
print df
Food Number Price Weather
0 soup 8 3.745401 cold
1 soup 5 9.507143 hot
2 icecream 4 7.319939 cold
3 chocolate 8 5.986585 hot
4 icecream 8 1.560186 cold
5 icecream 3 1.559945 hot
6 soup 6 0.580836 cold
weather_group=df.groupby('Weather') #按天气分组
i=0
for name,group in weather_group:
i=i+1
print "Group ",i,name
print group
运行结果:
Group 1 cold
Food Number Price Weather
0 soup 8 3.745401 cold
2 icecream 4 7.319939 cold
4 icecream 8 1.560186 cold
6 soup 6 0.580836 cold
Group 2 hot
Food Number Price Weather
1 soup 5 9.507143 hot
3 chocolate 8 5.986585 hot
5 icecream 3 1.559945 hot
print "Weather group first:\n",weather_group.first() #展示各组第一行内容
print "Weather group last:\n",weather_group.last() #展示各组最后一行内容
print "Weather group mean:\n",weather_group.mean() #计算各组均值
运行结果:
Weather group first:
Food Number Price
Weather
cold soup 8 3.745401
hot soup 5 9.507143
Weather group last:
Food Number Price
Weather
cold soup 6 0.580836
hot icecream 3 1.559945
Weather group mean:
Number Price
Weather
cold 6.500000 3.301591
hot 5.333333 5.684558
wf_group=df.groupby(['Weather','Food'])
print "WF Group:\n",wf_group.groups
运行结果:
WF Group:
{('hot', 'chocolate'): Int64Index([3], dtype='int64'), ('cold', 'icecream'): Int64Index([2, 4], dtype='int64'), ('cold', 'soup'): Int64Index([0, 6], dtype='int64'), ('hot', 'soup'): Int64Index([1], dtype='int64'), ('hot', 'icecream'): Int64Index([5], dtype='int64')}
print "WF Aggregated:\n",wf_group.agg([np.mean,np.median])
运行结果:
WF Aggregated:
Number Price
mean median mean median
Weather Food
cold icecream 6 6 4.440063 4.440063
soup 7 7 2.163119 2.163119
hot chocolate 8 8 5.986585 5.986585
icecream 3 3 1.559945 1.559945
soup 5 5 9.507143 9.507143
print "df:3\n",df[:3]
print "Contact Back together:\n",pd.concat([df[:3],df[:3]])
运行结果:
df:3
Food Number Price Weather
0 soup 8 3.745401 cold
1 soup 5 9.507143 hot
2 icecream 4 7.319939 cold
Contact Back together:
Food Number Price Weather
0 soup 8 3.745401 cold
1 soup 5 9.507143 hot
2 icecream 4 7.319939 cold
0 soup 8 3.745401 cold
1 soup 5 9.507143 hot
2 icecream 4 7.319939 cold
print "Appending rows:\n",df[3:].append(df[5:])
运行结果:
Appending rows:
Food Number Price Weather
3 chocolate 8 5.986585 hot
4 icecream 8 1.560186 cold
5 icecream 3 1.559945 hot
6 soup 6 0.580836 cold
5 icecream 3 1.559945 hot
6 soup 6 0.580836 cold
dests=pd.read_csv("H:\Python\data\dest.csv")
tips=pd.read_csv("H:\Python\data\\tips.csv")
print "dests:\n",dests
print "tips:\n",tips
运行结果:
dests:
EmpNr Dest
0 5 The Hague
1 3 Amsterdam
2 9 Rotterdam
tips:
EmpNr Amount
0 5 10.0
1 9 5.0
2 7 2.5
print "Merge() on key:\n",pd.merge(dests,tips,on='EmpNr')
运行结果:
Merge() on key:
EmpNr Dest Amount
0 5 The Hague 10.0
1 9 Rotterdam 5.0
print "Dest join() tips:\n",dests.join(tips,lsuffix='Dest',rsuffix='Tips')
运行结果:
Dest join() tips:
EmpNrDest Dest EmpNrTips Amount
0 5 The Hague 5 10.0
1 3 Amsterdam 9 5.0
2 9 Rotterdam 7 2.5
print "Inner join with merge():\n",pd.merge(dests,tips,how='inner') #内连接
print "Outer join with merge():\n",pd.merge(dests,tips,how='outer') #完全外部连接
运行结果:
Inner join with merge():
EmpNr Dest Amount
0 5 The Hague 10.0
1 9 Rotterdam 5.0
Outer join with merge():
EmpNr Dest Amount
0 5 The Hague 10.0
1 3 Amsterdam NaN
2 9 Rotterdam 5.0
3 7 NaN 2.5
df=pd.read_csv("H:\Python\data\WHO.csv")
#print df.head()
df=df[['Country',df.columns[6]]][:2] #将原df的Country列和第6列组成新DataFrame,并取前两行
print "New df:\n",df
运行结果:
New df:
Country Net primary school enrolment ratio female (%)
0 Afghanistan NaN
1 Albania 93.0
print "Null Values:\n",pd.isnull(df) #检查每行缺失的数
print "Not Null Values:\n",pd.notnull(df) #检查非缺失的数
print "Last Column Doubled:\n",2*df[df.columns[-1]] #NAN值乘以一个数后还是NAN
print "Last Column plus NaN:\n",df[df.columns[-1]]+np.nan #非NAN值加上NAN后变为了NAN
print "Zero filled:\n",df.fillna(0) #使用0替换NAN
运行结果:
Null Values:
Country Net primary school enrolment ratio female (%)
0 False True
1 False False
Not Null Values:
Country Net primary school enrolment ratio female (%)
0 True False
1 True True
Last Column Doubled:
0 NaN
1 186.0
Name: Net primary school enrolment ratio female (%), dtype: float64
Last Column plus NaN:
0 NaN
1 NaN
Name: Net primary school enrolment ratio female (%), dtype: float64
Zero filled:
Country Net primary school enrolment ratio female (%)
0 Afghanistan 0.0
1 Albania 93.0
print "Date range:\n",pd.date_range('1/1/1900',periods=42,freq='D') #42表示天数,D表示使用日频率。如果periods='W',表示42周
运行结果:
Date range:
DatetimeIndex(['1900-01-07', '1900-01-14', '1900-01-21', '1900-01-28',
'1900-02-04', '1900-02-11', '1900-02-18', '1900-02-25',
'1900-03-04', '1900-03-11', '1900-03-18', '1900-03-25',
'1900-04-01', '1900-04-08', '1900-04-15', '1900-04-22',
'1900-04-29', '1900-05-06', '1900-05-13', '1900-05-20',
'1900-05-27', '1900-06-03', '1900-06-10', '1900-06-17',
'1900-06-24', '1900-07-01', '1900-07-08', '1900-07-15',
'1900-07-22', '1900-07-29', '1900-08-05', '1900-08-12',
'1900-08-19', '1900-08-26', '1900-09-02', '1900-09-09',
'1900-09-16', '1900-09-23', '1900-09-30', '1900-10-07',
'1900-10-14', '1900-10-21'],
dtype='datetime64[ns]', freq='W-SUN')
import pandas as pd
import sys
try:
print "Date range:\n",pd.date_range('1/1/1677',periods=4,frep='D')
except:
etype,value,_=sys.exc_info() #获得错误类型,错误值
print "Error encountered:\n",etype,value #打印
运行结果:
Date range:
Error encountered:
Out of bounds nanosecond timestamp: 1677-01-01 00:00:00
offset=pd.DateOffset(seconds=2**63/10**9)
mid=pd.to_datetime('1/1/1970')
print "Start valid range:\n",mid-offset
print "End valid range:\n",mid+offset
运行结果:
Start valid range:
1677-09-21 00:12:44
End valid range:
2262-04-11 23:47:16
print "With format:\n",pd.to_datetime(['1901113','19031230'],format='%Y%m%d')
运行结果:
With format:
DatetimeIndex(['1901-11-03', '1903-12-30'], dtype='datetime64[ns]', freq=None)
print "Illegal date:\n",pd.to_datetime(['1901-11-13','not a date']) #第二个字符串无法转换,运行报错
print "Illegal date:\n",pd.to_datetime(['1901-11-13','not a date'],coerce=True) #强制转化,得到非时间数NAT
运行结果:
Illegal date:
DatetimeIndex(['1901-11-13', 'NaT'], dtype='datetime64[ns]', freq=None)
import pandas as pd
from numpy.random import seed
from numpy.random import rand
from numpy.random import randint
import numpy as np
seed(42)
N=7
df=pd.DataFrame({'Weather':['cold','hot','cold','hot','cold','hot','cold'],'Food':['soup','soup','icecream','chocolate','icecream','icecream','soup'],
'Price':10*rand(7),'Number':randint(1,9,size=(7,))})
print "DataFrame:\n",df
print pd.pivot_table(df,index='Food',aggfunc=np.sum) #计算各类型Food的统计值
运行结果:
DataFrame:
Food Number Price Weather
0 soup 8 3.745401 cold
1 soup 5 9.507143 hot
2 icecream 4 7.319939 cold
3 chocolate 8 5.986585 hot
4 icecream 8 1.560186 cold
5 icecream 3 1.559945 hot
6 soup 6 0.580836 cold
Number Price
Food
chocolate 8 5.986585
icecream 15 10.440071
soup 19 13.833380