《利用Python进行数据分析》大纲

第5章 Pandas入门

Series的创建

obj=pd.Series([4,7,-5,3]) # 创建Series对象
obj.values  # 返回值 array([4,7,-5,3])
obj.index  # 返回索引

可以指定index

obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

由字典类型数据生成

# Mapping of state name -> value; the dict keys become the Series index.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(data=sdata)

# An explicit index picks the dict entries by label, in the given order:
# labels missing from the dict ('California') become NaN and dict keys not
# listed in the index ('Utah') are left out.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(data=sdata, index=states)

DataFrame的创建

data = {
     'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

指定 index与columns

frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])

基础操作:选取行与列!

frame2.columns
frame2['state'] # 法一:读取某一列
frame2.year # 法二:读取某一列
frame2.loc['three']  # 读取某一行

重建Series的索引 index

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

删除Series中的条目(drop)

# Float Series holding 0.0 .. 4.0, labelled 'a' .. 'e'.
obj = pd.Series(
    np.arange(5.),
    index=['a', 'b', 'c', 'd', 'e'],
)
obj

# drop() returns a new Series without the given label; obj itself is unchanged.
new_obj = obj.drop(labels='c')
new_obj

# Passing a list of labels removes several entries in one call.
obj.drop(labels=['d', 'c'])

数据预处理操作:删除df的行与列

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
 # 删除df的行
data.drop(['Colorado', 'Ohio']) 
# 删除df的列
data.drop('two', axis=1) 
data.drop(['two', 'four'], axis='columns')
# 删除Series中的值,并就地修改原对象(inplace=True,不返回新对象)
obj.drop('c', inplace=True)

dataframe的索引操作

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data.loc['Colorado', ['two', 'three']]
data.iloc[2, [3, 0, 1]]
data.iloc[2]
data.iloc[[1, 2], [3, 0, 1]]

函数应用与映射(apply / applymap / map)

# Applying functions to a DataFrame: NumPy ufuncs, apply() along an axis,
# and element-wise mapping.
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
np.abs(frame)  # NumPy ufuncs work element-wise on a DataFrame


def f(x):
    """Return the spread (max minus min) of the values in *x*."""
    return x.max() - x.min()


frame.apply(f)  # f is called once per column
frame.apply(f, axis='columns')  # f is called once per row


def f(x):
    """Return the min and max of *x* as a Series labelled 'min' / 'max'."""
    return pd.Series([x.min(), x.max()], index=['min', 'max'])


frame.apply(f)  # a function that returns a Series produces a DataFrame


def format(x):
    """Render the number *x* as a string with two decimal places.

    NOTE: this name shadows the built-in ``format``; it is kept unchanged so
    that anything referring to it keeps working.
    """
    return '%.2f' % x


# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
# whichever method the installed pandas provides.
elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
elementwise(format)
frame['e'].map(format)  # Series.map applies the function element-wise

排序:按索引或按值对Series和DataFrame排序

obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
                     
frame.sort_index() # 按index值排序
frame.sort_index(axis=1)  # 按columns值排序,默认升序
frame.sort_index(axis=1, ascending=False)  # 按columns值排序,设为降序

obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()  # NaN值放到末尾

frame = pd.DataFrame({
     'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

frame.sort_values(by='b')  # 按b列升序排列
frame.sort_values(by=['a', 'b'])

数理统计函数

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df.sum()  # 求每一列的和
df.sum(axis='columns')  # 求每一行的和
df.mean(axis='columns', skipna=False)  # 不排除NaN
df.idxmax()  # 找到每一列最大值对应的index

df.describe()  # 按列汇总统计
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()

寻找唯一值

obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

# Distinct values, in order of first appearance.
uniques = obj.unique()

# Frequency of each value; sorted by count in descending order by default.
obj.value_counts()

# sort=False turns the ordering by count off. The top-level
# pd.value_counts() function is deprecated since pandas 2.1, so the Series
# method is used instead.
obj.value_counts(sort=False)

第6章 Pandas读取数据

读取文本数据

# Reading delimited text. pd.read_csv defaults to a comma separator and
# pd.read_table to a tab ('\t'). Both require a path or file-like object as
# their first argument, so they cannot be called with no arguments.
df = pd.read_csv('examples/ex1.csv')
df  # read_csv returns a DataFrame
pd.read_table('examples/ex1.csv', sep=',')  # read_table with an explicit separator

# Files without a header row (by default the first line is used as the header).
pd.read_csv('examples/ex2.csv', header=None)  # pandas assigns default column names
pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])  # explicit column names

# Use one of the columns as the row index.
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('examples/ex2.csv', names=names, index_col='message')

# Hierarchical (multi-level) index built from several columns.
parsed = pd.read_csv('examples/csv_mindex.csv',
                     index_col=['key1', 'key2'])
parsed

跳过行:先跳过,再生成df

pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])

指定缺失值标记(na_values:把指定的字符串识别为NaN)

result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result

sentinels = {
     'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)

设置显示的行数

pd.options.display.max_rows = 10
result = pd.read_csv('examples/ex6.csv')
result

文件较大时,可以只读取前几行(nrows):

pd.read_csv('examples/ex6.csv', nrows=5)

将数据写入文本

import sys  # needed for sys.stdout below

# Read a CSV file and write it back out as comma-separated text.
data = pd.read_csv('examples/ex5.csv')
data
data.to_csv('examples/out.csv')

# Write to stdout instead of a file, leaving out the row labels and the
# header row.
data.to_csv(sys.stdout, index=False, header=False)

JSON数据

import json

# Raw JSON text: an object containing strings, a null and nested lists.
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

# json.loads turns JSON text into Python objects (dict / list / str / int / None).
result = json.loads(obj)
result

# json.dumps goes the other way: Python objects back to JSON text.
asjson = json.dumps(result)

# A list of dicts can be handed to the DataFrame constructor; `columns`
# selects which fields are kept.
siblings = pd.DataFrame(
    data=result['siblings'],
    columns=['name', 'age'],
)
siblings

data = pd.read_json('examples/example.json')
data
print(data.to_json())
print(data.to_json(orient='records'))

pickle模块

frame = pd.read_csv('examples/ex1.csv')
frame
frame.to_pickle('examples/frame_pickle')

pd.read_pickle('examples/frame_pickle')

注意:pickle仅被推荐为短期的存储格式

# 读取Excel文件:可先生成ExcelFile实例再读取,也可直接把文件路径传给read_excel
xlsx = pd.ExcelFile('examples/ex1.xlsx')
pd.read_excel(xlsx, 'Sheet1')
frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
frame

与Web API 交互

学习中…

与数据库交互

学习中…

第7章 数据预处理

7.1 处理缺失值

import numpy as np
import pandas as pd

创建一个含有缺失值的Series

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

删除缺失值

from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

删除含有缺失值的行

from numpy import nan as NA
data = pd.DataFrame([[1., 6.5, 3.], 
                     [1., NA, NA],
                     [NA, NA, NA],
                     [NA, 6.5, 3.]])
cleaned = data.dropna() # 删除含有NA的行
cleaned
     0   1   2
0	1.0	6.5	3.0

删除全为缺失值的行

data.dropna(how='all')  # 删除全为缺失值的行
     0	  1	     2
0	1.0	 6.5	3.0
1	1.0	 NaN	NaN
3	NaN  6.5	3.0

删除全为缺失值的列

data[4]=NA
data.dropna(axis=1,how='all')# 删除全为缺失值的列
0	1      	2
0	1.0	6.5	3.0
1	1.0	NaN	NaN
2	NaN	NaN	NaN
3	NaN	6.5	3.0
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
         0	    1	          2
0	-0.810186	NaN	         NaN
1	-1.647230	NaN         NaN
2	0.048366	NaN	        -0.409267
3	1.240861	NaN	         0.442658
4	-0.353365	2.785994	0.231522
5	1.184725	-1.566254	-0.840143
6	-0.827289	-0.212981	0.576380

df.dropna()  # drop every row that contains at least one NaN
df.dropna(thresh=2)  # keep only the rows that have at least 2 non-NaN values

填充缺失值

df.fillna(0)
df.fillna({
     1: 0.5, 2: 0})
_ = df.fillna(0, inplace=True)
df
# Forward fill: propagate the last valid value down each column.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
# fillna(method='ffill') is deprecated since pandas 2.1; DataFrame.ffill is
# the dedicated replacement.
df.ffill()
df.ffill(limit=2)  # fill at most 2 consecutive NaNs

使用均值填充缺失值

# Fill the missing entries with the mean of the observed values
# (Series.mean() skips NaN by default).
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

7.2 其他处理

重复值处理

data = pd.DataFrame({
     'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

     k1	k2
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
5	two	4
6	two	4
# 返回一个布尔值Series,反映每一行是否与之前出现过的行重复
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

删除重复的行

data.drop_duplicates()
    k1	k2
0	one	1
1	two	1
2	one	2
3	two	3
4	one	3
5	two	4
data['v1'] = range(7)
# 按照k1列重复值删除行
data.drop_duplicates(['k1'])

    k1	k2	v1
0	one	1	0
1	two	1	1
data.drop_duplicates(['k1','k2'])

    k1	k2	v1
0	one	1	0
1	two	1	1
2	one	2	2
3	two	3	3
4	one	3	4
5	two	4	5

数值替代

data = pd.Series([1., -999., 2., -999., -1000., 3.])

data.replace(-999, np.nan)
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({
     -999: np.nan, -1000: 0})
# Renaming axis labels. rename() returns a modified copy unless inplace=True.
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

# A function is applied to every label of the given axis.
data.rename(index=str.title, columns=str.upper)

# A dict renames only the labels listed as keys. A key that matches no label
# is silently ignored, so it must be spelled exactly like the existing label
# ('Ohio', not 'OHIO').
data.rename(index={'Ohio': 'INDIANA'},
            columns={'three': 'peekaboo'})

# inplace=True modifies `data` itself instead of returning a copy.
data.rename(index={'Ohio': 'INDIANA'}, inplace=True)

随机抽样

df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
df.take(sampler)
df.sample(n=3)

0	1	2	3
2	8	9	10	11
3	12	13	14	15
0	0	1	2	3

choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

7.3 字符串操作

见课本

你可能感兴趣的:(《利用python进行数据分析》大纲)