在数据分析和建模的过程中,相当多的时间要用在数据准备上:加载、清理、转换以及重塑。这些工作会占到分析师时间的80%或更多。有时,存储在文件和数据库中的数据的格式不适合某个特定的任务。许多研究者都选择使用通用编程语言(如Python、Perl、R或Java)或UNIX文本处理工具(如sed或awk)对数据格式进行专门处理。幸运的是,pandas和内置的Python标准库提供了一组高级的、灵活的、快速的工具,可以让你轻松地将数据规整为想要的格式。
在许多数据分析工作中,缺失数据是经常发生的。pandas的目标之一就是尽量轻松地处理缺失数据。例如,pandas对象的所有描述性统计默认都不包括缺失数据。
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
0 aardvark
1 artichoke
2 NaN
3 avocado
dtype: object
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()
0 1.0
2 3.5
4 7.0
dtype: float64
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
....: [NA, NA, NA], [NA, 6.5, 3.]])
data
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
clean=data.dropna()
clean
0 1 2
0 1.0 6.5 3.0
data.dropna(how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
data.dropna(axis=1,how='all') #axis=1表示按列处理:只丢弃全为NaN的列
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
0 1 2
0 0.578204 NaN NaN
1 0.217969 NaN NaN
2 2.197892 NaN 1.023195
3 -0.894835 NaN 1.823525
4 0.915724 0.198224 0.419659
5 -0.618141 -1.042905 -0.403488
6 0.441870 0.623395 0.108106
df.dropna()
0 1 2
4 0.915724 0.198224 0.419659
5 -0.618141 -1.042905 -0.403488
6 0.441870 0.623395 0.108106
df.dropna(thresh=2) #thresh=2:只保留至少含有2个非NaN值的行
0 1 2
2 2.197892 NaN 1.023195
3 -0.894835 NaN 1.823525
4 0.915724 0.198224 0.419659
5 -0.618141 -1.042905 -0.403488
6 0.441870 0.623395 0.108106
df.fillna(0)#NAN填充0
0 1 2
0 -0.499572 0.000000 0.000000
1 -1.108934 0.000000 0.000000
2 -0.316295 0.000000 -1.324176
3 -0.371394 0.000000 0.802282
4 0.975481 1.716945 -1.540201
5 0.204285 2.324883 0.117626
6 -0.330108 0.153630 -0.338194
df.fillna({1: 0.5, 2: 0}) #字典的键是列标签:列1(第二列)的NaN填充0.5,列2(第三列)的NaN填充0
0 1 2
0 -0.499572 0.500000 0.000000
1 -1.108934 0.500000 0.000000
2 -0.316295 0.500000 -1.324176
3 -0.371394 0.500000 0.802282
4 0.975481 1.716945 -1.540201
5 0.204285 2.324883 0.117626
6 -0.330108 0.153630 -0.338194
_ = df.fillna(0, inplace=True)
df
0 1 2
0 -0.499572 0.000000 0.000000
1 -1.108934 0.000000 0.000000
2 -0.316295 0.000000 -1.324176
3 -0.371394 0.000000 0.802282
4 0.975481 1.716945 -1.540201
5 0.204285 2.324883 0.117626
6 -0.330108 0.153630 -0.338194
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
0 1 2
0 -0.774119 -0.676729 -0.128181
1 0.161477 -1.105764 -1.710922
2 0.937275 NaN -0.154157
3 -0.347762 NaN -0.331515
4 0.578991 NaN NaN
5 -1.993989 NaN NaN
df.fillna(method='ffill') #向前填充(forward fill):用上方最近的一个非NaN值填充
0 1 2
0 -0.774119 -0.676729 -0.128181
1 0.161477 -1.105764 -1.710922
2 0.937275 -1.105764 -0.154157
3 -0.347762 -1.105764 -0.331515
4 0.578991 -1.105764 -0.331515
5 -1.993989 -1.105764 -0.331515
df.fillna(method='ffill', limit=2) #limit=2:每段连续的NaN最多向前填充2个
0 1 2
0 0.476985 3.248944 -1.021228
1 -0.577087 0.124121 0.302614
2 0.523772 0.124121 1.343810
3 -0.713544 0.124121 -2.370232
4 -1.860761 NaN -2.370232
5 -1.265934 NaN -2.370232
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
....: 'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
data.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
data.drop_duplicates()
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
data.drop_duplicates(['k1'])
k1 k2
0 one 1
1 two 1
data.drop_duplicates(['k1', 'k2'], keep='last')
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
6 two 4
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
....: 'Pastrami', 'corned beef', 'Bacon',
....: 'pastrami', 'honey ham', 'nova lox'],
....: 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)#用meat_to_animal字典把lowercased中的每个值映射为对应的动物
data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
data['food'].map(lambda x: meat_to_animal[x.lower()])
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
data.replace(-999, np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({-999: np.nan, -1000: 0})
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
....: index=['Ohio', 'Colorado', 'New York'],
....: columns=['one', 'two', 'three', 'four'])
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
transform = lambda x: x[:4].upper() #取字符串的前四个字母大写
data.index.map(transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(transform)
data.rename(index=str.title, columns=str.upper)
ONE TWO THREE FOUR
Ohio 0 1 2 3
Colo 4 5 6 7
New 8 9 10 11
data.rename(index={'OHIO': 'INDIANA'},
....: columns={'three': 'peekaboo'})
one two peekaboo four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.050093 0.060779 0.015470 -0.006734
std 0.992683 1.032154 0.991869 0.959386
min -2.881544 -3.037328 -3.172889 -2.907349
25% -0.700561 -0.605144 -0.659936 -0.639111
50% -0.051532 0.077689 0.044447 -0.021604
75% 0.626197 0.708148 0.675407 0.620392
max 2.984373 3.104012 3.325806 3.632424
col = data[2]
col[np.abs(col) > 3]
41 -3.399312
136 -3.745356
Name: 2, dtype: float64
data[(np.abs(data) > 3).any(1)] #选出含有绝对值大于3的值的行;注意:pandas 2.0起axis必须用关键字传入,应写作.any(axis=1)
0 1 2 3
41 0.457246 -0.025907 -3.399312 -0.974657
60 1.951312 3.260383 0.963301 1.201206
136 0.508391 -0.196713 -3.745356 -1.520113
235 -0.242459 -3.056990 1.918403 -0.578828
258 0.682841 0.326045 0.425384 -3.428254
322 1.179227 -3.184377 1.369891 -1.074833
544 -3.548824 1.553205 -2.186301 1.277104
635 -0.578093 0.193299 1.397822 3.366626
782 -0.207434 3.525865 0.283070 0.544635
803 -3.645860 0.255475 -0.549574 -1.907459
data[np.abs(data) > 3] = np.sign(data) * 3 #大于3变成3,小于-3变成-3
另一种常用于统计建模或机器学习的转换方式是:将分类变量(categorical variable)转换为“哑变量”或“指标矩阵”。
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
.....:'data1': range(6)})
df
key data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 b 5
pd.get_dummies(df['key'])
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
data1 key_a key_b key_c
0 0 0 1 0
1 1 0 1 0
2 2 1 0 0
3 3 0 0 1
4 4 1 0 0
5 5 0 1 0
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('F:/hellopython/movies.dat', sep='::',header=None, names=mnames) #注意:sep为多字符时pandas会改用python解析引擎,可显式加engine='python'以避免ParserWarning
movies[:10]
movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
5 6 Heat (1995) Action|Crime|Thriller
6 7 Sabrina (1995) Comedy|Romance
7 8 Tom and Huck (1995) Adventure|Children's
8 9 Sudden Death (1995) Action
9 10 GoldenEye (1995) Action|Adventure|Thriller
all_genres = []
for x in movies.genres:
all_genres.extend(x.split('|'))
genres = pd.unique(all_genres) #这样就提取出来了
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)
gen = movies.genres[0]
gen.split('|')
['Animation', "Children's", 'Comedy'] #他所属的类
dummies.columns.get_indexer(gen.split('|')) #get_indexer返回这些类别在dummies各列中的位置
array([0, 1, 2]) #即dummies的第0、1、2列
for i, gen in enumerate(movies.genres):
indices = dummies.columns.get_indexer(gen.split('|'))
dummies.iloc[i, indices] = 1 #写个循环遍历所有的ID
movies_windic = movies.join(dummies.add_prefix('Genre_')) #结合起来
movies_windic.iloc[0]
movie_id 1
title Toy Story (1995)
genres Animation|Children's|Comedy
Genre_Animation 1
Genre_Children's 1
Genre_Comedy 1
Genre_Adventure 0
Genre_Fantasy 0
Genre_Romance 0
Genre_Drama 0
...
Genre_Crime 0
Genre_Thriller 0
Genre_Horror 0
Genre_Sci-Fi 0
Genre_Documentary 0
Genre_War 0
Genre_Musical 0
Genre_Mystery 0
Genre_Film-Noir 0
Genre_Western 0
Name: 0, Length: 21, dtype: object
values = np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
(0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0]
0 0 0 0 0 1
1 0 1 0 0 0
2 1 0 0 0 0
3 0 1 0 0 0
4 0 0 1 0 0
5 0 0 1 0 0
6 0 0 0 0 1
7 0 0 0 1 0
8 0 0 0 1 0
9 0 0 0 1 0