import pandas as pd
import numpy as np
pd.__version__
'0.25.1'
I. Reading and Writing Files
1. Reading
df = pd.read_csv(r'D:\study\pandas\data\table.csv')
df.head()
  School Class    ID Gender   Address  Height  Weight  Math Physics
0    S_1   C_1  1101      M  street_1     173      63  34.0      A+
1    S_1   C_1  1102      F  street_2     192      73  32.5      B+
2    S_1   C_1  1103      M  street_2     186      82  87.2      B+
3    S_1   C_1  1104      F  street_2     167      81  80.4      B-
4    S_1   C_1  1105      F  street_4     159      64  84.8      B+
df_txt = pd.read_table(r'D:\study\pandas\data\table.txt')
df_txt
   col1 col2  col3    col4
0     2    a   1.4   apple
1     3    b   3.4  banana
2     6    c   2.5  orange
3     5    d   3.2   lemon
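read_table reads tab-separated text by default, so read_csv with an explicit separator gives the same result. A minimal sketch, assuming table.txt really is tab-separated:
df_txt2 = pd.read_csv(r'D:\study\pandas\data\table.txt', sep='\t')  # explicit tab separator
df_txt2.equals(df_txt)  # expected: True for a plain tab-separated file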
df_excel = pd.read_excel(r'D:\study\pandas\data\table.xlsx')
df_excel.head()
  School Class    ID Gender   Address  Height  Weight  Math Physics
0    S_1   C_1  1101      M  street_1     173      63  34.0      A+
1    S_1   C_1  1102      F  street_2     192      73  32.5      B+
2    S_1   C_1  1103      M  street_2     186      82  87.2      B+
3    S_1   C_1  1104      F  street_2     167      81  80.4      B-
4    S_1   C_1  1105      F  street_4     159      64  84.8      B+
2. Writing
df.to_csv(r'D:\study\pandas\data\new_table.csv')
df.to_excel(r'D:\study\pandas\data\new_table.xlsx',sheet_name='Sheet1')
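Both writers also save the index as an extra unnamed column by default. A small sketch (same hypothetical paths) that drops it:
df.to_csv(r'D:\study\pandas\data\new_table.csv', index=False)                       # no index column in the CSV
df.to_excel(r'D:\study\pandas\data\new_table.xlsx', sheet_name='Sheet1', index=False)  # likewise for Excel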
II. Basic Data Structures
1. Series
s = pd.Series(np.random.randn(5),index=['a','b','c','d','e'],name='这是一个Series',dtype='float64')
s
a 0.513690
b 1.582576
c 0.522936
d 1.196192
e -1.356439
Name: 这是一个Series, dtype: float64
s.values
array([ 0.51368998, 1.58257592, 0.52293603, 1.19619237, -1.3564393 ])
s.name
'这是一个Series'
s.index
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
s.dtype
dtype('float64')
s['a']
0.5136899817012984
s.mean()
0.49179099986372565
print([attr for attr in dir(s) if not attr.startswith('_')])
['T', 'a', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'append', 'apply', 'argmax', 'argmin', 'argsort', 'array', 'as_matrix', 'asfreq', 'asof', 'astype', 'at', 'at_time', 'autocorr', 'axes', 'b', 'base', 'between', 'between_time', 'bfill', 'bool', 'c', 'clip', 'clip_lower', 'clip_upper', 'combine', 'combine_first', 'compound', 'compress', 'copy', 'corr', 'count', 'cov', 'cummax', 'cummin', 'cumprod', 'cumsum', 'd', 'data', 'describe', 'diff', 'div', 'divide', 'divmod', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtype', 'dtypes', 'duplicated', 'e', 'empty', 'eq', 'equals', 'ewm', 'expanding', 'explode', 'factorize', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'flags', 'floordiv', 'from_array', 'ftype', 'ftypes', 'ge', 'get', 'get_dtype_counts', 'get_ftype_counts', 'get_values', 'groupby', 'gt', 'hasnans', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'imag', 'index', 'infer_objects', 'interpolate', 'is_monotonic', 'is_monotonic_decreasing', 'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', 'item', 'items', 'itemsize', 'iteritems', 'ix', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'loc', 'lt', 'mad', 'map', 'mask', 'max', 'mean', 'median', 'memory_usage', 'min', 'mod', 'mode', 'mul', 'multiply', 'name', 'nbytes', 'ndim', 'ne', 'nlargest', 'nonzero', 'notna', 'notnull', 'nsmallest', 'nunique', 'pct_change', 'pipe', 'plot', 'pop', 'pow', 'prod', 'product', 'ptp', 'put', 'quantile', 'radd', 'rank', 'ravel', 'rdiv', 'rdivmod', 'real', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'repeat', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'searchsorted', 'sem', 'set_axis', 'shape', 'shift', 'size', 'skew', 'slice_shift', 'sort_index', 'sort_values', 'squeeze', 'std', 'strides', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dense', 'to_dict', 'to_excel', 'to_frame', 'to_hdf', 'to_json', 'to_latex', 'to_list', 'to_msgpack', 'to_numpy', 'to_period', 'to_pickle', 'to_sparse', 'to_sql', 'to_string', 'to_timestamp', 'to_xarray', 'transform', 'transpose', 'truediv', 'truncate', 'tshift', 'tz_convert', 'tz_localize', 'unique', 'unstack', 'update', 'value_counts', 'values', 'var', 'view', 'where', 'xs']
df = pd.DataFrame({'col1':list('abcde'),'col2':range(5,10),'col3':[1.3,2.5,3.6,4.6,5.8]},
index=list('一二三四五'))
df
   col1  col2  col3
一    a     5   1.3
二    b     6   2.5
三    c     7   3.6
四    d     8   4.6
五    e     9   5.8
df['col1']
一 a
二 b
三 c
四 d
五 e
Name: col1, dtype: object
type(df)
pandas.core.frame.DataFrame
type(df['col1'])
pandas.core.series.Series
df.rename(index={'一':'one'},columns={'col1':'new_col1'})
     new_col1  col2  col3
one         a     5   1.3
二           b     6   2.5
三           c     7   3.6
四           d     8   4.6
五           e     9   5.8
df.index
Index(['一', '二', '三', '四', '五'], dtype='object')
df.columns
Index(['col1', 'col2', 'col3'], dtype='object')
df.values
array([['a', 5, 1.3],
['b', 6, 2.5],
['c', 7, 3.6],
['d', 8, 4.6],
['e', 9, 5.8]], dtype=object)
df.shape
(5, 3)
df.mean()
col2 7.00
col3 3.56
dtype: float64
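By default the reduction runs down each numeric column (axis=0); passing axis=1 reduces across the columns of each row instead. A quick sketch:
df.mean(axis=1)  # row-wise mean over the numeric columns col2 and col3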
df1 = pd.DataFrame({'A':[1,2,3]},index=[1,2,3])
df2 = pd.DataFrame({'A':[1,2,3]},index=[3,1,2])
df1-df2  # the subtraction aligns on the index labels, so the result is not all zeros
   A
1 -1
2 -1
3  2
df.drop(index='五',columns='col1')
   col2  col3
一     5   1.3
二     6   2.5
三     7   3.6
四     8   4.6
df['col1']=[1,2,3,4,5]
del df['col1']
df
   col2  col3
一     5   1.3
二     6   2.5
三     7   3.6
四     8   4.6
五     9   5.8
df['col1']=[1,2,3,4,5]
df.pop('col1')
一 1
二 2
三 3
四 4
五 5
Name: col1, dtype: int64
df
   col2  col3
一     5   1.3
二     6   2.5
三     7   3.6
四     8   4.6
五     9   5.8
df1['B']=list('abc')
df1
   A  B
1  1  a
2  2  b
3  3  c
df1.assign(C=pd.Series(list('def')))
   A  B    C
1  1  a    e
2  2  b    f
3  3  c  NaN
The assigned Series carries the default index 0, 1, 2, so only labels 1 and 2 align with df1's index; label 3 has no match and becomes NaN. assign also returns a new DataFrame, so df1 itself is unchanged:
df1
   A  B
1  1  a
2  2  b
3  3  c
df.select_dtypes(include=['number']).head()
   col2  col3
一     5   1.3
二     6   2.5
三     7   3.6
四     8   4.6
五     9   5.8
df.select_dtypes(include=['float']).head()
   col3
一  1.3
二  2.5
三  3.6
四  4.6
五  5.8
s = df.mean()
s.name='to_DataFrame'
s
col2 7.00
col3 3.56
Name: to_DataFrame, dtype: float64
s.to_frame()
      to_DataFrame
col2          7.00
col3          3.56
s.to_frame().T
              col2  col3
to_DataFrame   7.0  3.56
III. Common Basic Functions
df = pd.read_csv(r'D:\study\pandas\data\table.csv')
df.head()
  School Class    ID Gender   Address  Height  Weight  Math Physics
0    S_1   C_1  1101      M  street_1     173      63  34.0      A+
1    S_1   C_1  1102      F  street_2     192      73  32.5      B+
2    S_1   C_1  1103      M  street_2     186      82  87.2      B+
3    S_1   C_1  1104      F  street_2     167      81  80.4      B-
4    S_1   C_1  1105      F  street_4     159      64  84.8      B+
df.tail()
   School Class    ID Gender   Address  Height  Weight  Math Physics
30    S_2   C_4  2401      F  street_2     192      62  45.3       A
31    S_2   C_4  2402      M  street_7     166      82  48.7       B
32    S_2   C_4  2403      F  street_6     158      60  59.7      B+
33    S_2   C_4  2404      F  street_2     160      84  67.7       B
34    S_2   C_4  2405      F  street_6     193      54  47.6       B
df.head(6)
  School Class    ID Gender   Address  Height  Weight  Math Physics
0    S_1   C_1  1101      M  street_1     173      63  34.0      A+
1    S_1   C_1  1102      F  street_2     192      73  32.5      B+
2    S_1   C_1  1103      M  street_2     186      82  87.2      B+
3    S_1   C_1  1104      F  street_2     167      81  80.4      B-
4    S_1   C_1  1105      F  street_4     159      64  84.8      B+
5    S_1   C_2  1201      M  street_5     188      68  97.0      A-
df['Physics'].nunique()
7
df['Physics'].unique()
array(['A+', 'B+', 'B-', 'A-', 'B', 'A', 'C'], dtype=object)
df['Physics'].count()
35
df['Physics'].value_counts()
B+ 9
B 8
B- 6
A 4
A+ 3
A- 3
C 2
Name: Physics, dtype: int64
df.info()
RangeIndex: 35 entries, 0 to 34
Data columns (total 9 columns):
School 35 non-null object
Class 35 non-null object
ID 35 non-null int64
Gender 35 non-null object
Address 35 non-null object
Height 35 non-null int64
Weight 35 non-null int64
Math 35 non-null float64
Physics 35 non-null object
dtypes: float64(1), int64(3), object(5)
memory usage: 2.6+ KB
df.describe()
                ID      Height      Weight       Math
count     35.00000   35.000000   35.000000  35.000000
mean    1803.00000  174.142857   74.657143  61.351429
std      536.87741   13.541098   12.895377  19.915164
min     1101.00000  155.000000   53.000000  31.500000
25%     1204.50000  161.000000   63.000000  47.400000
50%     2103.00000  173.000000   74.000000  61.700000
75%     2301.50000  187.500000   82.000000  77.100000
max     2405.00000  195.000000  100.000000  97.000000
df.describe(percentiles=[.05, .25, .75, .95])
                ID      Height      Weight       Math
count     35.00000   35.000000   35.000000  35.000000
mean    1803.00000  174.142857   74.657143  61.351429
std      536.87741   13.541098   12.895377  19.915164
min     1101.00000  155.000000   53.000000  31.500000
5%      1102.70000  157.000000   56.100000  32.640000
25%     1204.50000  161.000000   63.000000  47.400000
50%     2103.00000  173.000000   74.000000  61.700000
75%     2301.50000  187.500000   82.000000  77.100000
95%     2403.30000  193.300000   97.600000  90.040000
max     2405.00000  195.000000  100.000000  97.000000
df['Physics'].describe()
count 35
unique 7
top B+
freq 9
Name: Physics, dtype: object
df['Math'].idxmax()
5
df['Math'].nlargest(3)
df['Math'].nsmallest(3)
10 31.5
1 32.5
26 32.7
Name: Math, dtype: float64
df['Math'].head()
0 34.0
1 32.5
2 87.2
3 80.4
4 84.8
Name: Math, dtype: float64
df['Math'].clip(33,80).head()
0 34.0
1 33.0
2 80.0
3 80.0
4 80.0
Name: Math, dtype: float64
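clip caps out-of-range values at the nearest bound; to replace them with something else entirely, a sketch using mask (the sentinel -999 is just an illustrative choice):
math = df['Math']
math.mask((math < 33) | (math > 80), -999).head()  # values outside [33, 80] become -999 instead of being clipped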
df['Math'].mad()
16.924244897959188
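mad() is the mean absolute deviation from the column mean; a quick check that reproduces the value by hand:
(df['Math'] - df['Math'].mean()).abs().mean()  # same value as df['Math'].mad()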
df['Address'].head()
0 street_1
1 street_2
2 street_2
3 street_2
4 street_4
Name: Address, dtype: object
df['Address'].replace(['street_1','street_2'],['one','two']).head()
0 one
1 two
2 two
3 two
4 street_4
Name: Address, dtype: object
df.replace({'Address':{'street_1':'one','street_2':'two'}}).head()
  School Class    ID Gender   Address  Height  Weight  Math Physics
0    S_1   C_1  1101      M       one     173      63  34.0      A+
1    S_1   C_1  1102      F       two     192      73  32.5      B+
2    S_1   C_1  1103      M       two     186      82  87.2      B+
3    S_1   C_1  1104      F       two     167      81  80.4      B-
4    S_1   C_1  1105      F  street_4     159      64  84.8      B+
df['Math'].apply(lambda x:str(x)+'!').head()
0 34.0!
1 32.5!
2 87.2!
3 80.4!
4 84.8!
Name: Math, dtype: object
df.apply(lambda x:x.apply(lambda x:str(x)+'!')).head()
  School Class     ID Gender    Address Height Weight   Math Physics
0   S_1!  C_1!  1101!     M!  street_1!   173!    63!  34.0!     A+!
1   S_1!  C_1!  1102!     F!  street_2!   192!    73!  32.5!     B+!
2   S_1!  C_1!  1103!     M!  street_2!   186!    82!  87.2!     B+!
3   S_1!  C_1!  1104!     F!  street_2!   167!    81!  80.4!     B-!
4   S_1!  C_1!  1105!     F!  street_4!   159!    64!  84.8!     B+!
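The nested apply above works element by element; DataFrame.applymap expresses the same thing directly. A sketch:
df.applymap(lambda x: str(x) + '!').head()  # element-wise equivalent of the nested apply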
IV. Sorting
df.set_index('Math').head()
     School Class    ID Gender   Address  Height  Weight Physics
Math
34.0    S_1   C_1  1101      M  street_1     173      63      A+
32.5    S_1   C_1  1102      F  street_2     192      73      B+
87.2    S_1   C_1  1103      M  street_2     186      82      B+
80.4    S_1   C_1  1104      F  street_2     167      81      B-
84.8    S_1   C_1  1105      F  street_4     159      64      B+
df.set_index('Math').sort_index().head()
     School Class    ID Gender   Address  Height  Weight Physics
Math
31.5    S_1   C_3  1301      M  street_4     161      68      B+
32.5    S_1   C_1  1102      F  street_2     192      73      B+
32.7    S_2   C_3  2302      M  street_5     171      88       A
33.8    S_1   C_2  1204      F  street_5     162      63       B
34.0    S_1   C_1  1101      M  street_1     173      63      A+
df.sort_values(by='Class').head()
   School Class    ID Gender   Address  Height  Weight  Math Physics
0     S_1   C_1  1101      M  street_1     173      63  34.0      A+
19    S_2   C_1  2105      M  street_4     170      81  34.2       A
18    S_2   C_1  2104      F  street_5     159      97  72.2      B+
16    S_2   C_1  2102      F  street_6     161      61  50.6      B+
15    S_2   C_1  2101      M  street_7     174      84  83.3       C
df.sort_values(by=['Address','Height']).head()
   School Class    ID Gender   Address  Height  Weight  Math Physics
0     S_1   C_1  1101      M  street_1     173      63  34.0      A+
11    S_1   C_3  1302      F  street_1     175      57  87.7      A-
23    S_2   C_2  2204      M  street_1     175      74  47.2      B-
33    S_2   C_4  2404      F  street_2     160      84  67.7       B
3     S_1   C_1  1104      F  street_2     167      81  80.4      B-
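sort_values also takes an ascending flag per sort key; a minimal sketch sorting Address ascending and Height descending within each address:
df.sort_values(by=['Address','Height'], ascending=[True, False]).head()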
Exercise 1
got=pd.read_csv(r'D:\study\pandas\data\Game_of_Thrones_Script.csv')
got.head()
  Release Date    Season    Episode     Episode Title          Name                                           Sentence
0    2011/4/17  Season 1  Episode 1  Winter is Coming  waymar royce  What do you expect? They're savages. One lot s...
1    2011/4/17  Season 1  Episode 1  Winter is Coming          will  I've never seen wildlings do a thing like this...
2    2011/4/17  Season 1  Episode 1  Winter is Coming  waymar royce                             How close did you get?
3    2011/4/17  Season 1  Episode 1  Winter is Coming          will                            Close as any man would.
4    2011/4/17  Season 1  Episode 1  Winter is Coming         gared                   We should head back to the wall.
got['Name'].nunique()
564
got['Name'].value_counts().head()
tyrion lannister 1760
jon snow 1133
daenerys targaryen 1048
cersei lannister 1005
jaime lannister 945
Name: Name, dtype: int64
got_words= got.assign(Words=got['Sentence'].apply(lambda x:len(x.split()))).sort_values(by='Name')
got_words.head()
      Release Date    Season     Episode             Episode Title            Name                                            Sentence  Words
276      2011/4/17  Season 1   Episode 1          Winter is Coming         a voice                        It's Maester Luwin, my lord.      5
3012     2011/6/19  Season 1  Episode 10            Fire and Blood  addam marbrand                 ls it true about Stannis and Renly?      7
3017     2011/6/19  Season 1  Episode 10            Fire and Blood  addam marbrand                                     Kevan Lannister      2
13610     2014/6/8  Season 4   Episode 9  The Watchers on the Wall           aemon  And what is it that couldn't wait until mornin...     10
13614     2014/6/8  Season 4   Episode 9  The Watchers on the Wall           aemon  Oh, no need. I know my way around this library...     48
# Running word totals per speaker: got_words is sorted by Name, so identical names are
# contiguous and the running sum resets whenever the name changes.
L_count = []
N_words = list(zip(got_words['Name'], got_words['Words']))
for pos, (name, words) in enumerate(N_words):
    if pos == 0:
        L_count.append(words)
    else:
        L_count.append(L_count[-1] + words if name == last else words)
    last = name
got_words['count'] = L_count
got_words['Name'][got_words['count'].idxmax()]
'tyrion lannister'
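A shorter route to the same answer, assuming "most words" means the largest total word count per speaker: group by Name and sum the Words column.
got_words.groupby('Name')['Words'].sum().idxmax()  # expected: 'tyrion lannister'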
Exercise 2
kobe=pd.read_csv(r'D:\study\pandas\data\Kobe_data.csv',index_col='shot_id')
kobe.head()
               action_type combined_shot_type  game_event_id   game_id      lat  loc_x  loc_y       lon  minutes_remaining  period  ...
shot_id
1                Jump Shot          Jump Shot             10  20000012  33.9723    167     72 -118.1028                 10       1  ...
2                Jump Shot          Jump Shot             12  20000012  34.0443   -157      0 -118.4268                 10       1  ...
3                Jump Shot          Jump Shot             35  20000012  33.9093   -101    135 -118.3708                  7       1  ...
4                Jump Shot          Jump Shot             43  20000012  33.8693    138    175 -118.1318                  6       1  ...
5        Driving Dunk Shot               Dunk            155  20000012  34.0443      0      0 -118.2698                  6       2  ...

         shot_made_flag       shot_type         shot_zone_area  shot_zone_basic  shot_zone_range     team_id           team_name   game_date    matchup opponent
shot_id
1                   NaN  2PT Field Goal          Right Side(R)        Mid-Range        16-24 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
2                   0.0  2PT Field Goal           Left Side(L)        Mid-Range         8-16 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
3                   1.0  2PT Field Goal   Left Side Center(LC)        Mid-Range        16-24 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
4                   0.0  2PT Field Goal  Right Side Center(RC)        Mid-Range        16-24 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
5                   1.0  2PT Field Goal              Center(C)  Restricted Area  Less Than 8 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR

5 rows × 24 columns
pd.Series(list(zip(kobe['action_type'],kobe['combined_shot_type']))).value_counts().index[0]
('Jump Shot', 'Jump Shot')
pd.Series(list(list(zip(*(pd.Series(list(zip(kobe['game_id'],kobe['opponent']))).unique()).tolist()))[1])).value_counts().index[0]
'SAS'
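The same computation unpacked for readability, a sketch assuming each game_id has a single opponent: deduplicate the (game_id, opponent) pairs so each game counts once, then count games per opponent.
pairs = kobe[['game_id','opponent']].drop_duplicates()  # one row per game
pairs['opponent'].value_counts().index[0]               # expected: 'SAS'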