Pandas基础操作
准备:
导入库
import pandas as pd # 导入pandas库并简写为pd
import numpy as np # 导入numpy库并简写为np
一、数据导入
pd.read_csv(filename) # 导入csv格式文件中的数据
pd.read_table(filename) # 导入有分隔符的文本 (如TSV) 中的数据
pd.read_excel(filename) # 导入Excel格式文件中的数据
pd.read_sql(query, connection_object) # 导入SQL数据表/数据库中的数据
pd.read_json(json_string) # 导入JSON格式的字符串、URL地址或者文件中的数据
pd.read_html(url) # 导入经过解析的URL地址中包含的数据框 (DataFrame) 数据
pd.read_clipboard() # 导入系统粘贴板里面的数据
pd.DataFrame(dict) # 导入Python字典 (dict) 里面的数据,其中key是数据框的表头,value是数据框的内容。
实际数据导入
data = pd.read_csv('D:\\数据分析\\天池数据项目数据\\淘宝用户行为分析\\UserBehavior\\UserBehavior_test.csv')
# D:\\数据分析\\天池数据项目数据\\淘宝用户行为分析\\UserBehavior\\UserBehavior_test.csv 为本地csv文件目录
二、数据导出
df.to_csv(filename) # 将数据框 (DataFrame)中的数据导出到csv格式的文件中
df.to_excel(filename) # 将数据框 (DataFrame)中的数据导出到Excel格式的文件中
df.to_sql(table_name,connection_object) # 将数据框 (DataFrame)中的数据写入SQL数据表/数据库中
df.to_json(filename) # 将数据框 (DataFrame)中的数据导出到JSON格式的文件中
三、数据结构及创建
Series
1、通过list创建
from pandas import Series,DataFrame
data0 = [1,2,3,4]
s = Series(data0,index=['a','b','c','d'])
print(s)
a 1
b 2
c 3
d 4
dtype: int64
2、通过数组创建
s1 = np.array(['a','s','d','f','g'])
print(s1)
['a' 's' 'd' 'f' 'g']
3、通过字典创建
data1 = {'a':1,'b':2,'c':3,'d':4}
s2 = pd.Series(data1,index=['a','d','e','b'])
print(s2)
a 1.0
d 4.0
e NaN
b 2.0
dtype: float64
上述结果中索引顺序保持传入的顺序不变,字典中缺少的键对应的元素使用NaN(非数字)填充
DataFrame
1、从列表创建DataFrame
a、通过单个列表或列表字典创建
data3 = [1,2,3,4,5,6]
s3 = pd.DataFrame(data3)
print(s3)
0
0 1
1 2
2 3
3 4
4 5
5 6
data4 = [['lijie',20],['zhangsan',45],['wangwu',30]]
s4 = pd.DataFrame(data4,columns=['name','age'])
print(s4)
name age
0 lijie 20
1 zhangsan 45
2 wangwu 30
b、通过字典列表创建
data5 = [{'a':1,'b':2},{'c':5,'d':8,'e':10}]
s5 = pd.DataFrame(data5)
print(s5)
a b c d e
0 1.0 2.0 NaN NaN NaN
1 NaN NaN 5.0 8.0 10.0
2、从ndarrays/Lists的字典来创建DataFrame
data6 = {'name':['tom','lose','steve','bob'],'age':[12,21,30,14]}
s6 = pd.DataFrame(data6)
print(s6)
name age
0 tom 12
1 lose 21
2 steve 30
3 bob 14
3、从系列字典创建DataFrame
data7 = {'one':pd.Series([1,2,3],index = ['a','b','c']),
'two':pd.Series([1,2,3,4],index = ['a','b','c','d'])}
s7 = pd.DataFrame(data7)
print(s7)
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
四、数据查看
1、基础属性
a、Series
s8 = Series([1,2,3,4,5])
print(s8)
0 1
1 2
2 3
3 4
4 5
dtype: int64
s8.axes # 返回行轴标签列表
[RangeIndex(start=0, stop=5, step=1)]
s8.dtype # 返回对象的数据类型
dtype('int64')
s8.empty # 如果系列为空,则返回True
False
s8.ndim # 返回底层数据的维数
1
s8.size # 返回基础数据中的元素数
5
s8.values # 将系列作为ndarray返回
array([1, 2, 3, 4, 5], dtype=int64)
s8.head() # 返回前n行
0 1
1 2
2 3
3 4
4 5
dtype: int64
s8.tail() # 返回最后n行
0 1
1 2
2 3
3 4
4 5
dtype: int64
b、DataFrame
data9 = {'one':pd.Series([1,2,3],index = ['a','b','c']),
'two':pd.Series([1,2,3,4],index = ['a','b','c','d'])}
s9 = pd.DataFrame(data9)
print(s9)
one two
a 1.0 1
b 2.0 2
c 3.0 3
d NaN 4
s9.T # 转置行与列
a b c d
one 1.0 2.0 3.0 NaN
two 1.0 2.0 3.0 4.0
s9.axes # 返回一个列,行轴标签和列轴标签作为唯一的成员。
[Index(['a', 'b', 'c', 'd'], dtype='object'),
Index(['one', 'two'], dtype='object')]
s9.dtypes # 返回此对象中的数据类型(dtypes)
one float64
two int64
dtype: object
s9.empty # 如果NDFrame完全为空(任一轴的长度为0,即不含任何元素),则返回True;否则返回False
False
s9.ndim # 轴/数组维度大小
2
s9.shape # 返回表示DataFrame的维度的元组
(4, 2)
s9.size # NDFrame中的元素数
8
s9.values # NDFrame的Numpy表示
array([[ 1., 1.],
[ 2., 2.],
[ 3., 3.],
[nan, 4.]])
s9.index # 查看索引
Index(['a', 'b', 'c', 'd'], dtype='object')
s9.columns # 查看列名
Index(['one', 'two'], dtype='object')
s9.info() # 查看数据框的索引、数据类型以及内存信息
Index: 4 entries, a to d
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 one 3 non-null float64
1 two 4 non-null int64
dtypes: float64(1), int64(1)
memory usage: 256.0+ bytes
2、描述统计
data10 = {'Name':pd.Series(['Tom','James','Ricky','Vin','Steve','Minsu','Jack',
'Lee','David','Gasper','Betina','Andres']),
'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])}
s10 = pd.DataFrame(data10)
print(s10)
Name Age Rating
0 Tom 25 4.23
1 James 26 3.24
2 Ricky 25 3.98
3 Vin 23 2.56
4 Steve 30 3.20
5 Minsu 29 4.60
6 Jack 23 3.80
7 Lee 34 3.78
8 David 40 2.98
9 Gasper 30 4.80
10 Betina 51 4.10
11 Andres 46 3.65
s10.sum() # 求和
Name TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe...
Age 382
Rating 44.92
dtype: object
s10.count() # 非空元素
Name 12
Age 12
Rating 12
dtype: int64
s10.min() # 最小值
Name Andres
Age 23
Rating 2.56
dtype: object
s10.max() # 最大值
Name Vin
Age 51
Rating 4.8
dtype: object
s10.cumsum() # 累计求和
Name Age Rating
0 Tom 25 4.23
1 TomJames 51 7.47
2 TomJamesRicky 76 11.45
3 TomJamesRickyVin 99 14.01
4 TomJamesRickyVinSteve 129 17.21
5 TomJamesRickyVinSteveMinsu 158 21.81
6 TomJamesRickyVinSteveMinsuJack 181 25.61
7 TomJamesRickyVinSteveMinsuJackLee 215 29.39
8 TomJamesRickyVinSteveMinsuJackLeeDavid 255 32.37
9 TomJamesRickyVinSteveMinsuJackLeeDavidGasper 285 37.17
10 TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe... 336 41.27
11 TomJamesRickyVinSteveMinsuJackLeeDavidGasperBe... 382 44.92
s10['Age'].mean() # 年龄平均值
31.833333333333332
s10.describe() # 统计信息摘要
Age Rating
count 12.000000 12.000000
mean 31.833333 3.743333
std 9.232682 0.661628
min 23.000000 2.560000
25% 25.000000 3.230000
50% 29.500000 3.790000
75% 35.500000 4.132500
max 51.000000 4.800000
五、数据清洗
1、重命名
s.columns = ['a','b'] # 重命名数据框的列名称
data.head(5) # data为最开始导入的csv数据
1 2268318 2520377 pv 1511544070
0 1 2333346.0 2520771.0 pv 1.511562e+09
1 1 2576651.0 149192.0 pv 1.511573e+09
2 1 2576651.0 149192.0 pv 1.511573e+09
3 1 4365585.0 2520377.0 pv 1.511596e+09
4 1 4606018.0 2735466.0 NaN 1.511616e+09
data.columns = ["user_id","item_id","catagory_id","behavior_type","timestamp"]
data.head(5)
user_id item_id catagory_id behavior_type timestamp
0 1 2333346.0 2520771.0 pv 1.511562e+09
1 1 2576651.0 149192.0 pv 1.511573e+09
2 1 2576651.0 149192.0 pv 1.511573e+09
3 1 4365585.0 2520377.0 pv 1.511596e+09
4 1 4606018.0 2735466.0 NaN 1.511616e+09
s.rename() # 重命名列/行,传入的函数或字典值必须是1对1的,没有包含在字典或者Series中的标签将保持原来的名称。字典中包含df中没有的标签,不会报错
s11 = data.rename(columns={'user_id':'a','item_id':'b','catagory_id':'c','behavior_type':'d','timestamp':'e'})
s11.head()
a b c d e
0 1 2333346.0 2520771.0 pv 1.511562e+09
1 1 2576651.0 149192.0 pv 1.511573e+09
2 1 2576651.0 149192.0 pv 1.511573e+09
3 1 4365585.0 2520377.0 pv 1.511596e+09
4 1 4606018.0 2735466.0 NaN 1.511616e+09
s.set_index(),可以设置单索引和复合索引
s12 = data.set_index('user_id') # 将user_id作为索引(单索引)
s12
item_id catagory_id behavior_type timestamp
user_id
1 2333346.0 2520771.0 pv 1.511562e+09
1 2576651.0 149192.0 pv 1.511573e+09
1 2576651.0 149192.0 pv 1.511573e+09
1 4365585.0 2520377.0 pv 1.511596e+09
1 4606018.0 2735466.0 NaN 1.511616e+09
... ... ... ... ...
100 4182583.0 1258177.0 pv 1.511751e+09
100 2337874.0 1194311.0 pv 1.511751e+09
100 3658601.0 2342116.0 pv 1.511758e+09
100 5153036.0 2342116.0 pv 1.511759e+09
100 598929.0 2429887.0 buy 1.511759e+09
100 rows × 4 columns
s13 = data.set_index(['user_id','item_id']) # 将user_id作为索引(复合索引)
s13
catagory_id behavior_type timestamp
user_id item_id
1 2333346.0 2520771.0 pv 1.511562e+09
2576651.0 149192.0 pv 1.511573e+09
2576651.0 149192.0 pv 1.511573e+09
4365585.0 2520377.0 pv 1.511596e+09
4606018.0 2735466.0 NaN 1.511616e+09
... ... ... ... ...
100 4182583.0 1258177.0 pv 1.511751e+09
2337874.0 1194311.0 pv 1.511751e+09
3658601.0 2342116.0 pv 1.511758e+09
5153036.0 2342116.0 pv 1.511759e+09
598929.0 2429887.0 buy 1.511759e+09
100 rows × 3 columns
2、重复值
duplicated() 查找重复值
data.duplicated()
0 False
1 False
2 False
3 False
4 False
...
95 False
96 False
97 False
98 False
99 False
Length: 100, dtype: bool
drop_duplicates() 删除重复值
data.drop_duplicates()
user_id item_id catagory_id behavior_type timestamp
0 1 2333346.0 2520771.0 pv 1.511562e+09
1 1 2576651.0 149192.0 pv 1.511573e+09
2 1 2576651.0 149192.0 pv 1.511573e+09
3 1 4365585.0 2520377.0 pv 1.511596e+09
4 1 4606018.0 2735466.0 NaN 1.511616e+09
... ... ... ... ... ...
95 100 4182583.0 1258177.0 pv 1.511751e+09
96 100 2337874.0 1194311.0 pv 1.511751e+09
97 100 3658601.0 2342116.0 pv 1.511758e+09
98 100 5153036.0 2342116.0 pv 1.511759e+09
99 100 598929.0 2429887.0 buy 1.511759e+09
100 rows × 5 columns
3、空值、缺失值
data.isnull() # True为空,反之非空
user_id item_id catagory_id behavior_type timestamp
0 False False False False False
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False True False
... ... ... ... ... ...
95 False False False False False
96 False False False False False
97 False False False False False
98 False False False False False
99 False False False False False
100 rows × 5 columns
data.isnull().values==True # 可用data.isnull().values==True来定位
array([[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, True, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, True],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, True, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, True, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, True],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False],
[False, False, False, False, False]])
data.notnull() # true为非空,反之为空
user_id item_id catagory_id behavior_type timestamp
0 True True True True True
1 True True True True True
2 True True True True True
3 True True True True True
4 True True True False True
... ... ... ... ... ...
95 True True True True True
96 True True True True True
97 True True True True True
98 True True True True True
99 True True True True True
100 rows × 5 columns
data.fillna(0) # 用0填充空值
user_id item_id catagory_id behavior_type timestamp
0 1 2333346.0 2520771.0 pv 1.511562e+09
1 1 2576651.0 149192.0 pv 1.511573e+09
2 1 2576651.0 149192.0 pv 1.511573e+09
3 1 4365585.0 2520377.0 pv 1.511596e+09
4 1 4606018.0 2735466.0 0 1.511616e+09
... ... ... ... ... ...
95 100 4182583.0 1258177.0 pv 1.511751e+09
96 100 2337874.0 1194311.0 pv 1.511751e+09
97 100 3658601.0 2342116.0 pv 1.511758e+09
98 100 5153036.0 2342116.0 pv 1.511759e+09
99 100 598929.0 2429887.0 buy 1.511759e+09
100 rows × 5 columns
data.dropna(how='any', inplace=True) # 删除 含空值的行
4、字符处理
空格
data['behavior_type'].value_counts()
pv 87
fav 5
buy 3
Name: behavior_type, dtype: int64
data.replace('p','pv')
user_id item_id catagory_id behavior_type timestamp
0 1 2333346.0 2520771.0 pv 1.511562e+09
1 1 2576651.0 149192.0 pv 1.511573e+09
2 1 2576651.0 149192.0 pv 1.511573e+09
3 1 4365585.0 2520377.0 pv 1.511596e+09
5 1 230380.0 411153.0 pv 1.511645e+09
... ... ... ... ... ...
95 100 4182583.0 1258177.0 pv 1.511751e+09
96 100 2337874.0 1194311.0 pv 1.511751e+09
97 100 3658601.0 2342116.0 pv 1.511758e+09
98 100 5153036.0 2342116.0 pv 1.511759e+09
99 100 598929.0 2429887.0 buy 1.511759e+09
95 rows × 5 columns
去除空格
data['behavior_type'].map(str.strip) # 去除两端空格
0 pv
1 pv
2 pv
3 pv
5 pv
...
95 pv
96 pv
97 pv
98 pv
99 buy
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].map(str.lstrip) # 去除左端空格
0 pv
1 pv
2 pv
3 pv
5 pv
...
95 pv
96 pv
97 pv
98 pv
99 buy
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].map(str.rstrip) # 去除右端空格
0 pv
1 pv
2 pv
3 pv
5 pv
...
95 pv
96 pv
97 pv
98 pv
99 buy
Name: behavior_type, Length: 95, dtype: object
大小写
data['behavior_type'].map(str.upper) # 大写
0 PV
1 PV
2 PV
3 PV
5 PV
...
95 PV
96 PV
97 PV
98 PV
99 BUY
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].map(str.lower) # 小写
0 pv
1 pv
2 pv
3 pv
5 pv
...
95 pv
96 pv
97 pv
98 pv
99 buy
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].map(str.title) # 首字母大写
0 Pv
1 Pv
2 Pv
3 Pv
5 Pv
...
95 Pv
96 Pv
97 Pv
98 Pv
99 Buy
Name: behavior_type, Length: 95, dtype: object
data['behavior_type'].value_counts()
pv 87
fav 5
buy 3
Name: behavior_type, dtype: int64
# data.replace('Pv','PV') 替换
5、异常和极端值
发现方法:
a、可通过describe()方法,观察统计的各项数据,发现异常值
b、借助散点图、箱型图、正态分布图等进行异常值的发现
处理办法
a、删除——先将异常值替换为na,然后用dropna()删除
b、视为缺失值——先将异常值替换为na,然后用缺失值处理方法处理(填充,插值等)
6、更改数据格式
data['timestamp'].dtype
dtype('float64')
data['timestamp'].astype('int32') # 将 'float64' 转化为 'int32'
0 1511561733
1 1511572885
2 1511572886
3 1511596146
5 1511644942
...
95 1511750991
96 1511751022
97 1511757958
98 1511758581
99 1511759458
Name: timestamp, Length: 95, dtype: int32
data['date'] = pd.to_datetime(data['timestamp']) # 注:此处为秒级Unix时间戳,应写作 pd.to_datetime(data['timestamp'], unit='s');不指定unit时默认按纳秒解析,所以下方输出全部落在1970年附近
data['date']
0 1970-01-01 00:00:01.511561733
1 1970-01-01 00:00:01.511572885
2 1970-01-01 00:00:01.511572886
3 1970-01-01 00:00:01.511596146
5 1970-01-01 00:00:01.511644942
...
95 1970-01-01 00:00:01.511750991
96 1970-01-01 00:00:01.511751022
97 1970-01-01 00:00:01.511757958
98 1970-01-01 00:00:01.511758581
99 1970-01-01 00:00:01.511759458
Name: date, Length: 95, dtype: datetime64[ns]
7、分列
str.split(self,pat = None,n = -1,expand = False )
pat str,可选:要分割的字符串或正则表达式。如果未指定,请在空白处分割。
n int,默认为-1(全部):限制输出的分割数。None、0和-1都将被解释为返回所有拆分。
expand,默认为False:将拆分的字符串展开为单独的列。如果为True,则返回DataFrame / MultiIndex扩展维;如果为False,则返回包含字符串列表的Series / Index。
s14 = pd.Series(["this is a regular sentence","https://docs.python.org/3/tutorial/index.html",np.nan])
s14
0 this is a regular sentence
1 https://docs.python.org/3/tutorial/index.html
2 NaN
dtype: object
s14.str.split()
0 [this, is, a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 NaN
dtype: object
s14.str.rsplit()
0 [this, is, a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 NaN
dtype: object
s14.str.split(n=2)
0 [this, is, a regular sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 NaN
dtype: object
s14.str.rsplit(n=2)
0 [this is a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 NaN
dtype: object
s14.str.rsplit(pat='/')
0 [this is a regular sentence]
1 [https:, , docs.python.org, 3, tutorial, index...
2 NaN
dtype: object
s14.str.split(pat='/',expand=True)
0 1 2 3 4 5
0 this is a regular sentence None None None None None
1 https: docs.python.org 3 tutorial index.html
2 NaN NaN NaN NaN NaN NaN
六、数据预处理
1、数据分组、分段
groupby() 分组
data.groupby(['behavior_type']).count() # 按照'behavior_type'对data进行分组并计数
user_id item_id catagory_id timestamp date
behavior_type
buy 3 3 3 3 3
fav 5 5 5 5 5
pv 87 87 87 87 87
data.groupby(['behavior_type','item_id']).sum() # 按照'behavior_type'和'item_id'对data进行分组并求和
user_id catagory_id timestamp
behavior_type item_id
buy 598929.0 100 2429887.0 1.511759e+09
1603476.0 100 2951233.0 1.511580e+09
2971043.0 100 4869428.0 1.511618e+09
fav 2158340.0 100 2429887.0 1.511743e+09
2971043.0 100 4869428.0 1.511565e+09
... ... ... ... ...
pv 4954999.0 3 1233459.0 4.536499e+09
4973305.0 1 2520771.0 1.511969e+09
5002615.0 2 5040754.0 3.023684e+09
5100093.0 100 2945933.0 1.511552e+09
5153036.0 100 2342116.0 1.511759e+09
74 rows × 3 columns
cut()/qcut() 分段
score_list = np.random.randint(25, 100, size=20) # 分数列表
print(score_list)
[38 62 72 26 60 50 61 86 38 97 72 47 61 59 83 73 48 56 59 38]
bins = [0, 59, 70, 80, 100] # 指定分数区间
print(bins)
[0, 59, 70, 80, 100]
score_cut = pd.cut(score_list, bins) # 对分数按照分数区间进行分段
print(score_cut)
[(0, 59], (59, 70], (70, 80], (0, 59], (59, 70], ..., (70, 80], (0, 59], (0, 59], (0, 59], (0, 59]]
Length: 20
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
pd.value_counts(score_cut) # 查看各分数区间的人数
(0, 59] 10
(59, 70] 4
(80, 100] 3
(70, 80] 3
dtype: int64
2、 筛选、排序
sort_values() 按列进行排序
data.sort_values('user_id',ascending=True) # 按照'user_id'对data进行升序排序,ascending为False则为降序
user_id item_id catagory_id behavior_type timestamp date
0 1 2333346.0 2520771.0 pv 1.511562e+09 1970-01-01 00:00:01.511561733
30 1 3157558.0 2520771.0 pv 1.511985e+09 1970-01-01 00:00:01.511984943
31 1 2087357.0 2131531.0 pv 1.512005e+09 1970-01-01 00:00:01.512004568
32 1 4170517.0 149192.0 pv 1.512011e+09 1970-01-01 00:00:01.512011019
33 1 1340922.0 4690421.0 pv 1.512041e+09 1970-01-01 00:00:01.512041260
... ... ... ... ... ... ...
72 100 2971043.0 4869428.0 buy 1.511618e+09 1970-01-01 00:00:01.511617549
73 100 2337874.0 1194311.0 pv 1.511680e+09 1970-01-01 00:00:01.511680485
74 100 4182583.0 1258177.0 pv 1.511681e+09 1970-01-01 00:00:01.511680521
76 100 4919803.0 2881542.0 pv 1.511684e+09 1970-01-01 00:00:01.511684032
99 100 598929.0 2429887.0 buy 1.511759e+09 1970-01-01 00:00:01.511759458
95 rows × 6 columns
data.sort_values(['user_id','item_id'],ascending=[True,False])
user_id item_id catagory_id behavior_type timestamp date
14 1 5002615.0 2520377.0 pv 1.511839e+09 1970-01-01 00:00:01.511839385
16 1 5002615.0 2520377.0 pv 1.511844e+09 1970-01-01 00:00:01.511844273
28 1 4973305.0 2520771.0 pv 1.511969e+09 1970-01-01 00:00:01.511969365
37 1 4954999.0 411153.0 pv 1.512061e+09 1970-01-01 00:00:01.512061318
42 1 4954999.0 411153.0 pv 1.512173e+09 1970-01-01 00:00:01.512172732
... ... ... ... ... ... ...
70 100 1603476.0 2951233.0 buy 1.511580e+09 1970-01-01 00:00:01.511579908
58 100 1220136.0 4869428.0 pv 1.511551e+09 1970-01-01 00:00:01.511550908
65 100 704268.0 223690.0 pv 1.511564e+09 1970-01-01 00:00:01.511563606
92 100 598929.0 2429887.0 pv 1.511743e+09 1970-01-01 00:00:01.511742899
99 100 598929.0 2429887.0 buy 1.511759e+09 1970-01-01 00:00:01.511759458
95 rows × 6 columns
sort_index() 按行进行排序
3、 连接和组合
append()
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df1.append(df2) # 在数据框df1的末尾追加数据框df2,其中df1和df2的列应该一致
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
4 A4 B4 C4 D4
5 A5 B5 C5 D5
6 A6 B6 C6 D6
7 A7 B7 C7 D7
concat()
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
pd.concat([df1,df2],axis=1) # 沿列方向拼接df1和df2,按行索引对齐;索引不重合的位置用NaN填充(见下方输出)
A B C D A B C D
0 A0 B0 C0 D0 NaN NaN NaN NaN
1 A1 B1 C1 D1 NaN NaN NaN NaN
2 A2 B2 C2 D2 NaN NaN NaN NaN
3 A3 B3 C3 D3 NaN NaN NaN NaN
4 NaN NaN NaN NaN A4 B4 C4 D4
5 NaN NaN NaN NaN A5 B5 C5 D5
6 NaN NaN NaN NaN A6 B6 C6 D6
7 NaN NaN NaN NaN A7 B7 C7 D7
join()
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'key': ['K0', 'K1', 'K0', 'K1']})
df2 = pd.DataFrame({'C': ['C0', 'C1'],
'D': ['D0', 'D1']},
index=['K0', 'K1'])
df1.join(df2, on='key') # 将df1的'key'列与df2的行索引对齐,做左连接
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K0 C0 D0
3 A3 B3 K1 C1 D1
merge()
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
result = pd.merge(left, right, on='key')
print(result)
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
七、数据索引、提取
df[col] 以数组 Series 的形式返回选取的列
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df['C']
0 0.340454
1 0.116156
2 0.234191
3 0.516992
4 0.354863
Name: C, dtype: float64
df[[col1,col2]] 以新的数据框(DataFrame)的形式返回选取的列
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df[['B','E']]
B E
0 0.313619 0.848954
1 0.571887 0.337651
2 0.666888 0.086449
3 0.423615 0.368037
4 0.453995 0.479666
df.loc['index_one'] 按照索引选取
s = pd.Series(np.array(['I','Love','Data']))
s.loc[1]
'Love'
df.iloc[0] 按照位置选取
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df.iloc[0,:] # 选取第一行
A 0.960768
B 0.433873
C 0.777235
D 0.799733
E 0.643273
Name: 0, dtype: float64
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df.iloc[0,0] # 选取第一行第一个元素
0.7353611931210813
时间序列
八、数据统计
描述统计
data.describe()
user_id item_id catagory_id timestamp
count 95.000000 9.500000e+01 9.500000e+01 9.500000e+01
mean 45.810526 2.831879e+06 2.494733e+06 1.511826e+09
std 49.538787 1.398492e+06 1.421917e+06 2.213639e+05
min 1.000000 4.625900e+04 1.491920e+05 1.511551e+09
25% 1.000000 2.087357e+06 1.289235e+06 1.511682e+09
50% 1.000000 2.791761e+06 2.465336e+06 1.511751e+09
75% 100.000000 3.875118e+06 3.213828e+06 1.511980e+09
max 100.000000 5.153036e+06 4.869428e+06 1.512282e+09
data.groupby(['behavior_type']).count().mean()
user_id 31.666667
item_id 31.666667
catagory_id 31.666667
timestamp 31.666667
date 31.666667
dtype: float64
更多见数据查看
函数应用
apply:应用在DataFrame的行或列中
applymap:应用在DataFrame的每个元素中
map:应用在单独一列(Series)的每个元素中
九、数据透视表
pd.pivot_table()