pandas DataFrame基础操作,不定时update
创建数据框
构造两个数据集
- df1:用户基础属性,年龄、性别、注册时间
- df2:用户交易属性,交易时间、支付金额
从dict创建
import pandas as pd
userid = list(range(1, 6))
sex = ["male", "female","male", "female","male"]
createtime_str = [
"2019-01-01 11:45:50", "2019-01-02 11:55:50", "2019-01-21 11:45:50",
"2019-02-01 12:45:50", "2019-01-15 10:40:50"
]
buytime_str = ["2019-04-01 11:45:50", "2019-05-02 11:56:50", "2019-07-21 12:45:50",
"2019-08-01 12:40:50", "2019-01-06 10:00:50"]
age = [18, 37, 21, 44, 39]
payamount =[11.15, 10.37, 12.11, 14.5, 16.39]
df1 = pd.DataFrame({
'userid':userid,'sex':sex,'age':age,'createtime_str':createtime_str})
df2 = pd.DataFrame({
'userid':userid,'buytime_str':buytime_str,'payamount':payamount})
df1
|
userid |
sex |
age |
createtime_str |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
df2
|
userid |
buytime_str |
payamount |
0 |
1 |
2019-04-01 11:45:50 |
11.15 |
1 |
2 |
2019-05-02 11:56:50 |
10.37 |
2 |
3 |
2019-07-21 12:45:50 |
12.11 |
3 |
4 |
2019-08-01 12:40:50 |
14.50 |
4 |
5 |
2019-01-06 10:00:50 |
16.39 |
df1
|
userid |
sex |
age |
createtime_str |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
数据框概览
判断类型
type(df1)
pandas.core.frame.DataFrame
打印列类型
df1.dtypes
userid int64
sex object
age int64
createtime_str object
dtype: object
描述性统计
只输出了数值型列的相关信息
df1.describe()
|
userid |
age |
count |
5.000000 |
5.000000 |
mean |
3.000000 |
31.800000 |
std |
1.581139 |
11.562872 |
min |
1.000000 |
18.000000 |
25% |
2.000000 |
21.000000 |
50% |
3.000000 |
37.000000 |
75% |
4.000000 |
39.000000 |
max |
5.000000 |
44.000000 |
行数列数统计
df1.shape
print("行数:"+str(df1.shape[0])+"\n"+"列数:"+str(df1.shape[1]))
行数:5
列数:4
取子集
取某个位置
行列索引从0开始
df1.iloc[1,2]
37
筛选前几行
df1.iloc[1:3,:]
|
userid |
sex |
age |
createtime_str |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
筛选某些列
df1.iloc[:,[1,3]]
|
sex |
createtime_str |
0 |
male |
2019-01-01 11:45:50 |
1 |
female |
2019-01-02 11:55:50 |
2 |
male |
2019-01-21 11:45:50 |
3 |
female |
2019-02-01 12:45:50 |
4 |
male |
2019-01-15 10:40:50 |
df1[["userid","age"]]
|
userid |
age |
0 |
1 |
18 |
1 |
2 |
37 |
2 |
3 |
21 |
3 |
4 |
44 |
4 |
5 |
39 |
条件筛选
df1.query("userid>=2")
|
userid |
sex |
age |
createtime_str |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
列操作
列数&列名
df1.columns.size
4
list(df1.columns)
['userid', 'sex', 'age', 'createtime_str']
列名修改
修改一个列
df1.rename(columns={
"userid":"id"},inplace=False)
|
id |
sex |
age |
createtime_str |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
修改多个列
df1.rename(columns={
"userid":"id","sex":"gender"},inplace=False)
|
id |
gender |
age |
createtime_str |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
修改全部列
注意:深拷贝和浅拷贝
https://www.runoob.com/w3cnote/python-understanding-dict-copy-shallow-or-deep.html
import copy
temp = copy.deepcopy(df1)
temp.columns=["id1","sex1","age1","createtime_str"]
temp
|
id1 |
sex1 |
age1 |
createtime_str |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
删除列
删除一个列
temp = copy.deepcopy(df1)
temp.drop(axis=1,columns=["sex"],inplace=False)
|
userid |
age |
createtime_str |
0 |
1 |
18 |
2019-01-01 11:45:50 |
1 |
2 |
37 |
2019-01-02 11:55:50 |
2 |
3 |
21 |
2019-01-21 11:45:50 |
3 |
4 |
44 |
2019-02-01 12:45:50 |
4 |
5 |
39 |
2019-01-15 10:40:50 |
删除多个列
temp = copy.deepcopy(df1)
temp.drop(axis=1,columns=["sex","age"],inplace=False)
|
userid |
createtime_str |
0 |
1 |
2019-01-01 11:45:50 |
1 |
2 |
2019-01-02 11:55:50 |
2 |
3 |
2019-01-21 11:45:50 |
3 |
4 |
2019-02-01 12:45:50 |
4 |
5 |
2019-01-15 10:40:50 |
列筛选
df1[["userid","age"]]
|
userid |
age |
0 |
1 |
18 |
1 |
2 |
37 |
2 |
3 |
21 |
3 |
4 |
44 |
4 |
5 |
39 |
df1.iloc[1:3,0:2]
|
userid |
sex |
1 |
2 |
female |
2 |
3 |
male |
增加列
temp = copy.deepcopy(df1)
temp["new"] = temp["age"]*2
temp
|
userid |
sex |
age |
createtime_str |
new |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
36 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
74 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
42 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
88 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
78 |
增加常数列
temp = copy.deepcopy(df1)
temp["new"] ="new"
temp
|
userid |
sex |
age |
createtime_str |
new |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
new |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
new |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
new |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
new |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
new |
通过运算增加列
temp = copy.deepcopy(df1)
temp["new"] = temp.age.apply(lambda x:"20多啦" if x>20 else "还是小伙子哦")
temp
|
userid |
sex |
age |
createtime_str |
new |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
还是小伙子哦 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
20多啦 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
20多啦 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
20多啦 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
20多啦 |
通过向量or列表增加列
temp = copy.deepcopy(df1)
temp["new"] = list(range(5))
temp
|
userid |
sex |
age |
createtime_str |
new |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
0 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
1 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
2 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
3 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
4 |
列的类型转化
几个列类型转化
df1.dtypes
userid int64
sex object
age int64
createtime_str object
dtype: object
temp = copy.deepcopy(df1)
temp["age"] = df1.age.astype("double")
temp.dtypes
userid int64
sex object
age float64
createtime_str object
dtype: object
多个列批量转化
批量修改目前支持格式不多
要么就写循环吧
temp = copy.deepcopy(df1)
temp[["createtime_str"]] = temp[["createtime_str"]].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S.%f')
temp.dtypes
userid int64
sex object
age int64
createtime_str datetime64[ns]
dtype: object
temp
|
userid |
sex |
age |
createtime_str |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
列之间的运算
df1.dtypes
userid int64
sex object
age int64
createtime_str object
dtype: object
import copy
temp = copy.deepcopy(df1)
temp["userid1"] = [5,4,3,2,1]
temp["greatest"] = temp[["userid","userid1"]].apply(lambda x:max(x),axis=1)
temp["leastest"] = temp[["userid","userid1"]].apply(lambda x:min(x),axis=1)
temp
|
userid |
sex |
age |
createtime_str |
userid1 |
greatest |
leastest |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
5 |
5 |
1 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
4 |
4 |
2 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
3 |
3 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
2 |
4 |
2 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
1 |
5 |
1 |
行操作
统计行数
df1.shape[0]
5
行筛选
条件筛选即可
df1.query("userid>=2")
|
userid |
sex |
age |
createtime_str |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
重复行删除
import copy
temp = copy.deepcopy(df1)
temp = pd.concat([df1,pd.DataFrame({
'userid':[1],'sex':["male"],'age':[18],'createtime_str':["2019-01-01 11:45:50"]})])
temp
|
userid |
sex |
age |
createtime_str |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
temp.drop_duplicates()
|
userid |
sex |
age |
createtime_str |
0 |
1 |
male |
18 |
2019-01-01 11:45:50 |
1 |
2 |
female |
37 |
2019-01-02 11:55:50 |
2 |
3 |
male |
21 |
2019-01-21 11:45:50 |
3 |
4 |
female |
44 |
2019-02-01 12:45:50 |
4 |
5 |
male |
39 |
2019-01-15 10:40:50 |
2020-02-29 Update于南京市栖霞区