python dataframe切片、筛选、分组、行列处理

dataframe切片

print(business.columns)  # print the column labels
df2.iloc[:,1:]  # select columns by position: all rows, every column from position 1 onward
df[3:5]  # a slice inside [] selects rows, not columns
df.iloc[3:5,0:2]  # rows 3-4 and columns 0-1, both by position (end bounds excluded)
df.loc[:,['A','B']]  # all rows, columns labelled 'A' and 'B'
df.iat[1,1]  # single scalar at row position 1, column position 1

注:loc 是 location 的意思,iloc 中的 i 是指 integer。
    loc works on labels in the index(即标签索引)。
    iloc works on positions in the index, so it only takes integers(位置索引,只能是整数)。

筛选

# Filter rows on several conditions. Each comparison needs its own
# parentheses because & binds tighter than > in Python.
df[(df.num > 0) & (df.year > 2016)]
# Same filter, returning only the listed columns. One .loc call selects rows
# and columns together instead of chaining two separate [] lookups.
df.loc[(df.num > 0) & (df.year > 2016), ['date','id']]
# isin: membership test against a collection of values.
# The collection is named `years` so the builtin `list` is not shadowed.
# NOTE(review): these values are strings, while the filter above compares
# `year` with the integer 2016 -- confirm which dtype the column really has.
years=['2017','2019','2009']
df['year'].isin(years)  # boolean Series, True where the value is in `years`
df[df['year'].isin(years)]  # DataFrame holding only the matching rows

实现行列计算

# New column holding, for every row, the sum across that row's columns (axis=1).
df['列求和'] = df.apply(lambda x: x.sum(), axis=1)
# New row holding the sum of every column (apply defaults to axis=0). It runs
# after the line above, so the freshly added '列求和' column is summed too.
df.loc['行求和'] = df.apply(lambda x: x.sum())
# Running (cumulative) total of one column, stored as a new column.
df['列累积和'] = df['某一列'].cumsum()
df.sum()  # by default sums the values of each column
df.sum(1)  # axis=1: sums the values of each row

# Covariance matrix of d_user, shown as a colour-graded table with two
# decimals. Styler.set_precision was deprecated in pandas 1.3 and removed in
# 2.0; Styler.format(precision=...) is its replacement.
# NOTE(review): the earlier comments called the data "scaled" here and
# "unscaled" below, but both statements read the same d_user -- confirm.
cov_user = d_user.cov()
cov_user.style.background_gradient(cmap='coolwarm').format(precision=2)
# Correlation matrix of d_user, styled the same way.
corr_user = d_user.corr()
corr_user.style.background_gradient(cmap='coolwarm').format(precision=2)

预处理

# Build a new DataFrame from a chosen subset of df's columns.
newDf = pd.DataFrame(df, columns=[column1, column2, column3])
# Concatenate every value of column 'x' into one string (values must be str).
"".join(df['x'])
# Combine several columns row by row with + and store the result in a new
# column (string concatenation, assuming the columns hold strings).
df['new'] = df['x']+df['y']+df['z']
# DataFrame built from b_reduced with columns 'c', 'a', 's'
# (the type of b_reduced is not visible here).
newb = pd.DataFrame(b_reduced, columns=['c', 'a', 's'])

# Drop the rows that have a missing value in the "funny" or "fans" column.
u17 = u17.dropna(subset=["funny", "fans"])

# astype: cast the 'two' and 'three' columns to float.
df[['two', 'three']] = df[['two', 'three']].astype(float)

#pandas.Series.str.contains
#用于判断Series中的每个字符串是否包含待匹配的模式或者正则表达式,返回的是一个boolean Series。
#函数签名(调用时不需要传self;na的默认值随pandas版本不同而不同):
#Series.str.contains(pat, case=True, flags=0, na=nan, regex=True)

groupby

# Sample DataFrame for the groupby examples: eight rows with the columns
# name, Year, Salary and Bonus.
salaries = pd.DataFrame(
    data=dict(
        name=['BOSS', 'Lilei', 'Lilei', 'Han', 'BOSS', 'BOSS', 'Han', 'BOSS'],
        Year=[2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017],
        Salary=[999999, 20000, 25000, 3000, 9999999, 999999, 3500, 999999],
        Bonus=[100000, 20000, 20000, 5000, 200000, 300000, 3000, 400000],
    )
)

python dataframe切片、筛选、分组、行列处理_第1张图片

python
# Group the salary records by the 'name' column.
group_by_name=salaries.groupby('name')
print(type(group_by_name))

# Iterating a GroupBy object yields (group key, sub-DataFrame) pairs.
for name,group in group_by_name:
    print(name)
    print(group)
# Count of non-null values per group in the selected columns.
print(group_by_name[['Year','Bonus']].count())

# Aggregate each group. The string 'sum' selects the GroupBy sum
# implementation; passing the builtin `sum` triggers a FutureWarning in
# recent pandas and is slated to change behaviour.
group_by_name[['Salary','Bonus']].agg('sum')
group_by_name.get_group('Lilei')  # the rows belonging to a single group

# After grouping, compute several statistics for each selected column.
# The result has one column per (column, statistic) pair, in the order
# Bonus count/mean/std followed by Salary count/mean/std.
print(group_by_name[['Bonus','Salary']].agg(['count','mean','std']))
t0 = group_by_name[['Bonus','Salary']].agg(['count','mean','std'])
sns.relplot(data=t0.iloc[:,0],kind='line')
sns.barplot(x="name", y="Bonus", hue="Year", data=salaries)
# pyplot.plot takes x and y as positional arguments; passing them as x=/y=
# keywords is rejected, so they are given positionally here.
plt.plot(t0.index, t0.iloc[:,0], c = 'y')
t0.iloc[:,[0,1,2]]  # first three columns by position (the Bonus statistics)

plt.grid(linestyle='-.')
# Plot the columns at positions 1 and 4 with custom legend labels.
t0.plot(y=[1,4],label = ["test",'ou'])
plt.show()
salaries.iloc[:,[0,1]]  # first two columns of the raw table by position

# Group by several columns at once: one group per (name, Year) combination.
group_by_name_year=salaries.groupby(['name','Year'])
print(group_by_name_year.count())  # non-null counts per group
group_by_name_year.groups  # mapping from each group key to the row labels in that group

datetime日期

# Get the current date and time.
# `dt` is the alias used below for the standard datetime module; it is
# imported here so the snippet does not fail with a NameError.
import datetime as dt
import time

now = time.strftime("%Y-%m-%d %H:%M:%S")  # current local time as a formatted string
today = dt.datetime.today()  # current local date and time as a datetime object
z['date'].dt.time  # time-of-day part of each value (assumes z['date'] is datetime-like -- TODO confirm)

你可能感兴趣的:(python,数据处理)