目录
1.排序
2.去重
3.排序
4.分组排序
5.分组统计之频数
(1)单列
mysql:select * from uds.amz_daily_sales where purchase_date>='2021-04-01' order by asin desc;
pandas:
# Sort by the asin column in descending order; the sorted frame is returned.
data.sort_values(by='asin', ascending=False)
# Same sort, but applied to data itself (inplace=True) instead of returning a new frame.
data.sort_values(by='asin', ascending=False, inplace=True)
# Same sort, with rows whose asin is NaN placed first.
data.sort_values(by='asin', ascending=False, na_position='first')
(2)多列
mysql:select * from uds.amz_daily_sales where purchase_date>='2021-04-01' order by sales_channel_id,asin desc, sales_channel asc , purchase_date asc;
pandas:data.sort_values(by=['sales_channel_id','asin','sales_channel','purchase_date'],ascending=[True,False,True,True]);
# Drop rows that are exact duplicates of another row; a new frame is returned.
df.drop_duplicates()
# De-duplicate on a single column, keeping the last row for each brand;
# inplace=True modifies df itself.
df.drop_duplicates(subset='brand', keep='last', inplace=True)
# De-duplicate on a combination of columns, keeping the last row for each
# (brand, style) pair; inplace=True modifies df itself.
df.drop_duplicates(subset=['brand', 'style'], inplace=True, keep='last')
# Sort rows by qty, largest first.
data.sort_values(by='qty', axis='index', ascending=False)
# Sort rows by asin and then qty, both in descending order.
data.sort_values(by=['asin', 'qty'], axis='index', ascending=False)
# Sort by col1 (ascending by default) with rows whose col1 is NaN placed first.
df.sort_values(by='col1', na_position='first')
# Sort by col2 and then col3, both in descending order.
df.sort_values(by=['col2', 'col3'], ascending=[False, False])
# Sort by col2 descending, then by col3 ascending.
df.sort_values(by=['col2', 'col3'], ascending=[False, True])
# Sort by col1 in descending order with NaN rows first, modifying df itself.
df.sort_values(by='col1', ascending=False, na_position='first', inplace=True)
# Sample data for the group-ranking examples: two brands with six rows each.
df = pd.DataFrame({
    'brand': ['Yum Yum'] * 6 + ['Indomie'] * 6,
    'style': ['cup1', 'cup2', 'cup1', 'cup1', 'cup2', 'cup1',
              'pack', 'pack', 'pack', 'back', 'back', 'back'],
    'rating': [4, 4, 2, 1, 6, 7, 10, 10, 17, 15, 5, 9],
    'orders': [10, 12, 22, 21, 16, 71, 110, 210, 117, 215, 15, 99],
})
# 1. Group by one column, then rank within each group by one column.
# Rank rating within each brand in descending order (ascending=False), so the
# highest rating gets rank 1; with the default method tied ratings share the
# average of their ranks.
df['ranking'] = df.groupby('brand')['rating'].rank(ascending=False)
# Same descending rank, but method='first' gives tied ratings distinct ranks
# in order of appearance, so the earlier row gets the smaller rank.
df['ranking'] = df.groupby('brand')['rating'].rank(method='first', ascending=False)
# Group by one column, then rank within each group by several columns.
# The frame is sorted by rating and then orders (both descending) before
# ranking: method='first' numbers tied ratings in the order they appear, so
# after this sort a rating tie is broken by the larger orders value.
df.sort_values(by=['brand', 'rating', 'orders'], ascending=[True, False, False], inplace=True)
df['ranking'] = df.groupby('brand')['rating'].rank(ascending=False, method='first')
# 2. Group by several columns, then rank within each group by one column.
ranked_groups = []
for _, group in df.groupby(['brand', 'style']):
    # Work on a copy so adding the column never writes into a slice of df.
    group = group.copy()
    # method='first' gives tied ratings distinct ranks in order of appearance.
    group['ranking'] = group['rating'].rank(ascending=False, method='first')
    print(group)
    ranked_groups.append(group)
# Concatenate once after the loop instead of growing df2 on every iteration;
# when df has no groups the result is an empty frame.
df2 = pd.concat(ranked_groups, axis=0) if ranked_groups else pd.DataFrame()
# Group by several columns, then rank within each group by several columns.
ranked_groups = []
for _, group in df.groupby(['brand', 'style']):
    # sort_values returns a new frame, so the group slice is not modified.
    # Sorting by rating and then orders (both descending) lets method='first'
    # break rating ties by the larger orders value.
    group = group.sort_values(by=['rating', 'orders'], ascending=[False, False])
    group['ranking'] = group['rating'].rank(ascending=False, method='first')
    print(group)
    ranked_groups.append(group)
# Concatenate once after the loop instead of growing df2 on every iteration;
# when df has no groups the result is an empty frame.
df2 = pd.concat(ranked_groups, axis=0) if ranked_groups else pd.DataFrame()
# 3. Group by several columns, then rank within each group by one column.
# Grouping on the column list keeps distinct (brand, style) pairs apart; a
# concatenated string key could merge pairs such as ('ab', 'c') and ('a', 'bc').
df['ranking'] = df.groupby(['brand', 'style'])['rating'].rank(ascending=False, method='first')
header = ['brand', 'style', 'rating', 'ranking']
# Keep the column selection under its own name so df retains the orders
# column that the later examples sort on.
df3 = df[header]
# Group by several columns, then rank within each group by several columns.
# Sort by the group keys, then by rating and orders (both descending), so that
# method='first' breaks rating ties by the larger orders value. Grouping on the
# column list avoids a concatenated string key, which could merge distinct
# (brand, style) pairs.
df.sort_values(by=['brand', 'style', 'rating', 'orders'], ascending=[True, True, False, False], inplace=True)
df['ranking'] = df.groupby(['brand', 'style'])['rating'].rank(ascending=False, method='first')
header = ['brand', 'style', 'rating', 'ranking', 'orders']
df = df[header]
(1)求表行数:df.shape[0] 或 len(df)(df.shape 返回的是 (行数, 列数) 元组)
(2)求某列值去重后的行数:
# 1. Count distinct brand values: de-duplicate on brand and take the row count.
# inplace=True is deliberately not used, so df keeps all of its rows for the
# examples that follow; shape[0] is the row count (shape itself is a
# (rows, columns) tuple).
df.drop_duplicates(subset=['brand']).shape[0]
# 2. Count distinct (brand, style) combinations: the length of the groupby
# object is the number of groups.
len(df.groupby(['brand', 'style']))
(3)分组排序取前2条
# Sort by rating and then orders (both descending) and keep the first two rows
# of each (brand, style) group. Grouping on the column list avoids building a
# concatenated string key, which could merge distinct pairs and would add a
# helper column to df.
df.sort_values(['rating', 'orders'], ascending=[False, False]).groupby(['brand', 'style']).head(2)
(1) 统计df中brand列各值的出现次数
# Count how many times each value occurs in the brand column.
df['brand'].value_counts() #returns a Series
(2) 统计df中brand与style两列组合值出现次数
# Count occurrences of each style value within each brand.
df.groupby('brand')['style'].value_counts()