python - pandas 之 dataframe - 排序&去重&常规统计&分组排序&分组统计

目录

1.排序

2.去重

3.排序

4.分组排序

5.vs mysql

6.分组统计之频数


1.排序

(1)单列
mysql:select * from uds.amz_daily_sales where purchase_date>='2021-04-01' order by asin desc;
pandas:
# Sort by a single column, descending; the sorted frame is returned.
data.sort_values('asin', ascending=False)
# Same sort, applied to `data` itself instead of returning a new frame.
data.sort_values('asin', ascending=False, inplace=True)
# Same sort, with NaN values placed in the first rows.
data.sort_values('asin', ascending=False, na_position='first')

(2)多列
mysql:select * from uds.amz_daily_sales where purchase_date>='2021-04-01'  order by sales_channel_id,asin desc, sales_channel asc , purchase_date asc;
pandas:data.sort_values(by=['sales_channel_id','asin','sales_channel','purchase_date'],ascending=[True,False,True,True]);

 

2.去重

# Drop rows that are duplicated across every column.
df.drop_duplicates()
# De-duplicate on a single column, keeping the last occurrence, in place.
df.drop_duplicates(subset='brand', keep='last', inplace=True)
# De-duplicate on a combination of columns, keeping the last occurrence, in place.
df.drop_duplicates(subset=['brand', 'style'], keep='last', inplace=True)

 

3.排序

# Sort rows by qty, largest first.
data.sort_values('qty', ascending=False)
# Sort rows by asin, then qty, both descending.
data.sort_values(['asin', 'qty'], ascending=False)
# Sort by col1 with missing values placed first.
df.sort_values('col1', na_position='first')
# Sort by col2, then col3, both descending.
df.sort_values(['col2', 'col3'], ascending=False)
# Sort by col2 descending, breaking ties by col3 ascending.
df.sort_values(['col2', 'col3'], ascending=[False, True])
# Sort by col1 descending in place, with missing values placed first.
df.sort_values('col1', ascending=False, na_position='first', inplace=True)

 

4.分组排序

# Sample data for the grouped-ranking examples: two brands, six rows each.
df = pd.DataFrame({
    'brand': ['Yum Yum'] * 6 + ['Indomie'] * 6,
    'style': ['cup1', 'cup2', 'cup1', 'cup1', 'cup2', 'cup1'] + ['pack'] * 3 + ['back'] * 3,
    'rating': [4, 4, 2, 1, 6, 7, 10, 10, 17, 15, 5, 9],
    'orders': [10, 12, 22, 21, 16, 71, 110, 210, 117, 215, 15, 99],
})

#1 Group by one column, then rank by one column
# Rank rating in descending order within each brand (ascending=False); ties share pandas' default 'average' rank
df['ranking'] = df['rating'].groupby(df['brand']).rank(ascending=False) 
# Rank rating in descending order; with method='first', tied ratings inside a group are ranked by order of appearance
df['ranking'] = df['rating'].groupby(df['brand']).rank(ascending=False,method='first') 

# Group by one column, then rank by multiple columns.
# Pre-sort so that, within each brand, rows are ordered by rating and then by
# orders (both descending). method='first' breaks rating ties by order of
# appearance, so after this sort `orders` acts as the secondary ranking key.
df.sort_values(by=['brand', 'rating', 'orders'], ascending=[True, False, False], inplace=True)
df['ranking'] = df['rating'].groupby(df['brand']).rank(ascending=False, method='first')

#2. Group by multiple columns, then rank by one column
# Rank every (brand, style) group separately, then stitch the groups together.
ranked_groups = []
for _, group in df.groupby(['brand', 'style']):
    # assign() returns a new frame, so the group handed out by groupby is not
    # mutated (this avoids SettingWithCopyWarning).
    ranked = group.assign(ranking=group['rating'].rank(ascending=False, method='first'))
    print(ranked)
    ranked_groups.append(ranked)
# Concatenate once at the end instead of growing df2 inside the loop; an empty
# df yields an empty frame, as before.
df2 = pd.concat(ranked_groups, axis=0) if ranked_groups else pd.DataFrame()

# Group by multiple columns, then rank by multiple columns.
ranked_groups = []
for _, group in df.groupby(['brand', 'style']):
    # Order each group by rating, then orders (both descending) so that
    # method='first' breaks rating ties by orders. sort_values() returns a new
    # frame, so the group handed out by groupby is not mutated.
    ranked = group.sort_values(by=['rating', 'orders'], ascending=[False, False])
    ranked = ranked.assign(ranking=ranked['rating'].rank(ascending=False, method='first'))
    print(ranked)
    ranked_groups.append(ranked)
# Concatenate once at the end instead of growing df2 inside the loop; an empty
# df yields an empty frame, as before.
df2 = pd.concat(ranked_groups, axis=0) if ranked_groups else pd.DataFrame()

#3. Group by multiple columns, then rank by one column (vectorised, no loop)
# Group on the two columns directly rather than on brand+style concatenated
# into one string: concatenation can merge distinct pairs into the same key
# (e.g. 'ab'+'c' and 'a'+'bc').
df['ranking'] = df.groupby(['brand', 'style'])['rating'].rank(ascending=False, method='first')
# 'orders' is kept in the selection because the multi-column example below
# sorts on it; dropping it here raised KeyError when the two ran in sequence.
header = ['brand', 'style', 'rating', 'ranking', 'orders']
df = df[header]

# Group by multiple columns, then rank by multiple columns.
# Pre-sort so that, inside each (brand, style) group, rows are ordered by
# rating and then orders (both descending); method='first' then breaks rating
# ties by orders.
df = df.sort_values(by=['brand', 'style', 'rating', 'orders'], ascending=[True, True, False, False])
df['ranking'] = df.groupby(['brand', 'style'])['rating'].rank(ascending=False, method='first')
header = ['brand', 'style', 'rating', 'ranking', 'orders']
df = df[header]

 

5.vs mysql

(1)求表行数:df.shape[0]  #df.shape返回(行数,列数)元组,取第0项才是行数;也可用len(df)
(2)求某列值去重后的行数:
#1 Count rows after de-duplicating
# No inplace=True: counting must not discard rows from df, otherwise the
# examples that follow would run on the truncated frame. shape[0] is the row
# count (shape itself is a (rows, columns) tuple).
df.drop_duplicates(subset=['brand']).shape[0]
#2 Count the groups produced by groupby
len(df.groupby(['brand', 'style']))
(3)分组排序取前2条
# Combined brand+style key identifying each group.
df['gp'] = df['brand'] + df['style']
# Order by rating, then orders (both descending), and keep the first two rows of every group.
by_rating_then_orders = df.sort_values(by=['rating', 'orders'], ascending=[False, False])
by_rating_then_orders.groupby('gp').head(2)

 

6.分组统计之频数

(1) 统计df中brand列各值的出现次数
df['brand'].value_counts() # returns a Series
(2) 统计df中brand与style两列组合值出现次数
# Count each (brand, style) combination; the result is indexed by brand, then style.
df.groupby('brand')['style'].value_counts()

 

 

 

 

 

你可能感兴趣的:(python,python)