excel添加列匹配对比及标签生成(留存)


import pandas as pd
import numpy as np
import os
from datetime import date,datetime
# Run date; it is appended to the exported workbook's filename at the end of the script.
today = date.today()

# Folder that holds the downloaded CSV export, and that export's filename.
downpath = '/Users/kangyongqing/Downloads/'
downname = '102814_2023_07_27.csv'

# Workbook containing the tracked-student sample sheet.
gensuipath = '/Users/kangyongqing/Documents/kangyq/202209/OKR预占/2023Q2促留存/月度留存分级/24样本学生跟随.xlsx'

# Read only the first 13 columns (positions 0-12) of the "选定跟盯样本" sheet.
df0 = pd.read_excel(
    gensuipath,
    sheet_name='选定跟盯样本',
    engine='openpyxl',
    usecols=list(range(13)),
)
print(df0.columns)
print(df0['是否选定'].sum())

# Load the downloaded CSV and rename tutor_user_id to 教师id, the column name
# used as a merge key further down. read_csv already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed.
df1 = pd.read_csv(downpath + downname).rename(columns={'tutor_user_id': '教师id'})

# Method 1: DataFrame.insert adds a new column with a value; loc is the
# 0-based position of the inserted column.
df1.insert(loc=3, column='ci', value=1)

# Method 2: concat an empty one-column frame to add the column, then fill it.
df1 = pd.concat([df1, pd.DataFrame(columns=['cici'])], axis=1)
# Assign the filled Series back instead of calling fillna(inplace=True) on the
# selected column: with pandas Copy-on-Write the in-place call operates on a
# temporary object and leaves df1 unchanged.
df1['cici'] = df1['cici'].fillna(2)

# Method 3: assign through loc using row/column index labels.
df1.loc[:, 'cicici'] = 3

print(df1.head())
# Count the `ci` entries per (student, teacher) pair, spread into one column per
# `yuefen` value; combinations with no rows are filled with 0.
df2 = (
    df1.pivot_table(
        index=['student_user_id', '教师id'],
        columns='yuefen',
        values='ci',
        aggfunc='count',
    )
    .reset_index()
    .fillna(0)
)
print(df2.head())

# Attach the per-pair counts to the sample sheet (left join keeps every sample row).
df3 = df0.merge(df2, how='left', on=['student_user_id', '教师id'])

# Decide between "stopped lessons" (停课) and "changed teacher" (换师):
# first total the `ci` values per student, across all teachers.
df4 = (
    df1.pivot_table(index='student_user_id', values='ci', aggfunc='sum')
    .reset_index()
    .rename(columns={'ci': 'Q3总课次'})
)
print(df4.head())

# Missing values anywhere in the merged frame become 0.
df5 = df3.merge(df4, how='left', on='student_user_id').fillna(0)

# NOTE(review): the three "月跟随" columns are not created in this script;
# presumably they come from the sample sheet — confirm.
df5.loc[:, 'leiji'] = df5['7月跟随'] + df5['8月跟随'] + df5['9月跟随']

# leiji == 0 with some lessons in total -> 换师; leiji == 0 with no lessons at
# all -> 停课; every other row gets an empty label.
no_follow = df5['leiji'] == 0
has_lessons = df5['Q3总课次'] > 0
zero_lessons = df5['Q3总课次'] == 0
df5['标签'] = np.where(
    no_follow & has_lessons,
    '换师',
    np.where(no_follow & zero_lessons, '停课', ''),
)
print(df5.head())
# Cast each id column to int64 and then to str, so the ids are stored as
# integer-formatted text.
for id_col in ('student_user_id', '家长id', '教师id'):
    df5[id_col] = df5[id_col].astype(np.int64).astype(str)


def _year_month(value):
    """Parse one value with pd.to_datetime and format it as '%Y年%m月'."""
    return pd.to_datetime(value).strftime(format='%Y年%m月')


# Date format conversion for the three month columns.
for month_col in ('m1', 'm2', 'm3'):
    df5[month_col] = df5[month_col].apply(_year_month)

print(df5.head())

# Select everything except a single column: boolean mask over the column labels.
not_leiji = df5.columns != 'leiji'
df6 = df5.loc[:, not_leiji]
print(df6.head())

# Select everything except several columns: negate isin(...) with ~.
excluded_cols = ['leiji', 'Q3总课次']
df7 = df5.loc[:, ~df5.columns.isin(excluded_cols)]
print(df7.head())

# Write df6 (not df7) to a workbook whose filename ends with the run date.
output_dir = '/Users/kangyongqing/Documents/kangyq/202209/OKR预占/2023Q2促留存/月度留存分级/留存数据跟盯/'
df6.to_excel(f'{output_dir}样本学生跟随明细{today}.xlsx', index=False)

知识点:

  1. 读取excel指定列数据:pd.read_excel(usecols=[...]);
  2. 数据集插入列的三种方法:insert、concat、loc;
  3. 数据透视表及填充空值,并重置索引;
  4. 时间格式调整为“~年~月”;
  5. 用 np.where 方法生成标签列;
  6. 导出不包含指定列的数据到excel。

你可能感兴趣的:(excel)