1,
# Boolean mask with one entry per row, flagging rows that repeat an earlier row.
duplicate_mask = Nowcoder.duplicated()
print(duplicate_mask)
# New frame with the repeated rows removed (Nowcoder itself is not modified).
deduplicated = Nowcoder.drop_duplicates()
print(deduplicated)
2,
# Parse the submission-time text with pandas' own datetime parser.
submission_times = pd.to_datetime(
    Nowcoder["Last_submission_time"], format="%Y-%m-%d"
)
Nowcoder["Last_submission_time"] = submission_times
# Show every row, restricted to the three columns of interest.
selected_columns = ["Nowcoder_ID", "Level", "Last_submission_time"]
print(Nowcoder.loc[:, selected_columns])
3,
import pandas as pd
import json
# Widen the console output so the frame is printed without truncation.
pd.set_option('display.width', 300)  # characters per output line
pd.set_option('display.max_rows', None)  # None lifts the row limit
pd.set_option('display.max_columns', None)  # None lifts the column limit
# Read the JSON file and convert it to a DataFrame.
# The file is decoded as UTF-8 explicitly instead of relying on the platform
# default, and json.load() parses straight from the file handle.
with open('Nowcoder.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print(pd.DataFrame(data))
4,
import pandas as pd
nowcoder = pd.read_csv('nowcoder.csv', parse_dates=True, index_col='date')
# Count (not sum) the submissions per calendar day.
# Grouping on the raw index would key on the full timestamp, so rows recorded
# at different times of the same day would land in separate groups; truncate
# every timestamp to midnight first.
submission_day = pd.to_datetime(nowcoder.index).normalize()
daily_num = nowcoder.groupby(submission_day)['question_id'].count()
print(daily_num)
5,计算次日留存率
import pandas as pd
from datetime import timedelta
nowcoder = pd.read_csv('nowcoder.csv')
# Denominator: total number of submission rows.
total_id = nowcoder['user_id'].count()
# Self-join on user_id: pairs every row of a user with every row of the same
# user. Overlapping columns get the _x / _y suffixes automatically, giving
# user_id, question_id_x, result_x, date_x, question_id_y, result_y, date_y.
b = pd.merge(nowcoder, nowcoder, on='user_id')
# Truncate both timestamps to midnight. normalize() keeps the datetime64 dtype,
# so the subtraction below is guaranteed to produce a timedelta64 column
# (unlike .dt.date, which yields an object column of datetime.date values).
b['date_x'] = pd.to_datetime(b.date_x).dt.normalize()
b['date_y'] = pd.to_datetime(b.date_y).dt.normalize()
b['differ'] = b.date_y - b.date_x
# Numerator: row pairs whose dates are exactly one day apart.
sum_diff = (b.differ == pd.Timedelta(days=1)).sum()
res = round(sum_diff / total_id, 2)
print(res)
6,连续登录3天以上的用户,关键题
# 1、因为每天用户练习次数可能不止一次,所以需要先将用户每天的练习日期去重。
# 2、再将用户id分组,按照练习日期进行排序。
# 3、用练习日期减去第二步得到的排名(以天为单位);用户连续练习时,相减得到的基准日期都相同。
# 4、按照id和基准日期(date_diff)分组并计数,筛选计数大于等于3的即为连续3天练习的用户。
## 供调试
# nowcoder = pd.DataFrame({'user_id':[3310,3310,3310,3313,3314,3315,3313],'question_id':['110','111','112','113','114','115','116'],'result':['right','right','wrong','wrong','right','right','right'],'date':['2021/12/20 08:00','2021/12/21 08:00','2021/12/22 08:00','2021/12/22 08:00','2021/12/23 08:00','2021/12/20 08:00','2021/12/27 09:00:00']})
import pandas as pd
from datetime import timedelta
nowcoder = pd.read_csv('nowcoder.csv')
# Parse the timestamp once and truncate it to midnight. No explicit format is
# passed: the values carry a time component, which a '%Y-%m-%d' format does
# not match.
nowcoder['date'] = pd.to_datetime(nowcoder['date']).dt.normalize()
nowcoder['date_m'] = nowcoder['date'].dt.strftime('%Y-%m')
# '%Y-%m' is zero-padded, so comparing the strings orders them chronologically.
data = nowcoder[nowcoder['date_m'] >= '2021-12']
# One row per user per day; several submissions on one day count once.
df = data[['user_id', 'date']].drop_duplicates().copy()
# Rank each user's days in ascending order and express the rank in days.
df['rank'] = pd.to_timedelta(df.groupby('user_id')['date'].rank(), unit='d')
# date - rank is constant across a run of consecutive days, so it identifies
# the run a row belongs to.
df['date_diff'] = df['date'] - df['rank']
# Length of every run, then the longest run per user.
_df = df.groupby(['user_id', 'date_diff'])['date'].count()
_df = _df.groupby('user_id').max()
print(_df[_df >= 3])
7,
import pandas as pd
Nowcoder = pd.read_csv('Nowcoder.csv', sep=',')
# Within each Level group, count the rows for every Language value.
language_by_level = Nowcoder.groupby('Level')['Language']
print(language_by_level.value_counts())
8,join前要先使用set_index设置索引,重点
import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')
# join() matches against the index of the other frame, so both frames are
# indexed by item_id before joining.
items_by_id = items.set_index('item_id')
signup_by_id = signup.set_index('item_id')
df = items_by_id.join(signup_by_id, on='item_id', how='inner')
# Number of non-null employee_id values per item name.
signups_per_item = df.groupby('item_name')['employee_id'].count()
print(signups_per_item)
9,
import pandas as pd
signup = pd.read_csv('signup.csv')
signup1 = pd.read_csv('signup1.csv')
items = pd.read_csv('items.csv')
# Stack the two sign-up tables vertically.
signup_all = pd.concat([signup, signup1])
# Index both sides by item_id, then inner-join them on that key.
items_by_id = items.set_index('item_id')
signup_all_by_id = signup_all.set_index('item_id')
df = items_by_id.join(signup_all_by_id, on='item_id', how='inner')
# Number of non-null name values per item name.
names_per_item = df.groupby('item_name')['name'].count()
print(names_per_item)
10,数据透视
import pandas as pd
signup = pd.read_csv("signup.csv")
items = pd.read_csv("items.csv")
# Left-join on the shared item_id column by name. Passing the Series objects
# themselves as left_on/right_on also works, but leaves redundant key columns
# (key_0, item_id_x, item_id_y) in the merged frame.
df_all = pd.merge(signup, items, how="left", on="item_id")
print(
    pd.pivot_table(
        df_all,
        index=["sex", "department"],  # row index
        columns=["item_name"],  # column index
        values=["employee_id"],
        aggfunc="count",  # count the entries in each cell
        fill_value=0,  # show 0 instead of NaN for empty cells
    )
)
11, 排序函数,sort_values()
import pandas as pd
sales = pd.read_csv('sales.csv')
# Order by monetary, largest first, and keep the three leading rows.
by_monetary_desc = sales.sort_values(by='monetary', ascending=False)
top_three = by_monetary_desc.head(3)
# Renumber the kept rows 0..2 and discard the original index.
data = top_three.reset_index(drop=True)
print(data)
12,分组计数(分位数分箱)
pd.qcut函数按分位数划分数据,比如要把数据分为四份,则四段分别是数据的0-25%,25%-50%,50%-75%,75%-100%,每个区间内的元素个数大致相同。
pd.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise') #最后一个参数 duplicates='drop'表示若有重复的区间边界则删除
# Sample data for trying out the quartile scoring.
sales = pd.DataFrame({'user_id':['NC0008','NC0018','NC0021','NC0035','NC0036'],'recency':[186,19,151,8,262],'frequency':[5,13,3,4,2],'monetary':[2990,3648,629,4542,896]})
# Quartile boundaries shared by all three scores.
quartile_edges = [0, 0.25, 0.5, 0.75, 1]
ascending_scores = ["1", "2", "3", "4"]
# Recency is scored in reverse: the lowest quartile gets the highest score.
descending_scores = ["4", "3", "2", "1"]
scoring_plan = (
    ('R_Quartile', 'recency', descending_scores),
    ('F_Quartile', 'frequency', ascending_scores),
    ('M_Quartile', 'monetary', ascending_scores),
)
for score_column, source_column, score_labels in scoring_plan:
    binned = pd.qcut(sales[source_column], quartile_edges, score_labels)
    sales[score_column] = binned.astype("int")
print(sales.head())