pandas经典题

1,

# Boolean Series with one entry per row, flagging rows that pandas
# considers duplicates.
duplicate_mask = Nowcoder.duplicated()
print(duplicate_mask)

# New frame with the duplicate rows dropped (Nowcoder itself is not modified).
deduplicated = Nowcoder.drop_duplicates()
print(deduplicated)

2,


# Convert the submission-time column to datetimes with pandas' dedicated
# parser, using an explicit year-month-day format.
submission_times = pd.to_datetime(
    Nowcoder["Last_submission_time"], format="%Y-%m-%d"
)
Nowcoder["Last_submission_time"] = submission_times

# Print every row, restricted to the three columns of interest.
columns_to_show = ["Nowcoder_ID", "Level", "Last_submission_time"]
print(Nowcoder.loc[:, columns_to_show])

3,

import pandas as pd
import json

# Display settings so the frame is printed without truncation.
pd.set_option('display.width', 300)  # character width of the console output
pd.set_option('display.max_rows', None)  # no limit on displayed rows
pd.set_option('display.max_columns', None)  # no limit on displayed columns

# Read the JSON file and convert its content to a DataFrame.
# The encoding is pinned to UTF-8 (the encoding JSON interchange uses) so the
# result does not depend on the platform's locale default, and json.load
# parses straight from the file handle instead of a manual read().
with open('Nowcoder.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print(pd.DataFrame(data))

4,

import pandas as pd
# Load the practice log, using the 'date' column as the index and asking
# pandas to parse that index as dates.
nowcoder = pd.read_csv('nowcoder.csv', index_col='date', parse_dates=True)

# Group on the 'date' index level and count the question_id entries in each
# group (this is a count of records, not a sum).
grouped_by_date = nowcoder.groupby('date')
daily_num = grouped_by_date['question_id'].count()
print(daily_num)

5,计算次日留存率

import pandas as pd
from datetime import timedelta
nowcoder = pd.read_csv('nowcoder.csv')

# Denominator: number of non-null user_id entries, i.e. one per record.
total_id = nowcoder['user_id'].count()

# Self-join on user_id so every record of a user is paired with every other
# record of the same user. pandas disambiguates the clashing columns with
# _x / _y suffixes: question_id_x, result_x, date_x, question_id_y,
# result_y, date_y.
b = nowcoder.merge(nowcoder, on='user_id')

# to_datetime keeps the time of day; .dt.date reduces each timestamp to its
# calendar date so the subtraction below is in whole days.
for date_col in ('date_x', 'date_y'):
    b[date_col] = pd.to_datetime(b[date_col]).dt.date

b['differ'] = b['date_y'] - b['date_x']

# Numerator: record pairs whose dates are exactly one day apart.
is_next_day = b['differ'] == '1 days'
sum_diff = b.loc[is_next_day, 'differ'].count()

res = round(sum_diff / total_id, 2)
print(res)

6,连续登录3天以上的用户,关键题

# 1、因为每天用户练习次数可能不止一次,所以需要先将用户每天的练习日期去重。
# 2、再按用户id分组,对练习日期进行排名(rank)。
# 3、用练习日期减去第2步得到的排名(换算成天数);同一段连续练习的日期,相减的结果都相同。
# 4、按照id和第3步的相减结果分组并计数,筛选大于等于3的即为连续3天练习的用户。
## 供调试
# nowcoder = pd.DataFrame({'user_id':[3310,3310,3310,3313,3314,3315,3313],'question_id':['110','111','112','113','114','115','116'],'result':['right','right','wrong','wrong','right','right','right'],'date':['2021/12/20 08:00','2021/12/21 08:00','2021/12/22 08:00','2021/12/22 08:00','2021/12/23 08:00','2021/12/20 08:00','2021/12/27 09:00:00']})

import pandas as pd
from datetime import timedelta
# Find users who practised on at least 3 consecutive days (December 2021 on).
nowcoder = pd.read_csv('nowcoder.csv')

# Parse the timestamps once and let pandas infer the layout. The values carry
# a time of day, so forcing format='%Y-%m-%d' is rejected by recent pandas
# releases; both derived columns are taken from this single parsed Series.
parsed_dates = pd.to_datetime(nowcoder['date'])
nowcoder['date'] = parsed_dates.dt.strftime('%Y-%m-%d')  # calendar day
nowcoder['date_m'] = parsed_dates.dt.strftime('%Y-%m')  # year-month

# Zero-padded 'YYYY-MM' strings sort chronologically, so a plain string
# comparison keeps December 2021 and later.
data = nowcoder[nowcoder['date_m'] >= '2021-12']

# One row per (user, day): several practices on the same day count once.
df = data[['user_id', 'date']].drop_duplicates(['user_id', 'date'])
df['date'] = pd.to_datetime(df['date'])

# Rank each user's days chronologically and express the rank in days.
# Within a run of consecutive days, date minus rank is constant, so that
# difference identifies the run.
df['rank'] = pd.to_timedelta(df.groupby(['user_id'])['date'].rank(), unit='d')
df['date_diff'] = df['date'] - df['rank']

# Length of every run, then the longest run per user.
_df = df.groupby(['user_id', 'date_diff'])['date'].count()
_df = _df.groupby('user_id').max()
print(_df[_df >= 3])

7,

import pandas as pd

# The file is comma-separated, which is read_csv's default delimiter.
Nowcoder = pd.read_csv('Nowcoder.csv')

# For each Level, count how often every Language value occurs.
language_by_level = Nowcoder.groupby('Level')['Language']
print(language_by_level.value_counts())

8,join前要先使用set_index设置索引,重点

import pandas as pd
signup = pd.read_csv('signup.csv')
items = pd.read_csv('items.csv')

# DataFrame.join matches against the other frame's index, so both tables are
# re-indexed by item_id before joining.
items_by_id = items.set_index('item_id')
signup_by_id = signup.set_index('item_id')
df = items_by_id.join(signup_by_id, on='item_id', how='inner')

# Count the employee_id entries for each item name.
employees_per_item = df.groupby('item_name')['employee_id'].count()
print(employees_per_item)

9,

import pandas as pd
signup = pd.read_csv('signup.csv')
signup1 = pd.read_csv('signup1.csv')
items = pd.read_csv('items.csv')

# Stack the two signup tables vertically (rows of signup1 below signup).
signup_all = pd.concat([signup, signup1])

# DataFrame.join matches against the other frame's index, so both tables are
# re-indexed by item_id before joining.
items_by_id = items.set_index('item_id')
signup_all_by_id = signup_all.set_index('item_id')
df = items_by_id.join(signup_all_by_id, on='item_id', how='inner')

# Count the name entries for each item name.
names_per_item = df.groupby('item_name')['name'].count()
print(names_per_item)

10,数据透视

import pandas as pd

signup = pd.read_csv("signup.csv")
items = pd.read_csv("items.csv")

# Attach the item details to every signup row. Joining by column name keeps
# a single item_id column; passing whole Series as left_on/right_on instead
# adds a synthetic key column plus suffixed item_id_x / item_id_y copies.
df_all = pd.merge(signup, items, how="left", on="item_id")

# Pivot: count employee_id entries per (sex, department) row and per
# item_name column.
print(
    pd.pivot_table(
        df_all,
        index=["sex", "department"],  # row index
        columns=["item_name"],  # column index
        values=["employee_id"],  # column being aggregated
        aggfunc="count",  # count entries
        fill_value=0,  # show 0 for combinations with no data
    )
)

11, 排序函数,sort_values()

import pandas as pd
sales = pd.read_csv('sales.csv')

# Order rows by 'monetary' from largest to smallest, keep the first three,
# and renumber them from 0.
ranked = sales.sort_values(by='monetary', ascending=False)
data = ranked.head(3).reset_index(drop=True)
print(data)

12,分组计数
pd.qcut函数:按照数据的分位数(出现频率的百分比)划分区间。比如要把数据分为四份,则四段分别对应数据的0-25%、25%-50%、50%-75%、75%-100%,每个区间里的元素个数大致相同。
pd.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')  # 最后一个参数 duplicates='drop' 表示若有重复的区间边界则删除

# Sample data
sales = pd.DataFrame(
    {
        'user_id': ['NC0008', 'NC0018', 'NC0021', 'NC0035', 'NC0036'],
        'recency': [186, 19, 151, 8, 262],
        'frequency': [5, 13, 3, 4, 2],
        'monetary': [2990, 3648, 629, 4542, 896],
    }
)

# All three scores are cut at the same quartile boundaries.
quartile_edges = [0, 0.25, 0.5, 0.75, 1]

# Score column -> (source column, labels from the lowest bin to the highest).
# Recency uses descending labels, so the smallest recency values score 4;
# frequency and monetary use ascending labels.
quartile_plan = {
    'R_Quartile': ('recency', ["4", "3", "2", "1"]),
    'F_Quartile': ('frequency', ["1", "2", "3", "4"]),
    'M_Quartile': ('monetary', ["1", "2", "3", "4"]),
}

for score_col, (source_col, bin_labels) in quartile_plan.items():
    binned = pd.qcut(sales[source_col], quartile_edges, labels=bin_labels)
    # The labels are digit strings; convert the categorical result to ints.
    sales[score_col] = binned.astype("int")

print(sales.head())

你可能感兴趣的:(python,pandas)