pandas小练习

pandas小练习_第1张图片

# Load the zongzi (Dragon Boat Festival rice dumpling) data set from CSV.
ex1 = pd.read_csv('work/端午粽子数据.csv')
# NOTE: two headers carry stray whitespace (' 价格' and '发货地址 '), so every
# later lookup of those columns has to include the space.
ex1.columns
Index(['标题', ' 价格', '付款人数', '店铺', '发货地址 '], dtype='object')
ex1.info()

RangeIndex: 4403 entries, 0 to 4402
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   标题      4403 non-null   object
 1    价格     4403 non-null   object
 2   付款人数    4332 non-null   object
 3   店铺      4403 non-null   object
 4   发货地址    4400 non-null   object
dtypes: object(5)
memory usage: 172.1+ KB
# Keep only the rows that have a shipping address. The explicit copy makes
# ex1_1 an independent frame, so the column assignments performed on it later
# do not act on a slice of ex1 (which raises SettingWithCopyWarning).
ex1_1 = ex1[ex1['发货地址 '].notna()].copy()
#去除无效值
def inval(x):
    """Return True when *x* can be converted with ``float()``, else False.

    Note that, despite the name, a True result means the value IS usable
    as a number.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported types such as None.
        # ValueError: text that is not a number.
        # OverflowError: integers too large for a float.
        return False
    return True
def to_val(x):
    """Convert *x* to a float using the built-in ``float`` constructor."""
    number = float(x)
    return number

# Rows whose shipping address mentions Hangzhou.
hz_mask = ex1_1['发货地址 '].str.contains(r'杭州')
ex1_1_hz = ex1_1.loc[hz_mask]
# Mean price over the rows whose price text parses as a float.
hz_price = ex1_1_hz[' 价格']
hz_price[hz_price.map(inval)].map(to_val).mean()
81.10145251396648
# Products whose title mentions Jiaxing ...
ex1_2 = ex1_1.loc[ex1_1['标题'].str.contains(r'嘉兴')]
# ... but whose shipping address does not.
print(ex1_2.loc[~ex1_2['发货地址 '].str.contains(r'嘉兴')])
                                     标题     价格     付款人数          店铺  发货地址 
3        稻香私房鲜肉粽蛋黄肉粽嘉兴粽子咸鸭蛋礼盒装端午节送礼特产团购    138  1936人付款    稻香村食品旗舰店     北京
6          五芳斋华礼竹篮礼盒1360g蛋粽组合端午礼品嘉兴粽子礼盒    159  1028人付款        天猫超市     上海
8     真真老老嘉情礼盒10粽6蛋1.52kg/盒嘉兴粽子端午节粽子礼盒装    109  2117人付款        天猫超市     上海
9     五芳斋嘉兴粽子新鲜量贩蛋黄肉粽豆沙粽悦喜散装端午特产600g*2袋   59.9  1349人付款        天猫超市     上海
10         真真老老粽子臻芯800g/盒*1端午节礼盒装嘉兴特产送礼     75  1815人付款        天猫超市     上海
...                                 ...    ...      ...         ...    ...
4377      超港端午粽子肉粽蜜枣粽大肉棕子黄山特产礼盒新鲜嘉兴粽子肉粽   39.9    67人付款       超港旗舰店  安徽 黄山
4383     真真老老真情粽子咸蛋礼盒嘉兴特产肉粽豆沙甜枣粽端午节团购批发   79.9    10人付款     曈宝食品专营店     上海
4386     臻味德嘉兴粽子竹篮礼盒装农家手工蛋黄鲜肉棕甜粽端午节伴手礼品  49.08     2人付款       气泡西柚屋  浙江 温州
4393      傅太粽子10只蛋黄肉粽鲜肉粽嘉兴风味肉粽子即食早餐端午团购   60.2    46人付款      love侯氏  江西 萍乡
4401     【粉丝专享】端午节特产新鲜蛋黄鲜肉粽豆沙嘉兴粽子800g/箱   65.9   149人付款  chaoge1931  安徽 合肥

[1032 rows x 5 columns]
# Convert the price column to float. Rows whose price text is not numeric are
# left out of the right-hand side, so index alignment turns them into NaN.
valid_price = ex1_1[' 价格'].map(inval)
ex1_1[' 价格'] = ex1_1.loc[valid_price, ' 价格'].map(to_val)
# Drop rows without a usable price or address; copy so that the new column
# below is written to an independent frame rather than to a slice of ex1_1.
ex1_3 = ex1_1.dropna(subset=[' 价格', '发货地址 ']).copy()
# Five equal-sized price bins, labelled from low to high.
qcut = pd.qcut(ex1_3[' 价格'], 5, labels=['低', '较低', '中', '较高', '高'])
ex1_3['类别'] = qcut
# Put the category next to the title and list the most expensive bins first.
ex1_3 = ex1_3[['标题', '类别', ' 价格', '付款人数', '店铺', '发货地址 ']].sort_values('类别', ascending=False)
print(ex1_3)
# Number of rows with a missing payer count.
print(ex1_3[ex1_3['付款人数'].isnull()].shape[0])
# Switch to pandas' inferred nullable dtypes before doing string work.
ex1_4 = ex1_3.convert_dtypes()
# Strip the '人付款' suffix and the '+' marker. regex=False makes both
# replacements literal regardless of the pandas version (the default of
# `regex` changed in pandas 2.0, where r'\+' would no longer match a plus).
ex1_4['人数'] = ex1_4['付款人数'].str.replace('人付款', '', regex=False)
ex1_4['人数'] = ex1_4['人数'].str.replace('+', '', regex=False)
# Values written with a decimal point: drop the dot, then expand 万 to '000'.
# NOTE(review): this is only correct when there is exactly one digit after
# the dot (e.g. '1.5万' -> '15000') — confirm against the data.
has_dot = ex1_4['人数'].str.contains(r'\.', na=False)
df_dot = ex1_4.loc[has_dot, '人数'].str.replace('.', '', regex=False)
df_dot = df_dot.str.replace('万', '000', regex=False)
ex1_4.loc[list(df_dot.index), '人数'] = list(df_dot.values)
# Remaining whole multiples of 万 (ten thousand): expand to four zeros.
has_wan = ex1_4['人数'].str.contains('万', na=False)
df_wan = ex1_4.loc[has_wan, '人数'].str.replace('万', '0000', regex=False)
ex1_4.loc[list(df_wan.index), '人数'] = list(df_wan.values)
print(ex1_4.loc[ex1_4['人数'].isna()])
# Convert the cleaned text to numbers; anything unparseable becomes NaN.
ex1_4['人数'] = pd.to_numeric(ex1_4['人数'], errors='coerce')
# Linear interpolation of the missing counts, done separately inside each
# price category with the rows ordered by price.
for _, group in ex1_4.groupby('类别'):
    ex1_4.loc[group.index, '人数'] = group[[' 价格', '人数']].sort_values(by=' 价格')['人数'].interpolate()
print(ex1_4.loc[ex1_4['人数'].isna()])
# Values that interpolation could not fill get the fixed fallback 900.
# A scalar is used so this works for any number of remaining gaps.
ex1_4.loc[ex1_4['人数'].isna(), '人数'] = 900
# Rebuild the '<n>人付款' text for the rows whose original text was missing.
missing_text = ex1_4['付款人数'].isna()
person = '人付款'
amount = [str(round(value)) + person for value in ex1_4.loc[missing_text, '人数']]

ex1_4.loc[missing_text, '付款人数'] = amount
print(ex1_4)
# Cast the price to text so it can be concatenated with the other columns.
ex1_1[' 价格'] = ex1_1[' 价格'].astype('str')
# Build one sentence per row: address, shop, payer count and unit price.
ex1_5 = ('商品发货地为' + ex1_1['发货地址 '] + ',店铺为' + ex1_1['店铺'] + ',共计'+ ex1_1['付款人数'] + ',单价为' + ex1_1[' 价格'])
print(ex1_5)
# Split the sentence back into its four fields using named capture groups.
print(ex1_5.str.extract(r'商品发货地为(?P<发货地址>\w+\s?\w+?),店铺为(?P<店铺>[\w]+),共计(?P<付款人数>\d+[万\+]*?人付款),单价为(?P<价格>\d+\.?\d+)'))
       发货地址          店铺      付款人数     价格
0     浙江 嘉兴    五芳斋官方旗舰店      6人付款  129.0
1        上海        天猫超市      8人付款   44.0
2     浙江 嘉兴    五芳斋官方旗舰店  100万+人付款   89.9
3        北京    稻香村食品旗舰店   1936人付款  138.0
4     浙江 嘉兴       城城喂食猫  9500+人付款    3.8
...     ...         ...       ...    ...
4398  浙江 嘉兴       红船旗舰店    347人付款    7.9
4399  四川 乐山   峨眉山隐栗粑粑罗栗     80人付款   93.0
4400     北京      北京美程嘉译      5人付款   98.2
4401  安徽 合肥  chaoge1931    149人付款   65.9
4402  广东 东莞     hezhiqi     20人付款   99.0

[4400 rows x 4 columns]

pandas小练习_第2张图片

# Load the Mobike ride data; start_time and end_time are parsed as datetimes.
df_3 = pd.read_csv('data/摩拜单车数据.csv', parse_dates=['start_time', 'end_time'])
df_3.head()
df_3.info()

RangeIndex: 102361 entries, 0 to 102360
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   orderid           102361 non-null  int64         
 1   bikeid            102361 non-null  int64         
 2   userid            102361 non-null  int64         
 3   start_time        102361 non-null  datetime64[ns]
 4   start_location_x  102361 non-null  float64       
 5   start_location_y  102361 non-null  float64       
 6   end_time          102361 non-null  datetime64[ns]
 7   end_location_x    102361 non-null  float64       
 8   end_location_y    102361 non-null  float64       
 9   track             102361 non-null  object        
dtypes: datetime64[ns](2), float64(4), int64(3), object(1)
memory usage: 7.8+ MB
# Calendar date of each ride's start, as a 'YYYY-MM-DD' string.
df_3['date'] = df_3['start_time'].dt.strftime('%Y-%m-%d')

# Rides that start on Saturday (5) or Sunday (6).
df_weekend = df_3.loc[df_3['start_time'].dt.dayofweek.isin([5, 6])]
# Average number of rides per weekend day.
weekend = len(df_weekend) / df_weekend['date'].nunique()
weekend
3275.125
# Rides that start on any day other than Saturday (5) or Sunday (6).
df_weekdays = df_3.loc[~df_3['start_time'].dt.dayofweek.isin([5, 6])]
weekdays = len(df_weekdays) / df_weekdays['date'].nunique()
weekdays
# Average number of rides per weekday
3311.304347826087

(2)

from datetime import datetime

# Morning window on weekdays: 07:30 to 09:30, both ends included.
start_morn = datetime.strptime('7:30', '%H:%M').time()
end_morn = datetime.strptime('9:30', '%H:%M').time()
weekday_clock = df_weekdays['start_time'].dt.time
in_morning = (weekday_clock >= start_morn) & (weekday_clock <= end_morn)
df_w_morn = df_weekdays.loc[in_morning]
# Rides per day inside the morning window.
s_morn = df_w_morn.groupby('date')['orderid'].count()

# Evening window on weekdays: 17:00 to 19:00, both ends included.
start_after = datetime.strptime('17:00', '%H:%M').time()
end_after = datetime.strptime('19:00', '%H:%M').time()
in_evening = (weekday_clock >= start_after) & (weekday_clock <= end_after)
df_w_after = df_weekdays.loc[in_evening]
# Rides per day inside the evening window.
s_after = df_w_after.groupby('date')['orderid'].count()

# Count the days on which the morning had at least as many rides as the evening.
total = list((s_morn - s_after).values)
number = sum(1 for diff in total if diff >= 0)

number
# Only 3 days
3

(3)

# Select the Friday rides (dayofweek == 4).
df_fri = df_weekdays[df_weekdays['start_time'].dt.dayofweek.isin([4])]

# Date of the Friday with the largest number of orders.
df_fri.groupby('date')['orderid'].count().idxmax()
'2016-08-26'
# Rides on 2016-08-26. Copy so the new columns are written to an independent
# frame rather than to a slice of df_fri (avoids SettingWithCopyWarning).
df_26 = df_fri.loc[df_fri['date'] == '2016-08-26'].copy()
df_26['use_time'] = df_26['end_time'] - df_26['start_time']

# Bucket edges; pd.cut builds right-closed intervals:
# (0, 30 min], (30 min, 2 h], (2 h, 6 h].
list_time = [pd.Timedelta(minutes=0), pd.Timedelta(minutes=30), pd.Timedelta(hours=2), pd.Timedelta(hours=6)]

df_26['group_time'] = pd.cut(df_26['use_time'], bins=list_time)
# Mean ride duration per bucket (total duration divided by number of rides).
time_by_bucket = df_26.groupby('group_time')['use_time']
time_by_bucket.sum() / time_by_bucket.count()
group_time
(0 days 00:00:00, 0 days 00:30:00]   00:12:04.639175
(0 days 00:30:00, 0 days 02:00:00]   00:49:03.421588
(0 days 02:00:00, 0 days 06:00:00]   02:34:37.142857
Name: use_time, dtype: timedelta64[ns]

(4)

import math

EARTH_RADIUS = 6371  # Earth radius in kilometres


def cal_dis_meters(latitude1, longitude1,latitude2, longitude2):
    """Return the haversine distance for each pair of points.

    The four arguments are equally long iterables of coordinates in degrees
    (lists, tuples, NumPy arrays or pandas Series with any index). Element
    ``k`` of the result is the great-circle distance between
    ``(latitude1[k], longitude1[k])`` and ``(latitude2[k], longitude2[k])``.

    Despite the function name, the result is expressed in the unit of
    ``EARTH_RADIUS``, i.e. kilometres, rounded to two decimals.

    Returns:
        A list of floats, one per input pair (empty for empty input).
    """
    distances = []
    # Iterate over the values themselves: indexing a Series with 0..n-1 is a
    # label lookup and fails when the index is not the default RangeIndex.
    for lat1, lng1, lat2, lng2 in zip(latitude1, longitude1, latitude2, longitude2):
        rad_lat1 = math.radians(lat1)
        rad_lat2 = math.radians(lat2)
        half_dlat = (rad_lat1 - rad_lat2) / 2.0
        half_dlng = (math.radians(lng1) - math.radians(lng2)) / 2.0
        hav = (math.pow(math.sin(half_dlat), 2)
               + math.cos(rad_lat1) * math.cos(rad_lat2) * math.pow(math.sin(half_dlng), 2))
        # Rounding error can push the value marginally above 1, which would
        # make asin raise a domain error.
        hav = min(1.0, hav)
        distances.append(round(2.0 * math.asin(math.sqrt(hav)) * EARTH_RADIUS, 2))

    return distances

# Distance between each ride's start and end point. The *_y columns are passed
# as latitudes and the *_x columns as longitudes.
df_3['distance'] = cal_dis_meters(df_3['start_location_y'], df_3['start_location_x'], df_3['end_location_y'], df_3['end_location_x'])
df_3

(5)

# Ride duration: end time minus start time.
df_3['use_time'] = df_3['end_time'] - df_3['start_time']

# Turn each duration into text and drop its first 7 characters.
# NOTE(review): presumably this removes a leading '0 days ' so that 'HH:MM:SS'
# remains; that only holds for rides shorter than 10 days — confirm.
df_3['use_time'] = df_3['use_time'].apply(lambda x: str(x)[7:])
# Convert an 'HH:MM[:SS]' duration string to hours
def min2hou(x):
    """Return the duration *x* ('HH:MM' or 'HH:MM:SS') as fractional hours.

    Hours and minutes are parsed as whole fields, so durations of ten hours
    or more are handled correctly. Any seconds field is ignored. The result
    is rounded to two decimals.
    """
    hours, minutes = x.strip().split(':')[:2]
    return round(int(hours) + int(minutes) / 60, 2)
    
# Ride duration in hours.
df_3['hour'] = df_3['use_time'].apply(min2hou)
# Distance implied by riding for the whole duration at a constant 15 per hour.
# NOTE(review): presumably km/h, to match the 'distance' column — confirm.
df_3['cal_distance'] = df_3['hour'] * 15
df_3
# Rides whose implied distance is at least twice the start-to-end distance.
df_3.loc[(df_3['cal_distance'] / df_3['distance']) >= 2]

(6)

# Some track strings still contain stray backslashes: strip them.
has_backslash = df_3['track'].str.contains(r'[\\]')
# regex=False: the pattern is a literal backslash, not a regular expression,
# so the call does not depend on the pandas version's `regex` default.
s = df_3.loc[has_backslash, 'track'].str.replace('\\', '', regex=False)
df_3.loc[has_backslash, 'track'] = s

# Parse every track into a list of (first, second) float tuples. Points are
# separated by '#' and the two coordinates of a point by ','.
result = []

# Iterate over the values directly so the code works for any DataFrame index.
for track in df_3['track']:
    result_temp = []
    for point in track.split('#'):
        fields = point.split(',')
        result_temp.append((float(fields[0]), float(fields[1])))

    result.append(result_temp)
def cal_real_meters(latitude1, longitude1,latitude2, longitude2):
    """Return the haversine distance between two points given in degrees.

    The result is scaled by the module-level ``EARTH_RADIUS`` and is therefore
    expressed in that constant's unit.
    """
    deg_to_rad = math.pi/180
    lat_a = deg_to_rad*latitude1
    lat_b = deg_to_rad*latitude2
    lng_a = deg_to_rad*longitude1
    lng_b = deg_to_rad*longitude2
    # Haversine term: sin^2(dlat/2) + cos(lat_a) * cos(lat_b) * sin^2(dlng/2)
    lat_term = math.pow(math.sin((lat_a-lat_b)/2.0),2)
    lng_term = math.pow(math.sin((lng_a-lng_b)/2.0),2)
    central_angle = 2.0*math.asin(math.sqrt(lat_term+math.cos(lat_a)*math.cos(lat_b)*lng_term))
    return central_angle*EARTH_RADIUS

# Length of every recorded track: the sum of the distances between
# consecutive points, rounded to two decimals.
real_dist = []

for path in result[:len(df_3)]:
    travelled = 0
    # Each point is (first, second); the second field is passed as the
    # latitude and the first as the longitude.
    for (lng_a, lat_a), (lng_b, lat_b) in zip(path, path[1:]):
        travelled += cal_real_meters(lat_a, lng_a, lat_b, lng_b)

    real_dist.append(round(travelled, 2))

real_dist
df_3['real_dist'] = real_dist
# Rides whose implied distance is at least twice the tracked distance.
df_3.loc[(df_3['cal_distance'] / df_3['real_dist']) >= 2]

Reference

  1. Pandas 教程(下)综合练习

你可能感兴趣的:(一些基础分享)