# Read the zongzi CSV (path relative to the working directory) into ex1.
ex1 = pd.read_csv(filepath_or_buffer='work/端午粽子数据.csv')
# Display the column labels exactly as they were read from the file.
ex1.columns
Index(['标题', ' 价格', '付款人数', '店铺', '发货地址 '], dtype='object')
# Print a per-column summary of ex1 (non-null counts and dtypes).
ex1.info()
RangeIndex: 4403 entries, 0 to 4402
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 标题 4403 non-null object
1 价格 4403 non-null object
2 付款人数 4332 non-null object
3 店铺 4403 non-null object
4 发货地址 4400 non-null object
dtypes: object(5)
memory usage: 172.1+ KB
# Keep only the rows whose shipping address is present. The explicit copy
# makes ex1_1 an independent frame, so later column assignments on it do not
# write into a slice of ex1 (which triggers pandas' SettingWithCopyWarning).
ex1_1 = ex1[ex1['发货地址 '].notna()].copy()
# Helpers for filtering out invalid values
def inval(x):
    """Return True when ``x`` can be converted to ``float``, otherwise False.

    Only the errors ``float`` raises for unusable input (``TypeError``,
    ``ValueError``, ``OverflowError``) count as "not convertible". Anything
    else, such as ``KeyboardInterrupt``, propagates instead of being
    silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
def to_val(x):
    """Convert ``x`` with ``float`` and return the result."""
    converted = float(x)
    return converted
# Listings whose shipping address contains "杭州".
ex1_1_hz = ex1_1[ex1_1['发货地址 '].str.contains(r'杭州')]
# Mean price over the rows whose price text converts to a float.
(
    ex1_1_hz.loc[ex1_1_hz[' 价格'].map(inval), ' 价格']
    .map(to_val)
    .mean()
)
81.10145251396648
# Listings whose title mentions "嘉兴" ...
ex1_2 = ex1_1[ex1_1['标题'].str.contains(r'嘉兴')]
# ... printed only where the shipping address does not contain "嘉兴".
print(ex1_2[~ex1_2['发货地址 '].str.contains(r'嘉兴')])
标题 价格 付款人数 店铺 发货地址
3 稻香私房鲜肉粽蛋黄肉粽嘉兴粽子咸鸭蛋礼盒装端午节送礼特产团购 138 1936人付款 稻香村食品旗舰店 北京
6 五芳斋华礼竹篮礼盒1360g蛋粽组合端午礼品嘉兴粽子礼盒 159 1028人付款 天猫超市 上海
8 真真老老嘉情礼盒10粽6蛋1.52kg/盒嘉兴粽子端午节粽子礼盒装 109 2117人付款 天猫超市 上海
9 五芳斋嘉兴粽子新鲜量贩蛋黄肉粽豆沙粽悦喜散装端午特产600g*2袋 59.9 1349人付款 天猫超市 上海
10 真真老老粽子臻芯800g/盒*1端午节礼盒装嘉兴特产送礼 75 1815人付款 天猫超市 上海
... ... ... ... ... ...
4377 超港端午粽子肉粽蜜枣粽大肉棕子黄山特产礼盒新鲜嘉兴粽子肉粽 39.9 67人付款 超港旗舰店 安徽 黄山
4383 真真老老真情粽子咸蛋礼盒嘉兴特产肉粽豆沙甜枣粽端午节团购批发 79.9 10人付款 曈宝食品专营店 上海
4386 臻味德嘉兴粽子竹篮礼盒装农家手工蛋黄鲜肉棕甜粽端午节伴手礼品 49.08 2人付款 气泡西柚屋 浙江 温州
4393 傅太粽子10只蛋黄肉粽鲜肉粽嘉兴风味肉粽子即食早餐端午团购 60.2 46人付款 love侯氏 江西 萍乡
4401 【粉丝专享】端午节特产新鲜蛋黄鲜肉粽豆沙嘉兴粽子800g/箱 65.9 149人付款 chaoge1931 安徽 合肥
[1032 rows x 5 columns]
# Work on an independent copy so the column assignment below does not write
# into a slice of another frame (SettingWithCopyWarning).
ex1_1 = ex1_1.copy()
# Parse the price text; rows whose price is not a valid float are absent from
# the right-hand side and therefore become NaN through index alignment.
ex1_1[' 价格'] = ex1_1.loc[ex1_1[' 价格'].map(inval), ' 价格'].map(to_val)
# Drop the rows that lack a price or a shipping address.
ex1_3 = ex1_1.dropna(subset=[' 价格', '发货地址 ']).copy()
# Cut the prices into five quantile bins and label them from low to high.
qcut = pd.qcut(ex1_3[' 价格'], 5)
ex1_3['类别'] = qcut.cat.rename_categories(['低', '较低', '中', '较高', '高'])
# Put the category next to the title and list the highest category first.
ex1_3 = ex1_3[['标题', '类别', ' 价格', '付款人数', '店铺', '发货地址 ']].sort_values('类别', ascending=False)
print(ex1_3)
# Number of rows whose buyer-count column is missing.
print(ex1_3[ex1_3['付款人数'].isnull()].shape[0])
ex1_4 = ex1_3.convert_dtypes()
# Strip the "人付款" suffix and the "+" sign from the buyer-count text.
# `regex` is passed explicitly everywhere below because the default of
# Series.str.replace differs between pandas versions.
ex1_4['人数'] = ex1_4['付款人数'].str.replace('人付款', '', regex=False)
ex1_4['人数'] = ex1_4['人数'].str.replace('+', '', regex=False)
# Counts written with a decimal point (e.g. "1.5万"): drop the dot and expand
# "万" to three zeros.
# NOTE(review): this is only correct with exactly one digit after the dot.
df_dot = ex1_4.loc[ex1_4['人数'].str.contains(r'\.', na=False), '人数'].str.replace('.', '', regex=False)
df_dot = df_dot.str.replace('万', '000', regex=False)
ex1_4.loc[df_dot.index, '人数'] = df_dot
# Remaining counts in units of "万" (ten thousand): expand to four zeros.
df_wan = ex1_4.loc[ex1_4['人数'].str.contains('万', na=False), '人数'].str.replace('万', '0000', regex=False)
ex1_4.loc[df_wan.index, '人数'] = df_wan
print(ex1_4.loc[ex1_4['人数'].isna()])
# Convert the cleaned text to numbers; unparsable entries become NaN.
ex1_4['人数'] = pd.to_numeric(ex1_4['人数'], errors='coerce')
# Linear interpolation of the missing counts inside each price category,
# with the rows of the category ordered by price.
for _, group in ex1_4.groupby('类别'):
    ex1_4.loc[group.index, '人数'] = group[[' 价格', '人数']].sort_values(by=' 价格')['人数'].interpolate()
print(ex1_4.loc[ex1_4['人数'].isna()])
# Whatever is still missing gets a fixed fallback of 900. A scalar is
# assigned so this works for any number of remaining gaps.
ex1_4.loc[ex1_4['人数'].isna(), '人数'] = 900
# Write the filled counts back into the text column in "<n>人付款" form.
person = '人付款'
amount = [str(round(value)) + person for value in ex1_4.loc[ex1_4['付款人数'].isna(), '人数']]
ex1_4.loc[ex1_4['付款人数'].isna(), '付款人数'] = amount
print(ex1_4)
# Render the price as text so it can be concatenated with the other columns.
ex1_1[' 价格'] = ex1_1[' 价格'].astype('str')
# Build one descriptive sentence per listing.
ex1_5 = (
    '商品发货地为' + ex1_1['发货地址 ']
    + ',店铺为' + ex1_1['店铺']
    + ',共计' + ex1_1['付款人数']
    + ',单价为' + ex1_1[' 价格']
)
print(ex1_5)
# Split every sentence back into its four named fields.
print(
    ex1_5.str.extract(
        r'商品发货地为(?P<发货地址>\w+\s?\w+?),店铺为(?P<店铺>[\w]+),共计(?P<付款人数>\d+[万\+]*?人付款),单价为(?P<价格>\d+\.?\d+)'
    )
)
发货地址 店铺 付款人数 价格
0 浙江 嘉兴 五芳斋官方旗舰店 6人付款 129.0
1 上海 天猫超市 8人付款 44.0
2 浙江 嘉兴 五芳斋官方旗舰店 100万+人付款 89.9
3 北京 稻香村食品旗舰店 1936人付款 138.0
4 浙江 嘉兴 城城喂食猫 9500+人付款 3.8
... ... ... ... ...
4398 浙江 嘉兴 红船旗舰店 347人付款 7.9
4399 四川 乐山 峨眉山隐栗粑粑罗栗 80人付款 93.0
4400 北京 北京美程嘉译 5人付款 98.2
4401 安徽 合肥 chaoge1931 149人付款 65.9
4402 广东 东莞 hezhiqi 20人付款 99.0
[4400 rows x 4 columns]
# Read the bike-ride CSV, parsing both timestamp columns as datetimes.
df_3 = pd.read_csv(
    'data/摩拜单车数据.csv',
    parse_dates=['start_time', 'end_time'],
)
# Preview the first rows, then print the per-column summary.
df_3.head()
df_3.info()
RangeIndex: 102361 entries, 0 to 102360
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 orderid 102361 non-null int64
1 bikeid 102361 non-null int64
2 userid 102361 non-null int64
3 start_time 102361 non-null datetime64[ns]
4 start_location_x 102361 non-null float64
5 start_location_y 102361 non-null float64
6 end_time 102361 non-null datetime64[ns]
7 end_location_x 102361 non-null float64
8 end_location_y 102361 non-null float64
9 track 102361 non-null object
dtypes: datetime64[ns](2), float64(4), int64(3), object(1)
memory usage: 7.8+ MB
# Calendar date of each ride's start as "YYYY-MM-DD" text. The vectorised
# dt accessor replaces a per-row str() slice.
df_3['date'] = df_3['start_time'].dt.strftime('%Y-%m-%d')
# Rides whose start falls on day-of-week 5 or 6 (Saturday / Sunday).
df_weekend = df_3.loc[df_3['start_time'].dt.dayofweek.isin([5, 6])]
# Number of weekend rides divided by the number of distinct weekend dates.
weekend = len(df_weekend) / df_weekend['date'].nunique()
weekend
# Average daily rides on weekends
3275.125
# Rides whose start does not fall on day-of-week 5 or 6.
df_weekdays = df_3[~df_3['start_time'].dt.dayofweek.isin([5, 6])]
# Number of those rides divided by the number of distinct dates they cover.
weekdays = df_weekdays.shape[0] / df_weekdays['date'].nunique()
weekdays
# Average daily rides on weekdays
3311.304347826087
(2)
from datetime import datetime

# Morning window: weekday rides starting between 07:30 and 09:30 inclusive.
start_morn = datetime.strptime('7:30', '%H:%M').time()
end_morn = datetime.strptime('9:30', '%H:%M').time()
df_w_morn = df_weekdays[df_weekdays['start_time'].dt.time.between(start_morn, end_morn)]
# Morning ride count per date
s_morn = df_w_morn.groupby('date')['orderid'].count()

# Evening window: weekday rides starting between 17:00 and 19:00 inclusive.
start_after = datetime.strptime('17:00', '%H:%M').time()
end_after = datetime.strptime('19:00', '%H:%M').time()
df_w_after = df_weekdays[df_weekdays['start_time'].dt.time.between(start_after, end_after)]
# Evening ride count per date
s_after = df_w_after.groupby('date')['orderid'].count()

# Per-date difference (morning minus evening); count the dates on which the
# morning count is at least the evening count.
total = list((s_morn - s_after).values)
number = sum(1 for diff in total if diff >= 0)
number
# Only 3 days
3
(3)
# Select the Friday rides (day-of-week 4)
df_fri = df_weekdays[df_weekdays['start_time'].dt.dayofweek == 4]
# Date with the highest number of Friday orders.
df_fri.groupby('date')['orderid'].count().idxmax()
'2016-08-26'
# Rides of 2016-08-26. The copy makes df_26 an independent frame, so the
# columns added below are not written into a slice of df_fri
# (SettingWithCopyWarning).
df_26 = df_fri.loc[df_fri['date'] == '2016-08-26'].copy()
df_26['use_time'] = df_26['end_time'] - df_26['start_time']
# Bucket edges: (0, 30 min], (30 min, 2 h], (2 h, 6 h]
list_time = [pd.Timedelta(minutes=0), pd.Timedelta(minutes=30), pd.Timedelta(hours=2), pd.Timedelta(hours=6)]
df_26['group_time'] = pd.cut(df_26['use_time'], bins=list_time)
# Average ride duration per bucket (sum / count), grouping only once.
use_time_by_bucket = df_26.groupby('group_time')['use_time']
use_time_by_bucket.sum() / use_time_by_bucket.count()
group_time
(0 days 00:00:00, 0 days 00:30:00] 00:12:04.639175
(0 days 00:30:00, 0 days 02:00:00] 00:49:03.421588
(0 days 02:00:00, 0 days 06:00:00] 02:34:37.142857
Name: use_time, dtype: timedelta64[ns]
(4)
import math
# Sphere radius used by the haversine helpers. 6371 matches Earth's mean
# radius in kilometres, so distances are presumably in km - TODO confirm.
EARTH_RADIUS = 6371
def cal_dis_meters(latitude1, longitude1, latitude2, longitude2):
    """Return haversine distances between paired coordinates given in degrees.

    Args:
        latitude1, longitude1: Coordinates of the first points. Any
            array-like (list, tuple, numpy array, pandas Series) or a scalar.
        latitude2, longitude2: Coordinates of the second points, paired
            element by element with the first ones.

    Returns:
        A list of Python floats, one per pair, each rounded to 2 decimals.
        The unit is that of ``EARTH_RADIUS`` (with 6371 this is presumably
        kilometres, despite the function name).

    Values are paired by position, so a pandas Series with a non-default
    index works as well as plain lists. Empty input returns an empty list.
    """
    # Local import keeps the block self-contained; pandas already needs numpy.
    import numpy as np

    lat1, lng1, lat2, lng2 = (
        np.radians(np.atleast_1d(np.asarray(values, dtype=float)))
        for values in (latitude1, longitude1, latitude2, longitude2)
    )
    hav = (
        np.sin((lat1 - lat2) / 2.0) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lng1 - lng2) / 2.0) ** 2
    )
    # Clamp to [0, 1] so float rounding can never push arcsin out of domain.
    central_angle = 2.0 * np.arcsin(np.sqrt(np.clip(hav, 0.0, 1.0)))
    return [round(value, 2) for value in (central_angle * EARTH_RADIUS).tolist()]
# Start-to-end distance of every ride; the *_y columns are passed as
# latitude and the *_x columns as longitude.
df_3['distance'] = cal_dis_meters(
    latitude1=df_3['start_location_y'],
    longitude1=df_3['start_location_x'],
    latitude2=df_3['end_location_y'],
    longitude2=df_3['end_location_x'],
)
df_3
(5)
# Ride duration rendered as text with its first seven characters removed
# (for a value printed as "0 days HH:MM:SS" that leaves "HH:MM:SS").
df_3['use_time'] = (df_3['end_time'] - df_3['start_time']).map(
    lambda delta: str(delta)[7:]
)
# Convert an "HH:MM:SS" duration string to hours
def min2hou(x):
    """Convert a duration string ``"HH:MM[:SS]"`` to hours, rounded to 2 places.

    Args:
        x: Text whose first two ``:``-separated fields are hours and minutes.
            Surrounding whitespace is ignored.

    Returns:
        ``hours + minutes / 60`` rounded to two decimals. The seconds field
        is ignored.

    The whole hour field is parsed. Reading only its second character would
    drop the tens digit for durations of ten hours or more.
    """
    hours, minutes = x.strip().split(':')[:2]
    return round(int(hours) + int(minutes) / 60, 2)
# Duration in hours, and the distance it implies at 15 per hour
# (presumably an assumed riding speed in km/h - TODO confirm).
df_3['hour'] = df_3['use_time'].map(min2hou)
df_3['cal_distance'] = 15 * df_3['hour']
df_3
# Rides whose implied distance is at least twice the start-to-end distance.
df_3[df_3['cal_distance'].div(df_3['distance']).ge(2)]
(6)
# Some track strings still contain stray backslashes; strip them out.
# regex=False treats the backslash as a literal character on every pandas
# version, since the default of Series.str.replace is version-dependent.
has_backslash = df_3['track'].str.contains(r'[\\]')
s = df_3.loc[has_backslash, 'track'].str.replace('\\', '', regex=False)
df_3.loc[has_backslash, 'track'] = s
# Build, for every ride, a list of (first, second) float tuples from its
# track string: points are separated by '#', the two values of a point by ','.
# Iterating the column values directly avoids a label lookup per row and does
# not depend on df_3 having a default 0..n-1 index.
result = [
    [
        (float(fields[0]), float(fields[1]))
        for fields in (point.split(',') for point in track.split('#'))
    ]
    for track in df_3['track']
]
def cal_real_meters(latitude1, longitude1, latitude2, longitude2):
    """Return the haversine distance between two points given in degrees.

    The result is the central angle between the points multiplied by the
    module-level ``EARTH_RADIUS``, so it is in whatever unit that constant
    uses.
    """
    deg_to_rad = math.pi / 180
    lat_a = deg_to_rad * latitude1
    lat_b = deg_to_rad * latitude2
    lng_a = deg_to_rad * longitude1
    lng_b = deg_to_rad * longitude2
    half_dlat = (lat_a - lat_b) / 2.0
    half_dlng = (lng_a - lng_b) / 2.0
    hav = math.sin(half_dlat) ** 2 + math.cos(lat_a) * math.cos(lat_b) * math.sin(half_dlng) ** 2
    return 2.0 * math.asin(math.sqrt(hav)) * EARTH_RADIUS
# Length of every ride's recorded track: the sum of the distances between
# consecutive track points, rounded to 2 decimals. Each point's second value
# is passed as latitude and its first as longitude.
real_dist = []
for row_no in range(len(df_3)):
    points = result[row_no]
    track_length = sum(
        cal_real_meters(here[1], here[0], there[1], there[0])
        for here, there in zip(points, points[1:])
    )
    real_dist.append(round(track_length, 2))
real_dist
df_3['real_dist'] = real_dist
# Rides whose implied distance is at least twice the recorded track length.
df_3[df_3['cal_distance'].div(df_3['real_dist']).ge(2)]