# New York City Taxi Trip Duration纽约出租车大数据探索
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
train=pd.read_csv(r"D:\2018_BigData\Python\Kaggle_learning\New York City Taxi Trip Duration\train.csv")
train.head()
|
id |
vendor_id |
pickup_datetime |
dropoff_datetime |
passenger_count |
pickup_longitude |
pickup_latitude |
dropoff_longitude |
dropoff_latitude |
store_and_fwd_flag |
trip_duration |
0 |
id2875421 |
2 |
2016-03-14 17:24:55 |
2016-03-14 17:32:30 |
1 |
-73.982155 |
40.767937 |
-73.964630 |
40.765602 |
N |
455 |
1 |
id2377394 |
1 |
2016-06-12 00:43:35 |
2016-06-12 00:54:38 |
1 |
-73.980415 |
40.738564 |
-73.999481 |
40.731152 |
N |
663 |
2 |
id3858529 |
2 |
2016-01-19 11:35:24 |
2016-01-19 12:10:48 |
1 |
-73.979027 |
40.763939 |
-74.005333 |
40.710087 |
N |
2124 |
3 |
id3504673 |
2 |
2016-04-06 19:32:31 |
2016-04-06 19:39:40 |
1 |
-74.010040 |
40.719971 |
-74.012268 |
40.706718 |
N |
429 |
4 |
id2181028 |
2 |
2016-03-26 13:30:55 |
2016-03-26 13:38:10 |
1 |
-73.973053 |
40.793209 |
-73.972923 |
40.782520 |
N |
435 |
train.info()
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id 1458644 non-null object
vendor_id 1458644 non-null int64
pickup_datetime 1458644 non-null object
dropoff_datetime 1458644 non-null object
passenger_count 1458644 non-null int64
pickup_longitude 1458644 non-null float64
pickup_latitude 1458644 non-null float64
dropoff_longitude 1458644 non-null float64
dropoff_latitude 1458644 non-null float64
store_and_fwd_flag 1458644 non-null object
trip_duration 1458644 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB
train.describe()
|
vendor_id |
passenger_count |
pickup_longitude |
pickup_latitude |
dropoff_longitude |
dropoff_latitude |
trip_duration |
count |
1.458644e+06 |
1.458644e+06 |
1.458644e+06 |
1.458644e+06 |
1.458644e+06 |
1.458644e+06 |
1.458644e+06 |
mean |
1.534950e+00 |
1.664530e+00 |
-7.397349e+01 |
4.075092e+01 |
-7.397342e+01 |
4.075180e+01 |
9.594923e+02 |
std |
4.987772e-01 |
1.314242e+00 |
7.090186e-02 |
3.288119e-02 |
7.064327e-02 |
3.589056e-02 |
5.237432e+03 |
min |
1.000000e+00 |
0.000000e+00 |
-1.219333e+02 |
3.435970e+01 |
-1.219333e+02 |
3.218114e+01 |
1.000000e+00 |
25% |
1.000000e+00 |
1.000000e+00 |
-7.399187e+01 |
4.073735e+01 |
-7.399133e+01 |
4.073588e+01 |
3.970000e+02 |
50% |
2.000000e+00 |
1.000000e+00 |
-7.398174e+01 |
4.075410e+01 |
-7.397975e+01 |
4.075452e+01 |
6.620000e+02 |
75% |
2.000000e+00 |
2.000000e+00 |
-7.396733e+01 |
4.076836e+01 |
-7.396301e+01 |
4.076981e+01 |
1.075000e+03 |
max |
2.000000e+00 |
9.000000e+00 |
-6.133553e+01 |
5.188108e+01 |
-6.133553e+01 |
4.392103e+01 |
3.526282e+06 |
pd.set_option('display.max_columns', None)
train[["passenger_count","trip_duration"]].describe()
|
passenger_count |
trip_duration |
count |
1.458644e+06 |
1.458644e+06 |
mean |
1.664530e+00 |
9.594923e+02 |
std |
1.314242e+00 |
5.237432e+03 |
min |
0.000000e+00 |
1.000000e+00 |
25% |
1.000000e+00 |
3.970000e+02 |
50% |
1.000000e+00 |
6.620000e+02 |
75% |
2.000000e+00 |
1.075000e+03 |
max |
9.000000e+00 |
3.526282e+06 |
import datetime
from datetime import datetime
train.pickup_datetime = train.pickup_datetime.apply(lambda x:datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
train["pickup_yearmonth"] = train["pickup_datetime"].map(lambda x: 100*x.year + x.month)
train.head(1)
|
id |
vendor_id |
pickup_datetime |
dropoff_datetime |
passenger_count |
pickup_longitude |
pickup_latitude |
dropoff_longitude |
dropoff_latitude |
store_and_fwd_flag |
trip_duration |
pickup_yearmonth |
0 |
id2875421 |
2 |
2016-03-14 17:24:55 |
2016-03-14 17:32:30 |
1 |
-73.982155 |
40.767937 |
-73.96463 |
40.765602 |
N |
455 |
201603 |
train["pickup_yearmonth"].value_counts()
201603 256189
201604 251645
201605 248487
201602 238300
201606 234316
201601 229707
Name: pickup_yearmonth, dtype: int64
train["pickup_month"] = train["pickup_datetime"].map(lambda x: x.month)
train.head(1)
|
id |
vendor_id |
pickup_datetime |
dropoff_datetime |
passenger_count |
pickup_longitude |
pickup_latitude |
dropoff_longitude |
dropoff_latitude |
store_and_fwd_flag |
trip_duration |
pickup_yearmonth |
pickup_month |
0 |
id2875421 |
2 |
2016-03-14 17:24:55 |
2016-03-14 17:32:30 |
1 |
-73.982155 |
40.767937 |
-73.96463 |
40.765602 |
N |
455 |
201603 |
3 |
train["pickup_week"]=train.pickup_datetime.apply(lambda x: x.weekday())
train.head(2)
|
id |
vendor_id |
pickup_datetime |
dropoff_datetime |
passenger_count |
pickup_longitude |
pickup_latitude |
dropoff_longitude |
dropoff_latitude |
store_and_fwd_flag |
trip_duration |
pickup_yearmonth |
pickup_month |
pickup_week |
0 |
id2875421 |
2 |
2016-03-14 17:24:55 |
2016-03-14 17:32:30 |
1 |
-73.982155 |
40.767937 |
-73.964630 |
40.765602 |
N |
455 |
201603 |
3 |
0 |
1 |
id2377394 |
1 |
2016-06-12 00:43:35 |
2016-06-12 00:54:38 |
1 |
-73.980415 |
40.738564 |
-73.999481 |
40.731152 |
N |
663 |
201606 |
6 |
6 |
train["pickup_day"]=train.pickup_datetime.apply(lambda x: x.day)
train.head(2)
|
id |
vendor_id |
pickup_datetime |
dropoff_datetime |
passenger_count |
pickup_longitude |
pickup_latitude |
dropoff_longitude |
dropoff_latitude |
store_and_fwd_flag |
trip_duration |
pickup_yearmonth |
pickup_month |
pickup_week |
pickup_day |
0 |
id2875421 |
2 |
2016-03-14 17:24:55 |
2016-03-14 17:32:30 |
1 |
-73.982155 |
40.767937 |
-73.964630 |
40.765602 |
N |
455 |
201603 |
3 |
0 |
14 |
1 |
id2377394 |
1 |
2016-06-12 00:43:35 |
2016-06-12 00:54:38 |
1 |
-73.980415 |
40.738564 |
-73.999481 |
40.731152 |
N |
663 |
201606 |
6 |
6 |
12 |
train["pickup_date"]=train.pickup_datetime.values.astype("datetime64[D]")
train.head(2)
|
id |
vendor_id |
pickup_datetime |
dropoff_datetime |
passenger_count |
pickup_longitude |
pickup_latitude |
dropoff_longitude |
dropoff_latitude |
store_and_fwd_flag |
trip_duration |
pickup_yearmonth |
pickup_month |
pickup_week |
pickup_day |
pickup_date |
0 |
id2875421 |
2 |
2016-03-14 17:24:55 |
2016-03-14 17:32:30 |
1 |
-73.982155 |
40.767937 |
-73.964630 |
40.765602 |
N |
455 |
201603 |
3 |
0 |
14 |
2016-03-14 |
1 |
id2377394 |
1 |
2016-06-12 00:43:35 |
2016-06-12 00:54:38 |
1 |
-73.980415 |
40.738564 |
-73.999481 |
40.731152 |
N |
663 |
201606 |
6 |
6 |
12 |
2016-06-12 |
month_trip=train.groupby(['pickup_month'])["trip_duration"].agg(["sum","mean","count"])
month_trip=month_trip.reset_index()
month_trip.head(2)
|
pickup_month |
sum |
mean |
count |
0 |
1 |
211875608 |
922.373319 |
229707 |
1 |
2 |
219433897 |
920.830453 |
238300 |
month_trip.rename(columns={'sum':'month_sum_trip_dur','mean':'month_avg_trip_dur','count':"month_trip_times"}, inplace = True)
month_trip.head(2)
|
pickup_month |
month_sum_trip_dur |
month_avg_trip_dur |
month_trip_times |
0 |
1 |
211875608 |
922.373319 |
229707 |
1 |
2 |
219433897 |
920.830453 |
238300 |
date_trip=train.groupby(['pickup_date'])["trip_duration"].agg(["sum","mean","count"])
date_trip=date_trip.reset_index()
date_trip.rename(columns={'sum':'date_sum_trip_duration','mean':'date_avg_trip_dur','count':"date_trip_times"}, inplace = True)
date_trip.head()
|
pickup_date |
date_sum_trip_duration |
date_avg_trip_dur |
date_trip_times |
0 |
2016-01-01 |
6593910 |
920.679978 |
7162 |
1 |
2016-01-02 |
5470632 |
840.084767 |
6512 |
2 |
2016-01-03 |
5874410 |
924.667086 |
6353 |
3 |
2016-01-04 |
5723773 |
851.118662 |
6725 |
4 |
2016-01-05 |
10484304 |
1455.344808 |
7204 |
day_trip=train.groupby(['pickup_day'])["trip_duration"].agg(["sum","mean","count"])
day_trip=day_trip.reset_index()
day_trip.rename(columns={'sum':'day_sum_trip_duration','mean':'day_avg_trip_dur','count':"day_trip_times"}, inplace = True)
day_trip.head()
|
pickup_day |
day_sum_trip_duration |
day_avg_trip_dur |
day_trip_times |
0 |
1 |
44656812 |
958.053978 |
46612 |
1 |
2 |
44354937 |
928.860299 |
47752 |
2 |
3 |
46806173 |
976.247221 |
47945 |
3 |
4 |
47050568 |
947.549451 |
49655 |
4 |
5 |
51193213 |
1020.272899 |
50176 |
plt.figure(figsize=(16,18))
plt.subplot(321)
ax1= plt.plot(month_trip.pickup_month, month_trip.month_trip_times, color="green", alpha=0.8, label='trip times',marker='*')
plt.title("Monthly Trip Times")
plt.ylabel('# of trip times', fontsize=12)
plt.xlabel('Pickup_Month', fontsize=12)
plt.subplot(322)
ax2= plt.plot(month_trip.pickup_month, month_trip.month_avg_trip_dur, color="green",alpha=0.8,marker='o')
plt.title("Monthly Trip Duration")
plt.ylabel('Avg of trip druration', fontsize=12)
plt.xlabel('Pickup_Month', fontsize=12)
plt.subplot(312)
ax2= plt.plot(date_trip.pickup_date, date_trip.date_trip_times, color="red", alpha=0.8,marker='o')
plt.title("Date Trip Time")
plt.ylabel('Date trip time', fontsize=12)
plt.xlabel('Pickup_Date', fontsize=12)
plt.subplot(313)
ax2= plt.plot(day_trip.pickup_day, day_trip.day_trip_times, color="red", alpha=0.8,marker='*')
plt.title("Day Trip Time")
plt.ylabel('Day trip time', fontsize=12)
plt.xlabel('Pickup_Day', fontsize=12)
plt.show()
date_trip[date_trip.date_trip_times<6000]
|
pickup_date |
date_sum_trip_duration |
date_avg_trip_dur |
date_trip_times |
22 |
2016-01-23 |
1691754 |
1026.549757 |
1648 |
23 |
2016-01-24 |
3052107 |
902.189477 |
3383 |
150 |
2016-05-30 |
4568228 |
820.148654 |
5570 |
x = train.groupby(['pickup_day'])["passenger_count"].agg(["mean"])
x = x.reset_index()
day_trip["avg_passenger_count"] = x["mean"]
day_trip.head()
|
pickup_day |
day_sum_trip_duration |
day_avg_trip_dur |
day_trip_times |
avg_passenger_count |
0 |
1 |
44656812 |
958.053978 |
46612 |
1.682871 |
1 |
2 |
44354937 |
928.860299 |
47752 |
1.666611 |
2 |
3 |
46806173 |
976.247221 |
47945 |
1.655355 |
3 |
4 |
47050568 |
947.549451 |
49655 |
1.643279 |
4 |
5 |
51193213 |
1020.272899 |
50176 |
1.670301 |
plt.figure(figsize=(12,12))
plt.subplot(211)
ax2= plt.plot(day_trip.pickup_day, day_trip.avg_passenger_count, color="grey", alpha=0.8,marker='o')
plt.title("Day Trip Passenger")
plt.ylabel('Day trip passenger', fontsize=12)
plt.xlabel('Pickup_Day', fontsize=12)
plt.subplot(212)
ax2= plt.plot(day_trip.pickup_day, day_trip.day_trip_times, color="grey", alpha=0.9,marker='*')
plt.title("Day Trip Time")
plt.ylabel('Day trip time', fontsize=12)
plt.xlabel('Pickup_Day', fontsize=12)
plt.show()
train["pickup_hour"]=train.pickup_datetime.apply(lambda x: x.hour)
train.head()
|
id |
vendor_id |
pickup_datetime |
dropoff_datetime |
passenger_count |
pickup_longitude |
pickup_latitude |
dropoff_longitude |
dropoff_latitude |
store_and_fwd_flag |
trip_duration |
pickup_yearmonth |
pickup_month |
pickup_week |
pickup_day |
pickup_date |
pickup_hour |
0 |
id2875421 |
2 |
2016-03-14 17:24:55 |
2016-03-14 17:32:30 |
1 |
-73.982155 |
40.767937 |
-73.964630 |
40.765602 |
N |
455 |
201603 |
3 |
0 |
14 |
2016-03-14 |
17 |
1 |
id2377394 |
1 |
2016-06-12 00:43:35 |
2016-06-12 00:54:38 |
1 |
-73.980415 |
40.738564 |
-73.999481 |
40.731152 |
N |
663 |
201606 |
6 |
6 |
12 |
2016-06-12 |
0 |
2 |
id3858529 |
2 |
2016-01-19 11:35:24 |
2016-01-19 12:10:48 |
1 |
-73.979027 |
40.763939 |
-74.005333 |
40.710087 |
N |
2124 |
201601 |
1 |
1 |
19 |
2016-01-19 |
11 |
3 |
id3504673 |
2 |
2016-04-06 19:32:31 |
2016-04-06 19:39:40 |
1 |
-74.010040 |
40.719971 |
-74.012268 |
40.706718 |
N |
429 |
201604 |
4 |
2 |
6 |
2016-04-06 |
19 |
4 |
id2181028 |
2 |
2016-03-26 13:30:55 |
2016-03-26 13:38:10 |
1 |
-73.973053 |
40.793209 |
-73.972923 |
40.782520 |
N |
435 |
201603 |
3 |
5 |
26 |
2016-03-26 |
13 |
week_trip = train.groupby(["pickup_month",'pickup_week','pickup_day','pickup_hour'])["trip_duration"].agg(["mean","count"])
week_trip = week_trip.reset_index()
week_trip.head(2)
|
pickup_month |
pickup_week |
pickup_day |
pickup_hour |
mean |
count |
0 |
1 |
0 |
4 |
0 |
656.771186 |
118 |
1 |
1 |
0 |
4 |
1 |
703.086957 |
92 |
week_trip.rename(columns={'mean':'week_avg_trip_dur','count':"week_trip_times"}, inplace = True)
print(week_trip.shape)
week_trip.head(2)
(4359, 6)
|
pickup_month |
pickup_week |
pickup_day |
pickup_hour |
week_avg_trip_dur |
week_trip_times |
0 |
1 |
0 |
4 |
0 |
656.771186 |
118 |
1 |
1 |
0 |
4 |
1 |
703.086957 |
92 |
x1 = train.groupby(["pickup_month",'pickup_week','pickup_day','pickup_hour'])["passenger_count"].agg(["mean"])
x1 = x1.reset_index()
week_trip["avg_passenger_count"] = x1["mean"]
week_trip.head()
|
pickup_month |
pickup_week |
pickup_day |
pickup_hour |
week_avg_trip_dur |
week_trip_times |
avg_passenger_count |
0 |
1 |
0 |
4 |
0 |
656.771186 |
118 |
1.593220 |
1 |
1 |
0 |
4 |
1 |
703.086957 |
92 |
1.673913 |
2 |
1 |
0 |
4 |
2 |
692.085106 |
47 |
1.574468 |
3 |
1 |
0 |
4 |
3 |
738.500000 |
32 |
1.250000 |
4 |
1 |
0 |
4 |
4 |
644.000000 |
50 |
1.700000 |
plt.figure(figsize=(10,8))
sns.swarmplot(x="pickup_week", y="week_trip_times", hue="pickup_month", data=week_trip)
plt.show()
plt.figure(figsize=(10,5))
sns.boxplot(x="pickup_week", y="avg_passenger_count", hue="pickup_month", data=week_trip)
plt.show()
plt.figure(figsize=(16,6))
sns.swarmplot(x="pickup_hour", y="week_trip_times", data=week_trip)
plt.show()
plt.figure(figsize=(16,6))
sns.stripplot(x="pickup_hour", y="avg_passenger_count", data=week_trip)
plt.show()
plt.figure(figsize=(16,6))
y=week_trip["week_avg_trip_dur"][week_trip["week_avg_trip_dur"]<=8000]
daytripdur=sns.boxplot(x="pickup_hour", y=y, data=week_trip)
plt.show()
plt.figure(figsize=(16,6))
daytripdur=sns.boxplot(x="pickup_hour", y=y, data=week_trip)
plt.show()
from math import sin,radians,cos,asin,sqrt
def haversine(lon1, lat1, lon2, lat2):
lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
c = 2 * asin(sqrt(a))
r = 6371
return c * r
plt.figure(figsize=(16,6))
y=week_trip["week_avg_trip_dur"][week_trip["week_avg_trip_dur"]<=8000]
daytripdur=sns.boxplot(x="pickup_hour", y=y, data=week_trip)
plt.show()