2 Setup
library(tidyverse)
library(sf)
library(lubridate)
library(tigris)
library(tidycensus)
library(viridis)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(RSocrata)
library(caret)
library(gganimate)
library(gifski)
plotTheme <- theme(
plot.title =element_text(size=12),
plot.subtitle = element_text(size=8),
plot.caption = element_text(size = 6),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10),
axis.title.y = element_text(size = 10),
# Set the entire chart region to blank
panel.background=element_blank(),
plot.background=element_blank(),
#panel.border=element_rect(colour="#F0F0F0"),
# Format the grid
panel.grid.major=element_line(colour="#D0D0D0",size=.2),
axis.ticks=element_blank())
mapTheme <- theme(plot.title =element_text(size=12),
plot.subtitle = element_text(size=8),
plot.caption = element_text(size = 6),
axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_line(colour = 'transparent'),
panel.grid.minor=element_blank(),
legend.direction = "vertical",
legend.position = "right",
plot.margin = margin(1, 1, 1, 1, 'cm'),
legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))
palette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
palette4 <- c("#D2FBD4","#92BCAB","#527D82","#123F5A")
palette2 <- c("#6baed6","#08519c")
palette1 <- c("#b6cde3","#58a2e8","#1f66ab","#073b6b")
Load census API key to grab stuff from tidycensus
2.1. Import Data
Let’s read in the month of May, Usually, May has a fairly pleasant
temperature range in Philly, so we may see some leisure trips as well as
commutes. https://kiosks.bicycletransit.workers.dev/phl
dat <- read.csv("https://raw.githubusercontent.com/RumRon/HW_UP_IP/main/R/data4HW5/4.25-5.29.csv")
dat <- dat %>%
rename(., from_longitude = start_lon) %>%
rename(., from_latitude = start_lat)%>%
rename(., to_latitude = end_lat ) %>%
rename(., to_longitude = end_lon) %>%
rename(., from_station_id= start_station) %>%
rename(., to_station_id = end_station)
station_name <- read.csv("https://raw.githubusercontent.com/RumRon/HW_UP_IP/main/R/data4HW5/indego_stations.csv")
station_name<- station_name %>%
select("Station_ID","Station_Name")
dat <- left_join(dat, station_name, by = c("from_station_id" = "Station_ID")) %>%
rename(., from_station_name= Station_Name)
dat <- left_join(dat, station_name, by = c("to_station_id" = "Station_ID")) %>%
rename(., to_station_name= Station_Name)
Let’s use some date parsing to bin the data by 15 and 60 minute
intervals by rounding and take a look at our data to see the format and
names of all of our columns using the glimpse command.
dat$start_time <- strptime(dat$start_time, format = "%m/%d/%Y %H:%M")
dat$end_time <- strptime(dat$end_time, format = "%m/%d/%Y %H:%M")
dat2 <- dat %>%
mutate(interval60 = floor_date(ymd_hms(start_time), unit = "hour"),
interval15 = floor_date(ymd_hms(start_time), unit = "15 mins"),
week = week(interval60),
dotw = wday(interval60, label=TRUE))
glimpse(dat2)
## Rows: 96,034
## Columns: 21
## $ trip_id <int> 475034210, 474925307, 474976334, 474925306, 474925…
## $ duration <int> 1, 17, 357, 12, 10, 4, 10, 11, 1, 4, 3, 4, 10, 173…
## $ start_time <dttm> 2022-04-25 00:05:00, 2022-04-25 00:05:00, 2022-04…
## $ end_time <dttm> 2022-04-25 00:06:00, 2022-04-25 00:22:00, 2022-04…
## $ from_station_id <int> 3190, 3190, 3190, 3212, 3026, 3033, 3006, 3167, 32…
## $ from_latitude <dbl> 39.94892, 39.94892, 39.94892, 39.96383, 39.94182, …
## $ from_longitude <dbl> -75.16991, -75.16991, -75.16991, -75.18177, -75.14…
## $ to_station_id <int> 3000, 3021, 3205, 3111, 3100, 3046, 3066, 3061, 32…
## $ to_latitude <dbl> NA, 39.95390, 39.95405, 39.97779, 39.92777, 39.950…
## $ to_longitude <dbl> NA, -75.16902, -75.16783, -75.21323, -75.15103, -7…
## $ bike_id <int> 11828, 14639, 3436, 19327, 14500, 5198, 19879, 348…
## $ plan_duration <int> 30, 30, 1, 30, 30, 365, 30, 365, 365, 365, 30, 365…
## $ trip_route_category <chr> "One Way", "One Way", "One Way", "One Way", "One W…
## $ passholder_type <chr> "Indego30", "Indego30", "Day Pass", "Indego30", "I…
## $ bike_type <chr> "standard", "standard", "standard", "electric", "s…
## $ from_station_name <chr> "17th & Locust", "17th & Locust", "17th & Locust",…
## $ to_station_name <chr> "Virtual Station", "18th & JFK", "17th & JFK", "Pa…
## $ interval60 <dttm> 2022-04-25 00:00:00, 2022-04-25 00:00:00, 2022-04…
## $ interval15 <dttm> 2022-04-25 00:00:00, 2022-04-25 00:00:00, 2022-04…
## $ week <dbl> 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17…
## $ dotw <ord> Mon, Mon, Mon, Mon, Mon, Mon, Mon, Mon, Mon, Mon, …
2.2. Import Census Info
Using the tidycensus package, we can download census
geography and variables. These are used to test generalizeability later.
We extract the tracts for mapping and joining purposes - creating an
sf object that consists only of GEOIDs and geometries.
We add the spatial information to our rideshare data as origin and
destination data, first joining the origin station, then the destination
station to our census data.
phillyCensus <-
get_acs(geography = "tract",
variables = c("B01003_001", "B19013_001",
"B02001_002", "B08013_001",
"B08012_001", "B08301_001",
"B08301_010", "B01002_001"),
year = 2020,
state = "PA",
geometry = TRUE,
county=c("Philadelphia"),
output = "wide") %>%
rename(Total_Pop = B01003_001E,
Med_Inc = B19013_001E,
Med_Age = B01002_001E,
White_Pop = B02001_002E,
Travel_Time = B08013_001E,
Num_Commuters = B08012_001E,
Means_of_Transport = B08301_001E,
Total_Public_Trans = B08301_010E) %>%
select(Total_Pop, Med_Inc, White_Pop, Travel_Time,
Means_of_Transport, Total_Public_Trans,
Med_Age,
GEOID, geometry) %>%
mutate(Percent_White = White_Pop / Total_Pop,
Mean_Commute_Time = Travel_Time / Total_Public_Trans,
Percent_Taking_Public_Trans = Total_Public_Trans / Means_of_Transport)
phillyTracts <-
phillyCensus %>%
as.data.frame() %>%
distinct(GEOID, .keep_all = TRUE) %>%
select(GEOID, geometry) %>%
st_sf
dat_census <- st_join(dat2 %>%
filter(is.na(from_longitude) == FALSE &
is.na(from_latitude) == FALSE &
is.na(to_latitude) == FALSE &
is.na(to_longitude) == FALSE) %>%
st_as_sf(., coords = c("from_longitude", "from_latitude"), crs = 4326),
phillyTracts %>%
st_transform(crs=4326),
join=st_intersects,
left = TRUE) %>%
rename(Origin.Tract = GEOID) %>%
mutate(from_longitude = unlist(map(geometry, 1)),
from_latitude = unlist(map(geometry, 2)))%>%
as.data.frame() %>%
select(-geometry)%>%
st_as_sf(., coords = c("to_longitude", "to_latitude"), crs = 4326) %>%
st_join(., phillyTracts %>%
st_transform(crs=4326),
join=st_intersects,
left = TRUE) %>%
rename(Destination.Tract = GEOID) %>%
mutate(to_longitude = unlist(map(geometry, 1)),
to_latitude = unlist(map(geometry, 2)))%>%
as.data.frame() %>%
select(-geometry)
2.3. Import Weather Data
Import weather data from Philadelphia airport (code PHL) using
riem_measures. We can mutate the data to get
temperature, wind speed, precipitation on an hourly basis and plot the
temperature and precipitation trends over our study period.
These data can also be categorized as a part of an exploration of the
relationship between your independent and dependent variables,
e.g. “does wind appear to affect ridership during rush hour?”
weather.Panel <-
riem_measures(station = "PHL", date_start = "2022-04-25", date_end = "2022-05-30") %>%
dplyr::select(valid, tmpf, p01i, sknt)%>%
replace(is.na(.), 0) %>%
mutate(interval60 = ymd_h(substr(valid,1,13))) %>%
mutate(week = week(interval60),
dotw = wday(interval60, label=TRUE)) %>%
group_by(interval60) %>%
summarize(Temperature = max(tmpf),
Precipitation = sum(p01i),
Wind_Speed = max(sknt)) %>%
mutate(Temperature = ifelse(Temperature == 0, 42, Temperature))
glimpse(weather.Panel)
## Rows: 840
## Columns: 4
## $ interval60 <dttm> 2022-04-25 00:00:00, 2022-04-25 01:00:00, 2022-04-25 02…
## $ Temperature <dbl> 55.0, 54.0, 52.0, 51.1, 50.0, 48.0, 48.0, 48.0, 46.0, 46…
## $ Precipitation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Wind_Speed <dbl> 12, 11, 10, 8, 8, 7, 7, 7, 7, 7, 8, 9, 9, 10, 10, 10, 11…
grid.arrange(
ggplot(weather.Panel, aes(interval60,Precipitation)) + geom_line() +
labs(title="Percipitation", x="Hour", y="Perecipitation") + plotTheme,
ggplot(weather.Panel, aes(interval60,Wind_Speed)) + geom_line() +
labs(title="Wind Speed", x="Hour", y="Wind Speed") + plotTheme,
ggplot(weather.Panel, aes(interval60,Temperature)) + geom_line() +
labs(title="Temperature", x="Hour", y="Temperature") + plotTheme,
top="Weather Data - Philadelphia PHL - May, 2022")

2.4. Import Amenity Data
Import school data as amenity.
library(FNN)
school <-
st_read("https://raw.githubusercontent.com/RumRon/HW_UP_IP/main/R/Schools.geojson") %>%
st_transform(crs=4326)
root.dir = "https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/"
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")
3. Describe and Explore the Data
Examining the time and frequency components of our data.
3.1. Overall Time Pattern
First, we look at the overall time pattern - there is clearly a daily
periodicity and there are lull periods on weekends. Notice that the
weekend near the 28th of May (Memorial Day) doesn’t have the same dip in
activity.
ggplot(dat_census %>%
group_by(interval60) %>%
tally())+
geom_line(aes(x = interval60, y = n))+
labs(title="Bike share per hr. Philadelphia, May, 2022",
x="Date",
y="Number of trips")+
plotTheme()

3.2. Examine the Distribution of Trip Volume by Station for
Different Times of the Day
We clearly have a few high volume periods but mostly low volume. Our
data must consist of a lot of low demand station/hours and a few high
demand station hours.
There’s a possibility we may have to treat these as count data here,
which means running Poisson regression. Then again, we might have enough
of the higher counts in our high volume times and stations, that we
should really be building a linear model to accomodate our actual volume
and not worry about the low trip times/stations.
We can also track the daily trends in ridership by day of the week
and weekend versus weekday, to see what temporal patterns we’d like to
control for.
dat_census %>%
mutate(time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
group_by(interval60, from_station_name, time_of_day) %>%
tally()%>%
group_by(from_station_name, time_of_day)%>%
summarize(mean_trips = mean(n))%>%
ggplot()+
geom_histogram(aes(mean_trips), binwidth = 1)+
labs(title="Mean Number of Hourly Trips Per Station. Phialdelphia, May, 2022",
x="Number of trips",
y="Frequency")+
facet_wrap(~time_of_day)+
plotTheme()

ggplot(dat_census %>%
group_by(interval60, from_station_name) %>%
tally())+
geom_histogram(aes(n), binwidth = 5)+
labs(title="Bike share trips per hr by station. Philadelphia, May, 2022",
x="Trip Counts",
y="Number of Stations")+
plotTheme()

ggplot(dat_census %>% mutate(hour = hour(start_time)))+
geom_freqpoly(aes(hour, color = dotw), binwidth = 1)+
labs(title="Bike share trips in Philadelphia, by day of the week, May, 2022",
x="Hour",
y="Trip Counts")+
plotTheme()

ggplot(dat_census %>%
mutate(hour = hour(start_time),
weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday")))+
geom_freqpoly(aes(hour, color = weekend), binwidth = 1)+
labs(title="Bike share in Philadelphia - weekend vs weekday, May, 2022",
x="Hour",
y="Trip Counts")+
plotTheme()

ggplot()+
geom_sf(data = phillyTracts %>%
st_transform(crs=4326))+
geom_point(data = dat_census %>%
mutate(hour = hour(start_time),
weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
group_by(from_station_id, from_latitude, from_longitude, weekend, time_of_day) %>%
tally(),
aes(x=from_longitude, y = from_latitude, color = n),
fill = "transparent", alpha = 0.4, size = 1)+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "D")+
ylim(min(dat_census$from_latitude), max(dat_census$from_latitude))+
xlim(min(dat_census$from_longitude), max(dat_census$from_longitude))+
facet_grid(weekend ~ time_of_day)+
labs(title="Bike share trips per hr by station. Philadlephia, May, 2022")+
mapTheme()

3.3 Create Space-Time Panel
First we have to make sure each unique station and hour/day
combo exists in our data set. This is done in order to create a
“panel” (e.g. a time-series) data set where each time period in the
study is represented by a row - whether an observation took place then
or not. So if a station didn’t have any trips originating from it at a
given hour, we still need a zero in that spot in the panel.
We start by determining the maximum number of combinations.
Then we compare that to the actual number of combinations. We create
an empty data frame study.panel, is created that has each
unique space/time observations. This is done using the expand.grid
function and unique. Along the way, we keep tabs on the number of rows
our data have - nrow shows that the count is still
correct.
We then join the station name, tract and lat/lon (some have multiple
lat lon info, so we just take the first one of each using
group_by and slice).
length(unique(dat_census$interval60)) * length(unique(dat_census$from_station_id))
## [1] 148452
study.panel <-
expand.grid(interval60=unique(dat_census$interval60),
from_station_id = unique(dat_census$from_station_id)) %>%
left_join(., dat_census %>%
select(from_station_id, from_station_name, Origin.Tract, from_longitude, from_latitude )%>%
distinct() %>%
group_by(from_station_id) %>%
slice(1))
nrow(study.panel)
## [1] 148452
We create the full panel by summarizing counts by station for each
time interval, keep census info and lat/lon information along for
joining later to other data. We remove data for station IDs that are
FALSE.
ride.panel <-
dat_census %>%
mutate(Trip_Counter = 1) %>%
right_join(study.panel) %>%
group_by(interval60, from_station_id, from_station_name, Origin.Tract, from_longitude, from_latitude) %>%
summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>%
left_join(weather.Panel) %>%
ungroup() %>%
filter(is.na(from_station_id) == FALSE) %>%
mutate(week = week(interval60),
dotw = wday(interval60, label = TRUE)) %>%
filter(is.na(Origin.Tract) == FALSE)
ride.panel <-
left_join(ride.panel, phillyCensus %>%
as.data.frame() %>%
select(-geometry), by = c("Origin.Tract" = "GEOID"))
from <- ride.panel %>%
dplyr::select(from_longitude, from_latitude)
to <- st_coordinates(school)
ride.panel <-
ride.panel %>%
mutate(
school_dist = nn_function(from, to, 3)
)
3.4. Create time lags
Creating time lag variables will add additional nuance about the
demand during a given time period - hours before and during that
day.
We can also try to control for the effects of holidays that disrupt
the expected demand during a given weekend or weekday. We have a holiday
on May 28 - Memorial Day. For that three day weekend we could use some
dummy variables indicating temporal proximity to the holiday.
The demand right now should be relatively similar to the demand
tomorrow at this time, and to the demand an hour from now, but twelve
hours from now, we likely expect the opposite in terms of demand.
ride.panel <-
ride.panel %>%
arrange(from_station_name, interval60) %>%
mutate(lagHour = dplyr::lag(Trip_Count,1),
lag2Hours = dplyr::lag(Trip_Count,2),
lag3Hours = dplyr::lag(Trip_Count,3),
lag4Hours = dplyr::lag(Trip_Count,4),
lag12Hours = dplyr::lag(Trip_Count,12),
lag1day = dplyr::lag(Trip_Count,24),
holiday = ifelse(yday(interval60) == 148,1,0)) %>%
mutate(day = yday(interval60)) %>%
mutate(holidayLag = case_when(dplyr::lag(holiday, 1) == 1 ~ "PlusOneDay",
dplyr::lag(holiday, 2) == 1 ~ "PlustTwoDays",
dplyr::lag(holiday, 3) == 1 ~ "PlustThreeDays",
dplyr::lead(holiday, 1) == 1 ~ "MinusOneDay",
dplyr::lead(holiday, 2) == 1 ~ "MinusTwoDays",
dplyr::lead(holiday, 3) == 1 ~ "MinusThreeDays"),
#holidayLag = replace_na(holidayLag, 0))
holidayLag = ifelse(is.na(holidayLag) == TRUE, 0, holidayLag))
as.data.frame(ride.panel) %>%
group_by(interval60) %>%
summarise_at(vars(starts_with("lag"), "Trip_Count"), mean, na.rm = TRUE) %>%
gather(Variable, Value, -interval60, -Trip_Count) %>%
mutate(Variable = factor(Variable, levels=c("lagHour","lag2Hours","lag3Hours","lag4Hours",
"lag12Hours","lag1day","school_dist")))%>%
group_by(Variable) %>%
summarize(correlation = round(cor(Value, Trip_Count),2))
## # A tibble: 6 × 2
## Variable correlation
## <fct> <dbl>
## 1 lagHour 0.89
## 2 lag2Hours 0.71
## 3 lag3Hours 0.52
## 4 lag4Hours 0.33
## 5 lag12Hours -0.52
## 6 lag1day 0.79
4. Run Models
We split our data into a training and a test set. We create five
linear models using the lm funtion. Sometimes, for data
such as these, Poisson distributions, designed for modeling counts,
might be appropriate. I’ll spare you the effort - linear models work
better with this particular data set.
We create the models using our training data ride.Train.
The first models include only temporal controls, but the later ones
contain all of our lag information.
ride.Train <- ride.panel %>% filter(., week >= 20)
ride.Test <- ride.panel %>% filter(., week < 20)
reg1 <-
lm(Trip_Count ~ hour(interval60) + dotw + Temperature, data=ride.Train)
reg2 <-
lm(Trip_Count ~ from_station_name + dotw + Temperature, data=ride.Train)
reg3 <-
lm(Trip_Count ~ from_station_name + hour(interval60) + dotw + Temperature + Precipitation,
data=ride.Train)
reg4 <-
lm(Trip_Count ~ from_station_name + hour(interval60) + dotw + Temperature + Precipitation +
lagHour + lag2Hours +lag3Hours + lag12Hours + lag1day,
data=ride.Train)
reg5 <-
lm(Trip_Count ~ from_station_name + hour(interval60) + dotw +
Temperature + Precipitation + lagHour + lag2Hours +
lag3Hours +lag12Hours + lag1day + holidayLag + holiday + school_dist,
data=ride.Train)
4.1. Predict for test data
When models have finished running, create a nested data frame of test
data by week. Nested data is common in most other programming languages.
For instance, the javascript object notation file format (aka JSON) is
highly nested.
Nesting means that instead of merely having a “flat” file consisting
of rows and columns, we have a matrix of other objects - imagine each
cell in a matrix containing another matrix within it, or a list, or a
list of lists.
The purrr package is designed to map
functions through nested data structures. This concept is important -
think of map as visiting each dataframe in a nested data
set and applies a function to it.
We create a function called model_pred which we can then
map onto each data frame in our nested structure.
This function is called in the code below in a few ways, one way is
like so:
map(.x = data, fit = name_of_your_regression, .f = model_pred).
Here’s the important bit - the argument fit takes the name
of a regression you have created that you want to use to make
predictions, and the .f argument takes a function, in this
case model_pred, which we create in order to simply execute
the predict function.
ride.Test.weekNest <-
ride.Test %>%
nest(-week)
model_pred <- function(dat, fit){
pred <- predict(fit, newdata = dat)}
When we run our predictions and summarize our results, we are going
to have some NA data - recall we have some lag information that will
necessarily trip up the model at the margins of the time frame.
week_predictions <-
ride.Test.weekNest %>%
mutate(ATime_FE = map(.x = data, fit = reg1, .f = model_pred),
BSpace_FE = map(.x = data, fit = reg2, .f = model_pred),
CTime_Space_FE = map(.x = data, fit = reg3, .f = model_pred),
DTime_Space_FE_timeLags = map(.x = data, fit = reg4, .f = model_pred),
ETime_Space_FE_timeLags_holidayLags = map(.x = data, fit = reg5, .f = model_pred)) %>%
gather(Regression, Prediction, -data, -week) %>%
mutate(Observed = map(data, pull, Trip_Count),
Absolute_Error = map2(Observed, Prediction, ~ abs(.x - .y)),
MAE = map_dbl(Absolute_Error, mean, na.rm = TRUE),
sd_AE = map_dbl(Absolute_Error, sd, na.rm = TRUE))
## Warning in predict.lm(fit, newdata = dat): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(fit, newdata = dat): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(fit, newdata = dat): prediction from a rank-deficient fit
## may be misleading
week_predictions
## # A tibble: 15 × 8
## week data Regression Prediction Observed Absolute_Error MAE sd_AE
## <dbl> <list> <chr> <list> <list> <list> <dbl> <dbl>
## 1 17 <tibble> ATime_FE <dbl> <dbl> <dbl [21,360]> 0.780 0.771
## 2 18 <tibble> ATime_FE <dbl> <dbl> <dbl [29,904]> 0.801 0.870
## 3 19 <tibble> ATime_FE <dbl> <dbl> <dbl [29,904]> 0.779 0.829
## 4 17 <tibble> BSpace_FE <dbl> <dbl> <dbl [21,360]> 0.673 0.819
## 5 18 <tibble> BSpace_FE <dbl> <dbl> <dbl [29,904]> 0.707 0.883
## 6 19 <tibble> BSpace_FE <dbl> <dbl> <dbl [29,904]> 0.683 0.826
## 7 17 <tibble> CTime_Space_FE <dbl> <dbl> <dbl [21,360]> 0.715 0.738
## 8 18 <tibble> CTime_Space_FE <dbl> <dbl> <dbl [29,904]> 0.737 0.825
## 9 19 <tibble> CTime_Space_FE <dbl> <dbl> <dbl [29,904]> 0.721 0.789
## 10 17 <tibble> DTime_Space_FE… <dbl> <dbl> <dbl [21,360]> 0.657 0.695
## 11 18 <tibble> DTime_Space_FE… <dbl> <dbl> <dbl [29,904]> 0.670 0.765
## 12 19 <tibble> DTime_Space_FE… <dbl> <dbl> <dbl [29,904]> 0.648 0.732
## 13 17 <tibble> ETime_Space_FE… <dbl> <dbl> <dbl [21,360]> 0.661 0.692
## 14 18 <tibble> ETime_Space_FE… <dbl> <dbl> <dbl [29,904]> 0.672 0.766
## 15 19 <tibble> ETime_Space_FE… <dbl> <dbl> <dbl [29,904]> 0.648 0.731
5. Accuracy
5.1 MAE
week_predictions %>%
dplyr::select(week, Regression, MAE) %>%
gather(Variable, MAE, -Regression, -week) %>%
ggplot(aes(week, MAE)) +
geom_bar(aes(fill = Regression), position = "dodge", stat="identity") +
scale_fill_manual(values = palette5) +
labs(title = "Mean Absolute Errors by model specification and week") +
plotTheme()
We could see that regression D and E has relatively low MAE by model
specification and week.
5.2 reg5 seems to have the best goodness of fit
generally.
week_predictions %>%
mutate(interval60 = map(data, pull, interval60),
from_station_id = map(data, pull, from_station_id)) %>%
dplyr::select(interval60, from_station_id, Observed, Prediction, Regression) %>%
unnest() %>%
gather(Variable, Value, -Regression, -interval60, -from_station_id) %>%
group_by(Regression, Variable, interval60) %>%
summarize(Value = sum(Value)) %>%
ggplot(aes(interval60, Value, colour=Variable)) +
geom_line(size = 1.1) +
facet_wrap(~Regression, ncol=1) +
labs(title = "Predicted/Observed bike share time series", subtitle = "Philadelphia; A test set of 2 weeks", x = "Hour", y= "Station Trips") +
plotTheme()

week_predictions %>%
mutate(interval60 = map(data, pull, interval60),
from_station_id = map(data, pull, from_station_id),
from_latitude = map(data, pull, from_latitude),
from_longitude = map(data, pull, from_longitude)) %>%
select(interval60, from_station_id, from_longitude, from_latitude, Observed, Prediction, Regression) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags") %>%
group_by(from_station_id, from_longitude, from_latitude) %>%
summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
ggplot(.)+
geom_sf(data = phillyCensus, color = "grey", fill = "transparent")+
geom_point(aes(x = from_longitude, y = from_latitude, color = MAE),
fill = "transparent", alpha = 0.4)+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "D")+
ylim(min(dat_census$from_latitude), max(dat_census$from_latitude))+
xlim(min(dat_census$from_longitude), max(dat_census$from_longitude))+
labs(title="Mean Abs Error, Test Set, Model 5")+
mapTheme()
There are some MAE around downtown area and spread to University City
and nearby areas.
5.3. Space-Time Error Evaluation
If we plot observed vs. predicted for different times of day during
the week and weekend, some patterns begin to emerge.
week_predictions %>%
mutate(interval60 = map(data, pull, interval60),
from_station_id = map(data, pull, from_station_id),
from_latitude = map(data, pull, from_latitude),
from_longitude = map(data, pull, from_longitude),
dotw = map(data, pull, dotw)) %>%
select(interval60, from_station_id, from_longitude,
from_latitude, Observed, Prediction, Regression,
dotw) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
ggplot()+
geom_point(aes(x= Observed, y = Prediction))+
geom_smooth(aes(x= Observed, y= Prediction), method = "lm", se = FALSE, color = "red")+
geom_abline(slope = 1, intercept = 0)+
facet_grid(time_of_day~weekend)+
labs(title="Observed vs Predicted",
x="Observed trips",
y="Predicted trips")+
plotTheme()

Weekend’s morning is the hardest to predict.
5.4. MAE map by weekend/weekday and time of day.
week_predictions %>%
mutate(interval60 = map(data, pull, interval60),
from_station_id = map(data, pull, from_station_id),
from_latitude = map(data, pull, from_latitude),
from_longitude = map(data, pull, from_longitude),
dotw = map(data, pull, dotw) ) %>%
select(interval60, from_station_id, from_longitude,
from_latitude, Observed, Prediction, Regression,
dotw) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
group_by(from_station_id, weekend, time_of_day, from_longitude, from_latitude) %>%
summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
ggplot(.)+
geom_sf(data = phillyCensus, color = "grey", fill = "transparent")+
geom_point(aes(x = from_longitude, y = from_latitude, color = MAE),
fill = "transparent", size = 0.5, alpha = 0.4)+
scale_colour_viridis(direction = -1,
discrete = FALSE, option = "D")+
ylim(min(dat_census$from_latitude), max(dat_census$from_latitude))+
xlim(min(dat_census$from_longitude), max(dat_census$from_longitude))+
facet_grid(weekend~time_of_day)+
labs(title="Mean Absolute Errors, Test Set")+
mapTheme()

Seems like errors are concentrated in downtown areas. The pattern has
been visualized both as the scatter plot and the spatial map. The
ridership is high during the evening rush as compared to the morning
rush, hence why the errors are higher in the weekday predictions.
5.5 Errors as a function of socio-economic variables
week_predictions %>%
mutate(interval60 = map(data, pull, interval60),
from_station_id = map(data, pull, from_station_id),
from_latitude = map(data, pull, from_latitude),
from_longitude = map(data, pull, from_longitude),
dotw = map(data, pull, dotw),
Percent_Taking_Public_Trans = map(data, pull, Percent_Taking_Public_Trans),
Med_Inc = map(data, pull, Med_Inc),
Med_Age = map(data, pull, Med_Age),
Mean_Commute_Time = map(data,pull,Mean_Commute_Time),
Percent_White = map(data, pull, Percent_White)) %>%
select(interval60, from_station_id, from_longitude,
from_latitude, Observed, Prediction, Regression,
dotw, Med_Inc, Med_Age, Mean_Commute_Time,Percent_Taking_Public_Trans) %>%
unnest() %>%
filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
filter(time_of_day == "AM Rush") %>%
group_by(from_station_id, Med_Inc, Percent_Taking_Public_Trans, Mean_Commute_Time) %>%
summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
gather(-from_station_id, -MAE, key = "variable", value = "value")%>%
ggplot(.)+
#geom_sf(data = phillyCensus, color = "grey", fill = "transparent")+
geom_point(aes(x = value, y = MAE), alpha = 0.4)+
geom_smooth(aes(x = value, y = MAE), method = "lm", se= FALSE)+
facet_wrap(~variable, scales = "free")+
labs(title="Errors as a function of socio-economic variables",
y="Mean Absolute Error (Trips)")+
plotTheme()

Let’s focus on the morning commute, where station locations probably
relate to likely users. How is the model performing on weekday mornings
relative to demand for public transportation (e.g. possible user base).
We can tell that there are a select few stations that are proving
sightly resistant to our model - they have long communication time, high
mean income and low transit usage, demographically.
5.6 Animated Map by space/time dependencies
week21 <-
filter(dat_census , week == 21)
week21.panel <-
expand.grid(
interval15 = unique(week21$interval15),
Pickup.Census.Tract = unique(dat_census$from_station_id))
ride.animation.data <-
mutate(week21, Trip_Counter = 1) %>%
select(interval15, from_station_id, from_longitude, from_latitude, Trip_Counter) %>%
group_by(interval15, from_station_id, from_longitude, from_latitude) %>%
summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>%
ungroup() %>%
mutate(Trips = case_when(Trip_Count == 0 ~ "0 trips",
Trip_Count > 0 & Trip_Count <= 2 ~ "0-2 trips",
Trip_Count > 2 & Trip_Count <= 5 ~ "2-5 trips",
Trip_Count > 5 & Trip_Count <= 10 ~ "5-10 trips",
Trip_Count > 10 & Trip_Count <= 15 ~ "10-15 trips",
Trip_Count > 15 ~ "15+ trips")) %>%
mutate(Trips = fct_relevel(Trips, "0 trips","0-2 trips","2-5 trips",
"5-10 trips","10-15 trips","15+ trips"))
library(FNN)
mapTheme <- function(base_size = 12) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = 18,colour = "black"),
plot.subtitle=element_text(face="italic"),
plot.caption=element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),axis.title = element_blank(),
axis.text = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2)
)
}
rideshare_animation <-
ggplot()+
geom_sf(data = phillyTracts %>%
st_transform(crs=4326), colour = '#efefef')+
geom_point(data = ride.animation.data,
aes(x = from_longitude, y = from_latitude, fill = Trips, color = Trips), size = 1, alpha = 1.5) +
scale_colour_manual(values = palette1) +
labs(title = "Indego pickups for one week in May 2022",
subtitle = "15 minute intervals: {current_frame}") +
transition_manual(interval15) +
mapTheme()
animate(rideshare_animation, duration=20, renderer = gifski_renderer())

---
title: "Space-Time Prediction of Bike Share Demand - HW5 (MUSA 508, Fall, 2022)"
author: "Student: Rui Jiang; Instructor: Michael Fichman"
date: "November 16, 2022"
output: 
  html_document:
    toc: true
    toc_float: true
    code_folding: "hide"
    code_download: true
---

## 1.1. Introduction

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

One of the most difficult operational problems for urban bike share systems is the need to ‘re-balance’ bicycles across the network. Bike share is not useful if a dock has no bikes to pickup, nor if there are no open docking spaces to deposit a bike. Re-balancing is the practice of anticipating (or predicting) bike share demand for all docks at all times and manually redistributing bikes to ensure a bike or a docking place is available when needed.<br />
The demand for parking spaces, uber trips, bike share, road access and a whole host of urban transportation phenomena are time and space dependent, and modeling them frequently involves simply controlling for the day, hour, location, weather and other temporal phenomena. Quite simply, the demand for bike share trips today at my location at 5PM is probably highly correlated with the demand last week at the same time. <br />
This project will predict only the demand, but it will give us a window into how we can use time-space predictive modeling to address an operations issue. If we knew the bike station capacities, we could see when demand for bikes might drive stations to run out of bikes, and then move excess bikes from elsewhere. For example, we could provide reward credits for those who end their trip at the high-demand stations depends on the prediction. Today’s trend is similar to the trend tomorrow and this week’s trend will be similar to next week trends. Hence, we will be able to predict how many bikes and from which station to high accuracy. Those credits could be used by riders who tries to pay their next period plan (monthly or yearly pass) or guest pass. Electric bikes require extra 20¢/minute to unlock. Credits could also work as a discount for the extra electric bikes fee. <br />
Philadelphia’s bikeshare program – offers 600 self-service bikes among 60 stations all day, every day. Philadelphia’s City government owns the bicycles and stations, with the Mayor’s Office of Transportation and Utilities (MOTU) planning and managing the project. The City of Philadelphia and its partners at Bicycle Transit Systems are sharing anonymized Indego trip data, downloadable from OpenDataPhilly and on Indego’s webiste. The data includes the membership type, bike numbers, checkout kiosk name, ID, and location of stations, trip duration, checkout and return times, and a category distinguishing between one way and round trips. 
https://www.rideindego.com/about/data/

## 2 Setup

```{r setup_13, cache=TRUE, message=FALSE}
library(tidyverse)
library(sf)
library(lubridate)
library(tigris)
library(tidycensus)
library(viridis)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(RSocrata)
library(caret)
library(gganimate)
library(gifski)

plotTheme <- theme(
  plot.title =element_text(size=12),
  plot.subtitle = element_text(size=8),
  plot.caption = element_text(size = 6),
  axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
  axis.text.y = element_text(size = 10),
  axis.title.y = element_text(size = 10),
  # Set the entire chart region to blank
  panel.background=element_blank(),
  plot.background=element_blank(),
  #panel.border=element_rect(colour="#F0F0F0"),
  # Format the grid
  panel.grid.major=element_line(colour="#D0D0D0",size=.2),
  axis.ticks=element_blank())

mapTheme <- theme(plot.title =element_text(size=12),
                  plot.subtitle = element_text(size=8),
                  plot.caption = element_text(size = 6),
                  axis.line=element_blank(),
                  axis.text.x=element_blank(),
                  axis.text.y=element_blank(),
                  axis.ticks=element_blank(),
                  axis.title.x=element_blank(),
                  axis.title.y=element_blank(),
                  panel.background=element_blank(),
                  panel.border=element_blank(),
                  panel.grid.major=element_line(colour = 'transparent'),
                  panel.grid.minor=element_blank(),
                  legend.direction = "vertical", 
                  legend.position = "right",
                  plot.margin = margin(1, 1, 1, 1, 'cm'),
                  legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))

palette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
palette4 <- c("#D2FBD4","#92BCAB","#527D82","#123F5A")
palette2 <- c("#6baed6","#08519c")
palette1 <- c("#b6cde3","#58a2e8","#1f66ab","#073b6b")
```

Load census API key to grab stuff from `tidycensus`

```{r install_census_API_key, warning = FALSE, include=FALSE, eval = TRUE}
# Install Census API Key
tidycensus::census_api_key("2f748668ad5407296cc1ffdff1a4ab3b2aa98a84", overwrite = TRUE)
```

### 2.1. Import Data

Let's read in the month of May, Usually, May has a fairly pleasant temperature range in Philly, so we may see some leisure trips as well as commutes.
https://kiosks.bicycletransit.workers.dev/phl


```{r read_dat}
dat <- read.csv("https://raw.githubusercontent.com/RumRon/HW_UP_IP/main/R/data4HW5/4.25-5.29.csv")

dat <- dat %>%
  rename(., from_longitude = start_lon) %>%
  rename(., from_latitude = start_lat)%>%
  rename(., to_latitude = end_lat ) %>%
  rename(., to_longitude = end_lon) %>%
  rename(., from_station_id= start_station) %>%
  rename(., to_station_id = end_station)
station_name <- read.csv("https://raw.githubusercontent.com/RumRon/HW_UP_IP/main/R/data4HW5/indego_stations.csv")
station_name<- station_name %>%
  select("Station_ID","Station_Name")
dat <- left_join(dat, station_name, by = c("from_station_id" = "Station_ID")) %>%
  rename(., from_station_name= Station_Name) 
dat <- left_join(dat, station_name, by = c("to_station_id" = "Station_ID")) %>%
  rename(., to_station_name= Station_Name) 
```

Let's use some date parsing to bin the data by 15 and 60 minute intervals by rounding and take a look at our data to see the format and names of all of our columns using the `glimpse` command.

```{r time_bins, echo=TRUE}
dat$start_time <- strptime(dat$start_time, format = "%m/%d/%Y %H:%M")
dat$end_time <- strptime(dat$end_time, format = "%m/%d/%Y %H:%M")
dat2 <- dat %>%
  mutate(interval60 = floor_date(ymd_hms(start_time), unit = "hour"),
         interval15 = floor_date(ymd_hms(start_time), unit = "15 mins"),
         week = week(interval60),
         dotw = wday(interval60, label=TRUE))

glimpse(dat2)
```

### 2.2. Import Census Info

Using the `tidycensus` package, we can download census geography and variables. These are used to test generalizeability later. We extract the tracts for mapping and joining purposes - creating an `sf` object that consists only of GEOIDs and geometries.

We add the spatial information to our rideshare data as origin and destination data, first joining the origin station, then the destination station to our census data.

```{r get_census, message=FALSE, warning=FALSE, cache=FALSE, results = 'hide'}
phillyCensus <- 
  get_acs(geography = "tract", 
          variables = c("B01003_001", "B19013_001", 
                        "B02001_002", "B08013_001",
                        "B08012_001", "B08301_001", 
                        "B08301_010", "B01002_001"), 
          year = 2020, 
          state = "PA", 
          geometry = TRUE, 
          county=c("Philadelphia"),
          output = "wide") %>%
  rename(Total_Pop =  B01003_001E,
         Med_Inc = B19013_001E,
         Med_Age = B01002_001E,
         White_Pop = B02001_002E,
         Travel_Time = B08013_001E,
         Num_Commuters = B08012_001E,
         Means_of_Transport = B08301_001E,
         Total_Public_Trans = B08301_010E) %>%
  select(Total_Pop, Med_Inc, White_Pop, Travel_Time,
         Means_of_Transport, Total_Public_Trans,
         Med_Age,
         GEOID, geometry) %>%
  mutate(Percent_White = White_Pop / Total_Pop,
         Mean_Commute_Time = Travel_Time / Total_Public_Trans,
         Percent_Taking_Public_Trans = Total_Public_Trans / Means_of_Transport)
```

```{r extract_geometries }
phillyTracts <- 
  phillyCensus %>%
  as.data.frame() %>%
  distinct(GEOID, .keep_all = TRUE) %>%
  select(GEOID, geometry) %>% 
  st_sf

```


```{r add_census_tracts , message = FALSE, warning = FALSE}
dat_census <- st_join(dat2 %>% 
          filter(is.na(from_longitude) == FALSE &
                   is.na(from_latitude) == FALSE &
                   is.na(to_latitude) == FALSE &
                   is.na(to_longitude) == FALSE) %>%
          st_as_sf(., coords = c("from_longitude", "from_latitude"), crs = 4326),
        phillyTracts %>%
          st_transform(crs=4326),
        join=st_intersects,
              left = TRUE) %>%
  rename(Origin.Tract = GEOID) %>%
  mutate(from_longitude = unlist(map(geometry, 1)),
         from_latitude = unlist(map(geometry, 2)))%>%
  as.data.frame() %>%
  select(-geometry)%>%
  st_as_sf(., coords = c("to_longitude", "to_latitude"), crs = 4326) %>%
  st_join(., phillyTracts %>%
            st_transform(crs=4326),
          join=st_intersects,
          left = TRUE) %>%
  rename(Destination.Tract = GEOID)  %>%
  mutate(to_longitude = unlist(map(geometry, 1)),
         to_latitude = unlist(map(geometry, 2)))%>%
  as.data.frame() %>%
  select(-geometry)
```

### 2.3. Import Weather Data

Import weather data from Philadelphia airport (code PHL) using `riem_measures`. We can `mutate` the data to get temperature, wind speed, precipitation on an hourly basis and plot the temperature and precipitation trends over our study period.

These data can also be categorized as a part of an exploration of the relationship between your independent and dependent variables, e.g. "does wind appear to affect ridership during rush hour?"

```{r import_weather, message=FALSE, warning=FALSE}
weather.Panel <- 
  riem_measures(station = "PHL", date_start = "2022-04-25", date_end = "2022-05-30") %>%
  dplyr::select(valid, tmpf, p01i, sknt)%>%
  replace(is.na(.), 0) %>%
    mutate(interval60 = ymd_h(substr(valid,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60) %>%
    summarize(Temperature = max(tmpf),
              Precipitation = sum(p01i),
              Wind_Speed = max(sknt)) %>%
    mutate(Temperature = ifelse(Temperature == 0, 42, Temperature))

glimpse(weather.Panel)
```

```{r plot_weather, catche = TRUE}
grid.arrange(
  ggplot(weather.Panel, aes(interval60,Precipitation)) + geom_line() + 
  labs(title="Percipitation", x="Hour", y="Perecipitation") + plotTheme,
  ggplot(weather.Panel, aes(interval60,Wind_Speed)) + geom_line() + 
    labs(title="Wind Speed", x="Hour", y="Wind Speed") + plotTheme,
  ggplot(weather.Panel, aes(interval60,Temperature)) + geom_line() + 
    labs(title="Temperature", x="Hour", y="Temperature") + plotTheme,
  top="Weather Data - Philadelphia PHL - May, 2022")
```

### 2.4. Import Amenity Data

Import school data as amenity.

```{r import_amenity, results='hide'}
library(FNN)
school <-
  st_read("https://raw.githubusercontent.com/RumRon/HW_UP_IP/main/R/Schools.geojson") %>%
  st_transform(crs=4326)
root.dir = "https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/"
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")



```


## 3. Describe and Explore the Data

Examining the time and frequency components of our data.

### 3.1. Overall Time Pattern

First, we look at the overall time pattern - there is clearly a daily periodicity and there are lull periods on weekends. Notice that the weekend near the 28th of May (Memorial Day) doesn't have the same dip in activity.

```{r trip_timeseries }
ggplot(dat_census %>%
         group_by(interval60) %>%
         tally())+
  geom_line(aes(x = interval60, y = n))+
  labs(title="Bike share per hr. Philadelphia, May, 2022",
       x="Date", 
       y="Number of trips")+
  plotTheme()
```

### 3.2. Examine the Distribution of Trip Volume by Station for Different Times of the Day

We clearly have a few high volume periods but mostly low volume. Our data must consist of a lot of low demand station/hours and a few high demand station hours. 

There's a possibility we may have to treat these as count data here, which means running Poisson regression. Then again, we might have enough of the higher counts in our high volume times and stations, that we should really be building a linear model to accomodate our actual volume and not worry about the low trip times/stations.

We can also track the daily trends in ridership by day of the week and weekend versus weekday, to see what temporal patterns we'd like to control for.

```{r mean_trips_hist, warning = FALSE, message = FALSE }
dat_census %>%
        mutate(time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
         group_by(interval60, from_station_name, time_of_day) %>%
         tally()%>%
  group_by(from_station_name, time_of_day)%>%
  summarize(mean_trips = mean(n))%>%
  ggplot()+
  geom_histogram(aes(mean_trips), binwidth = 1)+
  labs(title="Mean Number of Hourly Trips Per Station. Phialdelphia, May, 2022",
       x="Number of trips", 
       y="Frequency")+
  facet_wrap(~time_of_day)+
  plotTheme()
```

```{r trips_station_dotw }
ggplot(dat_census %>%
         group_by(interval60, from_station_name) %>%
         tally())+
  geom_histogram(aes(n), binwidth = 5)+
  labs(title="Bike share trips per hr by station. Philadelphia, May, 2022",
       x="Trip Counts", 
       y="Number of Stations")+
  plotTheme()
```

```{r trips_hour_dotw }
ggplot(dat_census %>% mutate(hour = hour(start_time)))+
     geom_freqpoly(aes(hour, color = dotw), binwidth = 1)+
  labs(title="Bike share trips in Philadelphia, by day of the week, May, 2022",
       x="Hour", 
       y="Trip Counts")+
     plotTheme()


ggplot(dat_census %>% 
         mutate(hour = hour(start_time),
                weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday")))+
     geom_freqpoly(aes(hour, color = weekend), binwidth = 1)+
  labs(title="Bike share in Philadelphia - weekend vs weekday, May, 2022",
       x="Hour", 
       y="Trip Counts")+
     plotTheme()
```


```{r origin_map }
ggplot()+
  geom_sf(data = phillyTracts %>%
          st_transform(crs=4326))+
  geom_point(data = dat_census %>% 
            mutate(hour = hour(start_time),
                weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
                time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
              group_by(from_station_id, from_latitude, from_longitude, weekend, time_of_day) %>%
              tally(),
            aes(x=from_longitude, y = from_latitude, color = n), 
            fill = "transparent", alpha = 0.4, size = 1)+
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "D")+
  ylim(min(dat_census$from_latitude), max(dat_census$from_latitude))+
  xlim(min(dat_census$from_longitude), max(dat_census$from_longitude))+
  facet_grid(weekend ~ time_of_day)+
  labs(title="Bike share trips per hr by station. Philadlephia, May, 2022")+
  mapTheme()
```


### 3.3 Create Space-Time Panel

First **we have to make sure each unique station and hour/day combo exists in our data set.** This is done in order to create a "panel" (e.g. a time-series) data set where each time period in the study is represented by a row - whether an observation took place then or not. So if a station didn't have any trips originating from it at a given hour, we still need a zero in that spot in the panel.

We start by determining the maximum number of combinations.

Then we compare that to the actual number of combinations. We create an empty data frame `study.panel`, is created that has each unique space/time observations. This is done using the expand.grid function and unique. Along the way, we keep tabs on the number of rows our data have - `nrow` shows that the count is still correct.

We then join the station name, tract and lat/lon (some have multiple lat lon info, so we just take the first one of each using `group_by` and `slice`).


```{r panel_length_check , message = FALSE, warning = FALSE}
length(unique(dat_census$interval60)) * length(unique(dat_census$from_station_id))


study.panel <- 
  expand.grid(interval60=unique(dat_census$interval60), 
              from_station_id = unique(dat_census$from_station_id)) %>%
  left_join(., dat_census %>%
              select(from_station_id, from_station_name, Origin.Tract, from_longitude, from_latitude )%>%
              distinct() %>%
              group_by(from_station_id) %>%
              slice(1))

nrow(study.panel)      
```


We create the full panel by summarizing counts by station for each time interval, keep census info and lat/lon information along for joining later to other data. We remove data for station IDs that are `FALSE`.

```{r create_panel , message = FALSE}
ride.panel <- 
  dat_census %>%
  mutate(Trip_Counter = 1) %>%
  right_join(study.panel) %>% 
  group_by(interval60, from_station_id, from_station_name, Origin.Tract, from_longitude, from_latitude) %>%
  summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>%
  left_join(weather.Panel) %>%
  ungroup() %>%
  filter(is.na(from_station_id) == FALSE) %>%
  mutate(week = week(interval60),
         dotw = wday(interval60, label = TRUE)) %>%
  filter(is.na(Origin.Tract) == FALSE)
```

```{r census_and_panel , message = FALSE}
ride.panel <- 
  left_join(ride.panel, phillyCensus %>%
              as.data.frame() %>%
              select(-geometry), by = c("Origin.Tract" = "GEOID"))
```

```{r amenity, message = FALSE}
from <- ride.panel %>%
  dplyr::select(from_longitude, from_latitude)
to <- st_coordinates(school)

ride.panel <-
  ride.panel %>%
  mutate(
    school_dist = nn_function(from, to, 3)
  )
```
### 3.4. Create time lags

Creating time lag variables will add additional nuance about the demand during a given time period - hours before and during that day. 

We can also try to control for the effects of holidays that disrupt the expected demand during a given weekend or weekday. We have a holiday on May 28 - Memorial Day. For that three day weekend we could use some dummy variables indicating temporal proximity to the holiday.

The demand right now should be relatively similar to the demand tomorrow at this time, and to the demand an hour from now, but twelve hours from now, we likely expect the opposite in terms of demand.


```{r time_lags , message = FALSE}
ride.panel <- 
  ride.panel %>% 
  arrange(from_station_name, interval60) %>% 
  mutate(lagHour = dplyr::lag(Trip_Count,1),
         lag2Hours = dplyr::lag(Trip_Count,2),
         lag3Hours = dplyr::lag(Trip_Count,3),
         lag4Hours = dplyr::lag(Trip_Count,4),
         lag12Hours = dplyr::lag(Trip_Count,12),
         lag1day = dplyr::lag(Trip_Count,24),
         holiday = ifelse(yday(interval60) == 148,1,0)) %>%
   mutate(day = yday(interval60)) %>%
   mutate(holidayLag = case_when(dplyr::lag(holiday, 1) == 1 ~ "PlusOneDay",
                                 dplyr::lag(holiday, 2) == 1 ~ "PlustTwoDays",
                                 dplyr::lag(holiday, 3) == 1 ~ "PlustThreeDays",
                                 dplyr::lead(holiday, 1) == 1 ~ "MinusOneDay",
                                 dplyr::lead(holiday, 2) == 1 ~ "MinusTwoDays",
                                 dplyr::lead(holiday, 3) == 1 ~ "MinusThreeDays"),
         #holidayLag = replace_na(holidayLag, 0))
         holidayLag = ifelse(is.na(holidayLag) == TRUE, 0, holidayLag))

```

```{r evaluate_lags , warning = FALSE, message = FALSE}
as.data.frame(ride.panel) %>%
    group_by(interval60) %>% 
    summarise_at(vars(starts_with("lag"), "Trip_Count"), mean, na.rm = TRUE) %>%
    gather(Variable, Value, -interval60, -Trip_Count) %>%
    mutate(Variable = factor(Variable, levels=c("lagHour","lag2Hours","lag3Hours","lag4Hours",
                                                "lag12Hours","lag1day","school_dist")))%>%
    group_by(Variable) %>%  
    summarize(correlation = round(cor(Value, Trip_Count),2))
```


## 4. Run Models

We split our data into a training and a test set. We create five linear models using the `lm` funtion. Sometimes, for data such as these, Poisson distributions, designed for modeling counts, might be appropriate. I'll spare you the effort - linear models work better with this particular data set. 

We create the models using our training data `ride.Train`. The first models include only temporal controls, but the later ones contain all of our lag information.


```{r train_test }
ride.Train <- ride.panel %>% filter(., week >= 20)
ride.Test <- ride.panel %>% filter(., week < 20)
```


```{r five_models }
reg1 <- 
  lm(Trip_Count ~  hour(interval60) + dotw + Temperature,  data=ride.Train)

reg2 <- 
  lm(Trip_Count ~  from_station_name + dotw + Temperature,  data=ride.Train)

reg3 <- 
  lm(Trip_Count ~  from_station_name + hour(interval60) + dotw + Temperature + Precipitation, 
     data=ride.Train)

reg4 <- 
  lm(Trip_Count ~  from_station_name +  hour(interval60) + dotw + Temperature + Precipitation +
                   lagHour + lag2Hours +lag3Hours + lag12Hours + lag1day, 
     data=ride.Train)

reg5 <- 
  lm(Trip_Count ~  from_station_name + hour(interval60) + dotw + 
                  Temperature + Precipitation + lagHour + lag2Hours +
                  lag3Hours +lag12Hours + lag1day + holidayLag + holiday + school_dist, 
     data=ride.Train)
```

### 4.1. Predict for test data

When models have finished running, create a nested data frame of test data by week. Nested data is common in most other programming languages. For instance, the javascript object notation file format (aka JSON) is highly nested.

Nesting means that instead of merely having a "flat" file consisting of rows and columns, we have a matrix of other objects - imagine each cell in a matrix containing another matrix within it, or a list, or a list of lists. 

The `purrr` package is designed to `map` functions through nested data structures. This concept is important - think of `map` as visiting each dataframe in a nested data set and applies a function to it.

We create a function called `model_pred` which we can then `map` onto each data frame in our nested structure.

This function is called in the code below in a few ways, one way is like so: `map(.x = data, fit = name_of_your_regression, .f = model_pred)`. Here's the important bit - the argument `fit` takes the name of a regression you have created that you want to use to make predictions, and the `.f` argument takes a function, in this case `model_pred`, which we create in order to simply execute the `predict` function.

```{r nest_data , warning = FALSE, message = FALSE}
ride.Test.weekNest <- 
  ride.Test %>%
  nest(-week) 
```


```{r predict_function }
model_pred <- function(dat, fit){
   pred <- predict(fit, newdata = dat)}
```

When we run our predictions and summarize our results, we are going to have some NA data - recall we have some lag information that will necessarily trip up the model at the margins of the time frame. 

```{r do_predicitons, message=FALSE}
week_predictions <- 
  ride.Test.weekNest %>% 
    mutate(ATime_FE = map(.x = data, fit = reg1, .f = model_pred),
           BSpace_FE = map(.x = data, fit = reg2, .f = model_pred),
           CTime_Space_FE = map(.x = data, fit = reg3, .f = model_pred),
           DTime_Space_FE_timeLags = map(.x = data, fit = reg4, .f = model_pred),
           ETime_Space_FE_timeLags_holidayLags = map(.x = data, fit = reg5, .f = model_pred)) %>% 
    gather(Regression, Prediction, -data, -week) %>%
    mutate(Observed = map(data, pull, Trip_Count),
           Absolute_Error = map2(Observed, Prediction,  ~ abs(.x - .y)),
           MAE = map_dbl(Absolute_Error, mean, na.rm = TRUE),
           sd_AE = map_dbl(Absolute_Error, sd, na.rm = TRUE))

week_predictions
```

## 5. Accuracy

### 5.1 MAE

```{r plot_errors_by_model }
week_predictions %>%
  dplyr::select(week, Regression, MAE) %>%
  gather(Variable, MAE, -Regression, -week) %>%
  ggplot(aes(week, MAE)) + 
    geom_bar(aes(fill = Regression), position = "dodge", stat="identity") +
    scale_fill_manual(values = palette5) +
    labs(title = "Mean Absolute Errors by model specification and week") +
  plotTheme()
```
We could see that regression D and E has relatively low MAE by model specification and week.

### 5.2 `reg5` seems to have the best goodness of fit generally.

```{r error_vs_actual_timeseries , warning = FALSE, message = FALSE}
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           from_station_id = map(data, pull, from_station_id)) %>%
    dplyr::select(interval60, from_station_id, Observed, Prediction, Regression) %>%
    unnest() %>%
    gather(Variable, Value, -Regression, -interval60, -from_station_id) %>%
    group_by(Regression, Variable, interval60) %>%
    summarize(Value = sum(Value)) %>%
    ggplot(aes(interval60, Value, colour=Variable)) + 
      geom_line(size = 1.1) + 
      facet_wrap(~Regression, ncol=1) +
      labs(title = "Predicted/Observed bike share time series", subtitle = "Philadelphia; A test set of 2 weeks",  x = "Hour", y= "Station Trips") +
      plotTheme()
```


```{r errors_by_station, warning = FALSE, message = FALSE }
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           from_station_id = map(data, pull, from_station_id), 
           from_latitude = map(data, pull, from_latitude), 
           from_longitude = map(data, pull, from_longitude)) %>%
    select(interval60, from_station_id, from_longitude, from_latitude, Observed, Prediction, Regression) %>%
    unnest() %>%
  filter(Regression == "ETime_Space_FE_timeLags_holidayLags") %>%
  group_by(from_station_id, from_longitude, from_latitude) %>%
  summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
ggplot(.)+
  geom_sf(data = phillyCensus, color = "grey", fill = "transparent")+
  geom_point(aes(x = from_longitude, y = from_latitude, color = MAE), 
             fill = "transparent", alpha = 0.4)+
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "D")+
  ylim(min(dat_census$from_latitude), max(dat_census$from_latitude))+
  xlim(min(dat_census$from_longitude), max(dat_census$from_longitude))+
  labs(title="Mean Abs Error, Test Set, Model 5")+
  mapTheme()
```
There are some MAE around downtown area and spread to University City and nearby areas.

### 5.3. Space-Time Error Evaluation

If we plot observed vs. predicted for different times of day during the week and weekend, some patterns begin to emerge. 

```{r obs_pred_all, warning=FALSE, message = FALSE, cache=TRUE}
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           from_station_id = map(data, pull, from_station_id), 
           from_latitude = map(data, pull, from_latitude), 
           from_longitude = map(data, pull, from_longitude),
           dotw = map(data, pull, dotw)) %>%
    select(interval60, from_station_id, from_longitude, 
           from_latitude, Observed, Prediction, Regression,
           dotw) %>%
    unnest() %>%
  filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
  mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
         time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush"))%>%
  ggplot()+
  geom_point(aes(x= Observed, y = Prediction))+
    geom_smooth(aes(x= Observed, y= Prediction), method = "lm", se = FALSE, color = "red")+
    geom_abline(slope = 1, intercept = 0)+
  facet_grid(time_of_day~weekend)+
  labs(title="Observed vs Predicted",
       x="Observed trips", 
       y="Predicted trips")+
  plotTheme()
```

Weekend's morning is the hardest to predict. 

### 5.4. MAE map by weekend/weekday and time of day.

```{r station_summary, warning=FALSE, message = FALSE }
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           from_station_id = map(data, pull, from_station_id), 
           from_latitude = map(data, pull, from_latitude), 
           from_longitude = map(data, pull, from_longitude),
           dotw = map(data, pull, dotw) ) %>%
    select(interval60, from_station_id, from_longitude, 
           from_latitude, Observed, Prediction, Regression,
           dotw) %>%
    unnest() %>%
  filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
  mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
         time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
  group_by(from_station_id, weekend, time_of_day, from_longitude, from_latitude) %>%
  summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
  ggplot(.)+
  geom_sf(data = phillyCensus, color = "grey", fill = "transparent")+
  geom_point(aes(x = from_longitude, y = from_latitude, color = MAE), 
             fill = "transparent", size = 0.5, alpha = 0.4)+
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "D")+
  ylim(min(dat_census$from_latitude), max(dat_census$from_latitude))+
  xlim(min(dat_census$from_longitude), max(dat_census$from_longitude))+
  facet_grid(weekend~time_of_day)+
  labs(title="Mean Absolute Errors, Test Set")+
  mapTheme()
  
```

Seems like errors are concentrated in downtown areas. The pattern has been visualized both as the scatter plot and the spatial map. The ridership is high during the evening rush as compared to the morning rush, hence why the errors are higher in the weekday predictions. 

### 5.5 Errors as a function of socio-economic variables

```{r station_summary2, warning=FALSE, message = FALSE }
week_predictions %>% 
    mutate(interval60 = map(data, pull, interval60),
           from_station_id = map(data, pull, from_station_id), 
           from_latitude = map(data, pull, from_latitude), 
           from_longitude = map(data, pull, from_longitude),
           dotw = map(data, pull, dotw),
           Percent_Taking_Public_Trans = map(data, pull, Percent_Taking_Public_Trans),
           Med_Inc = map(data, pull, Med_Inc),
           Med_Age = map(data, pull, Med_Age),
           Mean_Commute_Time = map(data,pull,Mean_Commute_Time),
           Percent_White = map(data, pull, Percent_White)) %>%
    select(interval60, from_station_id, from_longitude, 
           from_latitude, Observed, Prediction, Regression,
           dotw, Med_Inc, Med_Age, Mean_Commute_Time,Percent_Taking_Public_Trans) %>%
    unnest() %>%
  filter(Regression == "ETime_Space_FE_timeLags_holidayLags")%>%
  mutate(weekend = ifelse(dotw %in% c("Sun", "Sat"), "Weekend", "Weekday"),
         time_of_day = case_when(hour(interval60) < 7 | hour(interval60) > 18 ~ "Overnight",
                                 hour(interval60) >= 7 & hour(interval60) < 10 ~ "AM Rush",
                                 hour(interval60) >= 10 & hour(interval60) < 15 ~ "Mid-Day",
                                 hour(interval60) >= 15 & hour(interval60) <= 18 ~ "PM Rush")) %>%
  filter(time_of_day == "AM Rush") %>%
  group_by(from_station_id, Med_Inc, Percent_Taking_Public_Trans, Mean_Commute_Time) %>%
  summarize(MAE = mean(abs(Observed-Prediction), na.rm = TRUE))%>%
  gather(-from_station_id, -MAE, key = "variable", value = "value")%>%
  ggplot(.)+
  #geom_sf(data = phillyCensus, color = "grey", fill = "transparent")+
  geom_point(aes(x = value, y = MAE), alpha = 0.4)+
  geom_smooth(aes(x = value, y = MAE), method = "lm", se= FALSE)+
  facet_wrap(~variable, scales = "free")+
  labs(title="Errors as a function of socio-economic variables",
       y="Mean Absolute Error (Trips)")+
  plotTheme()
  
```

Let's focus on the morning commute, where station locations probably relate to likely users. How is the model performing on weekday mornings relative to demand for public transportation (e.g. possible user base). We can tell that there are a select few stations that are proving sightly resistant to our model - they have long communication time, high mean income and low transit usage, demographically.



### 5.6  Animated Map by space/time dependencies
 
```{r animation, warning=FALSE, message = FALSE }
week21 <-
  filter(dat_census , week == 21)

week21.panel <-
  expand.grid(
    interval15 = unique(week21$interval15),
    Pickup.Census.Tract = unique(dat_census$from_station_id))

ride.animation.data <-
  mutate(week21, Trip_Counter = 1) %>%
  select(interval15, from_station_id, from_longitude, from_latitude, Trip_Counter) %>%
  group_by(interval15, from_station_id, from_longitude, from_latitude) %>%
  summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>% 
  ungroup() %>% 
  mutate(Trips = case_when(Trip_Count == 0 ~ "0 trips",
                           Trip_Count > 0 & Trip_Count <= 2 ~ "0-2 trips",
                           Trip_Count > 2 & Trip_Count <= 5 ~ "2-5 trips",
                           Trip_Count > 5 & Trip_Count <= 10 ~ "5-10 trips",
                           Trip_Count > 10 & Trip_Count <= 15 ~ "10-15 trips",
                           Trip_Count > 15 ~ "15+ trips")) %>%
  mutate(Trips  = fct_relevel(Trips, "0 trips","0-2 trips","2-5 trips",
                              "5-10 trips","10-15 trips","15+ trips"))
library(FNN)

mapTheme <- function(base_size = 12) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = 18,colour = "black"),
    plot.subtitle=element_text(face="italic"),
    plot.caption=element_text(hjust=0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),axis.title = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=2)
  )
}

rideshare_animation <-
  ggplot()+
  geom_sf(data = phillyTracts %>%
            st_transform(crs=4326), colour = '#efefef')+
  geom_point(data = ride.animation.data, 
             aes(x = from_longitude, y = from_latitude, fill = Trips, color = Trips), size = 1, alpha = 1.5) +
  scale_colour_manual(values = palette1) +
  labs(title = "Indego pickups for one week in May 2022",
       subtitle = "15 minute intervals: {current_frame}") +
  transition_manual(interval15) +
  mapTheme()

animate(rideshare_animation, duration=20, renderer = gifski_renderer())
```
 
## 6. k-fold cross validation
 
```{r cv }
indegosample <- sample_n(ride.panel, 100000)%>%
  na.omit()

fitControl <- trainControl(method = "cv", 
                           number = 100,
                           savePredictions = TRUE)

set.seed(42)
# for k-folds CV

reg.cv <-  
  train(Trip_Count ~ from_station_name + hour(interval60) + dotw + Temperature + Precipitation +
                   lagHour + lag2Hours +lag3Hours +lag12Hours + lag1day + holidayLag + holiday, 
        data = indegosample,  
        method = "lm",  
        trControl = fitControl,  
        na.action = na.pass)

reg.cv

```
When we work with larger data sets, we cannot examine each value to see if there is an outlier or some outliers, or if all errors are systematically higher. Looking at the ratio of MAE to RMSE can help us understand if there are larger but less common errors. 
The closer the model predictions are to the observations, the smaller the MSE will be.

## 7. Conclusion

This model could help indicate where and when the high demand would occur. A bike re-balancing plan could be made based on this model's result, which has small errors but is overall beneficial. In regards to the re-balancing plan, the algorithm predicts hourly and daily patterns so well that it is possible to predict when and how an individual will arrive at a particular location. As a result, we will be able to incentivize users based on their behavior and thus manage the supply and demand for bikes. The reward program for re-balancing can give some credits to riders who park their shared bikes in predicted high-demand areas during the morning and evening peak commute periods on weekdays. Each week can be adjusted based on the previous week, while the same time period from the previous year or two years can also be used, as seasons, weather, and holidays can also be referenced. In the future, if there are other factors that raise attention, we could also add them to the algorithm and test them to see the accuracy and generalization. 
