# import data
setwd("~/Desktop/590/final")
require(foreign)
require(nnet)
require(ggplot2)
require(reshape2)
library(lubridate)
library(dplyr)
library(sf)
library(tidyverse)
library(tidyr)
library(ggmap)
library(stplanr)
library(tmap)
library(caret)
library(stringr)
library(viridis)
require(gridExtra)
library(kableExtra)
library(forestmangr)
library(MASS)
options(scipen=99)
Theme1 <- theme(legend.box = "horizontal",
plot.title = element_text(size=12),
plot.subtitle = element_text(size=8),
axis.title.x = element_text(size = 4))
Theme2 <- theme(plot.title = element_text(size=12),
plot.subtitle = element_text(size=9))
palette3 <- c("#bdd7e7","#6baed6","#08519c")
palette4 <- c("#D2FBD4","#92BCAB","#527D82","#123F5A")
palette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
mapTheme <- theme(plot.title =element_text(size=12),
plot.subtitle = element_text(size=8),
plot.caption = element_text(size = 6),
axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
panel.background=element_blank(),
panel.border=element_blank(),
panel.grid.major=element_line(colour = 'transparent'),
panel.grid.minor=element_blank(),
legend.direction = "vertical",
legend.position = "right",
plot.margin = margin(1, 1, 1, 1, 'cm'),
legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))
plotTheme <- theme(
plot.title =element_text(size=12),
plot.subtitle = element_text(size=8),
plot.caption = element_text(size = 6),
axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
axis.text.y = element_text(size = 10),
axis.title.y = element_text(size = 10),
panel.background=element_blank(),
plot.background=element_blank(),
panel.grid.major=element_line(colour="#D0D0D0",size=.2),
axis.ticks=element_blank())
substrRight <- function(x, n){
substr(x, nchar(x)-n+1, nchar(x))}
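# e.g. substrRight("0087812005", 7) returns "7812005"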
options(scipen=50, digits=10)
stations <- read.csv("/Users/zhangjing/Desktop/590/final/stations.csv")%>%
st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
mutate(station_id = as.numeric(str_sub(URI, 31, 39)))%>%
dplyr::select(-alternative.fr,-alternative.nl,-alternative.de, -alternative.en, -country.code, -URI)
rail <- st_read("/Users/zhangjing/Desktop/590/final/source/belgium-railways-shape/railways.shp")%>%
filter(type=="rail")
line <- st_read("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/line_info.csv")
#belgium_basemap <- get_stamenmap(bbox = c(left = 2.739258, bottom = 49.563228, right = 6.551513, top = 51.695260), zoom = 11)
trains <-
read.csv("/Users/zhangjing/Desktop/590/final/trains_train.csv") %>%
mutate(from = as.numeric(as.character(from)),
to = as.numeric(as.character(to))) %>%
left_join(dplyr::select(stations, station_id, avg_stop_times,name), by = c("from" = "station_id")) %>%
st_sf() %>% mutate(from.X = st_coordinates(.)[,1],
from.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
left_join(dplyr::select(stations, station_id, avg_stop_times), by = c("to" = "station_id")) %>%
st_sf() %>% mutate(to.X = st_coordinates(.)[,1],
to.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
mutate(distance = sqrt((from.X - to.X)^2 + (from.Y - to.Y)^2), # straight-line distance in lon/lat degrees
stop_times_from = avg_stop_times.x,
stop_times_to = avg_stop_times.y) %>%
dplyr::select(-avg_stop_times.x, -avg_stop_times.y)%>%
arrange(-distance)
trains$occ2 <- relevel(trains$occupancy, ref = "low")
trains$OD <- paste (trains$from, trains$to, sep= "-")
trains$time <- parse_date_time(trains$time, '%I:%M:%S %p')
trains$time <- substr(trains$time, 12, 19)
trains$date_time <- as.POSIXct(paste(trains$date, trains$time), format="%Y/%m/%d %H:%M:%S")
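# e.g. a raw value like "11:32:11 AM" parses to the POSIXct "0000-01-01 11:32:11",
# substr(..., 12, 19) keeps the 24-hour clock time "11:32:11", and pasting it
# onto the date yields the full date_time stamp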
str(trains)
trains <-
trains %>%
mutate(veh = substr(vehicle, 1, 1),
interval60 = floor_date(ymd_hms(date_time), unit = "hour"),
interval15 = floor_date(ymd_hms(date_time), unit = "15 mins"),
week = week(interval60),
dotw = wday(interval60, label = TRUE),
hour = as.numeric(substr(time, 1, 2)),
weekend = ifelse(dotw == "Sat" | dotw == "Sun", 1, 0),
mor.jam = ifelse(hour >= 6 & hour <= 10 & weekend == 0, 1, 0),
eve.jam = ifelse(hour >= 15 & hour <= 19 & weekend == 0, 1, 0),
hour = factor(hour),
weekend = factor(weekend),
from = factor(from),
to = factor(to))
new_t <- trains%>%
mutate(new = 1)%>%
dplyr::select(OD, new)%>%
group_by(OD) %>%
summarise( count = sum (new))
new_t <- separate(new_t,OD,into=c("from","to"), sep="-")
new_t <- subset(new_t, new_t$to!="1000000" & new_t$from!="1000000" & new_t$to!="0" & !is.na(new_t$to) & !is.na(new_t$from) & new_t$to!="NA" & new_t$from!="NA")
new_t <- subset(new_t, new_t$to!=new_t$from)
new_t$from <- as.numeric(as.character(new_t$from))
new_t$to <- as.numeric(new_t$to)
station_extract <- stations%>%
mutate(to = station_id)%>%
dplyr::select(to)
z <- station_extract
# build straight desire lines between each O-D pair from the flow table and station points
l <- od2line(flow = new_t, zones = z)
l$OD <- paste (l$from, l$to, sep= "-")
try <- merge(l,trains, by = "OD", all.x=T, all.y=F)
g <- ggplot(try)+
#geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2 )+
geom_sf(data=stations, size = 1,inherit.aes = FALSE)+
geom_sf(data=try, aes(color = occupancy, lwd = count), inherit.aes = FALSE)+
#geom_sf(stat_order_sf,aes(color = vehicle_id), size=0.3)+
#theme(legend.position="none")+
ylim(min(49.5), max(51.5))+ #lat
xlim(min(2.5), max(6.1)) #long
train_sf <- merge(l[,c("geometry","OD")], trains, by = "OD")
# read the eight semi-monthly weather files and aggregate each one to hourly,
# station-level records with a single helper instead of repeating the pipeline
weather_urls <- paste0(
"https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/",
c("weather_data_july_1.csv", "weather_data_july_2.csv",
"weather_data_aug_1.csv", "weather_data_aug_2.csv",
"weather_data_sep_1.csv", "weather_data_sep_2.csv",
"weather_data_oct_1.csv", "weather_data_oct_2.csv"))
process_weather <- function(url) {
read.csv(url) %>%
mutate(interval60 = ymd_h(substr(date_time, 1, 13))) %>%
group_by(interval60, station_name) %>%
dplyr::summarize(Temperature = max(temperature),
Humidity = max(humidity),
Wind_Speed = max(windspeed)) %>%
mutate(station_date = paste(station_name, interval60, sep = "-"))
}
weather <- do.call(rbind, lapply(weather_urls, process_weather))
trains$station_date <- paste (trains$name, trains$interval60, sep= "-")
trains <- merge(x=trains,y=weather[,c("station_date","Temperature","Humidity","Wind_Speed")], by = "station_date", all.x=TRUE)
# impute missing weather (station-hours without a weather record) as 0
trains$Temperature[is.na(trains$Temperature)] <- 0
trains$Humidity[is.na(trains$Humidity)] <- 0
trains$Wind_Speed[is.na(trains$Wind_Speed)] <- 0
Service oversupply and shortage are persistent problems in transportation operations. In train services, oversupply increases operating costs against limited revenue and therefore reduces profit. In a service shortage (demand exceeds supply), most trains become crowded, which considerably harms the passenger experience and damages the company's reputation. The company also loses potential passengers, and their ticket fares, if trains stay crowded: riders who cannot board are forced onto other means of transportation.
Thus, we want to provide a smart train-occupancy prediction tool for transportation planners that helps them correct train service provision across the whole region according to current demand. Occupancy prediction is not new, but most existing tools target train passengers, letting them check the occupancy level before buying a ticket. Planners, meanwhile, often treat the origin station's performance or features as the main criterion for a trip's occupancy, when the situation is much more complex: occupancy may also be affected by the destination station, weather, serving line, departure date and time, and so on. Moreover, the prediction products that do exist for transportation planners care only about accuracy. Unlike them, we also conducted a cost-benefit analysis of our model, which is an honest way to show users that almost all predictions contain errors, how much a wrong prediction costs, and how much a correct prediction earns (the latter should always exceed the former).
The core algorithm of the website is a prediction model that learns from past train occupancy records and accurately predicts future occupancy at three levels: low, medium and high. This part briefly introduces the method framework and the data we use.
This diagram shows the framework of the process.
traininfo <- read.csv("/Users/zhangjing/Desktop/590/final/train.csv")
kable(traininfo,
caption = "Train trip data") %>%
kable_styling("striped", full_width = F)
Variable_Name | Description |
---|---|
date | the data of the trip |
time | the departure time |
from | the id of departure station |
to | the id of arrival station |
vehicle | vehicle type |
occupancy | the occupancy of the train |
stationinfo <- read.csv("/Users/zhangjing/Desktop/590/final/stationinfo.csv")
kable(stationinfo,
caption = "Station data") %>%
kable_styling("striped", full_width = F)
Variable_Name | Description |
---|---|
URI | link to the connection information of the station |
Name | the station name |
longitude | the longitude of the station |
latitude | the latitude of the station |
avg_stop_times | the average stop times at the station |
lineinfo <- read.csv("/Users/zhangjing/Desktop/590/final/lineinfo.csv")
kable(lineinfo,
caption = "Train line data") %>%
kable_styling("striped", full_width = F)
Variable_Name | Description |
---|---|
vehicle id | the id of the train |
vehicle type | the type of vehicle |
nr_of_stops | the number of stops |
stopping_station_ids | id of the stopping stations |
grid.arrange(
ggplot(trains, aes(interval60,Humidity)) + geom_line() +
labs(title="Humidity", x="Hour", y="Humidity") + plotTheme,
ggplot(trains, aes(interval60,Wind_Speed)) + geom_line() +
labs(title="Wind Speed", x="Hour", y="Wind Speed") + plotTheme,
ggplot(trains, aes(interval60,Temperature)) + geom_line() +
labs(title="Temperature", x="Hour", y="Temperature") + plotTheme,
top="Weather Data - July~October, 2016")
In total there are 2250 trips and 859 unique origin-destination pairs (O-D pairs). For example, "8814001-8731901" is one O-D pair, meaning the trip runs FROM station 8814001 TO station 8731901. However, we cannot put every O-D pair directly into the model, which would hurt its generalizability. Therefore, we grouped the trains data by O-D pair and occupancy level and found 58 O-D pairs that have at least 4 trips (count >= 4) and a dominant share of low, medium or high occupancy (percent > 0.5). That is, for each selected O-D pair, more than 50% of its trips fall in one occupancy level. For example, in London, a tube trip FROM Oxford Circus Station (on the main shopping street) TO Holborn Station (near central tourist attractions) is always crowded and in high occupancy, while a tube trip in a suburban area is likely in low occupancy since it serves only residents who live around the station. This step therefore selects the O-D pairs whose occupancy level is characteristic.
We add one more column to the trains dataset, OD_occ: if a trip's O-D pair belongs to the selected pairs (as plotted), it is coded as its original O-D pair; otherwise it is coded as "1", meaning we found no distinctive occupancy characteristics for that pair.
# occ with OD trip (percent)
occ_OD <- trains%>%
group_by(OD)%>%
count(occ2)%>%
mutate(percent = n/sum(n),
count = n)%>%
filter(n>=4 & percent >0.5)%>%
dplyr::select(-n)
occ_OD_df <- as.data.frame(occ_OD)
head(occ_OD[order(-occ_OD_df$count, -occ_OD_df$percent),],10)
## # A tibble: 10 x 4
## # Groups: OD [10]
## OD occ2 percent count
## <chr> <fct> <dbl> <int>
## 1 8892007-8891009 low 0.512 22
## 2 8833001-8812005 high 0.618 21
## 3 8813003-0 medium 0.516 16
## 4 8812005-8831765 low 1 14
## 5 8893013-8893401 low 1 13
## 6 8821006-8821121 low 0.812 13
## 7 8813003-8892007 low 0.571 12
## 8 8841202-8812005 high 0.846 11
## 9 8896149-8813003 medium 0.786 11
## 10 8811106-8811916 low 1 10
# merge l with occ_OD
l2 <- merge(l, occ_OD, by="OD", all.x=TRUE)
# plot the trip with high percentage of certain occupancy
l2$occ2 <- factor(l2$occ2, levels=c("low","medium","high"))
train_sf$OD_occ <- factor(ifelse(train_sf$OD %in% occ_OD$OD, train_sf$OD, "1"))
l2%>%
filter(!is.na(l2$occ2))%>%
mutate(per = percent*10)%>%
ggplot()+
geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2 )+
geom_sf(data=stations, size = 1,
inherit.aes = FALSE, col="grey" )+
ylim(min(49.5), max(51.5))+ #lat
xlim(min(2.5), max(6.1)) +#long
geom_sf(aes(color = -percent), lwd=1.5, fill = "transparent")+
scale_colour_viridis(direction = 1, discrete = FALSE, option = "C")+
facet_wrap(~occ2)+
labs(title="Selected O-D paires and their occupany ",
subtitle="Filiter: OD paires have 4+ trips, share of certain occupancy level >50%")
Apart from the O-D pair, the origin and the destination themselves also affect a trip's occupancy level. For example, in Beijing, short tube trips FROM Beijing South Station (the biggest railway station in Beijing) are always highly occupied, whatever the direction or line; short tube trips TO Beijing South Station are also always crowded, since considerable demand gathers at and disperses from this station. To explore the relationship between trip occupancy and the origin or destination station in our data, we grouped the trains dataset by FROM/TO station and the three occupancy levels. As the figure shows, the FROM and TO stations considerably affect the occupancy level. Specifically, 100% of trips departing FROM or heading TO station 8821964 are in low occupancy, and a number of stations share this feature. We also found that for many stations, such as 8831401, about 67% of trips departing FROM the station are highly occupied while 70% of trips heading TO it are in low occupancy. The impact of a station as origin thus differs from its impact as destination, so we put both the FROM and TO stations into our model.
### from
occ_from <- trains%>%
group_by(from)%>%
count(occ2)%>%
mutate(percent = n/sum(n),
count = n,
station_id= from)%>%
# filter(n>=4 & percent >0.5)%>%
dplyr::select(-n)
#merge with sf
stations <- stations%>%
mutate(Geom = gsub('[(c)°]', '', geometry)) %>%
separate(col = Geom, into = c('lon', 'lat'), sep = '\\,') # WKT points are (longitude, latitude)
occ_from <- merge(stations[c("lon","lat","station_id")], occ_from, by= "station_id",all.y=T)
occ_from_df <- as.data.frame(occ_from)%>%
dplyr::select(-geometry)%>%
round_df(3)
head(occ_from_df[order( -occ_from_df$count,-occ_from_df$percent),],10)
## station_id lon lat from occ2 percent count
## 337 8892007 3.710675 51.035896 8892007 low 0.359 79
## 336 8892007 3.710675 51.035896 8892007 medium 0.341 75
## 64 8812005 4.360846 50.859663 8812005 low 0.374 67
## 201 8833001 4.715866 50.88228 8833001 high 0.537 66
## 338 8892007 3.710675 51.035896 8892007 high 0.300 66
## 63 8812005 4.360846 50.859663 8812005 high 0.358 64
## 86 8813003 4.356801 50.845658 8813003 medium 0.375 60
## 85 8813003 4.356801 50.845658 8813003 high 0.344 55
## 65 8812005 4.360846 50.859663 8812005 medium 0.268 48
## 122 8821006 4.421101 51.2172 8821006 low 0.584 45
#### to
occ_to <- trains%>%
group_by(to)%>%
count(occ2)%>%
mutate(percent = n/sum(n),
count = n,
station_id= to)%>%
# filter(n>=4 & percent >0.5)%>%
dplyr::select(-n)
#merge with sf
occ_to <- merge(stations[c("geometry","station_id")], occ_to, by= "station_id",all.y=T)
grid.arrange(
ggplot(occ_from)+
geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2, alpha=0.6)+
geom_sf(data=occ_from, aes(color = percent, size = count), alpha=0.5)+
scale_colour_viridis(direction = -1, discrete = F, option = "C")+
facet_wrap(~occ2)+
ylim(min(49.3), max(51.5))+ #lat
xlim(min(2.5), max(6.3))+
labs(title="Plot of trip occupancy FROM the station",
subtitle="Point size = count of trips FROM this station\nPoint color = share of certain occupancy level")
+Theme1+ guides(size = FALSE)
,
ggplot(occ_to)+
geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2, alpha=0.2)+
geom_sf(data=occ_to, aes(colour = percent, size = count), alpha=0.5)+
scale_colour_viridis(direction = -1, discrete = F, option = "C")+
facet_wrap(~occ2)+
ylim(min(49.3), max(51.5))+ #lat
xlim(min(2.5), max(6.3))+
labs(title="Plot of trip occupancy TO the station",
subtitle="Point size = count of trips TO this station\nPoint color = share of certain occupancy level")
+Theme1+guides(size = FALSE)
,
ncol=1)
Besides O-D pairs and the origin and destination stations, we also considered the serving line of each trip (called vehicle in the dataset). Similar to the previous method, we grouped the data by line and selected the lines for which at least 75% of trips share one occupancy level (low, medium or high) and which have more than 4 trips, since lines with too few trips are too idiosyncratic to help the model.
# occ with veh
occ_veh <- trains%>%
group_by(vehicle)%>%
count(occ2)%>%
mutate(percent = n/sum(n),
vehicle_id = vehicle,
count = n)%>%
filter(n>=5, percent>=0.75)%>%
dplyr::select(-n)
occ_veh_df <- as.data.frame(occ_veh)%>%
round_df(3)
head(occ_veh_df[order( -occ_veh_df$count,-occ_veh_df$percent),],10)
## vehicle occ2 percent vehicle_id count
## 24 IC736 low 1.000 IC736 17
## 2 8015 low 1.000 8015 14
## 1 1828 low 0.929 1828 13
## 15 IC3631 high 0.812 IC3631 13
## 13 IC3432 low 1.000 IC3432 12
## 19 IC4237 low 1.000 IC4237 12
## 25 L557 low 0.857 L557 12
## 29 P7444 high 0.857 P7444 12
## 38 undefined medium 1.000 undefined 11
## 36 S53586 low 0.917 S53586 11
# separate and clean the station list
line$nr_of_stops <- as.numeric(as.character(line$nr_of_stops))
sep_stat_line <- as.data.frame(str_split_fixed(line$stopping_station_ids, ",",34))
sep_stat_line <- sep_stat_line %>%
mutate_all(funs(gsub("'", "", .)))%>%
mutate_all(funs(gsub("]", "", .)))%>%
mutate(V1 = substr(V1, 2, 10))
# join station with line info
sep_stat_line <- cbind(line,sep_stat_line)
sep_stat_line <-
sep_stat_line%>%
dplyr::select(-Unnamed..0, -Unnamed..0.1)
# wide to long
v_stat_line <- reshape(sep_stat_line,
varying=c(6:39),
timevar = "Order",
idvar=c("vehicle_id"),
v.names="station_id",
direction="long")
v_stat_line <- v_stat_line%>%mutate_all(funs(gsub(" ", "", .)))%>%
mutate(station_id = substr(station_id,3,9))%>%
filter(station_id!="")%>%
dplyr::select(-stopping_station_ids)
# add geometry to each station
stat_order_sf <- merge(stations[c("avg_stop_times", "station_id","geometry")], v_stat_line, by = "station_id",all=T)
# if the % of certain type of occupancy is larger than 0.75 (and count >=5), then 1, otherwise 0
stat_order_sf$occ_veh<- ifelse(stat_order_sf$vehicle_id %in% occ_veh$vehicle_id,1,0)
# only select the stations exist in trains
stat_order_sf <- subset(stat_order_sf, stat_order_sf$station_id %in% trains$from | stat_order_sf$station_id %in% trains$to)
# merge occupancy into station order_sf
veh_sf <- merge(stat_order_sf,occ_veh, by = "vehicle_id", all.x=T)
veh_sf <- veh_sf%>%
filter(!is.na(occ2))
ggplot()+
geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2, alpha=0.2)+
geom_sf(data=stat_order_sf, color = "grey",size=1, alpha=0.4)+
geom_sf(data=veh_sf,aes(color = vehicle_id, group = vehicle_id), size=1)+
facet_wrap(~occ2)+
ylim(min(49.5), max(51.5))+ #lat
xlim(min(2.5), max(6.1))+
theme(legend.position="bottom")+
labs(title="Relationship between occupancy and LINEs")
# subtitle="Relative to test set points (in black)")
train_sf$vehicle <- as.character(train_sf$vehicle)
train_sf$veh_occ <- as.factor(ifelse(train_sf$vehicle %in% occ_veh$vehicle, train_sf$vehicle, "1"))
(1) Daily pattern. The following plots show the distribution of trip volume by occupancy level across different times of the day.
ggplot(trains %>%mutate(hour = hour(date_time)))+
geom_freqpoly(aes(hour), binwidth = 1)+
labs(title="Train occupancy, by hour of the day",
x="Hour",
y="Train Counts")+
facet_wrap(~occupancy)+
plotTheme
At about 7am and 4pm there were more trips than at other times, and the counts of high, medium and low occupancy trips all increased. Yet there were distinct differences: at about 7am there were fewer low occupancy trips than high and medium occupancy trips, while at about 12:30pm, and again at 4pm, there were more low occupancy trips.
ggplot(trains %>%mutate(hour = hour(date_time)))+
geom_freqpoly(aes(hour, color = dotw), binwidth = 1)+
labs(title="Train occupancy, by day of the week",
x="Hour",
y="Train Counts")+
facet_wrap(~occupancy)+
plotTheme
Occupancy also varied across days of the week. At 7am on Sunday there were more high occupancy trips; at 4pm on Friday there were more high and low occupancy trips.
ggplot(trains %>%mutate(week = week(interval60)))+
geom_freqpoly(aes(week), binwidth = 1)+
labs(title="Train occupancy, by week",
x="Week",
y="Train Counts")+
facet_wrap(~occupancy)+
plotTheme
The plots above suggest that between week 30 and week 35 there were more low occupancy trips, while between week 37 and week 45 there were more high occupancy trips.
trains$mor.jam[trains$mor.jam==0] <- 'No'
trains$mor.jam[trains$mor.jam==1] <- 'Yes'
trains$eve.jam[trains$eve.jam==0] <- 'No'
trains$eve.jam[trains$eve.jam==1] <- 'Yes'
trains %>%
dplyr::select(occupancy, mor.jam,eve.jam) %>%
gather(Variable, value, -occupancy) %>%
count(Variable, value, occupancy)%>%
ggplot(., aes(value, n, fill = occupancy)) +
geom_bar(position = "dodge", stat="identity") +
facet_wrap(~Variable, scales="free") +
scale_fill_viridis(option= "plasma", discrete=TRUE) +
labs(x="Is rush hour", y="Value",
title = "rush hour's associations with the occupancy")
The plots above indicate that in non-rush hours, there were more low occupancy trips; in evening rush hours, there were fewer medium occupancy trips.
train <-
read.csv("/Users/zhangjing/Desktop/590/final/trains_train.csv") %>%
mutate(from = as.numeric(as.character(from)),
to = as.numeric(as.character(to))) %>%
left_join(dplyr::select(stations, station_id), by = c("from" = "station_id")) %>%
st_sf() %>% mutate(from.X = st_coordinates(.)[,1],
from.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
left_join(dplyr::select(stations, station_id), by = c("to" = "station_id")) %>%
st_sf() %>% mutate(to.X = st_coordinates(.)[,1],
to.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
mutate(distance = sqrt((from.X - to.X)^2 + (from.Y - to.Y)^2)) %>%
arrange(-distance)
train1 <- filter(train,is.na(train$distance)==F)
train1 <-mutate(train1, distance.cat = case_when(
distance <0.27 ~ "less than 0.27",
distance >= 0.27 & distance < 0.54 ~ "0.27-0.54",
distance >= 0.54 & distance < 0.81 ~ "0.54-0.81",
distance >= 0.81 & distance < 1.08 ~ "0.81-1.08",
distance >= 1.08 & distance < 1.35 ~ "1.08-1.35",
distance >= 1.35 & distance < 1.62 ~ "1.35-1.62",
distance >= 1.62 & distance < 1.89 ~ "1.62-1.89",
distance >= 1.89 & distance < 2.16 ~ "1.89-2.16",
distance >= 2.16 & distance < 2.43 ~ "2.16-2.43",
distance >= 2.43 & distance < 2.7 ~ "2.43-2.7",
distance >2.7 ~ "over 2.7"))
train1$occupancy<-factor(train1$occupancy, levels=c("high", "medium", "low"))
train1 %>%
dplyr::select(occupancy, distance.cat) %>%
gather(Variable, value, -occupancy) %>%
count(Variable, value, occupancy)%>%
ggplot(., aes(value, n, fill = occupancy)) +
geom_bar(position = "dodge", stat="identity") +
facet_wrap(~Variable, scales="free") +
scale_fill_viridis(option= "plasma", discrete=TRUE) +
scale_x_discrete(limits=c("less than 0.27","0.27-0.54","0.81-1.08","1.08-1.35",
"1.35-1.62","1.62-1.89","1.89-2.16","2.16-2.43","2.43-2.7","over 2.7"))+
labs(x="Distance between from and to stations", y="Value",
title = "Distance's association with the occupancy") +
theme(axis.text.x = element_text(angle = 20, hjust = 1))
According to the plot above, when the departure and arrival stations were extremely close to each other, there were more low occupancy trips; as the distance increased, the number of low occupancy trips dropped sharply. Besides, when the distance was between 1.08 and 1.35, there were slightly more high occupancy trips. Thus, the distribution of occupancy levels changed across distance bands.
station_1 <- read.csv("/Users/zhangjing/Desktop/590/final/stations.csv")
station_1$station_id <- substrRight(as.character(station_1$URI), 9)
trains2 <- read.csv("/Users/zhangjing/Desktop/590/final/trains_train2.csv")
dat <- dplyr::select(station_1,station_id,avg_stop_times)
colnames(dat)[colnames(dat)=="station_id"] <- "from"
dat <- merge(x=dat,y=station_1[,c("station_id","avg_stop_times")], by = "avg_stop_times", all.x=TRUE)
colnames(dat)[colnames(dat)=="station_id"] <- "to"
trains2 <- merge(x=trains2, y=dat[,c("from","avg_stop_times")], by = "from", all.x = FALSE, all.y=FALSE, sort = FALSE)
colnames(trains2)[colnames(trains2)=="avg_stop_times"] <- "fromstoptime"
trains2 <- merge(x=trains2, y=dat[,c("to","avg_stop_times")], by = "to", all.x = FALSE, all.y=FALSE, sort = FALSE)
colnames(trains2)[colnames(trains2)=="avg_stop_times"] <- "tostoptime"
trains2$occupancy<-factor(trains2$occupancy, levels=c("high", "medium", "low"))
train_df<-as.data.frame(trains2)
trains2 <-mutate(train_df, fromstoptime.cat = case_when(
fromstoptime <68.7 ~ "less than 68.7",
fromstoptime >= 68.7 & fromstoptime < 137.4 ~ "68.7-137.4",
fromstoptime >= 137.4 & fromstoptime < 206.1 ~ "137.4-206.1",
fromstoptime >= 206.1 & fromstoptime < 274.8 ~ "206.1-274.8",
fromstoptime >= 274.8 & fromstoptime < 343.5 ~ "274.8-343.5",
fromstoptime >= 343.5 & fromstoptime < 412.2 ~ "343.5-412.2",
fromstoptime >= 412.2 & fromstoptime < 480.9 ~ "412.2-480.9",
fromstoptime >= 480.9 & fromstoptime < 549.6 ~ "480.9-549.6",
fromstoptime >= 549.6 & fromstoptime < 618.3 ~ "549.6-618.3",
fromstoptime >= 618.3 & fromstoptime < 687 ~ "618.3-687"))
trains2 <-mutate(trains2, tostoptime.cat = case_when(
tostoptime <68.7 ~ "less than 68.7",
tostoptime >= 68.7 & tostoptime < 137.4 ~ "68.7-137.4",
tostoptime >= 137.4 & tostoptime < 206.1 ~ "137.4-206.1",
tostoptime >= 206.1 & tostoptime < 274.8 ~ "206.1-274.8",
tostoptime >= 274.8 & tostoptime < 343.5 ~ "274.8-343.5",
tostoptime >= 343.5 & tostoptime < 412.2 ~ "343.5-412.2",
tostoptime >= 412.2 & tostoptime < 480.9 ~ "412.2-480.9",
tostoptime >= 480.9 & tostoptime < 549.6 ~ "480.9-549.6",
tostoptime >= 549.6 & tostoptime < 618.3 ~ "549.6-618.3",
tostoptime >= 618.3 & tostoptime < 687 ~ "618.3-687"))
trains2 %>%
dplyr::select(occupancy, fromstoptime.cat,tostoptime.cat) %>%
gather(Variable, value, -occupancy) %>%
count(Variable, value, occupancy)%>%
ggplot(., aes(value, n, fill = occupancy)) +
geom_bar(position = "dodge", stat="identity") +
facet_wrap(~Variable, scales="free") +
scale_fill_viridis(option= "plasma", discrete=TRUE) +
scale_x_discrete(limits=c("less than 68.7", "68.7-137.4", "137.4-206.1", "206.1-274.8","274.8-343.5", "343.5-412.2", "412.2-480.9", "480.9-549.6", "549.6-618.3", "618.3-687"))+
labs(x="Trains' Stop time", y="Value",
title = "From and to station's stop time associations with the occupancy")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
According to the plots above, there are differences among the stop-time groups. For example, when the FROM station's stop time was under 137.4 there were clearly more low occupancy trips than high or medium ones, while the three levels were at roughly the same amount once the stop time exceeded 137.4; and when the TO station's stop time was between 274.8 and 343.5 there were more low occupancy trips.
The following plots show the relationship between weather and occupancy:
grid.arrange(
ggplot(trains)+
geom_freqpoly(aes(Temperature, color = occupancy))+
labs(title="Train occupancy, by temperature",
x="Temperture",
y="Train Counts"),
ggplot(trains)+
geom_freqpoly(aes(Humidity, color = occupancy))+
labs(title="Train occupancy, by humidity",
x="Humidity",
y="Train Counts"),
ggplot(trains)+
geom_freqpoly(aes(Wind_Speed, color = occupancy))+
labs(title="Train occupancy, by Wind_Speed",
x="Wind_Speed",
y="Train Counts"),
ncol=1)
The first plot shows that at almost any temperature there were more low occupancy trips, but between 12.5 and 15 degrees there were more high occupancy trips. The second plot shows that as humidity increased, the counts of high, medium and low occupancy trips all increased together, so it does not necessarily support the hypothesis that humidity influences occupancy. The third plot shows no strong relationship between wind speed and occupancy, although at wind speeds of 4 to 7 and 17 to 23 there were noticeably more low occupancy trips.
This project uses multinomial logistic regression to train the model. The dependent variable (y) is a three-category factor: train occupancy at low, medium and high levels. When predicting, the model computes the probability of each occupancy level and returns the level with the highest probability. For example, for result type high-low (the observed occupancy is high but the model wrongly predicted low), the probability the model assigned to "low" was higher than the probability it assigned to "high", so the model returned low. The prediction results on the test set (testProbs_table) below may make the mechanism clearer.
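As a minimal illustration with made-up probabilities (not values from our data), the returned class is simply the level with the largest predicted probability:
# hypothetical probabilities for one trip, in the shape predict(m1, ..., "probs") returns
p <- c(low = 0.45, medium = 0.20, high = 0.35)
names(p)[which.max(p)] # "low" -- the level predict(m1, ..., "class") would return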
trains <-
trains %>%
mutate( mor.jam = factor(mor.jam),
eve.jam = factor(eve.jam))
train_sf$veh <- as.factor(train_sf$veh)
train_sf$week <- as.factor(train_sf$week)
set.seed(1123)
trainIndex <- createDataPartition(
y = paste(train_sf$occ2, train_sf$to, train_sf$veh_occ, train_sf$OD_occ, train_sf$from, train_sf$dotw),
p = .75,
list = FALSE,
times = 1)
yTrain <- train_sf[ trainIndex,]
yTest <- train_sf[-trainIndex,]
m1 <- multinom(occ2 ~ week + dotw + veh + hour + mor.jam + eve.jam + distance + from + to + OD_occ + veh_occ + stop_times_from + stop_times_to, data = yTrain, MaxNWts =10000000)
## # weights: 1980 (1318 variable)
## initial value 1785.244969
## iter 10 value 1534.439206
## iter 20 value 1087.797236
## iter 30 value 961.133783
## iter 40 value 887.819586
## iter 50 value 863.277509
## iter 60 value 849.160826
## iter 70 value 843.664578
## iter 80 value 841.572699
## iter 90 value 840.847925
## iter 100 value 840.535775
## final value 840.535775
## stopped after 100 iterations
testProbs <- data.frame(Observe = as.factor(yTest$occ2),
Predict = predict(m1, newdata=yTest, "class"),
pre = predict(m1, newdata=yTest, "probs"),
from = yTest$from,
to = yTest$to,
OD = yTest$OD,
week = yTest$week,
hour = yTest$hour,
interval60 = yTest$interval60,
date = yTest$date)
testProbs <- merge(l[c("geometry","OD")], testProbs, by="OD", all.y=T)
testProbs <- testProbs[c(7,8,9, 10,11,12,1,2,3,4,5,6,13)]
testProbs_table <- as.data.frame(testProbs)%>%
round_df(3)%>%
dplyr::select(from, to, date, Observe, Predict, pre.low, pre.high, pre.medium)
head(testProbs_table,10)
## from to date Observe Predict pre.low pre.high pre.medium
## 1 8811007 8812005 2016/9/30 low low 0.799 0.000 0.201
## 2 8811007 8812005 2016/10/28 low low 0.836 0.000 0.164
## 3 8811007 8812005 2016/10/13 low low 0.882 0.000 0.118
## 4 8811106 8811916 2016/10/14 low low 1.000 0.000 0.000
## 5 8811106 8811916 2016/10/11 low low 1.000 0.000 0.000
## 6 8811106 8811916 2016/10/4 low low 1.000 0.000 0.000
## 7 8811106 8811916 2016/10/10 low low 1.000 0.000 0.000
## 8 8811106 8811916 2016/10/6 low low 1.000 0.000 0.000
## 9 8811189 8813003 2016/10/24 medium medium 0.091 0.132 0.777
## 10 8811189 8813003 2016/7/30 high low 0.734 0.036 0.230
Before the accuracy and generalizability analysis, we evaluate the cost-benefit first. The prediction result affects the profit of the train system, since planners will adjust train services based on the predictions.
As an algorithm for transportation organizations, we hope to help them reduce expenditure while meeting the needs of the transportation system. We classify the nine prediction outcomes and calculate the income and cost each one brings relative to the observed occupancy.
If we predict a route correctly, we expect the algorithm to improve total revenue. We make the following assumptions about the income and cost attached to each occupancy prediction:
- The average ticket fare is €6 per passenger.
- The capacity of each trip is 400 (80 seats * 5 coaches).
- Increasing the capacity (adding a coach) costs €300.
- Each €300 of cost corresponds to a capacity increase of 100.
- Highly occupied trips may leave potential passengers behind, so we assume 100 additional passengers will board if we increase the capacity of such a trip.
Based on these assumptions, we established a profit calculation for each prediction outcome versus the observation (format: observed level-predicted level).
From the figure, the low-high, medium-high and high-low outcomes cause large losses to our overall profits, especially LOW-HIGH. This focuses our attention on these prediction types: making them more accurate directly reduces the error cost. Tests of these prediction types are discussed later.
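The per-trip profits can also be written compactly as a lookup keyed by observed_predicted level; this sketch encodes our assumed payoffs and is equivalent to the nested ifelse() chains in the code below:
# assumed payoffs (EUR per trip), keyed as Observed_Predicted
# High_Low: lose 100 boardings * 6 EUR fare, but save 300 EUR of capacity cost
# High_High: gain 100 * 6 EUR of fares, minus 300 EUR of added capacity cost
profit_per_count <- c(Low_Low = 300, Low_Medium = 0, Low_High = -300,
Medium_low = 300, Medium_Medium = 0, Medium_High = -300,
High_Low = -600 + 300, High_Medium = 0, High_High = 600 - 300)
unname(profit_per_count["High_Low"]) # -300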
testProbs_df <- as.data.frame(testProbs)
cost_benefit_table <-
testProbs_df %>%
na.omit() %>%
count(Observe, Predict) %>%
summarize(Low_Low = sum(n[Observe=='low' & Predict=='low']),
Low_Medium = sum(n[Observe=='low' & Predict=='medium']),
Low_High = sum(n[Observe=='low' & Predict=='high']),
Medium_low = sum(n[Observe=='medium' & Predict=='low']),
Medium_Medium = sum(n[Observe=='medium' & Predict=='medium']),
Medium_High = sum(n[Observe=='medium' & Predict=='high']),
High_Low = sum(n[Observe=='high' & Predict=='low']),
High_Medium = sum(n[Observe=='high' & Predict=='medium']),
High_High = sum(n[Observe=='high' & Predict=='high']),
SUM = sum(n)
) %>%
gather(Variable, Count)%>%
mutate(Income =
ifelse(Variable == "Low_Low", 0,
ifelse(Variable == "Low_Medium", 0,
ifelse(Variable == "Low_High", 0,
ifelse(Variable == "Medium_low", 0,
ifelse(Variable == "Medium_Medium", 0,
ifelse(Variable == "Medium_High", 0,
ifelse(Variable == "High_Low",'-(6*100)',
ifelse(Variable == "High_Medium", 0,
ifelse(Variable == "High_High", '6*100',
ifelse(Variable == "SUM",(-(6*100)+6*100) ,0))))))))))) %>%
mutate(Cost =
ifelse(Variable == "Low_Low", -300,
ifelse(Variable == "Low_Medium", 0,
ifelse(Variable == "Low_High", 300,
ifelse(Variable == "Medium_low", -300,
ifelse(Variable == "Medium_Medium", 0,
ifelse(Variable == "Medium_High", 300,
ifelse(Variable == "High_Low",-300,
ifelse(Variable == "High_Medium", 0,
ifelse(Variable == "High_High", 300,
ifelse(Variable == "SUM", 0,0))))))))))) %>%
mutate('Profits/per count'=
ifelse(Variable == "Low_Low", 0-(-300),
ifelse(Variable == "Low_Medium", 0,
ifelse(Variable == "Low_High", -300,
ifelse(Variable == "Medium_low", 0-(-300),
ifelse(Variable == "Medium_Medium", 0,
ifelse(Variable == "Medium_High", 0-(300),
ifelse(Variable == "High_Low",-6*100-(-300),
ifelse(Variable == "High_Medium", 0,
ifelse(Variable == "High_High", 6*100-(300),
ifelse(Variable == "SUM", ((0-(-300))+0-300+(0-(-300))-300+(-6*100-(-300))+6*100-(300)),0))))))))))) %>%
mutate('Total_Profits' =
ifelse(Variable == "Low_Low", Count * (0-(-300)),
ifelse(Variable == "Low_Medium", Count * (0-0),
ifelse(Variable == "Low_High", Count * (-(300)),
ifelse(Variable == "Medium_low", Count * (0-(-300)),
ifelse(Variable == "Medium_Medium", Count * (0-(0)),
ifelse(Variable == "Medium_High", Count * (0-(300)),
ifelse(Variable == "High_Low",Count*(-6*100-(-300)),
ifelse(Variable == "High_Medium", Count*(0-(0)),
ifelse(Variable == "High_High", Count*(6*100-(300)),
ifelse(Variable == "SUM", '####',0)))))))))))%>%
bind_cols(data.frame(Description = c(
"We predicted low demand with the low demand observation",
"We predicted low demand with the medium demand observation",
"We predicted low demand with the high demand observation",
"We predicted medium demand with the low demand observation",
"We predicted medium demand with the meidum demand observation",
"We predicted medium demand with the high demand observation",
"We predicted high demand with the low demand observation",
"We predicted high demand with the medium demand observation",
"We predicted high demand with the high demand observation",
"Total Profits")))%>%
mutate(Observe_Predict=Variable)
cost_benefit_table <- cost_benefit_table[c(8,2,3,4,5,6,7)]
cost_benefit_table$Total_Profits <- as.numeric(cost_benefit_table$Total_Profits)
cost_benefit_table$Total_Profits[cost_benefit_table$Observe_Predict=="SUM"] <- sum(cost_benefit_table$Total_Profits, na.rm=T)
head(cost_benefit_table,10)
## # A tibble: 10 x 7
## Observe_Predict Count Income Cost `Profits/per co… Total_Profits
## <chr> <int> <chr> <dbl> <dbl> <dbl>
## 1 Low_Low 151 0 -300 300 45300
## 2 Low_Medium 28 0 0 0 0
## 3 Low_High 43 0 300 -300 -12900
## 4 Medium_low 43 0 -300 300 12900
## 5 Medium_Medium 76 0 0 0 0
## 6 Medium_High 43 0 300 -300 -12900
## 7 High_Low 30 -(6*1… -300 -300 -9000
## 8 High_Medium 40 0 0 0 0
## 9 High_High 127 6*100 300 300 38100
## 10 SUM 581 0 0 0 61500
## # … with 1 more variable: Description <fct>
kable(cost_benefit_table,
caption = "Cost/Benefit Table")%>% kable_styling()
Observe_Predict | Count | Income | Cost | Profits/per count | Total_Profits | Description |
---|---|---|---|---|---|---|
Low_Low | 151 | 0 | -300 | 300 | 45300 | We predicted low demand with the low demand observation |
Low_Medium | 28 | 0 | 0 | 0 | 0 | We predicted low demand with the medium demand observation |
Low_High | 43 | 0 | 300 | -300 | -12900 | We predicted low demand with the high demand observation |
Medium_low | 43 | 0 | -300 | 300 | 12900 | We predicted medium demand with the low demand observation |
Medium_Medium | 76 | 0 | 0 | 0 | 0 | We predicted medium demand with the meidum demand observation |
Medium_High | 43 | 0 | 300 | -300 | -12900 | We predicted medium demand with the high demand observation |
High_Low | 30 | -(6*100) | -300 | -300 | -9000 | We predicted high demand with the low demand observation |
High_Medium | 40 | 0 | 0 | 0 | 0 | We predicted high demand with the medium demand observation |
High_High | 127 | 6*100 | 300 | 300 | 38100 | We predicted high demand with the high demand observation |
SUM | 581 | 0 | 0 | 0 | 61500 | Total Profits |
caret::confusionMatrix(testProbs$Observe, testProbs$Predict,
positive = "low")
## Confusion Matrix and Statistics
##
## Reference
## Prediction low high medium
## low 151 43 28
## high 30 127 40
## medium 43 43 76
##
## Overall Statistics
##
## Accuracy : 0.6092943
## 95% CI : (0.5682865, 0.6491851)
## No Information Rate : 0.3855422
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.4073666
##
## Mcnemar's Test P-Value : 0.1332086
##
## Statistics by Class:
##
## Class: low Class: high Class: medium
## Sensitivity 0.6741071 0.5962441 0.5277778
## Specificity 0.8011204 0.8097826 0.8032037
## Pos Pred Value 0.6801802 0.6446701 0.4691358
## Neg Pred Value 0.7966574 0.7760417 0.8377088
## Prevalence 0.3855422 0.3666093 0.2478485
## Detection Rate 0.2598967 0.2185886 0.1308090
## Detection Prevalence 0.3820998 0.3390706 0.2788296
## Balanced Accuracy 0.7376138 0.7030134 0.6654907
accu <- as.data.frame(table(testProbs$Observe,testProbs$Predict))%>%
mutate(Observe = Var1,
Predict = Var2,
count = Freq,
result_type = paste (Observe, Predict, sep= "-"))%>%
dplyr::select(-Var1, -Var2, -Freq)
# class-share percentages transcribed from the confusion matrix above
accu$percent <- as.numeric(c("68.7", "16.6", "23.2", "15.6", "62.8", "21.3", "15.6", "20.6","55.5"))
accu$Predict <- factor(accu$Predict, levels=c("low","medium","high"))
accu$Observe <- factor(accu$Observe, levels=c("low","medium","high"))
Figure: 3*3 confusion matrix & corresponding profit
In this part we adjust the model based on the cost-benefit and accuracy analysis above. As mentioned, the cost of the Low-High error (observed low but predicted high) is considerably high, so we modify the model to reduce this type of mistake and raise overall profit at the same time. We found that in most such cases the predicted probability of high occupancy is only slightly above the probability of low occupancy (a difference under 0.2), and the model returns high by default when in reality the level is low. We therefore manually reassign predictions from high to low whenever the gap between the high and low probabilities is 0.26 or less. The rationale is that a high prediction triggers extra operating cost, since the planner will increase the train's frequency or capacity to accommodate more passengers, and if the high prediction is wrong that cost is wasted. Although the adjustment introduces a few new wrong predictions, the model's total profit peaks at the 0.26 threshold.
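The 0.26 threshold can be checked with a small sweep; this is a sketch that assumes testProbs from above and the per-trip payoffs of the cost/benefit table (profit_at is a hypothetical helper, not part of the pipeline):
# recompute total profit for a candidate reassignment threshold th
profit_at <- function(th, probs = testProbs) {
pred <- ifelse(probs$Predict == "high" & probs$pre.high - probs$pre.low <= th,
"low", as.character(probs$Predict))
payoff <- c("low-low" = 300, "low-medium" = 0, "low-high" = -300,
"medium-low" = 300, "medium-medium" = 0, "medium-high" = -300,
"high-low" = -300, "high-medium" = 0, "high-high" = 300)
sum(payoff[paste(probs$Observe, pred, sep = "-")], na.rm = TRUE)
}
sapply(seq(0, 0.5, by = 0.02), profit_at) # the text above reports the peak at 0.26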
The adjusted cost-benefit table is shown below.
### adjust2
testProbs2 <- testProbs
# reassign a "high" prediction to "low" when P(high) - P(low) <= 0.26;
# note that ifelse() drops the factor class, returning "low" for reassigned rows
# and the underlying integer codes ("1"=low, "2"=high, "3"=medium) elsewhere
testProbs2$Predict2 <- factor(ifelse(testProbs2$Predict=="high" & testProbs2$pre.high-testProbs2$pre.low<=0.26, "low", testProbs2$Predict))
# map the integer codes back to their labels
testProbs2$Predict2 <- factor(ifelse(testProbs2$Predict2=="1"|testProbs2$Predict2=="low","low",
ifelse(testProbs2$Predict2=="2","high", "medium")))
testProbs2$Predict <- testProbs2$Predict2
testProbs_df2 <- as.data.frame(testProbs2)
cost_benefit_table2 <-
testProbs_df2 %>%
na.omit() %>%
count(Observe, Predict) %>%
summarize(Low_Low = sum(n[Observe=='low' & Predict=='low']),
Low_Medium = sum(n[Observe=='low' & Predict=='medium']),
Low_High = sum(n[Observe=='low' & Predict=='high']),
Medium_low = sum(n[Observe=='medium' & Predict=='low']),
Medium_Medium = sum(n[Observe=='medium' & Predict=='medium']),
Medium_High = sum(n[Observe=='medium' & Predict=='high']),
High_Low = sum(n[Observe=='high' & Predict=='low']),
High_Medium = sum(n[Observe=='high' & Predict=='medium']),
High_High = sum(n[Observe=='high' & Predict=='high']),
SUM = sum(n)
) %>%
gather(Variable, Count)%>%
mutate(Revenue =
ifelse(Variable == "Low_Low", Count * (0-(-300)),
ifelse(Variable == "Low_Medium", Count * (0-0),
ifelse(Variable == "Low_High", Count * (-(300)),
ifelse(Variable == "Medium_low", Count * (0-(-300)),
ifelse(Variable == "Medium_Medium", Count * (0-(0)),
ifelse(Variable == "Medium_High", Count * (0-(300)),
ifelse(Variable == "High_Low",Count*(-6*100-(-300)),
ifelse(Variable == "High_Medium", Count*(0-(0)),
ifelse(Variable == "High_High", Count*(6*100-(300)),
ifelse(Variable == "SUM", "i don't know",0)))))))))))%>%
bind_cols(data.frame(Description = c(
"We predicted low demand with the low demand observation",
"We predicted low demand with the medium demand observation",
"We predicted low demand with the high demand observation",
"We predicted medium demand with the low demand observation",
"We predicted medium demand with the meidum demand observation",
"We predicted medium demand with the high demand observation",
"We predicted high demand with the low demand observation",
"We predicted high demand with the medium demand observation",
"We predicted high demand with the high demand observation",
"Total profit")))
cost_benefit_table2$Revenue <- as.numeric(cost_benefit_table2$Revenue)
cost_benefit_table2$Revenue[cost_benefit_table2$Variable=="SUM"] <- sum(cost_benefit_table2$Revenue, na.rm=T)
kable(cost_benefit_table2,
caption = "Cost/Benefit Table")%>% kable_styling()
Variable | Count | Revenue | Description |
---|---|---|---|
Low_Low | 164 | 49200 | We predicted low demand with the low demand observation |
Low_Medium | 28 | 0 | We predicted low demand with the medium demand observation |
Low_High | 30 | -9000 | We predicted low demand with the high demand observation |
Medium_low | 55 | 16500 | We predicted medium demand with the low demand observation |
Medium_Medium | 76 | 0 | We predicted medium demand with the meidum demand observation |
Medium_High | 31 | -9300 | We predicted medium demand with the high demand observation |
High_Low | 49 | -14700 | We predicted high demand with the low demand observation |
High_Medium | 40 | 0 | We predicted high demand with the medium demand observation |
High_High | 108 | 32400 | We predicted high demand with the high demand observation |
SUM | 581 | 65100 | Total profit |
This diagram shows how the accuracy changes between the original model and the adjusted model.
accu2 <- as.data.frame(table(testProbs2$Observe,testProbs2$Predict))%>%
mutate(Observe = Var1,
Predict = Var2,
count = Freq,
result_type = paste (Observe, Predict, sep= "-"))%>%
dplyr::select(-Var1, -Var2, -Freq)
# class-share percentages transcribed from the adjusted confusion matrix
accu2$percent <- as.numeric(c("8.2", "53.8", "15.2", "76.1", "25.6", "29.3", "15.6", "20.6","55.5"))
accu2$Predict <- factor(accu2$Predict, levels=c("low","medium","high"))
accu2$Observe <- factor(accu2$Observe, levels=c("low","medium","high"))
Figure: confusion matrix comparison between the original and adjusted models
We want to explore whether our model generalizes across space and time. To answer this, we plot the error geographically and summarize it by time variables.
Note that we only treat the 5 result types whose profit is less than or equal to 0 as wrong predictions. Recall that multinomial logistic regression returns the occupancy level with the highest predicted probability, so for a result type such as high-low (observed high, predicted low) the probability the model assigned to "low" exceeded the probability it assigned to "high".
Since our model has no conventional Mean Absolute Error (MAE), we define the error of a wrong prediction as the gap between the probability of the predicted level and the probability of the observed level (p.predicted - p.observed). Again, only the result types whose profit is <= 0 count as wrong predictions. The following wrong-prediction table, restricted to these 5 types, may make the mechanism clearer.
main_error <- testProbs%>%
mutate(type=paste(Observe, Predict, sep= "-"))%>%
filter(Predict!=Observe)%>%
filter(type == "low-high"|type == "medium-high"|type == "high-low")%>%
mutate(high_m_low = pre.high-pre.low,
high_m_med = pre.high-pre.medium,
low_m_high = pre.low-pre.high)
high_low <- subset(main_error, main_error$type=="high-low")%>%
mutate(error = low_m_high)%>%
dplyr::select(-c(15,16,17))
low_high <- subset(main_error, main_error$type=="low-high")%>%
mutate(error = high_m_low)%>%
dplyr::select(-c(15,16,17))
medium_high <- subset(main_error, main_error$type=="medium-high")%>%
mutate(error = high_m_med)%>%
dplyr::select(-c(15,16,17))
all_error <- testProbs%>%
mutate(type=paste(Observe, Predict, sep= "-"))%>%
filter(Observe!=Predict)
low_medium <- all_error%>%
filter(type=="low-medium")%>%
mutate(error = pre.medium-pre.low)
High_medium <- all_error%>%
filter(type=="high-medium")%>%
mutate(error = pre.medium-pre.high)
all_error <- rbind(high_low,low_high,medium_high, low_medium,High_medium)
wrong_pred_5_type <- as.data.frame(all_error)%>%
dplyr::select(Observe, Predict, pre.low, pre.medium, pre.high, error)%>%
round_df(3)
head(wrong_pred_5_type,100)
## Observe Predict pre.low pre.medium pre.high error
## 1 high low 0.734 0.230 0.036 0.698
## 2 high low 0.460 0.242 0.298 0.163
## 3 high low 0.502 0.229 0.269 0.233
## 4 high low 0.574 0.273 0.153 0.421
## 5 high low 0.522 0.262 0.216 0.307
## 6 high low 0.415 0.384 0.201 0.213
## 7 high low 0.440 0.285 0.275 0.165
## 8 high low 0.576 0.284 0.140 0.435
## 9 high low 0.548 0.043 0.409 0.139
## 10 high low 0.608 0.000 0.392 0.216
## 11 high low 0.907 0.039 0.054 0.853
## 12 high low 0.555 0.106 0.339 0.216
## 13 high low 0.456 0.364 0.180 0.276
## 14 high low 0.591 0.342 0.068 0.523
## 15 high low 0.593 0.000 0.407 0.186
## 16 high low 0.442 0.420 0.138 0.304
## 17 high low 0.469 0.243 0.288 0.181
## 18 high low 0.718 0.172 0.110 0.608
## 19 high low 0.520 0.000 0.480 0.041
## 20 high low 0.721 0.167 0.111 0.610
## 21 high low 0.493 0.210 0.297 0.195
## 22 high low 0.538 0.246 0.216 0.322
## 23 high low 0.463 0.361 0.176 0.287
## 24 high low 0.728 0.239 0.033 0.695
## 25 high low 0.407 0.282 0.312 0.095
## 26 high low 0.580 0.128 0.292 0.288
## 27 high low 0.932 0.001 0.067 0.865
## 28 high low 0.433 0.134 0.433 0.001
## 29 high low 0.999 0.000 0.001 0.998
## 30 high low 0.933 0.037 0.030 0.903
## 31 low high 0.380 0.236 0.384 0.004
## 32 low high 0.169 0.337 0.494 0.325
## 33 low high 0.304 0.141 0.555 0.251
## 34 low high 0.115 0.049 0.836 0.721
## 35 low high 0.308 0.093 0.600 0.292
## 36 low high 0.242 0.250 0.508 0.267
## 37 low high 0.489 0.000 0.511 0.022
## 38 low high 0.104 0.422 0.474 0.370
## 39 low high 0.251 0.271 0.479 0.228
## 40 low high 0.227 0.330 0.443 0.216
## 41 low high 0.227 0.330 0.443 0.216
## 42 low high 0.466 0.000 0.534 0.068
## 43 low high 0.146 0.389 0.465 0.319
## 44 low high 0.214 0.298 0.487 0.273
## 45 low high 0.133 0.354 0.513 0.379
## 46 low high 0.078 0.389 0.533 0.455
## 47 low high 0.141 0.211 0.648 0.507
## 48 low high 0.130 0.295 0.575 0.445
## 49 low high 0.107 0.065 0.828 0.721
## 50 low high 0.000 0.000 1.000 1.000
## 51 low high 0.436 0.089 0.475 0.039
## 52 low high 0.170 0.199 0.631 0.462
## 53 low high 0.148 0.248 0.604 0.457
## 54 low high 0.160 0.175 0.665 0.506
## 55 low high 0.334 0.000 0.666 0.333
## 56 low high 0.122 0.184 0.694 0.572
## 57 low high 0.007 0.021 0.973 0.966
## 58 low high 0.440 0.000 0.560 0.121
## 59 low high 0.155 0.020 0.825 0.671
## 60 low high 0.217 0.163 0.620 0.404
## 61 low high 0.120 0.077 0.803 0.683
## 62 low high 0.162 0.000 0.838 0.676
## 63 low high 0.262 0.145 0.593 0.331
## 64 low high 0.188 0.379 0.433 0.245
## 65 low high 0.499 0.000 0.501 0.002
## 66 low high 0.237 0.279 0.484 0.248
## 67 low high 0.162 0.094 0.744 0.582
## 68 low high 0.185 0.244 0.571 0.385
## 69 low high 0.170 0.310 0.519 0.349
## 70 low high 0.320 0.000 0.680 0.360
## 71 low high 0.270 0.128 0.602 0.332
## 72 low high 0.490 0.008 0.501 0.011
## 73 low high 0.092 0.233 0.674 0.582
## 74 medium high 0.380 0.236 0.384 0.148
## 75 medium high 0.000 0.219 0.781 0.561
## 76 medium high 0.168 0.413 0.419 0.006
## 77 medium high 0.037 0.032 0.931 0.899
## 78 medium high 0.349 0.135 0.516 0.381
## 79 medium high 0.363 0.241 0.396 0.156
## 80 medium high 0.086 0.179 0.735 0.555
## 81 medium high 0.238 0.324 0.437 0.113
## 82 medium high 0.000 0.193 0.807 0.614
## 83 medium high 0.189 0.404 0.407 0.003
## 84 medium high 0.206 0.244 0.550 0.306
## 85 medium high 0.087 0.303 0.611 0.308
## 86 medium high 0.146 0.389 0.465 0.077
## 87 medium high 0.368 0.237 0.395 0.158
## 88 medium high 0.258 0.316 0.426 0.110
## 89 medium high 0.336 0.044 0.620 0.575
## 90 medium high 0.008 0.000 0.992 0.992
## 91 medium high 0.192 0.335 0.473 0.138
## 92 medium high 0.079 0.426 0.496 0.070
## 93 medium high 0.409 0.146 0.444 0.298
## 94 medium high 0.168 0.184 0.648 0.465
## 95 medium high 0.194 0.236 0.570 0.334
## 96 medium high 0.326 0.151 0.523 0.372
## 97 medium high 0.241 0.119 0.640 0.522
## 98 medium high 0.001 0.281 0.718 0.436
## 99 medium high 0.171 0.189 0.640 0.451
## 100 medium high 0.000 0.192 0.808 0.615
We make two figures in this part: the first shows the overall spatial distribution of the five error types, and the second facets them by error type to compare their spatial distributions.
The first figure shows no strong spatial pattern in the errors. Routes between stations near the center appear to have somewhat higher errors, while routes connecting the periphery to the center, and lines at the central stations, have lower errors. The limitation of this view is that we cannot distinguish the error types, so it offers little guidance for optimizing the model.
The second figure examines the error by prediction type, which is more intuitive and effective: since the types affect overall benefit differently, we need the distribution of each error type in detail. Combined with the cost-benefit analysis, the two types with the largest impact on overall profit, low-high and high-low, also appear to have higher errors, confirming that correcting them should be the focus of model adjustment. The high-medium and low-medium types have little impact on overall profit, and although medium-high errors also matter, their error is not as large as that of low-high and high-low.
ggplot(all_error)+
#geom_sf(data=rail,inherit.aes = FALSE, na.rm=T, col="black", lwd=0.2 ,alpha=0.2)+
geom_sf(data=stations, size = 1, inherit.aes = FALSE, col="grey" )+
geom_sf(data=all_error, aes(colour = error),lwd=1)+
scale_colour_viridis(direction = -1, discrete = F, option = "C")+
#facet_wrap(~type)+
ylim(min(49.5), max(51.5))+ #lat
xlim(min(2.5), max(6.1))+
labs(title="Spatial Error ",
subtitle="Only consider the result types whose profit<=0 as error")
ggplot(all_error)+
#geom_sf(data=rail,inherit.aes = FALSE, na.rm=T, col="black", lwd=0.2 ,alpha=0.2)+
geom_sf(data=stations, size = 1, inherit.aes = FALSE, col="grey" )+
geom_sf(data=all_error, aes(colour = error),lwd=1)+
scale_colour_viridis(direction = -1, discrete = F, option = "C")+
facet_wrap(~type)+
ylim(min(49.5), max(51.5))+ #lat
xlim(min(2.5), max(6.1))+
labs(title="Spatial Error by 5 error types",
subtitle="Only consider the result types whose profit<=0 as error")
We also summarize the error trend over time, which is useful for understanding whether the model generalizes across weeks and hours.
The first line chart tracks overall error by hourly interval. The overall fluctuation is relatively stable, with a narrower range around October 15. Breaking the series out by the 5 error types shows that high-low and low-high fluctuate more than the other types.
The second chart examines the error by week: the 38th week has low probability error. Distinguishing the 5 types shows that the high-low errors in week 43 and the low-high errors in week 39 especially need improvement to raise the overall profit, as they weigh heavily on it.
all_error_df <- as.data.frame(all_error)
#hour
grid.arrange(
all_error %>%
filter(interval60>=as.Date("2016-09-15"))%>%
ggplot(aes(interval60, error)) +
geom_line(size = 0.5)+
labs(title = "Errors by hour",
subtitle = "Only consider the result types whose profit<=0 as error")+Theme2,
all_error %>%
filter(interval60>=as.Date("2016-09-15"))%>%
ggplot(aes(interval60, error, colour=type)) +
geom_line(size = 0.5)+
labs(title = "Errors by 5 error types and hour",
subtitle = "Only consider the result types whose profit<=0 as error")+Theme2,
ncol=1)
#week
grid.arrange(
all_error_df %>%
filter(week=="38"|week=="39"|week=="40"|week=="41"|week=="42"|week=="43")%>%
ggplot(aes(x=week, y=error)) +
geom_bar(position = "dodge", stat="identity") +
scale_fill_manual(values = palette5) +
labs(title = "Errors by week",
subtitle = "Only consider the result types whose profit<=0 as error")+Theme2,
all_error_df %>%
filter(week=="38"|week=="39"|week=="40"|week=="41"|week=="42"|week=="43")%>%
dplyr::select(week, type, error) %>%
gather(Variable, error, -type, -week) %>%
ggplot(aes(x=week, y=error)) +
geom_bar(aes(fill = type), position = "dodge", stat="identity") +
scale_fill_manual(values = palette5) +
labs(title = "Errors by 5 error types and week",
subtitle = "Only consider the result types whose profit<=0 as error")+Theme2,
ncol=1)
In conclusion, this project developed a train occupancy prediction model that can serve as a tool for transportation planners to redistribute train resources and bring provision in line with demand. The predictions generalize reasonably across space and time. Few prediction models are perfect, but we reduced the cost of errors by adjusting part of the predicted high occupancies. For further improvement, we should dig into more related variables to increase accuracy. Additionally, this project focused only on the Low-High error type, but in practice all error types matter, since they all affect the final profit.