Video

set up

#import data
setwd("~/Desktop/590/final")

require(foreign)
require(nnet)
require(ggplot2)
require(reshape2)
library(lubridate)
library(dplyr)
library(sf)
library(tidyverse)
library(tidyr)
library(lubridate)
library(ggmap)
library(stplanr)
library(tmap)
library(caret)
library(stringr)
library(viridis)
require(gridExtra)
library(kableExtra)
library(forestmangr)
library(MASS)

options(scipen=99)

Theme1 <- theme(legend.box = "horizontal",
        plot.title = element_text(size=12),
        plot.subtitle = element_text(size=8),
        axis.title.x = element_text(size = 4))
Theme2 <- theme(plot.title = element_text(size=12),
                plot.subtitle = element_text(size=9))
palette3 <- c("#bdd7e7","#6baed6","#08519c")
palette4 <- c("#D2FBD4","#92BCAB","#527D82","#123F5A")
palette5 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
        
mapTheme <- theme(plot.title =element_text(size=12),
                  plot.subtitle = element_text(size=8),
                  plot.caption = element_text(size = 6),
                  axis.line=element_blank(),
                  axis.text.x=element_blank(),
                  axis.text.y=element_blank(),
                  axis.ticks=element_blank(),
                  axis.title.x=element_blank(),
                  axis.title.y=element_blank(),
                  panel.background=element_blank(),
                  panel.border=element_blank(),
                  panel.grid.major=element_line(colour = 'transparent'),
                  panel.grid.minor=element_blank(),
                  legend.direction = "vertical", 
                  legend.position = "right",
                  plot.margin = margin(1, 1, 1, 1, 'cm'),
                  legend.key.height = unit(1, "cm"), legend.key.width = unit(0.2, "cm"))

plotTheme <- theme(
  plot.title =element_text(size=12),
  plot.subtitle = element_text(size=8),
  plot.caption = element_text(size = 6),
  axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
  axis.text.y = element_text(size = 10),
  axis.title.y = element_text(size = 10),
  panel.background=element_blank(),
  plot.background=element_blank(),
  panel.grid.major=element_line(colour="#D0D0D0",size=.2),
  axis.ticks=element_blank())


substrRight <- function(x, n){
  substr(x, nchar(x)-n+1, nchar(x))}

options(scipen=50, digits=10)

stations <- read.csv("/Users/zhangjing/Desktop/590/final/stations.csv")%>%
st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
mutate(station_id = as.numeric(str_sub(URI, 31, 39)))%>%
  dplyr::select(-alternative.fr,-alternative.nl,-alternative.de, -alternative.en, -country.code, -URI)

rail <- st_read("/Users/zhangjing/Desktop/590/final/source/belgium-railways-shape/railways.shp")%>%
  filter(type=="rail")

line <- st_read("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/line_info.csv")

#belgium_basemap <- get_stamenmap(bbox = c(left = 2.739258, bottom = 49.563228, right = 6.551513, top = 51.695260), zoom = 11)

clean data

trains <- 
read.csv("/Users/zhangjing/Desktop/590/final/trains_train.csv") %>%
mutate(from = as.numeric(as.character(from)),
to = as.numeric(as.character(to))) %>%
left_join(dplyr::select(stations, station_id, avg_stop_times,name), by = c("from" = "station_id")) %>%
st_sf() %>% mutate(from.X = st_coordinates(.)[,1], 
from.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
left_join(dplyr::select(stations, station_id, avg_stop_times), by = c("to" = "station_id")) %>%
st_sf() %>% mutate(to.X = st_coordinates(.)[,1], 
to.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
mutate(distance = sqrt((from.X - to.X)^2 + (from.Y - to.Y)^2),
       stop_times_from = avg_stop_times.x,
       stop_times_to = avg_stop_times.y,) %>%
  dplyr::select(-avg_stop_times.x, -avg_stop_times.y)%>%
arrange(-distance)

trains$occ2 <- relevel(trains$occupancy, ref = "low")
trains$OD <- paste (trains$from, trains$to, sep= "-")
trains$time <- parse_date_time(trains$time, '%I:%M:%S %p')
trains$time <- substr(trains$time, 12, 19)
trains$date_time <- as.POSIXct(paste(trains$date, trains$time), format="%Y/%m/%d %H:%M:%S")
str(trains)

trains <- 
  trains %>%
  mutate(veh = substr(vehicle,1,1),
        interval60 = floor_date(ymd_hms(date_time), unit = "hour"),
         interval15 = floor_date(ymd_hms(date_time), unit = "15 mins"),
         week = week(interval60),
         dotw = wday(interval60, label=TRUE),
         veh = substr(vehicle,1,1),
         hour = as.numeric(substr(time, 1, 2)),
         weekend = ifelse(dotw == "Sat" | dotw == "Sun",1,0),
        mor.jam = ifelse(hour>= 6 & hour<=10 & weekend == "0",1,0),
        eve.jam = ifelse(hour>= 15 & hour<= 19 & weekend == "0",1,0),
        hour = factor(hour),
        weekend = factor(weekend),
    #    mor.jam = factor(mor.jam),
    #    eve.jam = factor(eve.jam),
        from = factor(from),
        to = factor (to))

creat OD line

new_t <- trains%>%
  mutate(new = 1)%>%
  dplyr::select(OD, new)%>%
  group_by(OD) %>%
  summarise( count = sum (new))

new_t <- separate(new_t,OD,into=c("from","to"), sep="-")
new_t <- subset(new_t, new_t$to!="1000000" & new_t$from!="1000000" & new_t$to!="0" & !is.na(new_t$to) & !is.na(new_t$from) & new_t$to!="NA" & new_t$from!="NA")

new_t <- subset(new_t, new_t$to!=new_t$from)
new_t$from <- as.numeric(as.character(new_t$from))
new_t$to <- as.numeric(new_t$to)

station_extract <- stations%>%
  mutate(to = station_id)%>%
  dplyr::select(to)

z <- station_extract
l <- od2line(flow = new_t, zones = z)
l$OD <-  paste (l$from, l$to, sep= "-")
try <- merge(l,trains, by = "OD", all.x=T, all.y=F)

g <- ggplot(try)+
  #geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2 )+
geom_sf(data=stations, size = 1,inherit.aes = FALSE)+
geom_sf(data=try, aes(color = occupancy), inherit.aes = FALSE, line.lwd = "count")+
#geom_sf(stat_order_sf,aes(color = vehicle_id), size=0.3)+
  #theme(legend.position="none")+
   ylim(min(49.5), max(51.5))+ #lat
  xlim(min(2.5), max(6.1)) #long

train_sf <- merge(l[,c("geometry","OD")], trains, by = "OD", alll=T)

import weather data

weather1 <- read.csv("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/weather_data_july_1.csv")
weather2 <- read.csv("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/weather_data_july_2.csv")
weather3 <- read.csv("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/weather_data_aug_1.csv")
weather4 <- read.csv("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/weather_data_aug_2.csv")
weather5 <- read.csv("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/weather_data_sep_1.csv")
weather6 <- read.csv("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/weather_data_sep_2.csv")
weather7 <- read.csv("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/weather_data_oct_1.csv")
weather8 <- read.csv("https://raw.githubusercontent.com/GillesVandewiele/KaggleTrainOccupancy/master/weather_data_oct_2.csv")

weather1 <- 
    mutate(weather1, interval60 = ymd_h(substr(date_time,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60,station_name) %>%
    dplyr::summarize(Temperature = max(temperature),
              Humidity = max(humidity),
              Wind_Speed = max(windspeed))
weather2 <- 
    mutate(weather2, interval60 = ymd_h(substr(date_time,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60,station_name) %>%
    dplyr::summarize(Temperature = max(temperature),
              Humidity = max(humidity),
              Wind_Speed = max(windspeed))
weather3 <- 
    mutate(weather3, interval60 = ymd_h(substr(date_time,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60,station_name) %>%
    dplyr::summarize(Temperature = max(temperature),
              Humidity = max(humidity),
              Wind_Speed = max(windspeed))
weather4 <- 
    mutate(weather4, interval60 = ymd_h(substr(date_time,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60,station_name) %>%
    dplyr::summarize(Temperature = max(temperature),
              Humidity = max(humidity),
              Wind_Speed = max(windspeed))
weather5 <- 
    mutate(weather5, interval60 = ymd_h(substr(date_time,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60,station_name) %>%
    dplyr::summarize(Temperature = max(temperature),
              Humidity = max(humidity),
              Wind_Speed = max(windspeed))
weather6 <- 
    mutate(weather6, interval60 = ymd_h(substr(date_time,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60,station_name) %>%
    dplyr::summarize(Temperature = max(temperature),
              Humidity = max(humidity),
              Wind_Speed = max(windspeed))
weather7 <- 
    mutate(weather7, interval60 = ymd_h(substr(date_time,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60,station_name) %>%
    dplyr::summarize(Temperature = max(temperature),
              Humidity = max(humidity),
              Wind_Speed = max(windspeed))
weather8 <- 
    mutate(weather8, interval60 = ymd_h(substr(date_time,1,13))) %>%
    mutate(week = week(interval60),
           dotw = wday(interval60, label=TRUE)) %>%
    group_by(interval60,station_name) %>%
    dplyr::summarize(Temperature = max(temperature),
              Humidity = max(humidity),
              Wind_Speed = max(windspeed))

weather1$station_date <- paste (weather1$station_name, weather1$interval60, sep= "-")
weather2$station_date <- paste (weather2$station_name, weather2$interval60, sep= "-")
weather3$station_date <- paste (weather3$station_name, weather3$interval60, sep= "-")
weather4$station_date <- paste (weather4$station_name, weather4$interval60, sep= "-")
weather5$station_date <- paste (weather5$station_name, weather5$interval60, sep= "-")
weather6$station_date <- paste (weather6$station_name, weather6$interval60, sep= "-")
weather7$station_date <- paste (weather7$station_name, weather7$interval60, sep= "-")
weather8$station_date <- paste (weather8$station_name, weather8$interval60, sep= "-")

weather <- rbind(weather1,weather2,weather3,weather4,weather5,weather6,weather7,weather8)

trains$station_date <- paste (trains$name, trains$interval60, sep= "-")
trains <- merge(x=trains,y=weather[,c("station_date","Temperature","Humidity","Wind_Speed")], by = "station_date", all.x=TRUE)
trains$Temperature[is.na(trains$Temperature)] <- 0
trains$Humidity[is.na(trains$Humidity)] <- 0
trains$Wind_Speed[is.na(trains$Wind_Speed)] <- 0

1. Introduction

Services oversupply and shortage are always serious problems in the transportation operating system. In train services, the oversupply will increase the operating fee while with limited revenue thus result in decreasing profit. In the situation of service shortage (supply > demand), most of the trains will crow and that considerably harm the experience of passengers and damage the company’s reputation. Also, the company will lose a number of potential passengers and their ticket fare if the train keep crowed since they can’t get into the train and that forces them to take other means of transportation.

use case

Thus, we want to provide a smart train occupancy prediction tool for transportation planners and help them to correct the train services provision of the whole region according to current demand. Although occupancy prediction isn’t a new thing, most of the user of the prediction is train passengers, and the function is about to let the passengers know the occupancy level before they buy the ticket. Currently, planner only regards the performance or feature of origin station as the main criterion of the trip’s occupancy, while, the situation is much more complex than this, and which the train occupancy might also be impacted by destination station, weather, serving line, departure date and time, etc. Apart from that, although there are some train prediction products that can be used by transportation planners, they only care about the accuracy of the prediction. Different from them, we also took an cost-benefit analysis of our model, which is an ideal way to let user know almost all the prediction have mistake, and how much you would pay for the wrong prediction and how much you can benefit from the correct prediction (the latter one always larger than the former one).

2. Methodology

The core arithmetic of the website is a prediction model that can learn from previous train occupancy recording and accurately predict the future occupancy to three-level: low, medium and high. This part will briefly introduce the method framework and the data that we use.

2.1 General framework

This diagram shows the framework of the process. an image caption Source: framwork

2.2 Data

Trip data: The table below describes each variable in trip data

traininfo <- read.csv("/Users/zhangjing/Desktop/590/final/train.csv")
kable(traininfo,
      caption = "Train trip data") %>%
  kable_styling("striped", full_width = F)

Train trip data
Variable_Name	Description
date	the data of the trip
time	the departure time
from	the id of departure station
to	the id of arrival station
vehicle	vehicle type
occupancy	the occupancy of the train

Station data: The table below describes each variable in train station data

stationinfo <- read.csv("/Users/zhangjing/Desktop/590/final/stationinfo.csv")
kable(stationinfo,
      caption = "Station data") %>%
  kable_styling("striped", full_width = F)

Station data
Variable_Name	Description
URI	links to connection informa connection of that train
Name	the station name
longitude	the longitude of the station
latitude	the latitude of the train
avg_stop_times	the average stop time of the station

Train line data: The table below describes each variable in train line data

lineinfo <- read.csv("/Users/zhangjing/Desktop/590/final/lineinfo.csv")
kable(lineinfo,
      caption = "Train line data") %>%
  kable_styling("striped", full_width = F)

Train line data
Variable_Name	Description
vehicle id	the id of the train
vehicle type	the type of vehicle
nr_of_stops	the number of stops
stopping_station_ids	id of the stopping stations

Weather data the weather data includes temperature, humidity and wind speed information.

grid.arrange(
  ggplot(trains, aes(interval60,Humidity)) + geom_line() + 
  labs(title="Percipitation", x="Hour", y="Humidity") + plotTheme,
  ggplot(trains, aes(interval60,Wind_Speed)) + geom_line() + 
    labs(title="Wind Speed", x="Hour", y="Wind Speed") + plotTheme,
  ggplot(trains, aes(interval60,Temperature)) + geom_line() + 
    labs(title="Temperature", x="Hour", y="Temperature") + plotTheme,
  top="Weather Data - July~October, 2016")

3. Exploratory analysis

3.1 Spatial network

- occ with trip

Generally, there are 2250 trips and 859 unique Origin and Destination pairs (O-D pair). For example, “8814001-8731901” is one O-D pair which means that this trip is FROM station 8814001 TO station 8731901. However, we can’t directly put all O-D pairs into the model which might destroy the generalizability of the whole model. Therefore, we grouped the train’s data by “O-D pair” and “occupancy level” and found 58 O-D pairs that have more than 4 trips (count>=4) and also have a higher share of low or medium or high occupancy (percent >=0.5). It means that, for each selected O-D pairs, more than 50% of the trips are in low (or medium or high) occupancy. For example, in London, the tube trip FROM Oxford Circus Station (main shopping street) to Holborn Station (Station next to Chinatown which is a famous tourist attraction in London) is always crowded and in high occupancy. While the tube trip in a suburban area is likely to be in low occupancy since it only serves residents who live surround this station. Therefore, this step selected the O-D paries which has some characteristics of its occupancy level.

We will add one more column to the trains dataset called OD_occ, in this column, if the O-D pair of the trip belong to the selected O-D pairs (as plotted), they will be coded as its original O-D pair, otherwise, they will be coded as “1” which means that for this type of O-D pairs, we didn’t find special characteristics of its occupancy level.

# occ with OD trip (percent)
occ_OD <- trains%>%
  group_by(OD)%>%
   count(occ2)%>%
  mutate(percent = n/sum(n),
         count = n)%>%
  filter(n>=4 & percent >0.5)%>%
   dplyr::select(-n) 
occ_OD_df <- as.data.frame(occ_OD)
head(occ_OD[order(-occ_OD_df$count, -occ_OD_df$percent),],10)

## # A tibble: 10 x 4
## # Groups:   OD [10]
##    OD              occ2   percent count
##    <chr>           <fct>    <dbl> <int>
##  1 8892007-8891009 low      0.512    22
##  2 8833001-8812005 high     0.618    21
##  3 8813003-0       medium   0.516    16
##  4 8812005-8831765 low      1        14
##  5 8893013-8893401 low      1        13
##  6 8821006-8821121 low      0.812    13
##  7 8813003-8892007 low      0.571    12
##  8 8841202-8812005 high     0.846    11
##  9 8896149-8813003 medium   0.786    11
## 10 8811106-8811916 low      1        10

# merge l with occ_OD
l2 <- merge(l, occ_OD, by="OD", all.x=TRUE)

# plot the trip with high percentage of certain occupancy 
l2$occ2 <- factor(l2$occ2, level=c("low","medium","high"))
train_sf$OD_occ <- factor(ifelse(train_sf$OD %in% occ_OD$OD, train_sf$OD, "1"))
l2%>%
  filter(!is.na(l2$occ2))%>%
  mutate(per = percent*10)%>%
ggplot()+
geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2 )+
geom_sf(data=stations, size = 1,
        inherit.aes = FALSE, col="grey" )+
  ylim(min(49.5), max(51.5))+ #lat
  xlim(min(2.5), max(6.1)) +#long
geom_sf(aes(color = -percent), lwd=1.5, fill = "transparent")+
 scale_colour_viridis(direction = 1, discrete = FALSE, option = "C")+
 facet_wrap(~occ2)+
   labs(title="Selected O-D paires and their occupany ",
        subtitle="Filiter: OD paires have 4+ trips, share of certain occupancy level >50%")

- occ with station

Apart from the O-D pair, the origin and the destination will also impact the occupancy level of the trip. For example in Beijing, the short tube trips FROM Beijing South Station (the biggest railwayStation in Beijing) are always highly occupied, whatever the direction or the line. Also the short tube trips TO Beijing South Station also always crowded, since there is considerable demand to gather to or disperse from this station. Thus, to explore the relationship between trips occupancy and Origin or Destination Station in our data, we grouped the trains dataset by “From”/“TO” Stations and three “occupancy level”. We find that the FROM and TO stations considerably impact the occupancy level as shown in Figure. Specifically, 100% of the trips departure FROM or head TO 8821964 station is in low occupancy, and there are a number of stations that have such kind of feature. Additionally, we also found that for many stations, such as 8831401, about 67% of the trips departure FROM this station is highly occupied while 70% of the trips that head TO this station is in low occupancy. Therefore, we can justify that the impact of the station as Origin or Destination on trips’ occupancy is different. Thus, we intend to put all FROM and TO stations into our model.

### from 
occ_from <- trains%>%
  group_by(from)%>%
   count(occ2)%>%
  mutate(percent = n/sum(n),
         count = n,
         station_id= from)%>%
 # filter(n>=4 & percent >0.5)%>%
  dplyr::select(-n) 
#merge with sf
stations <- stations%>%
  mutate(Geom = gsub('[(c)°]', '', geometry)) %>% 
  separate(col = Geom, into = c('lat', 'lon'), sep = '\\,')
occ_from <- merge(stations[c("lat","lon","station_id")], occ_from, by= "station_id",all.y=T)
occ_from_df <- as.data.frame(occ_from)%>%
  dplyr::select(-geometry)%>%
  round_df(3)
head(occ_from_df[order( -occ_from_df$count,-occ_from_df$percent),],10)

##     station_id      lat        lon    from   occ2 percent count
## 337    8892007 3.710675  51.035896 8892007    low   0.359    79
## 336    8892007 3.710675  51.035896 8892007 medium   0.341    75
## 64     8812005 4.360846  50.859663 8812005    low   0.374    67
## 201    8833001 4.715866   50.88228 8833001   high   0.537    66
## 338    8892007 3.710675  51.035896 8892007   high   0.300    66
## 63     8812005 4.360846  50.859663 8812005   high   0.358    64
## 86     8813003 4.356801  50.845658 8813003 medium   0.375    60
## 85     8813003 4.356801  50.845658 8813003   high   0.344    55
## 65     8812005 4.360846  50.859663 8812005 medium   0.268    48
## 122    8821006 4.421101    51.2172 8821006    low   0.584    45

#### to 
occ_to <- trains%>%
  group_by(to)%>%
   count(occ2)%>%
  mutate(percent = n/sum(n),
         count = n,
         station_id= to)%>%
 # filter(n>=4 & percent >0.5)%>%
  dplyr::select(-n) 

#merge with sf 
occ_to <- merge(stations[c("geometry","station_id")], occ_to, by= "station_id",all.y=T)
grid.arrange(
ggplot(occ_from)+
geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2, alpha=0.6)+
geom_sf(data=occ_from,  aes(color = percent, size = count), alpha=0.5)+
  scale_colour_viridis(direction = -1, discrete = F, option = "C")+
  facet_wrap(~occ2)+
   ylim(min(49.3), max(51.5))+ #lat
  xlim(min(2.5), max(6.3))+
  labs(title="Plot of trip occupancy FROM the station",
      subtitle="Point size = count of trips FROM this station\nPoint color = share of certain occupancy level")
  +Theme1+ guides(size = FALSE)
,
  
ggplot(occ_to)+
geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2, alpha=0.2)+
geom_sf(data=occ_to,  aes(colour = percent, size = count), alpha=0.5)+
  scale_colour_viridis(direction = -1, discrete = F, option = "C")+
  facet_wrap(~occ2)+
   ylim(min(49.3), max(51.5))+ #lat
  xlim(min(2.5), max(6.3))+
  labs(title="Plot of trip occupancy TO the station",
       subtitle="Point size = count of trips TO this station\nPoint color = share of certain occupancy level")
 +Theme1+guides(size = FALSE)
,
ncol=1)

- occ with Line

Except for O-D pairs, origin and destination station, we also considered the serving line of this trip (also called as vehicle in the dataset). Similar to previous method, we grouped the data by line type, and selected the line that 75% of the trips belong to this line is low or medium or high occupancy, also the line has more than 4 trips (since we don’t want to select the line that has fill trips which is too special and can help the model).

# occ with veh
occ_veh <- trains%>%
  group_by(vehicle)%>%
   count(occ2)%>%
  mutate(percent = n/sum(n),
         vehicle_id = vehicle,
         count = n)%>%
   filter(n>=5, percent>=0.75)%>%
dplyr::select(-n)
occ_veh_df <- as.data.frame(occ_veh)%>%
  round_df(3)
head(occ_veh_df[order( -occ_veh_df$count,-occ_veh_df$percent),],10)

##      vehicle   occ2 percent vehicle_id count
## 24     IC736    low   1.000      IC736    17
## 2       8015    low   1.000       8015    14
## 1       1828    low   0.929       1828    13
## 15    IC3631   high   0.812     IC3631    13
## 13    IC3432    low   1.000     IC3432    12
## 19    IC4237    low   1.000     IC4237    12
## 25      L557    low   0.857       L557    12
## 29     P7444   high   0.857      P7444    12
## 38 undefined medium   1.000  undefined    11
## 36    S53586    low   0.917     S53586    11

#seperate and clean staion
line$nr_of_stops <- as.numeric(as.character(line$nr_of_stops))
sep_stat_line <- as.data.frame(str_split_fixed(line$stopping_station_ids, ",",34))

sep_stat_line <- sep_stat_line %>%
  mutate_all(funs(gsub("'", "", .)))%>%
  mutate_all(funs(gsub("]", "", .)))%>%
  mutate(V1 = substr(V1, 2, 10))

# join station with line info
sep_stat_line <- cbind(line,sep_stat_line)
sep_stat_line <- 
  sep_stat_line%>%
  dplyr::select(-Unnamed..0, -Unnamed..0.1)

# wide to long
v_stat_line <- reshape(sep_stat_line,
                       varying=c(6:39), 
                       timevar = "Order",
                       idvar=c("vehicle_id"),
                       v.names="station_id",
                       direction="long")
v_stat_line <- v_stat_line%>%mutate_all(funs(gsub(" ", "", .)))%>%
  mutate(station_id = substr(station_id,3,9))%>%
  filter(station_id!="")%>%
  dplyr::select(-stopping_station_ids)

# add geomtry to each station
stat_order_sf <- merge(stations[c("avg_stop_times", "station_id","geometry")], v_stat_line, by = "station_id",all=T)

# if the % of certain type of occupancy is larger than 0.75 (and count >=5), then 1,  otherwise 0 
stat_order_sf$occ_veh<- ifelse(stat_order_sf$vehicle_id %in% occ_veh$vehicle_id,1,0)

# only select the stations exist in trains 
stat_order_sf <- subset(stat_order_sf, stat_order_sf$station_id %in% trains$from | stat_order_sf$station_id %in% trains$to)

# merge occupancy into station order_sf
veh_sf <- merge(stat_order_sf,occ_veh, by = "vehicle_id", all.x=T)
veh_sf <- veh_sf%>%
  filter(!is.na(occ2))

ggplot()+
  geom_sf(data=rail, inherit.aes = FALSE,na.rm=T, col="black", lwd=0.2, alpha=0.2)+
  geom_sf(data=stat_order_sf, color = "grey",size=1, alpha=0.4)+
  geom_sf(data=veh_sf,aes(color = vehicle_id, group = vehicle_id), size=1)+
  facet_wrap(~occ2)+ 
ylim(min(49.5), max(51.5))+ #lat
xlim(min(2.5), max(6.1))+
  theme(legend.position="bottom")+
   labs(title="Relationship between occupancy and LINEs")

        # subtitle="Relative to test set points (in black)")
  
train_sf$vehicle <- as.character(train_sf$vehicle)
train_sf$veh_occ <- as.factor(ifelse(train_sf$vehicle %in% occ_veh$vehicle, train_sf$vehicle, "1"))

3.2 About the Trip

- departure time

(1)daily pattern The following plots show the distribution of trip volume of different occupancy for different times of the day

 ggplot(trains %>%mutate(hour = hour(date_time)))+
     geom_freqpoly(aes(hour), binwidth = 1)+
  labs(title="Train occupancy, by hour of the day",
       x="Hour", 
       y="Train Counts")+
           facet_wrap(~occupancy)+
     plotTheme

At about 7am and 4pm, there were more trips than other times and the number of high occupancy, medium occupancy trip as well as low occupancy trips all increased. Yet, there were distict differences. At about 7am, there are fewer low occupancy trips than the high and medium occupancy trips; at about 12:30pm, there were more low occupancy trips; at 4pm there were also more low occupancy trips.

day of the week the following plot discribe the train occupancy pattern by day of the week

 ggplot(trains %>%mutate(hour = hour(date_time)))+
     geom_freqpoly(aes(hour, color = dotw), binwidth = 1)+
  labs(title="Train occupancy, by day of the week",
       x="Hour", 
       y="Train Counts")+
           facet_wrap(~occupancy)+
     plotTheme

The occupancy varied among different days of the week. At 7am on Sunday, there are more high occupancy trips; at 4pm on Friday there are more high and low occupancy trips.

week the following plot discribe the train occupancy pattern by week.

 ggplot(trains %>%mutate(week =  week(interval60)))+
     geom_freqpoly(aes(week), binwidth = 1)+
  labs(title="Train occupancy, by week",
       x="Week", 
       y="Train Counts")+
           facet_wrap(~occupancy)+
     plotTheme

The plots above suggests that between weeek 30 and week 35, there were more low occupancy trips, but between week 37 and week 45 there are more high occupancy trips.

rush hour We define 2 rush hour period, one is from 6am to 10am, and the other is from 3pm to 7pm.

trains$mor.jam[trains$mor.jam==0] <- 'No'
trains$mor.jam[trains$mor.jam==1] <- 'Yes'
trains$eve.jam[trains$eve.jam==0] <- 'No'
trains$eve.jam[trains$eve.jam==1] <- 'Yes'

trains %>%
  dplyr::select(occupancy, mor.jam,eve.jam) %>%
  gather(Variable, value, -occupancy)  %>%
  count(Variable, value, occupancy)%>%
  ggplot(., aes(value, n, fill = occupancy)) +   
  geom_bar(position = "dodge", stat="identity") +
  facet_wrap(~Variable, scales="free") +
  scale_fill_viridis(option= "plasma", discrete=TRUE) +
  labs(x="Is rush hour", y="Value",
       title = "rush hour's associations with the occupancy")

The plots above indicate that in non-rush hours, there were more low occupancy trips; in evening rush hours, there were fewer medium occupancy trips.

- distance between to and from

train <- 
read.csv("/Users/zhangjing/Desktop/590/final/trains_train.csv") %>%
mutate(from = as.numeric(as.character(from)),
to = as.numeric(as.character(to))) %>%
left_join(dplyr::select(stations, station_id), by = c("from" = "station_id")) %>%
st_sf() %>% mutate(from.X = st_coordinates(.)[,1], 
from.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
left_join(dplyr::select(stations, station_id), by = c("to" = "station_id")) %>%
st_sf() %>% mutate(to.X = st_coordinates(.)[,1], 
to.Y = st_coordinates(.)[,2]) %>%
st_drop_geometry() %>%
mutate(distance = sqrt((from.X - to.X)^2 + (from.Y - to.Y)^2)) %>%
arrange(-distance)

train1 <- filter(train,is.na(train$distance)==F)

train1 <-mutate(train1, distance.cat = case_when(
  distance <0.27  ~ "less than 0.27",
  distance >= 0.27 & distance < 0.54  ~ "0.27-0.54",
  distance >= 0.54 & distance <  0.81   ~ "0.54-0.81",
  distance >= 0.81 & distance <  1.08   ~ "0.81-1.08",
  distance >= 1.08 & distance <  1.35   ~ "1.08-1.35",
  distance >= 1.35 & distance <  1.62   ~ "1.35-1.62",
  distance >= 1.62 & distance <  1.89   ~ "1.62-1.89",
  distance >= 1.89 & distance <  2.16   ~ "1.89-2.16",
  distance >= 2.16 & distance <  2.43   ~ "2.16-2.43",
  distance >= 2.43 & distance <  2.7   ~ "2.43-2.7",
  distance >2.7  ~ "over 2.7"))

train1$occupancy<-factor(train1$occupancy, levels=c("high", "medium", "low"))

train1 %>%
  dplyr::select(occupancy, distance.cat) %>%
  gather(Variable, value, -occupancy)  %>%
  count(Variable, value, occupancy)%>%
  ggplot(., aes(value, n, fill = occupancy)) +   
  geom_bar(position = "dodge", stat="identity") +
  facet_wrap(~Variable, scales="free") +
  scale_fill_viridis(option= "plasma", discrete=TRUE) +
  scale_x_discrete(limits=c("less than 0.27","0.27-0.54","0.81-1.08","1.08-1.35",
                            "1.35-1.62","1.62-1.89","1.89-2.16","2.16-2.43","2.43-2.7","over 2.7"))+
  labs(x="Distance between from and to stations", y="Value",
       title = "Distance's association with the occupancy") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

According to the plot above, when the departual and arrival stations were extremly closed to each other, there were mroe low occupancy trips. However, when the distance increased, the number of low occupancy trips largely decreased. Besides, when the distance was beween 1.08 to 1.35, there were slightly more high occupancy trips. Thus, within different distance, the distribution of different occupancy trips changed.

- stop time (from and to station)

station_1 <- read.csv("/Users/zhangjing/Desktop/590/final/stations.csv")
station_1$station_id <- substrRight(as.character(station_1$URI), 9)
trains2 <- read.csv("/Users/zhangjing/Desktop/590/final/trains_train2.csv")
dat <- dplyr::select(station_1,station_id,avg_stop_times)
colnames(dat)[colnames(dat)=="station_id"] <- "from"
dat <- merge(x=dat,y=station_1[,c("station_id","avg_stop_times")], by = "avg_stop_times", all.x=TRUE)
colnames(dat)[colnames(dat)=="station_id"] <- "to"

trains2 <- merge(x=trains2, y=dat[,c("from","avg_stop_times")], by = "from", all.x = FALSE, all.y=FALSE, sort = FALSE)
colnames(trains2)[colnames(trains2)=="avg_stop_times"] <- "fromstoptime"
trains2 <- merge(x=trains2, y=dat[,c("to","avg_stop_times")], by = "to", all.x = FALSE, all.y=FALSE, sort = FALSE)
colnames(trains2)[colnames(trains2)=="avg_stop_times"] <- "tostoptime"

trains2$occupancy<-factor(trains2$occupancy, levels=c("high", "medium", "low"))
train_df<-as.data.frame(trains2)

trains2 <-mutate(train_df, fromstoptime.cat = case_when(
  fromstoptime <68.7  ~ "less than 68.7",
  fromstoptime >= 68.7 & fromstoptime < 137.4  ~ "68.7-137.4",
  fromstoptime >= 137.4 & fromstoptime <  206.1   ~ "137.4-206.1",
  fromstoptime >= 206.1 & fromstoptime <  274.8   ~ "206.1-274.8",
  fromstoptime >= 274.8 & fromstoptime <  343.5   ~ "274.8-343.5",
  fromstoptime >= 343.5 & fromstoptime <  412.2   ~ "343.5-412.2",
  fromstoptime >= 412.2 & fromstoptime <  480.9   ~ "412.2-480.9",
  fromstoptime >= 480.9 & fromstoptime <  549.6   ~ "480.9-549.6",
  fromstoptime >= 549.6 & fromstoptime <  618.3   ~ "549.6-618.3",
  fromstoptime >= 618.3 & fromstoptime <  687   ~ "618.3-687"))

trains2 <-mutate(trains2, tostoptime.cat = case_when(
  tostoptime <68.7  ~ "less than 68.7",
  tostoptime >= 68.7 & tostoptime < 137.4  ~ "68.7-137.4",
  tostoptime >= 137.4 & tostoptime <  206.1   ~ "137.4-206.1",
  tostoptime >= 206.1 & tostoptime <  274.8   ~ "206.1-274.8",
  tostoptime >= 274.8 & tostoptime <  343.5   ~ "274.8-343.5",
  tostoptime >= 343.5 & tostoptime <  412.2   ~ "343.5-412.2",
  tostoptime >= 412.2 & tostoptime <  480.9   ~ "412.2-480.9",
  tostoptime >= 480.9 & tostoptime <  549.6   ~ "480.9-549.6",
  tostoptime >= 549.6 & tostoptime <  618.3   ~ "549.6-618.3",
  tostoptime >= 618.3 & tostoptime <  687   ~ "618.3-687"))

trains2 %>%
  dplyr::select(occupancy, fromstoptime.cat,tostoptime.cat) %>%
  gather(Variable, value, -occupancy)  %>%
  count(Variable, value, occupancy)%>%
  ggplot(., aes(value, n, fill = occupancy)) +   
  geom_bar(position = "dodge", stat="identity") +
  facet_wrap(~Variable, scales="free") +
   scale_fill_viridis(option= "plasma", discrete=TRUE) +
  scale_x_discrete(limits=c("less than 68.7", "68.7-137.4", "137.4-206.1", "206.1-274.8","274.8-343.5", "343.5-412.2", "412.2-480.9", "480.9-549.6", "549.6-618.3", "618.3-687"))+
  labs(x="Trains' Stop time", y="Value",
       title = "From and to station's stop time associations with the occupancy")+
   theme(axis.text.x = element_text(angle = 45, hjust = 1))

According to the plots above, there are differences among different stop time groups. For example, when the stop time of the from station was less than 137.4, there were obviously more low occupancy trips than high and medium, while the low, meidum and high occupancy trips were almost at the same amount when the stop time was more than 137.4; besides, when the stop time of the to station was between 274.8 to 343.5 there were more low occupancy trips.

- weather

The following plots show the relationship between weather and occupancy:

grid.arrange(
ggplot(trains)+
  geom_freqpoly(aes(Temperature, color = occupancy))+
  labs(title="Train occupancy, by temperature",
       x="Temperture", 
       y="Train Counts"),
ggplot(trains)+
  geom_freqpoly(aes(Humidity, color = occupancy))+
  labs(title="Train occupancy, by humidity",
       x="Humidity", 
       y="Train Counts"),
ggplot(trains)+
  geom_freqpoly(aes(Wind_Speed, color = occupancy))+
  labs(title="Train occupancy, by Wind_Speed",
       x="Wind_Speed", 
       y="Train Counts"),
ncol=1)

The first plot shows that generally, no matter what the temperature was, there were more low occupancy trips, but when the temperature was between 12.5 to 15, there were more high occupancy trips; The second plot shows that as the humidity increased, the number of high, medium and low occupancy trips all increased, so the plot does not neccessarily support the hypothesis that humidity will infuence the occupancy. The third plot shows that there was no strong relationship between wind speed and the occupancy, although when the windspeed was bwtween 4 to 7 and 17 to 23 there were apparently more low occupancy trips.

4. Modeling

This project uses Multinomial Logistic Regression to train the model. The dependent variable (y) is a three-category factor: train occupancy with low, medium and high three-level. When predicting the occupancy level by multinomial logistic regression, the model will calculate the possibility be each occupancy level and then return the occupancy level with the highest possibility. For example, for result type high-low ( the observed occupancy is high while the model wrongly predicted it as low), it means that the possibility that this model calculated to be “low” is higher than the possibility to be “high”, so the model returns the occupancy level as low. The following prediction result of the test set (testProbs_table) may help you understand the mechanism better.

trains <- 
  trains %>%
  mutate( mor.jam = factor(mor.jam),
        eve.jam = factor(eve.jam))

train_sf$veh <- as.factor(train_sf$veh)
train_sf$week <- as.factor(train_sf$week)

set.seed(1123)
trainIndex <- createDataPartition(
              y = paste(train_sf$occ2, train_sf$to, train_sf$veh_occ,  train_sf$OD_occ, train_sf$from, train_sf$dowt, p = .75),
              list = FALSE,
              times = 1)
yTrain <- train_sf[ trainIndex,]
yTest  <- train_sf[-trainIndex,]

m1 <- multinom(occ2 ~  week + dotw + veh + hour  + mor.jam + eve.jam + distance + from + to + OD_occ + veh_occ + stop_times_from + stop_times_to, data = yTrain, MaxNWts =10000000)

## # weights:  1980 (1318 variable)
## initial  value 1785.244969 
## iter  10 value 1534.439206
## iter  20 value 1087.797236
## iter  30 value 961.133783
## iter  40 value 887.819586
## iter  50 value 863.277509
## iter  60 value 849.160826
## iter  70 value 843.664578
## iter  80 value 841.572699
## iter  90 value 840.847925
## iter 100 value 840.535775
## final  value 840.535775 
## stopped after 100 iterations

testProbs <- data.frame(Observe = as.factor(yTest$occ2),
                        Predict = predict(m1, newdata=yTest, "class"),
                 pre = predict(m1, newdata=yTest, "probs"),
                 from = yTest$from,
                 to = yTest$to,
                 OD = yTest$OD,
                 week = yTest$week,
                 hour = yTest$hour,
                 interval60 = yTest$interval60,
                 date = yTest$date)
testProbs <- merge(l[c("geometry","OD")], testProbs, by="OD", all.y=T)
testProbs <- testProbs[c(7,8,9, 10,11,12,1,2,3,4,5,6,13)]
testProbs_table <- as.data.frame(testProbs)%>%
  round_df(3)%>%
  dplyr::select(from, to, date, Observe, Predict, pre.low, pre.high, pre.medium)
head(testProbs_table,10)

##       from      to       date Observe Predict pre.low pre.high pre.medium
## 1  8811007 8812005  2016/9/30     low     low   0.799    0.000      0.201
## 2  8811007 8812005 2016/10/28     low     low   0.836    0.000      0.164
## 3  8811007 8812005 2016/10/13     low     low   0.882    0.000      0.118
## 4  8811106 8811916 2016/10/14     low     low   1.000    0.000      0.000
## 5  8811106 8811916 2016/10/11     low     low   1.000    0.000      0.000
## 6  8811106 8811916  2016/10/4     low     low   1.000    0.000      0.000
## 7  8811106 8811916 2016/10/10     low     low   1.000    0.000      0.000
## 8  8811106 8811916  2016/10/6     low     low   1.000    0.000      0.000
## 9  8811189 8813003 2016/10/24  medium  medium   0.091    0.132      0.777
## 10 8811189 8813003  2016/7/30    high     low   0.734    0.036      0.230

5. Goodness of fit

5.1 Cost benefit analysis

Before the Accuracy and generalizability analysis, we have to evaluate the cost-benefit first. For the cost and benefit analysis, the prediction result will affect the profit of the train system since planners will adjust trains services based on the prediction.

As an algorithm for transportation organizations, we hope to help them reduce expenditures while optimizing the needs of transportation systems. We classify the nine prediction results to calculate the income and cost that each prediction result brings compared to the actual.

If we can successfully predict the route, we expect that the optimal algorithm can optimize the total revenue.We are going to make the following assumptions about the Income and Cost of the occupancy prediction change: - We assume the average ticket fee per passenger is €6. - We assume the capacity of each trip is 400 (80*5 coach). - We assume the cost on increasing of capacity or coach is €300. - We assume the about €300, which corresponds to 100 capacity increase. - For highly occupied trips, it’s possible that there is potential passengers. Therefore, we assumed 100 additional passengers will get into the train if we increase the capacity of that trip.

Based on these assumptions, we have established a profit calculation method for each prediction result and the observed comparison.(format: observed level-predict level)

For the low-low, there are low passengers, the income is stable, but the prediction will let the authority to decrease the cost, for example decline the frequency of the trip between two station. We use the 0-(-300) here as the profits per count.
For the low-Medium, there is no change in the cost and the income. The supply will not change for the medium results because the medium can not significant change the willing to add the capacity. We assume the authority will take no actions for this result. The profits here will not change.
For the low-high, there is no change in income, because the low demand of transits. The authority has willing to increasing the capacity of the route. So, the adding cost is 300. The profits here is 0-300.
For the medium-low, the authority will decrease the cost of capacity but the passenger is medium here which are not signifcant influenced by the decrease. The profits here is 0-(-300).
For the medium-medium, it is relatively balanced. There is no need to adjust the income and cost.
For the medium-high, there will be added cost for the predicted high demand. However, the income will not changed by the medium passengers supply.
For the high-high, the authority will decrease the capacity of the route. Due to its high supply, passengers may choose to abandon this route through congestion caused by the reduction of their capacity or due to time conflicts, which will result in a decrease in line income. We assume the income as -(6100) and the cost for reduce capacity as -300. The profits is -(6100)-(-300).
For the high-medium, the authority will not change the current arrange of capacity, High-demand lines are still the status quo. The profits will not change in this case.
For the high-high, the authority will choose to increasing the capacity of the route. The original high demand was released for the increasing of the supply(capacity). We assumed that more people will taking this route. The income will be 6100 and the cost for increasing the capacity will be 300. The profits will be 6100-300.

From this figure, it can be seen that the low-high, the medium-high, the high-low will cause a large loss to our overall prodits, especially the LOW-HIGH. This makes us focus on these types of predictions. We hope that these types of predictions can be made more accurately. The accuracy of predictions is influenced by reducing the error. The test of these types of predictions will be mentioned later.

testProbs_df <- as.data.frame(testProbs)
cost_benefit_table <-
   testProbs_df %>%
   na.omit() %>%
      count(Observe, Predict) %>%
      summarize(Low_Low = sum(n[Observe=='low' & Predict=='low']),
                Low_Medium = sum(n[Observe=='low' & Predict=='medium']),
                Low_High = sum(n[Observe=='low' & Predict=='high']),
                Medium_low = sum(n[Observe=='medium' & Predict=='low']),
                Medium_Medium = sum(n[Observe=='medium' & Predict=='medium']),
                Medium_High = sum(n[Observe=='medium' & Predict=='high']),
                High_Low = sum(n[Observe=='high' & Predict=='low']),
                High_Medium = sum(n[Observe=='high' & Predict=='medium']),
                High_High = sum(n[Observe=='high' & Predict=='high']),
                SUM = sum(n)
                ) %>%
       gather(Variable, Count)%>%
         mutate(Income =
               ifelse(Variable == "Low_Low", 0,
               ifelse(Variable == "Low_Medium", 0,
              ifelse(Variable == "Low_High", 0,
               ifelse(Variable == "Medium_low", 0,
              ifelse(Variable == "Medium_Medium", 0,
                ifelse(Variable == "Medium_High", 0,
               ifelse(Variable == "High_Low",'-(6*100)',
               ifelse(Variable == "High_Medium", 0,
               ifelse(Variable == "High_High", '6*100',
                      ifelse(Variable == "SUM",(-(6*100)+6*100) ,0))))))))))) %>%
           mutate(Cost =
               ifelse(Variable == "Low_Low", -300,
               ifelse(Variable == "Low_Medium", 0,
              ifelse(Variable == "Low_High", 300,
               ifelse(Variable == "Medium_low", -300,
              ifelse(Variable == "Medium_Medium", 0,
                ifelse(Variable == "Medium_High", 300,
               ifelse(Variable == "High_Low",-300,
               ifelse(Variable == "High_Medium", 0,
               ifelse(Variable == "High_High", 300,
                      ifelse(Variable == "SUM", 0,0))))))))))) %>%
               mutate('Profits/per count'=
               ifelse(Variable == "Low_Low", 0-(-300),
               ifelse(Variable == "Low_Medium", 0,
              ifelse(Variable == "Low_High", -300,
               ifelse(Variable == "Medium_low", 0-(-300),
              ifelse(Variable == "Medium_Medium", 0,
                ifelse(Variable == "Medium_High", 0-(300),
               ifelse(Variable == "High_Low",-6*100-(-300),
               ifelse(Variable == "High_Medium", 0,
               ifelse(Variable == "High_High", 6*100-(300),
                      ifelse(Variable == "SUM", ((0-(-300))+0-300+(0-(-300))-300+(-6*100-(-300))+6*100-(300)),0))))))))))) %>%
       mutate('Total_Profits' =
               ifelse(Variable == "Low_Low", Count * (0-(-300)),
               ifelse(Variable == "Low_Medium", Count * (0-0),
              ifelse(Variable == "Low_High", Count * (-(300)),
               ifelse(Variable == "Medium_low", Count * (0-(-300)),
              ifelse(Variable == "Medium_Medium", Count * (0-(0)),
                ifelse(Variable == "Medium_High", Count * (0-(300)),
               ifelse(Variable == "High_Low",Count*(-6*100-(-300)),
               ifelse(Variable == "High_Medium", Count*(0-(0)),
               ifelse(Variable == "High_High", Count*(6*100-(300)),
                ifelse(Variable == "SUM", '####',0)))))))))))%>%
    bind_cols(data.frame(Description = c(
              "We predicted low demand with the low demand observation",
              "We predicted low demand with the medium demand observation",
              "We predicted low demand with the high demand observation",
              "We predicted medium demand with the low demand observation",
              "We predicted medium demand with the meidum demand observation",
              "We predicted medium demand with the high demand observation",
              "We predicted high demand with the low demand observation",
              "We predicted high demand with the medium demand observation",
              "We predicted high demand with the high demand observation",
              "Total Profits")))%>%
  mutate(Observe_Predict=Variable)
cost_benefit_table <- cost_benefit_table[c(8,2,3,4,5,6,7)]
cost_benefit_table$Total_Profits <- as.numeric(cost_benefit_table$Total_Profits)
cost_benefit_table$Total_Profits[cost_benefit_table$Observe_Predict=="SUM"] <- sum(cost_benefit_table$Total_Profits, na.rm=T)
head(cost_benefit_table,10)

## # A tibble: 10 x 7
##    Observe_Predict Count Income  Cost `Profits/per co… Total_Profits
##    <chr>           <int> <chr>  <dbl>            <dbl>         <dbl>
##  1 Low_Low           151 0       -300              300         45300
##  2 Low_Medium         28 0          0                0             0
##  3 Low_High           43 0        300             -300        -12900
##  4 Medium_low         43 0       -300              300         12900
##  5 Medium_Medium      76 0          0                0             0
##  6 Medium_High        43 0        300             -300        -12900
##  7 High_Low           30 -(6*1…  -300             -300         -9000
##  8 High_Medium        40 0          0                0             0
##  9 High_High         127 6*100    300              300         38100
## 10 SUM               581 0          0                0         61500
## # … with 1 more variable: Description <fct>

kable(cost_benefit_table,
       caption = "Cost/Benefit Table")%>% kable_styling()

Cost/Benefit Table
Observe_Predict	Count	Income	Cost	Profits/per count	Total_Profits	Description
Low_Low	151	0	-300	300	45300	We predicted low demand with the low demand observation
Low_Medium	28	0	0	0	0	We predicted low demand with the medium demand observation
Low_High	43	0	300	-300	-12900	We predicted low demand with the high demand observation
Medium_low	43	0	-300	300	12900	We predicted medium demand with the low demand observation
Medium_Medium	76	0	0	0	0	We predicted medium demand with the meidum demand observation
Medium_High	43	0	300	-300	-12900	We predicted medium demand with the high demand observation
High_Low	30	-(6*100)	-300	-300	-9000	We predicted high demand with the low demand observation
High_Medium	40	0	0	0	0	We predicted high demand with the medium demand observation
High_High	127	6*100	300	300	38100	We predicted high demand with the high demand observation
SUM	581	0	0	0	61500	Total Profits

5.2 Accuracy

caret::confusionMatrix(testProbs$Observe, testProbs$Predict, 
                       positive = "low")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction low high medium
##     low    151   43     28
##     high    30  127     40
##     medium  43   43     76
## 
## Overall Statistics
##                                                 
##                Accuracy : 0.6092943             
##                  95% CI : (0.5682865, 0.6491851)
##     No Information Rate : 0.3855422             
##     P-Value [Acc > NIR] : < 0.00000000000000022 
##                                                 
##                   Kappa : 0.4073666             
##                                                 
##  Mcnemar's Test P-Value : 0.1332086             
## 
## Statistics by Class:
## 
##                      Class: low Class: high Class: medium
## Sensitivity           0.6741071   0.5962441     0.5277778
## Specificity           0.8011204   0.8097826     0.8032037
## Pos Pred Value        0.6801802   0.6446701     0.4691358
## Neg Pred Value        0.7966574   0.7760417     0.8377088
## Prevalence            0.3855422   0.3666093     0.2478485
## Detection Rate        0.2598967   0.2185886     0.1308090
## Detection Prevalence  0.3820998   0.3390706     0.2788296
## Balanced Accuracy     0.7376138   0.7030134     0.6654907

accu <- as.data.frame(table(testProbs$Observe,testProbs$Predict))%>%
  mutate(Observe = Var1,
         Predict = Var2,
         count = Freq,
         result_type = paste (Observe, Predict, sep= "-"))%>%
  dplyr::select(-Var1, -Var2,-Freq,)
accu$percent <- as.numeric(c("68.7", "16.6", "23.2", "15.6", "62.8", "21.3", "15.6", "20.6","55.5"))
accu$Predict <- factor(accu$Predict, level=c("low","medium","high"))
accu$Observe <- factor(accu$Observe, level=c("low","medium","high"))

an image caption Source: 3*3Confusion Matrix & corresponding profit

5.3 Final Model Adjustment

In this part, we will adjust our model based on the above cost-benefit and accuracy analysis. As we mentioned before, the cost of error type Low-High (Observed Low but predict as High) is considerably high, therefore we intend to modify our model by decrease the type of mistake and increase the general profit at the same time. We found that for this error, most of the case is that, the possibility of being high occupancy is only slightly higher than the possibility of being low occupancy (the difference is less than 0.2), and the model returns the level as high by default, while in reality, it should be low. Finally, we manually changed those prediction result that it is predicted as HIGH but the difference of possibility between being high and low is less than 0.26, we changed them from high to low. The reason is that high occupancy will result in additional operation fees since the planner will increase the frequency or capacity of the train to accommodate more passengers, while if this “high occupancy” is a wrong prediction, the additional operation fee will be a waste. Although the adjustment also results in few other wrong predictions, the whole profit of the model reaches the highest point at threshold 0.26.

This is the changed cost-benefit table

### adjust2
testProbs2 <- testProbs
testProbs2$Predict2 <- factor(ifelse(testProbs2$Predict=="high" & testProbs2$pre.high-testProbs2$pre.low<=0.26, "low", testProbs2$Predict))
testProbs2$Predict2 <- factor(ifelse(testProbs2$Predict2=="1"|testProbs2$Predict2=="low","low", 
                          ifelse(testProbs2$Predict2=="2","high", "medium")))
testProbs2$Predict <- testProbs2$Predict2
testProbs_df2 <- as.data.frame(testProbs2)
cost_benefit_table2 <-
   testProbs_df2 %>%
   na.omit() %>%
      count(Observe, Predict) %>%
      summarize(Low_Low = sum(n[Observe=='low' & Predict=='low']),
                Low_Medium = sum(n[Observe=='low' & Predict=='medium']),
                Low_High = sum(n[Observe=='low' & Predict=='high']),
                Medium_low = sum(n[Observe=='medium' & Predict=='low']),
                Medium_Medium = sum(n[Observe=='medium' & Predict=='medium']),
                Medium_High = sum(n[Observe=='medium' & Predict=='high']),
                High_Low = sum(n[Observe=='high' & Predict=='low']),
                High_Medium = sum(n[Observe=='high' & Predict=='medium']),
                High_High = sum(n[Observe=='high' & Predict=='high']),
                SUM = sum(n)
                ) %>%
       gather(Variable, Count)%>%
       mutate(Revenue =
               ifelse(Variable == "Low_Low", Count * (0-(-300)),
               ifelse(Variable == "Low_Medium", Count * (0-0),
              ifelse(Variable == "Low_High", Count * (-(300)), 
               ifelse(Variable == "Medium_low", Count * (0-(-300)), 
              ifelse(Variable == "Medium_Medium", Count * (0-(0)), 
                ifelse(Variable == "Medium_High", Count * (0-(300)), 
               ifelse(Variable == "High_Low",Count*(-6*100-(-300)),
               ifelse(Variable == "High_Medium", Count*(0-(0)),
               ifelse(Variable == "High_High", Count*(6*100-(300)),
                ifelse(Variable == "SUM", "i don't know",0)))))))))))%>%
  bind_cols(data.frame(Description = c(
              "We predicted low demand with the low demand observation",
              "We predicted low demand with the medium demand observation",
              "We predicted low demand with the high demand observation",
              "We predicted medium demand with the low demand observation",
              "We predicted medium demand with the meidum demand observation",
              "We predicted medium demand with the high demand observation",
              "We predicted high demand with the low demand observation",
              "We predicted high demand with the medium demand observation",
              "We predicted high demand with the high demand observation",
               "Total profit")))
cost_benefit_table2$Revenue <- as.numeric(cost_benefit_table2$Revenue)
cost_benefit_table2$Revenue[cost_benefit_table2$Variable=="SUM"] <- sum(cost_benefit_table2$Revenue, na.rm=T)

kable(cost_benefit_table2,
       caption = "Cost/Benefit Table")%>% kable_styling()

Cost/Benefit Table
Variable	Count	Revenue	Description
Low_Low	164	49200	We predicted low demand with the low demand observation
Low_Medium	28	0	We predicted low demand with the medium demand observation
Low_High	30	-9000	We predicted low demand with the high demand observation
Medium_low	55	16500	We predicted medium demand with the low demand observation
Medium_Medium	76	0	We predicted medium demand with the meidum demand observation
Medium_High	31	-9300	We predicted medium demand with the high demand observation
High_Low	49	-14700	We predicted high demand with the low demand observation
High_Medium	40	0	We predicted high demand with the medium demand observation
High_High	108	32400	We predicted high demand with the high demand observation
SUM	581	65100	Total profit

This diagram shows the accuracy changes between origin model and adjusted model.

accu2 <- as.data.frame(table(testProbs2$Observe,testProbs2$Predict))%>%
  mutate(Observe = Var1,
         Predict = Var2,
         count = Freq,
         result_type = paste (Observe, Predict, sep= "-"))%>%
  dplyr::select(-Var1, -Var2,-Freq,)
accu2$percent <- as.numeric(c("8.2", "53.8", "15.2", "76.1", "25.6", "29.3", "15.6", "20.6","55.5"))
accu2$Predict <- factor(accu2$Predict, level=c("low","medium","high"))
accu2$Observe <- factor(accu2$Observe, level=c("low","medium","high"))

an image caption Source: Confusion Matrix Comparision between origin and asjusted

5.3 Generalizability

We want to explore does our model generalize through space and time panel. To answer this question, we’ll plot the error geographically and summarize it by time variables.

It is notable that, we only consider 5 result types whose profit is less or equal to 0 as the wrong prediction. Additionally, when predicting the occupancy level by multinomial logistic regression, the model will calculate the possibility be each occupancy level and then return the occupancy level with the highest possibility. For example, for result type high-low ( the observed occupancy is high while the model wrongly predicted it as low), it means that the possibility that this model calculated to be “low” is higher than the possibility to be “high”, so the model returns the occupancy level as low.

Since there is no traditional Mean Absolute Error (MAE) in our model, we regard the difference between the possibility of observed occupancy level and the possibility of predicted occupancy as the error (p.observed -p.predict). Don’t forget that when we talk about the error we only viewing the result types whose profit is <=0 as the “wrong prediction”. The following “wrong prediction table” only selected the 5 types of wrong prediction trips which may help you to understand the mechanism better.

main_error <- testProbs%>%
  mutate(type=paste(Observe, Predict, sep= "-"))%>%
  filter(Predict!=Observe)%>%
  filter(type == "low-high"|type == "medium-high"|type == "high-low")%>%
  mutate(high_m_low = pre.high-pre.low,
         high_m_med = pre.high-pre.medium,
         low_m_high = pre.low-pre.high)

high_low <- subset(main_error, main_error$type=="high-low")%>%
  mutate(error = low_m_high)%>%
  dplyr::select(-c(15,16,17))

low_high <- subset(main_error, main_error$type=="low-high")%>%
  mutate(error = high_m_low)%>%
  dplyr::select(-c(15,16,17))

medium_high <- subset(main_error, main_error$type=="medium-high")%>%
  mutate(error = high_m_med)%>%
  dplyr::select(-c(15,16,17))

all_error <- testProbs%>%
  mutate(type=paste(Observe, Predict, sep= "-"))%>%
  filter(Observe!=Predict)

low_medium <- all_error%>%
  filter(type=="low-medium")%>%
  mutate(error = pre.medium-pre.low)

High_medium <- all_error%>%
  filter(type=="high-medium")%>%
  mutate(error = pre.medium-pre.high)
  
all_error <- rbind(high_low,low_high,medium_high, low_medium,High_medium)
wrong_pred_5_type <- as.data.frame(all_error)%>%
  dplyr::select(Observe, Predict, pre.low, pre.medium, pre.high, error)%>%
  round_df(3)
head(wrong_pred_5_type,100)

##     Observe Predict pre.low pre.medium pre.high error
## 1      high     low   0.734      0.230    0.036 0.698
## 2      high     low   0.460      0.242    0.298 0.163
## 3      high     low   0.502      0.229    0.269 0.233
## 4      high     low   0.574      0.273    0.153 0.421
## 5      high     low   0.522      0.262    0.216 0.307
## 6      high     low   0.415      0.384    0.201 0.213
## 7      high     low   0.440      0.285    0.275 0.165
## 8      high     low   0.576      0.284    0.140 0.435
## 9      high     low   0.548      0.043    0.409 0.139
## 10     high     low   0.608      0.000    0.392 0.216
## 11     high     low   0.907      0.039    0.054 0.853
## 12     high     low   0.555      0.106    0.339 0.216
## 13     high     low   0.456      0.364    0.180 0.276
## 14     high     low   0.591      0.342    0.068 0.523
## 15     high     low   0.593      0.000    0.407 0.186
## 16     high     low   0.442      0.420    0.138 0.304
## 17     high     low   0.469      0.243    0.288 0.181
## 18     high     low   0.718      0.172    0.110 0.608
## 19     high     low   0.520      0.000    0.480 0.041
## 20     high     low   0.721      0.167    0.111 0.610
## 21     high     low   0.493      0.210    0.297 0.195
## 22     high     low   0.538      0.246    0.216 0.322
## 23     high     low   0.463      0.361    0.176 0.287
## 24     high     low   0.728      0.239    0.033 0.695
## 25     high     low   0.407      0.282    0.312 0.095
## 26     high     low   0.580      0.128    0.292 0.288
## 27     high     low   0.932      0.001    0.067 0.865
## 28     high     low   0.433      0.134    0.433 0.001
## 29     high     low   0.999      0.000    0.001 0.998
## 30     high     low   0.933      0.037    0.030 0.903
## 31      low    high   0.380      0.236    0.384 0.004
## 32      low    high   0.169      0.337    0.494 0.325
## 33      low    high   0.304      0.141    0.555 0.251
## 34      low    high   0.115      0.049    0.836 0.721
## 35      low    high   0.308      0.093    0.600 0.292
## 36      low    high   0.242      0.250    0.508 0.267
## 37      low    high   0.489      0.000    0.511 0.022
## 38      low    high   0.104      0.422    0.474 0.370
## 39      low    high   0.251      0.271    0.479 0.228
## 40      low    high   0.227      0.330    0.443 0.216
## 41      low    high   0.227      0.330    0.443 0.216
## 42      low    high   0.466      0.000    0.534 0.068
## 43      low    high   0.146      0.389    0.465 0.319
## 44      low    high   0.214      0.298    0.487 0.273
## 45      low    high   0.133      0.354    0.513 0.379
## 46      low    high   0.078      0.389    0.533 0.455
## 47      low    high   0.141      0.211    0.648 0.507
## 48      low    high   0.130      0.295    0.575 0.445
## 49      low    high   0.107      0.065    0.828 0.721
## 50      low    high   0.000      0.000    1.000 1.000
## 51      low    high   0.436      0.089    0.475 0.039
## 52      low    high   0.170      0.199    0.631 0.462
## 53      low    high   0.148      0.248    0.604 0.457
## 54      low    high   0.160      0.175    0.665 0.506
## 55      low    high   0.334      0.000    0.666 0.333
## 56      low    high   0.122      0.184    0.694 0.572
## 57      low    high   0.007      0.021    0.973 0.966
## 58      low    high   0.440      0.000    0.560 0.121
## 59      low    high   0.155      0.020    0.825 0.671
## 60      low    high   0.217      0.163    0.620 0.404
## 61      low    high   0.120      0.077    0.803 0.683
## 62      low    high   0.162      0.000    0.838 0.676
## 63      low    high   0.262      0.145    0.593 0.331
## 64      low    high   0.188      0.379    0.433 0.245
## 65      low    high   0.499      0.000    0.501 0.002
## 66      low    high   0.237      0.279    0.484 0.248
## 67      low    high   0.162      0.094    0.744 0.582
## 68      low    high   0.185      0.244    0.571 0.385
## 69      low    high   0.170      0.310    0.519 0.349
## 70      low    high   0.320      0.000    0.680 0.360
## 71      low    high   0.270      0.128    0.602 0.332
## 72      low    high   0.490      0.008    0.501 0.011
## 73      low    high   0.092      0.233    0.674 0.582
## 74   medium    high   0.380      0.236    0.384 0.148
## 75   medium    high   0.000      0.219    0.781 0.561
## 76   medium    high   0.168      0.413    0.419 0.006
## 77   medium    high   0.037      0.032    0.931 0.899
## 78   medium    high   0.349      0.135    0.516 0.381
## 79   medium    high   0.363      0.241    0.396 0.156
## 80   medium    high   0.086      0.179    0.735 0.555
## 81   medium    high   0.238      0.324    0.437 0.113
## 82   medium    high   0.000      0.193    0.807 0.614
## 83   medium    high   0.189      0.404    0.407 0.003
## 84   medium    high   0.206      0.244    0.550 0.306
## 85   medium    high   0.087      0.303    0.611 0.308
## 86   medium    high   0.146      0.389    0.465 0.077
## 87   medium    high   0.368      0.237    0.395 0.158
## 88   medium    high   0.258      0.316    0.426 0.110
## 89   medium    high   0.336      0.044    0.620 0.575
## 90   medium    high   0.008      0.000    0.992 0.992
## 91   medium    high   0.192      0.335    0.473 0.138
## 92   medium    high   0.079      0.426    0.496 0.070
## 93   medium    high   0.409      0.146    0.444 0.298
## 94   medium    high   0.168      0.184    0.648 0.465
## 95   medium    high   0.194      0.236    0.570 0.334
## 96   medium    high   0.326      0.151    0.523 0.372
## 97   medium    high   0.241      0.119    0.640 0.522
## 98   medium    high   0.001      0.281    0.718 0.436
## 99   medium    high   0.171      0.189    0.640 0.451
## 100  medium    high   0.000      0.192    0.808 0.615

- Spatial error

We make two figures in this part. The first figure is the overall spatial distribution of the five types of errors. The second figure is to classify them according to the type of error and check their different spatial distributions.

As can be seen from the first figure, the distribution characteristics of this error are not significant. It seems that the route between the stations closer to the center area has a higher error. The routes of the marginal area connect with the center area and the lines at the central station have lower errors. But there is a flaw here, we cannot tell each type to provide the information to optimize the model.

The second figure is aimed to check the error for different prediction types. It is more intuitive and effective. Because classification is used to study the effect of different prediction types on overall benefits, we need to look at the distribution of each error type in more detail. Combining with the cost and benefits analysis, it can be seen that for the two types: low-high and high-low where our overall profits impact is large, their errors seem to be higher. This proves that the correction of such errors should be the focus of modeling adjustments. The high-medium and the low-medium types have a small impact on our overall profits. Although the medium-high will increase the profits, the error here is not that large than the low-high and high-low.

ggplot(all_error)+
#geom_sf(data=rail,inherit.aes = FALSE, na.rm=T, col="black", lwd=0.2 ,alpha=0.2)+
geom_sf(data=stations, size = 1, inherit.aes = FALSE, col="grey" )+
geom_sf(data=all_error, aes(colour = error),lwd=1)+
    scale_colour_viridis(direction = -1, discrete = F, option = "C")+
     #facet_wrap(~type)+
    ylim(min(49.5), max(51.5))+ #lat
    xlim(min(2.5), max(6.1))+
   labs(title="Spatial Error ",
        subtitle="Only consider the result types whose profit<=0 as error")

ggplot(all_error)+
#geom_sf(data=rail,inherit.aes = FALSE, na.rm=T, col="black", lwd=0.2 ,alpha=0.2)+
geom_sf(data=stations, size = 1, inherit.aes = FALSE, col="grey" )+
geom_sf(data=all_error, aes(colour = error),lwd=1)+
    scale_colour_viridis(direction = -1, discrete = F, option = "C")+
     facet_wrap(~type)+
    ylim(min(49.5), max(51.5))+ #lat
    xlim(min(2.5), max(6.1))+
   labs(title="Spatial Error by 5 error types",
        subtitle="Only consider the result types whose profit<=0 as error")

- Time error

We also summarize the trend of errors from the perspective of time. It is useful to understand whether a given model generalizes to the weeks or different hours.

The first line figure checks the overall error trends by hour intervals. It can be seen that the overall fluctuation trend is relatively stable. The fluctuation range was not so great around October 15. In addition, we make a distinction between different 5 types. It is notable to indicate that the high-low and low-high have a higher error fluctuation compared with the low-medium and medium-low.

The second figure checks the error by weeks. It can be seen that the 38th week have a low possibility error. When we make a distinction between different 5 types, it can be seen that especially the high-low in 43rd week, low-high in 39th week need to be improved to increase the general profit. They have a significant impact on the overall profits.

all_error_df <- as.data.frame(all_error)
#hour
grid.arrange(
all_error %>%
  filter(interval60>=as.Date("2016-09-15"))%>%
    ggplot(aes(interval60, error)) + 
      geom_line(size = 0.5)+
   labs(title = "Errors by hour",
        subtitle = "Only consider the result types whose profit<=0 as error")+Theme2,
all_error %>%
  filter(interval60>=as.Date("2016-09-15"))%>%
    ggplot(aes(interval60, error, colour=type)) + 
      geom_line(size = 0.5)+
  labs(title = "Errors by 5 error types and hour",
        subtitle = "Only consider the result types whose profit<=0 as error")+Theme2,
ncol=1)

#week
grid.arrange(
  all_error_df %>%
  filter(week=="38"|week=="39"|week=="40"|week=="41"|week=="42"|week=="43")%>%
  ggplot(aes(x=week, y=error)) + 
    geom_bar(position = "dodge", stat="identity") +
     scale_fill_manual(values = palette5) +
    labs(title = "Errors by week",
         subtitle = "Only consider the result types whose profit<=0 as error")+Theme2,
all_error_df %>%
  filter(week=="38"|week=="39"|week=="40"|week=="41"|week=="42"|week=="43")%>%
  dplyr::select(week, type, error) %>%
  gather(Variable, error, -type, -week) %>%
  ggplot(aes(x=week, y=error)) + 
    geom_bar(aes(fill = type), position = "dodge", stat="identity") +
     scale_fill_manual(values = palette5) +
    labs(title = "Errors by 5 error types and week",
        subtitle = "Only consider the result types whose profit<=0 as error")+Theme2,
ncol=1)

6. Conclusion and Further improvement

In conclusion, this project developed a train occupancy prediction model, which can play as a tool for transportation planners to re-distribute the train resources and make the provision in line with the demand. Our prediction is generalized through different space and time. It’s clear that few prediction models are perfect, however, in our model we tried the best to decrease the cost of the error by adjusting part of the predicted high occupancy. As for further improvement, we need to go detail and find more related variables to increase accuracy. Additionally, in this project, we only focused on Low-High type of error, but in reality, we should focus on all types of error since they all related to the final profit.

Train Occupancy Prediction

JingZhang ZiruiChen

11/24/2019