Commuting Mode Choice based on Distance to the next Train Station

Author

Marijana Maric and Pablo Bischofberger

Code
# You can set chunk options individually per code chunk, as was done with this
# code chunk.

# echo: false           hides the code from the generated output
# warning: false        hides warnings from the generated output
# message: false        hides messages from the generated output

# To apply the setting for all code chunks, add the options to the yaml header of the document (in between the ---) without the preceding "#|".

# Loading all needed libraries 
knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
library(tidyr)
library(sf)
library(sp)
library(tmap)
library(tmaptools)
library(geosphere)
library(purrr)
library(cowplot)
library(rnaturalearth)
library(rnaturalearthdata)
library(gt)
library(knitr)
library(rstatix)
library(nnet)
library(effects)
library(broom)
Code
# Write all needed functions
# Function to get trips for a single user
#
# Sorts one user's records by time, splits them into trips and annotates
# every record with the transport mode of the neighbouring trips.
# A new trip starts whenever `transport_mode` differs from the previous
# record or the gap to the previous record exceeds `max_gap_mins` minutes.
#
# Arguments:
#   df            data frame holding the records of ONE user; must contain
#                 the columns `datetime` and `transport_mode`.
#   max_gap_mins  gap (in minutes) between consecutive records above which a
#                 new trip is started. Defaults to 15, the value that was
#                 previously hard-coded.
# Returns: `df` sorted by `datetime`, ungrouped, with the added columns
#   time_diff            minutes since the previous record (0 for the first)
#   trip_id              running trip counter, starting at 0
#   next_transport_mode  mode of the record right after the trip (NA if none)
#   last_transport_mode  mode of the record right before the trip (NA if none)
get_trips <- function(df, max_gap_mins = 15) {
  df %>%
    arrange(datetime) %>%
    mutate(
      # lag()-based difference also works for zero-row input, unlike
      # c(0, difftime(...)) which always has at least length one.
      time_diff = if_else(
        row_number() == 1,
        0,
        as.numeric(difftime(datetime, lag(datetime), units = "mins"))
      ),
      # Every mode change or long gap bumps the counter by one.
      trip_id = cumsum(
        transport_mode != lag(transport_mode, default = first(transport_mode)) |
          time_diff > max_gap_mins
      ),
      # Record-level neighbours; reduced to trip level below.
      next_transport_mode_tmp = lead(transport_mode),
      last_transport_mode_tmp = lag(transport_mode)
    ) %>%
    group_by(trip_id) %>%
    mutate(
      # The trip's successor is the record after its last row, the
      # predecessor is the record before its first row.
      next_transport_mode = last(next_transport_mode_tmp),
      last_transport_mode = first(last_transport_mode_tmp)
    ) %>%
    # Do not leak the trip grouping to callers.
    ungroup() %>%
    select(-next_transport_mode_tmp, -last_transport_mode_tmp)
}

# Function to calculate distances for one user
#
# Adds the distance between consecutive records plus per-trip running and
# total distance/time, and flags trips that touch a train leg.
#
# Arguments:
#   data  data frame with the records of ONE user, already ordered by
#         `datetime` (the point-to-point distances are computed in the
#         incoming row order). Must contain `lon_x`, `lat_y`, `datetime`,
#         `time_diff`, `trip_id`, `transport_mode`, `last_transport_mode`
#         and `next_transport_mode`.
# Returns: `data` ordered by `trip_id` and `datetime`, ungrouped, with the
#   added columns `distance`, `cumulative_distance`, `cumulative_time`,
#   `elapsed_time`, `total_trip_distance`, `has_train_before`,
#   `has_train_after` and `has_train_somewhere`.
#
# NOTE(review): the first record of every trip carries the `time_diff` and
# `distance` measured from the last record of the PREVIOUS trip, so long
# pauses between trips end up in `elapsed_time` / `cumulative_time`.
# Confirm whether this is intended.
calculate_distances <- function(data) {
  n_points <- nrow(data)

  if (n_points < 2) {
    # Nothing to measure between: avoid handing empty coordinate matrices
    # to the distance routine.
    distances <- rep(0, n_points)
  } else {
    # Convert the data to sf object for spatial operations
    coordinates <- st_as_sf(data, coords = c("lon_x", "lat_y"), crs = 4326)
    coordinates_matrix <- st_coordinates(coordinates)
    # drop = FALSE keeps a two-column matrix even when only one pair of
    # points remains. The first record has no predecessor, hence the 0.
    distances <- c(
      0,
      distVincentySphere(
        coordinates_matrix[-1, , drop = FALSE],
        coordinates_matrix[-n_points, , drop = FALSE]
      )
    )
  }
  data$distance <- distances

  # Calculate cumulative distance and time for each trip
  data %>%
    arrange(trip_id, datetime) %>%
    group_by(trip_id) %>%
    mutate(
      cumulative_distance = cumsum(distance),
      cumulative_time = cumsum(time_diff),
      elapsed_time = sum(time_diff),
      total_trip_distance = sum(distance)
    ) %>%
    ungroup() %>%
    mutate(
      # These are NA where the neighbouring mode is NA (first/last trip).
      has_train_before = last_transport_mode == 'Train',
      has_train_after = next_transport_mode == 'Train',
      has_train_somewhere = transport_mode == "Train" |
        next_transport_mode == 'Train' |
        last_transport_mode == 'Train'
    )
}

# Function to visualize a user's trajectory
#
# Shows the requested trip of one user together with the trips numbered
# directly before and after it on an interactive map, coloured by mode.
#
# Arguments:
#   user_id  id of the user to show.
#   trip_id  id of the trip to show; trip_id - 1 and trip_id + 1 are included.
#   data     data frame with `user_id`, `trip_id`, `lon_x`, `lat_y` and
#            `transport_mode` columns.
# Returns: a tmap object, or the string "No data for this user and trip"
#   when nothing matches. Switches tmap to "view" mode as a side effect.
visualize_trajectory <- function(user_id, trip_id, data) {
  # The arguments share their names with columns, so they are injected
  # with !! to compare column against argument value.
  wanted_trips <- c(trip_id - 1, trip_id, trip_id + 1)
  specific_data <- dplyr::filter(
    data,
    user_id == !!user_id,
    trip_id %in% !!wanted_trips
  )

  # Bail out early when the selection is empty
  if (nrow(specific_data) == 0) {
    return("No data for this user and trip")
  }

  # Points in WGS84 longitude/latitude
  data_sf <- sf::st_as_sf(
    specific_data,
    coords = c("lon_x", "lat_y"),
    crs = 4326
  )

  # Interactive map mode
  tmap::tmap_mode("view")

  map_title <- paste("User ID:", user_id, "Trip ID:", trip_id)

  tmap::tm_shape(data_sf) +
    tmap::tm_dots(col = "transport_mode") +
    tmap::tm_basemap(server = "OpenStreetMap") +
    tmap::tm_layout(title = map_title)
}

# Translate a measure name into an axis label.
#
# Arguments:
#   measure  name of the measure ("distance", "total_time" or
#            "direct_distance").
# Returns: the matching label with its unit; any other value is returned
#   unchanged.
generate_ylabel <- function(measure) {
  axis_labels <- c(
    distance = "Distance (m)",
    total_time = "Time (min)",
    direct_distance = "Direct distance (m)"
  )

  # Known measure: look its label up by name
  if (measure %in% names(axis_labels)) {
    return(axis_labels[[as.character(measure)]])
  }

  # Unknown measure: fall back to the input itself
  measure
}

# Function to create a boxplot
#
# Draws one box per transport mode for the column named by `value_column`,
# overlaid with the jittered raw points and a black dot at the group mean.
#
# Arguments:
#   data             data frame with a `transport_mode` column and the
#                    column named by `value_column`.
#   value_column     name (string) of the column shown on the y axis.
#   remove_outliers  if TRUE, rows above Q3 + 1.5 * IQR of their transport
#                    mode are dropped (only the upper fence is applied).
#   add_title        if TRUE, add the title
#                    "Boxplot of <value_column> by Transport Mode".
#   x_label_visible  if TRUE, label the x axis "Transport Mode", else leave
#                    it blank.
#   fixed_order      if TRUE, order the modes Walk, Bike, Bus, Tram, Car
#                    (any other mode becomes NA); if FALSE, order the modes
#                    by the mean of `value_column`.
# Returns: a ggplot object.
create_boxplot <- function(data, value_column, remove_outliers = FALSE, add_title = FALSE, x_label_visible = TRUE, fixed_order = FALSE) {
  if (remove_outliers) {
    # .data[[...]] only ever resolves to a column of `data`; get() could
    # silently pick up an unrelated object of the same name.
    data <- data %>%
      group_by(transport_mode) %>%
      mutate(
        Q1 = quantile(.data[[value_column]], 0.25, na.rm = TRUE),
        Q3 = quantile(.data[[value_column]], 0.75, na.rm = TRUE),
        IQR = Q3 - Q1
      ) %>%
      filter(.data[[value_column]] <= (Q3 + 1.5 * IQR)) %>%
      ungroup()
  }

  if (fixed_order) {
    # Separate name so the logical argument is not overwritten.
    mode_levels <- c("Walk", "Bike", "Bus", "Tram", "Car")
    data$transport_mode <- factor(data$transport_mode, levels = mode_levels)
  } else {
    # Order by mean
    data$transport_mode <- reorder(
      data$transport_mode,
      data[[value_column]],
      FUN = mean
    )
  }

  p <- ggplot(data, aes(x = transport_mode, y = !!sym(value_column), fill = transport_mode)) +
    geom_boxplot(outlier.shape = NA) +  # Outliers are already shown by the jitter layer
    geom_jitter(width = 0.3, size = 1, alpha = 0.5) +
    stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "black", fill = "white") +
    scale_fill_brewer(palette = "Set3") +
    labs(y = generate_ylabel(value_column)) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
      axis.title.x = element_text(face = "bold", size = 12),
      axis.title.y = element_text(face = "bold", size = 12),
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "none"
    )

  if (add_title) {
    p <- p + labs(title = paste("Boxplot of", value_column, "by Transport Mode"))
  }

  if (x_label_visible) {
    p <- p + labs(x = "Transport Mode")
  } else {
    p <- p + labs(x = "")
  }

  p
}
  
# density plot
#
# Draws overlapping, semi-transparent density curves of one column, with
# one curve per value of a grouping column.
#
# Arguments:
#   data       data frame containing both columns.
#   x_var      name (string) of the column whose density is drawn.
#   group_var  name (string) of the column used for the fill colour.
#   title      plot title.
# Returns: a ggplot object.
create_density_plot <- function(data, x_var = "distance", group_var = "transport_mode", title = "Density Plot") {
  # aes_string() is deprecated; .data[[name]] is the supported way to map
  # a column given as a string. ggplot2 is attached at the top of the
  # file, so no library() call is needed here.
  ggplot(data, aes(x = .data[[x_var]], fill = .data[[group_var]])) +
    geom_density(alpha = 0.5) +
    labs(title = title, x = x_var, y = "Density", fill = group_var) +
    theme_minimal()
}

# Stack one boxplot per measure on top of each other under a shared title.
#
# Arguments:
#   data             data frame passed on to create_boxplot().
#   value_columns    character vector of column names, one boxplot each.
#   remove_outliers  passed on to create_boxplot().
#   title            common title drawn above the stacked plots.
# Returns: a cowplot drawing containing all boxplots in a single column.
#   Only the bottom plot carries the x-axis label.
combined_boxplot <- function(data, value_columns, remove_outliers = FALSE, title = "Combined Boxplots") {
  # 1:length(x) would iterate over c(1, 0) for empty input; fail clearly.
  if (length(value_columns) == 0) {
    stop("`value_columns` must name at least one column.", call. = FALSE)
  }

  last_index <- length(value_columns)

  # One boxplot per column, all with the fixed transport-mode order
  plot_list <- lapply(seq_along(value_columns), function(i) {
    create_boxplot(
      data,
      value_columns[i],
      remove_outliers = remove_outliers,
      add_title = FALSE,
      x_label_visible = i == last_index,
      fixed_order = TRUE
    )
  })

  # Combine the plots using cowplot
  combined_plot <- cowplot::plot_grid(plotlist = plot_list, ncol = 1)

  # Add a common title to the combined plot
  cowplot::ggdraw(combined_plot) +
    cowplot::draw_label(title, fontface = 'bold', size = 14, x = 0.5, y = 0.98)
}

Background and Research Goals

In urban areas, understanding the factors that influence individuals’ transport mode choices is crucial for developing efficient and sustainable mobility systems (European Environment Agency (EEA) 2019). The proximity to a train station has been identified as a significant factor in determining transport mode preferences (Wibowo and Chalermpong 2010). Shorter distances to train stations may encourage walking and public transportation use, while longer distances may lead to increased reliance on private vehicles or alternative modes of transport (Tennøy, Knapskog, and Wolday 2022).

The research goals of this study are twofold. First, we aim to investigate the impact of distance to the nearest train station on transport mode choices. By analyzing real-world data and employing statistical methods, such as ANOVA and regression analysis, we seek to uncover patterns and relationships between distance and transport mode preferences.

Second, we aim to examine the role of different distance measurement methods in analyzing the relationship between distance and transport mode choice. We will explore the effectiveness of statistical methods in uncovering insights and evaluate how the choice of distance measurement influences the analysis.

To address these objectives and shed light on the relationship between distance to the nearest train station and transport mode choices, the following research questions will guide our investigation:

  1. How does the distance to the nearest train station influence individuals’ transport mode preferences?
  2. Can statistical methods, such as statistical tests and regression analysis effectively analyze the relationship between distance and transport mode choice, and provide valuable insights?
  3. How does the choice of distance measurement method influence the analysis of the relationship between distance and transport mode?

Data and Methods

Dataset

The data comprises GPS trajectory data that was collected from nine individuals between the beginning of April 2023 and the conclusion of May 2023, who were tracking their movement with the POSMO application. The Posmo Project app automatically records all activities that each individual undertakes on foot, by bike, public transport or by car (Genossenschaft Posmo Schweiz 2022). The initial data set follows a specific structure and includes the following variables:

  • user_id: A unique identifier for each individual in the study. (chr)

  • datetime: The date and time of each recorded location. (dttm)

  • weekday: The day of the week. (chr)

  • transport_mode: The mode of transportation used by the individual (e.g., walking, biking, car, bus or tram).(chr)

  • lon_x and lat_y: The longitude and latitude of each recorded location.(dbl)

Data Preparation

The process of data cleansing, a key element in any research project, is optimized using well-defined steps. Initially, data from CSV files are compiled into a combined data frame. Subsequently, the data sets are converted into appropriate data types to ensure consistency with the following steps of the analysis. The transport mode column is modified to combine similar types like ‘Run’ and ‘Walk’. This step helps to increase the intensity of subsequent analyses. This is followed by a rigorous data cleaning process to detect and delete any instances of NA values or empty strings, ensuring data integrity and completeness. A summary of the cleaned data is shown in Table 1.

Code
# List all CSV files in the directory
# `pattern` is a regular expression, not a shell glob: match the extension
# at the end of the file name.
file_list <- list.files(path = "data", pattern = "\\.csv$", full.names = TRUE)
stopifnot("No CSV files found in directory 'data'" = length(file_list) > 0)

# Read each CSV file into a data frame and concatenate them together
posmo <- do.call(rbind, lapply(file_list, function(x) read.csv(x, stringsAsFactors = FALSE)))

# Convert the datetime column to POSIXct type
posmo$datetime <- as.POSIXct(posmo$datetime, format = "%Y-%m-%dT%H:%M:%S", tz = "GMT")

# Convert user_id to a factor and then to numeric
# (replaces the original ids by consecutive numbers 1, 2, ...)
posmo$user_id <- as.numeric(factor(posmo$user_id))

# Keep only the necessary columns and merge similar modes:
# 'Run' is counted as 'Walk' and 'Funicular' as 'Tram'
posmo <- posmo |>
  arrange(user_id, datetime) |>
  select(user_id, datetime, transport_mode, lon_x, lat_y) |>
  mutate(
    transport_mode = case_when(
      transport_mode == 'Run' ~ 'Walk',
      transport_mode == 'Funicular' ~ 'Tram',
      TRUE ~ transport_mode
    )
  )

# Check for NA's and empty strings in the data
any(is.na(posmo))

# check which columns:
# (column-wise vapply keeps the column types; apply() would first coerce
# the whole data frame to a character matrix)
vapply(posmo, anyNA, logical(1))

#remove na data
posmo <- na.omit(posmo)

# Any empty strings left in the non-datetime columns?
any(vapply(
  posmo |> select(user_id, transport_mode, lon_x, lat_y),
  function(x) any(x == ""),
  logical(1)
))

# Create a data frame with non-empty transport_mode
posmo <- posmo |> filter(transport_mode != "")
Code
# One-row overview of the cleaned point data: number of users, records and
# distinct timestamps, the covered time span, the number of transport modes
# and the mean / max / min of both coordinates.
summary_df <- summarise(
  posmo,
  num_users = n_distinct(user_id),
  num_data_points = n(),
  num_datetimes = n_distinct(datetime),
  min_date = min(datetime, na.rm = TRUE),
  max_date = max(datetime, na.rm = TRUE),
  num_transport_modes = n_distinct(transport_mode),
  avg_longitude = mean(lon_x, na.rm = TRUE),
  max_longitude = max(lon_x, na.rm = TRUE),
  min_longitude = min(lon_x, na.rm = TRUE),
  avg_latitude = mean(lat_y, na.rm = TRUE),
  max_latitude = max(lat_y, na.rm = TRUE),
  min_latitude = min(lat_y, na.rm = TRUE)
)

# Convert to gt table
# summary_table <- gt(summary_df, caption = "Table 1: Summary of Posmo Data") %>%
#   cols_label(
#     num_users = "Number of Users",
#     num_data_points = "Total number of datapoints",
#     num_datetimes = "Number of Datetimes",
#     min_date = "Minimal recorded datetime",
#     max_date = "Maximum recorded datetime",
#     num_transport_modes = "Number of Transport Modes",
#     avg_longitude = "Average Longitude",
#     max_longitude = "Maximum Longitude",
#     min_longitude = "Minimum Longitude",
#     avg_latitude = "Average Latitude",
#     max_latitude = "Maximum Latitude",
#     min_latitude = "Minimum Latitude"
#   ) %>%
#   fmt_number(
#     columns = vars(avg_longitude, max_longitude, min_longitude, avg_latitude, max_latitude, min_latitude),
#     decimals = 2
#   )
# 
# 
# # Print the table
# summary_table |> as_raw_html()

# Render the overview as Table 1, with readable column headers and values
# rounded to one decimal.
summary_df |>
  kable(
    digits = 1,
    col.names = c(
      num_users = "Number of Users",
      num_data_points = "Total number of datapoints",
      num_datetimes = "Number of Datetimes",
      min_date = "Minimal recorded datetime",
      max_date = "Maximum recorded datetime",
      num_transport_modes = "Number of Transport Modes",
      avg_longitude = "Average Longitude",
      max_longitude = "Maximum Longitude",
      min_longitude = "Minimum Longitude",
      avg_latitude = "Average Latitude",
      max_latitude = "Maximum Latitude",
      min_latitude = "Minimum Latitude"
    ),
    caption = "Summary of Posmo Data {#tbl-1}",
    label = "#tbl-1"
  )
Table 1: Summary of Posmo Data
Number of Users Total number of datapoints Number of Datetimes Minimal recorded datetime Maximum recorded datetime Number of Transport Modes Average Longitude Maximum Longitude Minimum Longitude Average Latitude Maximum Latitude Minimum Latitude
9 116808 99906 2023-04-01 13:05:36 2023-05-31 14:39:59 10 8.5 11.5 -12.3 47.3 61.6 3.7

Once the initial data cleaning is completed, the data cycle is further enhanced with analytical metrics. A unique identifier ‘trip_id’ is calculated for each particular trip, it provides a systematic way of classifying and analysing data. Subsequently, the data are grouped according to each trip, and then two measures of distance (in meters) and elapsed time (in minutes) are calculated for each trip. Lastly, the data set is filtered to store only trips that occur immediately before or after a train trip. This strategic selection of data lays the foundation for answering the research questions, focusing on the patterns associated with the mode of transportation before starting a train trip.

In the end, we have the following data structure:

  • trip_id: A unique identifier assigned to each individual journey made by a participant in the study. This identifier is used to differentiate between different trips made by the same user. (int)

  • user_id: A unique identifier for each participant in the study. This identifier is used to differentiate between the data of different participants. (num)

  • day: The date on which the recorded location data was collected. (Date)

  • transport_mode: The mode of transportation used by the participant at a given recorded location. This could include various modes such as walking, using a bike, car, bus, tram, or other. (chr)

  • last_transport_mode: The mode of transportation that was used by the participant directly before the current transport_mode. This provides a sense of sequence in the participant’s mode of transportation. (chr)

  • next_transport_mode: The mode of transportation that was used by the participant directly after the current transport_mode. This, along with last_transport_mode, provides a context to the sequence of transportation modes used by the participant. (chr)

  • distance: The total distance in meters covered by the participant during a particular trip. This measurement is derived from the collected location data. (num)

  • total_time: The total time in minutes taken by the participant to complete a particular trip. This measurement is derived from the collected location data and corresponds to the distance covered in the same trip. (num)

  • direct_distance: The shortest possible distance in meters between the starting and ending point of a trip. This measurement is calculated using the ‘as the crow flies’ method, which ignores any detours or turns taken during the trip. (num)

A summary of the enhanced data is shown in Table 2.

Code
# Drop duplicated rows, then derive the trips separately for every user
# and stack the per-user results into posmo_enriched.
posmo_enriched <- posmo |>
  distinct() |>
  group_by(user_id) |>
  group_split() |>
  map_dfr(get_trips)

# Distances and per-trip totals are also computed user by user
data_by_user <- split(posmo_enriched, posmo_enriched$user_id)
posmo_enriched <- do.call(rbind, lapply(data_by_user, calculate_distances))

# Keep only records of trips that are a train leg or border one
# (rows where the flag is NA are dropped as well)
posmo_enriched_train <- filter(posmo_enriched, has_train_somewhere)
Code
# Collapse the point records to one row per trip, user and day:
#   distance         sum of the point-to-point distances
#   total_time       largest cumulative time reached within the trip
#   direct_distance  distance between the first and the last point
posmo_analysis <- posmo_enriched_train %>%
  mutate(day = lubridate::date(datetime)) %>%
  group_by(
    trip_id, user_id, day,
    transport_mode, last_transport_mode, next_transport_mode
  ) %>%
  summarise(
    distance = sum(distance),
    total_time = max(cumulative_time),
    direct_distance = distVincentySphere(
      c(first(lon_x), first(lat_y)),
      c(last(lon_x), last(lat_y))
    )
  ) %>%
  ungroup()

# Trips directly before or after a train leg, excluding the train legs
# themselves and the mode 'Other1'
posmo_train_related <- posmo_analysis |>
  filter(
    next_transport_mode == 'Train' | last_transport_mode == 'Train',
    !(transport_mode == 'Other1' | transport_mode == 'Train')
  )
Code
# One-row overview of the trip-level data: counts of trips, users, days and
# transport modes, the covered date range, and mean / max / min of the three
# measures (distance, total time, direct distance).
summary_df <- summarise(
  posmo_analysis,
  num_trips = n_distinct(trip_id),
  num_users = n_distinct(user_id),
  num_days = n_distinct(day),
  min_day = min(day, na.rm = TRUE),
  max_day = max(day, na.rm = TRUE),
  num_transport_modes = n_distinct(transport_mode),
  avg_distance = mean(distance, na.rm = TRUE),
  max_distance = max(distance, na.rm = TRUE),
  min_distance = min(distance, na.rm = TRUE),
  avg_total_time = mean(total_time, na.rm = TRUE),
  max_total_time = max(total_time, na.rm = TRUE),
  min_total_time = min(total_time, na.rm = TRUE),
  avg_direct_distance = mean(direct_distance, na.rm = TRUE),
  max_direct_distance = max(direct_distance, na.rm = TRUE),
  min_direct_distance = min(direct_distance, na.rm = TRUE)
)

# Convert to gt table
# summary_table <- gt(summary_df, caption = "Table 2: Summary of Posmo Analysis Data") %>%
#   cols_label(
#     num_trips = "Number of Trips",
#     num_users = "Number of Users",
#     num_days = "Number of Days",
#     min_day = "Earliest Recorded Day",
#     max_day = "Latest Recorded Day",
#     num_transport_modes = "Number of Transport Modes",
#     avg_distance = "Average Distance (m)",
#     max_distance = "Maximum Distance (m)",
#     min_distance = "Minimum Distance (m)",
#     avg_total_time = "Average Total Time (min)",
#     max_total_time = "Maximum Total Time (min)",
#     min_total_time = "Minimum Total Time (min)",
#     avg_direct_distance = "Average Direct Distance (m)",
#     max_direct_distance = "Maximum Direct Distance (m)",
#     min_direct_distance = "Minimum Direct Distance (m)"
#   ) %>%
#   fmt_number(
#     columns = vars(avg_distance, max_distance, min_distance, avg_total_time, max_total_time, min_total_time, avg_direct_distance, max_direct_distance, min_direct_distance),
#     decimals = 2
#   )
# 
# # Print the table
# # print(summary_table)
# summary_table |> as_raw_html()


# Render the trip-level overview as Table 2, with readable column headers
# and values rounded to one decimal.
summary_df |>
  kable(
    digits = 1,
    col.names = c(
      num_trips = "Number of Trips",
      num_users = "Number of Users",
      num_days = "Number of Days",
      min_day = "Earliest Recorded Day",
      max_day = "Latest Recorded Day",
      num_transport_modes = "Number of Transport Modes",
      avg_distance = "Average Distance (m)",
      max_distance = "Maximum Distance (m)",
      min_distance = "Minimum Distance (m)",
      avg_total_time = "Average Total Time (min)",
      max_total_time = "Maximum Total Time (min)",
      min_total_time = "Minimum Total Time (min)",
      avg_direct_distance = "Average Direct Distance (m)",
      max_direct_distance = "Maximum Direct Distance (m)",
      min_direct_distance = "Minimum Direct Distance (m)"
    ),
    caption = "Summary of Posmo Analysis Data {#tbl-2}",
    label = "#tbl-2"
  )
Table 2: Summary of Posmo Analysis Data
Number of Trips Number of Users Number of Days Earliest Recorded Day Latest Recorded Day Number of Transport Modes Average Distance (m) Maximum Distance (m) Minimum Distance (m) Average Total Time (min) Maximum Total Time (min) Minimum Total Time (min) Average Direct Distance (m) Maximum Direct Distance (m) Minimum Direct Distance (m)
292 8 37 2023-04-03 2023-05-18 7 11939.3 1428946 0 170.8 9171.4 0.2 5583.9 63152.6 0

Selection of Relevant Trips

When looking at the summary of the data, one can clearly see that some work is needed on outliers and on identifying only the relevant trips. As relevant trips, we see the usage of a transport mode to reach a train. So, a relevant trip should neither take too long nor cover too great a distance. For that, we are using different box plots and other visuals to identify potential thresholds for the data.

a. Initial State of the Dataset

In examining the box plots, it’s evident that the outliers significantly influence the overall visual representation, making it challenging to discern underlying patterns. Furthermore, it’s noteworthy that each transportation mode exhibits distinct outlier patterns across different measures. This visualization (Figure 1) reinforces the necessity of further scrutinizing and refining our data, as outlined in the subsequent sections of this research project.

Code
# Figure 1: all three measures for the unfiltered train-related trips
combined_boxplot(
  data = posmo_train_related,
  value_columns = c("distance", "direct_distance", "total_time"),
  remove_outliers = FALSE
)

Figure 1: Boxplots of initial dataset

b. Performed Analysis and Transformations on the Dataset

In our analysis, we focus on relevant trajectories for various modes of transportation to a train station. These modes include walking, driving, biking, bus, and tram. We filter the dataset for each mode based on specific criteria such as speed and total length. Relevant walking trips have speeds between 2-160 meters per minute and total trip time under 60 minutes. Similar criteria are applied to other modes. We combine the relevant trips for analysis and use plots to visualize time, speed, and distance, identifying outliers and patterns. This method ensures accurate analysis of each mode’s relevant trips.

Code
# Average speed per trip (distance / total_time, i.e. metres per minute),
# slowest trips first
posmo_train_related <- posmo_train_related |>
  mutate(tripspeed = distance / total_time) |>
  arrange(tripspeed)

# Walk: speed within 2-160 m/min, shorter than 60 min
df_walk <- posmo_train_related |>
  filter(transport_mode == "Walk", tripspeed >= 2, tripspeed <= 160, total_time < 60)

# Car: moving, shorter than 30 min and 10 km
df_car <- posmo_train_related |>
  filter(transport_mode == "Car", tripspeed > 0, total_time < 30, distance < 10000)

# Bike: moving, shorter than 120 min and 50 km
df_bike <- posmo_train_related |>
  filter(transport_mode == "Bike", tripspeed > 0, total_time < 120, distance < 50000)

# Bus: moving, shorter than 120 min and 50 km
df_bus <- posmo_train_related |>
  filter(transport_mode == "Bus", tripspeed > 0, total_time < 120, distance < 50000)

# Tram: moving, shorter than 60 min and 50 km
df_tram <- posmo_train_related |>
  filter(transport_mode == "Tram", tripspeed > 0, total_time < 60, distance < 50000)

# Stack the per-mode selections into the final analysis data set
posmo_train_related_filtered <- bind_rows(df_walk, df_car, df_bike, df_bus, df_tram)

c. Final State of the Dataset

After all transformations, our dataset includes the most relevant trajectories for each transport mode to the train station: walking, driving, cycling, bus, and tram. Early indications from our box-plot visualizations suggest that there may be differences in mean values between transport modes (Figure 2, Figure 3 and Figure 4). This makes us confident that our dataset can help to answer our research questions.

Code
# Figure 2: travelled distance per transport mode (filtered trips)
create_boxplot(
  data = posmo_train_related_filtered,
  value_column = "distance",
  remove_outliers = FALSE,
  add_title = FALSE,
  fixed_order = TRUE
)

Figure 2: Boxplot of distance by transport mode

Code
# Figure 3: straight-line distance per transport mode (filtered trips)
create_boxplot(
  data = posmo_train_related_filtered,
  value_column = "direct_distance",
  remove_outliers = FALSE,
  add_title = FALSE,
  fixed_order = TRUE
)

Figure 3: Boxplot of direct_distance by transport mode

Code
# Figure 4: total trip time per transport mode (filtered trips)
create_boxplot(
  data = posmo_train_related_filtered,
  value_column = "total_time",
  remove_outliers = FALSE,
  add_title = FALSE,
  fixed_order = TRUE
)

Figure 4: Boxplot of total_time by transport mode

Statistical Methods and Limitations

This study employed statistical analyses and regression modeling to examine the relationship between distance, time, and preferred transport modes. Three measures (distance, total time, and direct distance) were assessed across five modes of transport (Walk, Bike, Bus, Car, and Tram).

  • Distance: This measure refers to the total length of the travel path in meters.
  • Total Time: This measure represents the total time spent for the trip, from start to end, measured in minutes.
  • Direct Distance: This measure indicates the straight-line distance from the origin to the destination, disregarding any detours or route variations.

To compare differences in measures, statistical tests were chosen after checking assumptions of normality and homogeneity of variances. The Shapiro-Wilk test (Shapiro and Wilk 1965) was used to test for normality in each measure across the different modes of transport. To examine the homogeneity of variances Levene’s test (Levene 1960) was used. This test is crucial for certain statistical methods like the traditional ANOVA.

Since data violated assumptions, Welch’s ANOVA (Welch 1951) was used, followed by the Games-Howell post-hoc test (Games and Howell 1976) for pairwise comparisons.

Regression analysis utilized multinomial logistic regression models to estimate probabilities of choosing modes based on varying distances and total time values. Distance and time calculations were based on participants’ trajectory data, using precise coordinates and timestamps.

It’s important to note the small sample size limited generalizability, and biases may exist as the data only represents students in a specific module. However, these methods provided valuable insights into the relationship between distance, time, and transport mode preferences within this context.

Results

Statistical Analysis to Examine Mean Value Differences

Shapiro-Wilk and Levene’s test

The results of the Shapiro-Wilk test showed significant deviation from normality (p < 0.05) for all combinations of measures and transport modes. This suggests that the data for these groups do not follow a normal distribution.

Code
# Shapiro-Wilk test for normality
# Runs one test per combination of measure and transport mode and prints a
# heading followed by the test result for each combination.
measures <- c("distance", "total_time", "direct_distance")
# Only the modes that are present in the filtered data
transport_modes <- unique(posmo_train_related_filtered$transport_mode)

# NOTE(review): shapiro.test() stops with an error for very small groups
# (fewer than 3 values) - confirm every mode has enough trips.
for (measure in measures){
  for (transport_mode in transport_modes){
    print(paste("Shapiro-Wilk test for ", measure, " and transport mode ", transport_mode))
    # Values of the current measure, restricted to the current mode
    print(shapiro.test(posmo_train_related_filtered[[measure]][posmo_train_related_filtered$transport_mode==transport_mode]))
  }
}

The results of the Levene’s test show significant deviation from the assumption of equal variances for ‘distance’ and ‘direct_distance’ (p < 0.05), while ‘total_time’ did not significantly deviate from this assumption (see Table 3).

Table 3: Results of Levene’s test
Measure p-value
distance 4.134e-06
total_time 0.191
direct_distance 7.783e-08
Code
# Levene's test for homogeneity of variances
# One test per measure, each comparing the variances across transport modes.
measures <- c("distance", "total_time", "direct_distance")

for (measure in measures) {
  print(sprintf("Levene's Test for %s", measure))

  # Formula of the form <measure> ~ transport_mode
  levene_formula <- as.formula(paste(measure, "~ transport_mode"))
  print(car::leveneTest(levene_formula, data = posmo_train_related_filtered))
}

Welch’s ANOVA and Games-Howell post-hoc test

Given the violations of normality and homogeneity of variances for some of the measures, we performed Welch’s ANOVA tests, which do not require these assumptions, followed by the Games-Howell post-hoc test for pairwise comparisons.

The results for Welch’s ANOVA tests are as follows (Table 4):

Table 4: Results for Welch’s ANOVA tests
Measure F statistic num df denom df p-value
Distance 40.118 4.000 55.619 8.796e-16
Total Time 2.7365 4.000 69.250 0.03557
Direct Distance 51.228 4.000 54.141 <2.2e-16

The p-values for all three measures (distance, total time and direct distance) are below the threshold of 0.05. Given these results, we reject the null hypothesis of equal means across all transport modes for these measures. To further investigate the specific pairs of transport modes with significantly different means, we conducted the Games-Howell post-hoc test.

The results of the Games-Howell post-hoc test showed significant differences in pairwise comparisons. Below are the statistically significant results (Table 5):

Table 5: Results for Games-Howell post-hoc test
Comparison Estimate p-value
Bike vs Walk (distance) -1795.35 0.019
Bus vs Walk (distance) -1715.39 4.30e-09
Car vs Walk (distance) -2175.52 1.60e-10
Tram vs Walk (distance) -1480.46 2.34e-05
Code
# Welch's ANOVA (var.equal = FALSE): do the mean values differ between
# transport modes? One test per measure.

# ... for distance
oneway.test(
  formula = distance ~ transport_mode,
  data = posmo_train_related_filtered,
  var.equal = FALSE
)

# ... for total_time
oneway.test(
  formula = total_time ~ transport_mode,
  data = posmo_train_related_filtered,
  var.equal = FALSE
)

# ... for direct_distance
oneway.test(
  formula = direct_distance ~ transport_mode,
  data = posmo_train_related_filtered,
  var.equal = FALSE
)

# Games-Howell post-hoc test: pairwise comparisons between transport modes,
# again one per measure.

# ... for distance
posmo_train_related_filtered |>
  games_howell_test(distance ~ transport_mode)

# ... for total_time
posmo_train_related_filtered |>
  games_howell_test(total_time ~ transport_mode)

# ... for direct_distance
posmo_train_related_filtered |>
  games_howell_test(direct_distance ~ transport_mode)

These results indicate that for the measure distance, Bike, Bus, Car, and Tram all had significantly higher average distance than Walk. The negative estimate values represent the average difference in distance compared to Walk, with all modes of transport having more distance on average. This pattern is similarly observed when assessing the direct distance. However, when it comes to time, the data aligns with the chart’s depiction, demonstrating that the mean times across the different modes of transport do not show significant differences.

Regression Analysis

Distance

The analysis conducted using multinomial logistic regression revealed interesting insights into the relationship between distance and individuals’ preferred transport modes (Table 6 and Figure 5). The probabilities of choosing different modes (Walk, Bike, Bus, Car, and Tram) were examined based on varying distances. The calculated Z-scores are fairly large, suggesting that the coefficients for each transport mode are significantly different from zero. This indicates that distance does have a significant effect on the choice of transport mode. The fit of the model seems to be reasonably good, based on the residual deviance and the AIC values.

The probability of choosing Walk decreased significantly as the distance to the train station increased. At shorter distances (e.g., 24 meters), the probability of selecting Walk was high, at approximately 92.7%, and it stays above 50% until about 1150 meters, where it drops below that mark. This means that every second individual chooses a transport mode other than walking when the train station is more than 1150 meters away. At 1700 meters, only every fifth individual chooses to walk. Starting from 2100 meters, the probability drops to less than 10%.

Overall, as the distance increased to 2500 meters and longer, the probability diminished to a small value, indicating a strong aversion to walking longer distances.

As the distance increases, the probability of choosing the “Bike” mode also increases. The probabilities start at 1.38% and gradually rise until they reach their maximum of 18.55%. This indicates that as the distance to the destination increases, there is a higher likelihood of choosing the “Bike” mode as the preferred transport mode.

Similar patterns were observed for Bus, Car, and Tram. The probabilities of choosing these modes increase as the distance increases. Each mode exhibited distinct probabilities at various distances, signifying the nuanced preferences of individuals.

These findings shed light on the relationship between distance and transport mode preferences. They highlight the decreased likelihood of selecting Walk as distances increase, while also emphasizing the varied preferences for Bike, Bus, Car, and Tram based on distance. Importantly, these probabilities are specific to the examined dataset and should be interpreted within the context of the broader study.

# weights:  15 (8 variable)
initial  value 329.934772 
iter  10 value 228.119051
final  value 226.022321 
converged
Table 6: Summary of the regression model of transport mode and distance
y.level term estimate std.error statistic p.value
Bike (Intercept) -4.21238 0.19595 -21.49753 0
Bike distance 0.00226 0.00020 11.24454 0
Bus (Intercept) -3.76963 0.26281 -14.34356 0
Bus distance 0.00230 0.00021 11.04462 0
Car (Intercept) -4.04416 0.27165 -14.88712 0
Car distance 0.00234 0.00021 11.34435 0
Tram (Intercept) -3.95006 0.13436 -29.39859 0
Tram distance 0.00218 0.00020 10.85561 0
Code
# Figure 5: predicted probability of each transport mode along the x grid held
# in `result`. One line layer per mode; Walk is drawn dashed to set it apart.
# NOTE(review): the x column is called `total_time_value` although the axis is
# labelled as a distance -- presumably `result` reuses that column name for the
# distance grid; confirm against the chunk that builds `result`.
plot <- ggplot(data = result, mapping = aes(x = total_time_value)) +
  geom_line(mapping = aes(y = Walk, colour = "Walk"), linetype = "dashed") +
  geom_line(mapping = aes(y = Bike, colour = "Bike")) +
  geom_line(mapping = aes(y = Bus, colour = "Bus")) +
  geom_line(mapping = aes(y = Car, colour = "Car")) +
  geom_line(mapping = aes(y = Tram, colour = "Tram")) +
  # Dotted guide lines at the 50% and 25% probability levels
  geom_hline(
    yintercept = c(0.5, 0.25),
    linetype = "dotted",
    colour = "gray",
    linewidth = 1.1
  ) +
  # Fixed colour per transport mode
  scale_colour_manual(
    values = c(
      "Walk" = "black",
      "Bike" = "blue",
      "Bus" = "green",
      "Car" = "red",
      "Tram" = "orange"
    )
  ) +
  theme_minimal() +
  # Axis titles and the legend title for the transport modes
  labs(x = "Distance in meter", y = "Probability", colour = "Transport Mode")

# Render the figure
plot

Figure 5: Visualization of the regression model of transport mode and distance

Direct Distance

The analysis of the distance already revealed substantial insights; the same analysis using the direct distance to the train station shows even stronger effects and sensitivities in the selection of the transport mode (Table 7 and Figure 6). All results are comparable to, and as significant as, the effects of distance.

As the direct distance to the train station increased, the probability of selecting Walk decreased more rapidly than with the original “distance” measure. For instance, at a direct distance of 650 meters, less than 50% of individuals were observed to choose walking as their mode of transport. The influence of direct distance became even more pronounced at greater distances. Around 950 meters, less than 20% or every fifth person opted to walk. This trend continued to the point where at approximately 1600 meters, only around 1% of individuals decided to walk.

Compared to the analysis with the overall trip distance, where a considerable proportion of individuals (about 10%) still preferred to walk at distances around 2100 meters, the “direct_distance” analysis reveals a much stronger aversion to walking longer distances. This indicates that the direct distance between the starting point and the train station may be a more accurate representation of individuals’ perceptions of distance and their consequent transport mode choices.

# weights:  15 (8 variable)
initial  value 329.934772 
iter  10 value 219.283360
final  value 211.767484 
converged
Table 7: Summary of the regression model of transport mode and direct distance
y.level term estimate std.error statistic p.value
Bike (Intercept) -4.78602 0.31008 -15.43471 0
Bike direct_distance 0.00466 0.00041 11.40403 0
Bus (Intercept) -4.12695 0.27001 -15.28429 0
Bus direct_distance 0.00462 0.00042 11.01625 0
Car (Intercept) -4.69759 0.27662 -16.98224 0
Car direct_distance 0.00480 0.00041 11.74964 0
Tram (Intercept) -4.36215 0.33205 -13.13721 0
Tram direct_distance 0.00446 0.00043 10.46212 0
Code
# Figure 6: predicted probability of each transport mode along the x grid held
# in `result`. One line layer per mode; Walk is drawn dashed to set it apart.
# NOTE(review): the x column is called `total_time_value` although the axis is
# labelled as a distance -- presumably `result` reuses that column name for the
# direct-distance grid; confirm against the chunk that builds `result`.
plot <- ggplot(data = result, mapping = aes(x = total_time_value)) +
  geom_line(mapping = aes(y = Walk, colour = "Walk"), linetype = "dashed") +
  geom_line(mapping = aes(y = Bike, colour = "Bike")) +
  geom_line(mapping = aes(y = Bus, colour = "Bus")) +
  geom_line(mapping = aes(y = Car, colour = "Car")) +
  geom_line(mapping = aes(y = Tram, colour = "Tram")) +
  # Dotted guide lines at the 50% and 25% probability levels
  geom_hline(
    yintercept = c(0.5, 0.25),
    linetype = "dotted",
    colour = "gray",
    linewidth = 1.1
  ) +
  # Fixed colour per transport mode
  scale_colour_manual(
    values = c(
      "Walk" = "black",
      "Bike" = "blue",
      "Bus" = "green",
      "Car" = "red",
      "Tram" = "orange"
    )
  ) +
  theme_minimal() +
  # Axis titles and the legend title for the transport modes
  labs(x = "Distance in meter", y = "Probability", colour = "Transport Mode")

# Render the figure
plot

Figure 6: Visualization of the regression model of transport mode and direct distance

Time

The same analysis was conducted for the time measure. The resulting probabilities represent the likelihood of choosing a specific transport mode based on the total time values. The multinomial logistic regression model was fitted using the “total_time” variable as the predictor and the transport mode as the outcome variable, with the “Walk” mode as the reference level. The results of this regression are much harder to interpret and are less concrete than the ones before.

In this case, total time is a significant predictor for the Bike and Bus modes (p-values of 0.00201 and 0.02670, respectively), but not for the Car and Tram modes. This suggests that as total time increases, individuals are more likely to choose biking or taking the bus over walking, but total time does not significantly affect the choice between using the car or taking the tram over walking (Table 8).

# weights:  15 (8 variable)
initial  value 329.934772 
iter  10 value 293.554276
final  value 292.823378 
converged
Table 8: Summary of the regression model of transport mode and time
y.level term estimate std.error statistic p.value
Bike (Intercept) -2.31855 0.40802 -5.68246 0.00000
Bike total_time 0.07342 0.02377 3.08849 0.00201
Bus (Intercept) -1.37823 0.32574 -4.23109 0.00002
Bus total_time 0.04921 0.02221 2.21585 0.02670
Car (Intercept) -1.15360 0.34593 -3.33477 0.00085
Car total_time 0.01675 0.02657 0.63020 0.52856
Tram (Intercept) -1.83401 0.39089 -4.69188 0.00000
Tram total_time 0.04377 0.02615 1.67367 0.09419

Looking into the results, we can say that at a total time value of 10 minutes, the predicted probabilities suggest that individuals are most likely to choose walking with a probability of about 45%, followed by taking the bus with 18.4%, taking the car with 16.7%, using the tram with 11.1%, and using the bike with 9.2% (Table 9).

Table 9: Probabilities of choosing different transport modes based on total time values
Total Time (min) Walk Bike Bus Car Tram
10 44.7% 9.2% 18.4% 16.7% 11.1%
20 34.2% 14.6% 23.0% 15.1% 13.1%
30 24.3% 21.7% 26.8% 12.7% 14.5%
40 16.1% 29.9% 29.1% 9.9% 14.8%
50 10.0% 38.8% 29.6% 7.3% 14.3%
60 5.9% 47.5% 28.5% 5.1% 13.0%

As the total time value increases, the predicted probability of walking decreases strongly (Figure 7).

Code
# Figure 7: predicted probability of each transport mode over the
# `total_time_value` grid held in `result`. One line layer per mode; Walk is
# drawn dashed to set it apart.
plot <- ggplot(data = result, mapping = aes(x = total_time_value)) +
  geom_line(mapping = aes(y = Walk, colour = "Walk"), linetype = "dashed") +
  geom_line(mapping = aes(y = Bike, colour = "Bike")) +
  geom_line(mapping = aes(y = Bus, colour = "Bus")) +
  geom_line(mapping = aes(y = Car, colour = "Car")) +
  geom_line(mapping = aes(y = Tram, colour = "Tram")) +
  # Dotted guide lines at the 50% and 25% probability levels
  geom_hline(
    yintercept = c(0.5, 0.25),
    linetype = "dotted",
    colour = "gray",
    linewidth = 1.1
  ) +
  # Fixed colour per transport mode
  scale_colour_manual(
    values = c(
      "Walk" = "black",
      "Bike" = "blue",
      "Bus" = "green",
      "Car" = "red",
      "Tram" = "orange"
    )
  ) +
  theme_minimal() +
  # Axis titles and the legend title for the transport modes
  labs(x = "Time in minutes", y = "Probability", colour = "Transport Mode")

# Render the figure
plot

Figure 7: Visualization of the regression model of transport mode and time

Discussion

Our study provides important insights into the influence of distance on transport mode preferences and the effectiveness of statistical methods in analyzing this relationship.

Firstly, we found that the distance to the nearest train station strongly affects individuals’ choices of transport mode. As the distance increases, the likelihood of selecting walking decreases, while the preference for alternative modes such as biking, taking the bus, driving a car, or using the tram increases.

Secondly, our statistical analysis demonstrated the effectiveness of tests and regression analysis in examining the relationship between distance and transport mode choice. These methods allowed us to quantify the impact of distance on mode preferences and provided valuable insights into the preferences of individuals.

Lastly, we investigated the influence of different distance measurement methods. Comparing the overall trip distance and the direct distance to the train station, we observed that the direct distance measurement revealed a stronger aversion to walking longer distances. Additionally, the separate model using total time as the predictor showed that time also influences mode choice, with longer times favoring biking or taking the bus over walking.

In conclusion, our study highlights the importance of distance in shaping transport mode preferences, validates the effectiveness of statistical methods in analyzing this relationship, and emphasizes the impact of distance measurement methods and time considerations on mode choice.

References

European Environment Agency (EEA). 2019. The First and Last Mile — the Key to Sustainable Urban Transport. Transport and Environment Report 2019. Publications Office of the European Union. https://data.europa.eu/doi/10.2800/200903.
Games, Paul A., and John F. Howell. 1976. “Pairwise Multiple Comparison Procedures with Unequal N’s and/or Variances: A Monte Carlo Study.” Journal of Educational Statistics 1 (2): 113–25. https://doi.org/10.3102/10769986001002113.
Genossenschaft Posmo Schweiz. 2022. “Posmo Project AppTracking Für Gruppen.” https://posmo.coop/produkte/posmo-project-tracking-fuer-gruppen.
Levene, H. 1960. “Robust Tests for Equality of Variances.” Contributions to Probability and Statistics: Essays in Honor of Harold Hotelling 2: 278–92.
Shapiro, S. S., and M. B. Wilk. 1965. “An Analysis of Variance Test for Normality (Complete Samples).” Biometrika 52 (3/4): 591. https://doi.org/10.2307/2333709.
Tennøy, Aud, Marianne Knapskog, and Fitwi Wolday. 2022. “Walking Distances to Public Transport in Smaller and Larger Norwegian Cities.” Transportation Research Part D: Transport and Environment 103 (February): 103169. https://doi.org/10.1016/j.trd.2022.103169.
Welch, B. L. 1951. “On the Comparison of Several Mean Values: An Alternative Approach.” Biometrika 38 (3/4): 330. https://doi.org/10.2307/2332579.
Wibowo, Sony Sulaksono, and Saksith Chalermpong. 2010. “Characteristics of Mode Choice Within Mass Transit Catchments Area.” Eastern Asia Society for Transportation Studies. https://doi.org/10.11175/easts.8.1261.