Summary

This data analysis provides an overview of Ecological Momentary Assessment (EMA) responses, including cleaning, flagging, and visualization. All steps and parameters are documented below.

This file was generated using the EMA-CleanR package from the University of Michigan. See README file for instructions, more documentation and full license information.

1 Project Setup

1.1 Load Required Packages

Refer to each package for their individual license

# Load required packages
# Refer to each package for their individual license
library(dplyr)
library(psych)
library(tidyverse)
library(lubridate)
library(table1)
library(corrplot)
library(ggplot2)
library(patchwork)
library(rlang)

1.2 Set project parameters

The input parameters are set in the EMA-CleanR.Rmd file, in the top, in YAML format. Example:


params:
  input_file: "EMA-Data.csv"

Refer to README.md and the knowledge base article for documentation on each parameter.

# Assign each parameter from YAML (top section of this script) as a variable in the parent environment
for (nm in names(params)) {
  # Unlist if the parameter is a named mapping
  if (nm %in% c("ema_item_labels", "participant_group_map", "plot_colors")) {
    assign(nm, unlist(params[[nm]]), envir = .GlobalEnv)
  } else {
    assign(nm, params[[nm]], envir = .GlobalEnv)
  }
}

# Print all parameter names and values that will be used by this script
for (nm in names(params)) {
  val <- if (nm == "ema_item_labels") unlist(params[[nm]]) else params[[nm]]
  if (is.null(val)) {
    cat(nm, "= NULL\n")
  } else if (is.atomic(val) && length(val) > 1 && is.null(names(val))) {
    cat(nm, "=", paste(val, collapse = ", "), "\n")
  } else if (is.list(val) || (is.atomic(val) && !is.null(names(val)))) {
    val_vec <- unlist(val)
    if (!is.null(names(val_vec))) {
      cat(nm, "=\n  ")
      cat(paste(names(val_vec), val_vec, sep=": ", collapse = "; "), "\n")
    } else {
      cat(nm, "=", paste(val_vec, collapse = ", "), "\n")
    }
  } else {
    cat(nm, "=", val, "\n")
  }
}

## input_file = EMA-Data.csv 
## input_file_has_headers = TRUE 
## output_dir = output 
## late_survey_cutoff_hour = 9 
## ignore_surveys = Practice Daily Survey, Practice, Practica, Prueba, Test, Temp, Trial 
## surveys_per_day = 4 
## total_surveys_in_study = 112 
## ema_item_prefix = EMA_ 
## ema_item_labels =
##   EMA_01: Nervous/Anxious; EMA_02: Sad/Blue; EMA_03: Irritable/Angry; EMA_04: Happy/Joyful; EMA_05: Excited/Enthusiastic; EMA_06: Energy Level; EMA_07: Situation Stressful; EMA_08: Emotions Out of Control; EMA_09: Attention to Feelings; EMA_10: Confused About Feelings; EMA_11: Emotion Regulation; EMA_12: Impulsivity; EMA_13: Acting on Feelings 
## participant_group_map =
##   7: HC; 6: SBD; 5: BD 
## plot_colors =
##   um_maize: #FFCB05; um_blue: #00274C; um_ligth_blue: #537BAE; um_dark_blue: #012D5F

# Ensure output directory is writable:
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

1.3 Global functions

Global functions for common tasks throughout the code, such as setting color gradients.

# Global functions for common tasks throughout the code, such as setting color gradients.

# get_color_gradient(): Reusable function for color gradient, from U-M Blue to Maize
# usage: survey_colors <- get_color_gradient(survey_names)
get_color_gradient <- function(categories, from_col = plot_colors["um_blue"], to_col = plot_colors["um_maize"]) {
  colorRampPalette(c(from_col, to_col))(length(categories)) |> setNames(categories)
}

# plot_ema_by_survey(): Reusable plotting function with optional friendly item labels
plot_ema_by_survey <- function(df, item, survey_colors, item_labels = NULL) {
  ylabel <- if (!is.null(item_labels) && item %in% names(item_labels)) item_labels[[item]] else item
  ggplot(df, aes(x = surveyname, y = .data[[item]], color = surveyname, fill = surveyname)) +
    geom_jitter(width = 0.2, alpha = 0.5) +
    geom_boxplot(width = 0.2, color = "black", alpha = 0.5) +
    scale_fill_manual(values = survey_colors) +
    scale_color_manual(values = survey_colors) +
    labs(
      title = paste(ylabel, "by Survey"),
      x = "Survey Name",
      y = ylabel
    ) +
    theme_minimal()
}

1.4 Load data sets

# Read in data set 
EMAwide.merge <- read.csv(input_file, sep = ",", header=input_file_has_headers)
cat("Number of rows loaded:", nrow(EMAwide.merge), "\n")

## Number of rows loaded: 9412

2 Clean Up EMA Datafile

2.1 Remove practice and test surveys

## Get rid of practice and test surveys
EMAwide.merge2 <- EMAwide.merge[!EMAwide.merge$surveyname %in% ignore_surveys, ]
cat(
  "Excluding surveys with these names:",
  paste(ignore_surveys, collapse = ", "),
  "\n",
  "Remaining rows after exclusion:",
  nrow(EMAwide.merge2),
  "\n"
)

## Excluding surveys with these names: Practice Daily Survey, Practice, Practica, Prueba, Test, Temp, Trial 
##  Remaining rows after exclusion: 9309

2.2 Sort dataset and filter invalid participant IDs

# Sort dataset and filter invalid participant IDs (R NA, empty string, "na", "n/a", "null", "NULL")
EMAwide.merge2 <- EMAwide.merge2 %>% 
    arrange(participantidentifier, start_datetime) %>% #sort the data by participant ID and date
    filter(
      !is.na(participantidentifier),                         # removes actual R NA
      participantidentifier != "",                           # removes empty string
      !tolower(participantidentifier) %in% c("na", "n/a", "null")  # removes "NA", "N/A", "null", "NULL"
  )
cat("Remaining rows after participantidentifier cleaning:", nrow(EMAwide.merge2), "\n")

## Remaining rows after participantidentifier cleaning: 9309

2.3 Cleanup dates and de-duplicate

# Take day time variables and make sure they are a time class
EMAwide.merge2$start_datetime <- mdy_hm(EMAwide.merge2$start_datetime, tz = "UTC")
EMAwide.merge2$end_datetime <- mdy_hm(EMAwide.merge2$end_datetime, tz = "UTC")

# Create a variable that is date alone
EMAwide.merge2<- EMAwide.merge2 %>%
  mutate(start_day = (as.Date(format(start_datetime, "%Y-%m-%d")))) %>% 
  relocate(start_day, .before = start_datetime)

# Identify duplicates based on participantidentifier, start_day, and enddate
duplicates <- EMAwide.merge2 %>%
  group_by(participantidentifier, 
           start_datetime, 
           end_datetime) %>%
  filter(n() > 1) %>%
  ungroup()

# Combine duplicate rows by participantidentifier, start_day, and enddate
EMAwide.merge2_combined <- EMAwide.merge2 %>%
  group_by(participantidentifier, 
           start_datetime, 
           end_datetime) %>%
  summarise(across(everything(), ~ coalesce(.[1], .[2])), .groups = "drop")  # Merge partial responses

# Resort and rename df
EMA.DIGIT <- EMAwide.merge2_combined %>% 
  arrange(participantidentifier, start_datetime)

cat("Remaining rows after parsing dates and removing duplicates:", nrow(EMA.DIGIT), "\n")

## Remaining rows after parsing dates and removing duplicates: 9296

2.4 Create participant groups

# Create a new variable participant_group based on 'participant identifier'
# In this example, the participant IDs are setup so that
# anything starting with 5 indicates a bowel disease diagnosis, etc.
# This approach could also be used to group by other characteristics or cohorts.
EMA.DIGIT <- EMA.DIGIT %>%
  mutate(
    participant_group = unname(participant_group_map[substr(participantidentifier, 1, 1)])
    ) %>% 
  relocate(participant_group, .after = participantidentifier)

2.5 Identify and clean EMA items

# Identify EMA item columns
ema_vars <- EMA.DIGIT %>% 
  select(starts_with(ema_item_prefix)) %>% # all items in our study begin with EMA_
  names()
cat(
  "Identify EMA item columns (prefix =", ema_item_prefix, ")...\n",
  "EMA item vars found:", paste(ema_vars, collapse = ", "), "\n"
)

## Identify EMA item columns (prefix = EMA_ )...
##  EMA item vars found: EMA_01, EMA_02, EMA_03, EMA_04, EMA_05, EMA_06, EMA_07, EMA_08, EMA_09, EMA_10, EMA_11, EMA_12, EMA_13

# Remove rows where all EMA items are NA
EMA.DIGIT <- EMA.DIGIT %>%
  filter(!if_all(all_of(ema_vars), is.na))
cat("Rows remaining in cleaned data:", nrow(EMA.DIGIT), "\n")

## Rows remaining in cleaned data: 9280

2.6 Create Variables

# Calculate days since start date
calculate_day_in_study <- function(data) {
  data <- data %>%
    arrange(participantidentifier, start_datetime) %>%
    group_by(participantidentifier) %>%
    mutate(dayinstudy_uncorrected = as.integer(difftime(start_day, first(start_day), units = "days")) + 1) %>% # for each participant, calculate the number of days since their first start_day, treating the earliest day as Day 1
    ungroup()
  
  return(data)
}

# apply calculate_day_in_study function 
EMA.DIGIT2 <- calculate_day_in_study(EMA.DIGIT) %>% relocate(dayinstudy_uncorrected, .before = start_day)

# Updated code without hard-coded survey name
# When surveys are completed late, after midnight, count them towards the previous day
# as long as they were completed before the specified cut-off time (late_survey_cutoff_hour)
cat("Surveys completed between midnight and", late_survey_cutoff_hour, " will be counted as the previous study day.\n")

## Surveys completed between midnight and 9  will be counted as the previous study day.

EMA.DIGIT2 <- EMA.DIGIT2 %>%
  group_by(participantidentifier) %>%  # Ensures correct lag in multi-participant data
  mutate(
    dayinstudy = case_when(
      hour(start_datetime) < late_survey_cutoff_hour ~ lag(dayinstudy_uncorrected), # previous day
      TRUE ~ dayinstudy_uncorrected
    )
  ) %>%
  ungroup() %>%
  relocate(dayinstudy, .before = start_datetime)


# Check to make sure this worked by creating a new dataframe showing whether dayinstudy_uncorrected matches dayinstudy
EMAwide_day_check <- EMA.DIGIT2 %>%
  mutate(match = dayinstudy_uncorrected == dayinstudy)

# Delete dayinstudy_uncorrected variable
EMA.DIGIT2$dayinstudy_uncorrected <- NULL

# Create a Survey number per participant
EMA.DIGIT2 <- EMA.DIGIT2 %>%
  arrange(participantidentifier, start_day) %>%  # Ensure data are sorted as desired
  group_by(participantidentifier) %>%            # Group by each participant
  mutate(surveynum = row_number()) %>%           # Create a sequence variable within each group
  ungroup()  %>% 
  # move our surveynum sequence before start_day variable
  relocate(surveynum, .before = start_day)

# Add a weekday column
EMA.DIGIT2<- EMA.DIGIT2 %>%
  mutate(weekday = weekdays(start_day),
         day_type = ifelse(weekday %in% c("Saturday", "Sunday"), "Weekend", "Weekday")) %>% 
  relocate(day_type, .before = start_day)

2.7 Time to Completion (TTC) Flag

Find total time it took to complete survey and identify flags

# Calculate the time difference in mins
EMA.DIGIT2<- EMA.DIGIT2 %>%
  mutate(time_diff =as.integer(difftime(EMA.DIGIT2$end_datetime, EMA.DIGIT2$start_datetime, units = "secs"))) %>% relocate(time_diff, .before = start_day) 

## Create a cutoff time for surveys that are taking longer, where SDcuthigh == mean + 2SD 
mean <- mean(EMA.DIGIT2$time_diff, na.rm = TRUE)
mean

## [1] 178.9655

sd <- sd(EMA.DIGIT2$time_diff, na.rm = TRUE)
sd

## [1] 1429.678

SDcuthigh <- mean + (2*sd)
SDcuthigh

## [1] 3038.321

# Create a binary flag variable based on SDcuthigh condition
EMA.DIGIT2$TTCFlag_High <- ifelse(EMA.DIGIT2$time_diff > SDcuthigh, 1, 0)

## Visualize time_diff in a histogram, where red line shows our SDcuthigh threshold
ggplot(EMA.DIGIT2, aes(x = time_diff)) +
  geom_histogram(binwidth = 5, fill = plot_colors["um_blue"], color = "black") +
  geom_vline(xintercept = SDcuthigh, color = "red", linetype = "dashed", size = 1) +  # Add vertical red line
  labs(title = "Histogram of Time Difference",
       x = "Time Difference (secs)",
       y = "Frequency") +
  theme_minimal()

## Identify the timediff low. Theoretically, we choose the minimum as someone who completes all possible items in less than 1 second per item. In the EBI study, we had 23 items, so the code below originally used 23 seconds, even though not all 23 items are included in our deidentified data file. To generalize it, the code was modified to use length(ema_vars) which would yield the number of items in the actual data file. One item = 1 second minimum to complete.

EMA.DIGIT2$TTCFlag_Low <- ifelse(EMA.DIGIT2$time_diff < length(ema_vars), 1, 0)

3 Flags

3.1 Low Variance (SD) Flag

Calculate SD for items with Likert Scale and create flags

## If there is zero variance across all ema items, SD_Flag == 1
last_ema_var <- tail(ema_vars, 1)
EMA.DIGIT3 <- EMA.DIGIT2 %>%
  mutate(
    sd_EMA = apply(select(., all_of(ema_vars)), 1, sd, na.rm = TRUE),
    SD_Flag = if_else(sd_EMA == 0, 1, 0)
  ) %>%
  relocate(sd_EMA, SD_Flag, .after = last_ema_var)

3.2 Flag Review

## Create a df with IDs and the counts and percentages of data that are flagged 
summary_df <- EMA.DIGIT3 %>%
  group_by(participantidentifier) %>%
  summarise(
    TTCFlag_High = sum(TTCFlag_High, na.rm = TRUE),
    TTCFlag_Low = sum(TTCFlag_Low, na.rm = TRUE),
    SD_Flag = sum(SD_Flag, na.rm = TRUE), 
    Percent_TTCFlag_High = round((TTCFlag_High / total_surveys_in_study) * 100, 0),
    Percent_TTCFlag_Low = round((TTCFlag_Low / total_surveys_in_study) * 100, 0),
    Percent_SD_Flag = round((SD_Flag / total_surveys_in_study) * 100, 0)
  )
summary_df

# Identify IDs that have 2 or more flags
SuspectIDs <- summary_df %>%
  rowwise() %>%
  filter(sum(c_across(Percent_TTCFlag_High:Percent_SD_Flag) > 0) >= 2) %>% # row-wise sum across
  ungroup() %>%  # Ungroup after row-wise operation
  select(participantidentifier, Percent_TTCFlag_High, Percent_TTCFlag_Low, Percent_SD_Flag)

SuspectIDs

3.3 Figure Exploration

3.3.1 Time to completion histogram

# Filter the dataset to drop rows where any of the conditions are met
EMA.DIGIT3_filtered <- EMA.DIGIT3 %>%
  filter(!(TTCFlag_High == 1 | TTCFlag_Low == 1))

## round to nearest 0.5 minutes for visualization
EMA.DIGIT3_filtered <- EMA.DIGIT3_filtered %>%
  mutate(
    minutes = time_diff / 60,
    minutes_rounded = floor(minutes * 2 + 0.5) / 2 )

ggplot(EMA.DIGIT3_filtered, aes(x = minutes_rounded)) +
  geom_histogram(binwidth = 5, fill = "#00274C", color = "black") +
  facet_wrap(~ participantidentifier) +
  labs(title = "Histogram of Time Difference per Participant",
       x = "Time Difference (mins)",
       y = "Frequency") +
  theme_minimal() + 
  theme(
       panel.spacing = unit(1, "lines")   #increase spacing between plots
  )

3.3.2 SD histograms

# SD FLAGS
df_counts <- EMA.DIGIT3 %>%
  count(participantidentifier, SD_Flag)

ggplot(df_counts, aes(x = factor(SD_Flag), y = n, fill = factor(SD_Flag))) +
  geom_col(color = "black") +
  facet_wrap(~ participantidentifier) +
  labs(title = "Frequency of SD Flags",
       x = "Response",
       y = "Count") +
  scale_fill_manual(values = c("0" = unname(plot_colors["um_maize"]),  "1" = unname(plot_colors["um_blue"]))) +
  theme_minimal()

4 Compliance

4.1 Missing Data Summary

# Remove duplicates and keep the first occurrence based on participantidentifier, dayinstudy, and surveyname
EMA.DIGIT4 <- EMA.DIGIT3 %>%
  arrange(participantidentifier, dayinstudy, surveyname) %>%  # Arrange by the relevant columns
  distinct(participantidentifier, dayinstudy, surveyname, .keep_all = TRUE)  # Keep first occurrence

# Define all possible survey names
survey_names <-  EMA.DIGIT4$surveyname %>% unique()

# Create a complete dataset with all combinations of participant, day, and surveyname. Our current dataset only has completed surveys, so it's important we know what's really missing
expected_surveys <- EMA.DIGIT4 %>%
  distinct(participantidentifier, dayinstudy) %>%
  tidyr::expand(participantidentifier, dayinstudy, surveyname = survey_names)

# Count actual completed surveys per participant per day per surveyname
actual_surveys <- EMA.DIGIT4 %>%
  group_by(participantidentifier, dayinstudy, surveyname) %>%
  summarise(completed = n(), .groups = "drop")

# Merge expected and actual surveys
missing_surveys <- expected_surveys %>%
  left_join(actual_surveys, by = c("participantidentifier", "dayinstudy", "surveyname")) %>%
  mutate(completed = ifelse(is.na(completed), 0, completed),  # Replace NA with 0
         missing_count = 1 - completed)  # 1 if missing, 0 if completed


## Plot missingness count by survey_name

# Aggregate data to sum completed and missing_count for each surveyname
survey_summary <- missing_surveys %>%
  group_by(surveyname) %>%
  summarise(completed = sum(completed, na.rm = TRUE),
            missing_count = sum(missing_count, na.rm = TRUE)) %>%
  ungroup()

# Reshape the data to long format
long_data <- survey_summary %>%
  pivot_longer(cols = c(completed, missing_count), 
               names_to = "survey_status", 
               values_to = "count")  # Convert to long format

# Create the grouped bar chart
ggplot(long_data, aes(x = surveyname, y = count, fill = survey_status)) +
  geom_bar(stat = "identity", position = "dodge") +  # Side-by-side bars
  labs(x = "Survey Name", y = "Count", title = "Completed vs Missing Count by Survey Name") +  # Labels & title
  theme_minimal() +  # Clean theme
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # Rotate x-axis labels
  scale_fill_manual(values = c("completed" = unname(plot_colors["um_blue"]), "missing_count" = unname(plot_colors["um_maize"])))

## Plot missingness count by weekday vs. weekend

## Get other variables we want to add to missingsurvey 

Vars <- EMA.DIGIT4 %>%
  select(participantidentifier, dayinstudy, surveyname,
         day_type, deviceplatform, participant_group)


missing_surveys2 <- merge(missing_surveys, Vars, by = c("participantidentifier", "dayinstudy", "surveyname"), all.x = TRUE)

missing_surveys2 <- missing_surveys2 %>%
  group_by(participantidentifier) %>%
  fill(day_type, deviceplatform, participant_group, .direction = "downup") %>%  # Specify .direction only once
  ungroup()

# Aggregate data to sum completed and missing_count for each surveyname
survey_summary <- missing_surveys2 %>%
  group_by(day_type) %>%
  summarise(completed = sum(completed, na.rm = TRUE),
            missing_count = sum(missing_count, na.rm = TRUE)) %>%
  ungroup()

# Reshape the data to long format
long_data <- survey_summary %>%
  pivot_longer(cols = c(completed, missing_count), 
               names_to = "survey_status", 
               values_to = "count")  # Convert to long format

# Create the grouped bar chart
ggplot(long_data, aes(x = day_type, y = count, fill = survey_status)) +
  geom_bar(stat = "identity", position = "dodge") +  # Side-by-side bars
  labs(x = "Day Type", y = "Count", title = "Completed vs Missing Count by Day Type") +  # Labels & title
  theme_minimal() +  # Clean theme
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # Rotate x-axis labels
  scale_fill_manual(values = c("completed" = unname(plot_colors["um_blue"]), "missing_count" = unname(plot_colors["um_maize"])))

## Now plot completion v. deviceplatform

# Aggregate data to sum completed and missing_count for each surveyname
survey_summary <- missing_surveys2 %>%
  group_by(deviceplatform) %>%
  summarise(completed = sum(completed, na.rm = TRUE),
            missing_count = sum(missing_count, na.rm = TRUE)) %>%
  ungroup()

# Reshape the data to long format
long_data <- survey_summary %>%
  pivot_longer(cols = c(completed, missing_count), 
               names_to = "survey_status", 
               values_to = "count")  # Convert to long format

# Create the grouped bar chart
ggplot(long_data, aes(x = deviceplatform, y = count, fill = survey_status)) +
  geom_bar(stat = "identity", position = "dodge") +  # Side-by-side bars
  labs(x = "Device", y = "Count", title = "Completed vs Missing Count by Device") +  # Labels & title
  theme_minimal() +  # Clean theme
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # Rotate x-axis labels
  scale_fill_manual(values = c("completed" = unname(plot_colors["um_blue"]), "missing_count" = unname(plot_colors["um_maize"])))

## Now do it for participant groups (diagnosis, cohort, etc.)

# Aggregate data to sum completed and missing_count for each participant group
survey_summary <- missing_surveys2 %>%
  group_by(participant_group) %>%
  summarise(completed = sum(completed, na.rm = TRUE),
            missing_count = sum(missing_count, na.rm = TRUE)) %>%
  ungroup()

# Reshape the data to long format
long_data <- survey_summary %>%
  pivot_longer(cols = c(completed, missing_count), 
               names_to = "survey_status", 
               values_to = "count")  # Convert to long format

# Create the grouped bar chart
ggplot(long_data, aes(x = participant_group, y = count, fill = survey_status)) +
  geom_bar(stat = "identity", position = "dodge") +  # Side-by-side bars
  labs(x = "Participant Group", y = "Count", title = "Completed vs Missing Count by Participant Group") +  # Labels & title
  theme_minimal() +  # Clean theme
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # Rotate x-axis labels
  scale_fill_manual(values = c("completed" = unname(plot_colors["um_blue"]), "missing_count" = unname(plot_colors["um_maize"])))

4.2 Figures for percent compliance

# Compliance summary
df_compliance <- EMA.DIGIT4 %>%
  group_by(participantidentifier) %>%
  summarise(
    total_completed = sum(!is.na(surveyname)),
    total_expected = max(dayinstudy, na.rm = TRUE) * surveys_per_day,
    percent_compliance = (total_completed / total_expected) * 100,
    participant_group = first(participant_group)
  )

df_compliance$participant_group <- as.factor(df_compliance$participant_group)

# Get all unique groups & set chunk size
group_levels <- unique(df_compliance$participant_group)
groups_per_plot <- 4

# Split groups into chunks of 4
group_slices <- split(group_levels, ceiling(seq_along(group_levels) / groups_per_plot))

# Loop and create compliance by participant violin plots, with max 4 groups at a time
for (i in seq_along(group_slices)) {
  current_groups <- group_slices[[i]]
  
  # Subset data
  df_sub <- df_compliance %>%
    filter(participant_group %in% current_groups)
  
  # Dynamic color palette for each group
  current_colors <- get_color_gradient(
    as.character(current_groups), 
    from_col = plot_colors["um_blue"],
    to_col   = plot_colors["um_maize"]
  )
  
  # Make and print the plot
  p <- ggplot(df_sub, aes(x = participant_group, y = percent_compliance, fill = participant_group)) +
    geom_violin(trim = TRUE, alpha = 0.7) +
    geom_jitter(width = 0.2, alpha = 0.5) +
    scale_fill_manual(values = current_colors) +
    labs(
      title = paste("Survey Completion % for Groups:", paste(current_groups, collapse = ", ")),
      x = "Participant Group",
      y = "Percent Complete"
    ) +
    theme_minimal() +
    theme(legend.position = "none") + 
    ylim(0,101)
  
  print(p)
}

4.3 Compliance table

table1(~ percent_compliance  | participant_group, data=df_compliance)

	BD (N=36)	HC (N=34)	SBD (N=33)	Overall (N=103)
percent_compliance
Mean (SD)	79.3 (19.2)	89.4 (10.8)	80.2 (20.0)	82.9 (17.6)
Median [Min, Max]	87.9 [35.2, 100]	93.3 [55.2, 99.1]	88.4 [21.1, 99.1]	89.3 [21.1, 100]

5 Item Distribution

5.1 Histograms of EMA items

# Function for plotting histogram
plot_list <- lapply(ema_vars, function(var){ # use previously defined ema_vars
  
  ggplot(EMA.DIGIT4, aes(x = .data[[var]])) +
    geom_histogram(bins = 10, fill = "skyblue", color = "black") +
    labs(
      title = ema_item_labels[[var]], 
      x = var, 
      y = "Frequency"
    ) +
    theme_minimal()
})

# Plot on a grid
combined_histograms <- wrap_plots(plot_list, ncol = 3)

combined_histograms

5.2 Item by within day time effect

survey_names <- unique(EMA.DIGIT4$surveyname)
num_surveys <- length(survey_names)
survey_colors <- get_color_gradient(survey_names)
names(survey_colors) <- survey_names


ema_plots <- lapply(ema_vars, function(item) {
  plot_ema_by_survey(EMA.DIGIT4, item, survey_colors, ema_item_labels)
})

for (i in seq_along(ema_vars)) {
  print(ema_plots[[i]])
}

6 Correlation

6.1 Correlation matrix

# Define the corrmatrix
corrmatrix <- EMA.DIGIT4 %>%
 select(participantidentifier, starts_with(ema_item_prefix))

# For ema item labels, retrieve the ones corresponding to the actual columns in the correlation matrices
current_ema_items <- names(corrmatrix)[names(corrmatrix) != "participantidentifier"]
current_item_labels <- ema_item_labels[current_ema_items]

StatsBetween <- statsBy(corrmatrix, group = "participantidentifier", cors = FALSE, method="spearman", na.rm = TRUE)

Betweencorr <- as.matrix (StatsBetween$rbg) ## this is between person r value
betweenp <- as.matrix(StatsBetween$pbg) ## this is between person p value
betweenm <- as.matrix(StatsBetween$mean) ## This is person centered mean
Withincorr <- as.matrix (StatsBetween$rwg) ## this is within person r value
withinp <- as.matrix(StatsBetween$pwg) ## this is within person p value
withinsd <- as.matrix(StatsBetween$sd) ## this is the within-person SD

6.2 Plot correlogram

# define color palette
my_col <- colorRampPalette(c(plot_colors["um_maize"], "white", plot_colors["um_dark_blue"]))(100)


# Set readable labels for corrplot using your labels param
rownames(Betweencorr) <- paste0(seq_along(current_item_labels), ". ", current_item_labels)
colnames(Betweencorr) <- rep("", length(current_item_labels))
rownames(Withincorr) <- rep("", length(current_item_labels))
colnames(Withincorr) <- paste0(seq_along(current_item_labels), ".")

# Plot within correlations on the upper diagonal
corrplot(Withincorr,
         type = "upper",
         method = "color",
         col = my_col,
         p.mat = withinp,
         sig.level = 0.05,
         insig = "pch",
         addCoef.col = "black",
         tl.col = "black",
         na.label = "--",
         pch.cex = 15,
         tl.srt = 0,
         tl.cex = 3,
         number.cex = 3,
         cl.cex = 3,
         number.digits = 2)

# Plot between person correlations on the lower diagonal
corrplot(Betweencorr,
         type = "lower",
         method = "color",
         col = my_col,
         p.mat = betweenp,
         sig.level = 0.05,
         insig = "pch",
         addCoef.col = "black",
         tl.col = "black",
         pch.cex = 15,
         tl.cex = 3,
         number.cex = 3,
         cl.pos = "n",
         add = TRUE) # plots between and within together on the same plot!

7 Output

The correlation tables are saved as individual CSVs in the output_dir directory.

# Create csv files of correlation tables
write.csv(Betweencorr, file = file.path(output_dir, "Betweencorr.csv"))
write.csv(Withincorr, file = file.path(output_dir, "Withincorr.csv"))
write.csv(betweenm, file = file.path(output_dir, "betweenm.csv"))
write.csv(betweenp, file = file.path(output_dir, "betweenp.csv"))
write.csv(withinp, file = file.path(output_dir, "withinp.csv"))
write.csv(withinsd, file = file.path(output_dir, "withinsd.csv"))

# For HTML report (from R console or script)
#rmarkdown::render("EMA-CleanR.Rmd", output_dir = output_dir)

For instructions and more information on this program, please visit our GitHub page at: https://github.com/DepressionCenter/EMA-CleanR

8 License

8.1 Copyright Notice

8.2 Software and Library License Notice

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see https://www.gnu.org/licenses/gpl-3.0-standalone.html.

8.3 Documentation License Notice

Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.3 or any later version published by the Free Software Foundation; with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. You should have received a copy of the license included in the section entitled “GNU Free Documentation License”. If not, see https://www.gnu.org/licenses/fdl-1.3-standalone.html

8.4 Citation

If you find this repository, code or paper useful for your research, please cite it.

8.4.0.1 Citation Example:

Sperry, Sarah; Murphy, Victoria (2025). EMA-CleanR. University of Michigan. Software. https://github.com/DepressionCenter/EMA-CleanR
DOI: 10.5281/zenodo.17982076

EMA-CleanR

Sarah Sperry; Victoria Murphy.

2025-12-05