# Structure of the lecture ----------------------------------------------------

# simulate and look at data
# First we calculate risks and rates for one AE
# Next, we calculate it for all AEs

# Packages ----------------------------------------------------------------


req.packcages <- c("labelled", "data.table", "dplyr", "tidyr", "ggplot2", 
                   "scales", "gridExtra", "grid", "gtable", "survminer")

for(pack in req.packcages)
  if(!(pack %in% rownames(installed.packages())))
    install.packages(pack)

require(labelled)    # Manipulating Labelled Data
require(data.table)  # Enhanced data.frame
require(dplyr)       # dplyr: a grammar of data manipulation
require(tidyr)       # Easily Tidy Data with 'spread()' and 'gather()' Functions
require(ggplot2)     # Create Elegant Data Visualisations Using the Grammar of Graphics 
require(scales)      # Generic plot scaling methods
require(gridExtra)   # To arrange multiple grid-based plots on a page, and draw tables
require(grid)        # A rewrite of the graphics layout capabilities, plus some support for interaction
require(gtable)      # Extract label info from plot

require(survival)    # Fit survival curves
require(survminer)   # Survival plot by ggplot2

# The package stdRate have been developed for this course 
#install.packages("../stdRate0.1.0.zip", repos = NULL)
require(stdRat)      # Calculation of risk and rates for the ADaM dataset ADAE

# Invetigate the data ---------------------------------------------------------

adsl %>% ungroup

# How many studies are included:
unique(adsl$studyid)

# which countries
unique(adsl$country)

# How many subjects included by contry by site
table(adsl$studyid, adsl$region, adsl$country)

# Since we will mainly be looking at the safety analysis set we can filter accordingly
adsl_f <- adsl %>% filter(saffl == "Y") %>% mutate(trtdurd  = as.numeric(eotdt-randdt),
                                                   intrdurd = as.numeric(eosdt-randdt))


#How many included in Safety analysis set by site nested under studyid, region, and country,
adsl_f %>% group_by(studyid, region, country, siteid) %>% summarise(N = n())



# Drop out pattern ---------------------------------------------------------

# Establish survival curves
fit <- survfit(Surv(trtdurd, eotstt != "COMPLETED") ~ trta, data = adsl_f)

# Plot the curves and show p-val for equal "survival"
ggsurvplot(fit, data  = adsl_f,
           conf.int   = TRUE,
           pval       = TRUE,
           fun        = "pct",
           risk.table = TRUE,
           size       = 1,
           linetype   = "strata",
           palette    = c("#E7B800", "#2E9FDF"),
           legend.title = "Treatment",
           legend.labs = rev(c("New drug", "Old drug")))



##########################################################################-
# Calculate advese event risk/rate --------------------------------------------
##########################################################################-

# Assignment.

# We have the following situation:

# Fits try playing with the following example:
#  study1   new    14    381    14    164.2491 
#  study1   old    17    420    17    190.9049
#  study2   new     9    251     9    105.3005
#  study2   old     9    247     9    109.9466

# calculate the risk and rate for study 1:



# General situation:

# Count the number of subjects and person years of exposure in the 
# safety analysis set by studyid and actual treatment:
n_sas_data <- adsl %>% filter(saffl == "Y") %>% group_by(studyid, trta) %>% 
  summarise(n_trt = n(), pye_trt = sum(as.numeric(eotdt-randdt))/365.25)


# Calculate different summary statistics for adverse events,
# Since we divide by the number of subjects in the safety analysis set
# these numbers are joined
aedecod_summary   <- 
  adae %>% filter(saffl == "Y" & trtemfl == "Y") %>% 
  left_join(n_sas_data, by = c("studyid", "trta")) %>%
  group_by(aebodsys, aehlgt, aedecod, studyid, trta) %>% 
  summarize(N     = n_distinct(usubjid), 
            P     = N/n_trt[1],
            P_var = (P*(1-P)/n_trt[1]),
            P_low  = (P-qnorm(0.975)*sqrt(P_var))*100,
            P_high = (P+qnorm(0.975)*sqrt(P_var))*100,
            P      = P * 100,
            E      = n(),
            R      = E/pye_trt[1],
            R_var  = E/pye_trt[1]^2,
            R_low  = (R-qnorm(0.975)*sqrt(R_var))*100,
            R_high = (R+qnorm(0.975)*sqrt(R_var))*100,
            R      = R * 100)


# Using package stdRate ---------------------------------------------------
# The function freqAEstd can be used to calculate 

# Assignment 2 Go through the examples in function  freqAEstd
# Try grouping by region


# comparisons -------------------------------------------------------------
# Back to doing things by hand.
# Calculate diffrent standard error to be able to calculate confidence intervals
aedecod_se <- aedecod_summary %>% group_by(aebodsys, aehlgt, aedecod, studyid) %>% 
  mutate(se_risk_diff  = sqrt(sum(P_var)),
         se_rate_diff  = sqrt(sum(R_var)),
         se_risk_ratio = sqrt(sum(1/N)),
         se_rate_ratio = sqrt(sum(1/E)))


# Function for ratio
ratio_calc <- function(rate, names, order){
  if(!all(names %in% order)){
    return(NaN)
  }else{
    names(rate) <- names
    rate[order[1]]/rate[order[2]]
  }
}

# Function for difference
diff_calc <- function(risk, names, order){
  
  if(length(risk) == 1){
    missing <- setdiff(order, names)
    risk <- c(risk, 0)
    names <- c(names, missing)
  }
  
  names(risk) <- names
  risk[order[1]]-risk[order[2]]
}


# Calculate the risk and rate difference
comparison <- aedecod_se %>% 
  group_by(aebodsys, aehlgt, aedecod, studyid) %>%
  summarise(# Total number of affected subjects 
    N = sum(N),
    # Total number of events
    E = sum(E),
    # Risk difference
    risk_diff = diff_calc(P, names=trta, order = c("new drug", "old drug")),
    risk_diff_low  = (risk_diff/100-qnorm(0.975)*se_risk_diff[1])*100,
    risk_diff_high = (risk_diff/100+qnorm(0.975)*se_risk_diff[1])*100,
    risk_diff_pval = 2*(1-pnorm(abs(risk_diff)/100/se_risk_diff[1])),
    # Risk ratio
    risk_ratio = ratio_calc(P, names=trta, order = c("new drug", "old drug")),
    risk_ratio_low  = exp(log(risk_ratio)-qnorm(0.975)*se_risk_ratio[1]),
    risk_ratio_high = exp(log(risk_ratio)+qnorm(0.975)*se_risk_ratio[1]),
    risk_ratio_pval = 2*(1-pnorm(abs(log(risk_ratio))/se_risk_ratio[1])),
    #risk_harm = ifelse(rate_ratio>1, "new drug", "old drug"),
    # Rate diff
    rate_diff = diff_calc(R, names=trta, order = c("new drug", "old drug")),
    rate_diff_low  = (rate_diff/100-qnorm(0.975)*se_rate_diff[1])*100,
    rate_diff_high = (rate_diff/100+qnorm(0.975)*se_rate_diff[1])*100,
    rate_diff_pval = 2*(1-pnorm(abs(rate_diff)/100/se_rate_diff[1])),
    # Rate ratio
    rate_ratio = ratio_calc(R, names=trta, order = c("new drug", "old drug")),
    rate_ratio_low  = exp(log(rate_ratio)-qnorm(0.975)*se_rate_ratio[1]),
    rate_ratio_high = exp(log(rate_ratio)+qnorm(0.975)*se_rate_ratio[1]),
    rate_ratio_pval = 2*(1-pnorm(abs(log(rate_ratio))/se_rate_ratio[1])),
    rate_ratio_pval_adj = p.adjust(rate_ratio_pval, method = "holm"),
    rate_harm = ifelse(rate_ratio>1, "new drug", "old drug")) 


# Using package stdRate ---------------------------------------------------
# The function freqAEstdComp can be used to calculate 

# Assignment3 Go: through the examples in function  freqAEstdComp


# Volcano plot ------------------------------------------------------------


# Create the volcano plot
p <- ggplot(comparison, aes(x=rate_ratio, y=rate_ratio_pval, size = E, color=rate_harm)) +
      geom_point(alpha=0.2) 
  
p 

# there is something about the scaled 

reverselog_trans <- function(base = exp(1)) {
  force(base) 
  trans <- function(x) -log(x, base)
  inv <- function(x) base^(-x)
  trans_new(paste0("reverselog-", format(base)), trans, inv, 
            log_breaks(base = base), 
            domain = c(1e-100, Inf))
}

p <- p + scale_y_continuous(trans = reverselog_trans(10)) + 
  scale_x_log10()
  
p + geom_text(aes(label=ifelse(rate_ratio_pval_adj < 0.05, aedecod, ""), size = 10))


p <- p + geom_text(aes(label=ifelse(rate_ratio_pval_adj < 0.05, aedecod, ""), size = 10),
          vjust = "inward", hjust = "inward",
          show.legend = FALSE)

 
 # lets add some label colour
p <- p + scale_color_manual(values=structure(c("#001965",  "#808080"), names = c("new drug", "old drug"))) + 
         scale_fill_manual( values=structure(c("#001965", "#808080"), names = c("new drug", "old drug")))+ 
  xlab("Rate raio") + ylab("Adjusted p-value") +
  labs( 
    size = "Number\nof events:" , color = "Harmfull for:",
    caption = "Line at 0.05 signifies statistical significance"#,
    )

p


# Final code ---------------------------------------------

ggplot(comparison, aes(x=rate_ratio, y=rate_ratio_pval, size = E, color=rate_harm)) +
  geom_hline(yintercept = 0.05) +
  geom_point(alpha=0.2) + scale_y_continuous(trans = reverselog_trans(10)) + 
  scale_x_log10() + scale_size_continuous(range = c(0.5, 10)) +
  scale_color_manual(values=structure(c("#001965",  "#808080"), names = c("new drug", "old drug"))) + 
  scale_fill_manual(values=structure(c("#001965", "#808080"), names = c("new drug", "old drug")))+ 
  xlab("Rate raio") + ylab("Adjusted p-value") +
  labs( 
    size = "Number\nof events:" , color = "Harmfull for:",
    caption = "Line at 0.05 signifies statistical significance"#,
    #title = "Volcano plot with adjusted p-values"
  ) +
  theme_bw() +
  theme(
    text = element_text(size=12),
    plot.title = element_text(hjust = 0.5),
    legend.position = c(.95, .05),
    legend.justification = c("bottom"),
    panel.border = element_blank()
  ) + facet_grid(cols = vars(studyid)) +
  geom_text(aes(label=ifelse(rate_ratio_pval_adj < 0.05, aedecod, ""), size = 10),
            vjust = "inward", hjust = "inward",
            show.legend = FALSE)



# Cochrane Mantel Haensel -------------------------------------------------


# Count the number of subjects and person years of exposure in the 
# safety analysis set by studyid and actual treatment:
n_sas_data <- adsl %>% filter(saffl == "Y") %>% group_by(studyid, trta) %>% 
  summarise(n_trt = n(), pye_trt = sum(as.numeric(eotdt-randdt))/365.25)

# Calculate the mantel haensel correction for incidence and rate
cmh_weight <- n_sas_data %>% group_by(studyid) %>% 
  summarise(w_n   = prod(n_trt)/sum(n_trt),
            w_pye = prod(pye_trt)/sum(pye_trt))

# First, the risk and rate is computed by trial
summary_by_trial <- adae %>% 
  filter(saffl == "Y" & trtemfl == "Y") %>%
  left_join(n_sas_data, by = c("studyid", "trta")) %>% 
  group_by(aebodsys, aehlgt, aedecod, trta, studyid) %>% 
  summarize(N     = n_distinct(usubjid), 
            P     = N / n_trt[1],
            pv    = P * (1 - P) / n_trt[1],
            n_trt = n_trt[1],
            E     = n(),
            R     = E / pye_trt[1],
            rv    = E / pye_trt[1]^2,
            pye_trt = pye_trt[1])

# Next, the risk and rate are corrected
aedecod_cmh <- summary_by_trial %>% group_by(aebodsys, aehlgt, aedecod, trta) %>% 
  left_join(cmh_weight, by = c("studyid")) %>%
  summarise(N      = sum(N),
            P_trt  = sum(P*w_n),
            P      = sum(P*w_n)/sum(w_n)*100,
            var_p  = sum(pv*w_n^2)/sum(w_n)^2,
            P_low  = (P/100-qnorm(0.975)*sqrt(var_p))*100,
            P_high = (P/100+qnorm(0.975)*sqrt(var_p))*100,
            E      = sum(E),
            R_trt  = sum(R*w_pye), # for calculation of RR_se
            R      = sum(R*w_pye)/sum(w_pye)*100,
            var_r  = sum(rv*w_pye^2)/sum(w_pye)^2,
            R_low  = (R/100-qnorm(0.975)*sqrt(var_r))*100,
            R_high = (R/100+qnorm(0.975)*sqrt(var_r))*100)




# Assignment --------------------------------------------------------------

# Go though the example in function freqAEcmh


# Create a dataset with common AEs
to_plot <- aedecod_cmh %>% group_by(aedecod) %>% filter(max(P) > 5) 

# Create a new factor level for aedecod corresponding to P for new drug
levels <- to_plot %>% summarize(P=P[trta == "new drug"]) %>% arrange(P) %>% pull(aedecod) 
to_plot_ordered <- to_plot %>% ungroup() %>% 
  mutate(aedecod = factor(aedecod, levels = levels))


# Plot the risk together with the fitted CI intervals
risk_plot <- to_plot_ordered %>% 
  ggplot(aes(color=trta)) +
  geom_linerange(aes(aedecod, ymin = P_low, ymax = P_high),
                 position=position_dodge(w=.5), size=1.5) +
  geom_hline(yintercept=0, linetype=2) +
  coord_flip() + theme_minimal() +
  geom_point(aes(aedecod, P), position=position_dodge(w=.5), size =2)+ 
  xlab(NULL) + labs( title = "Corrected Risk") +
  scale_color_manual(values=structure(c("#001965",  "#808080"), 
                                      names = c("new drug", "old drug"))) 

# Plot the rate together with the fitted CI intervals
rate_plot <- to_plot_ordered %>% 
  ggplot(aes(color=trta)) +
  geom_linerange(aes(aedecod, ymin = R_low, ymax = R_high),
                 position=position_dodge(w=.5), size=1.5) +
  geom_hline(yintercept=0, linetype=2) +
  coord_flip() + theme_minimal() +
  geom_point(aes(aedecod, R), position=position_dodge(w=.5), size =2) +
  # scale_y_sqrt() +
  xlab(NULL) + labs( title = "Corrected Rate") +
  scale_color_manual(values=structure(c("#001965",  "#808080"), 
                                      names = c("new drug", "old drug"))) 


# To calculate the numerator for the variance of Rate and Risk Ratio
aedecod_cmh_rate_se <- summary_by_trial %>% 
  group_by(aebodsys, aehlgt, aedecod, studyid) %>% 
  left_join(cmh_weight, by = c("studyid")) %>%
  summarise(ov_risk = w_n[1]*(sum(N)/sum(n_trt)- prod(P)),
            ov_rate = w_pye[1]*sum(E)/sum(pye_trt)) %>% 
  group_by(aebodsys, aehlgt, aedecod) %>%
  summarise(ov_risk = sum(ov_risk),
            ov_rate = sum(ov_rate)) 

# Calculate diffrent standard error to be able to calculate confidence intervals
aedecod_cmh_se <- aedecod_cmh %>% group_by(aebodsys, aehlgt, aedecod) %>% 
  left_join(aedecod_cmh_rate_se, by=c("aebodsys", "aehlgt", "aedecod")) %>%
  mutate(se_risk_diff  = sqrt(sum(var_p)),
         se_rate_diff  = sqrt(sum(var_r)),
         se_risk_ratio = sqrt(ov_risk/(prod(P_trt))),
         se_rate_ratio = sqrt(ov_rate/(prod(R_trt)))) 

# calculate the comparisons
comparison_cmh <- aedecod_cmh_se %>%
  summarise(N = sum(N),
            E = sum(E),
            # Risk diff
            risk_diff = diff_calc(P, names=trta, order = c("new drug", "old drug")),
            risk_diff_low  = (risk_diff/100-qnorm(0.975)*se_risk_diff[1])*100,
            risk_diff_high = (risk_diff/100+qnorm(0.975)*se_risk_diff[1])*100,
            risk_diff_pval = 2*(1-pnorm(abs(risk_diff)/100/se_risk_diff[1])),
            risk_diff_pval_adj = p.adjust(risk_diff_pval, method = "holm"),
            # Risk ratio
            risk_ratio = ratio_calc(P, names=trta, order = c("new drug", "old drug")),
            risk_ratio_low  = risk_ratio*exp(-qnorm(0.975)*se_risk_ratio[1]),
            risk_ratio_high = risk_ratio*exp( qnorm(0.975)*se_risk_ratio[1]),
            risk_ratio_pval = 2*(1-pnorm(abs(log(risk_ratio))/se_risk_ratio[1])),
            risk_ratio_pval_adj = p.adjust(risk_ratio_pval, method = "holm"),
            risk_harm = ifelse(risk_diff>1, "new drug", "old drug"),
            # Rate diff
            rate_diff = diff_calc(R, names=trta, order = c("new drug", "old drug")),
            rate_diff_low  = (rate_diff/100-qnorm(0.975)*se_rate_diff[1])*100,
            rate_diff_high = (rate_diff/100+qnorm(0.975)*se_rate_diff[1])*100,
            rate_diff_pval = 2*(1-pnorm(abs(rate_diff)/100/se_rate_diff[1])),
            rate_diff_pval_adj = p.adjust(rate_diff_pval, method = "holm"),
            # Rate ratio
            rate_ratio = ratio_calc(R, names=trta, order = c("new drug", "old drug")),
            rate_ratio_low  = rate_ratio*exp(-qnorm(0.975)*se_rate_ratio[1]),
            rate_ratio_high = rate_ratio*exp( qnorm(0.975)*se_rate_ratio[1]),
            rate_ratio_pval = 2*(1-pnorm(abs(log(rate_ratio))/se_rate_ratio[1])),
            rate_ratio_pval_adj = p.adjust(rate_ratio_pval, method = "holm"),
            rate_harm = ifelse(rate_ratio>1, "new drug", "old drug"))


# Create plot for rate ratioi
rate_ratio_plot <- comparison_cmh %>% filter(aedecod %in% to_plot$aedecod) %>% 
  mutate(aedecod = factor(aedecod, levels = levels)) %>%
  ggplot(aes(color=rate_harm)) +
  geom_linerange(aes(aedecod, ymin = rate_ratio_low, ymax = rate_ratio_high),
                 position=position_dodge(w=.5), size=1.5) +
  geom_hline(yintercept=1, linetype=2) +
  coord_flip() + theme_minimal() +
  geom_point(aes(aedecod, rate_ratio), position=position_dodge(w=.5), size =2)+ 
  labs( title = "Corrected Rate Ratio")+
  xlab(NULL) + ylab("Rate ratio") + scale_y_log10()+
  scale_color_manual(values=structure(c("#001965",  "#808080"), 
                                      names = c("new drug", "old drug"))) 



# Function for extracting labels
g_legend<-function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)}


# Get the legend from risk plot
legend <- g_legend(risk_plot + theme(legend.position="bottom"))


# Plot the three plot together
grid.arrange(arrangeGrob(risk_plot + theme(legend.position="none"), 
                         rate_plot + theme(legend.position="none",
                                           axis.title.y=element_blank(),
                                           axis.text.y=element_blank(),
                                           axis.ticks.y=element_blank()), 
                         rate_ratio_plot + theme(axis.title.y=element_blank(),
                                                 legend.position="none",
                                                 axis.text.y=element_blank(),
                                                 axis.ticks.y=element_blank()),
                         nrow=1, widths = c(0.5, 0.25, 0.25)),
             legend, nrow=2, heights=c(10, 1))