SHIVA analysis - Version 6.2

CHANGELOG
============================

Version 6.2
-----------------------------
- Add "Adjust CNA data" section the TCGA_population processing
- Amplification Threshold 3.9

Version 6.1
-----------------------------
genes removed from the study:

* BRAF, CNA: not used to allocate patients
* PIK3CB, CNA: not present in the original design
* FLT3 (CNA): not used to alloacte patients
* INPP4B (CNA): too many <NA> values in the raw data
library(xlsx)
library(tibble)
library(DT)
library(dplyr)
library(PrecisionTrialDesigner)
library(ggplot2)
library(parallel)
library(RColorBrewer)
library(VariantAnnotation)
library("org.Hs.eg.db") # GENOME wide annotation for HUman based on Entrez Gene identifiers. 

#R helper function library
source("helper_functions.R")

# Define Plot color schema
MYPALETTE <- colorRampPalette(rev(brewer.pal(11, "Spectral")), space="Lab")

set.seed(17)

PRE-PROCESSING - prepare the Input

We need to prepare the following inputs

  1. SHIVA clinical trail design: design of the panel
  2. SHIVA population
  3. TCGA population

1. SHIVA clinical trail design: design of the panel

Panel design

#import SHIVA Design
# --------------------------------------------------------------------------------
SHIVA_design <- xlsx::read.xlsx("../Supplementary_file/SHIVA_design_v6.1.xlsx", sheetName = "Sheet1")
# SHIVA_design <- readxl::read_excel("../Supplementary_file/SHIVA_design.xlsx"
#                                    , sheet = 1
#                                    , col_types = c("text", "text", "text", "text", "text", "text"))

#fixing input
SHIVA_design$mutation_specification <- as.character(SHIVA_design$mutation_specification)
SHIVA_design$mutation_specification <- sub("\"\"", "", SHIVA_design$mutation_specification)

#CONVERT AMINOACIT TO FORMAT COMPATIBLE FOR PANEL
for (i in 1:nrow(SHIVA_design)) {

  if (!is.na(SHIVA_design$mutation_specification[i]) && 
     SHIVA_design$mutation_specification[i] != "" && 
     SHIVA_design$mutation_specification[i] != "truncating") {
    
    #Print modification
    cat("Gene", as.character(SHIVA_design$gene_symbol[i]), ":"
        , SHIVA_design$mutation_specification[i]
        , "-->", convertAA(SHIVA_design$mutation_specification[i])
        , "\n")
    
    #use the helper function to convert the a.a.
    SHIVA_design$mutation_specification[i] <- convertAA(SHIVA_design$mutation_specification[i])
  }
}
## Gene AKT1 : Glu17Lys --> E17K 
## Gene PIK3CA : Glu545Lys --> E545K 
## Gene PIK3CA : Glu542Lys --> E542K 
## Gene PIK3CA : His1047Arg --> H1047R 
## Gene PIK3CA : Thr1025Ala --> T1025A 
## Gene PIK3CA : Glu453Lys --> E453K 
## Gene PIK3CA : Gln546Lys --> Q546K 
## Gene BRAF : Gly469Ala --> G469A 
## Gene BRAF : Val600Glu --> V600E 
## Gene ERBB2 : Ser792Phe --> S792F 
## Gene ERBB2 : Thr862Ala --> T862A 
## Gene FLT3 : Met665Thr --> M665T 
## Gene KIT : Val852Ile --> V852I 
## Gene KIT : Asp572Gly --> D572G 
## Gene KIT : Met722Val --> M722V 
## Gene KIT : Pro838Ser --> P838S 
## Gene PDGFRA : Leu655Trp --> L655W
# preview
# --------------------------------------------------------------------------------
datatable_withbuttons(SHIVA_design)

2. SHIVA population

Import raw data

Import in the analysis the raw data. Add a column with the TCGA tumor ID confersion Supplementary_file/SHIVA_conversiontable.xlsx.

# Read in SHIVA raw data
merged_df <- xlsx::read.xlsx2("../Supplementary_file/SHIVA_authorTable.xlsx", sheetIndex = 1)
# merged_df <- readxl::read_excel("../Supplementary_file/SHIVA_authorTable.xlsx"
#                                 , sheet = 1
#                                 , col_names = TRUE
# #                                , col_types = rep("text", times=24)
#                                 )
                                 

# Import CNA data for the 496 profiled patients
cna_data <- xlsx::read.xlsx2("../Supplementary_file/SHIVA_cna_data.xls", sheetIndex = 1) 
# cna_data <- readxl::read_excel("../Supplementary_file/SHIVA_cna_data.xls"
#                                 , sheet = 1
#                                 , col_names = TRUE)

Adjust shiva population

The objective in this section is to calculate patients that had usefull mutations for the study and calculate the detection power based on the new authors data.

Using the Authors table, we want to keep those patients with

  • Localizations compatible with TCGA
  • remove those patients that do not have either NGS or CNA performed. (this will reduce the dataset to only those that are actually with a complete profile).
  • find patients with no CNA data (all genes tested for CNA having NA).
  • We are no longer including HR in the analysis. So we have removed the filter on expression.

of the remaing now, you can check how many are eligible for randomization and therefore calculate the expected detection power.

Select patients with a full data profile (NGS + CNA) on which to run some population inference.

Also, we will remove those patients that were not randomized for any particular reasons. Why? because they are not allocated to the trial and we want to keep the input data as similar to the simulation dataset.

Finally, we removed patients with TCGA tumor types acyc, sclc, es, because have no CNA data in TCGA population.

# Patient filtering - 
# --------------------------------------------------------------------------------
# All patients have CNA data, so we do not include any filter here. 
df_full_profile <- merged_df %>%
  # filter by patients that do not have a Complete profile, CNA are OK, no patients filtered there.
  #dplyr::filter(IHC.for.HR.expression == "Done") %>%
  dplyr::filter(!NGS == "Not done") %>%
  # filter those patients with locations incompatible with the TCGA 
  dplyr::filter(!TCGA_tumor_ID == "") %>%
  # Remove patients that were not randomized because of problems
  # dplyr::filter(Reason.for.no.randomization == "ERROR")
  # Remove patients with tumor types  acyc, sclc, es, becuase have no CNA data in TCGA population
  dplyr::filter(!TCGA_tumor_ID %in% c("acyc", "sclc", "es"))
  
# Define tumor type list in the population
# --------------------------------------------------------------------------------
tumor_types <- as.character(unique(df_full_profile$TCGA_tumor_ID))
# calculate weights
tumor_weights <- table(as.character(df_full_profile$TCGA_tumor_ID))
# calculate frequency
tumor_freq <- prop.table(tumor_weights)

# Preview
# --------------------------------------------------------------------------------
datatable(df_full_profile
          # ADD BUTTONS TO THE TABLE
          , extensions = 'Buttons'
          , options = list(
               dom = 'lBfrtip'
              , buttons = c('copy', 'csv', 'excel')
              )
          , caption = "Patients with a complete profile"
          )
data.frame(tumor_weights) %>% 
  dplyr::rename(Tumor = Var1, Count = Freq) %>%
  ggplot(aes(Tumor, Count)) +
    geom_bar(stat="identity", position=position_dodge() ) + 
    ggtitle("Number of Tumor type in the SHIVA population") +
    coord_flip()

Convert the raw data from the SHIVA study to a PTD population object

Ideally we should convert the SHIVA dataset to be compatible with PTD, and then import it as a population background. Then I will basically have two population backgrounds, the SHIVA, and the TCGA genrated one. And I can compare them easily to validate whether TCGA data can simulate the SHIVA situation.

Prepare CNA data
################################################################################
# CONVERT SHIVA POPULATION TO BE CAMPATIBLE WITH PTD
################################################################################


################################################################################
# PREPARE CNA DATA
# ------------------------------------------------------------------------------
# struttura da ricreare
# LISTA:
#  $copynumber
#     $data
#        $gene_symbol
#        $CNA
#        $case_id   (CONTIENE TUTTI I NOMI DEI CAMPIONI CON ALTERAZIONI)
#        $tumor_type
#     $Samples
#        $brca  (CONTIENE TUTTI I NOMI DEI CAMPIONI)

# Create Data
# ----------------------
cna_long <- cna_data %>%
  tidyr::gather("gene_symbol", "CNA", 2:ncol(cna_data)) %>%
  dplyr::select(gene_symbol, CNA, case_id = Patient.ID) %>%
  dplyr::filter(case_id %in% df_full_profile$Patient.ID) %>% # Filter by patients.ID we are interested in
  #dplyr::mutate(tumor_type="SHIVA") %>% # add fake tumor name
  dplyr::mutate(case_id=as.character(case_id)) # conver fctr to chr
## Warning: attributes are not identical across measure variables; they will
## be dropped
# preview before correction
cna_long %>%
  dplyr::group_by(gene_symbol, CNA) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x=CNA, y=counts)) + 
    geom_bar(stat="identity", position=position_dodge()) +
    labs(title="Alterations Split by gene (BEFORE correction)") +
    coord_flip()

fixIT <-  function(from, to, target=cna_long$CNA){
  #print(paste("^",from,"$", sep=""))
  gsub(paste("^",from,"$", sep=""), to , target)
}

# fix CNA encoding
# ------------------------------------------------------------------------------
cna_long$CNA <- fixIT("delete_hom",                     "deletion")
cna_long$CNA <- fixIT("normal, delete_het, delete_hom", "deletion")
cna_long$CNA <- fixIT("delete_het, delete_hom",         "deletion")
cna_long$CNA <- fixIT("delete_het, loh",                "deletion")
cna_long$CNA <- fixIT("normal, delete_hom, loh",        "deletion")
cna_long$CNA <- fixIT("normal, delete_hom",             "deletion")

cna_long$CNA <- fixIT("amplification, gain",           "amplification")
cna_long$CNA <- fixIT("amplification, gain, normal",   "amplification")
cna_long$CNA <- fixIT("amplification, normal",         "amplification")
#cna_long$CNA %>% table()

# Convert to normal
# ------------------------------------------------------------------------------
cna_long$CNA <- fixIT("amplification, normal, delete_het", "normal")
cna_long$CNA <- fixIT("amplification, normal, loh",        "normal")
cna_long$CNA <- fixIT("gain, delete_het",                  "normal")
cna_long$CNA <- fixIT("gain, delete_hom",                  "normal")
cna_long$CNA <- fixIT("gain, normal, delete_hom",          "normal")
cna_long$CNA <- fixIT("normal, delete_het",                "normal")

cna_long$CNA <- fixIT("amplification, delete_het",     "normal")
cna_long$CNA <- fixIT("delete_het",                    "normal")
cna_long$CNA <- fixIT("gain, loh",                     "normal")
cna_long$CNA <- fixIT("gain",                          "normal")
cna_long$CNA <- fixIT("gain, normal, loh",             "normal")
cna_long$CNA <- fixIT("NA",                            "normal")
cna_long$CNA <- fixIT("normal, loh",                   "normal")
cna_long$CNA <- fixIT("gain, normal",                  "normal")
#cna_long$CNA %>% table()

# TEST 
# clean up dataset from not standard variation names
# cna_long <- cna_long %>%
#   mutate_cond(!(CNA %in% "normal" | 
#              CNA %in% "deletion" |
#              CNA %in% "amplification")
#                   , CNA = "normal"
#              )
  




# Get patients list and associated tumor id
# ------------------------------------------------------------------------------
temp <- df_full_profile %>% 
  mutate(case_id = Patient.ID) %>% 
  dplyr::select(tumor_type = TCGA_tumor_ID
                ,  case_id)

# join patients  
cna_long <- dplyr::left_join(cna_long, temp, by= "case_id")
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factor and character vector, coercing into character vector
# add CNAvalue
# cna_long <- cna_long %>%
#   dplyr::mutate(CNAValue = 0) %>%
#   mutate_cond(CNA %in% "amplification", CNAValue = 2) %>%
#   mutate_cond(CNA %in% "deletion", CNAValue = -2)


# Create Sample
# ----------------------
# get list of tumors
tumor_chr <-  df_full_profile$TCGA_tumor_ID %>% 
  unique %>% 
  as.character

# prepare funtion to extract sample names for each tumor
sampleXtumor <- function(x, df){
  df %>% 
    dplyr::select(Patient.ID, TCGA_tumor_ID) %>%
    dplyr::filter(TCGA_tumor_ID %in% x) %>% .[["Patient.ID"]] %>%
    as.character()
}
# Extract samples for each tumors
Samples <- purrr::map(tumor_chr, ~ sampleXtumor(., df=df_full_profile))
names(Samples) <- tumor_chr

# Check if there are problems. 
testthat::expect_equal(length(unique(cna_long$case_id)), nrow(df_full_profile))
testthat::expect_equal(length(unlist(Samples)), nrow(df_full_profile))

# Create list element for CNA for the DataFull argument of the panel (panel@DataFull)
# ----------------------
copynumber <- list(  data = data.frame(cna_long)
                   , Samples = Samples)
Preview data distribution for CNA
copynumber$data %>%
  dplyr::group_by(CNA) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x=CNA, y=counts)) + 
    geom_bar(stat="identity", position=position_dodge()) +
    labs(title="Alterations Split by gene (after correction)") +
    coord_flip()

copynumber$data %>%
  dplyr::group_by(gene_symbol, CNA) %>%
  dplyr::summarise(counts = n()) %>%
  DT::datatable(caption = "Alterations Split by gene")
copynumber$data %>%
  dplyr::filter(CNA %in% c("amplification", "deletion")) %>%
  dplyr::group_by(gene_symbol, CNA) %>%
  dplyr::summarise(counts = n()) %>%
  DT::datatable(caption = "Alterations Split by gene (keep only deletion and amplification terms")
# Alterations Split by gene (keep only deletion and amplification terms
copynumber$data %>%
  dplyr::filter(CNA %in% c("amplification", "deletion")) %>%
  dplyr::group_by(gene_symbol, CNA) %>%
  dplyr::summarise(counts = n()) %>%
  ggplot(aes(x=gene_symbol, y=counts, fill=CNA)) + 
    geom_bar(stat="identity", position=position_dodge()) +
    labs(title="Alterations Split by gene (keep only deletion and amplification terms") +
    coord_flip()

Prepare SNV data
################################################################################
# PREPARE MUTATION DATA
# ------------------------------------------------------------------------------
#   str(panel@dataFull$mutations)

# Convert Gene Name to HGNC standard
# --------------------------------------
wrongGName <- c("RAPTOR","PI3KCA","PIK3CB")
rightGName <- c("RPTOR"  ,"PIK3CA"  ,"PI3KCB")
for (i in 1:length(wrongGName)) {
    if (i == 1) {
        temp <- gsub(wrongGName[i] , rightGName[i] , df_full_profile$Mutated_genes)
    } else {
        temp <- gsub(wrongGName[i] , rightGName[i] , temp)
    }
}
df_full_profile$gene_symbol <- temp

# Select fields of interest and start preformatting
#--------------------------------------
mutation_data <- df_full_profile %>% 
  dplyr::select(Patient.ID
                , amino_acid_change = pp_old
                , tumor_type = TCGA_tumor_ID
                , gene_symbol
                #, nm = NM_old
                ) %>%
  dplyr::mutate(amino_acid_change = as.character(amino_acid_change)
                , tumor_type = as.character(tumor_type)
                #, nm = as.character(nm)
                )


# Prepare function for unnesting Genes and the data
#--------------------------------------
unnest_mutationData <- function(df){
  
  # split genes and mutations by ";"
  # -----------------------------------------------
  genes <- strsplit(df$gene_symbol, ";")
  mutations <- strsplit(df$amino_acid_change, ";")
  #transcripts <- strsplit(df$nm, ";")

  # Quality check
  # -----------------------------------------------
  gene_lengths <- purrr::map_int(genes, length)
  mutations_lengths <- purrr::map_int(mutations, length)
  #transcripts_lengths <- purrr::map_int(transcripts, length)

  index <- gene_lengths != mutations_lengths
  if (any(index)) {
    warning("some genes, do not have matching genes-alteration formatting and they will be removed\n")
    print(df[index, "Patient.ID"])
    
    # FIX - REMOVE THEM 
    # ~~~~~~~~~~~~~~~~~
    df <- df[-which(index),]
  }
  
  # Unnesting genes and alterations
  # -----------------------------------------------
  unnested_gene <- df %>% tidyr::separate_rows(gene_symbol, sep = ";")
  unnested_pp <- df %>% tidyr::separate_rows(amino_acid_change, sep = ";")
  
  #merge them back together
  unnested_gene$amino_acid_change <- unnested_pp$amino_acid_change
  #unnested_gene$nm <- unnested_nm$nm
  
  # unnest alterations, second level, split by "|"
  unnested_df <- unnested_gene %>%
    tidyr::separate_rows(amino_acid_change, sep = "\\|")# %>% unique

  # Quality Control
  # ----------------------------------------------
  if(sum((unnested_df$amino_acid_change) != "") != purrr::map_int(strsplit(df$amino_acid_change, ";|\\|"), length) %>% sum()){
    stop("unnesting did not work as expected")
  }

  return(unique(unnested_df))
}

# unnest the table
unnested_df <- unnest_mutationData(mutation_data)
## Warning in unnest_mutationData(mutation_data): some genes, do not have matching genes-alteration formatting and they will be removed
##  [1] 08-0024 03-0007 07-0098 04-0025 05-0042 06-0096 07-0099 06-0113
##  [9] 07-0027 01-0222 04-0039
## 496 Levels: 01-0001 01-0002 01-0003 01-0004 01-0007 01-0008 ... 08-0027
# Adjust AA change
#--------------------------------------
unnested_df$amino_acid_change <- sub("^p." , "" , unnested_df$amino_acid_change)
unnested_df$amino_acid_change <- sub("[" , "" , unnested_df$amino_acid_change, fixed = TRUE)
unnested_df$amino_acid_change <- gsub("Qln" , "Gln" , unnested_df$amino_acid_change, fixed = TRUE)
# conver amminoacid for from long to short
unnested_df$amino_acid_change <- purrr::map_chr(unnested_df$amino_acid_change, hgvs_long_to_short)
# Fix strange simbols
unnested_df <- unnested_df %>% 
  mutate_cond(amino_acid_change %in% c("NA", "", "XXXX"), amino_acid_change = "") # custom made conditional mutate
# parse aa position
unnested_df$amino_position <- stringr::str_extract(unnested_df$amino_acid_change , "\\d+")

# Extract and prepare Sample info
#--------------------------------------
temp_sampledf <- unnested_df %>% dplyr::select(Patient.ID, TCGA_tumor_ID = tumor_type)
tumor_chr <- unique(temp_sampledf$TCGA_tumor_ID)
Samples <- purrr::map( tumor_chr, ~ sampleXtumor(., df= temp_sampledf))
names(Samples) <- tumor_chr

# Subset dataset where there are mutations
#--------------------------------------
# !!!(THIS HAS TO BE DONE AFTER, RETRIEVING THE SAMPLES for the panel structure) !!!!
# otherwise we risk to miss patients that are not included simply becuase they have no alteration

# Load entrez ID
xx <- as.list(org.Hs.egALIAS2EG)
# Remove pathway identifiers that do not map to any entrez gene id
xx <- xx[!is.na(xx)]

unnested_df <- unnested_df %>%
  dplyr::filter(!gene_symbol == "") %>% # remove those patients that DO NOT have a gene matched to an alteration 
  dplyr::mutate(Entrez_gene_ID = sapply(gene_symbol , function(x) as.integer(xx[[x]][1])))


# Very shallow classification of variants
unnested_df$mutation_type <- with( unnested_df , 
                                    ifelse(grepl("Ter$" , amino_acid_change) , "Nonsense_Mutation" , 
                                    ifelse(grepl("fs$" , amino_acid_change) , "Frame_Shift_Ins" , 
                                    ifelse(grepl("del$" , amino_acid_change) , "Frame_Shift_Del" , 
                                    ifelse(grepl("ins" , amino_acid_change) , "Frame_Shift_Ins" ,  
                                    "Missense_Mutation"))))
                                    )

# dummy genomic positon as a place holder
unnested_df$genomic_position <- "1:11111:C,G"
#"7:140439656:C,G"
# dummy genetic profile as a place holder
unnested_df$genetic_profile_id <- unnested_df$tumor_type
# change name of Patient.ID column to case_id
#colnames(unnested_df)[colnames(unnested_df)=="Patient.ID"] <- "case_id"
# SOrf df
mutations_df <- unnested_df %>% 
  dplyr::select(entrez_gene_id= Entrez_gene_ID
                , gene_symbol
                , case_id = Patient.ID
                , mutation_type
                , amino_acid_change
                , genetic_profile_id
                , tumor_type
                , amino_position
                , genomic_position
                ) %>%
  dplyr::mutate(amino_position = as.integer(amino_position)) %>%
  dplyr::mutate(case_id = as.character(case_id))


# Create list element for CNA for the DataFull argument of the panel (panel@DataFull)
# ----------------------
mutations <- list(  data = data.frame(mutations_df)
                  , Samples = Samples)


# PUT THEM TOGETHER
################################################################################
fusion = list(data = NULL
              , Samples = NULL
              )

expression = list(data = NULL
              , Samples = NULL
              )

SHIVA_repo <- list(fusions = fusion
                   , mutations = mutations
                   , copynumber = copynumber
                   , expression = expression
                   )

Preview the SNV data

mutations$data %>%
  dplyr::group_by(gene_symbol) %>%
  dplyr::summarise(count=n()) %>%
  ggplot(aes(x=gene_symbol, y=count)) + 
    geom_bar(stat="identity", position=position_dodge()) +
    labs(title="Total SNV (before any filtering) from the SHIVA population") +
    coord_flip()

Simulate Clinical trial on SHIVA population

# DESIGN
SHIVA_population <- newCancerPanel(SHIVA_design)
#DOWNLOAD alternative
SHIVA_population <- getAlterations(SHIVA_population
                              , tumor_type = tumor_types
                              , repos = SHIVA_repo
                              )
#SIMULATE
SHIVA_population <- subsetAlterations(SHIVA_population)

#save a backup copy to speed up future analysis
saveRDS(SHIVA_population, file = "../Temp/SHIVA_population.rds")

3. TCGA population

Simulate Clinical trial on TCGA population

# DESIGN
TCGA_population <- newCancerPanel(SHIVA_design)
#DOWNLOAD alternative
TCGA_population <- getAlterations(TCGA_population
                              , tumor_type = tumor_types
                              , expr_z_score = 1
                              )
#SIMULATE
TCGA_population <- subsetAlterations(TCGA_population)

#save a backup copy to speed up future analysis
saveRDS(TCGA_population, file = "../Temp/TCGA_population.rds")

Filter pathogenic mutation

Remove NON pathogenic mutation fetched from cosmic dataset (01-05-2017).

# Clean TCGA popualation running a cosmic filter
cosmic <- readr::read_csv("../external_resources/cosmic_pathogenic_SHIVApanel")

# Prepare filter function
# ######################################################################
filterGene <- function(gene_name, cosmic_df, tcga_df){
  
  print("cycle: " %++% gene_name)
  
  # PREPARE COSMIC DATA
  # --------------------------------------------
  
  # fetch aa mutations
  temp <- cosmic_df %>%
    dplyr::filter(gene %in% gene_name) %>%
    dplyr::select(AA.Mutation)
  
  # clean them
  temp <- purrr::map(temp, ~ (gsub("p.", "", .))) %>% unlist()
  
  # prepare df
  cosmic_clean <- cosmic_df %>%
    dplyr::filter(gene %in% gene_name) %>%
    dplyr::mutate(AA = temp) %>%
    dplyr::select(gene, AA)
  
  # RUN FILTER ON TCGA
  # --------------------------------------------
  TCGA_selection <- tcga_df %>%
    dplyr::filter(gene_symbol %in% gene_name) %>%
    dplyr::filter(amino_acid_change %in% cosmic_clean$AA)
  
  return(TCGA_selection)
} 

# Apply the filter
# ------------------------------------------------
TCGA_filtered <- purrr::map_df(unique(cosmic$gene)
                              , ~ filterGene(.
                                             , cosmic
                                             , TCGA_population@dataFull$mutations$data
                                             )
                )
## [1] "cycle:  ABL1"
## [1] "cycle:  AKT1"
## [1] "cycle:  BRAF"
## [1] "cycle:  EGFR"
## [1] "cycle:  ERBB2"
## [1] "cycle:  FLT3"
## [1] "cycle:  KIT"
## [1] "cycle:  PGDFRA"
## [1] "cycle:  PIK3CA"
## [1] "cycle:  PTEN"
## [1] "cycle:  RET"
## [1] "cycle:  SRC"
## [1] "cycle:  STK11"
# Replace population data
TCGA_original_number <- nrow(unique(TCGA_population@dataFull$mutations$data))

# apply filter
TCGA_population@dataFull$mutations$data <- TCGA_filtered
# re run the simulation
TCGA_population <- subsetAlterations(TCGA_population)
  • Filtering out ** 27.72 % ** of the TCGA original SNVs.

Adjust CNA data

!!!!WE NEED TEXT HERE TO DESCRIBE!!!!!

# SETTINGS
# ==============================================================================
# DEFINE GENES: define genes we are interested for the CNA analysis
CNA_genes <- TCGA_population@arguments$panel %>%
  dplyr::filter(alteration == "CNA") %>%
  dplyr::select(gene_symbol) %>%
  .[[1]]

# PATH: set global variable with the path to.rds files with CNA raw data
PATH_2CNA_FILES <- "/Users/ale/IEO/projects/IEO_research/R_package_panel_and_projection/functionfactory/cna_firehose_rds/"


# helper function that reads in a firehouse RDS file, filters by gene, and apply the threshold configured
# ==============================================================================
get_cna_long <- function(tumor_type, PATH_2rds, genes_chr, del_thr = 1.1, amp_thr = 5.9){
  
  
  # CHECKS
  # --------------------------------------------------------------------------------
  # check that PATH actually leads to a dir
  if(!dir.exists(PATH_2rds)){ stop("path to a dir that cannot be found")}

  # Check that all the files are found for each tumor type required in the analysis
  input_lgl <- purrr::map_lgl(tumor_types, ~ file.exists(paste0(PATH_2rds, toupper(.) , "_PTD.rds")))
  
  if(!all(input_lgl)){
    stop(paste("Files for some of the tumor types where not found:", tumor_types[-which(input_lgl)]))
  }
 
  # Checks
  if(!is.character(tumor_types) | is.null(tumor_types) | length(tumor_types) ==0){
    stop(paste("Tumor type is not confirm", tumor_types))
  }


  # Define an internal function to do the job
  # -------------------------------------------------------------------------------
  get_CNA_long_onetumor <- function(one_tumor, PATH_2rds, genes_chr, del_thr = 1.1, amp_thr = 2.9){
    
    # read in CNA file
    mat <- readRDS(paste0(PATH_2rds, toupper(one_tumor) , "_PTD.rds"))
    
    # extract Patients
    pats <- colnames(mat)

    # Filter for gene of interest
    mat <- mat[ genes_chr , ]

    # Melt and adjust
    output <- reshape2::melt(mat , value.name = "CNAvalue")
    colnames(output) <- c("gene_symbol" , "case_id" , "CNAvalue")
      output$tumor_type <- one_tumor
    # Choose cutoff values
    # -----------------------------------------------------------------------------
    # With default threshold data are now like this: 
        # [0 , 1.1) "deletion"
        # 1.1 "shallow deletion"
        # 2 "normal"
        # 2.9 "shallow amplification"
        # (2.9 , Inf] "amplification"
    output$CNA <- ifelse( output$CNAvalue <del_thr 
                    , "deletion" 
                    , ifelse(output$CNAvalue > amp_thr , "amplification" , "normal"))
    # select
    output <- output %>%
      dplyr::select(gene_symbol , CNA , case_id , tumor_type, CNAvalue) %>%
      dplyr::mutate(gene_symbol = as.character(gene_symbol)) %>%
      dplyr::mutate(case_id = as.character(case_id)) 
    
    return(output)
  }

  # Main body of the function
  # -----------------------------------------------------------------------------
  cna_long_df <- purrr::map(tumor_types
                             , ~ get_CNA_long_onetumor(., PATH_2rds, genes_chr, del_thr, amp_thr)
                             ) %>%
    dplyr::bind_rows()
    
  return(cna_long_df)  

}
  

# RUN MAIN FUNCTION
# ==============================================================================
cna_long_df <- get_cna_long(tumor_types, PATH_2CNA_FILES, CNA_genes, del_thr = 1.1, amp_thr = 3.9)


# Create Sample
# ----------------------
# get list of tumors
tumor_chr <-  cna_long_df$tumor_type %>% 
  unique %>% 
  as.character

# prepare funtion to extract sample names for each tumor
sampleXtumor <- function(x, df){
  df %>% 
    dplyr::select(case_id, tumor_type) %>%
    dplyr::filter(tumor_type %in% x) %>% .[["case_id"]] %>%
    as.character() %>%
    unique()
}
# Extract samples for each tumors
Samples <- purrr::map(tumor_chr, ~ sampleXtumor(., df=cna_long_df))
names(Samples) <- tumor_chr

# Check if there are problems. 
testthat::expect_equal(length(unlist(Samples)), length(unique(cna_long_df$case_id)))

# Create list element for CNA for the DataFull argument of the panel (panel@DataFull)
# ==============================================================================
copynumber <- list(  data = data.frame(cna_long_df)
                   , Samples = Samples)


# Preview changes before committing
#TCGA_population@dataFull$copynumber <- copynumber
TCGA_cna_pop_comparison <- purrr::map2( 
    .x = list( TCGA_population@dataFull$copynumber$Samples, copynumber$Samples )
  , .y = list( "TCGA", "firehouse" )
  , .f = ~ purrr::map_int(.x, length) %>%
      data.frame(n_patients = .) %>%
      tibble::rownames_to_column("tumor_type") %>%
      dplyr::mutate(source = .y)
  ) %>% 
  dplyr::bind_rows() %>%
  ggplot(aes(tumor_type, n_patients, fill=source)) + 
    geom_bar(stat="identity", position= position_dodge()) +
    coord_flip() + 
    ggtitle("number of patients in the CNA datasets")
# Apply changes
TCGA_population@dataFull$copynumber <- copynumber
TCGA_population <- subsetAlterations(TCGA_population)
## Subsetting mutations...
## Subsetting copynumber...

RESULTS - comparison between the TCGA population and SHIVA

  • ANALYSIS 1) % of patients found to have an alteration that could be allocated to the targeted therapy group;
  • ANALYSIS 2) Gene alteration frequency;
  • ANALYSIS 3) Patient allocation;
  • ANALYSIS 4) Power analysis of the study.

1. Detection power

Detection power TCGA population

# Run simulation
# ------------------------------------------------------------------------------
N_SIMULATIONS = 200

simul <- mclapply(1:N_SIMULATIONS , function(x) {
            cov <- coveragePlot(TCGA_population 
                    , alterationType=c("mutations" , "copynumber") 
                    , collapseByGene=TRUE
                    , tumor.weights = tumor_weights
                    , maxNumAlt = 7
                    , noPlot=TRUE
                     )
            return(cov$plottedTable/cov$Samples[1])
        } , mc.cores=detectCores())

## Prepare function for plotting Detection power with Confidenti intervals
# ------------------------------------------------------------------------------

# Convert to dataframe
simul <- data.frame(matrix(unlist(simul)
                        , nrow=length(simul)
                        , byrow=T )
                 , stringsAsFactors = FALSE)

# GET STATS
# ------------------------------------------------------------------------------
# Calculate for each n° of alteration, the mean
detectionPower <- apply(simul,2,mean)

# Calculate lower CI for bootstrap
leftCI <- apply(simul,2,function(x){left <- quantile(x, 0.025)})

# Calculate upper CI for bootstrap
rightCI <- apply(simul,2,function(x){right <- quantile(x, 0.975)})

# Preapre for column with # alterations
n_alterations <- 1:length(detectionPower)

# Put them all together in a data frame
dtpow <- data.frame(n_alterations, detectionPower, leftCI, rightCI)

# Generate the plot
TCGA_detection_image <- dtpow %>%
  ggplot(aes(x=n_alterations, y=detectionPower)) + 
  geom_bar(stat="identity" , position=position_dodge(), fill=MYPALETTE(10)[2]) + 
  scale_y_continuous(limits = c(0,1)) +
  scale_x_continuous(breaks = n_alterations, trans = "reverse") +
  geom_errorbar(aes(ymax=rightCI
                    , ymin=leftCI)
                , position=position_dodge(0.9)
  ) + 
  ggtitle("Overall Detection power in TCGA Population") +
  annotate("text"
           , x=n_alterations
           , y=detectionPower+0.15
           , label= sapply(detectionPower*100
                           , function(x) paste(c(round(x, digits=2), "%")
                                               , collapse=" ")
           )
  ) +
  coord_flip()

#Export as a svg
#img_file = "../Figures/fig4A.svg"
#ggsave(filename=img_file, plot=image, device = "svg")
#message("file saved in" %++% img_file)

# Show image
TCGA_detection_image

# Preview table
datatable_withbuttons(dtpow, caption = "Detection power TCGA")
Contribution to the detection power allocated by alteration type.
# MUTATION CONTRIBUTIONS
# ------------------------------------------------------------------------------

simul <- mclapply(1:N_SIMULATIONS , function(x) {
            cov <- coveragePlot(TCGA_population 
                    , alterationType=c("mutations") 
                    , collapseByGene=TRUE
                    , tumor.weights = tumor_weights
                    , maxNumAlt = 7
                    , noPlot=TRUE
                     )
            return(cov$plottedTable/cov$Samples[1])
        } , mc.cores=detectCores())

TCGA_detection_muts <- simulate_detectionPower_img(simul)
## Saving 10 x 8 in image
## file saved in ../Figures/fig4A.svg
# Preview image
#TCGA_detection_muts + ggtitle("Mutation only Detection Power in TCGA")

# preview table
simulate_detectionPower_tbl(simul) %>%
  datatable_withbuttons(caption = "Mutation only Detection Power in TCGA")
# COPYNUMBER CONTRIBUTION
# ------------------------------------------------------------------------------
simul <- mclapply(1:N_SIMULATIONS , function(x) {
            cov <- coveragePlot(TCGA_population 
                    , alterationType=c("copynumber") 
                    , collapseByGene=TRUE
                    , tumor.weights = tumor_weights
                    , maxNumAlt = 7
                    , noPlot=TRUE
                     )
            return(cov$plottedTable/cov$Samples[1])
        } , mc.cores=detectCores())

TCGA_detection_cna <- simulate_detectionPower_img(simul)
## Saving 10 x 8 in image
## file saved in ../Figures/fig4A.svg
# Preview image
#TCGA_detection_cna + ggtitle("CNA only Detection Power in TCGA")

# Preview table
simulate_detectionPower_tbl(simul) %>%
  datatable_withbuttons(caption = "CNA only Detection Power in TCGA")
gridExtra::grid.arrange(TCGA_detection_muts + ggtitle("Detection Power in TCGA: Mutation only")
                        , TCGA_detection_cna + ggtitle("Detection Power in TCGA: CNA only "))

Detection power SHIVA population

# calculate shiva detection power
shiva_detect <- coveragePlot(SHIVA_population
             , alterationType = c("mutations", "copynumber")
             , collapseByGene = TRUE
             , maxNumAlt = 7
             , noPlot= TRUE
             )
## Warning in dataExtractor(object = object, alterationType =
## alterationType, : The following tumor types have no alteration to display:
## lihc, thca, acc, thym, ucs, prad
# calculate frequencies
shiva_freq <- as.numeric(shiva_detect$plottedTable/shiva_detect$Samples[1])

# Preapre for column with # alterations
n_alterations <- 1:length(shiva_freq)
  
# Put them all together in a data frame
dtpow_shiva <- data.frame(n_alterations, shiva_freq= shiva_freq)
  
# left CI and right CI
dtpow_shiva$leftCI <- purrr::map_dbl(shiva_freq, ~ leftCIprop(., sum(tumor_weights)))
dtpow_shiva$rightCI <- purrr::map_dbl(shiva_freq, ~ rightCIprop(., sum(tumor_weights)))

# Generate the plot
SHIVA_detection_image <- dtpow_shiva %>%
  ggplot(aes(x=n_alterations, y=shiva_freq)) + 
    geom_bar(stat="identity" , position=position_dodge(), fill=MYPALETTE(10)[10]) + 
    scale_y_continuous(limits = c(0,1)) +
    scale_x_continuous(breaks = n_alterations, trans = "reverse") +
    geom_errorbar(aes(   ymax= leftCI
                       , ymin= rightCI)
                   , position=position_dodge(0.9)
    ) + 
    ggtitle("Detection power SHIVA population") +
    annotate("text"
             , x=n_alterations
             , y=shiva_freq+0.15
             , label= sapply(shiva_freq*100
                             , function(x) paste(c(round(x, digits=2), "%")
                                                 , collapse=" ")
             )
    ) +
    coord_flip()
  
#Export as a svg
img_file = "../Figures/fig4B.svg"
ggsave(filename=img_file, plot=SHIVA_detection_image, device = "svg")
## Saving 10 x 8 in image
message("file saved in" %++% img_file)
## file saved in ../Figures/fig4B.svg
# Show image
SHIVA_detection_image

# Preview table
datatable_withbuttons(dtpow_shiva)
Contribution to the detection power allocated by alteration type.
coveragePlot(SHIVA_population
             , alterationType = c("mutations", "copynumber")
             , collapseByGene = TRUE
             , grouping = "alteration_id"
             #, noPlot=TRUE
             )
## Warning in coveragePlot(SHIVA_population, alterationType = c("mutations", :
## If 'alteration_id' is in grouping variables, you cannot collapse by gene.
## The option was set to FALSE
## Warning in dataExtractor(object = object, alterationType =
## alterationType, : The following tumor types have no alteration to display:
## lihc, thca, acc, thym, ucs, prad

# SHIVA mutations ONLY
cov <- coveragePlot(SHIVA_population
             , alterationType = c("mutations")
             , collapseByGene = TRUE
             , grouping = "alteration_id"
             , noPlot=TRUE
             )
## Warning in coveragePlot(SHIVA_population, alterationType =
## c("mutations"), : If 'alteration_id' is in grouping variables, you cannot
## collapse by gene. The option was set to FALSE
## Warning in dataExtractor(object = object, alterationType =
## alterationType, : The following tumor types have no alteration to display:
## lihc, uvm, meso, tgct, kirc, thca, acc, thym, ucs, prad
cov <- data.frame( cov$plottedTable / cov$Samples) %>%
  tidyr::gather(n_alterations, freq)
  
cov$leftCI <- purrr::map_dbl(cov$freq, ~ leftCIprop(., sum(tumor_weights)))
cov$rightCI <- purrr::map_dbl(cov$freq, ~ rightCIprop(., sum(tumor_weights)))

write.table(cov, file="~/Desktop/perluca_SNV_detection_SHIVA.txt", quote=FALSE, sep="\t")
# SHIVA mutations ONLY
cov <- coveragePlot(SHIVA_population
             , alterationType = c("copynumber")
             , collapseByGene = TRUE
             , grouping = "alteration_id"
             , noPlot=TRUE
             )
## Warning in coveragePlot(SHIVA_population, alterationType =
## c("copynumber"), : If 'alteration_id' is in grouping variables, you cannot
## collapse by gene. The option was set to FALSE
## Warning in dataExtractor(object = object, alterationType =
## alterationType, : The following tumor types have no alteration to display:
## lihc, thca, acc, thym, ucs, prad
cov <- data.frame( cov$plottedTable / cov$Samples) %>%
  tidyr::gather(n_alterations, freq)
  
cov$leftCI <- purrr::map_dbl(cov$freq, ~ leftCIprop(., sum(tumor_weights)))
cov$rightCI <- purrr::map_dbl(cov$freq, ~ rightCIprop(., sum(tumor_weights)))

write.table(cov, file="~/Desktop/perluca_CNA_detection_SHIVA.txt", quote=FALSE, sep="\t")

Show them together

gridExtra::grid.arrange(TCGA_detection_image, SHIVA_detection_image)

2. Gene alteration Frequency

SNV

###############################################################################
# Fetch SNV data from TCGA population 
# -----------------------------------------------------------------------------
tcga_wide <- purrr::map(1:N_SIMULATIONS
           , ~ cpFreq(TCGA_population, alteration="mutations", tumor.weights = tumor_weights) %>% 
                tidyr::spread(gene_symbol, freq)
           ) %>% bind_rows

# GET STATS
# ------------------------------------------------------------------------------
# Calculate for each n° of alteration, the mean
freq <- apply(tcga_wide,2,mean)

# Calculate lower CI for bootstrap
leftCI <- apply(tcga_wide,2,function(x){left <- quantile(x, 0.025)})

# Calculate upper CI for bootstrap
rightCI <- apply(tcga_wide,2,function(x){right <- quantile(x, 0.975)})


# Put them all together in a data frame
tcga_snv <- data.frame(freq, leftCI, rightCI, population="tcga") %>%
  tibble::rownames_to_column("gene") 

###############################################################################
# Fetch CNA data from SHIVA population 
# -----------------------------------------------------------------------------
temp <- cpFreq(SHIVA_population, alteration="mutations")
temp <- temp[which(temp[["gene_symbol"]] %in% SHIVA_population@arguments$panel$gene_symbol),]

# get CI
leftCI <- purrr::map_dbl(temp$freq, ~ leftCIprop(., sum(tumor_weights)))
rightCI <- purrr::map_dbl(temp$freq, ~ rightCIprop(., sum(tumor_weights)))

# get DF
shiva_snv <- temp %>%
  dplyr::rename(gene = gene_symbol) %>%
  dplyr::mutate(leftCI=leftCI
                , rightCI=rightCI
                ) %>%
  dplyr::mutate(population = "shiva")

###############################################################################
# put the plots together
# -----------------------------------------------------------------------------

# Put them together
merged_snv <- rbind(tcga_snv, shiva_snv)

# bind SHIVA and TCGA
snv_plot <- merged_snv %>%
  ggplot(aes(x=gene, y=freq, fill=population)) + 
  geom_bar(stat="identity", position=position_dodge()) +
  scale_fill_manual(values=MYPALETTE(10)[c(2,10)]) +
  #scale_y_continuous(limits = c(0,.7)) +
  geom_errorbar(aes(ymax=rightCI
                    , ymin=leftCI)
                , position=position_dodge(0.9)
  ) + 
 ggtitle("SNV frequency comparison in the two populations") +   
 coord_flip()


# snv_plot <- rbind(tcga_snv, shiva_snv) %>%
#   ggplot(aes(x=gene, y=freq, fill=population)) + 
#   geom_bar(stat="identity") +
#   facet_grid(. ~ gene, space="free", scales="free") + 
#   labs(title="SNV frequency comparison in the two populations") +
#   coord_flip()


# Preview
snv_plot

CNA

# Fetch CNA data from TCGA population 
# ------------------------------------------------------------------------------
simul_amplifications <- lapply(1:50, function(x) {
            cpFreq(TCGA_population, alteration="copynumber", tumor.weights = tumor_weights) %>%
                dplyr::select(amplification) %>% 
                tibble::rownames_to_column("gene") %>%
                tidyr::spread(gene, amplification)
        }) 

simul_amplifications <- simul_amplifications %>%  bind_rows()
  
simul_deletions <- lapply(1:50, function(x) {
            cpFreq(TCGA_population, alteration="copynumber", tumor.weights = tumor_weights) %>%
                dplyr::select(deletion) %>% 
                tibble::rownames_to_column("gene") %>%
                tidyr::spread(gene, deletion)
        }) %>%  bind_rows()
  
# GET STATS
# ------------------------------------------------------------------------------
TCGA_amplifications_df <- get_bootstrap_stats(simul_amplifications) %>%
  tibble::rownames_to_column("gene") %>%
  dplyr::mutate(population = "TCGA")

TCGA_deletion_df <- get_bootstrap_stats(simul_deletions) %>%
  tibble::rownames_to_column("gene") %>%
  dplyr::mutate(population = "TCGA")


# Fetch CNA data from SHIVA population 
# ------------------------------------------------------------------------------

# Amplifications
# ------------
shiva_cna_amplification <- cpFreq(SHIVA_population, alteration="copynumber") %>%
  tibble::rownames_to_column("gene") %>%
  dplyr::select(-deletion, -normal) %>%
  dplyr::rename(freq = amplification) %>%
  dplyr::mutate(population = "shiva") 

# Calculate left CI and right CI
shiva_cna_amplification$leftCI <- purrr::map_dbl(shiva_cna_amplification$freq, ~ leftCIprop(., sum(tumor_weights)))
shiva_cna_amplification$rightCI <- purrr::map_dbl(shiva_cna_amplification$freq, ~ rightCIprop(., sum(tumor_weights)))

# last formattings 
shiva_cna_amplification <-  shiva_cna_amplification %>%
  dplyr::mutate(row_counter = 1:nrow(shiva_cna_amplification)) %>%
  dplyr::select(gene, row_counter, freq, leftCI, rightCI, population)


# Deletion
# ------------
shiva_cna_deletion <- cpFreq(SHIVA_population, alteration="copynumber") %>%
  tibble::rownames_to_column("gene") %>%
  dplyr::select(-amplification, -normal) %>%
  dplyr::rename(freq = deletion) %>%
  dplyr::mutate(population = "SHIVA")

# Calculate left CI and right CI
shiva_cna_deletion$leftCI <- purrr::map_dbl(shiva_cna_deletion$freq, ~ leftCIprop(., sum(tumor_weights)))
shiva_cna_deletion$rightCI <- purrr::map_dbl(shiva_cna_deletion$freq, ~ rightCIprop(., sum(tumor_weights)))

# last formattings 
shiva_cna_deletion <-  shiva_cna_deletion %>%
  dplyr::mutate(row_counter = 1:nrow(shiva_cna_deletion)) %>%
  dplyr::select(gene, row_counter, freq, leftCI, rightCI, population)

# Prepare filter
# Removed genes not considered in the panel for amplifications or deletions
# ------------
amplification_filter <- SHIVA_design %>%
  dplyr::filter(exact_alteration == "amplification") %>%
  dplyr::select(gene_symbol) %>%
  .[[1]] %>% 
  as.character()

deletion_filter <- SHIVA_design %>%
  dplyr::filter(exact_alteration == "deletion") %>%
  dplyr::select(gene_symbol) %>%
  .[[1]] %>% 
  as.character()


# Merge datasets
# ------------
merged_cna_amplifications <- rbind(TCGA_amplifications_df, shiva_cna_amplification) %>% 
  dplyr::filter(gene %in% amplification_filter) 
merged_cna_deletion <- rbind(TCGA_deletion_df, shiva_cna_deletion) %>% 
  dplyr::filter(gene %in% deletion_filter) 


# bind SHIVA and TCGA | Amplification
# -----
CNA_plot_amplification <- merged_cna_amplifications %>%
  ggplot(aes(x=gene, y=freq, fill=population)) + 
  geom_bar(stat="identity", position=position_dodge()) +
  scale_fill_manual(values=MYPALETTE(10)[c(10,2)]) +
  #scale_y_continuous(limits = c(0,.7)) +
  geom_errorbar(aes(ymax=rightCI
                    , ymin=leftCI)
                , position=position_dodge(0.9)
  ) + 
 ggtitle("CNA Amplification frequency comparison in the two populations") +   
 coord_flip()


CNA_plot_deletion <- merged_cna_deletion %>%
  ggplot(aes(x=gene, y=freq, fill=population)) + 
  geom_bar(stat="identity", position=position_dodge()) +
  scale_fill_manual(values=MYPALETTE(10)[c(10,2)]) +
  #scale_y_continuous(limits = c(0,.7)) +
  geom_errorbar(aes(ymax=rightCI
                    , ymin=leftCI)
                , position=position_dodge(0.9)
  ) + 
 ggtitle("CNA Deletion frequency comparison in the two populations") +   
 coord_flip()


# Make the plot
gridExtra::grid.arrange(CNA_plot_amplification, CNA_plot_deletion, ncol=1, heights = c(20,5))

3. Patient allocation by pathway

Shiva population

coveragePlot(SHIVA_population
             , alterationType = c("mutations", "copynumber")
             , collapseByGene = TRUE
             , grouping = "group"
             , noPlot = TRUE
             )
## Warning in dataExtractor(object = object, alterationType =
## alterationType, : The following tumor types have no alteration to display:
## lihc, thca, acc, thym, ucs, prad
## $plottedTable
##                        1  2 3 4 5 6 7 8 9 10
## group: PIK3/AKT/mTOR 104 12 1 0 0 0 0 0 0  0
## group: RAF/MEK        71 14 3 1 0 0 0 0 0  0
## 
## $Samples
## [1] 406 406
shiva_pathway <- coveragePlot(SHIVA_population
             , alterationType = c("mutations", "copynumber")
             , collapseByGene = TRUE
             , grouping = "group"
             , maxNumAlt = 1
             , noPlot = TRUE
             )
## Warning in dataExtractor(object = object, alterationType =
## alterationType, : The following tumor types have no alteration to display:
## lihc, thca, acc, thym, ucs, prad
# calcualte frequency
shiva_pathway <- data.frame(shiva_pathway$plottedTable /shiva_pathway$Samples) %>%
  tibble::rownames_to_column("group") %>%
  dplyr::rename(allocation =X1)
  
# Calculate left CI and right CI
shiva_pathway$leftCI <- purrr::map_dbl(shiva_pathway$allocation, ~ leftCIprop(., sum(tumor_weights)))
shiva_pathway$rightCI <- purrr::map_dbl(shiva_pathway$allocation, ~ rightCIprop(., sum(tumor_weights)))

# transform table
shiva_allocation <- shiva_pathway %>%
  dplyr::mutate(row_counter = 1:n(), population = "SHIVA") %>%
  dplyr::select(group, row_counter, allocation, leftCI, rightCI, population)

# pepare plot
shiva_allocation_img <- shiva_allocation %>%
  ggplot(aes(x=row_counter, y=allocation, fill=group)) + 
    geom_bar(stat="identity" , position=position_dodge()) +
    ggtitle("Shiva population pathway allocaiton frequency") 

# Preview
#shiva_allocation_img

TCGA population

simul <- mclapply(1:N_SIMULATIONS , function(x) {
       # calculate allocated patients 
     cov <- coveragePlot(TCGA_population
        , alterationType = c("mutations", "copynumber")
        , collapseByGene = TRUE
        , grouping = "group"
        , maxNumAlt = 1
        , tumor.weights = tumor_weights #tumor_weights
        , noPlot=TRUE
       ) 
     
      # get frequency
     as.data.frame(cov$plottedTable) %>%
      tibble::rownames_to_column("group") %>%
      dplyr::select(group, allocation= `1`) %>%
      dplyr::mutate(allocation = allocation /cov$Samples[1]) %>%
      tidyr::spread(group, allocation) %>%
      return()
     
        } , mc.cores=detectCores()) %>% bind_rows

# GET STATS
# ------------------------------------------------------------------------------
# Calculate for each n° of alteration, the mean
allocation <- apply(simul,2,mean)

# Calculate lower CI for bootstrap
leftCI <- apply(simul,2,function(x){left <- quantile(x, 0.025)})

# Calculate upper CI for bootstrap
rightCI <- apply(simul,2,function(x){right <- quantile(x, 0.975)})

# Preapre for column with the counter
row_counter <- 1:length(allocation)

# Put them all together in a data frame
tcga_allocation <- data.frame(row_counter, allocation, leftCI, rightCI, population="TCGA") %>%
  tibble::rownames_to_column("group")

# Generate the Plot
# ------------------------------------------------------------------------------
# Generate the plot
tcga_allocation_img <- tcga_allocation  %>%
  ggplot(aes(x=row_counter, y=allocation, fill=group)) + 
  geom_bar(stat="identity" , position=position_dodge()) + 
  scale_y_continuous(limits = c(0,1)) +
  scale_x_continuous(breaks = row_counter, trans = "reverse") +
  geom_errorbar(aes(ymax=rightCI
                    , ymin=leftCI)
                , position=position_dodge(0.9)
  ) + 
  ggtitle("Patient Allocation to molecular pathway threatment") +
  annotate("text"
           , x=row_counter
           , y=allocation+0.15
           , label= sapply(allocation*100
                           , function(x) paste(c(round(x, digits=2), "%")
                                               , collapse=" ")
           )
  ) +
  coord_flip()

# Preview
#tcga_allocation_img

Put them together

merged_allocation <- rbind(tcga_allocation, shiva_allocation) %>%
  dplyr::mutate(row_counter = 1:n()) 

merged_allocation %>%
  ggplot(aes(x=group, y=allocation, fill=population)) + 
    geom_bar(stat="identity" , position=position_dodge()) + 
    scale_fill_manual(values=MYPALETTE(10)[c(10,2)]) + 
    scale_y_continuous(limits = c(0,1)) +
    #facet_grid(. ~ group, space="free", scales="free") +
    #scale_x_discrete(limits = merged_allocation$group) +
    geom_errorbar(aes(ymax=rightCI
                      , ymin=leftCI)
                  , position=position_dodge(0.9)
    ) + 
    ggtitle("Patient Allocation to molecular pathway treatment, considering all the patients")  

datatable_withbuttons(merged_allocation)

Alternative allocation, with ambigous allocation and considering only eligible patients.

# ALLOCATION FUNCTION with not predefined rule
# ------------------
# Create function to perform the following tasks:
# 1. extract a weighted (according to SHIVA population) sample from the population
# 2. Simulate patient allocation according the defined schema
allocate_patients_fn <- function(population, weights=FALSE){
    
  #extract patient data from the panel
  if(weights){
    patient_lt <- dataExtractor(population
                  , alterationType = c("copynumber", "mutations")
                  , collapseByGene = TRUE 
                  , tumor.weights = weights
                  )
  } else {
    patient_lt <- dataExtractor(population
                  , alterationType = c("copynumber", "mutations")
                  , collapseByGene = TRUE 
                  )
  }
   
  #select general dataframe
  patient_df <- patient_lt$data
                
  # replace / with _
  patient_df[,2] <- gsub("/", "_", patient_df[,2])
  
  # generate a frequency table that indicates, for each patient, elibibility to group allocation, as well as tumor_type
  # one patient per row.
  patient_allocation_df <- patient_df %>%
    dplyr::select(group, case_id, tumor_type) %>%
    dplyr::mutate(found = TRUE) %>%
    unique() %>%
    tidyr::spread(group, found) 
  
  # SIMULATE PATIENTS ALLOCATION ACCORDING TO A SPECIFIC ORDER
  # ----------------------------------------
  # select ONLY PIK3_AKT_mTOR compatible patients
  PIK3_AKT_mTOR_assigned <- patient_allocation_df %>%
    dplyr::filter(is.na(RAF_MEK)) %>%
    dplyr::filter(PIK3_AKT_mTOR == TRUE)

  # select ONLY RAF_MEK compatible patients
  RAF_MEK_assigned <- patient_allocation_df %>%
    dplyr::filter(is.na(PIK3_AKT_mTOR)) %>%
    dplyr::filter(RAF_MEK == TRUE)
  
  # Select patients that would be allocated to either
  ambigous <- patient_allocation_df %>%
    dplyr::filter(RAF_MEK == TRUE) %>%
    dplyr::filter(PIK3_AKT_mTOR == TRUE)
  
  # freq of patient allocation in each treatment group
  df <- data.frame(PIK3_AKT_mTOR = nrow(PIK3_AKT_mTOR_assigned) / nrow(patient_allocation_df)
             , RAF_MEK = nrow(RAF_MEK_assigned) / nrow(patient_allocation_df)
             , ambiguos = nrow(ambigous) / nrow(patient_allocation_df)
  )
  return(df)
}
# TCGA
# ------------------------------------------------------------------------------
bootstrapping_df <- replicate(500
                                , allocate_patients_fn(TCGA_population, tumor_weights) 
                                , simplify=FALSE
                                ) %>% do.call("rbind", .)

# calculae the mean, sd and CI for each treatment group
bootstrappingMeans <- colMeans(bootstrapping_df)
lowerCI <- apply( bootstrapping_df , 2 , function(x) quantile(x, 0.025))
upperCI <- apply( bootstrapping_df , 2 , function(x) quantile(x, 0.975))


# Prepare dataframe for plotting
# ----------------------------
# covert to dataframe
tcga_df <- data.frame(mean=bootstrappingMeans
           , lowerSD = NA
           , upperSD = NA
           , lowerCI = lowerCI
           , upperCI = upperCI
           ) %>%
  tibble::rownames_to_column("group_treatment") %>%
  dplyr::arrange(mean) %>%
  dplyr::mutate(dataset="TCGA")



# SHIVA
# ------------------------------------------------------------------------------
shiva_df <- allocate_patients_fn(SHIVA_population)  %>%
  tidyr::gather(group_treatment, mean) %>%
  dplyr::mutate(lowerSD = NA
                , upperSD = NA
                , lowerCI = leftCIprop(mean, sum(tumor_weights))
                , upperCI = rightCIprop(mean, sum(tumor_weights))
                , dataset = "SHIVA"
                )


# merge dataframes together
merge_df <- rbind(shiva_df, tcga_df) %>%
  dplyr::rename(freq = mean)

# ----------------------------
#  PLOT2 - Same side barplot
# barplot, with only the genes in the panel
merge_df %>%
  dplyr::mutate(origin=paste(as.character(group_treatment), as.character(dataset), sep=":")) %>%
  ggplot(aes(x=origin, y=freq, fill=dataset)) + 
  geom_bar(stat="identity") + 
  scale_fill_manual(values=MYPALETTE(10)[c(10,2)]) + 
  geom_errorbar(aes(min=lowerCI, max=upperCI)) + 
  facet_grid(. ~ group_treatment, space="free", scales="free") +
  theme(axis.text.x=element_blank()
        , axis.ticks.x= element_blank()
        ) +
    labs(title="Comparison between expected allocation to molecular pathways")

merge_df %>%
  dplyr::select(-lowerSD, -upperSD) %>%
  datatable_withbuttons

4. Power analysis

SHIVA

survPowerSampleSize(SHIVA_population
                    , alterationType = c("mutations", "copynumber")
                    , HR = c(0.625)
                    , power = seq(0.6 , 0.9 , 0.1)
                    , alpha = 0.05
                    , case.fraction = 0.5
                    #, tumor.freqs = tumor_freq
)
## Warning in dataExtractor(object = object, alterationType =
## alterationType, : The following tumor types have no alteration to display:
## lihc, thca, acc, thym, ucs, prad

TCGA

survPowerSampleSize(TCGA_population
                    , alterationType = c("mutations", "copynumber")
                    , HR = c(0.625)
                    , power = seq(0.6 , 0.9 , 0.1)
                    , alpha = 0.05
                    , case.fraction = 0.5
                    , tumor.freqs = tumor_freq
)

SHIVA not Optimized

surv2 <- survPowerSampleSize(SHIVA_population
   , alterationType = c("mutations", "copynumber")
   , HR = 0.625
   , power = 0.8
   , alpha = 0.05
   , case.fraction = 0.5
   , ber=0.85
   , fu=1.883
   , tumor.freqs = tumor_freq
   , var = "group" 
   , noPlot=TRUE)
## Warning in dataExtractor(object = object, alterationType =
## alterationType, : The following tumor types have no alteration to display:
## lihc, thca, acc, thym, ucs, prad
datatable_withbuttons(surv2)

TCGA Optimized

simulSurv2 <- mclapply(1:100 , function(x) {
    survPowerSampleSize(TCGA_population
        , alterationType = c("mutations", "copynumber")
        , HR = 0.625
        , power = 0.8
        , alpha = 0.05
        , ber=0.85
        , fu=1.883
        , case.fraction = 0.5
        , collapseByGene=TRUE
        , tumor.weights = tumor_weights
        , var = "group" 
        , noPlot=TRUE
        #, priority.trial=TCGA_population@arguments$gr
        , priority.trial=c("PIK3/AKT/mTOR", "RAF/MEK")
        , priority.trial.order="optimal")$'HR:0.625 | HR0:1 | Power:0.8'$Summary
    } , mc.cores=detectCores())


summaryMat2 <- simulSurv2[[1]]
for(i in rownames(summaryMat2)){
    for(j in colnames(summaryMat2)){
        vec <- sapply(simulSurv2 , function(x) x[i , j])
        summaryMat2[i , j] <- paste(mean(vec) , 
                paste0( "(" , quantile(vec , 0.025) 
                        , "-" 
                        , quantile(vec , 0.975) , ")"))
    }
}
knitr::kable(summaryMat2)
RAF/MEK PIK3/AKT/mTOR Total
Screened 1420.85 (1177.075-1766) 0 (0-0) 1420.85 (1177.075-1766)
Eligible 199 (199-199) 291.72 (213.375-405.825) 490.72 (412.375-604.825)
Not.Eligible 931.12 (752.425-1189.025) 0 (0-0) 931.12 (752.425-1189.025)

SHIVA Optimized

shiva_surv <- survPowerSampleSize(SHIVA_population
    , alterationType = c("mutations", "copynumber")
    , HR = 0.625
    , power = 0.8
    , alpha = 0.05
    , ber=0.85
    , fu=1.883
    , case.fraction = 0.5
    , collapseByGene=TRUE
    , var = "group" 
    , noPlot=TRUE
    #, priority.trial=TCGA_population@arguments$gr
    , priority.trial=c( "RAF/MEK", "PIK3/AKT/mTOR")
    , priority.trial.order="optimal")$'HR:0.625 | HR0:1 | Power:0.8'$Summary 
## Warning in dataExtractor(object = object, alterationType =
## alterationType, : The following tumor types have no alteration to display:
## lihc, thca, acc, thym, ucs, prad
## 
## Minimum Eligible Sample Size to reach 80% of power with an HR equal to 0.625 and a HR0 equal to 1 for each group is equal to: 199
shiva_surv %>% knitr::kable()
RAF/MEK PIK3/AKT/mTOR Total
Screened 1138 0 1138
Eligible 199 228 427
Not.Eligible 712 0 712
# temp


(mutation_data) %>% dplyr::filter(Patient.ID %in% c("08-0024", "03-0007", "07-0098", "04-0025", "05-0042", "06-0096", "07-0099", "06-0113", "07-0027", "01-0212", "01-0222", "04-0039")) %>% View()


test <- (df_full_profile) %>% 
  dplyr::select(Patient.ID,Drug,Signaling.pathway) %>% 
  dplyr::rename(case_id = Patient.ID)
test2 <- dplyr::left_join(cna_long , test, by="case_id")
write.table(test2, file="~/Desktop/perluca.txt", sep="\t", quote =FALSE, col.names = TRUE)
  • MTOR CNA plot strano