library(cgdsr)
## Please send questions to cbioportal@googlegroups.com
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dbplyr)
## 
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
## 
##     ident, sql
library(sparklyr)

  cgds <- cgdsr::CGDS("http://www.cbioportal.org/")
  
  clinicalData <- cgdsr::getClinicalData(cgds, "gbm_tcga_pub_all")
 
   sc <- spark_connect(master = "local", version = "2.4.0")
  
 clinicalData_tbl <- dplyr::copy_to(sc, clinicalData, overwrite = TRUE)
 # clinicalData_tbl <- spark_read_table(sc, 'clinicaldata')
  
    clinicalData_trans <-
     clinicalData_tbl %>%
      mutate(OS_STATUS = regexp_replace(OS_STATUS, "LIVING", "0")) %>%
      mutate(OS_STATUS = regexp_replace(OS_STATUS, "DECEASED", "1")) %>%
      mutate(DFS_STATUS = regexp_replace(DFS_STATUS, "^$|^ $", "DiseaseFree")) %>%
      filter(!is.na(OS_STATUS)) %>%
      mutate(OS_STATUS = as.numeric(OS_STATUS)) %>%
      arrange(is.na(OS_MONTHS), OS_MONTHS) %>%  ## OUFFF put Nan at the end of the column
      mutate(DiseaseFree = ifelse(DFS_STATUS == "DiseaseFree", 1, 0)) %>% 
      mutate( n_DiseaseFree = cumsum(as.numeric(DiseaseFree == 1 ))) %>%
      mutate( n_Recurred = cumsum(as.numeric(DiseaseFree == 0 )))
  
print(sql_render(clinicalData_trans))
## <SQL> SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`, `DiseaseFree`, `n_DiseaseFree`, sum(CAST(`DiseaseFree` = 0.0 AS DOUBLE)) OVER (ORDER BY ((`OS_MONTHS`) IS NULL), `OS_MONTHS` ROWS UNBOUNDED PRECEDING) AS `n_Recurred`
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`, `DiseaseFree`, sum(CAST(`DiseaseFree` = 1.0 AS DOUBLE)) OVER (ORDER BY ((`OS_MONTHS`) IS NULL), `OS_MONTHS` ROWS UNBOUNDED PRECEDING) AS `n_DiseaseFree`
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`, CASE WHEN (`DFS_STATUS` = "DiseaseFree") THEN (1.0) WHEN NOT(`DFS_STATUS` = "DiseaseFree") THEN (0.0) END AS `DiseaseFree`
## FROM (SELECT *
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, CAST(`OS_STATUS` AS DOUBLE) AS `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`
## FROM (SELECT *
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, REGEXP_REPLACE(`DFS_STATUS`, "^$|^ $", "DiseaseFree") AS `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, REGEXP_REPLACE(`OS_STATUS`, "DECEASED", "1") AS `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, REGEXP_REPLACE(`OS_STATUS`, "LIVING", "0") AS `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`
## FROM `clinicalData`) `ayjkzguabb`) `yqumcejgrt`) `xnqfpcguyz`
## WHERE (NOT(((`OS_STATUS`) IS NULL)))) `qmviylkilf`) `dkpafvoqlf`
## ORDER BY ((`OS_MONTHS`) IS NULL), `OS_MONTHS`) `ddexwupmni`) `prgtsxajzr`) `yzelbgvfst`