## Please send questions to cbioportal@googlegroups.com
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(sparklyr)
cgds <- cgdsr::CGDS("http://www.cbioportal.org/")
clinicalData <- cgdsr::getClinicalData(cgds, "gbm_tcga_pub_all")
sc <- spark_connect(master = "local", version = "2.4.0")
clinicalData_tbl <- dplyr::copy_to(sc, clinicalData, overwrite = TRUE)
# clinicalData_tbl <- spark_read_table(sc, 'clinicaldata')
clinicalData_trans <-
clinicalData_tbl %>%
mutate(OS_STATUS = regexp_replace(OS_STATUS, "LIVING", "0")) %>%
mutate(OS_STATUS = regexp_replace(OS_STATUS, "DECEASED", "1")) %>%
mutate(DFS_STATUS = regexp_replace(DFS_STATUS, "^$|^ $", "DiseaseFree")) %>%
filter(!is.na(OS_STATUS)) %>%
mutate(OS_STATUS = as.numeric(OS_STATUS)) %>%
arrange(is.na(OS_MONTHS), OS_MONTHS) %>% ## OUFFF put Nan at the end of the column
mutate(DiseaseFree = ifelse(DFS_STATUS == "DiseaseFree", 1, 0)) %>%
mutate( n_DiseaseFree = cumsum(as.numeric(DiseaseFree == 1 ))) %>%
mutate( n_Recurred = cumsum(as.numeric(DiseaseFree == 0 )))
print(sql_render(clinicalData_trans))
## <SQL> SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`, `DiseaseFree`, `n_DiseaseFree`, sum(CAST(`DiseaseFree` = 0.0 AS DOUBLE)) OVER (ORDER BY ((`OS_MONTHS`) IS NULL), `OS_MONTHS` ROWS UNBOUNDED PRECEDING) AS `n_Recurred`
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`, `DiseaseFree`, sum(CAST(`DiseaseFree` = 1.0 AS DOUBLE)) OVER (ORDER BY ((`OS_MONTHS`) IS NULL), `OS_MONTHS` ROWS UNBOUNDED PRECEDING) AS `n_DiseaseFree`
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`, CASE WHEN (`DFS_STATUS` = "DiseaseFree") THEN (1.0) WHEN NOT(`DFS_STATUS` = "DiseaseFree") THEN (0.0) END AS `DiseaseFree`
## FROM (SELECT *
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, CAST(`OS_STATUS` AS DOUBLE) AS `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`
## FROM (SELECT *
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, REGEXP_REPLACE(`DFS_STATUS`, "^$|^ $", "DiseaseFree") AS `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, REGEXP_REPLACE(`OS_STATUS`, "DECEASED", "1") AS `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`
## FROM (SELECT `ACGH_DATA`, `CANCER_TYPE`, `CANCER_TYPE_DETAILED`, `COMPLETE_DATA`, `DFS_MONTHS`, `DFS_STATUS`, `FRACTION_GENOME_ALTERED`, `KARNOFSKY_PERFORMANCE_SCORE`, `MRNA_DATA`, `MUTATION_COUNT`, `ONCOTREE_CODE`, `OS_MONTHS`, REGEXP_REPLACE(`OS_STATUS`, "LIVING", "0") AS `OS_STATUS`, `PRETREATMENT_HISTORY`, `PRIOR_GLIOMA`, `SAMPLE_COUNT`, `SEQUENCED`, `SEX`, `TREATMENT_STATUS`
## FROM `clinicalData`) `ayjkzguabb`) `yqumcejgrt`) `xnqfpcguyz`
## WHERE (NOT(((`OS_STATUS`) IS NULL)))) `qmviylkilf`) `dkpafvoqlf`
## ORDER BY ((`OS_MONTHS`) IS NULL), `OS_MONTHS`) `ddexwupmni`) `prgtsxajzr`) `yzelbgvfst`