Contents

The original data of Jin et al 2020 can be downloaded from the Broad single cell portal (https://singlecell.broadinstitute.org/single_cell/study/SCP1184). Here, we just use a subset of the data to demonstrate the workflow of the analysis.

library(Seurat)
## Loading required package: SeuratObject

## Loading required package: sp

## 'SeuratObject' was built under R 4.3.2 but the current version is
## 4.3.3; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed

## 'SeuratObject' was built with package 'Matrix' 1.6.3 but the current
## version is 1.6.5; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed

## 
## Attaching package: 'SeuratObject'

## The following object is masked from 'package:base':
## 
##     intersect
library(caret)
## Loading required package: ggplot2

## Loading required package: lattice
sc.seurat <- readRDS("perturbseq-exneu.rds")

Y <- data.frame(t(as.matrix(sc.seurat[['RNA']]$counts))) # cell-by-gene matrix
metadata <- sc.seurat@meta.data

perturb <- metadata
colnames(perturb) <- gsub("Perturbation", "trt_", colnames(perturb))
perturb$trt_ <- relevel(as.factor(perturb$trt_), ref = "GFP")
dmy <- dummyVars(" ~ trt_", data = perturb)
A <- data.frame(predict(dmy, newdata = perturb))[,-1] # cell-by-trt matrix

For running causarray, we require the following inputs:

  • Y: the cell-by-gene gene expression matrix.

  • A: the cell-by-condition binary matrix of the perturbation/treatment conditions.

  • X, X_A: (optional) the cell-by-covariate matrix of the covariates of interest for outcome and propensity models.

Here, Y and A can be dataframes.

Use R package reticulate to load the Python package causarray.

require(reticulate)
## Loading required package: reticulate
Sys.setenv(PYTHONUNBUFFERED = TRUE)
use_condaenv('causarray')
causarray <- import("causarray")
cat(causarray$`__version__`)
## 0.0.6
# (Y, A) should be either data.frame or matrix
# optional covariates can be provided as matrices
dat <- causarray$prep_causarray_data(Y, A)
names(dat) <- c("Y", "A", "X", "X_A")
list2env(dat, .GlobalEnv)
## <environment: R_GlobalEnv>

We first apply gcate to estimate unmeasured confounders.

r <- 10
res_gate <- causarray$fit_gcate(Y, X, A, r, verbose=TRUE) # a list of results from 2 stages optimization
## {'d': 30, 'n': 2926, 'p': 3221, 'r': 10}
## 'Estimating initial latent variables with GLMs...'
## 'Fitting nb GLM with offset...'
## 'Fitting GLM done.'
## 'Estimating initial coefficients with GLMs...'
## 'Fitting nb GLM with offset...'
## 'Fitting GLM done.'
## {'kwargs_es': {'max_iters': 500,
##                'patience': 5,
##                'tolerance': 0.001,
##                'warmup': 0},
##  'kwargs_glm': {'disp_glm': array([ 1.11673516,  1.06870944,  1.16716468, ..., 12.58818245,
##        16.46897663,  1.70852614], shape=(3221,)),
##                 'family': 'nb',
##                 'size_factor': array([0.53193358, 0.87362742, 1.2235467 , ..., 0.5593801 , 0.73025856,
##        0.77857223], shape=(2926,))},
##  'kwargs_ls': {'C': 1000.0,
##                'alpha': 0.1,
##                'beta': 0.5,
##                'max_iters': 20,
##                'tol': 0.0001}}
## 'Fitting GCATE (step 1)...'
## {'d': 30, 'n': 2926, 'p': 3221, 'r': 10}
## {'kwargs_es': {'max_iters': 500,
##                'patience': 5,
##                'tolerance': 0.001,
##                'warmup': 0},
##  'kwargs_glm': {'disp_glm': array([ 1.11673516,  1.06870944,  1.16716468, ..., 12.58818245,
##        16.46897663,  1.70852614], shape=(3221,)),
##                 'family': 'nb',
##                 'size_factor': array([0.53193358, 0.87362742, 1.2235467 , ..., 0.5593801 , 0.73025856,
##        0.77857223], shape=(2926,))},
##  'kwargs_ls': {'C': 1000.0,
##                'alpha': 0.1,
##                'beta': 0.5,
##                'max_iters': 20,
##                'tol': 0.0001}}
## 'Fitting GCATE (step 2)...'
U <- res_gate[[2]]$U

Next, we apply causarray to estimate the causal effects of perturbations on gene expression.

offsets <- log(res_gate[[2]][['kwargs_glm']][['size_factor']]) # use the precomputed size factors
res <- causarray$LFC(Y, cbind(X, U), A, cbind(X_A, U), offset=offsets, verbose=TRUE)
## 'Estimating LFC...'
## {'a': 29, 'd': 11, 'd_A': 12, 'estimands': 'LFC', 'n': 2926, 'p': 3221}
## {'C': 1.0,
##  'class_weight': 'balanced',
##  'fit_intercept': False,
##  'random_state': 0,
##  'verbose': False}
## {'offset': array([-0.63123664, -0.13510128,  0.20175377, ..., -0.58092607,
##        -0.31435661, -0.25029351], shape=(2926,)),
##  'random_state': 0,
##  'verbose': True}
## 'Fit propensity score models...'
## 'Fit outcome models...'
## 'Fitting nb GLM (fast)...'
## 'Estimating AIPW mean...'
names(res) <- c("df_res", "estimation")
list2env(res, .GlobalEnv)
## <environment: R_GlobalEnv>
library(dplyr)
## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Filter the results for significant discoveries
significant_discoveries <- df_res[df_res$padj < 0.1, ]

# Count the number of discoveries for each perturbation condition
discovery_counts <- as.data.frame(table(significant_discoveries$trt))
colnames(discovery_counts) <- c('Perturbation', 'Count')

# Order the discovery_counts by Count in descending order
discovery_counts <- discovery_counts %>% arrange(desc(Count))

# Set the factor levels of Perturbation to ensure ggplot respects the order
discovery_counts$Perturbation <- factor(discovery_counts$Perturbation, levels = discovery_counts$Perturbation)

# Plot the number of discoveries for each perturbation condition
ggplot(discovery_counts, aes(x = Perturbation, y = Count)) +
  geom_bar(stat = "identity", fill = "royalblue") +  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle('Number of Discoveries (padj < 0.1) for Each Perturbation Condition') +
  xlab('Perturbation Condition') +
  ylab('Number of Discoveries')