5 Comparison of scCUTseq and ACT

This sections produces all the figures used in Supplementary Figure 3.
We compare the Breadth of Coverage, Overdispersion and overall copynumber profiles between different sections in Patient 3 that were sequenced by both ACT and scCUTseq.

# Source setup file
source("./functions/setup.R")

# Source plotting functions
source("./functions/plotProfile.R")
source("./functions/plotHeatmap.R")

5.1 Calculate breadth of coverage

We downsampled bam files to 800K reads using the following commands in bash and ran our copynumber calling pipeline (described in Preprocessing.Rmd) and use the resulting cnv.rds to get information about cell quality.

samtools view ${INSAMPLEBASE}.bam -H > ${INSAMPLEBASE}.sam&& samtools view $INSAMPLEBASE.bam | shuf -n 800000 --random-source=$INSAMPLEBASE.bam >> $INSAMPLEBASE.sam&& samtools sort -o ../downsampling/bamfiles/$OUTBAM $INSAMPLEBASE.sam&& rm $INSAMPLEBASE.sam&& genomeCoverageBed -ibam ../downsampling/bamfiles/$OUTBAM -fs 50 -max 1 > ../downsampling/coverage/$COVHISTFILE

We then used the $COVHISTFILE output to calculate the breadth of coverage.

# Define BoC function
calc_coverage = function(path) {
  
  inpaths = Sys.glob(paste0(path, "*.covhist.txt"))
  
  coverage.stats = tibble(bed_path=inpaths) %>%
    mutate(cellname = str_extract(basename(bed_path), "^[^.]*")) %>%
    group_by(cellname) %>%
    summarize(.groups="keep",
              read_tsv(bed_path,
                       col_names=c("refname", "depth", "count",
                                   "refsize", "frac"),
                       col_types=cols(col_character(), col_double(),
                                      col_double(), col_double(),
                                      col_double())),
    ) %>%
    filter(refname=="genome") %>%
    summarize(breadth = 1 - frac[depth==0],
              .groups="keep")
  
}

# Calculate BoC on the COVHIST files
sccutseq = calc_coverage("./data/downsampling/coverage/scCUTseq/")
act = calc_coverage("./data/downsampling/coverage/ACT/")

# SetDT
setDT(sccutseq)
setDT(act)

# Load copynumber profiles to get HQ cells
p3 = readRDS("./data/downsampling/P3.rds")
p3_stats = p3$stats
p3_hq = p3_stats[classifier_prediction == "good", sample]

# ACT
CD5p = readRDS("./data/downsampling/CD5p.rds")
CD7p = readRDS("./data/downsampling/CD7p.rds")
CD4p = readRDS("./data/downsampling/CD4p.rds")
CD2p = readRDS("./data/downsampling/CD2p.rds")
CD6p = readRDS("./data/downsampling/CD6p.rds")
CD3p = readRDS("./data/downsampling/CD3p.rds")

act_hq = c(paste0("CD5p_", CD5p$stats[classifier_prediction == "good", sample]),
           paste0("CD7p_", CD7p$stats[classifier_prediction == "good", sample]),
           paste0("CD4p_", CD4p$stats[classifier_prediction == "good", sample]),
           paste0("CD2p_", CD2p$stats[classifier_prediction == "good", sample]),
           paste0("CD6p_", CD6p$stats[classifier_prediction == "good", sample]),
           paste0("CD3p_", CD3p$stats[classifier_prediction == "good", sample]))

# Select HQ samples
sccutseq = sccutseq[cellname %in% p3_hq, ]
act = act[cellname %in% act_hq, ]

# Combine
sccutseq[, tech := "scCUTseq"]
act[, tech := "ACT"]
dt = rbind(sccutseq, act)
dt[, tech := factor(tech, levels = c("scCUTseq", "ACT"))]

# Plot comparison
ggplot(dt, aes(x = tech, y = breadth)) +
  geom_violin() +
  geom_boxplot(width = .15) + 
  scale_y_continuous(limits = c(0, 0.0125)) +
  labs(y = "Breadth of coverage", x = "")

5.2 Calculate overdispersion

Then we used the read counts from each bin to calculate the overdispersion. We also do this on the HQ cells that are selected in the code chunk above.

# Define functions
l2e.normal.sd = function(xs)
{
  # Need at least two values to get a standard deviation
  stopifnot(length(xs) >= 2)
  optim.result = stats::optimize(
    # L2E loss function
    f=function(sd)
      # "Data part", the sample average of the likelihood
      -2 * mean(stats::dnorm(xs, sd=sd)) +
      # "Theta part", the integral of the squared density
      1/(2*sqrt(pi)*sd),
    # Parameter: standard deviation of the normal distribution fit
    interval = c(0, diff(range(xs))))
  return(optim.result$minimum)
}

# A function for estimating the index of dispersion, which is used when
# estimating standard errors for each segment mean
overdispersion = function(v)
{
  # 3 elements, 2 differences, can find a standard deviation
  stopifnot(length(v) >= 3)
  # Differences between pairs of values
  y = v[-1]
  x = v[-length(v)]
  # Normalize the differences using the sum. The result should be around zero,
  # plus or minus square root of the index of dispersion
  vals.unfiltered = (y-x)/sqrt(y+x)
  # Remove divide by zero cases, and--considering this is supposed to be count
  # data--divide by almost-zero cases
  vals = vals.unfiltered[y + x  >= 1]
  # Check that there's anything left
  stopifnot(length(vals) >= 2)
  # Assuming most of the normalized differences follow a normal distribution,
  # estimate the standard deviation
  val.sd = l2e.normal.sd(vals)
  # Square this standard deviation to obtain an estimate of the index of
  # dispersion
  iod = val.sd^2
  # subtract one to get the overdispersion criteria
  iod.over = iod -1
  # normalizing by mean bincounts
  iod.norm = iod.over/mean(v)
  return(iod.norm)
}

# Get count data for scCUTseq and ACT
p3_counts = p3$counts[, p3_stats[classifier_prediction == "good", sample], with = FALSE]
CD5p_counts = CD5p$counts[, CD5p$stats[classifier_prediction == "good", sample], with = FALSE]
CD7p_counts = CD7p$counts[, CD7p$stats[classifier_prediction == "good", sample], with = FALSE]
CD4p_counts = CD4p$counts[, CD4p$stats[classifier_prediction == "good", sample], with = FALSE]
CD2p_counts = CD2p$counts[, CD2p$stats[classifier_prediction == "good", sample], with = FALSE]
CD6p_counts = CD6p$counts[, CD6p$stats[classifier_prediction == "good", sample], with = FALSE]
CD3p_counts = CD3p$counts[, CD3p$stats[classifier_prediction == "good", sample], with = FALSE]

# Combine act
act_counts = do.call(cbind, list(CD5p_counts, CD4p_counts, CD2p_counts,
                                 CD7p_counts, CD6p_counts, CD3p_counts))

# Calculate overdispersion
p3_overdispersion = lapply(p3_counts, function(x) {
  overdispersion(x)
})

act_overdispersion = lapply(act_counts, function(x) {
  overdispersion(x)
})

# Make into DT
act_dt = data.table(method = "ACT", overdispersion = unlist(act_overdispersion))
p3_dt = data.table(method = "scCUTseq", overdispersion = unlist(p3_overdispersion))

# Combine DTs
dt = rbind(act_dt, p3_dt)
dt[, method := factor(method, levels = c("scCUTseq", "ACT"))]

# Plot comparison
ggplot(dt, aes(x = method, y = overdispersion)) +
  geom_violin() +
  geom_boxplot(width = .075) + 
  labs(y = "Overdispersion", x = "")

5.3 Plot genomewide heatmaps of ACT and scCUTseq copynumber profiles

Plot both the ACT and scCUTseq genomewide heatmap to visually compare the two methods. Note: the plotting of these can take a while. Furthermore, we set dendrogram = FALSE since there is a known bug with the ggdendro package when you have many samples that are identical (in our case, diploid cells)

# Load in data
p3_sccut = readRDS("./data/P3_cnv.rds")

# Select HQ cells
p3_sccut_profiles = p3_sccut$copynumber[, p3_sccut$stats[classifier_prediction == "good", sample], with = FALSE]
p3_sccut_profiles = p3_sccut_profiles[, grepl("NZ186|MS80|NZ187|NZ188|NZ189|NZ211", colnames(p3_sccut_profiles)), with = F] # Select sections that also has ACT sequencing data

# Make annotation table for scCUTseq
p3_annot = fread("./annotation/P3.tsv", header = FALSE)

scCUT_annot = data.table(sample = colnames(p3_sccut_profiles),
                         variable = "section",
                         value = gsub("_.*", "", colnames(p3_sccut_profiles)))

# Merge to get section information
scCUT_annot = merge(scCUT_annot, p3_annot, by.x = "value", by.y = "V1")

# Plot scCUTseq heatmap
plotHeatmap(p3_sccut_profiles, p3_sccut$bins, dendrogram = FALSE, annotation = scCUT_annot[, .(sample, variable, V2)])

# Get ACT profiles and make annotation
p3_act = readRDS("./data/P3_ACT.rds")
p3_act_profiles = p3_act$copynumber[, sort(p3_act$stats[classifier_prediction == "good", sample]), with = F]

# Make annotation for ACT
ACT_annot = data.table(sample = colnames(p3_act_profiles),
                       variable = "section",
                       value = gsub("_.*", "", colnames(p3_act_profiles)))

# Plot ACT heatmap
plotHeatmap(p3_act_profiles, p3_act$bins, dendrogram = FALSE, annotation = ACT_annot)

5.4 Plot profile plots of example profiles

We selected 2 profiles of scCUTseq and 2 profiles of ACT to show the differences. These are selected manually but are representative for other plots (you can change the sample ID to manually check others)

# profile selection
sccut = readRDS("./data/CD27.rds")
act = readRDS("./data/CD1p.rds")

# scCUTseq cell 1
plotProfile(sccut$copynumber$AGCCAACGGCA, sccut$counts_gc$AGCCAACGGCA * sccut$ploidies[sample == "AGCCAACGGCA", ploidy], sccut$bins)

# scCUTseq cell 2
plotProfile(sccut$copynumber$AGCTTGCTCAT, sccut$counts_gc$AGCTTGCTCAT * sccut$ploidies[sample == "AGCTTGCTCAT", ploidy], sccut$bins)

# ACT cell 1
plotProfile(act$copynumber$`296_S296`, act$counts_gc$`296_S296` * act$ploidies[sample == "296_S296", ploidy], act$bins)

# ACT cell 2
plotProfile(act$copynumber$`227_S227`, act$counts_gc$`227_S227` * act$ploidies[sample == "227_S227", ploidy], act$bins)