HybridQC: Smarter scRNA-seq Quality Control

Kaitao Lai

Introduction

HybridQC provides a hybrid framework for quality control (QC) in single-cell RNA-seq data.

Load Required Packages

library(Seurat)
library(HybridQC)
library(dplyr)

Set working directory

# Optional: change to your output path
#setwd(tempdir())  # or setwd("your/path")
setwd("/Users/kl/Work/Applications/HybridQC")

📥 Load Example Dataset

pbmc_path <- "../inst/extdata/pbmc2k.csv"
seurat_obj <- LoadPBMC2k(dev_path = pbmc_path)
#> ✅ Using dev_path: ../inst/extdata/pbmc2k.csv
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#seurat_obj <- LoadPBMC2k()

⚙️ Step 1: Basic QC

qc_basic <- run_basic_qc(seurat_obj)
head(qc_basic)
#>       nFeature_RNA nCount_RNA percent.mt
#> Cell1           34         34          0
#> Cell2           33         33          0
#> Cell3           42         42          0
#> Cell4           36         36          0
#> Cell5           41         41          0
#> Cell6           40         41          0
write.csv(qc_basic, "../results/qc/qc_basic.csv")

🧬 Step 2: Preprocessing + Mito %

seurat_obj[["percent.mt"]] <- PercentageFeatureSet(seurat_obj, pattern = "^MT-")
seurat_obj <- NormalizeData(seurat_obj)
#> Normalizing layer: counts
seurat_obj <- FindVariableFeatures(seurat_obj)
#> Finding variable features for layer counts
seurat_obj <- ScaleData(seurat_obj)
#> Centering and scaling data matrix
seurat_obj <- RunPCA(seurat_obj)
#> PC_ 1 
#> Positive:  Gene585, Gene484, Gene468, Gene809, Gene31, Gene377, Gene364, Gene301, Gene28, Gene85 
#>     Gene669, Gene144, Gene812, Gene916, Gene434, Gene353, Gene678, Gene532, Gene341, Gene141 
#>     Gene89, Gene206, Gene927, Gene954, Gene196, Gene406, Gene794, Gene721, Gene686, Gene740 
#> Negative:  Gene157, Gene21, Gene310, Gene741, Gene133, Gene409, Gene543, Gene10, Gene451, Gene639 
#>     Gene695, Gene150, Gene650, Gene403, Gene88, Gene553, Gene630, Gene433, Gene101, Gene226 
#>     Gene201, Gene660, Gene363, Gene426, Gene986, Gene399, Gene333, Gene915, Gene234, Gene855 
#> PC_ 2 
#> Positive:  Gene747, Gene117, Gene124, Gene696, Gene970, Gene591, Gene23, Gene428, Gene177, Gene306 
#>     Gene878, Gene388, Gene949, Gene489, Gene712, Gene404, Gene283, Gene481, Gene556, Gene755 
#>     Gene212, Gene812, Gene757, Gene868, Gene646, Gene26, Gene357, Gene163, Gene956, Gene822 
#> Negative:  Gene378, Gene789, Gene985, Gene785, Gene874, Gene92, Gene884, Gene863, Gene903, Gene334 
#>     Gene761, Gene182, Gene423, Gene561, Gene158, Gene539, Gene713, Gene254, Gene236, Gene530 
#>     Gene135, Gene697, Gene948, Gene246, Gene601, Gene580, Gene356, Gene867, Gene239, Gene318 
#> PC_ 3 
#> Positive:  Gene153, Gene77, Gene988, Gene846, Gene197, Gene689, Gene639, Gene914, Gene286, Gene897 
#>     Gene816, Gene511, Gene509, Gene297, Gene883, Gene345, Gene245, Gene188, Gene928, Gene571 
#>     Gene757, Gene317, Gene490, Gene44, Gene389, Gene367, Gene161, Gene136, Gene47, Gene764 
#> Negative:  Gene256, Gene598, Gene721, Gene939, Gene885, Gene28, Gene445, Gene640, Gene441, Gene432 
#>     Gene824, Gene160, Gene114, Gene435, Gene272, Gene818, Gene123, Gene520, Gene930, Gene937 
#>     Gene620, Gene460, Gene250, Gene172, Gene352, Gene825, Gene505, Gene180, Gene774, Gene470 
#> PC_ 4 
#> Positive:  Gene48, Gene12, Gene314, Gene637, Gene196, Gene990, Gene68, Gene326, Gene93, Gene398 
#>     Gene859, Gene916, Gene3, Gene402, Gene743, Gene135, Gene720, Gene156, Gene180, Gene867 
#>     Gene439, Gene189, Gene645, Gene805, Gene194, Gene30, Gene756, Gene193, Gene780, Gene701 
#> Negative:  Gene954, Gene876, Gene268, Gene129, Gene107, Gene350, Gene545, Gene581, Gene666, Gene84 
#>     Gene283, Gene420, Gene443, Gene286, Gene618, Gene501, Gene472, Gene477, Gene662, Gene91 
#>     Gene927, Gene566, Gene51, Gene865, Gene900, Gene284, Gene33, Gene63, Gene672, Gene910 
#> PC_ 5 
#> Positive:  Gene864, Gene220, Gene909, Gene664, Gene518, Gene941, Gene73, Gene822, Gene678, Gene440 
#>     Gene832, Gene684, Gene428, Gene71, Gene245, Gene138, Gene273, Gene920, Gene306, Gene765 
#>     Gene752, Gene675, Gene137, Gene294, Gene92, Gene244, Gene842, Gene466, Gene159, Gene257 
#> Negative:  Gene797, Gene984, Gene59, Gene980, Gene505, Gene974, Gene392, Gene535, Gene181, Gene901 
#>     Gene356, Gene718, Gene430, Gene201, Gene293, Gene479, Gene654, Gene451, Gene865, Gene436 
#>     Gene538, Gene286, Gene442, Gene805, Gene62, Gene269, Gene53, Gene243, Gene287, Gene509
seurat_obj <- RunUMAP(seurat_obj, dims = 1:10)
#> 04:09:33 UMAP embedding parameters a = 0.9922 b = 1.112
#> 04:09:33 Read 2000 rows and found 10 numeric columns
#> 04:09:33 Using Annoy for neighbor search, n_neighbors = 30
#> 04:09:33 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 04:09:33 Writing NN index file to temp file /var/folders/m5/yyh4c08d6y59r0b5bhqjxmj40000gn/T//Rtmp1trzE9/file20ef76dd21e4
#> 04:09:33 Searching Annoy index using 1 thread, search_k = 3000
#> 04:09:34 Annoy recall = 100%
#> 04:09:34 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 04:09:35 Initializing from normalized Laplacian + noise (using RSpectra)
#> 04:09:35 Commencing optimization for 500 epochs, with 63328 positive edges
#> 04:09:35 Using rng type: pcg
#> 04:09:37 Optimization finished

🧠 Step 3: Configure Conda and run Isolation Forest

reticulate::use_condaenv("hybridqc_py311", required = TRUE)
reticulate::py_config()
#> python:         /opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311/bin/python
#> libpython:      /opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311/lib/libpython3.11.dylib
#> pythonhome:     /opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311:/opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311
#> version:        3.11.13 (main, Jun  5 2025, 08:21:08) [Clang 14.0.6 ]
#> numpy:          /opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311/lib/python3.11/site-packages/numpy
#> numpy_version:  2.2.6
#> 
#> NOTE: Python version was forced by use_python() function

seurat_obj <- run_isolation_forest_qc(
  seurat_obj,
  python_path = "/opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311/bin/python",
  add_umap_plot = FALSE
)

# Save scores
iso_scores <- FetchData(seurat_obj, vars = "isoforest_score")
write.csv(iso_scores, "../results/qc/isoforest_scores.csv")

📊 Plot and save UMAP

p <- FeaturePlot(seurat_obj, features = "isoforest_score", reduction = "umap") +
  ggtitle("Isolation Forest Outlier Score (UMAP)")
ggsave("../results/plots/isoforest_umap.png", plot = p, width = 7, height = 5)
p

✅ Step 4: Hybrid Filtering

ml_scores <- seurat_obj$isoforest_score
filtered_obj <- filter_cells(seurat_obj, qc_basic, ml_scores)
write.csv(filtered_obj@meta.data, "../results/qc/filtered_cells_metadata.csv")

🔄 Build this vignette

Run the following once in your R session:

devtools::build_vignettes()
#> NULL

🧠 Tips • If you want to knit/test the Rmd directly: rmarkdown::render(“vignettes/HybridQC.Rmd”) • To view it in documentation: rebuild the package and go to Help > HybridQC > Vignettes