HybridQC
provides a hybrid framework for quality control
(QC) in single-cell RNA-seq data.
seurat_obj[["percent.mt"]] <- PercentageFeatureSet(seurat_obj, pattern = "^MT-")
seurat_obj <- NormalizeData(seurat_obj)
#> Normalizing layer: counts
seurat_obj <- FindVariableFeatures(seurat_obj)
#> Finding variable features for layer counts
seurat_obj <- ScaleData(seurat_obj)
#> Centering and scaling data matrix
seurat_obj <- RunPCA(seurat_obj)
#> PC_ 1
#> Positive: Gene585, Gene484, Gene468, Gene809, Gene31, Gene377, Gene364, Gene301, Gene28, Gene85
#> Gene669, Gene144, Gene812, Gene916, Gene434, Gene353, Gene678, Gene532, Gene341, Gene141
#> Gene89, Gene206, Gene927, Gene954, Gene196, Gene406, Gene794, Gene721, Gene686, Gene740
#> Negative: Gene157, Gene21, Gene310, Gene741, Gene133, Gene409, Gene543, Gene10, Gene451, Gene639
#> Gene695, Gene150, Gene650, Gene403, Gene88, Gene553, Gene630, Gene433, Gene101, Gene226
#> Gene201, Gene660, Gene363, Gene426, Gene986, Gene399, Gene333, Gene915, Gene234, Gene855
#> PC_ 2
#> Positive: Gene747, Gene117, Gene124, Gene696, Gene970, Gene591, Gene23, Gene428, Gene177, Gene306
#> Gene878, Gene388, Gene949, Gene489, Gene712, Gene404, Gene283, Gene481, Gene556, Gene755
#> Gene212, Gene812, Gene757, Gene868, Gene646, Gene26, Gene357, Gene163, Gene956, Gene822
#> Negative: Gene378, Gene789, Gene985, Gene785, Gene874, Gene92, Gene884, Gene863, Gene903, Gene334
#> Gene761, Gene182, Gene423, Gene561, Gene158, Gene539, Gene713, Gene254, Gene236, Gene530
#> Gene135, Gene697, Gene948, Gene246, Gene601, Gene580, Gene356, Gene867, Gene239, Gene318
#> PC_ 3
#> Positive: Gene153, Gene77, Gene988, Gene846, Gene197, Gene689, Gene639, Gene914, Gene286, Gene897
#> Gene816, Gene511, Gene509, Gene297, Gene883, Gene345, Gene245, Gene188, Gene928, Gene571
#> Gene757, Gene317, Gene490, Gene44, Gene389, Gene367, Gene161, Gene136, Gene47, Gene764
#> Negative: Gene256, Gene598, Gene721, Gene939, Gene885, Gene28, Gene445, Gene640, Gene441, Gene432
#> Gene824, Gene160, Gene114, Gene435, Gene272, Gene818, Gene123, Gene520, Gene930, Gene937
#> Gene620, Gene460, Gene250, Gene172, Gene352, Gene825, Gene505, Gene180, Gene774, Gene470
#> PC_ 4
#> Positive: Gene48, Gene12, Gene314, Gene637, Gene196, Gene990, Gene68, Gene326, Gene93, Gene398
#> Gene859, Gene916, Gene3, Gene402, Gene743, Gene135, Gene720, Gene156, Gene180, Gene867
#> Gene439, Gene189, Gene645, Gene805, Gene194, Gene30, Gene756, Gene193, Gene780, Gene701
#> Negative: Gene954, Gene876, Gene268, Gene129, Gene107, Gene350, Gene545, Gene581, Gene666, Gene84
#> Gene283, Gene420, Gene443, Gene286, Gene618, Gene501, Gene472, Gene477, Gene662, Gene91
#> Gene927, Gene566, Gene51, Gene865, Gene900, Gene284, Gene33, Gene63, Gene672, Gene910
#> PC_ 5
#> Positive: Gene864, Gene220, Gene909, Gene664, Gene518, Gene941, Gene73, Gene822, Gene678, Gene440
#> Gene832, Gene684, Gene428, Gene71, Gene245, Gene138, Gene273, Gene920, Gene306, Gene765
#> Gene752, Gene675, Gene137, Gene294, Gene92, Gene244, Gene842, Gene466, Gene159, Gene257
#> Negative: Gene797, Gene984, Gene59, Gene980, Gene505, Gene974, Gene392, Gene535, Gene181, Gene901
#> Gene356, Gene718, Gene430, Gene201, Gene293, Gene479, Gene654, Gene451, Gene865, Gene436
#> Gene538, Gene286, Gene442, Gene805, Gene62, Gene269, Gene53, Gene243, Gene287, Gene509
seurat_obj <- RunUMAP(seurat_obj, dims = 1:10)
#> 04:09:33 UMAP embedding parameters a = 0.9922 b = 1.112
#> 04:09:33 Read 2000 rows and found 10 numeric columns
#> 04:09:33 Using Annoy for neighbor search, n_neighbors = 30
#> 04:09:33 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 04:09:33 Writing NN index file to temp file /var/folders/m5/yyh4c08d6y59r0b5bhqjxmj40000gn/T//Rtmp1trzE9/file20ef76dd21e4
#> 04:09:33 Searching Annoy index using 1 thread, search_k = 3000
#> 04:09:34 Annoy recall = 100%
#> 04:09:34 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 04:09:35 Initializing from normalized Laplacian + noise (using RSpectra)
#> 04:09:35 Commencing optimization for 500 epochs, with 63328 positive edges
#> 04:09:35 Using rng type: pcg
#> 04:09:37 Optimization finished
🧠 Step 3: Configure Conda and run Isolation Forest
reticulate::use_condaenv("hybridqc_py311", required = TRUE)
reticulate::py_config()
#> python: /opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311/bin/python
#> libpython: /opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311/lib/libpython3.11.dylib
#> pythonhome: /opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311:/opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311
#> version: 3.11.13 (main, Jun 5 2025, 08:21:08) [Clang 14.0.6 ]
#> numpy: /opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311/lib/python3.11/site-packages/numpy
#> numpy_version: 2.2.6
#>
#> NOTE: Python version was forced by use_python() function
seurat_obj <- run_isolation_forest_qc(
seurat_obj,
python_path = "/opt/homebrew/Caskroom/miniconda/base/envs/hybridqc_py311/bin/python",
add_umap_plot = FALSE
)
# Save scores
iso_scores <- FetchData(seurat_obj, vars = "isoforest_score")
write.csv(iso_scores, "../results/qc/isoforest_scores.csv")
p <- FeaturePlot(seurat_obj, features = "isoforest_score", reduction = "umap") +
ggtitle("Isolation Forest Outlier Score (UMAP)")
ggsave("../results/plots/isoforest_umap.png", plot = p, width = 7, height = 5)
p
ml_scores <- seurat_obj$isoforest_score
filtered_obj <- filter_cells(seurat_obj, qc_basic, ml_scores)
write.csv(filtered_obj@meta.data, "../results/qc/filtered_cells_metadata.csv")
🔄 Build this vignette
Run the following once in your R session:
🧠 Tips • If you want to knit/test the Rmd directly: rmarkdown::render(“vignettes/HybridQC.Rmd”) • To view it in documentation: rebuild the package and go to Help > HybridQC > Vignettes