2023/0727_使用 Seurat的2700 PBMC資料_了解scRNA-seq 分析流程
Source: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html
# Load the PBMC dataset (資料輸入)
pbmc.data <- Read10X(data.dir = "D:/YCT/scRNAseq/pbmc3k/filtered_gene_bc_matrices/hg19/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
#Standard pre-processing workflow
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
#選取一部份的 細胞做後續分析.資料存入另一個pbmc1
pbmc1 <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
#Normalizing the data
pbmc1 <- NormalizeData(pbmc1, normalization.method = "LogNormalize", scale.factor = 10000)
#feature selection/ Identification of highly variable features
pbmc1 <- FindVariableFeatures(pbmc1, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc1), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc1)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
#Scaling the data
all.genes <- rownames(pbmc1)
pbmc1 <- ScaleData(pbmc1, features = all.genes)
#Perform linear dimensional reduction
pbmc1 <- RunPCA(pbmc1, features = VariableFeatures(object = pbmc1))
# Examine and visualize PCA results a few different ways
print(pbmc1[["pca"]], dims = 1:5, nfeatures = 5)
#Seurat provides several useful ways of visualizing both cells and features that define the PCA, including VizDimReduction(), DimPlot(), and DimHeatmap()
VizDimLoadings(pbmc1, dims = 1:2, reduction = "pca")
DimPlot(pbmc1, reduction = "pca")
DimHeatmap(pbmc1, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc1, dims = 1:15, cells = 500, balanced = TRUE)
#Determine the ‘dimensionality’ of the dataset
pbmc1 <- JackStraw(pbmc1, num.replicate = 100)
pbmc1 <- ScoreJackStraw(pbmc1, dims = 1:20)
JackStrawPlot(pbmc1, dims = 1:15)
ElbowPlot(pbmc1)
#Cluster the cells
pbmc1 <- FindNeighbors(pbmc1, dims = 1:10)
pbmc1 <- FindClusters(pbmc1, resolution = 0.5)
#Maximum modularity in 10 random starts: 0.8728
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc1), 5)
#Run non-linear dimensional reduction (UMAP/tSNE)
pbmc1 <- RunUMAP(pbmc1, dims = 1:10)
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc1, reduction = "umap")
#暫時存下這個階段的資料,
saveRDS(pbmc1, file="D:/YCT/scRNAseq/pbmc3k/demo1.rds")
#Cluster biomarkers/把每一個cluster的特徵基因找出來。
#特定某一個群體2/ find all markers of cluster 2
cluster2.markers <- FindMarkers(pbmc1, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
# 特定某一個群體5 與 group0, 3的不同/ find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc1, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
#主要使用的功能/把每一個cluster的特徵基因找出來。/find markers for every cluster compared to all remaining cells, report only the positive ones
pbmc1.markers <- FindAllMarkers(pbmc1, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc1.markers %>%
group_by(cluster) %>%
slice_max(n = 2, order_by = avg_log2FC)
#ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).
cluster0.markers <- FindMarkers(pbmc1, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(pbmc1, features = c("MS4A1", "CD79A"))
# you can plot raw counts as well
VlnPlot(pbmc1, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
FeaturePlot(pbmc1, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
#將每個cluster內前幾名特別基因用來分析heatmap
pbmc1.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc1, features = top10$gene) + NoLegend()
#cluster 命名
new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono",
"NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc1)
pbmc1 <- RenameIdents(pbmc1, new.cluster.ids)
DimPlot(pbmc1, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
#暫時存下這個階段的資料,
saveRDS(pbmc1, file= "D:/YCT/scRNAseq/pbmc3k/demo2.rds")
Comments
Post a Comment