2023/0728_使用10X genomics提供的5k 胰臟癌資料分析 (HDF5格式轉換)
Source: https://www.10xgenomics.com/resources/datasets?query=pancreatic%20&page=1&configure%5BhitsPerPage%5D=50&configure%5BmaxValuesPerFacet%5D=1000
Source: 資料輸入參考/ https://www.youtube.com/watch?v=5HBzgsz8qyk&ab_channel=Bioinformagician
#下載 10xgenomics // 5K human pancreatic tumor isolated with chromium nuclei isolation kit
#檔案格式 為 HDF5, .5, 用 hdf5r的package才能用 Read10X_h5功能
#也下載SeuratDisk
install.packages("hdf5r")
library(hdf5r)
library(Seurat)
library(SeuratDisk)
library(dplyr) # 出現一些問題名 Q1#
library(patchwork)
#資料輸入參考/ https://www.youtube.com/watch?v=5HBzgsz8qyk&ab_channel=Bioinformagician
# Load the ataset (資料輸入) (輸入一個 matrix)
PDAC.data <- Read10X_h5(filename = "D:/YCT/scRNAseq/5kPDAC/5k_human_pancreatic_tumor_CNIK_5pv2_raw_feature_bc_matrix.h5",
use.names=TRUE, unique.features=TRUE)
str(PDAC.data)
# PDAC.expression <- PDAC.data$"Gene Expression" 這一步沒有辦法用/因為資料裏沒有這一個 str 結構
# Initialize the Seurat object with the raw (non-normalized data). 建立一個 seurate的 object
# 基本上 PDAC會一個rownames為barcode, 資料為 nCount_RNA 及 nFeature_RNA->就是 meta.data
PDAC <- CreateSeuratObject(counts = PDAC.data, project = "5kPDAC", min.cells = 5, min.features = 300)
Str(PDAC)
#Standard pre-processing workflow
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
# 就是在meta.data裏都加一個column, "percent.mt"
PDAC[["percent.mt"]] <- PercentageFeatureSet(PDAC, pattern = "^MT-")
# Visualize QC metrics as a violin plot --> 用來判斷要選那一個範圍的細胞做分析
VlnPlot(PDAC, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# FeatureScatter is typically used to visualize feature-feature relationships, but can be usedPDA
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(PDAC, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(PDAC, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
#選取一部份的 細胞做後續分析.資料存入另一個 PDAC1 (要依據上面的 VlnPlot 圖來決定#
PDAC1 <- subset(PDAC, subset = nFeature_RNA > 300 & nFeature_RNA < 7000 & percent.mt < 5)
#Normalizing the data
PDAC1 <- NormalizeData(PDAC1, normalization.method = "LogNormalize", scale.factor = 10000)
#feature selection/ Identification of highly variable features
PDAC1 <- FindVariableFeatures(PDAC1, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(PDAC1), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(PDAC1)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2
#Scaling the data
all.genes <- rownames(PDAC1)
PDAC2 <- ScaleData(PDAC1, features = all.genes)
#Perform linear dimensional reduction
PDAC2 <- RunPCA(PDAC2, features = VariableFeatures(object = PDAC2))
# Examine and visualize PCA results a few different ways
print(PDAC2[["pca"]], dims = 1:5, nfeatures = 5)
#Seurat provides several useful ways of visualizing both cells and features that define the PCA, including VizDimReduction(), DimPlot(), and DimHeatmap()
VizDimLoadings(PDAC2, dims = 1:2, reduction = "pca")
DimPlot(PDAC2, reduction = "pca")
DimHeatmap(PDAC2, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(PDAC2, dims = 1:15, cells = 500, balanced = TRUE)
#Determine the ‘dimensionality’ of the dataset
PDAC1 <- JackStraw(PDAC2, num.replicate = 100)
PDAC1 <- ScoreJackStraw(PDAC2, dims = 1:20)
JackStrawPlot(PDAC2, dims = 1:15)
ElbowPlot(PDAC2)
#Cluster the cells
PDAC2 <- FindNeighbors(PDAC2, dims = 1:10)
PDAC2 <- FindClusters(PDAC2, resolution = 0.5)
#Maximum modularity in 10 random starts: 0.8728
# Look at cluster IDs of the first 5 cells
head(Idents(PDAC2), 5)
#Run non-linear dimensional reduction (UMAP/tSNE)
PDAC2 <- RunUMAP(PDAC2, dims = 1:10)
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(PDAC2, reduction = "umap")
#暫時存下這個階段的資料,
saveRDS(PDAC2, file="D:/YCT/scRNAseq/5kPDAC/demo1.rds")
#Cluster biomarkers/把每一個cluster的特徵基因找出來。
#特定某一個群體2/ find all markers of cluster 2
cluster2.markers <- FindMarkers(PDAC2, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
# 特定某一個群體5 與 group0, 3的不同/ find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(PDAC2, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
#主要使用的功能/把每一個cluster的特徵基因找出來。/find markers for every cluster compared to all remaining cells, report only the positive ones
PDAC2.markers <- FindAllMarkers(PDAC2, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
PDAC2.markers %>%
group_by(cluster) %>%
slice_max(n = 2, order_by = avg_log2FC)
#ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).
cluster0.markers <- FindMarkers(PDAC2, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(PDAC2, features = c("MS4A1", "CD79A"))
# you can plot raw counts as well
VlnPlot(PDAC2, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
FeaturePlot(PDAC2, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
#將每個cluster內前幾名特別基因用來分析heatmap
PDAC2.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(PDAC2, features = top10$gene) + NoLegend()
#cluster 命名 (原理是什麼??只是照cluster的順序而己)
new.cluster.ids <- c("gp0", "gp1", "gp2", "gp3", "gp4", "gp5", "gp6",
"gp7", "gp8", "gp9", "gp10", "gp11","gp12")
names(new.cluster.ids) <- levels(PDAC2)
PDAC2 <- RenameIdents(PDAC2, new.cluster.ids)
DimPlot(PDAC2, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
#暫時存下這個階段的資料,
saveRDS(PDAC2, file= "D:/YCT/scRNAseq/5kPDAC/demo2.rds")
Comments
Post a Comment