2023/0728_使用10X genomics提供的5k 胰臟癌資料分析 (HDF5格式轉換)

Source: https://www.10xgenomics.com/resources/datasets?query=pancreatic%20&page=1&configure%5BhitsPerPage%5D=50&configure%5BmaxValuesPerFacet%5D=1000 Source: 資料輸入參考/ https://www.youtube.com/watch?v=5HBzgsz8qyk&ab_channel=Bioinformagician #下載 10xgenomics // 5K human pancreatic tumor isolated with chromium nuclei isolation kit #檔案格式 為 HDF5, .5, 用 hdf5r的package才能用 Read10X_h5功能 #也下載SeuratDisk install.packages("hdf5r") library(hdf5r) library(Seurat) library(SeuratDisk) library(dplyr) # 出現一些問題名 Q1# library(patchwork) #資料輸入參考/ https://www.youtube.com/watch?v=5HBzgsz8qyk&ab_channel=Bioinformagician # Load the ataset (資料輸入) (輸入一個 matrix) PDAC.data <- Read10X_h5(filename = "D:/YCT/scRNAseq/5kPDAC/5k_human_pancreatic_tumor_CNIK_5pv2_raw_feature_bc_matrix.h5", use.names=TRUE, unique.features=TRUE) str(PDAC.data) # PDAC.expression <- PDAC.data$"Gene Expression" 這一步沒有辦法用/因為資料裏沒有這一個 str 結構 # Initialize the Seurat object with the raw (non-normalized data). 建立一個 seurate的 object # 基本上 PDAC會一個rownames為barcode, 資料為 nCount_RNA 及 nFeature_RNA->就是 meta.data PDAC <- CreateSeuratObject(counts = PDAC.data, project = "5kPDAC", min.cells = 5, min.features = 300) Str(PDAC) #Standard pre-processing workflow # The [[ operator can add columns to object metadata. This is a great place to stash QC stats # 就是在meta.data裏都加一個column, "percent.mt" PDAC[["percent.mt"]] <- PercentageFeatureSet(PDAC, pattern = "^MT-") # Visualize QC metrics as a violin plot --> 用來判斷要選那一個範圍的細胞做分析 VlnPlot(PDAC, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) # FeatureScatter is typically used to visualize feature-feature relationships, but can be usedPDA # for anything calculated by the object, i.e. columns in object metadata, PC scores etc. plot1 <- FeatureScatter(PDAC, feature1 = "nCount_RNA", feature2 = "percent.mt") plot2 <- FeatureScatter(PDAC, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") plot1 + plot2 #選取一部份的 細胞做後續分析.資料存入另一個 PDAC1 (要依據上面的 VlnPlot 圖來決定# PDAC1 <- subset(PDAC, subset = nFeature_RNA > 300 & nFeature_RNA < 7000 & percent.mt < 5) #Normalizing the data PDAC1 <- NormalizeData(PDAC1, normalization.method = "LogNormalize", scale.factor = 10000) #feature selection/ Identification of highly variable features PDAC1 <- FindVariableFeatures(PDAC1, selection.method = "vst", nfeatures = 2000) # Identify the 10 most highly variable genes top10 <- head(VariableFeatures(PDAC1), 10) # plot variable features with and without labels plot1 <- VariableFeaturePlot(PDAC1) plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE) plot1 + plot2 #Scaling the data all.genes <- rownames(PDAC1) PDAC2 <- ScaleData(PDAC1, features = all.genes) #Perform linear dimensional reduction PDAC2 <- RunPCA(PDAC2, features = VariableFeatures(object = PDAC2)) # Examine and visualize PCA results a few different ways print(PDAC2[["pca"]], dims = 1:5, nfeatures = 5) #Seurat provides several useful ways of visualizing both cells and features that define the PCA, including VizDimReduction(), DimPlot(), and DimHeatmap() VizDimLoadings(PDAC2, dims = 1:2, reduction = "pca") DimPlot(PDAC2, reduction = "pca") DimHeatmap(PDAC2, dims = 1, cells = 500, balanced = TRUE) DimHeatmap(PDAC2, dims = 1:15, cells = 500, balanced = TRUE) #Determine the ‘dimensionality’ of the dataset PDAC1 <- JackStraw(PDAC2, num.replicate = 100) PDAC1 <- ScoreJackStraw(PDAC2, dims = 1:20) JackStrawPlot(PDAC2, dims = 1:15) ElbowPlot(PDAC2) #Cluster the cells PDAC2 <- FindNeighbors(PDAC2, dims = 1:10) PDAC2 <- FindClusters(PDAC2, resolution = 0.5) #Maximum modularity in 10 random starts: 0.8728 # Look at cluster IDs of the first 5 cells head(Idents(PDAC2), 5) #Run non-linear dimensional reduction (UMAP/tSNE) PDAC2 <- RunUMAP(PDAC2, dims = 1:10) # note that you can set `label = TRUE` or use the LabelClusters function to help label # individual clusters DimPlot(PDAC2, reduction = "umap") #暫時存下這個階段的資料, saveRDS(PDAC2, file="D:/YCT/scRNAseq/5kPDAC/demo1.rds") #Cluster biomarkers/把每一個cluster的特徵基因找出來。 #特定某一個群體2/ find all markers of cluster 2 cluster2.markers <- FindMarkers(PDAC2, ident.1 = 2, min.pct = 0.25) head(cluster2.markers, n = 5) # 特定某一個群體5 與 group0, 3的不同/ find all markers distinguishing cluster 5 from clusters 0 and 3 cluster5.markers <- FindMarkers(PDAC2, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25) head(cluster5.markers, n = 5) #主要使用的功能/把每一個cluster的特徵基因找出來。/find markers for every cluster compared to all remaining cells, report only the positive ones PDAC2.markers <- FindAllMarkers(PDAC2, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) PDAC2.markers %>% group_by(cluster) %>% slice_max(n = 2, order_by = avg_log2FC) #ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect). cluster0.markers <- FindMarkers(PDAC2, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE) VlnPlot(PDAC2, features = c("MS4A1", "CD79A")) # you can plot raw counts as well VlnPlot(PDAC2, features = c("NKG7", "PF4"), slot = "counts", log = TRUE) FeaturePlot(PDAC2, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")) #將每個cluster內前幾名特別基因用來分析heatmap PDAC2.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) -> top10 DoHeatmap(PDAC2, features = top10$gene) + NoLegend() #cluster 命名 (原理是什麼??只是照cluster的順序而己) new.cluster.ids <- c("gp0", "gp1", "gp2", "gp3", "gp4", "gp5", "gp6", "gp7", "gp8", "gp9", "gp10", "gp11","gp12") names(new.cluster.ids) <- levels(PDAC2) PDAC2 <- RenameIdents(PDAC2, new.cluster.ids) DimPlot(PDAC2, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend() #暫時存下這個階段的資料, saveRDS(PDAC2, file= "D:/YCT/scRNAseq/5kPDAC/demo2.rds")

Comments

Popular posts from this blog

2022_1204_下載FPKM到TPM

2023/0727_使用 Seurat的2700 PBMC資料_了解scRNA-seq 分析流程

2023_0730/ 整合 snRNA-seq的資料, 去除Batch effect /Bioinformagician