WGCNA全流程

WGCNA

WGCNA是最初接触到课题的时候学习的,最近要帮别的课题组做一个四组数据的WGCNA分析,去寻找每组的特征基因,趁着个机会,整理一下WGCNA的全流程

什么是WGCNA?

WGCNA的全称为Weighted correlation network analysis, 基本的原理就是通过对基因的聚类将基因划分为多个模块,去分析模块和性状,包括性别,年龄,以及分组情况等的关系.

WGCNA流程

导入数据

导入基因表达数据.
# 数据的行为基因,列为样本
dat <- read.table("~/project/other.data/AD_USP25_4_GT/AllSamplesGeneExpressionCount.txt",header=T,stringsAsFactors=F)
sample.info <- read.table("~/project/other.data/AD_USP25_4_GT/sample.info.txt",header=T,stringsAsFactors=F)

# 将 counts 转换为 log2(counts+1),并且将数据转置
datExpr0 <- t(log2(dat+1))
datExpr0[datExpr0==0] <- NA

数据清洗

检查基因或者样本是否有过多的缺失值.
gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK
# 如果 gsg$allOK 返回 TRUE, 所有的基因和样本都通过了检测,如果返回FALSE,我们酒需要删除那些缺失值过多的基因和样本
if (!gsg$allOK) {
    if (sum(!gsg$goodGenes)>0)
        printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
    if (sum(!gsg$goodSamples)>0)
        printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
    datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
接下来我们对样本进行聚类检查是否存在离群样本.
sampleTree = hclust(dist(datExpr0), method = "average")

pdf(file = "sample_clustering.pdf", width = 12, height = 9);
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
cex.axis = 1.5, cex.main = 2)
abline(h=110,col="red")
dev.off()
样本聚类结果如上图所示,有两个样本AD6和WT2与别的样本相比呈现出离群的趋势,因此我们在下面的分析中需要去除这两个样本.
clust = cutreeStatic(sampleTree, cutHeight = 110, minSize = 10)
table(clust)
# clust 1包含我们想要保留的样本
|clust
|0  1
|2 22
keepSamples = (clust==1)
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

导入性状数据,在这里,我们的性状指样本的分组.

datTraits <- data.frame(sample=colnames(datExpr),stringsAsFactors=F)
datTraits$group <- unlist(sapply(datTraits$sample,function(x) unlist(strsplit(x,split="[0-9]"))[1] ))
datTraits$group2 <- c(1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4)
现在我们有了分组的数据,我们可以在计算模块之前去观察一下样本的聚类与样本的分组的关系
sampleTree2 = hclust(dist(datExpr), method = "average")
traitColors = numbers2colors(datTraits$group2, signed = T);
pdf(file = "sample_clustering_2.pdf", width = 7, height = 7)
plotDendroAndColors(sampleTree2, traitColors,
groupLabels = names(datTraits),
main = "Sample dendrogram and trait heatmap")
dev.off()

一步一步的建立基因网络

选择soft-thresholding power

powers = c(c(1:10), seq(from = 12, to=20, by=2))
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5, networkType = "signed")

pdf("power_selection.pdf",width=7,height=4)
par(mfrow = c(1,2))
cex1 = 0.9
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=c(0.8,0.85,0.90),col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()

共表达相似性计算

根据上图的结果,我们设置soft power = 12.
# Co-expression similarity and adjacency
softPower = 6
adjacency = adjacency(datExpr, power = softPower, type = "signed")
# Topological Overlap Matrix (TOM)
TOM = TOMsimilarity(adjacency, TOMType = "signed")
dissTOM = 1-TOM

基因聚类和模块检测

# 基因聚类
geneTree = hclust(as.dist(dissTOM), method = "average")

pdf("gene_clustering.pdf",width=12,height=9)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
dev.off()
# 模块检测
minModuleSize = 30
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
    deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = minModuleSize)
table(dynamicMods)

dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)

pdf("gene_clustering_2.pdf",width=8,height=6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
dev.off()
# 模块合并
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
MEDiss = 1-cor(MEs)
METree = hclust(as.dist(MEDiss), method = "average")

pdf("module_clustering.pdf",width=7,height=6)
plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "")
MEDissThres = 0.3
abline(h=MEDissThres, col = "red")
dev.off()
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
mergedColors = merge$colors
mergedMEs = merge$newMEs

pdf(file = "gene_clustering_3.pdf", wi = 8, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
dev.off()
# 模块颜色重命名
moduleColors = mergedColors
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs

将分组信息与模块相关联

nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = cor(MEs, datTraits, use = "p")
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

pdf("heatmap.of.correlation.pdf",width=10,height=6)
textMatrix = paste(signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()

评论

此博客中的热门博文

RNAseq学习与总结

10X Genomics单细胞转录组测序数据的处理