limma의 선형모형 + empirical Bayes를 사용해 차등발현 유전자를 구합니다. (one-color/Illumina는 log2 표현값, Agilent two-color는 M=log2(R/G) 사용)
Step 02에서 저장한 행렬을 그대로 사용합니다.
# expr: (gene/probe × sample) log2 —— Affy / Agilent 1C / Illumina
expr <- readRDS("affy_rma_log2.rds") # 예시: 파일명은 환경에 맞게 변경
# 또는 Agilent two-color
# M <- readRDS("agilent_twocolor_M.rds")
library(limma)
group <- factor(c("Control","Control","Case","Case"))
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
cont <- makeContrasts(Case_vs_Control = Case - Control, levels=design)
group <- factor(c("Control","Control","TreatA","TreatA","TreatB","TreatB"))
design <- model.matrix(~0 + group); colnames(design) <- levels(group)
cont <- makeContrasts(
TreatA_vs_Control = TreatA - Control,
TreatB_vs_Control = TreatB - Control,
levels = design
)
동일 피험자 반복 측정은 duplicateCorrelation로 상관을 모델링하거나, 블록을 넣어 처리합니다.
# subject: 개체(factor), condition: Before/After
subject <- factor(c("P1","P1","P2","P2","P3","P3","P4","P4"))
condition <- factor(c("Before","After","Before","After","Before","After","Before","After"))
design <- model.matrix(~0 + condition); colnames(design) <- levels(condition)
block <- subject
corfit <- duplicateCorrelation(expr, design, block=block) # expr 또는 M 사용
# contrasts
cont <- makeContrasts(After_vs_Before = After - Before, levels=design)
# batch: 실험 배치(factor), group: 관심 그룹
design <- model.matrix(~ 0 + group + batch) # 배치를 공변량으로 포함
colnames(design)[1:length(levels(group))] <- levels(group)
cont <- makeContrasts(Case_vs_Control = groupCase - groupControl, levels=design)
# expr: log2 expression matrix
fit <- lmFit(expr, design)
fit2 <- contrasts.fit(fit, cont)
fit2 <- eBayes(fit2)
# 단일 대비 예시
tt <- topTable(fit2, coef=1, number=Inf, sort.by="P", adjust.method="BH")
head(tt)
# 저장
write.csv(tt, file="DEG_limma_results.csv", row.names=TRUE)
# M: log2(R/G) matrix
fit <- lmFit(M, design)
fit2 <- contrasts.fit(fit, cont)
fit2 <- eBayes(fit2)
tt <- topTable(fit2, coef=1, number=Inf, sort.by="P", adjust.method="BH")
write.csv(tt, file="DEG_twocolor_M_results.csv")
# 일반적인 기준(예시): adj.P.Val < 0.05 & |logFC| >= 1
deg <- subset(tt, adj.P.Val < 0.05 & abs(logFC) >= 1)
nrow(deg); head(deg)
write.csv(deg, "DEG_significant.csv")
if (!requireNamespace("ggplot2", quietly=TRUE)) install.packages("ggplot2")
library(ggplot2)
tbl <- transform(tt,
negLogP = -log10(P.Value),
sig = factor(adj.P.Val < 0.05 & abs(logFC) >= 1,
levels=c(TRUE,FALSE), labels=c("Sig","NS"))
)
ggplot(tbl, aes(x=logFC, y=negLogP, color=sig)) +
geom_point(size=1.2, alpha=.7) +
geom_vline(xintercept=c(-1,1), linetype="dashed") +
geom_hline(yintercept=-log10(0.05), linetype="dashed") +
scale_color_manual(values=c("Sig"="#d62728","NS"="#7f7f7f")) +
labs(x="log2 Fold Change", y="-log10(P)", color="") +
theme_minimal()
if (!requireNamespace("pheatmap", quietly=TRUE)) install.packages("pheatmap")
library(pheatmap)
topGenes <- rownames(tt)[1:50] # 상위 50개 예시
Xsub <- expr[topGenes, , drop=FALSE] # two-color면 M 사용
pheatmap(scale(t(Xsub)), cluster_cols=TRUE, show_rownames=TRUE)
mat <- expr # two-color면 M 사용
pc <- prcomp(t(mat), scale.=TRUE)
plot(pc$x[,1], pc$x[,2], xlab="PC1", ylab="PC2",
pch=19, col=as.integer(group)+1)
legend("topright", legend=levels(group), pch=19, col=2:(length(levels(group))+1))
plotMA(fit2, coef=1, main="MA-plot (coef 1)")
abline(h=0, col="red", lty=2)
플랫폼별 annotation 패키지/파일을 사용해 심볼/Entrez/Ensembl을 붙입니다.
# 예시: org.Hs.eg.db로 SYMBOL 붙이기(키 타입은 환경에 맞게 변경)
if (!requireNamespace("org.Hs.eg.db", quietly=TRUE))
BiocManager::install("org.Hs.eg.db")
library(AnnotationDbi); library(org.Hs.eg.db)
keys <- rownames(tt)
ann <- AnnotationDbi::select(org.Hs.eg.db, keys=keys,
columns=c("SYMBOL","ENTREZID"), keytype="PROBEID")
tt2 <- cbind(PROBEID = rownames(tt), tt)
tt2 <- merge(tt2, ann, by.x="PROBEID", by.y="PROBEID", all.x=TRUE)
write.csv(tt2, "DEG_with_annotation.csv", row.names=FALSE)
duplicateCorrelation과 block을 함께 사용하는 것이 안정적입니다.removeBatchEffect로 시각화용 보정.