差异基因表达分析

DESeq2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#
# Differential expression analysis with the DESeq2 package.
#
# https://bioconductor.org/packages/release/bioc/html/DESeq2.html
#
# Command line install: mamba install bioconductor-deseq2
#

# Default design file (CSV; must contain the sample and factor columns).
DESIGN_FILE = "design.csv"

# The name of the file that contains the counts.
COUNTS_FILE = "counts.csv"

# The output file.
OUTPUT_FILE = "deseq2.csv"

# Install the optparse package automatically when it is missing.
# (require() is used deliberately here: unlike library(), it returns
# FALSE instead of erroring, which lets us trigger the installation.)
if (!require(optparse, quietly = TRUE)) {
  install.packages("optparse", repos = "http://cran.us.r-project.org")
  library(optparse)
}

# Command line options.
option_list = list(

  make_option(c("-d", "--design_file"), action = "store", default = DESIGN_FILE,
              dest = "design_file", type = 'character',
              help = "the design file for the samples [%default]"),

  make_option(c("-c", "--counts"), action = "store", default = COUNTS_FILE,
              dest = "counts_file", type = 'character',
              help = "input counts file [%default]"),

  make_option(c("-o", "--out"), action = "store", default = OUTPUT_FILE,
              dest = "output_file", type = 'character',
              help = "the results file [%default]"),

  make_option(c("-f", "--factor_name"), action = "store", default = "group",
              dest = "factor_name", type = 'character',
              help = "the factor column name in the design file [%default]"),

  make_option(c("-s", "--sample_name"), action = "store", default = "sample",
              dest = "sample_name", type = 'character',
              help = "the sample column name in the design file [%default]")

)

# Parse the options.
opt = parse_args(OptionParser(option_list = option_list))

# The name of the file that contains the counts.
counts_file = opt$counts_file

# The design file (CSV) that describes the samples.
design_file = opt$design_file

# The output file.
output_file = opt$output_file

# The name of the factor (condition/group) column in the design file.
factor_name = opt$factor_name

# The name of the sample column in the design file.
sample_name = opt$sample_name

# Inform the user.
cat("# Running DESeq2", "\n")
cat("# Design:", design_file, "\n")
cat("# Counts:", counts_file, "\n")

# Read the design file.
inp <- read.csv(design_file, strip.white = TRUE, stringsAsFactors = FALSE)

# Validate the sample column name.
cat("# Sample column:", sample_name, "\n")
if (!sample_name %in% colnames(inp)) {
  stop("# Sample column: ", sample_name, " not found in design file (use -s).")
}

# Validate the factor column name.
cat("# Factor column:", factor_name, "\n")
if (!factor_name %in% colnames(inp)) {
  stop("# Factor column: ", factor_name, " not found in design file (use -f).")
}

# Working data frame: one row per sample with its group assignment.
# The column is named "group" so it matches the model formula (~group)
# used when building the DESeq2 dataset below.
# (Fix: this frame was previously created twice in a row, and the column
# was named "groups" while later code relied on `$group` partial matching.)
df = data.frame(
  sample = inp[, sample_name],
  group = inp[, factor_name]
)

# The distinct groups, in order of first appearance in the design file.
groups = unique(df$group)

# Print information on the groups.
for (grp in groups) {
  n_grp <- sum(df$group == grp)
  cat("# Group", grp, "has", n_grp, "samples.\n")
}

# Turn the group labels into factors.
df$group = factor(df$group)

# The first level should correspond to the first entry in the file.
# Required later when building the model (it becomes the reference level).
df$group = relevel(df$group, toString(df$group[1]))

# Isolate the sample names.
sample_names = df$sample

# Read the count data from the input file.
count_df = read.csv(counts_file, strip.white = TRUE, stringsAsFactors = FALSE)

# DESeq2 requires integer counts; round the sample columns.
count_data = round(count_df[, sample_names, drop = FALSE])

# Columns to carry over to the output: everything that is not a sample
# column (and not a pre-existing FDR column).
other_cols = names(count_df)[!names(count_df) %in% c(sample_names, "FDR")]

# Carry over all additional columns not in sample names.
row_data = count_df[, other_cols, drop = FALSE]

#
# Running DESeq2
#

# Bioconductor/CRAN packages used below.
bio.p <- c("DESeq2", "tibble", "dplyr", "tools")

# Load the packages quietly; `status` just captures lapply's return value.
cat("# Initializing ", bio.p, "...")
status = suppressPackageStartupMessages(
lapply(bio.p, library, character.only = TRUE)
)
cat(" done\n")

# Build the DESeq2 dataset from the count matrix, the carried-over
# feature annotation columns (rowData) and the per-sample design (colData).
# The model formula uses the single factor column named "group".
dds = DESeqDataSetFromMatrix(
countData = count_data,
rowData = row_data,
colData = df,
design = ~group
)

# Number of rows before filtering (reported at the end).
n_start = nrow(dds)

# Keep features whose total count across all samples exceeds 3
# (drops all-zero and very-low-count rows before fitting).
keep <- rowSums(counts(dds)) > 3

# Stricter alternative: at least 3 samples with a count of 10 or higher.
# keep <- rowSums(counts(dds) >= 10) >= 3

# Apply the filtering.
dds <- dds[keep, ]

# Run the DESeq2 pipeline (size factors, dispersions, model fit, testing).
dds = DESeq(dds)

# Get the normalized counts.
normed = counts(dds, normalized=TRUE)

# Round normalized counts to a single digit.
normed = round(normed, 1)

# Extract the result table (log2 fold changes, p-values, adjusted p-values).
res = results(dds)

#
# The rest of the code is about formatting the output dataframe.
#

# The first columns that carry over from the input.
start = dplyr::as_tibble(rowData(dds))[, other_cols]

# The main tibble: annotation columns followed by the DESeq2 results.
data = bind_cols(start, as_tibble(res))

# Create the linear-scale foldChange column from the log2 fold change.
data$foldChange = 2 ^ data$log2FoldChange

# Rename the columns for consistency with the other tools.
# Note: DESeq2's padj is a BH-adjusted p-value, i.e. an FDR.
data = dplyr::rename(data, PValue=pvalue, FDR=padj)

# Set all NA FDR values to 1 so downstream arithmetic/sorting works.
data$FDR[is.na(data$FDR)] <- 1

# Create an additional adjusted p-value via the stricter Hochberg method.
data$PAdj = p.adjust(data$PValue, method="hochberg")

# Combine the normalized count matrix with the results.
total <- bind_cols(data, normed)

# Sort by FDR, most significant first.
# (The previous comment claimed this sorted by P-value.)
total = arrange(total, FDR)

# Expected number of false positives among each row and the rows above it,
# computed on the sorted table: rank * FDR.
total$falsePos = seq_len(nrow(total)) * total$FDR

# Sample names for conditions A and B.
# NOTE(review): only the first two groups are used; designs with more than
# two factor levels are silently reduced to the first two.
col_names_A <- sample_names[df$group == groups[[1]]]
col_names_B <- sample_names[df$group == groups[[2]]]

# Per-condition means of the normalized counts.
# (The placeholder baseMeanA/baseMeanB = 1 assignments were removed;
# these assignments create the columns directly.)
total$baseMeanA = rowMeans(total[, col_names_A])
total$baseMeanB = rowMeans(total[, col_names_B])

# Bringing some sanity to numbers. Round columns to fewer digits.
total$foldChange = round(total$foldChange, 3)
total$log2FoldChange = round(total$log2FoldChange, 1)
total$baseMean = round(total$baseMean, 1)
total$baseMeanA = round(total$baseMeanA, 1)
total$baseMeanB = round(total$baseMeanB, 1)
total$lfcSE = round(total$lfcSE, 2)
total$stat = round(total$stat, 2)
total$FDR = round(total$FDR, 4)
total$falsePos = round(total$falsePos, 0)

# Reorganize column names into a more readable order.
new_cols = c(other_cols,
             "baseMean", "baseMeanA", "baseMeanB",
             "foldChange", "log2FoldChange",
             "lfcSE", "stat", "PValue", "PAdj",
             "FDR", "falsePos", col_names_A, col_names_B)

# Slice the dataframe with the new column order.
total = total[, new_cols]

# Count the number of significant results.
n_total = nrow(total)
fdr_count = sum(total$FDR < 0.05, na.rm=TRUE)
fdr_frac = round(fdr_count/n_total, 3) * 100

pval_count = sum(total$PValue < 0.05, na.rm=TRUE)
pval_frac = round(pval_count/n_total, 3) * 100

# Reformat the p-value columns as strings in scientific notation.
total$PAdj = formatC(total$PAdj, format = "e", digits = 1)
total$PValue = formatC(total$PValue, format = "e", digits = 1)

# Write the results to the output file.
write.csv(total, file=output_file, row.names = FALSE, quote = FALSE)

# Inform the user.
cat("# Input:", n_start, "rows\n")
cat("# Removed:", n_start-n_total, "rows\n")
cat("# Fitted:", n_total, "rows\n")
cat("# Significant PVal:", pval_count, "(", pval_frac, "%)\n")
cat("# Significant FDRs:", fdr_count, "(", fdr_frac, "%)\n")
cat("# Results:", output_file, "\n")


DESeq2 的输出中,padj 实际上就是 FDR!一般我们认为:PAdj 进行多重检验校正所使用的方法是 Hochberg,而 FDR 进行多重检验校正所使用的方法是 Benjamini & Hochberg(即 BH 法)。所以在代码中我们进行了相应的重命名与转换操作。

数据结果说明

name - the feature identity. It must be unique within the column. It may be a gene name, transcript name, or exon - whatever the feature we chose to quantify.
baseMean - the average normalized expression level across samples. It measures how much total signal is present across both conditions.
baseMeanA - the average normalized expression level across the first condition. It measures how much total signal is there for condition A.
baseMeanB - the average normalized expression level across the second condition. It measures how much total signal is there for condition B.
foldChange - the ratio of baseMeanB/baseMeanA. Very important to always be aware that in the fold change means B/A (second condition/first condition)
log2FoldChange - the base-2 logarithm of foldChange. Log 2 transformations are convenient as they transform the changes onto a uniform scale. A four-fold increase after transformation is 2. A four-fold decrease (1/4) after log 2 transform is -2. This property makes it much easier to compare the magnitude of up/down changes.
PValue - the uncorrected p-value of the likelihood of observing the effect of the size foldChange (or larger) by chance alone. This p-value is not corrected for multiple comparisons.
PAdj - the multiple comparisons corrected PValue (via the Hochberg method). This is the probability of having at least one false positive when accounting for all comparisons made. This value is usually overly conservative in genomics.
FDR - the False Discovery Rate - this column represents the fraction of false discoveries for all the rows above the row where the value is listed. For example, if in row number 300 the FDR is 0.05, it means that if you were to cut the table at this row and accept all genes at and above it as differentially expressed, then 300 * 0.05 = 15 genes out of the 300 are likely to be false positives. The values in this column are also called q-values.
falsePos - this column is derived directly from FDR and represents the number of false positives in the rows above. It is computed as RowIndex * FDR and is there to provide a direct interpretation of FDR.
The following columns represent the normalized matrix of the original count data - in this case, 3 samples for each of the two conditions.

edgeR

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#
# Differential expression analysis with the edgeR package.
#
# https://bioconductor.org/packages/release/bioc/html/edgeR.html
#

# Load the library
library(edgeR)

# The name of the file that contains the counts.
counts_file = "counts.csv"

# The sample file is in CSV format and must have the headers "sample" and "condition".
design_file = "design.csv"

# The final result file.
output_file = "results.csv"

# Read the sample (design) file.
colData <- read.csv(design_file, stringsAsFactors=F)

# Turn conditions into factors.
colData$condition = factor(colData$condition)

# The first level should correspond to the first entry in the file!
# Required when building a model (it becomes the reference level).
colData$condition = relevel(colData$condition, toString(colData$condition[1]))

# Isolate the sample names.
sample_names <- colData$sample

# Read the count data from the counts file; the first column becomes the row names.
df = read.csv(counts_file, header=TRUE, row.names=1 )

# edgeR expects integer counts; round the sample columns.
counts = round(df[, sample_names])

# Other columns in the dataframe that are not sample information
# (feature annotation to be carried over to the output).
otherCols = df[!(names(df) %in% sample_names)]

# Using the same naming as in the library.
group <- colData$condition

# Creates a DGEList object from a table of counts and group.
dge <- DGEList(counts=counts, group=group)

# Maximizes the negative binomial conditional common likelihood to estimate a common dispersion value across all genes.
dis <- estimateCommonDisp(dge)

# Estimates tagwise dispersion values by an empirical Bayes method based on weighted conditional maximum likelihood.
tag <- estimateTagwiseDisp(dis)

# Compute genewise exact tests for differences in the means between the groups.
etx <- exactTest(tag)

# Extract all results (n = number of features), sorted by significance.
etp <- topTags(etx, n=nrow(counts))

# Effective library sizes (library size times the normalization factor).
scale = dge$samples$lib.size * dge$samples$norm.factors

# Normalized counts: divide each column by its effective library size,
# then rescale by the mean effective size to stay on a count-like scale.
normed = round(t(t(counts)/scale) * mean(scale))

# Turn the edgeR results into a data frame, joining the carried-over
# annotation columns by feature name (the row names).
data = merge(otherCols, etp$table, by="row.names")

# Placeholder columns, filled in after the merge with the normalized counts.
data$baseMean = 1
data$baseMeanA = 1
data$baseMeanB = 1

# Linear-scale fold change derived from edgeR's log2 fold change.
data$foldChange = 2 ^ data$logFC

# Rename the column for consistency with the DESeq2 output.
names(data)[names(data)=="logFC"] <- "log2FoldChange"

# Compute a stricter Hochberg-adjusted p-value (edgeR's FDR column is BH).
data$PAdj = p.adjust(data$PValue, method="hochberg")

# Rename the first column for consistency with other methods.
colnames(data)[1] <- "name"

# Create a merged output that contains the normalized counts.
total <- merge(data, normed, by.x='name', by.y="row.names")

# Sort the data for the output (most significant first).
total = total[with(total, order(PValue, -foldChange)), ]

# Compute the false discovery counts on the SORTED table: rank * FDR.
# (Bug fix: this was previously computed on the unsorted `data` frame
# AFTER `total` had been created, so the result was discarded and the
# output falsePos column was the constant placeholder 1.)
total$falsePos = seq_len(nrow(total)) * total$FDR

# Sample names for condition A
col_names_A = data.frame(split(colData, colData$condition)[1])[,1]

# Sample names for condition B
col_names_B = data.frame(split(colData, colData$condition)[2])[,1]

# Create the individual baseMean columns from the normalized counts.
total$baseMeanA = rowMeans(total[, col_names_A])
total$baseMeanB = rowMeans(total[, col_names_B])
total$baseMean = total$baseMeanA + total$baseMeanB

# Round the numbers
total$foldChange = round(total$foldChange, 3)
total$FDR = round(total$FDR, 4)
total$PAdj = round(total$PAdj, 4)
total$logCPM = round(total$logCPM, 1)
total$log2FoldChange = round(total$log2FoldChange, 1)
total$baseMean = round(total$baseMean, 1)
total$baseMeanA = round(total$baseMeanA, 1)
total$baseMeanB = round(total$baseMeanB, 1)
total$falsePos = round(total$falsePos, 0)

# Reformat the p-value columns as strings in scientific notation.
total$PAdj = formatC(total$PAdj, format = "e", digits = 1)
total$PValue = formatC(total$PValue, format = "e", digits = 1)

# Reorganize columns names to make more sense.
new_cols = c("name", names(otherCols), "baseMean","baseMeanA","baseMeanB","foldChange",
             "log2FoldChange","logCPM","PValue","PAdj", "FDR","falsePos",col_names_A, col_names_B)

# Slice the dataframe with new columns.
total = total[, new_cols]

# Write the result to the output file.
write.csv(total, file=output_file, row.names=FALSE, quote=FALSE)

# Inform the user.
print("# Tool: edgeR")
print(paste("# Design: ", design_file))
print(paste("# Input: ", counts_file))
print(paste("# Output: ", output_file))

数据结果说明

name - the feature identity. It must be unique within the column. It may be a gene name, transcript name, or exon - whatever the feature we chose to quantify.
baseMean - the average normalized expression level across samples. It measures how much total signal is present across both conditions.
baseMeanA - the average normalized expression level across the first condition. It measures how much total signal is there for condition A.
baseMeanB - the average normalized expression level across the second condition. It measures how much total signal is there for condition B.
foldChange - the ratio of baseMeanB/baseMeanA. Very important to always be aware that in the fold change means B/A (second condition/first condition)
log2FoldChange - the base-2 logarithm of foldChange. Log 2 transformations are convenient as they transform the changes onto a uniform scale. A four-fold increase after transformation is 2. A four-fold decrease (1/4) after log 2 transform is -2. This property makes it much easier to compare the magnitude of up/down changes.
PValue - the uncorrected p-value of the likelihood of observing the effect of the size foldChange (or larger) by chance alone. This p-value is not corrected for multiple comparisons.
PAdj - the multiple comparisons corrected PValue (via the Hochberg method). This is the probability of having at least one false positive when accounting for all comparisons made. This value is usually overly conservative in genomics.
FDR - the False Discovery Rate - this column represents the fraction of false discoveries for all the rows above the row where the value is listed. For example, if in row number 300 the FDR is 0.05, it means that if you were to cut the table at this row and accept all genes at and above it as differentially expressed, then 300 * 0.05 = 15 genes out of the 300 are likely to be false positives. The values in this column are also called q-values.
falsePos - this column is derived directly from FDR and represents the number of false positives in the rows above. It is computed as RowIndex * FDR and is there to provide a direct interpretation of FDR.
The following columns represent the normalized matrix of the original count data - in this case, 3 samples for each of the two conditions.