Biostar workflows

Guide

在各个数据库中有大量的数据,可以通过一些工具进行下载:

parallel

当我们有一个文件如下所示:

1
2
3
4
5
6
7
sample,group,file
control1,control,sample_J1_ABC
control2,control,sample_K2_DEF
control3,control,sample_L3_GHI
heatshock1,treatment,sample_Q1_JKL
heatshock2,treatment,sample_Q2_MNO
heatshock3,treatment,sample_Q3_PQR

我们想根据列名去提取变量,并在parallel中运行,我们可以使用--header :以及--colsep ,然后就可以实现以下功能:
1
2
3
4
5
6
cat design.csv | parallel --header : --colsep , \
make -f snpcall.mk \
REF=refs/genome.fa \
R1=reads/{file}.fq \
BAM=bam/{sample}.bam \
run

访问metadata

如果我们有NCBI accession number,我们可以通过bio search查询metadata:

1
2
3
4
5
6
7
8
9
10
11
# Searches GenBank
bio search AF086833

# Searches SRA
bio search SRR1553425

# Searches NCBI Genomes
bio search GCA_000005845

# Also searches NCBI Genomes
bio search ecoli

下载GenBank数据

GenBank是NIH基因序列数据库,如果我们有一个GenBank的accession number,我们可以通过genbank.mk来下载GenBank数据。例如我们想下载AF086833的FASTA和GFF数据,我们可以通过以下命令:

1
make -f src/run/genbank.mk fasta gff ACC=AF086833
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#
# Downloads NCBI data via the Entrez API.
#

# Accession number at NCBI
ACC ?= AF086833

# The accession as a fasta file.
REF ?= refs/${ACC}.fa

# The accession as a genbank file.
GBK ?= refs/${ACC}.gb

# The accession as a gff file.
GFF ?= refs/${ACC}.gff

# Check the make version (.RECIPEPREFIX was added in GNU Make 4.0).
ifeq ($(origin .RECIPEPREFIX), undefined)
$(error "### Error! Please use GNU Make 4.0 or later ###")
endif

# Makefile customizations.
.DELETE_ON_ERROR:
.ONESHELL:
SHELL := bash
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# Print usage information.
usage::
	@echo "#"
	@echo "# genbank.mk: download sequences from GenBank"
	@echo "#"
	@echo "# ACC=${ACC}"
	@echo "# REF=${REF}"
	@echo "# GBK=${GBK}"
	@echo "# GFF=${GFF}"
	@echo "#"
	@echo "# make fasta genbank gff run"
	@echo "#"

# Obtain the sequence from NCBI in FASTA format.
${REF}:
	mkdir -p $(dir $@)
	bio fetch ${ACC} --format fasta > $@

# Obtain the sequence from NCBI in GenBank format.
${GBK}:
	mkdir -p $(dir $@)
	bio fetch ${ACC} > $@

# Obtain the annotations from NCBI in GFF format.
${GFF}:
	mkdir -p $(dir $@)
	bio fetch ${ACC} --format gff > $@

# Download FASTA file.
fasta:: ${REF}
	@ls -lh ${REF}

# Download GenBank file.
genbank:: ${GBK}
	@ls -lh ${GBK}

# Download GFF file.
gff:: ${GFF}
	@ls -lh ${GFF}

# Remove FASTA file.
fasta!::
	rm -rf ${REF}

# Remove GenBank file.
genbank!::
	rm -rf ${GBK}

# Remove GFF file.
gff!::
	rm -rf ${GFF}

# The run target is FASTA
run:: fasta

# Undo run target
run!:: fasta!

# Exercise each download target from a clean slate.
test:
	make -f src/run/genbank.mk fasta! fasta ACC=${ACC} REF=${REF}
	make -f src/run/genbank.mk gff! gff ACC=${ACC} GFF=${GFF}
	make -f src/run/genbank.mk genbank! genbank ACC=${ACC} GBK=${GBK}

# Installation instructions
install::
	@echo pip install bio --upgrade

# Targets that are not files.
.PHONY: usage install run run! test fasta fasta! genbank genbank! gff gff!

从NCBI访问参考基因组

NCBI在blog中提出从2024年6月,使用新的NCBI Datasets resource来访问参考基因组,从而替换旧的Genome Assembly resource。

但是,目前资源仍然有效,主要入口站点位于:

假如我们有一个人参考基因组的accession number,GCA_000001405.29我们可以先通过bio search查询:

1
bio search GCA_000001405.29

它将输出:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
[
{
"assembly_accession": "GCA_000001405.29",
"bioproject": "PRJNA31257",
"biosample": "",
"wgs_master": "",
"refseq_category": "reference genome",
"taxid": "9606",
"species_taxid": "9606",
"organism_name": "Homo sapiens",
"infraspecific_name": "",
"isolate": "",
"version_status": "latest",
"assembly_level": "Chromosome",
"release_type": "Patch",
"genome_rep": "Full",
"seq_rel_date": "2022/02/03",
"asm_name": "GRCh38.p14",
"submitter": "Genome Reference Consortium",
"gbrs_paired_asm": "GCF_000001405.40",
"paired_asm_comp": "different",
"ftp_path": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14",
"excluded_from_refseq": "",
"relation_to_type_materialasm_not_live_date": ""
}
]

访问基因组所在的URL,然后即可通过前面的模块命令进行下载数据。

  • NCBI支持rsync协议(大多数网站不支持),可以使用rsync.mk进行下载数据,只需要将链接中的https改为rsync就可以下载完整目录来。
    1
    2
    3
    4
    5
    6
    7
    8
    # You need to change the protocol to rsync in this case.
    URL=rsync://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/005/845/GCA_000005845.2_ASM584v2

    # You can use rsync directly
    rsync -avz $URL .

    # Or use our rsync.mk makefile module.
    make -f src/run/rsync.mk get URL=$URL
rsync.mk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#
# Download data via rsync
#
# You might ask yourself: why have a makefile for what seems to be a trivial command?
#
# In this case to help us remember the correct commands and to make it fit with
# the rest of the workflow building process.
#
# The rsync rules everyone always forgets:
#
# - trailing slash on URL means copy the contents of the directory
# - no trailing slash on URL means copy the directory itself
# - a slash on the destination directory has no effect
#

# The remote location we wish to download
URL ?= rsync://hgdownload.soe.ucsc.edu/goldenPath/eboVir3/bigZips

# The destination that the download will be placed in.
DEST ?= refs

# Makefile customizations.
SHELL := bash
.RECIPEPREFIX = >
.DELETE_ON_ERROR:
.ONESHELL:
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# General usage information (the tunable knobs are URL and DEST).
usage::
> @echo ""
> @echo "# rsync.mk: downloads data via rsync protocol"
> @echo ""
> @echo "# make run URL=? DEST=?"
> @echo ""

# rsync will automatically detect updated files.
run::
> mkdir -p ${DEST}
> rsync --times --copy-links --recursive -vz -P ${URL} ${DEST}
> find ${DEST}/*

run!::
> @echo "# cannot undo an rsync download"

# Installation instructions
install::
> @echo "# no installation required"

从Ensembl访问数据

Ensembl是一个基因组数据库,提供了大量的数据,并且按照版本号进行归档,我们可以通过网页或者发布网站进行访问:

获取FASTQ数据

通过SRA下载

如果我们知道SRA编号,首先我们使用bio search搜索metadata:

1
2
# Look up information on a SRR number.
bio search SRR1553425

我们会得到一些信息:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
[
{
"run_accession": "SRR1553425",
"sample_accession": "SAMN02951957",
"sample_alias": "EM110",
"sample_description": "Zaire ebolavirus genome sequencing from 2014 outbreak in Sierra Leone",
"first_public": "2015-06-05",
"country": "Sierra Leone",
"scientific_name": "Zaire ebolavirus",
"fastq_bytes": "111859282;119350609",
"base_count": "360534650",
"read_count": "1784825",
"library_name": "EM110_r1.ADXX",
"library_strategy": "RNA-Seq",
"library_source": "TRANSCRIPTOMIC",
"library_layout": "PAIRED",
"instrument_platform": "ILLUMINA",
"instrument_model": "Illumina HiSeq 2500",
"study_title": "Zaire ebolavirus Genome sequencing",
"fastq_url": [
"https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR155/005/SRR1553425/SRR1553425_1.fastq.gz",
"https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR155/005/SRR1553425/SRR1553425_2.fastq.gz"
],
"info": "112 MB, 119 MB file; 2 million reads; 360.5 million sequenced bases"
}
]

上方的metadata中fastq_url就是我们的下载链接,我们也可以通过命令直接识别数据的URL:
1
bio search SRR1553425 | jq -r '.[].fastq_url[]'

对于大型数据,可以通过aria2去下载我们的数据,也可以使用自定义makefile模块aria.mk
1
2
3
4
5
# The file to download.
URL=https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR155/005/SRR1553425/SRR1553425_1.fastq.gz

# Download with 5 connections.
aria2c -x 5 -d reads $URL

或者

1
2
3
4
URL=https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR155/005/SRR1553425/SRR1553425_1.fastq.gz

# Use the aria module to download the file.
make -f src/run/aria.mk run URL=${URL} FILE=reads/SRR1553425_1.fastq.gz

我们也可以使用fastq-dump工具下载数据,使用自定义的sra.mk模块:

1
make -f src/run/sra.mk run SRR=SRR1553425 N=ALL

如果我们有一个项目编号,也可以一次获取项目的所有数据:
1
2
3
4
bio search PRJNA257197 --csv > project.csv

cat project.csv | csvcut -c 1 | \
parallel make -f src/run/sra.mk run SRR={} N=ALL

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#
# Download data via the aria2c downloader.
#

# The URL to be downloaded
URL ?= https://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/bigZips/wuhCor1.fa.gz

# The directory to put the data into (overridable like the other knobs).
DIR ?= .

# Destination file.
FILE ?= ${DIR}/$(notdir ${URL})

# Aria2c flags: 5 connections, resume partial downloads, periodic progress summary.
FLAGS ?= -x 5 -c --summary-interval=10

# Makefile customizations.
SHELL := bash
.DELETE_ON_ERROR:
.ONESHELL:
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# General usage information.
usage:
	@echo "#"
	@echo "# aria.mk: downloads data"
	@echo "#"
	@echo "# make run URL=? FILE=?"
	@echo "#"

# Download the file with aria2c.
${FILE}:
	mkdir -p $(dir $@)
	aria2c ${FLAGS} -o $@ ${URL}

run: ${FILE}
	@ls -lh $<

# Remove the file.
run!:
	rm -f ${FILE}

# A shortcut to default run
test: run

# Installation instructions
install:
	@echo "micromamba install aria2"

.PHONY:: usage run run! install test

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#
# Downloads sequencing reads from SRA.
#

# The directory that stores FASTQ reads that we operate on.
DIR ?= reads

# SRR number (sequencing run from the Ebola outbreak data of 2014)
SRR ?= SRR1553425

# The name of the unpacked reads
P1 ?= ${DIR}/${SRR}_1.fastq
P2 ?= ${DIR}/${SRR}_2.fastq

# The name of the final reads (may be the same)
R1 ?= ${P1}
R2 ?= ${P2}

# How many reads to download (N=ALL downloads everything).
N ?= ALL

# Makefile customizations.
.DELETE_ON_ERROR:
SHELL := bash
.ONESHELL:
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# Print usage information.
usage::
	@echo "#"
	@echo "# sra.mk: downloads FASTQ reads from SRA"
	@echo "#"
	@echo "# make run SRR=${SRR} N=${N}"
	@echo "#"

# Set the flags for the download
ifeq ($(N), ALL)
FLAGS ?= -F --split-files
else
FLAGS ?= -F --split-files -X ${N}
endif

# Obtain the reads from SRA, then rename to the requested file names.
# Note: the steps are kept in one contiguous recipe block because a
# blank, untabbed line would terminate the recipe.
${R1}:
	mkdir -p ${DIR}
	fastq-dump ${FLAGS} -O ${DIR} ${SRR}
	@if [ "${P1}" != "${R1}" ]; then
		mv -f ${P1} ${R1}
	fi
	@if [ -f ${P2} ] && [ "${P2}" != "${R2}" ]; then
		mv -f ${P2} ${R2}
	fi

# List the data. We don't know if the reads are paired or not.
run: ${R1}
	@if [ -f ${R2} ]; then
		ls -lh ${R1} ${R2}
	else
		ls -lh ${R1}
	fi

# Removes the SRA files.
run!:
	rm -f ${P1} ${P2} ${R1} ${R2}

# Re-download the FASTQ reads from a clean slate.
test:
	make -f src/run/sra.mk SRR=${SRR} run! run

install::
	@echo micromamba install sra-tools

# Targets that are not files.
.PHONY: usage run run! test install

通过SRA Explorer

SRA Explorer工具旨在使序列读取档案中的数据集更易于访问。它具备 NCBI SRA 网站应有的一切功能。它是一个网页应用程序,可让您搜索数据集、查看元数据:

https://sra-explorer.info/

通过NCBI网站

可以直接访问SRA数据库,访问和浏览数据

https://www.ncbi.nlm.nih.gov/sra

通过URL下载数据

如果我们有一个数据的URL,我们可以通过wget或curl进行下载,这里也有一个curl.mk来实现这个功能:

1
2
3
4
5
# The URL of the file.
URL=http://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.22.fa.gz

# Download with curl
make -f src/run/curl.mk run URL=${URL} FILE=refs/chr22.fa.gz

对于大型数据,推荐使用aria软件,支持更快的多线程和断点下载。同样可以使用aria.mk来实现这个功能:

1
2
3
4
5
# The URL of the file.
URL=http://ftp.ensembl.org/pub/release-104/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.22.fa.gz

# Download with aria2
make -f src/run/aria.mk run URL=${URL} FILE=refs/chr22.fa.gz

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#
# Download data via the curl downloader.
#

# The URL to be downloaded
URL ?= https://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/bigZips/wuhCor1.fa.gz

# Destination file.
FILE ?= refs/$(notdir ${URL})

# Curl flags: follow redirects; fail on HTTP errors so that a server
# error page is never saved as the target (.DELETE_ON_ERROR cleans up).
FLAGS ?= -f -L

#
# Cute trick. The module also allows to fetch a limited number of lines
# from a remote gzipped url.
#

# How many lines to get
N =

# Apply the header
ifeq ($(strip $(N)),)
HEAD =
else
HEAD = | head -n ${N}
endif

# Should it unzip
GUNZIP =

ifeq ($(strip $(GUNZIP)),)
GUNZIP_CMD =
else
GUNZIP_CMD = | gunzip -c
endif

# Should it rezip
BGZIP =

ifeq ($(strip $(BGZIP)),)
BGZIP_CMD =
else
BGZIP_CMD = | bgzip -@ 4 -c
endif

# The optional post-processing pipeline applied to the download stream.
FILTER = ${GUNZIP_CMD} ${HEAD} ${BGZIP_CMD}

# Makefile customizations.
SHELL := bash
.DELETE_ON_ERROR:
.ONESHELL:
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# General usage information.
usage:
	@echo "#"
	@echo "# curl.mk: downloads data"
	@echo "#"
	@echo "# make run URL=? FILE=?"
	@echo "#"

# Download the file with curl.
${FILE}:
	mkdir -p $(dir $@)
	curl ${FLAGS} ${URL} ${FILTER} > $@

run: ${FILE}
	@ls -lh $<

# For all cases remove the file and temporary file.
run!:
	rm -f ${FILE}

# Installation instructions
install:
	@echo "# no installation required"

.PHONY:: usage run run! install
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#
# Download data via the aria2c downloader.
#

# The URL to be downloaded
URL ?= https://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/bigZips/wuhCor1.fa.gz

# The directory to put the data into
DIR=.

# Destination file.
FILE ?= ${DIR}/$(notdir ${URL})

# Aria2c flags.
FLAGS = -x 5 -c --summary-interval=10

# Makefile customizations.
SHELL := bash
.DELETE_ON_ERROR:
.ONESHELL:
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# General usage information.
usage:
@echo "#"
@echo "# aria.mk: downloads data"
@echo "#"
@echo "# make run URL=?"
@echo "#"

# Download the file with aria2c.
${FILE}:
mkdir -p $(dir $@)
aria2c ${FLAGS} -o $@ ${URL}

run: ${FILE}
@ls -lh $<

# Remove the file.
run!:
rm -f ${FILE}

# A shortcut to default run
test: run

# Installation instructions
install:
@echo "micromamba install aria2"

.PHONY:: usage run run! install test

refgenie工具

Refgenie 是一款命令行工具,可用于下载和管理参考基因组,也可以构建和管理自定义基因组集。

安装

可以使用conda进行安装:

1
conda install refgenie

但是,一般来说,基于Python的命令行工具,我们可以使用pipx保存在单独的环境中,这样还有一个好处是refgenie可以在所有的环境中使用。
1
2
3
pip install pipx
pipx ensurepath
pipx install refgenie

配置

创建配置文件

refgenie需要一个配置文件,每次运行时候都需要-c参数指定配置文件。更为方便的是,我们可以创建REFGENIE shell环境变量,来自动默认加载配置文件。
我们将我们的配置文件放在~/refs中(可以自定义),然后在~/.bashrc中添加:

1
2
# Our configuration file is located in the ~/refs folder. 
export REFGENIE=~/refs/config.yaml

也可以直接命令行输入:
1
echo "export REFGENIE=~/refs/config.yaml" >> ~/.bashrc

初始化配置文件

1
refgenie init -c ~/refs/config.yaml

添加资源

refgenie可以订阅iGenomes服务器资源,托管额外的参考基因组数据。

1
refgenie subscribe -s http://igenomes.databio.org/

检索

  • 列出远程基因组资源
    1
    refgenie listr
  • 列出本地基因组资源

    1
    refgenie list
  • 查找本地资源的路径:

    1
    refgenie seek hg38/gencode_gtf

使用

  • 下载资源

    1
    refgenie pull hg38/gencode_gtf
  • Bash脚本中使用

    1
    2
    # Set the GTF variable to the path of the hg38/gencode_gtf resource
    REF=$(refgenie seek hg38/fasta)
  • Makefile中使用
    在makefile中,我们可以使用$(shell command)来执行命令,然后将结果赋值给变量,如下所示:

    1
    2
    # Set the GTF variable to the path of the hg38/gencode_gtf resource
    GTF=$(shell refgenie seek hg38/gencode_gtf)

genomepy工具

genomepy是一个下载和使用基因组数据的工具,它可用于:

  • 搜索可用数据
  • 显示可用的metadata
  • 自动下载、预处理
  • 生成可选的比对索引

安装

同样有两种安装方式:

  • 使用micromamba安装

    1
    micromamba install genomepy
  • 或者使用pipx安装

    1
    2
    3
    pip install pipx
    pipx ensurepath
    pipx install genomepy

检索

搜索基因组:

1
genomepy search ecoli > listing.txt

下载基因组:

1
genomepy install ViralProj272395

默认的下载文件位置在~/.local/share/genomes/ViralProj272395中,可以通过-g参数指定下载位置。

下载注释文件

需要先安装一些软件包,然后进行检索下载。

1
2
micromamba install ucsc-genepredtogtf ucsc-genepredtobed ucsc-bedtogenepred \
ucsc-gtftogenepred ucsc-gff3togenepred

检索注释包

1
genomepy annotation ViralProj272395

下载安装注释

1
genomepy install ViralProj272395 --annotation

则目录~/.local/share/genomes/ViralProj272395包含以下内容:

1
2
3
4
5
6
7
8
README.txt
ViralProj272395.annotation.bed
ViralProj272395.annotation.gtf
ViralProj272395.fa
ViralProj272395.fa.fai
ViralProj272395.fa.sizes
ViralProj272395.gaps.bed
assembly_report.txt

Recipes

短序列比对

1
make -f src/recipes/short-read-alignments.mk run
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#
# Biostar Workflows: https://www.biostarhandbook.com/
#

# Genome accession.
ACC = KM233118

# SRR number.
SRR = SRR1972918

# Set both reads for paired end and only one for single end.
# Derived from ${SRR} so overriding SRR updates the read paths too.
R1 = reads/${SRR}_1.fq
R2 = reads/${SRR}_2.fq

# The reference genome.
REF = refs/${ACC}.fa

# The name of the gff file.
GFF = refs/${ACC}.gff

# The alignment file.
BAM = bam/${SRR}.bam

# Makefile customizations.
SHELL := bash
.DELETE_ON_ERROR:
.ONESHELL:
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

usage:
	@echo "#"
	@echo "# Align short reads to a reference genome"
	@echo "#"
	@echo "# make run"
	@echo "#"

# Download the reference genome.
genome:
	make -f src/run/genbank.mk fasta gff ACC=${ACC} REF=${REF} GFF=${GFF}

# Download the short reads.
fastq:
	make -f src/run/sra.mk run SRR=${SRR} N=2500 R1=${R1} R2=${R2}

# Perform data related tasks
data: genome fastq
	make -f src/run/bwa.mk index REF=${REF}

# Index and align the reads to the reference genome.
align:
	make -f src/run/bwa.mk run REF=${REF} R1=${R1} R2=${R2} BAM=${BAM}

# Run the complete pipeline.
run: data align

# Remove the alignment generated by the pipeline
run!:
	rm -f ${BAM}

# Targets that are not files.
.PHONY: usage genome fastq data align run run!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#
# Downloads NCBI data vie Entrez API
#

# Accession number at NCBI
ACC ?= AF086833

# The accession as a fasta file.
REF ?= refs/${ACC}.fa

# The accession as a genbank file.
GBK ?= refs/${ACC}.gb

# The accession as a gff file.
GFF ?= refs/${ACC}.gff

# Check the make version.
ifeq ($(origin .RECIPEPREFIX), undefined)
$(error "### Error! Please use GNU Make 4.0 or later ###")
endif

# Makefile customizations.
.DELETE_ON_ERROR:
.ONESHELL:
SHELL := bash
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

MAKEFLAGS += --warn-undefined-variables --no-print-directory

# Print usage information.
usage::
@echo "#"
@echo "# genbank.mk: download sequences from GenBank"
@echo "#"
@echo "# ACC=${ACC}"
@echo "# REF=${REF}"
@echo "# GBK=${GBK}"
@echo "# GFF=${GFF}"
@echo "#"
@echo "# make fasta genbank gff run"
@echo "#"

# Obtain a sequence from NCBI.
${REF}:
mkdir -p $(dir $@)
bio fetch ${ACC} --format fasta > $@

${GBK}:
mkdir -p $(dir $@)
bio fetch ${ACC} > $@

${GFF}:
mkdir -p $(dir $@)
bio fetch ${ACC} --format gff > $@

# Download FASTA file.
fasta:: ${REF}
@ls -lh ${REF}

# Download GenBank file.
genbank:: ${GBK}
@ls -lh ${GBK}

# Download GFF file.
gff:: ${GFF}
@ls -lh ${GFF}

# Remove FASTA file.
fasta!::
rm -rf ${REF}

# Remove GenBank file.
genbank!::
rm -rf ${GBK}

# Remove GFF file.
gff!::
rm -rf ${GFF}

# The run target is FASTA
run:: fasta

# Undo run target
run!:: fasta!

test:
make -f src/run/genbank.mk fasta! fasta ACC=${ACC} REF=${REF}
make -f src/run/genbank.mk gff! gff ACC=${ACC} GFF=${GFF}
make -f src/run/genbank.mk genbank! genbank ACC=${ACC} GBK=${GBK}

# Installation instructions
install::
@echo pip install bio --upgrade

# Targets that are not files.
.PHONY: usage install run run! test fasta fasta! genbank genbank! gff gff!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#
# Downloads sequencing reads from SRA.
#

# The directory that stores FASTQ reads that we operate on.
DIR ?= reads

# SRR number (sequencing run from the Ebola outbreak data of 2014)
SRR ?= SRR1553425

# The name of the unpacked reads
P1 ?= ${DIR}/${SRR}_1.fastq
P2 ?= ${DIR}/${SRR}_2.fastq

# The name of the final reads (may be the same)
R1 ?= ${P1}
R2 ?= ${P2}

# How many reads to download (N=ALL downloads everything).
N ?= 10000

# Makefile customizations.
.DELETE_ON_ERROR:
SHELL := bash
.ONESHELL:
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# Print usage information.
usage::
@echo "#"
@echo "# sra.mk: downloads FASTQ reads from SRA"
@echo "#"
@echo "# make run SRR=${SRR} N=${N}"
@echo "#"

# Set the flags for the download
ifeq ($(N), ALL)
FLAGS ?= -F --split-files
else
FLAGS ?= -F --split-files -X ${N}
endif

# Obtain the reads from SRA.
${R1}:
mkdir -p ${DIR}
fastq-dump --gzip ${FLAGS} -O ${DIR} ${SRR}

@if [ "${P1}" != "${R1}" ]; then \
mv -f ${P1} ${R1}; \
fi

@if [ -f ${P2} ] && [ "${P2}" != "${R2}" ]; then \
mv -f ${P2} ${R2}; \
fi


# List the data. We don't know if the reads are paired or not.
run: ${R1}
@if [ -f ${R2} ]; then
@ls -lh ${R1} ${R2}
else
@ls -lh ${R1}
fi

# Removes the SRA files.
run!:
rm -f ${P1} ${P2} ${R1} ${R2}

test:
# Get the FASTQ reads.
make -f src/run/sra.mk SRR=${SRR} run! run

install::
@echo micromamba install sra-tools

# Targets that are not files.
.PHONY: usage run run! test install
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#
# Generate alignments with bwa
#

# The reference genome.
REF ?= refs/genome.fa

# The directory that holds the index.
IDX_DIR = $(dir ${REF})/idx

# The name of the index
IDX ?= ${IDX_DIR}/$(notdir ${REF})

# A file in the index directory; used as a proxy target for the whole bwa index.
IDX_FILE ?= ${IDX}.ann

# Number of CPUS
NCPU ?= 2

# Additional flags to pass to BWA.
BWA_FLAGS ?= -t ${NCPU}

# Sam filter flags to filter the BAM file before sorting.
SAM_FLAGS ?=

# First in pair.
R1 ?= reads/reads1.fq

# Second in pair. Runs in single end mode if not defined.
R2 ?=

# The alignment file.
BAM ?= bam/alignment.bam

# Unsorted BAM file (intermediate output, removed by align!/run!).
BAM_TMP ?= $(basename ${BAM}).unsorted.bam

# Set the values for the read groups.
ID ?= run1
SM ?= sample1
LB ?= library1
PL ?= ILLUMINA

# Build the read groups tag passed to bwa mem via -R.
RG ?= '@RG\tID:${ID}\tSM:${SM}\tLB:${LB}\tPL:${PL}'

# Makefile customizations.
.DELETE_ON_ERROR:
SHELL := bash
.ONESHELL:
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# Print usage information.
usage:
@echo "#"
@echo "# bwa.mk: align reads using BWA"
@echo "#"
@echo "# R1=${R1}"
ifneq (${R2},)
@echo "# R2=${R2}"
endif
@echo "# REF=${REF}"
@echo "# IDX=${IDX}"
@echo "# BAM=${BAM}"
@echo "#"
@echo "# make index align"
@echo "#"

# Read 1 must exist; fail loudly if it does not.
${R1}:
@echo "# Read 1 file not found: R1=${R1}"
@exit -1

# If R2 is set, it must exist.
ifneq (${R2},)
${R2}:
@echo "# Read 2 file not found: R2=${R2}"
@exit -1
endif

# Bail out if reference is not found
${REF}:
echo "# Reference not found: ${REF}";
exit -1

# Index the reference genome.
${IDX_FILE}: ${REF}
@mkdir -p $(dir $@)
bwa index -p ${IDX} ${REF}

# Create the index.
index: ${IDX_FILE}
@echo "# bwa index: ${IDX}"

# Remove the index.
index!:
rm -rf ${IDX_FILE}

# Alignment (paired end when R2 is set; single end otherwise, since an
# empty R2 contributes no prerequisite and no extra bwa mem argument).
${BAM_TMP}: ${R1} ${R2}
@if [ ! -f ${IDX_FILE} ]; then
echo "# bwa index not found: IDX=${IDX}";
exit -1
fi
mkdir -p $(dir $@)
bwa mem ${BWA_FLAGS} -R ${RG} ${IDX} ${R1} ${R2} | samtools view -b ${SAM_FLAGS} -o ${BAM_TMP}

# Sort the BAM file.
${BAM}: ${BAM_TMP}
mkdir -p $(dir $@)
samtools sort -@ ${NCPU} ${BAM_TMP} -o ${BAM}

# Create the BAM index file.
${BAM}.bai: ${BAM}
samtools index ${BAM}

# Generate the alignment.
align: ${BAM}.bai
@ls -lh ${BAM}

# Remove the alignment outputs (sorted, unsorted and index files).
align!:
rm -f ${BAM_TMP} ${BAM} ${BAM}.bai

# Create a wiggle coverage file.
wiggle: ${BAM}.bai ${REF}
@make -f src/run/wiggle.mk BAM=${BAM} REF=${REF} run

# Running creates the wiggle file as well.
run: align wiggle

# Remove the BAM file.
run!:
rm -rf ${BAM} ${BAM_TMP} ${BAM}.bai

# Run the test.
test:
make -f src/run/tester.mk test_aligner MOD=src/run/bwa.mk

# The name of the stats file.
STATS = $(basename ${BAM}).stats

# Generate stats on the alignment with samtools flagstat.
${STATS}: ${BAM}.bai
samtools flagstat ${BAM} > ${STATS}

# Trigger the statistics generation.
stats: ${STATS}
@echo "# ${STATS}"
@cat ${STATS}

# Install required software.
install::
@echo micromamba install bwa samtools



# Targets that are not files.
.PHONY: usage index align run run! test wiggle install

检测基因组变异

1
make -f src/recipes/variant-calling.mk run
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#
# Biostar Workflows: https://www.biostarhandbook.com/
#

# Genome accession.
ACC = KM233118

# The reference genome.
REF = refs/${ACC}.fa

# SRR number.
SRR = SRR1972918

# The read pairs. Read 1 and read 2 must be distinct files;
# these names match what sra.mk produces via R1=/R2= below.
R1 = reads/${SRR}_1.fq
R2 = reads/${SRR}_2.fq

# The alignment file.
BAM = bam/${SRR}.bam

# The variants with bcftools.
VCF = vcf/${SRR}.vcf.gz

# Number of CPUS
NCPU = 4

# Makefile customizations.
SHELL := bash
.DELETE_ON_ERROR:
.ONESHELL:
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

usage:
	@echo "#"
	@echo "# Short read variant calling."
	@echo "#"
	@echo "#"
	@echo "# make run"
	@echo "#"

# Run the full pipeline: download, align, call variants.
run:
	# Download the reference genome.
	make -f src/run/genbank.mk fasta gff ACC=${ACC} REF=${REF}
	# Download the short reads.
	make -f src/run/sra.mk run SRR=${SRR} N=2500 R1=${R1} R2=${R2}
	# Index the genome then align the reads to the reference genome.
	make -f src/run/bwa.mk index run R1=${R1} R2=${R2} REF=${REF} BAM=${BAM} NCPU=${NCPU}
	# Call variants with bcftools.
	make -f src/run/bcftools.mk run REF=${REF} BAM=${BAM} VCF=${VCF} NCPU=${NCPU}

# Remove the variant calls generated by the pipeline
run!:
	rm -f ${VCF}

# Targets that are not files.
.PHONY: usage run run!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#
# Downloads NCBI data vie Entrez API
#

# Accession number at NCBI
ACC ?= AF086833

# The accession as a fasta file.
REF ?= refs/${ACC}.fa

# The accession as a genbank file.
GBK ?= refs/${ACC}.gb

# The accession as a gff file.
GFF ?= refs/${ACC}.gff

# Check the make version.
ifeq ($(origin .RECIPEPREFIX), undefined)
$(error "### Error! Please use GNU Make 4.0 or later ###")
endif

# Makefile customizations.
.DELETE_ON_ERROR:
.ONESHELL:
SHELL := bash
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

MAKEFLAGS += --warn-undefined-variables --no-print-directory

# Print usage information.
usage::
@echo "#"
@echo "# genbank.mk: download sequences from GenBank"
@echo "#"
@echo "# ACC=${ACC}"
@echo "# REF=${REF}"
@echo "# GBK=${GBK}"
@echo "# GFF=${GFF}"
@echo "#"
@echo "# make fasta genbank gff run"
@echo "#"

# Obtain a sequence from NCBI.
${REF}:
mkdir -p $(dir $@)
bio fetch ${ACC} --format fasta > $@

${GBK}:
mkdir -p $(dir $@)
bio fetch ${ACC} > $@

${GFF}:
mkdir -p $(dir $@)
bio fetch ${ACC} --format gff > $@

# Download FASTA file.
fasta:: ${REF}
@ls -lh ${REF}

# Download GenBank file.
genbank:: ${GBK}
@ls -lh ${GBK}

# Download GFF file.
gff:: ${GFF}
@ls -lh ${GFF}

# Remove FASTA file.
fasta!::
rm -rf ${REF}

# Remove GenBank file.
genbank!::
rm -rf ${GBK}

# Remove GFF file.
gff!::
rm -rf ${GFF}

# The run target is FASTA
run:: fasta

# Undo run target
run!:: fasta!

test:
make -f src/run/genbank.mk fasta! fasta ACC=${ACC} REF=${REF}
make -f src/run/genbank.mk gff! gff ACC=${ACC} GFF=${GFF}
make -f src/run/genbank.mk genbank! genbank ACC=${ACC} GBK=${GBK}

# Installation instructions
install::
@echo pip install bio --upgrade

# Targets that are not files.
.PHONY: usage install run run! test fasta fasta! genbank genbank! gff gff!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#
# Downloads sequencing reads from SRA.
#

# The directory that stores FASTQ reads that we operate on.
DIR ?= reads

# SRR number (sequencing run from the Ebola outbreak data of 2014)
SRR ?= SRR1553425

# The name of the unpacked reads
P1 ?= ${DIR}/${SRR}_1.fastq
P2 ?= ${DIR}/${SRR}_2.fastq

# The name of the final reads (may be the same)
R1 ?= ${P1}
R2 ?= ${P2}

# How many reads to download (N=ALL downloads everything).
N ?= 10000

# Makefile customizations.
.DELETE_ON_ERROR:
SHELL := bash
.ONESHELL:
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# Print usage information.
usage::
@echo "#"
@echo "# sra.mk: downloads FASTQ reads from SRA"
@echo "#"
@echo "# make run SRR=${SRR} N=${N}"
@echo "#"

# Set the flags for the download
ifeq ($(N), ALL)
FLAGS ?= -F --split-files
else
FLAGS ?= -F --split-files -X ${N}
endif

# Obtain the reads from SRA.
${R1}:
mkdir -p ${DIR}
fastq-dump ${FLAGS} -O ${DIR} ${SRR}

@if [ "${P1}" != "${R1}" ]; then \
mv -f ${P1} ${R1}; \
fi

@if [ -f ${P2} ] && [ "${P2}" != "${R2}" ]; then \
mv -f ${P2} ${R2}; \
fi


# List the data. We don't know if the reads are paired or not.
run: ${R1}
@if [ -f ${R2} ]; then
@ls -lh ${R1} ${R2}
else
@ls -lh ${R1}
fi

# Removes the SRA files.
run!:
rm -f ${P1} ${P2} ${R1} ${R2}

test:
# Get the FASTQ reads.
make -f src/run/sra.mk SRR=${SRR} run! run

install::
@echo micromamba install sra-tools

# Targets that are not files.
.PHONY: usage run run! test install
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#
# Generate alignments with bwa
#

# The reference genome.
REF ?= refs/genome.fa

# The directory that holds the index.
IDX_DIR = $(dir ${REF})/idx

# The name of the index
IDX ?= ${IDX_DIR}/$(notdir ${REF})

# A file in the index directory.
IDX_FILE ?= ${IDX}.ann

# Number of CPUS
NCPU ?= 2

# Additional flags to pass to BWA.
BWA_FLAGS ?= -t ${NCPU}

# Sam filter flags to filter the BAM file before sorting.
SAM_FLAGS ?=

# First in pair.
R1 ?= reads/reads1.fq

# Second in pair. Runs in single end mode if not defined.
R2 ?=

# The alignment file.
BAM ?= bam/alignment.bam

# Unsorted BAM file.
BAM_TMP ?= $(basename ${BAM}).unsorted.bam

# Set the values for the read groups.
ID ?= run1
SM ?= sample1
LB ?= library1
PL ?= ILLUMINA

# Build the read groups tag.
RG ?= '@RG\tID:${ID}\tSM:${SM}\tLB:${LB}\tPL:${PL}'

# Makefile customizations.
.DELETE_ON_ERROR:
SHELL := bash
.ONESHELL:
.SHELLFLAGS := -eu -o pipefail -c
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# Print usage information.
usage:
@echo "#"
@echo "# bwa.mk: align reads using BWA"
@echo "#"
@echo "# R1=${R1}"
ifneq (${R2},)
@echo "# R2=${R2}"
endif
@echo "# REF=${REF}"
@echo "# IDX=${IDX}"
@echo "# BAM=${BAM}"
@echo "#"
@echo "# make index align"
@echo "#"

# Read 1 must exist.
${R1}:
@echo "# Read 1 file not found: R1=${R1}"
@exit -1

# If R2 is set, it must exist.
ifneq (${R2},)
${R2}:
@echo "# Read 2 file not found: R2=${R2}"
@exit -1
endif

# Bail out if reference is not found
${REF}:
echo "# Reference not found: ${REF}";
exit -1

# Index the reference genome.
${IDX_FILE}: ${REF}
@mkdir -p $(dir $@)
bwa index -p ${IDX} ${REF}

# Create the index.
index: ${IDX_FILE}
@echo "# bwa index: ${IDX}"

# Remove the index.
index!:
rm -rf ${IDX_FILE}

# Paired end alignment.
${BAM_TMP}: ${R1} ${R2}
@if [ ! -f ${IDX_FILE} ]; then
echo "# bwa index not found: IDX=${IDX}";
exit -1
fi
mkdir -p $(dir $@)
bwa mem ${BWA_FLAGS} -R ${RG} ${IDX} ${R1} ${R2} | samtools view -b ${SAM_FLAGS} -o ${BAM_TMP}

# Sort the BAM file.
${BAM}: ${BAM_TMP}
mkdir -p $(dir $@)
samtools sort -@ ${NCPU} ${BAM_TMP} -o ${BAM}

# Create the BAM index file.
${BAM}.bai: ${BAM}
samtools index ${BAM}

# Generate the alignment.
align: ${BAM}.bai
@ls -lh ${BAM}

# Generate the alignment.
align!:
rm -f ${BAM_TMP} ${BAM} ${BAM}.bai

# Create a wiggle coverage file.
wiggle: ${BAM}.bai ${REF}
@make -f src/run/wiggle.mk BAM=${BAM} REF=${REF} run

# Running creates the wiggle file as well.
run: align wiggle

# Remove the BAM file.
run!:
rm -rf ${BAM} ${BAM_TMP} ${BAM}.bai

# Run the test.
test:
make -f src/run/tester.mk test_aligner MOD=src/run/bwa.mk

# The name of the stats file.
STATS = $(basename ${BAM}).stats

# Generate stats on the alignme
${STATS}: ${BAM}.bai
samtools flagstat ${BAM} > ${STATS}

# Trigger the statistics generation.
stats: ${STATS}
@echo "# ${STATS}"
@cat ${STATS}

# Install required software.
install::
@echo micromamba install bwa samtools



# Targets that are not files.
.PHONY: usage index align run run! test wiggle install

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#
# Generates SNP calls with bcftools.
#

# A root to derive output default names from (overridable like the rest).
SRR ?= SRR1553425

# Number of CPUS
NCPU ?= 2

# Genbank accession number.
ACC ?= AF086833

# The reference genome.
REF ?= refs/${ACC}.fa

# The alignment file.
BAM ?= bam/${SRR}.bam

# The variant file.
VCF ?= vcf/$(notdir $(basename ${BAM})).vcf.gz

# Additional bcf flags for pileup annotation.
PILE_FLAGS = -d 100 --annotate 'INFO/AD,FORMAT/DP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR,FORMAT/SP'

# Additional bcf flags for calling process.
CALL_FLAGS = --ploidy 2 --annotate 'FORMAT/GQ'

# Makefile customizations.
SHELL := bash
.DELETE_ON_ERROR:
.ONESHELL:
MAKEFLAGS += --warn-undefined-variables --no-print-directory

# The first target is always the help.
usage::
	@echo "#"
	@echo "# bcftools.mk: calls variants using bcftools"
	@echo "#"
	@echo "# REF=${REF}"
	@echo "# BAM=${BAM}"
	@echo "# VCF=${VCF}"
	@echo "#"
	@echo "# make run"
	@echo "#"

# Pile up, call, normalize and sort the variants.
# NCPU is forwarded to mpileup so callers passing NCPU actually get threads.
${VCF}: ${BAM} ${REF}
	mkdir -p $(dir $@)
	bcftools mpileup --threads ${NCPU} ${PILE_FLAGS} -O u -f ${REF} ${BAM} | \
	bcftools call ${CALL_FLAGS} -mv -O u | \
	bcftools norm -f ${REF} -d all -O u | \
	bcftools sort -O z > ${VCF}

# Tabix index for the compressed VCF.
${VCF}.tbi: ${VCF}
	bcftools index -t -f $<

run:: ${VCF}.tbi
	@ls -lh ${VCF}

run!::
	rm -rf ${VCF} ${VCF}.tbi

install::
	@echo micromamba install bcftools

# Test the entire pipeline.
test:
	make -f src/run/tester.mk test_caller MOD=src/run/bwa.mk CALL=src/run/bcftools.mk

# Targets that are not files.
.PHONY: run run! install usage test

bcftools

freebayes

GATK

DeepVariant