initial commit

Angeline Aguinaldo
2020-11-12 20:18:05 -05:00
commit db57e4495d
11 changed files with 1061 additions and 0 deletions

90
.gitignore vendored Normal file

@@ -0,0 +1,90 @@
**/data/metasim-strawman_envassay.tsv
**/data/test
tmp
src
### Intellij+all ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### Intellij+all Patch ###
# Ignores the whole .idea folder and all .iml files
# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360
.idea/
# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
*.iml
modules.xml
.idea/misc.xml
*.ipr
# Sonarlint plugin
.idea/sonarlint

82
Dockerfile Normal file

@@ -0,0 +1,82 @@
# docker build -t meta_simulator:latest .
FROM python:3.8-slim
# Explicit user (declared at the top of the file to avoid UID/GID conflicts later)
# The ID should match meta_system's to prevent permission conflicts
ENV APP_USER simulator
ENV APP_WORK_DIR /home/${APP_USER}
ENV CONDA_VERSION 2020.07
RUN groupadd -r -g 999 ${APP_USER} && useradd -m -r -g ${APP_USER} -u 999 ${APP_USER}
# https://github.com/tianon/gosu/releases
# https://github.com/krallin/tini
RUN set -eux; \
apt-get update; \
apt-get install -y gosu tini; \
rm -rf /var/lib/apt/lists/*; \
gosu nobody true
# Install System Level Dependencies (Scripts, Simulator, guppy basecaller)
RUN set -eux; \
apt-get update; \
apt-get install -y libgl1-mesa-glx libegl1-mesa libxrandr2 libxss1 libxcursor1 libxcomposite1 libasound2 libxi6 libxtst6; \
apt-get install --no-install-recommends -y bc git wget curl gawk gzip parallel build-essential libidn11; \
rm -rf /var/lib/apt/lists/*
# Setup Conda
# https://docs.anaconda.com/anaconda/install/linux/
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
ENV PATH /opt/conda/bin:$PATH
RUN set -eux; \
mkdir -p /opt/conda; \
chown ${APP_USER}:${APP_USER} /opt/conda; \
gosu ${APP_USER} wget --quiet https://repo.anaconda.com/archive/Anaconda3-${CONDA_VERSION}-Linux-x86_64.sh -O ${APP_WORK_DIR}/anaconda.sh; \
gosu ${APP_USER} /bin/bash ${APP_WORK_DIR}/anaconda.sh -b -u -p /opt/conda; \
rm ${APP_WORK_DIR}/anaconda.sh; \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
# Copy code to working directory
COPY --chown=${APP_USER}:${APP_USER} scripts ${APP_WORK_DIR}/scripts/
COPY --chown=${APP_USER}:${APP_USER} data/iss_model_iSeq_min120.npz ${APP_WORK_DIR}/data/iss_model_iSeq_min120.npz
COPY --chown=${APP_USER}:${APP_USER} data/strawman_envassay.tsv ${APP_WORK_DIR}/data/strawman_envassay.tsv
COPY --chown=${APP_USER}:${APP_USER} environment.yml ${APP_WORK_DIR}
# Switch User (this is fine for now because volumes mounted are 999 writable)
USER ${APP_USER}
# Tune Working Directory
WORKDIR ${APP_WORK_DIR}
# Install Conda dependencies for META Simulator
RUN set -eux; \
echo ". /opt/conda/etc/profile.d/conda.sh" | tee -a ~/.bashrc; \
echo "conda activate base" | tee -a ~/.bashrc; \
. /opt/conda/etc/profile.d/conda.sh; \
conda env create -f environment.yml
# Run install script for DeepSim, change permissions of shell files
RUN set -eux; \
. /opt/conda/etc/profile.d/conda.sh; \
conda activate simulator; \
chmod +x scripts/sim_module_wrapper.sh; \
chmod +x scripts/install.sh; \
${APP_WORK_DIR}/scripts/install.sh
# Install other dependencies for DeepSim
RUN set -eux; \
. /opt/conda/etc/profile.d/conda.sh; \
conda activate tensorflow_cdpm; \
pip install tensorflow==1.2.1; \
pip install tflearn==0.3.2; \
pip install tqdm==4.19.4; \
pip install scipy==0.18.1; \
pip install h5py==2.7.1; \
pip install numpy==1.13.1; \
pip install scikit-learn==0.20.3; \
pip install biopython==1.74
# Enable process reaper (tini as PID 1)
ENTRYPOINT ["/usr/bin/tini", "--"]

125
README.md Normal file

@@ -0,0 +1,125 @@
# Metagenomics Evaluation & Testing Analysis (META) Simulator
Compares open-source metagenomic classification tool performance (precision, sensitivity, runtime) across various
sequencing platforms ([Illumina MiSeq/iSeq](https://www.illumina.com/), [Oxford Nanopore MinION](https://nanoporetech.com/)) and use cases
(metagenomic profiles).
## Summary
- [Getting Started](#getting-started)
- [Running](#running)
- [License](#license)
## Getting Started
These instructions will get you a copy of the project up and running on
your local machine for development and testing purposes.
### Prerequisites
The META system has been designed to run on Linux (specifically, tested on Ubuntu 18.04) and in Docker containers.
The following packages are required:
* [Docker-ce 19.03](https://docs.docker.com/engine/)
Here is an example of how to install these on Ubuntu 18.04:
```bash
# Install Docker engine (reference: https://docs.docker.com/engine/install/ubuntu/)
sudo apt-get remove docker docker-engine docker.io containerd runc
sudo apt-get update
sudo apt-get install \
apt-transport-https \
ca-certificates \
curl \
gnupg-agent \
software-properties-common
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo add-apt-repository \
"deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) \
stable"
sudo apt-get update
sudo apt-get install docker-ce docker-ce-cli containerd.io
sudo docker run hello-world # to verify successful install
```
### Installing
To build the META Simulator, run the following from the root directory of `meta_simulator`:
```bash
docker build -t meta_simulator:latest .
```
### Integrating with Docker-based Meta System
To integrate the META Simulator with the Docker-based Meta System, export the `meta_simulator` image to a Docker tar file and place it in the `meta_system/data/docker` directory.
1. Export the `meta_simulator` image:
```bash
docker save -o meta_simulator.tar meta_simulator:latest
```
2. Move `meta_simulator.tar` to `meta_system/data/docker`
3. On `meta_system`, run `make load-docker` to confirm the image loads (see the sketch below)
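A minimal sketch of steps 2 and 3, assuming `meta_system` is checked out as a sibling directory of `meta_simulator` (the relative paths below are illustrative only):
```bash
# Hypothetical layout: meta_simulator/ and meta_system/ side by side
mv meta_simulator.tar ../meta_system/data/docker/
cd ../meta_system
make load-docker
```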
## Running
The META Simulator requires an abundance profile TSV. An *abundance profile* is expressed as a tab-delimited text file (TSV) where the first column contains the leaf taxonomic ID, the second column contains the corresponding abundance proportion (must sum to 1.000000), and the third column designates the organism as being foreground (`1`) or background (`0`). There should be no headers in the abundance profile TSV. An example is shown below:
```TSV
400667 0.10 1
435590 0.10 1
367928 0.10 1
864803 0.10 1
1091045 0.10 1
349101 0.10 1
1282 0.10 1
260799 0.10 1
1529886 0.10 1
198094 0.10 1
```
An example TSV is included within the Docker container at `data/strawman_envassay.tsv`.
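Before running, you can confirm that a profile's abundances sum to 1.000000 with the same check the wrapper script performs; a minimal sketch (replace the path with your own profile as needed):
```bash
# Should print 1.000000; mirrors the check in scripts/sim_module_wrapper.sh
awk -F'\t' '{x += $2} END {printf("%f\n", x)}' data/strawman_envassay.tsv
```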
The META Simulator accepts the following arguments:
* `-t` number of threads to use for simulations
* `-i` list of taxid with associated abundance (totalling 1.0)
* `-p` sequencing platform to simulate reads for (case sensitive)
* The options are:
* `iseq` Illumina iSeq 100
* `miseq` Illumina MiSeq (assuming both Illumina platforms have a spot count of 8M and taking 1/100 of this) [80,000]
* `r9` Oxford Nanopore R9 flowcell (MIN106) - best performance at 50Gbp output (will assume 20Gbp and 20kb avg read length = 1M reads) [10,000]
* `flg` Oxford Nanopore Flongle flowcell (FLG001) - best performance at 2Gbp output (1/25 of r9) (assuming 10% of r9 output) [1,000]
* `-o` Output directory (combined fastq file for classification will be at `$outdir/simulated.fastq`)
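For reference, the per-taxid read count is the taxon's abundance multiplied by the bracketed platform total above (80,000 for `iseq`/`miseq`, 10,000 for `r9`, 1,000 for `flg`). A minimal sketch of the calculation as done in `scripts/sim_module_wrapper.sh`:
```bash
# Abundance 0.10 on an 80,000-read platform (iseq/miseq) -> 8000 reads for that taxid
abu=0.10
count=$(printf "$abu" | awk '{printf("%.0f", $0*80000)}')
echo "$count"  # prints 8000
```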
### Deep Simulator
To run DeepSimulator ([Nanopore R9 flowcell](https://store.nanoporetech.com/us/flowcells/spoton-flow-cell-mk-i-r9-4.html)) using META Simulator, run:
```bash
docker run meta_simulator:latest bash scripts/sim_module_wrapper.sh -t 2 -i data/strawman_envassay.tsv -p r9 -o data/test
```
To run DeepSimulator ([Nanopore Flongle flowcell](https://store.nanoporetech.com/us/flowcells/flongle-flow-cell.html)) using META Simulator, run:
```bash
docker run meta_simulator:latest bash scripts/sim_module_wrapper.sh -t 2 -i data/strawman_envassay.tsv -p flg -o data/test
```
### InsilicoSeq
To run InsilicoSeq ([Illumina MiSeq](https://www.illumina.com/systems/sequencing-platforms/miseq.html)) using META Simulator, run:
```bash
docker run meta_simulator:latest bash scripts/sim_module_wrapper.sh -t 2 -i data/strawman_envassay.tsv -p miseq -o data/test
```
To run InsilicoSeq ([Illumina iSeq](https://www.illumina.com/systems/sequencing-platforms/iseq.html)) using META Simulator, run:
```bash
docker run meta_simulator:latest bash scripts/sim_module_wrapper.sh -t 2 -i data/strawman_envassay.tsv -p iseq -o data/test
```
If you wish to run the simulator with your own abundance profile, use the [Docker bind mount](https://docs.docker.com/storage/bind-mounts/) `-v` flag of `docker run` to mount the directory containing your abundance profile TSV.
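For example, a sketch with placeholder paths (`/abs/path/to/profiles` and `my_profile.tsv` are hypothetical):
```bash
# Mount a host directory into the container's working directory (/home/simulator)
docker run -v /abs/path/to/profiles:/home/simulator/profiles meta_simulator:latest \
    bash scripts/sim_module_wrapper.sh -t 2 -i profiles/my_profile.tsv -p iseq -o profiles/output
```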
## License
This project is licensed under [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0).
Copyright Johns Hopkins University Applied Physics Laboratory.

Binary file not shown.

data/strawman_envassay.tsv Normal file

@@ -0,0 +1,10 @@
400667 0.10
435590 0.10
367928 0.10
864803 0.10
1091045 0.10
349101 0.10
1282 0.10
260799 0.10
1529886 0.10
198094 0.10

65
environment.yml Normal file

@@ -0,0 +1,65 @@
name: simulator
channels:
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- asn1crypto=1.3.0=py36_0
- biopython=1.76=py36h7b6447c_0
- blas=1.0=mkl
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2020.1.1=0
- certifi=2019.11.28=py36_0
- cffi=1.14.0=py36h2e261b9_0
- chardet=3.0.4=py36_1003
- cryptography=2.8=py36h1ba5d50_0
- curl=7.68.0=hbc83047_0
- future=0.18.2=py36_0
- idna=2.8=py36_0
- insilicoseq=1.4.5=py_0
- intel-openmp=2020.0=166
- joblib=0.14.1=py_0
- krb5=1.17.1=h173b8e3_0
- ld_impl_linux-64=2.33.1=h53a641e_7
- libcurl=7.68.0=h20c2e04_0
- libdeflate=1.0=h14c3975_1
- libedit=3.1.20181209=hc058e9b_0
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=9.1.0=hdf63c60_0
- libgfortran-ng=7.3.0=hdf63c60_0
- libssh2=1.8.2=h1ba5d50_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- mkl=2020.0=166
- mkl-service=2.3.0=py36he904b0f_0
- mkl_fft=1.0.15=py36ha843d7b_0
- mkl_random=1.1.0=py36hd6b4f25_0
- ncurses=6.1=he6710b0_1
- numpy=1.18.1=py36h4f9e942_0
- numpy-base=1.18.1=py36hde5b4d6_1
- openssl=1.1.1d=h7b6447c_4
- pandas=1.0.1=py36h0573a6f_0
- pip=20.0.2=py36_1
- pycparser=2.19=py36_0
- pyopenssl=19.1.0=py36_0
- pysam=0.15.3=py36hda2845c_1
- pysocks=1.7.1=py36_0
- python=3.6.10=h0371630_0
- python-dateutil=2.8.1=py_0
- pytz=2019.3=py_0
- readline=7.0=h7b6447c_5
- requests=2.22.0=py36_1
- scipy=1.4.1=py36h0b6359f_0
- setuptools=45.2.0=py36_0
- six=1.14.0=py36_0
- sqlite=3.31.1=h7b6447c_0
- tk=8.6.8=hbc83047_0
- urllib3=1.25.8=py36_0
- wheel=0.34.2=py36_0
- xz=5.2.4=h14c3975_4
- zlib=1.2.11=h7b6447c_3
- pip:
  - bioseq==0.2.3
  - h5py==2.10.0
  - ont-fast5-api==3.0.1
  - progressbar33==2.4

87
scripts/install.sh Normal file

@@ -0,0 +1,87 @@
#!/bin/bash
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
#### Define environment variables
run_dir=$PWD
g="$(which python)"
baseBin=$(dirname ${g})
src_bin="${run_dir}/src"
mkdir $src_bin
CONDA_BASE=$(conda info --base)
source "$CONDA_BASE/etc/profile.d/conda.sh"
script_location="$(perl -MCwd=abs_path -le 'print abs_path(shift)' $(which $(basename $0)))"
##########Install DeepSimulator##########################################
git clone https://github.com/lykaust15/DeepSimulator.git $src_bin/DeepSimulator
cd $src_bin/DeepSimulator
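# Patch DeepSimulator's download/install scripts to use 'conda activate'/'conda deactivate'
# instead of the deprecated 'source activate'/'source deactivate'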
find "$PWD" -name "*download*.sh" -print0 | while read -d $'\0' fn; do sed -Ei "s/source activate/conda activate/g" $fn ; done
find "$PWD" -name "*install.sh" -print0 | while read -d $'\0' fn; do sed -Ei "s/source activate/conda activate/g" $fn ; done
find "$PWD" -name "*install.sh" -print0 | while read -d $'\0' fn; do sed -Ei "s/source deactivate/conda deactivate/g" $fn ; done
echo $PWD
grep -q "source $CONDA_BASE/etc/profile.d/conda.sh" install.sh
if [[ $? != 0 ]] ; then
echo "source $CONDA_BASE/etc/profile.d/conda.sh" | cat - install.sh > temp && mv temp install.sh
fi
bash install.sh
#-> 2. install basecaller
#--| 2.1 install albacore_2.3.1
cd base_caller/albacore_2.3.1/
./download_and_install.sh
cd ../../
#--| 2.2 install guppy_3.1.5
cd base_caller/guppy_3.1.5/
./download_and_install.sh
cd ../../
cd $run_dir
#######Ont guppy upgrade to 3.4.5########################################
wget https://americas.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_3.4.5_linux64.tar.gz -P src/
mkdir $src_bin/DeepSimulator/base_caller/guppy_3.4.5
tar -xvzf src/ont-guppy-cpu_3.4.5_linux64.tar.gz --directory $src_bin/DeepSimulator/base_caller/guppy_3.4.5
rm -rf src/ont-guppy-cpu_3.4.5_linux64.tar.gz
#Enable fast mode (less accurate) for cpu basecalling
sed -Ei "s/hac/fast/g" $src_bin/DeepSimulator/deep_simulator.sh
sed -Ei "s/guppy=guppy_3\.[0-9]*\.[0-9]*/guppy=guppy_3\.4\.5/g" $src_bin/DeepSimulator/deep_simulator.sh
sed -Ei "s/source deactivate/conda deactivate/g" $src_bin/DeepSimulator/deep_simulator.sh
# Create a new DeepSimulator script that exits early, i.e. no basecalling; it only generates fast5 files
cp $src_bin/DeepSimulator/deep_simulator.sh $src_bin/DeepSimulator/deep_simulator_fast5only.sh
sed -Ei "s/guppy=guppy_3\.[0-9]*\.[0-9]*/exit 1/g" $src_bin/DeepSimulator/deep_simulator_fast5only.sh
#Symlink the two files into the conda environment bin folder
ln -sf $src_bin/DeepSimulator/deep_simulator.sh\
$baseBin/
ln -sf $src_bin/DeepSimulator/deep_simulator_fast5only.sh\
$baseBin/
# Because albacore doesn't install correctly through the DeepSimulator install script, reinstall it here
source activate basecall
wget https://mirror.oxfordnanoportal.com/software/analysis/ont_albacore-2.3.1-cp36-cp36m-manylinux1_x86_64.whl \
-P $src_bin/
pip install $src_bin/ont_albacore-2.3.1-cp36-cp36m-manylinux1_x86_64.whl
rm $src_bin/*albacore*

scripts/remapOxfordFastq.sh Normal file

@@ -0,0 +1,82 @@
#!/bin/bash
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
# FUNCTIONS
usage()
{
cat << EOF
Help message for remapOxfordFastq.sh:
DESCRIPTION
NOTES:
- WARNING:
USAGE:
bash scripts/remapOxfordFastq.sh -i data/fullDeepSim/metasim-strawman_envassay.tsv/r9/ -o pass_mapped.fastq
OPTIONS:
-h help show this message
-i FASTQ Input fastq containing directory from sim_module deepsim script run
-o FASTQ output fastq with mapped (3rd column) as runid
NOTES:
This script expects the input to be the metasim output folder whose first-level children are taxid directories (400667, 1282, etc.); each accession/sequence-header directory beneath those contains the pass.fastq file.
____________________________________________________________________________________________________
References:
1. O. Tange (2011): GNU Parallel - The Command-Line Power Tool, ;login: The USENIX Magazine, February 2011:42-47.
EOF
}
# ARGUMENTS
# parse args
while getopts "ho:i:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
o) output_file=$OPTARG ;;
i) INPUT=$OPTARG ;;
?) usage; exit ;;
esac
done
# check args
if [[ -z "$INPUT" ]]; then printf "%s\n" "Please specify input directory containing fasta file headers as directory with deepsim output (pass.fastq) (-i)."; exit; fi
if [[ -z "$output_file" ]]; then printf "%s\n" "Please specify output filename in the same directory as the pass.fastq file (-o)."; exit; fi
if [[ "$output_file" == "pass.fastq" ]]; then printf "%s\n" "Please specify output file name different than the pass.fastq filename."; exit; fi
for file in $(find $INPUT -maxdepth 5 -name "pass.fastq"); do # Not recommended, will break on whitespace
dir=$(dirname $file)
baseDir=$(basename $dir)
# echo $dir $baseDir
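# Rewrite every FASTQ header (line 1 of each 4-line record) to "@<parent directory name>_<running index>",
# leaving sequence, separator, and quality lines unchanged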
awk -v name=$baseDir 'BEGIN{li=0; inc=0;}{if( NR%4 == 1 ) {++li; print "@"name"_"li}else{print}}' $file > $dir"/"$output_file
#awk -v name=$baseDir 'BEGIN{li=0; inc=0;}{if( NR%4 == 1 ) {++li; print "@"name"_"li}}' $file
done

42
scripts/separate_seqs.py Normal file

@@ -0,0 +1,42 @@
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
import sys
import argparse
import csv
##########################Take in Args######################################################
parser = argparse.ArgumentParser(description="Parse a (possibly multi-line) FASTA file and write each record to its own FASTA file")
parser.add_argument('-i', required=True, type=str, nargs='+', help='Input FASTA file; may contain multiple records')
parser.add_argument('-n', required=True, type=str, nargs='+', help='Number of total reads you want')
parser.add_argument('-o', required=True, type=str, nargs='+', help='Output directory')
args = parser.parse_args()
##############################################################
from Bio import SeqIO
# First pass: count the total number of bases across all input records
count = 0
for seq_record in SeqIO.parse(vars(args)['i'][0], "fasta"):
    count += len(seq_record.seq)
# Second pass: write each record to its own single-sequence FASTA file named <record id>.fasta
for seq_record in SeqIO.parse(vars(args)['i'][0], "fasta"):
    # print(str(seq_record.seq))
    fp = open(vars(args)['o'][0] + "/" + seq_record.id + ".fasta", "w")
    fp.write(">" + seq_record.id + "\n" + str(seq_record.seq))
    fp.close()

scripts/sim_module_wrapper.sh Normal file

@@ -0,0 +1,330 @@
#!/bin/bash
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
# FUNCTIONS
usage()
{
cat << EOF
Help message for sim_module:
DESCRIPTION
NOTES:
- WARNING:
USAGE:
bash sim_module_wrapper.sh -i </absolute/path/to/taxa_abundance_profile.tsv>
bash scripts/sim_module_wrapper.sh -t 10 -i data/test.tsv -p iseq
OPTIONS:
-h help show this message
-H STR Home directory specification for the sim_module_wrapper.sh file (OPTIONAL)
-t INT number of threads to use for simulations
-i TSV list of taxid with associated abundance (totalling 1.0)
TAXID can be at any taxonomic level, however the first accession
found when searching the 'taxid' column of "assembly_summary_refseq.txt"
will be used as the reference for simulating reads
-p STR sequencing platform to simulate reads for (case sensitive)
available options [total reads simulated]:
iseq Illumina iSeq 100 (assuming both illumina platforms have spot count of 8M, and taking 1/100 of this) [80,000]
miseq Illumina MiSeq [80,000]
r9 Oxford Nanopore R9 flowcell (MIN106) - best performance at 50Gbp output (will assume 20Gbp and 20kb avg read length = 1M reads) [10,000]
flg Oxford Nanopore Flongle flowcell (FLG001) - best performance at 2Gbp output (1/25 of r9) (assuming 10% of r9 output) [1,000]
pending options:
r10 Oxford Nanopore R10 flowcell (MIN107) <- not implemented
-j INT If using Deep Simulator (r9,flg) specify if you want to only generate fast5 files.
Available Options:
1 Full process from fast5 generation to fastq/basecalling
2 Exit early; generate only fast5 files
-o DIR Output directory (combined fastq file for classification will be at "$outdir/simulated.fastq")
NOTES:
Input abundance profile (abundance based on proportion bps of reads, not genome size)
____________________________________________________________________________________________________
References:
1. O. Tange (2011): GNU Parallel - The Command-Line Power Tool, ;login: The USENIX Magazine, February 2011:42-47.
EOF
}
# ARGUMENTS
# parse args
while getopts "ht:i:p:r:j:H:o:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
t) THREADS=$OPTARG ;;
i) INPUT=$OPTARG ;;
p) PLATFORM=$OPTARG ;;
H) HOME_DIR=$OPTARG ;;
r) READSCOUNT=$OPTARG;;
j) r9cfg=$OPTARG;;
o) OUTPUT=$OPTARG;;
?) usage; exit ;;
esac
done
# check args
if [[ -z "$THREADS" ]]; then printf "%s\n" "Please specify number of threads (-t)."; exit; fi
if [[ -z "$INPUT" ]]; then printf "%s\n" "Please specify input tsv (-i)."; exit; fi
if [[ -z "$PLATFORM" ]]; then printf "%s\n" "Please specify sequencing platform (-p)."; exit; fi
#if [[ -z $READSCOUNT ]]; then READSCOUNT=10000; fi; # 20200520, readcount will be based on platform type
if [[ -z $HOME_DIR ]]; then HOME_DIR=$PWD; fi
if [[ -z $r9cfg ]]; then r9cfg=1; fi
if [[ -z $OUTPUT ]]; then printf "%s\n" "Please specify a final output directory (-o)."; exit; fi
if [[ ! -d "$OUTPUT" ]]; then mkdir -p "$OUTPUT"; fi
#Activate the correct environment for simulator (iss and deepsim)
# setup other variables
absolute_path_x="$(readlink -fn -- "$0"; echo x)"
absolute_path_of_script="${absolute_path_x%x}"
scriptdir=$(dirname "$absolute_path_of_script")
runtime=$(date +"%Y%m%d%H%M%S%N")
dn=$(dirname "$INPUT")
bn=$(basename "$INPUT")
#outdir="$dn/metasim-$bn"
outdir="$OUTPUT"
tmp="$outdir/tmp"
if [[ ! -d "$tmp" ]]; then
mkdir -p "$tmp"
fi
# MAIN
echo "Checking that abundance profile sums to 1.000000 (input.tsv column 2)."
check=$(awk -F'\t' '{x+=$2}END{printf("%f",x)}' "$INPUT" | cut -c1-8)
if [[ "$check" != "1.000000" ]]; then
echo "Input abundances sum to $check"
echo "They must sum to 1 within a tolerance of <1 millionth (i.e. 1.000000). Exiting."; exit
else
echo "Check successful."
fi
# get updated assembly summary refseq (asr)
echo "Pulling latest 'assembly_summary_refseq.txt files."
asr="$tmp/assembly_summary_refseq.txt"
for k in "archaea" "bacteria" "fungi" "invertebrate" "other" "plant" "protozoa" "vertebrate_mammalian" "vertebrate_other" "viral"; do
echo "downloading refseq summary for $k"
if [[ ! -f "$asr.tmp-$k" ]]; then
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/$k/assembly_summary.txt --output-document "$asr.tmp-$k"
fi
done 2> /dev/null
# combine all
find "$tmp" -maxdepth 1 -name "*assembly_summary_refseq.txt.tmp-*" -exec cat {} + > "$asr"
asri="$tmp/asr_input.tsv"
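# Join: for each taxid in column 1 of the input profile, emit the matching
# assembly_summary_refseq row (keyed on its taxid column, field 6)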
awk -F'\t' '{if(FNR==NR){rs[$6]=$0}else{printf("%s\n",rs[$1])}}' "$asr" "$INPUT" > "$asri"
# check if any accession are missing from the overlap
# find overlap between it and the input
check=$(comm -23 <(cut -f1 "$INPUT" | sort) <(cut -f6 "$asri" | sort))
if [[ "$check" != "" ]]; then
echo "The following accessions did not have a corresponding"
echo "path in the assembly summary refseq file:"
echo "$check"
echo "Please remove or subsitute the accession(s) above, and resubmit."; exit
fi
# pull ftp paths and download reference genomes for accessions in input tsv
# 1 assembly_accession
# 6 taxid <- strain if available, otherwise is species taxid
# 7 species_taxid
# 8 organism_name
# 9 infraspecific_name
# 20 ftp_path
mkdir -p "$outdir/$PLATFORM"
while read x; do
acc=$(printf "$x" | cut -f1)
taxid=$(printf "$x" | cut -f6)
name=$(printf "$x" | cut -f8)
path=$(printf "$x" | cut -f20)
bn=$(basename "$path")
echo " wgetting: $acc, $taxid, $name"
wget "$path/${bn}_genomic.fna.gz" --output-document "$outdir/$taxid.fasta.gz" 2> /dev/null
gunzip -f "$outdir/$taxid.fasta.gz"
# put all sequence strings under each header into a single line
awk '{if(NR == 1){printf("%s\n", $0)}else{if(substr($0,1,1) == ">"){printf("\n%s\n", $0)} else {printf("%s", $0)}}}END{printf("\n")}' "$outdir/$taxid.fasta" > "$outdir/$taxid.fasta.tmp"
# only retain contigs/assemblies >1 Kbp
sed $'$!N;s/\\\n/\t/' "$outdir/$taxid.fasta.tmp" | awk -F'\t' '{if(length($2)>=1000){printf("%s\n%s\n",$1,$2)}}' > "$outdir/$taxid.fasta"
rm "$outdir/$taxid.fasta.tmp"
abu=$(grep -P "^$taxid\t" "$INPUT" | cut -f2)
# calculate number reads based on abundance in input and on:
# ILLUMINA (~4-8 million reads output), output 8 million reads
# MISEQ 2 x 150bp [80,000]
# ISEQ 2 x 150bp [80,000]
# OXFORD MINION (r9 ~20Gbp @ 20Kb avg read length), output 1 million reads
# R9 rapid library kit RAD004 [10,000]
# FLG rapid library kit RAD004 [1,000]
# R10 ligation library kit LSK009 [n/a]
# InSilicoSeq
# --cpus <int>, -p <int>
# number of cpus to use. (default: 2).
# --genomes <genomes.fasta> [<genomes.fasta> ...], -g <genomes.fasta> [<genomes.fasta> ...]
# Input genome(s) from where the reads will originate
# --draft <draft.fasta> [<draft.fasta> ...]
# Input draft genome(s) from where the reads will
# originate
# If you have draft genome files containing contigs, you can give them to the --draft option:
# --n_reads <int>, -n <int>
# Number of reads to generate (default: 1000000). Allows
# suffixes k, K, m, M, g and G (ex 0.5M for 500000).
# --model <npz>, -m <npz>
# Error model file. (default: None). Use HiSeq, NovaSeq
# or MiSeq for a pre-computed error model provided with
# the software, or a file generated with iss model. If
# you do not wish to use a model, use --mode basic or
# --mode perfect. The name of the built-in models are
# case insensitive.
# --output <fastq>, -o <fastq>
# Output file prefix (Required)
if [[ "$PLATFORM" == "iseq" ]]; then
count=$(printf "$abu" | awk '{printf("%.0f",$0*80000)}')
echo "Simulating $count reads for $taxid, $name, $PLATFORM, at abundance of $abu"
model=$HOME_DIR"/data/iss_model_iSeq_min120.npz"
source activate simulator
iss generate -p "$THREADS" --draft "$outdir/$taxid.fasta" -n "$count" -m "$model" -o "$outdir/$PLATFORM/$taxid" 2> "$outdir/error.log"
elif [[ "$PLATFORM" == "miseq" ]]; then
count=$(printf "$abu" | awk '{printf("%.0f",$0*80000)}')
echo "Simulating $count reads for $taxid, $name, $PLATFORM, at abundance of $abu"
model="MiSeq"
source activate simulator
iss generate -p "$THREADS" --draft "$outdir/$taxid.fasta" -n "$count" -m "$model" -o "$outdir/$PLATFORM/$taxid" 2> "$outdir/error.log"
elif [[ "$PLATFORM" == "r9" || "$PLATFORM" == "flg" ]]; then
if [[ "$PLATFORM" == "r9" ]]; then
# making r9 total reads default 10,000
count=$(printf "$abu" | awk '{printf("%.0f",$0*10000)}')
elif [[ "$PLATFORM" == "flg" ]]; then
# making flg total reads default 1,000
count=$(printf "$abu" | awk '{printf("%.0f",$0*1000)}')
fi
echo "Simulating $count reads for $taxid, $name, $PLATFORM, at abundance of $abu"
r9ScriptLocation=$HOME_DIR"/scripts/"
totalCountNucleotides=$( grep "^[^>]" "$outdir/$taxid.fasta" | tr -d "\n" | wc -c )
seqs=($( grep -R "^>" "$outdir/${taxid}.fasta" | tr " " "|" ))
total=${#seqs[*]}
if [ ! -d $r9ScriptLocation"../tmp" ]; then
mkdir $r9ScriptLocation"../tmp"
fi
if [ -d $r9ScriptLocation"../tmp/seqs" ]; then
rm -rf $r9ScriptLocation"../tmp/seqs"
fi
mkdir $r9ScriptLocation"../tmp/seqs/"
if [ -d $outdir/"$PLATFORM/$taxid" ]; then
rm -rf $outdir/"$PLATFORM/$taxid"
fi
if [ ! -d $outdir/"$PLATFORM/logs" ]; then
mkdir $outdir/"$PLATFORM/logs"
fi
if [ -f "$outdir/$PLATFORM/logs/pythonLog.txt" ]; then
rm "$outdir/$PLATFORM/logs/pythonLog.txt"
fi
if [ -f "$outdir/$PLATFORM/logs/simulationLog.txt" ]; then
rm "$outdir/$PLATFORM/logs/simulationLog.txt"
fi
mkdir -p $outdir/$PLATFORM/$taxid
source activate simulator
python "$r9ScriptLocation/"separate_seqs.py -i $outdir/$taxid.fasta -o $r9ScriptLocation"../tmp/seqs/" -n "$count" >> "$outdir/$PLATFORM/logs/pythonLog.txt" 2>&1
conda deactivate
files=()
while IFS= read -r -d $'\0'; do
files+=("$REPLY")
done < <(find ${r9ScriptLocation}"/../tmp/seqs/" -name "*.fasta" -print0)
for (( i=0; i < "${#files[@]}" ; i++ ))
do
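# Split this taxon's read budget across its contigs in proportion to contig length
# (the 0.5 offset is for rounding to the nearest read)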
length=$( grep -e '^[^>]' ${files[$i]} | tr -d "\n" | wc -c )
count_seq=$( echo "scale=20; 0.5+($count*$length)/$totalCountNucleotides" | bc -l | xargs printf %.0f )
echo "count_seq equals $count_seq"
bash "$r9ScriptLocation"/simulate.sh \
-i ${files[$i]} \
-n $count_seq \
-o "$outdir/$PLATFORM/$taxid" \
-c $THREADS \
-g "CPU" \
-d $HOME_DIR"/src/DeepSimulator" \
-j $r9cfg
find "$outdir/$PLATFORM/$taxid" -name "fast5" -type d -exec rm -rf "{}" \;
done >> "$outdir/$PLATFORM/logs/simulationLog.txt" 2>&1
rm -rf ${r9ScriptLocation}"/../tmp/seqs/*"
echo "done with this file $outdir $taxid"
fi
# NOTES:
# name your output fastq per taxid with the taxid, such that "sed 's/_R.*//'" will return ONLY the taxid
done < "$asri"
if [[ "$PLATFORM" == "r9" || "$PLATFORM" == "flg" ]]; then
echo "merging all $PLATFORM fastq files"
bash "$HOME_DIR/scripts/remapOxfordFastq.sh" \
-i "$outdir/$PLATFORM/" \
-o "pass_mapped.fastq" && find "$outdir/$PLATFORM/" \
-maxdepth 5 \
-name "pass_mapped.fastq" \
-exec cat {} + > $outdir"/simulated.fastq"
mv "$outdir/simulated.fastq" "$OUTPUT/"
rm -rf "$outdir/$PLATFORM"
elif [[ "$PLATFORM" == "iseq" || "$PLATFORM" == "miseq" ]]; then
echo "fixing headers and combining fastqs"
# rename headers for 'taxid'
find "$outdir/$PLATFORM" -maxdepth 1 -name "*fastq" | while read fq; do
bn=$(basename "$fq" | sed 's/_R.*//')
sed $'$!N;s/\\\n/\t/' "$fq" | sed $'$!N;s/\\\n/\t/' | awk -v name="$bn" -F'\t' '{printf("@%s\n%s\n%s\n%s\n",name,$2,$3,$4)}' > "$fq.tmp"
done
# merge all fastq
find "$outdir/$PLATFORM" -maxdepth 1 -name "*fastq.tmp" -exec cat {} + > "$outdir/$PLATFORM/fastq.merged"
# rename headers for 'taxid-readID'
sed $'$!N;s/\\\n/\t/' "$outdir/$PLATFORM/fastq.merged" | sed $'$!N;s/\\\n/\t/' | awk -F'\t' '{printf("%s-%s\n%s\n%s\n%s\n",$1,NR,$2,$3,$4)}' > "$outdir/simulated.fastq"
mv "$outdir/simulated.fastq" "$OUTPUT/"
rm -rf "$outdir/$PLATFORM"
fi

148
scripts/simulate.sh Normal file

@@ -0,0 +1,148 @@
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
usage()
{
cat << EOF
DESCRIPTION: This script runs DeepSimulator on either CPU or GPU to generate a set of reads for a single fasta file. Future updates are noted below.
USAGE:
bash simulate.sh -i <absolute path to fasta file>
-c <number of CPU cores to use>
-n <number of reads to simulate>
-g [CPU|GPU|ALBACORE]
-o <output directory, typically named for the file/organism being simulated>
OPTIONS:
-h help show this message
-i fna reference fasta input file
-n read# number of reads for given fasta file
-o output output directory
-g GPU/CPU Choose either CPU or GPU to run simulation on
-j exit at fast5 [1|2] 1 (default) runs the full pipeline; 2 exits after fast5 generation
NOTES:
Update: an extension of this script to map an abundance profile to a directory of fasta files is in the works
EOF
}
source activate simulator
# parsing arguments from command line
cpu_count=1
deep_sim_loc="$PWD/src/DeepSimulator"
guppy_type="CPU"
read_count=1
j=1
while getopts "hi:d:o:n:B:c:r:g:j:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
i) fasta_input_file=$OPTARG ;;
o) output_dir=$OPTARG ;;
c) cpu_count=$OPTARG ;;
g) guppy_type=$OPTARG ;;
d) deep_sim_loc=$OPTARG;;
n) read_count=$OPTARG;;
j) j=$OPTARG;;
?) usage; exit ;;
esac
done
if [[ (! $guppy_type == "GPU" ) && (! $guppy_type == "CPU") && (! $guppy_type == "ALBACORE") ]]; then
usage
echo "Invalid Guppy basecaller selected [GPU|CPU|ALBACORE]. Exiting."
exit 1
fi
echo $guppy_type
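# Translate the basecaller choice into the numeric code passed to DeepSimulator via -B
# (in this script's convention: GPU=1, CPU=2, ALBACORE=3)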
if [[ $guppy_type == "GPU" ]]; then
guppy_type=1
elif [[ $guppy_type == "CPU" ]]; then
guppy_type=2
else
guppy_type=3
fi
#define location of fasta input and output location of simulated reads as user
#since torque runs the script from a different location, specify absolute pathing
current_loc=$( pwd )
envbin=$(which python)
base="$(dirname $envbin)"
echo $deep_sim_loc
echo $PWD
base=$(basename ${fasta_input_file} .fasta)
mkdir $output_dir"/$base"
conda deactivate
if [[ $j -eq 1 ]]; then
echo $output_dir"/$base"
cat <<EOF
${deep_sim_loc}/deep_simulator.sh \
-i ${fasta_input_file} \
-n ${read_count} \
-c $cpu_count \
-o $output_dir"/$base" \
-B $guppy_type \
-H $deep_sim_loc
EOF
bash ${deep_sim_loc}/deep_simulator.sh \
-i ${fasta_input_file} \
-n ${read_count} \
-c $cpu_count \
-o $output_dir"/$base" \
-B $guppy_type \
-H $deep_sim_loc
# exit 1
elif [[ $j -eq 2 ]]; then
cat <<EOF
${deep_sim_loc}/deep_simulator_fast5only.sh \
-i ${fasta_input_file} \
-n ${read_count} \
-c $cpu_count \
-o $output_dir"/$base" \
-B $guppy_type \
-H $deep_sim_loc
EOF
bash ${deep_sim_loc}/deep_simulator_fast5only.sh \
-i ${fasta_input_file} \
-n ${read_count} \
-c $cpu_count \
-o $output_dir"/$base" \
-B $guppy_type \
-H $deep_sim_loc
# exit 1
else
echo "Exit: -j isn't properly specified as 1 (dont exit after fast5) or 2 (exit after fast5)"
exit 1
fi
# After the full DeepSimulator run is done, merge the per-taxid fastqs, e.g.:
# bash scripts/remapOxfordFastq.sh \
# -i data/fullDeepSim/metasim-strawman_envassay.tsv/r9/ \
# -o pass_mapped.fastq && find data/fullDeepSim/metasim-strawman_envassay.tsv/r9/ \
# -maxdepth 3 \
# -name "pass_mapped.fastq" \
# -exec cat {} + > data/fullDeepSim/pass_mapped_merged.fastq