initial commit

Angeline Aguinaldo
2020-11-12 20:18:05 -05:00
commit db57e4495d
11 changed files with 1061 additions and 0 deletions

90
.gitignore vendored Normal file

@@ -0,0 +1,90 @@
**/data/metasim-strawman_envassay.tsv
**/data/test
tmp
src
### Intellij+all ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### Intellij+all Patch ###
# Ignores the whole .idea folder and all .iml files
# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360
.idea/
# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
*.iml
modules.xml
.idea/misc.xml
*.ipr
# Sonarlint plugin
.idea/sonarlint

82
Dockerfile Normal file

@@ -0,0 +1,82 @@
# docker build -t meta_simulator:latest .
FROM python:3.8-slim
# Explicit user (declared at the top of the file to avoid UID/GID conflicts later)
# The ID should match meta_system's to prevent permission conflicts
ENV APP_USER simulator
ENV APP_WORK_DIR /home/${APP_USER}
ENV CONDA_VERSION 2020.07
RUN groupadd -r -g 999 ${APP_USER} && useradd -m -r -g ${APP_USER} -u 999 ${APP_USER}
# https://github.com/tianon/gosu/releases
# https://github.com/krallin/tini
RUN set -eux; \
apt-get update; \
apt-get install -y gosu tini; \
rm -rf /var/lib/apt/lists/*; \
gosu nobody true
# Install System Level Dependencies (Scripts, Simulator, guppy basecaller)
RUN set -eux; \
apt-get update; \
apt-get install -y libgl1-mesa-glx libegl1-mesa libxrandr2 libxss1 libxcursor1 libxcomposite1 libasound2 libxi6 libxtst6; \
apt-get install --no-install-recommends -y bc git wget curl gawk gzip parallel build-essential libidn11; \
rm -rf /var/lib/apt/lists/*
# Setup Conda
# https://docs.anaconda.com/anaconda/install/linux/
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
ENV PATH /opt/conda/bin:$PATH
RUN set -eux; \
mkdir -p /opt/conda; \
chown ${APP_USER}:${APP_USER} /opt/conda; \
gosu ${APP_USER} wget --quiet https://repo.anaconda.com/archive/Anaconda3-${CONDA_VERSION}-Linux-x86_64.sh -O ${APP_WORK_DIR}/anaconda.sh; \
gosu ${APP_USER} /bin/bash ${APP_WORK_DIR}/anaconda.sh -b -u -p /opt/conda; \
rm ${APP_WORK_DIR}/anaconda.sh; \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
# Copy code to working directory
COPY --chown=${APP_USER}:${APP_USER} scripts ${APP_WORK_DIR}/scripts/
COPY --chown=${APP_USER}:${APP_USER} data/iss_model_iSeq_min120.npz ${APP_WORK_DIR}/data/iss_model_iSeq_min120.npz
COPY --chown=${APP_USER}:${APP_USER} data/strawman_envassay.tsv ${APP_WORK_DIR}/data/strawman_envassay.tsv
COPY --chown=${APP_USER}:${APP_USER} environment.yml ${APP_WORK_DIR}
# Switch User (this is fine for now because volumes mounted are 999 writable)
USER ${APP_USER}
# Tune Working Directory
WORKDIR ${APP_WORK_DIR}
# Install Conda dependencies for META Simulator
RUN set -eux; \
echo ". /opt/conda/etc/profile.d/conda.sh" | tee -a ~/.bashrc; \
echo "conda activate base" | tee -a ~/.bashrc; \
. /opt/conda/etc/profile.d/conda.sh; \
conda env create -f environment.yml
# Run install script for DeepSim, change permissions of shell files
RUN set -eux; \
. /opt/conda/etc/profile.d/conda.sh; \
conda activate simulator; \
chmod +x scripts/sim_module_wrapper.sh; \
chmod +x scripts/install.sh; \
${APP_WORK_DIR}/scripts/install.sh
# Install other dependencies for DeepSim
RUN set -eux; \
. /opt/conda/etc/profile.d/conda.sh; \
conda activate tensorflow_cdpm; \
pip install tensorflow==1.2.1; \
pip install tflearn==0.3.2; \
pip install tqdm==4.19.4; \
pip install scipy==0.18.1; \
pip install h5py==2.7.1; \
pip install numpy==1.13.1; \
pip install scikit-learn==0.20.3; \
pip install biopython==1.74
# Enable process reaper (tini as PID 1)
ENTRYPOINT ["/usr/bin/tini", "--"]

125
README.md Normal file

@@ -0,0 +1,125 @@
# Metagenomics Evaluation & Testing Analysis (META) Simulator
Compares open-source metagenomic classification tool performance (precision, sensitivity, runtime) across various
sequencing platforms ([Illumina MiSeq/iSeq](https://www.illumina.com/), [Oxford Nanopore MinION](https://nanoporetech.com/)) and use cases
(metagenomic profiles).
## Summary
- [Getting Started](#getting-started)
- [Running](#running)
- [License](#license)
## Getting Started
These instructions will get you a copy of the project up and running on
your local machine for development and testing purposes.
### Prerequisites
The META system has been designed to run on Linux (specifically, tested on Ubuntu 18.04) and in Docker containers.
The following packages are required:
* [Docker-ce 19.03](https://docs.docker.com/engine/)
Here is an example of how to install these on Ubuntu 18.04:
```bash
# Install Docker engine (reference: https://docs.docker.com/engine/install/ubuntu/)
sudo apt-get remove docker docker-engine docker.io containerd runc
sudo apt-get update
sudo apt-get install \
apt-transport-https \
ca-certificates \
curl \
gnupg-agent \
software-properties-common
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo add-apt-repository \
"deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) \
stable"
sudo apt-get update
sudo apt-get install docker-ce docker-ce-cli containerd.io
sudo docker run hello-world # to verify successful install
```
### Installing
To build the META Simulator, run the following from the root directory of `meta_simulator`:
```bash
docker build -t meta_simulator:latest .
```
### Integrating with Docker-based Meta System
To integrate the META Simulator with the Docker-based Meta System, export the `meta_simulator` image to a Docker tar file and place it in the `meta_system/data/docker` directory.
1. Export the `meta_simulator` image:
```bash
docker save -o meta_simulator.tar meta_simulator:latest
```
2. Move `meta_simulator.tar` to `meta_system/data/docker`
3. On `meta_system`, run `make load-docker` to confirm the image loads (see the sketch below)
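A minimal sketch of steps 2 and 3, assuming `meta_system` is checked out as a sibling directory of `meta_simulator` (the relative paths below are illustrative only):
```bash
# Hypothetical layout: meta_simulator/ and meta_system/ side by side
mv meta_simulator.tar ../meta_system/data/docker/
cd ../meta_system
make load-docker
```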
## Running
The META Simulator requires an abundance profile TSV. An *abundance profile* is expressed as a tab-delimited text file (TSV) where the first column contains the leaf taxonomic ID, the second column contains the corresponding abundance proportion (must sum to 1.000000), and the third column designates the organism as being foreground (`1`) or background (`0`). There should be no headers in the abundance profile TSV. An example is shown below:
```TSV
400667 0.10 1
435590 0.10 1
367928 0.10 1
864803 0.10 1
1091045 0.10 1
349101 0.10 1
1282 0.10 1
260799 0.10 1
1529886 0.10 1
198094 0.10 1
```
An example TSV is included within the Docker container at `data/strawman_envassay.tsv`.
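Before running, you can confirm that a profile's abundances sum to 1.000000 with the same check the wrapper script performs; a minimal sketch (replace the path with your own profile as needed):
```bash
# Should print 1.000000; mirrors the check in scripts/sim_module_wrapper.sh
awk -F'\t' '{x += $2} END {printf("%f\n", x)}' data/strawman_envassay.tsv
```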
The META Simulator accepts the following arguments:
* `-t` number of threads to use for simulations
* `-i` list of taxid with associated abundance (totalling 1.0)
* `-p` sequencing platform to simulate reads for (case sensitive)
* The options are:
* `iseq` Illumina iSeq 100
* `miseq` Illumina MiSeq (assuming both Illumina platforms have a spot count of 8M and taking 1/100 of this) [80,000]
* `r9` Oxford Nanopore R9 flowcell (MIN106) - best performance at 50Gbp output (will assume 20Gbp and 20kb avg read length = 1M reads) [10,000]
* `flg` Oxford Nanopore Flongle flowcell (FLG001) - best performance at 2Gbp output (1/25 of r9) (assuming 10% of r9 output) [1,000]
* `-o` Output directory (combined fastq file for classification will be at `$outdir/simulated.fastq`)
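For reference, the per-taxid read count is the taxon's abundance multiplied by the bracketed platform total above (80,000 for `iseq`/`miseq`, 10,000 for `r9`, 1,000 for `flg`). A minimal sketch of the calculation as done in `scripts/sim_module_wrapper.sh`:
```bash
# Abundance 0.10 on an 80,000-read platform (iseq/miseq) -> 8000 reads for that taxid
abu=0.10
count=$(printf "$abu" | awk '{printf("%.0f", $0*80000)}')
echo "$count"  # prints 8000
```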
### Deep Simulator
To run DeepSimulator ([Nanopore R9 flowcell](https://store.nanoporetech.com/us/flowcells/spoton-flow-cell-mk-i-r9-4.html)) using META Simulator, run:
```bash
docker run meta_simulator:latest bash scripts/sim_module_wrapper.sh -t 2 -i data/strawman_envassay.tsv -p r9 -o data/test
```
To run DeepSimulator ([Nanopore Flongle flowcell](https://store.nanoporetech.com/us/flowcells/flongle-flow-cell.html)) using META Simulator, run:
```bash
docker run meta_simulator:latest bash scripts/sim_module_wrapper.sh -t 2 -i data/strawman_envassay.tsv -p flg -o data/test
```
### InsilicoSeq
To run InsilicoSeq ([Illumina MiSeq](https://www.illumina.com/systems/sequencing-platforms/miseq.html)) using META Simulator, run:
```bash
docker run meta_simulator:latest bash scripts/sim_module_wrapper.sh -t 2 -i data/strawman_envassay.tsv -p miseq -o data/test
```
To run InsilicoSeq ([Illumina iSeq](https://www.illumina.com/systems/sequencing-platforms/iseq.html)) using META Simulator, run:
```bash
docker run meta_simulator:latest bash scripts/sim_module_wrapper.sh -t 2 -i data/strawman_envassay.tsv -p iseq -o data/test
```
If you wish to run the simulator with your own abundance profile, use the [Docker bind mount](https://docs.docker.com/storage/bind-mounts/) `-v` flag of `docker run` to mount the directory containing your abundance profile TSV.
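For example, a sketch with placeholder paths (`/abs/path/to/profiles` and `my_profile.tsv` are hypothetical):
```bash
# Mount a host directory into the container's working directory (/home/simulator)
docker run -v /abs/path/to/profiles:/home/simulator/profiles meta_simulator:latest \
    bash scripts/sim_module_wrapper.sh -t 2 -i profiles/my_profile.tsv -p iseq -o profiles/output
```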
## License
This project is licensed under [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0).
Copyright Johns Hopkins University Applied Physics Laboratory.

Binary file not shown.

data/strawman_envassay.tsv Normal file

@@ -0,0 +1,10 @@
400667 0.10
435590 0.10
367928 0.10
864803 0.10
1091045 0.10
349101 0.10
1282 0.10
260799 0.10
1529886 0.10
198094 0.10

65
environment.yml Normal file

@@ -0,0 +1,65 @@
name: simulator
channels:
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- asn1crypto=1.3.0=py36_0
- biopython=1.76=py36h7b6447c_0
- blas=1.0=mkl
- bzip2=1.0.8=h7b6447c_0
- ca-certificates=2020.1.1=0
- certifi=2019.11.28=py36_0
- cffi=1.14.0=py36h2e261b9_0
- chardet=3.0.4=py36_1003
- cryptography=2.8=py36h1ba5d50_0
- curl=7.68.0=hbc83047_0
- future=0.18.2=py36_0
- idna=2.8=py36_0
- insilicoseq=1.4.5=py_0
- intel-openmp=2020.0=166
- joblib=0.14.1=py_0
- krb5=1.17.1=h173b8e3_0
- ld_impl_linux-64=2.33.1=h53a641e_7
- libcurl=7.68.0=h20c2e04_0
- libdeflate=1.0=h14c3975_1
- libedit=3.1.20181209=hc058e9b_0
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=9.1.0=hdf63c60_0
- libgfortran-ng=7.3.0=hdf63c60_0
- libssh2=1.8.2=h1ba5d50_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- mkl=2020.0=166
- mkl-service=2.3.0=py36he904b0f_0
- mkl_fft=1.0.15=py36ha843d7b_0
- mkl_random=1.1.0=py36hd6b4f25_0
- ncurses=6.1=he6710b0_1
- numpy=1.18.1=py36h4f9e942_0
- numpy-base=1.18.1=py36hde5b4d6_1
- openssl=1.1.1d=h7b6447c_4
- pandas=1.0.1=py36h0573a6f_0
- pip=20.0.2=py36_1
- pycparser=2.19=py36_0
- pyopenssl=19.1.0=py36_0
- pysam=0.15.3=py36hda2845c_1
- pysocks=1.7.1=py36_0
- python=3.6.10=h0371630_0
- python-dateutil=2.8.1=py_0
- pytz=2019.3=py_0
- readline=7.0=h7b6447c_5
- requests=2.22.0=py36_1
- scipy=1.4.1=py36h0b6359f_0
- setuptools=45.2.0=py36_0
- six=1.14.0=py36_0
- sqlite=3.31.1=h7b6447c_0
- tk=8.6.8=hbc83047_0
- urllib3=1.25.8=py36_0
- wheel=0.34.2=py36_0
- xz=5.2.4=h14c3975_4
- zlib=1.2.11=h7b6447c_3
- pip:
  - bioseq==0.2.3
  - h5py==2.10.0
  - ont-fast5-api==3.0.1
  - progressbar33==2.4

87
scripts/install.sh Normal file

@@ -0,0 +1,87 @@
#!/bin/bash
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
#### Define environment variables
run_dir=$PWD
g="$(which python)"
baseBin=$(dirname ${g})
src_bin="${run_dir}/src"
mkdir $src_bin
CONDA_BASE=$(conda info --base)
source "$CONDA_BASE/etc/profile.d/conda.sh"
script_location="$(perl -MCwd=abs_path -le 'print abs_path(shift)' $(which $(basename $0)))"
##########Install DeepSimulator##########################################
git clone https://github.com/lykaust15/DeepSimulator.git $src_bin/DeepSimulator
cd $src_bin/DeepSimulator
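# Patch DeepSimulator's download/install scripts to use 'conda activate'/'conda deactivate'
# instead of the deprecated 'source activate'/'source deactivate'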
find "$PWD" -name "*download*.sh" -print0 | while read -d $'\0' fn; do sed -Ei "s/source activate/conda activate/g" $fn ; done
find "$PWD" -name "*install.sh" -print0 | while read -d $'\0' fn; do sed -Ei "s/source activate/conda activate/g" $fn ; done
find "$PWD" -name "*install.sh" -print0 | while read -d $'\0' fn; do sed -Ei "s/source deactivate/conda deactivate/g" $fn ; done
echo $PWD
grep -q "source $CONDA_BASE/etc/profile.d/conda.sh" install.sh
if [[ $? != 0 ]] ; then
echo "source $CONDA_BASE/etc/profile.d/conda.sh" | cat - install.sh > temp && mv temp install.sh
fi
bash install.sh
#-> 2. install basecaller
#--| 2.1 install albacore_2.3.1
cd base_caller/albacore_2.3.1/
./download_and_install.sh
cd ../../
#--| 2.2 install guppy_3.1.5
cd base_caller/guppy_3.1.5/
./download_and_install.sh
cd ../../
cd $run_dir
#######Ont guppy upgrade to 3.4.5########################################
wget https://americas.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_3.4.5_linux64.tar.gz -P src/
mkdir $src_bin/DeepSimulator/base_caller/guppy_3.4.5
tar -xvzf src/ont-guppy-cpu_3.4.5_linux64.tar.gz --directory $src_bin/DeepSimulator/base_caller/guppy_3.4.5
rm -rf src/ont-guppy-cpu_3.4.5_linux64.tar.gz
#Enable fast mode (less accurate) for cpu basecalling
sed -Ei "s/hac/fast/g" $src_bin/DeepSimulator/deep_simulator.sh
sed -Ei "s/guppy=guppy_3\.[0-9]*\.[0-9]*/guppy=guppy_3\.4\.5/g" $src_bin/DeepSimulator/deep_simulator.sh
sed -Ei "s/source deactivate/conda deactivate/g" $src_bin/DeepSimulator/deep_simulator.sh
# Create a new DeepSimulator script that exits early, i.e. no basecalling; it only generates fast5 files
cp $src_bin/DeepSimulator/deep_simulator.sh $src_bin/DeepSimulator/deep_simulator_fast5only.sh
sed -Ei "s/guppy=guppy_3\.[0-9]*\.[0-9]*/exit 1/g" $src_bin/DeepSimulator/deep_simulator_fast5only.sh
#Symlink the two files into the conda environment bin folder
ln -sf $src_bin/DeepSimulator/deep_simulator.sh\
$baseBin/
ln -sf $src_bin/DeepSimulator/deep_simulator_fast5only.sh\
$baseBin/
# Because albacore doesn't install correctly through the DeepSimulator install script, reinstall it here
source activate basecall
wget https://mirror.oxfordnanoportal.com/software/analysis/ont_albacore-2.3.1-cp36-cp36m-manylinux1_x86_64.whl \
-P $src_bin/
pip install $src_bin/ont_albacore-2.3.1-cp36-cp36m-manylinux1_x86_64.whl
rm $src_bin/*albacore*

scripts/remapOxfordFastq.sh Normal file

@@ -0,0 +1,82 @@
#!/bin/bash
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
# FUNCTIONS
usage()
{
cat << EOF
Help message for remapOxfordFastq.sh:
DESCRIPTION
NOTES:
- WARNING:
USAGE:
bash scripts/remapOxfordFastq.sh -i data/fullDeepSim/metasim-strawman_envassay.tsv/r9/ -o pass_mapped.fastq
OPTIONS:
-h help show this message
-i FASTQ Input fastq containing directory from sim_module deepsim script run
-o FASTQ output fastq with mapped (3rd column) as runid
NOTES:
This script expects the input to be the metasim output folder whose first-level children are taxid directories (400667, 1282, etc.); each accession/sequence-header directory beneath those contains the pass.fastq file.
____________________________________________________________________________________________________
References:
1. O. Tange (2011): GNU Parallel - The Command-Line Power Tool, ;login: The USENIX Magazine, February 2011:42-47.
EOF
}
# ARGUMENTS
# parse args
while getopts "ho:i:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
o) output_file=$OPTARG ;;
i) INPUT=$OPTARG ;;
?) usage; exit ;;
esac
done
# check args
if [[ -z "$INPUT" ]]; then printf "%s\n" "Please specify input directory containing fasta file headers as directory with deepsim output (pass.fastq) (-i)."; exit; fi
if [[ -z "$output_file" ]]; then printf "%s\n" "Please specify output filename in the same directory as the pass.fastq file (-o)."; exit; fi
if [[ "$output_file" == "pass.fastq" ]]; then printf "%s\n" "Please specify output file name different than the pass.fastq filename."; exit; fi
for file in $(find $INPUT -maxdepth 5 -name "pass.fastq"); do # Not recommended, will break on whitespace
dir=$(dirname $file)
baseDir=$(basename $dir)
# echo $dir $baseDir
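# Rewrite every FASTQ header (line 1 of each 4-line record) to "@<parent directory name>_<running index>",
# leaving sequence, separator, and quality lines unchanged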
awk -v name=$baseDir 'BEGIN{li=0; inc=0;}{if( NR%4 == 1 ) {++li; print "@"name"_"li}else{print}}' $file > $dir"/"$output_file
#awk -v name=$baseDir 'BEGIN{li=0; inc=0;}{if( NR%4 == 1 ) {++li; print "@"name"_"li}}' $file
done

42
scripts/separate_seqs.py Normal file

@@ -0,0 +1,42 @@
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
import sys
import argparse
import csv
##########################Take in Args######################################################
parser = argparse.ArgumentParser(description="Parse a (possibly multi-line) FASTA file and write each record to its own FASTA file")
parser.add_argument('-i', required=True, type=str, nargs='+', help='Input FASTA file; may contain multiple records')
parser.add_argument('-n', required=True, type=str, nargs='+', help='Number of total reads you want')
parser.add_argument('-o', required=True, type=str, nargs='+', help='Output directory')
args = parser.parse_args()
##############################################################
from Bio import SeqIO
# First pass: count the total number of bases across all input records
count = 0
for seq_record in SeqIO.parse(vars(args)['i'][0], "fasta"):
    count += len(seq_record.seq)
# Second pass: write each record to its own single-sequence FASTA file named <record id>.fasta
for seq_record in SeqIO.parse(vars(args)['i'][0], "fasta"):
    # print(str(seq_record.seq))
    fp = open(vars(args)['o'][0] + "/" + seq_record.id + ".fasta", "w")
    fp.write(">" + seq_record.id + "\n" + str(seq_record.seq))
    fp.close()

scripts/sim_module_wrapper.sh Normal file

@@ -0,0 +1,330 @@
#!/bin/bash
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
# FUNCTIONS
usage()
{
cat << EOF
Help message for sim_module:
DESCRIPTION
NOTES:
- WARNING:
USAGE:
bash sim_module_wrapper.sh -i </absolute/path/to/taxa_abundance_profile.tsv>
bash scripts/sim_module_wrapper.sh -t 10 -i data/test.tsv -p iseq
OPTIONS:
-h help show this message
-H STR Home directory specification for the sim_module_wrapper.sh file (OPTIONAL)
-t INT number of threads to use for simulations
-i TSV list of taxid with associated abundance (totalling 1.0)
TAXID can be at any taxonomic level, however the first accession
found when searching the 'taxid' column of "assembly_summary_refseq.txt"
will be used as the reference for simulating reads
-p STR sequencing platform to simulate reads for (case sensitive)
available options [total reads simulated]:
iseq Illumina iSeq 100 (assuming both illumina platforms have spot count of 8M, and taking 1/100 of this) [80,000]
miseq Illumina MiSeq [80,000]
r9 Oxford Nanopore R9 flowcell (MIN106) - best performance at 50Gbp output (will assume 20Gbp and 20kb avg read length = 1M reads) [10,000]
flg Oxford Nanopore Flongle flowcell (FLG001) - best performance at 2Gbp output (1/25 of r9) (assuming 10% of r9 output) [1,000]
pending options:
r10 Oxford Nanopore R10 flowcell (MIN107) <- not implemented
-j INT If using Deep Simulator (r9,flg) specify if you want to only generate fast5 files.
Available Options:
1 Full process from fast5 generation to fastq/basecalling
2 Exit early; generate only fast5 files
-o DIR Output directory (combined fastq file for classification will be at "$outdir/simulated.fastq")
NOTES:
Input abundance profile (abundance based on proportion bps of reads, not genome size)
____________________________________________________________________________________________________
References:
1. O. Tange (2011): GNU Parallel - The Command-Line Power Tool, ;login: The USENIX Magazine, February 2011:42-47.
EOF
}
# ARGUMENTS
# parse args
while getopts "ht:i:p:r:j:H:o:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
t) THREADS=$OPTARG ;;
i) INPUT=$OPTARG ;;
p) PLATFORM=$OPTARG ;;
H) HOME_DIR=$OPTARG ;;
r) READSCOUNT=$OPTARG;;
j) r9cfg=$OPTARG;;
o) OUTPUT=$OPTARG;;
?) usage; exit ;;
esac
done
# check args
if [[ -z "$THREADS" ]]; then printf "%s\n" "Please specify number of threads (-t)."; exit; fi
if [[ -z "$INPUT" ]]; then printf "%s\n" "Please specify input tsv (-i)."; exit; fi
if [[ -z "$PLATFORM" ]]; then printf "%s\n" "Please specify sequencing platform (-p)."; exit; fi
#if [[ -z $READSCOUNT ]]; then READSCOUNT=10000; fi; # 20200520, readcount will be based on platform type
if [[ -z $HOME_DIR ]]; then HOME_DIR=$PWD; fi
if [[ -z $r9cfg ]]; then r9cfg=1; fi
if [[ -z $OUTPUT ]]; then printf "%s\n" "Please specify a final output directory (-o)."; exit; fi
if [[ ! -d "$OUTPUT" ]]; then mkdir -p "$OUTPUT"; fi
#Activate the correct environment for simulator (iss and deepsim)
# setup other variables
absolute_path_x="$(readlink -fn -- "$0"; echo x)"
absolute_path_of_script="${absolute_path_x%x}"
scriptdir=$(dirname "$absolute_path_of_script")
runtime=$(date +"%Y%m%d%H%M%S%N")
dn=$(dirname "$INPUT")
bn=$(basename "$INPUT")
#outdir="$dn/metasim-$bn"
outdir="$OUTPUT"
tmp="$outdir/tmp"
if [[ ! -d "$tmp" ]]; then
mkdir -p "$tmp"
fi
# MAIN
echo "Checking that abundance profile sums to 1.000000 (input.tsv column 2)."
check=$(awk -F'\t' '{x+=$2}END{printf("%f",x)}' "$INPUT" | cut -c1-8)
if [[ "$check" != "1.000000" ]]; then
echo "Input abundances sum to $check"
echo "They must sum to 1 within a tolerance of <1 millionth (i.e. 1.000000). Exiting."; exit
else
echo "Check successful."
fi
# get updated assembly summary refseq (asr)
echo "Pulling latest 'assembly_summary_refseq.txt files."
asr="$tmp/assembly_summary_refseq.txt"
for k in "archaea" "bacteria" "fungi" "invertebrate" "other" "plant" "protozoa" "vertebrate_mammalian" "vertebrate_other" "viral"; do
echo "downloading refseq summary for $k"
if [[ ! -f "$asr.tmp-$k" ]]; then
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/$k/assembly_summary.txt --output-document "$asr.tmp-$k"
fi
done 2> /dev/null
# combine all
find "$tmp" -maxdepth 1 -name "*assembly_summary_refseq.txt.tmp-*" -exec cat {} + > "$asr"
asri="$tmp/asr_input.tsv"
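# Join: for each taxid in column 1 of the input profile, emit the matching
# assembly_summary_refseq row (keyed on its taxid column, field 6)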
awk -F'\t' '{if(FNR==NR){rs[$6]=$0}else{printf("%s\n",rs[$1])}}' "$asr" "$INPUT" > "$asri"
# check if any accession are missing from the overlap
# find overlap between it and the input
check=$(comm -23 <(cut -f1 "$INPUT" | sort) <(cut -f6 "$asri" | sort))
if [[ "$check" != "" ]]; then
echo "The following accessions did not have a corresponding"
echo "path in the assembly summary refseq file:"
echo "$check"
echo "Please remove or subsitute the accession(s) above, and resubmit."; exit
fi
# pull ftp paths and download reference genomes for accessions in input tsv
# 1 assembly_accession
# 6 taxid <- strain if available, otherwise is species taxid
# 7 species_taxid
# 8 organism_name
# 9 infraspecific_name
# 20 ftp_path
mkdir -p "$outdir/$PLATFORM"
while read x; do
acc=$(printf "$x" | cut -f1)
taxid=$(printf "$x" | cut -f6)
name=$(printf "$x" | cut -f8)
path=$(printf "$x" | cut -f20)
bn=$(basename "$path")
echo " wgetting: $acc, $taxid, $name"
wget "$path/${bn}_genomic.fna.gz" --output-document "$outdir/$taxid.fasta.gz" 2> /dev/null
gunzip -f "$outdir/$taxid.fasta.gz"
# put all sequence strings under each header into a single line
awk '{if(NR == 1){printf("%s\n", $0)}else{if(substr($0,1,1) == ">"){printf("\n%s\n", $0)} else {printf("%s", $0)}}}END{printf("\n")}' "$outdir/$taxid.fasta" > "$outdir/$taxid.fasta.tmp"
# only retain contigs/assemblies >1 Kbp
sed $'$!N;s/\\\n/\t/' "$outdir/$taxid.fasta.tmp" | awk -F'\t' '{if(length($2)>=1000){printf("%s\n%s\n",$1,$2)}}' > "$outdir/$taxid.fasta"
rm "$outdir/$taxid.fasta.tmp"
abu=$(grep -P "^$taxid\t" "$INPUT" | cut -f2)
# calculate number reads based on abundance in input and on:
# ILLUMINA (~4-8 million reads output), output 8 million reads
# MISEQ 2 x 150bp [80,000]
# ISEQ 2 x 150bp [80,000]
# OXFORD MINION (r9 ~20Gbp @ 20Kb avg read length), output 1 million reads
# R9 rapid library kit RAD004 [10,000]
# FLG rapid library kit RAD004 [1,000]
# R10 ligation library kit LSK009 [n/a]
# InSilicoSeq
# --cpus <int>, -p <int>
# number of cpus to use. (default: 2).
# --genomes <genomes.fasta> [<genomes.fasta> ...], -g <genomes.fasta> [<genomes.fasta> ...]
# Input genome(s) from where the reads will originate
# --draft <draft.fasta> [<draft.fasta> ...]
# Input draft genome(s) from where the reads will
# originate
# If you have draft genome files containing contigs, you can give them to the --draft option:
# --n_reads <int>, -n <int>
# Number of reads to generate (default: 1000000). Allows
# suffixes k, K, m, M, g and G (ex 0.5M for 500000).
# --model <npz>, -m <npz>
# Error model file. (default: None). Use HiSeq, NovaSeq
# or MiSeq for a pre-computed error model provided with
# the software, or a file generated with iss model. If
# you do not wish to use a model, use --mode basic or
# --mode perfect. The name of the built-in models are
# case insensitive.
# --output <fastq>, -o <fastq>
# Output file prefix (Required)
if [[ "$PLATFORM" == "iseq" ]]; then
count=$(printf "$abu" | awk '{printf("%.0f",$0*80000)}')
echo "Simulating $count reads for $taxid, $name, $PLATFORM, at abundance of $abu"
model=$HOME_DIR"/data/iss_model_iSeq_min120.npz"
source activate simulator
iss generate -p "$THREADS" --draft "$outdir/$taxid.fasta" -n "$count" -m "$model" -o "$outdir/$PLATFORM/$taxid" 2> "$outdir/error.log"
elif [[ "$PLATFORM" == "miseq" ]]; then
count=$(printf "$abu" | awk '{printf("%.0f",$0*80000)}')
echo "Simulating $count reads for $taxid, $name, $PLATFORM, at abundance of $abu"
model="MiSeq"
source activate simulator
iss generate -p "$THREADS" --draft "$outdir/$taxid.fasta" -n "$count" -m "$model" -o "$outdir/$PLATFORM/$taxid" 2> "$outdir/error.log"
elif [[ "$PLATFORM" == "r9" || "$PLATFORM" == "flg" ]]; then
if [[ "$PLATFORM" == "r9" ]]; then
# making r9 total reads default 10,000
count=$(printf "$abu" | awk '{printf("%.0f",$0*10000)}')
elif [[ "$PLATFORM" == "flg" ]]; then
# making flg total reads default 1,000
count=$(printf "$abu" | awk '{printf("%.0f",$0*1000)}')
fi
echo "Simulating $count reads for $taxid, $name, $PLATFORM, at abundance of $abu"
r9ScriptLocation=$HOME_DIR"/scripts/"
totalCountNucleotides=$( grep "^[^>]" "$outdir/$taxid.fasta" | tr -d "\n" | wc -c )
seqs=($( grep -R "^>" "$outdir/${taxid}.fasta" | tr " " "|" ))
total=${#seqs[*]}
if [ ! -d $r9ScriptLocation"../tmp" ]; then
mkdir $r9ScriptLocation"../tmp"
fi
if [ -d $r9ScriptLocation"../tmp/seqs" ]; then
rm -rf $r9ScriptLocation"../tmp/seqs"
fi
mkdir $r9ScriptLocation"../tmp/seqs/"
if [ -d $outdir/"$PLATFORM/$taxid" ]; then
rm -rf $outdir/"$PLATFORM/$taxid"
fi
if [ ! -d $outdir/"$PLATFORM/logs" ]; then
mkdir $outdir/"$PLATFORM/logs"
fi
if [ -f "$outdir/$PLATFORM/logs/pythonLog.txt" ]; then
rm "$outdir/$PLATFORM/logs/pythonLog.txt"
fi
if [ -f "$outdir/$PLATFORM/logs/simulationLog.txt" ]; then
rm "$outdir/$PLATFORM/logs/simulationLog.txt"
fi
mkdir -p $outdir/$PLATFORM/$taxid
source activate simulator
python "$r9ScriptLocation/"separate_seqs.py -i $outdir/$taxid.fasta -o $r9ScriptLocation"../tmp/seqs/" -n "$count" >> "$outdir/$PLATFORM/logs/pythonLog.txt" 2>&1
conda deactivate
files=()
while IFS= read -r -d $'\0'; do
files+=("$REPLY")
done < <(find ${r9ScriptLocation}"/../tmp/seqs/" -name "*.fasta" -print0)
for (( i=0; i < "${#files[@]}" ; i++ ))
do
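# Split this taxon's read budget across its contigs in proportion to contig length
# (the 0.5 offset is for rounding to the nearest read)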
length=$( grep -e '^[^>]' ${files[$i]} | tr -d "\n" | wc -c )
count_seq=$( echo "scale=20; 0.5+($count*$length)/$totalCountNucleotides" | bc -l | xargs printf %.0f )
echo "count_seq equals $count_seq"
bash "$r9ScriptLocation"/simulate.sh \
-i ${files[$i]} \
-n $count_seq \
-o "$outdir/$PLATFORM/$taxid" \
-c $THREADS \
-g "CPU" \
-d $HOME_DIR"/src/DeepSimulator" \
-j $r9cfg
find "$outdir/$PLATFORM/$taxid" -name "fast5" -type d -exec rm -rf "{}" \;
done >> "$outdir/$PLATFORM/logs/simulationLog.txt" 2>&1
rm -rf ${r9ScriptLocation}"/../tmp/seqs/*"
echo "done with this file $outdir $taxid"
fi
# NOTES:
# name your output fastq per taxid with the taxid, such that "sed 's/_R.*//'" will return ONLY the taxid
done < "$asri"
if [[ "$PLATFORM" == "r9" || "$PLATFORM" == "flg" ]]; then
echo "merging all $PLATFORM fastq files"
bash "$HOME_DIR/scripts/remapOxfordFastq.sh" \
-i "$outdir/$PLATFORM/" \
-o "pass_mapped.fastq" && find "$outdir/$PLATFORM/" \
-maxdepth 5 \
-name "pass_mapped.fastq" \
-exec cat {} + > $outdir"/simulated.fastq"
mv "$outdir/simulated.fastq" "$OUTPUT/"
rm -rf "$outdir/$PLATFORM"
elif [[ "$PLATFORM" == "iseq" || "$PLATFORM" == "miseq" ]]; then
echo "fixing headers and combining fastqs"
# rename headers for 'taxid'
find "$outdir/$PLATFORM" -maxdepth 1 -name "*fastq" | while read fq; do
bn=$(basename "$fq" | sed 's/_R.*//')
sed $'$!N;s/\\\n/\t/' "$fq" | sed $'$!N;s/\\\n/\t/' | awk -v name="$bn" -F'\t' '{printf("@%s\n%s\n%s\n%s\n",name,$2,$3,$4)}' > "$fq.tmp"
done
# merge all fastq
find "$outdir/$PLATFORM" -maxdepth 1 -name "*fastq.tmp" -exec cat {} + > "$outdir/$PLATFORM/fastq.merged"
# rename headers for 'taxid-readID'
sed $'$!N;s/\\\n/\t/' "$outdir/$PLATFORM/fastq.merged" | sed $'$!N;s/\\\n/\t/' | awk -F'\t' '{printf("%s-%s\n%s\n%s\n%s\n",$1,NR,$2,$3,$4)}' > "$outdir/simulated.fastq"
mv "$outdir/simulated.fastq" "$OUTPUT/"
rm -rf "$outdir/$PLATFORM"
fi

148
scripts/simulate.sh Normal file

@@ -0,0 +1,148 @@
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
usage()
{
cat << EOF
DESCRIPTION: This script runs DeepSimulator on either CPU or GPU to generate a set of reads for a single fasta file. Future updates are noted below.
USAGE:
bash simulate.sh -i <absolute path to fasta file>
-c <number of CPU cores to use>
-n <number of reads to simulate>
-g [CPU|GPU|ALBACORE]
-o <output directory, typically named for the file/organism being simulated>
OPTIONS:
-h help show this message
-i fna reference fasta input file
-n read# number of reads for given fasta file
-o output output directory
-g GPU/CPU Choose either CPU or GPU to run simulation on
-j exit at fast5 [1|2] 1 (default) runs the full pipeline; 2 exits after fast5 generation
NOTES:
Update: an extension of this script to map an abundance profile to a directory of fasta files is in the works
EOF
}
source activate simulator
# parsing arguments from command line
cpu_count=1
deep_sim_loc="$PWD/src/DeepSimulator"
guppy_type="CPU"
read_count=1
j=1
while getopts "hi:d:o:n:B:c:r:g:j:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
i) fasta_input_file=$OPTARG ;;
o) output_dir=$OPTARG ;;
c) cpu_count=$OPTARG ;;
g) guppy_type=$OPTARG ;;
d) deep_sim_loc=$OPTARG;;
n) read_count=$OPTARG;;
j) j=$OPTARG;;
?) usage; exit ;;
esac
done
if [[ (! $guppy_type == "GPU" ) && (! $guppy_type == "CPU") && (! $guppy_type == "ALBACORE") ]]; then
usage
echo "Invalid Guppy basecaller selected [GPU|CPU|ALBACORE]. Exiting."
exit 1
fi
echo $guppy_type
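# Translate the basecaller choice into the numeric code passed to DeepSimulator via -B
# (in this script's convention: GPU=1, CPU=2, ALBACORE=3)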
if [[ $guppy_type == "GPU" ]]; then
guppy_type=1
elif [[ $guppy_type == "CPU" ]]; then
guppy_type=2
else
guppy_type=3
fi
#define location of fasta input and output location of simulated reads as user
#since torque runs the script from a different location, specify absolute pathing
current_loc=$( pwd )
envbin=$(which python)
base="$(dirname $envbin)"
echo $deep_sim_loc
echo $PWD
base=$(basename ${fasta_input_file} .fasta)
mkdir $output_dir"/$base"
conda deactivate
if [[ $j -eq 1 ]]; then
echo $output_dir"/$base"
cat <<EOF
${deep_sim_loc}/deep_simulator.sh \
-i ${fasta_input_file} \
-n ${read_count} \
-c $cpu_count \
-o $output_dir"/$base" \
-B $guppy_type \
-H $deep_sim_loc
EOF
bash ${deep_sim_loc}/deep_simulator.sh \
-i ${fasta_input_file} \
-n ${read_count} \
-c $cpu_count \
-o $output_dir"/$base" \
-B $guppy_type \
-H $deep_sim_loc
# exit 1
elif [[ $j -eq 2 ]]; then
cat <<EOF
${deep_sim_loc}/deep_simulator_fast5only.sh \
-i ${fasta_input_file} \
-n ${read_count} \
-c $cpu_count \
-o $output_dir"/$base" \
-B $guppy_type \
-H $deep_sim_loc
EOF
bash ${deep_sim_loc}/deep_simulator_fast5only.sh \
-i ${fasta_input_file} \
-n ${read_count} \
-c $cpu_count \
-o $output_dir"/$base" \
-B $guppy_type \
-H $deep_sim_loc
# exit 1
else
echo "Exit: -j isn't properly specified as 1 (dont exit after fast5) or 2 (exit after fast5)"
exit 1
fi
# After the full DeepSimulator run is done, merge the per-taxid fastqs, e.g.:
# bash scripts/remapOxfordFastq.sh \
# -i data/fullDeepSim/metasim-strawman_envassay.tsv/r9/ \
# -o pass_mapped.fastq && find data/fullDeepSim/metasim-strawman_envassay.tsv/r9/ \
# -maxdepth 3 \
# -name "pass_mapped.fastq" \
# -exec cat {} + > data/fullDeepSim/pass_mapped_merged.fastq