Files
meta-simulator/scripts/sim_module_wrapper.sh
Angeline Aguinaldo db57e4495d initial commit
2020-11-12 20:18:05 -05:00

331 lines
13 KiB
Bash

#!/bin/bash
# **********************************************************************
# Copyright (C) 2020 Johns Hopkins University Applied Physics Laboratory
#
# All Rights Reserved.
# For any other permission, please contact the Legal Office at JHU/APL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# **********************************************************************
# FUNCTIONS
usage()
{
cat << EOF
Help message for sim_module:
DESCRIPTION
NOTES:
- WARNING:
USAGE:
bash sim_module_wrapper.sh -i </absolute/path/to/taxa_abundance_profile.tsv>
bash scripts/sim_module_wrapper.sh -t 10 -i data/test.tsv -p iseq
OPTIONS:
-h help show this message
-H STR Home directory specification for the sim_module_wrapper.sh file (OPTIONAL)
-t INT number of threads to use for simulations
-i TSV list of taxid with associated abundance (totalling 1.0)
TAXID can be at any taxonomic level, however the first accession
found when searching the 'taxid' column of "assembly_summary_refseq.txt"
will be used as the reference for simulating reads
-p STR sequencing platform to simulate reads for (case sensitive)
available options [total reads simulated]:
iseq Illumina iSeq 100 (assuming both illumina platforms have spot count of 8M, and taking 1/100 of this) [80,000]
miseq Illumina MiSeq [80,000]
r9 Oxford Nanopore R9 flowcell (MIN106) - best performance at 50Gbp output (will assume 20Gbp and 20kb avg read length = 1M reads) [10,000]
flg Oxford Nanopore Flongle flowcell (FLG001) - best performance at 2Gbp output (1/25 of r9) (assuming 10% of r9 output) [1,000]
pending options:
r10 Oxford Nanopore R10 flowcell (MIN107) <- not implemented
-j INT If using Deep Simulator (r9,flg) specify if you want to only generate fast5 files.
Available Options:
1 Full process from fast5 generation to fastq/basecalling
2 Exit early. Generated only fast5 files
-o DIR Output directory (combined fastq file for classification will be at "$outdir/simulated.fastq")
NOTES:
Input abundance profile (abundance based on proportion bps of reads, not genome size)
____________________________________________________________________________________________________
References:
1. O. Tange (2011): GNU Parallel - The Command-Line Power Tool, ;login: The USENIX Magazine, February 2011:42-47.
EOF
}
# ARGUMENTS
# parse args
while getopts "ht:i:p:r:j:H:o:" OPTION
do
case $OPTION in
h) usage; exit 1 ;;
t) THREADS=$OPTARG ;;
i) INPUT=$OPTARG ;;
p) PLATFORM=$OPTARG ;;
H) HOME_DIR=$OPTARG ;;
r) READSCOUNT=$OPTARG;;
j) r9cfg=$OPTARG;;
o) OUTPUT=$OPTARG;;
?) usage; exit ;;
esac
done
# check args
if [[ -z "$THREADS" ]]; then printf "%s\n" "Please specify number of threads (-t)."; exit; fi
if [[ -z "$INPUT" ]]; then printf "%s\n" "Please specify input tsv (-i)."; exit; fi
if [[ -z "$PLATFORM" ]]; then printf "%s\n" "Please specify sequencing platform (-p)."; exit; fi
#if [[ -z $READSCOUNT ]]; then READSCOUNT=10000; fi; # 20200520, readcount will be based on platform type
if [[ -z $HOME_DIR ]]; then HOME_DIR=$PWD; fi
if [[ -z $r9cfg ]]; then r9cfg=1; fi
if [[ -z $OUTPUT ]]; then printf "%s\n" "Please specify a final output directory (-o)."; exit; fi
if [[ ! -d "$OUTPUT" ]]; then mkdir -p "$OUTPUT"; fi
#Activate the correct environment for simulator (iss and deepsim)
# setup other variables
absolute_path_x="$(readlink -fn -- "$0"; echo x)"
absolute_path_of_script="${absolute_path_x%x}"
scriptdir=$(dirname "$absolute_path_of_script")
runtime=$(date +"%Y%m%d%H%M%S%N")
dn=$(dirname "$INPUT")
bn=$(basename "$INPUT")
#outdir="$dn/metasim-$bn"
outdir="$OUTPUT"
tmp="$outdir/tmp"
if [[ ! -d "$tmp" ]]; then
mkdir -p "$tmp"
fi
# MAIN
echo "Checking that abundance profile sums to 1.000000 (input.tsv column 2)."
check=$(awk -F'\t' '{x+=$2}END{printf("%f",x)}' "$INPUT" | cut -c1-8)
if [[ "$check" != "1.000000" ]]; then
echo "Input abundances sum to $check"
echo "They must sum to 1 within a tolerance of <1 millionth (i.e. 1.000000). Exiting."; exit
else
echo "Check successful."
fi
# get updated assembly summary refseq (asr)
echo "Pulling latest 'assembly_summary_refseq.txt files."
asr="$tmp/assembly_summary_refseq.txt"
for k in "archaea" "bacteria" "fungi" "invertebrate" "other" "plant" "protozoa" "vertebrate_mammalian" "vertebrate_other" "viral"; do
echo "downloading refseq summary for $k"
if [[ ! -f "$asr.tmp-$k" ]]; then
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/$k/assembly_summary.txt --output-document "$asr.tmp-$k"
fi
done 2> /dev/null
# combine all
find "$tmp" -maxdepth 1 -name "*assembly_summary_refseq.txt.tmp-*" -exec cat {} + > "$asr"
asri="$tmp/asr_input.tsv"
awk -F'\t' '{if(FNR==NR){rs[$6]=$0}else{printf("%s\n",rs[$1])}}' "$asr" "$INPUT" > "$asri"
# check if any accession are missing from the overlap
# find overlap between it and the input
check=$(comm -23 <(cut -f1 "$INPUT" | sort) <(cut -f6 "$asri" | sort))
if [[ "$check" != "" ]]; then
echo "The following accessions did not have a corresponding"
echo "path in the assembly summary refseq file:"
echo "$check"
echo "Please remove or subsitute the accession(s) above, and resubmit."; exit
fi
# pull ftp paths and download reference genomes for accessions in input tsv
# 1 assembly_accession
# 6 taxid <- strain if available, otherwise is species taxid
# 7 species_taxid
# 8 organism_name
# 9 infraspecific_name
# 20 ftp_path
mkdir -p "$outdir/$PLATFORM"
while read x; do
acc=$(printf "$x" | cut -f1)
taxid=$(printf "$x" | cut -f6)
name=$(printf "$x" | cut -f8)
path=$(printf "$x" | cut -f20)
bn=$(basename "$path")
echo " wgetting: $acc, $taxid, $name"
wget "$path/${bn}_genomic.fna.gz" --output-document "$outdir/$taxid.fasta.gz" 2> /dev/null
gunzip -f "$outdir/$taxid.fasta.gz"
# put all sequence strings under each header into a single line
awk '{if(NR == 1){printf("%s\n", $0)}else{if(substr($0,1,1) == ">"){printf("\n%s\n", $0)} else {printf("%s", $0)}}}END{printf("\n")}' "$outdir/$taxid.fasta" > "$outdir/$taxid.fasta.tmp"
# only retain contigs/assemblies >1 Kbp
sed $'$!N;s/\\\n/\t/' "$outdir/$taxid.fasta.tmp" | awk -F'\t' '{if(length($2)>=1000){printf("%s\n%s\n",$1,$2)}}' > "$outdir/$taxid.fasta"
rm "$outdir/$taxid.fasta.tmp"
abu=$(grep -P "^$taxid\t" "$INPUT" | cut -f2)
# calculate number reads based on abundance in input and on:
# ILLUMINA (~4-8 million reads output), output 8 million reads
# MISEQ 2 x 150bp [80,000]
# ISEQ 2 x 150f [80,000]
# OXFORD MINION (r9 ~20Gbp @ 20Kb avg read length), output 1 million reads
# R9 rapid library kit RAD004 [10,000]
# FLG rapid library kit RAD004 [1,000]
# R10 ligation library kit LSK009 [n/a]
# InSilicoSeq
# --cpus <int>, -p <int>
# number of cpus to use. (default: 2).
# --genomes <genomes.fasta> [<genomes.fasta> ...], -g <genomes.fasta> [<genomes.fasta> ...]
# Input genome(s) from where the reads will originate
# --draft <draft.fasta> [<draft.fasta> ...]
# Input draft genome(s) from where the reads will
# originate
# If you have draft genome files containing contigs, you can give them to the --draft option:
# --n_reads <int>, -n <int>
# Number of reads to generate (default: 1000000). Allows
# suffixes k, K, m, M, g and G (ex 0.5M for 500000).
# --model <npz>, -m <npz>
# Error model file. (default: None). Use HiSeq, NovaSeq
# or MiSeq for a pre-computed error model provided with
# the software, or a file generated with iss model. If
# you do not wish to use a model, use --mode basic or
# --mode perfect. The name of the built-in models are
# case insensitive.
# --output <fastq>, -o <fastq>
# Output file prefix (Required)
if [[ "$PLATFORM" == "iseq" ]]; then
count=$(printf "$abu" | awk '{printf("%.0f",$0*80000)}')
echo "Simulating $count reads for $taxid, $name, $PLATFORM, at abundance of $abu"
model=$HOME_DIR"/data/iss_model_iSeq_min120.npz"
source activate simulator
iss generate -p "$THREADS" --draft "$outdir/$taxid.fasta" -n "$count" -m "$model" -o "$outdir/$PLATFORM/$taxid" 2> "$outdir/error.log"
elif [[ "$PLATFORM" == "miseq" ]]; then
count=$(printf "$abu" | awk '{printf("%.0f",$0*80000)}')
echo "Simulating $count reads for $taxid, $name, $PLATFORM, at abundance of $abu"
model="MiSeq"
source activate simulator
iss generate -p "$THREADS" --draft "$outdir/$taxid.fasta" -n "$count" -m "$model" -o "$outdir/$PLATFORM/$taxid" 2> "$outdir/error.log"
elif [[ "$PLATFORM" == "r9" || "$PLATFORM" == "flg" ]]; then
if [[ "$PLATFORM" == "r9" ]]; then
# making r9 total reads default 10,000
count=$(printf "$abu" | awk '{printf("%.0f",$0*10000)}')
elif [[ "$PLATFORM" == "flg" ]]; then
# making flg total reads default 1,000
count=$(printf "$abu" | awk '{printf("%.0f",$0*1000)}')
fi
echo "Simulating $count reads for $taxid, $name, $PLATFORM, at abundance of $abu"
r9ScriptLocation=$HOME_DIR"/scripts/"
totalCountNucleotides=$( grep "^[^>]" "$outdir/$taxid.fasta" | tr -d "\n" | wc -c )
seqs=($( grep -R "^>" "$outdir/${taxid}.fasta" | tr " " "|" ))
total=${#seqs[*]}
if [ ! -d $r9ScriptLocation"../tmp" ]; then
mkdir $r9ScriptLocation"../tmp"
fi
if [ -d $r9ScriptLocation"../tmp/seqs" ]; then
rm -rf $r9ScriptLocation"../tmp/seqs"
fi
mkdir $r9ScriptLocation"../tmp/seqs/"
if [ -d $outdir/"$PLATFORM/$taxid" ]; then
rm -rf $outdir/"$PLATFORM/$taxid"
fi
if [ ! -d $outdir/"$PLATFORM/logs" ]; then
mkdir $outdir/"$PLATFORM/logs"
fi
if [ -f "$outdir/$PLATFORM/logs/pythonLog.txt" ]; then
rm "$outdir/$PLATFORM/logs/pythonLog.txt"
fi
if [ -f "$outdir/$PLATFORM/logs/simulationLog.txt" ]; then
rm "$outdir/$PLATFORM/logs/simulationLog.txt"
fi
mkdir -p $outdir/$PLATFORM/$taxid
source activate simulator
python "$r9ScriptLocation/"separate_seqs.py -i $outdir/$taxid.fasta -o $r9ScriptLocation"../tmp/seqs/" -n "$count" >> "$outdir/$PLATFORM/logs/pythonLog.txt" 2>&1
conda deactivate
files=()
while IFS= read -r -d $'\0'; do
files+=("$REPLY")
done < <(find ${r9ScriptLocation}"/../tmp/seqs/" -name "*.fasta" -print0)
for (( i=0; i < "${#files[@]}" ; i++ ))
do
length=$( grep -e '^[^>]' ${files[$i]} | tr -d "\n" | wc -c )
count_seq=$( echo "scale=20; 0.5+($count*$length)/$totalCountNucleotides" | bc -l | xargs printf %.0f )
echo "count_seq equals $count_seq"
bash "$r9ScriptLocation"/simulate.sh \
-i ${files[$i]} \
-n $count_seq \
-o "$outdir/$PLATFORM/$taxid" \
-c $THREADS \
-g "CPU" \
-d $HOME_DIR"/src/DeepSimulator" \
-j $r9cfg
find "$outdir/$PLATFORM/$taxid" -name "fast5" -type d -exec rm -rf "{}" \;
done >> "$outdir/$PLATFORM/logs/simulationLog.txt" 2>&1
rm -rf ${r9ScriptLocation}"/../tmp/seqs/*"
echo "done with this file $outdir $taxid"
fi
# NOTES:
# name your output fastq per taxid with the taxid, such that "sed 's/_R.*//'" will return ONLY the taxid
done < "$asri"
if [[ "$PLATFORM" == "r9" || "$PLATFORM" == "flg" ]]; then
echo "merging all $PLATFORM fastq files"
bash "$HOME_DIR/scripts/remapOxfordFastq.sh" \
-i "$outdir/$PLATFORM/" \
-o "pass_mapped.fastq" && find "$outdir/$PLATFORM/" \
-maxdepth 5 \
-name "pass_mapped.fastq" \
-exec cat {} + > $outdir"/simulated.fastq"
mv "$outdir/simulated.fastq" "$OUTPUT/"
rm -rf "$outdir/$PLATFORM"
elif [[ "$PLATFORM" == "iseq" || "$PLATFORM" == "miseq" ]]; then
echo "fixing headers and combining fastqs"
# rename headers for 'taxid'
find "$outdir/$PLATFORM" -maxdepth 1 -name "*fastq" | while read fq; do
bn=$(basename "$fq" | sed 's/_R.*//')
sed $'$!N;s/\\\n/\t/' "$fq" | sed $'$!N;s/\\\n/\t/' | awk -v name="$bn" -F'\t' '{printf("@%s\n%s\n%s\n%s\n",name,$2,$3,$4)}' > "$fq.tmp"
done
# merge all fastq
find "$outdir/$PLATFORM" -maxdepth 1 -name "*fastq.tmp" -exec cat {} + > "$outdir/$PLATFORM/fastq.merged"
# rename headers for 'taxid-readID'
sed $'$!N;s/\\\n/\t/' "$outdir/$PLATFORM/fastq.merged" | sed $'$!N;s/\\\n/\t/' | awk -F'\t' '{printf("%s-%s\n%s\n%s\n%s\n",$1,NR,$2,$3,$4)}' > "$outdir/simulated.fastq"
mv "$outdir/simulated.fastq" "$OUTPUT/"
rm -rf "$outdir/$PLATFORM"
fi