Skip to content

Commit

Permalink
Minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
smortezah committed Jan 13, 2018
1 parent fd7b4ba commit 09c2c94
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 3 deletions.
11 changes: 9 additions & 2 deletions run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ RUN_CRYFA_THREADS=0

### Run different methods to explore redundancy
# NOTE(review): presumably the switch for the whole redundancy step; the two
# flags below select which of its sub-steps are sourced -- confirm against the
# enclosing `if` further down in this file.
RUN_REDUNDANCY=1 # cryfa, DELIMINATE, MFCompress
# Dataset (FASTA): when 1, $script/dl_dataset_redun.sh is sourced.
# NOTE(review): the trailing comment does not say how many GB are needed.
GET_DATASET_REDUN=1 # GB free disk space
# Run & Results: when 1, $script/run_res_redun.sh is sourced.
RUN_RES_REDUN=0


################################################################################
Expand Down Expand Up @@ -289,6 +293,9 @@ then
### Create a folder for results, if it doesn't already exist
if [[ ! -d $result ]]; then mkdir -p $result; fi

### Download datasets
if [[ $GET_DATASET_REDUN -eq 1 ]]; then . $script/dl_dataset_redun.sh; fi

### Run & Results
. $script/run_res_redun.sh;
fi
if [[ $RUN_RES_REDUN -eq 1 ]]; then . $script/run_res_redun.sh; fi
fi
46 changes: 46 additions & 0 deletions script/DownloadArchaea.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Downloads every archaea nucleotide record in RefSeq and stores them, as
# multi-FASTA, in archaea.fa in the current working directory.
#
# Adapted from the NCBI E-utilities sample application "Retrieving large
# datasets":
#   https://www.ncbi.nlm.nih.gov/books/NBK25498/#chapter3.Application_3_Retrieving_large
# The same records can be obtained interactively from the NCBI Nucleotide site
# with the query  archaea[orgn] AND srcdb_refseq[prop]  and "Send to: File
# (FASTA)".
#
# Flow: esearch (usehistory=y) stores the hit list on the NCBI history server;
# efetch then pulls it in batches into tmp.archaea.fa, which is renamed to
# archaea.fa only after every batch has been received, so a partial download
# is never mistaken for a complete one.
#
# Exit status: 0 on success or when the search finds no records; non-zero
# (via die) on any network, parse or file error.

use strict;
use warnings;

use LWP::Simple;

my $organism = 'archaea';

my $query = $organism . '[orgn]+AND+srcdb_refseq[prop]';
print STDERR "Searching RefSeq for $organism: $query\n";

# Assemble the esearch URL. NCBI serves the E-utilities over HTTPS, which
# requires LWP::Protocol::https to be installed alongside LWP.
my $base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/';
my $url  = $base . "esearch.fcgi?db=nucleotide&term=$query&usehistory=y";

# Post the esearch URL. get() returns undef on any failure; without this check
# a failed request would look like "0 records" and exit successfully.
my $output = get($url);
defined($output)
    or die "esearch request failed: $url\n";

# Parse WebEnv, QueryKey and Count (# records retrieved).
my ($web)   = $output =~ /<WebEnv>(\S+)<\/WebEnv>/;
my ($key)   = $output =~ /<QueryKey>(\d+)<\/QueryKey>/;
my ($count) = $output =~ /<Count>(\d+)<\/Count>/;
( defined($web) && defined($key) && defined($count) )
    or die "Unexpected esearch response: WebEnv, QueryKey or Count missing\n";

print STDERR "Found: $count records for $organism\n";
if ($count == 0) {
    exit(0);
}

# Open the temporary output file for writing.
my $tmp_file   = "tmp.$organism.fa";
my $final_file = "$organism.fa";
open(my $out, '>', $tmp_file)
    or die "Can't open $tmp_file for writing: $!\n";

# Retrieve data in batches of 5000.
my $retmax = 5000;
my $ret    = 0;
while ($ret < $count) {
    my $efetch_url = $base . "efetch.fcgi?db=nucleotide&WebEnv=$web"
                   . "&query_key=$key&retstart=$ret"
                   . "&retmax=$retmax&rettype=fasta&retmode=text";

    my $efetch_out = get($efetch_url);
    defined($efetch_out)
        or die "efetch request failed at retstart=$ret "
             . "(partial data left in $tmp_file): $efetch_url\n";

    # Put a newline in front of every FASTA header so records of consecutive
    # batches can never run together. The substitution count is the number of
    # sequences in this batch. The pattern is anchored to the start of a line
    # so that a '>' inside a description is neither split nor counted.
    my $fetched = ($efetch_out =~ s/^>/\n>/mg) || 0;

    # A batch without sequences would leave $ret unchanged and loop forever.
    $fetched > 0
        or die "efetch returned no sequences at retstart=$ret "
             . "(partial data left in $tmp_file)\n";

    print {$out} $efetch_out
        or die "Can't write to $tmp_file: $!\n";

    $ret += $fetched;
    print STDERR "Fetched $ret\n";
}

# Buffered write errors (e.g. disk full) surface at close.
close($out)
    or die "Can't close $tmp_file: $!\n";

rename($tmp_file, $final_file)
    or die "Can't rename $tmp_file to $final_file: $!\n";
41 changes: 41 additions & 0 deletions script/dl_dataset_redun.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#######################################################
# Download Datasets for redundancy (FASTA) -- GB #
# - - - - - - - - - - - - - - - - - - - - #
# Morteza Hosseini [email protected] #
# Diogo Pratas [email protected] #
# Armando J. Pinho [email protected] #
#######################################################
#!/bin/bash

### Create a folder for redundancy exploration datasets
if [[ ! -d "$dataset/$redun" ]]; then mkdir -p "$dataset/$redun"; fi

### Get 'goose' for splitting reads
# Clone only when no checkout exists, so that re-running this step does not
# fail on an already populated ./goose directory.
if [[ ! -d goose ]]; then
  git clone https://github.com/pratas/goose.git
fi
# Build inside a subshell: this file is sourced, so a failed `cd` followed by
# `cd ../..` would otherwise move the calling script out of its directory.
( cd goose/src/ && make )

### Download
# Archaea
if [[ ! -d "$dataset/$redun/$ARCHAEA" ]]; then
  mkdir -p "$dataset/$redun/$ARCHAEA"
fi

# Writes archaea.fa into the current directory when the download completes.
perl "./$script/DownloadArchaea.pl"

### Remove blank lines and move it to dataset folder
# The output goes to a file inside the dataset directory; redirecting to the
# directory itself always fails with "Is a directory".
# NOTE(review): the output file name archaea.fa and the goose-splitreads
# arguments (kept unchanged) are assumptions -- confirm against what
# run_res_redun.sh reads and against the goose documentation.
if [[ -s archaea.fa ]]; then
  if grep -Ev "^$" archaea.fa \
       | ./goose/src/goose-splitreads "complete genome" \
       > "$dataset/$redun/$ARCHAEA/archaea.fa"
  then
    # Delete the download only once it has been processed successfully.
    rm -f archaea.fa
  else
    echo "Processing archaea.fa failed; keeping the downloaded file" >&2
  fi
else
  echo "archaea.fa is missing or empty; nothing to process" >&2
fi

# Bacteria
# Fungi
# Plants
# Viruses
#perl ./$script/DownloadViruses.pl

### Remove blank lines in downloaded file and move it to dataset folder
#cat viruses.fa | grep -Ev "^$" > $dataset/$FA/$VIRUSES/viruses.$fasta
#rm -f viruses.fa
4 changes: 4 additions & 0 deletions script/par.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ DENISOVA_FQ_URL="http://cdna.eva.mpg.de/denisova/raw_reads"
# Abbreviated names
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
HUMAN="HS"
# Groups of the redundancy-exploration datasets. script/dl_dataset_redun.sh
# uses $ARCHAEA as a directory name under $dataset/$redun, and
# script/run_res_redun.sh joins all five into its DATA_SET list.
ARCHAEA="A"
BACTERIA="B"
FUNGI="F"
PLANTS="P"
VIRUSES="V"
DENISOVA="DS"
Synth="Synth"
Expand Down
3 changes: 2 additions & 1 deletion script/run_res_redun.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ CRYFA=1
MFCOMPRESS=1
DELIMINATE=1

DATA_SET="A B F V P";

DATA_SET="$ARCHAEA $BACTERIA $FUNGI $PLANTS $VIRUSES"

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# cryfa
Expand Down

0 comments on commit 09c2c94

Please sign in to comment.