Lineanator

bhatarchanas · Nov 1, 2016 · bc37bc4 · bc37bc4
commit bc37bc4
Show file tree

Hide file tree

Showing 4 changed files with 358 additions and 0 deletions.
diff --git a/README.txt b/README.txt
@@ -0,0 +1,40 @@
+Introduction:
+Lineanator uses USEARCH and NCBI to create a database file with confidences up to the species level. 
+This database file can be used in MC-SMRT to assign taxonomy and confidences to OTUs in a microbiome community.
+
+Installation and Dependencies: 
+1) Ruby gems such as bio, troloop and nokogiri are required for Lineanator to function. Use the "gem install {name_of_the_gem}" command to install these gems.
+2) Download and install usearch v8.1.1861_i86linux64. 
+   Create a soft link pointing towards this version of usearch and name it "usearch". The soft link name HAS TO BE usearch, this is VERY IMPORTANT.
+
+Usage: 
+ruby lineanator.rb [-h] [-d DUMP_FILE] [-s SEQ_FILE] [-t TAB_DELIMITED_LINEAGE_OUTPUT_FILE] [-f FASTA_LINEAGE_OUTPUT_FILE] [-p PLACE_HOLDER_NAMES_FILE]
+
+Arguments explained:
+1) DUMP_FILE - This text file serves as a source to map between the gi id's in the SEQ_FILE and get the tax id's corresponding to each gi id. 
+   The dump file we used was taken from NCBI's taxonomy FTP (ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/). 
+   The directory called gi_taxid_nucl.zip in the FTP had a file with gi id in the first column and tax id in the second column (no headers).
+
+2) SEQ_FILE - This is a FASTA file with all the 16s sequences which you want to build a database from. 
+   The headers of the FASTA file should look like ">gi|219846313|ref|NR_025903.1| Gemella haemolysans strain ATCC 25563 16S ribosomal RNA gene, partial sequence" for example.
+   When all the 16s sequences are taken from NCBI - Command will be up soon, the headers appear in this format at once. This is what we used to create our database.
+
+3) TAB_DELIMITED_LINEAGE_OUTPUT_FILE - This is the name of an output file which will have the lineage as a tab delimited file along with the gi id as one of the columns.
+
+4) FASTA_LINEAGE_OUTPUT_FILE - This is the name of the FASTA output file which will have the lineage in the headers. 
+   The lineage in the header is in the format which is required by USEARCH to create the database file. 
+
+5) PLACE_HOLDER_NAMES_FILE - This is the file which will contain a list (a column) of all the words which can confuse the training process for building a database. 
+   Each line of the text file contains  a word to match the label/header in the FASTA file. All the sequence headers which match to any of the words in this list will be removed before training the database.
+
+Output files:
+Two of the output files are the ones which you gave as arguments to lineanator, these are expalined above.
+
+The basename of the FASTA_LINEAGE_OUTPUT_FILE given by you is used for creating other files using USEARCH commands. 
+For example, if the name of your FASTA_LINEAGE_OUTPUT_FILE is "16sMicrobial_ncbi_lineage.fasta", the output files created will be:
+1) 16sMicrobial_ncbi_lineage_filtered.fasta - File without the placeholder names.
+2) 16sMicrobial_ncbi_lineage_confidence.tc - File after training the sequences, also has the confidences.
+3) 16sMicrobial_ncbi_lineage_reference_database.udb - Final database file in udb format
+
+Built with: 
+ruby 2.2.1p85 (2015-02-26 revision 49769) [x86_64-linux]
diff --git a/get_all_xml.rb b/get_all_xml.rb
@@ -0,0 +1,50 @@
+require 'bio'
+require 'trollop'
+
+opts = Trollop::options do
+  opt :dumpfile, "Provide the dump file with gi id-tax id pairs using the -d argument.", :type => :string, :short => "-d"
+  opt :seqfile, "Provide the 16s sequences in a FASTA format file using the -s argument.", :type => :string, :short => "-s"
+end
+
+gi_taxid_nucl_dmp_file = File.open(opts[:dumpfile], "r")
+microbial_ncbi_fasta_file = Bio::FlatFile.auto(opts[:seqfile])
+#xml_file = File.open("all_xml_strings.txt", "w")
+tax_id_file = File.open("tax_id_for_16s.txt", "w")
+
+count_seqs = 0
+microbial_ncbi_hash = {}
+microbial_ncbi_fasta_file.each do |entry|
+	count_seqs += 1
+	#puts entry.definition
+	gi_id = entry.definition.split("|")[1]
+	full_header = entry.definition.gsub(" ", "_")
+	na_seq = entry.naseq.upcase
+	microbial_ncbi_hash[gi_id] = [full_header, na_seq]
+end
+#puts microbial_ncbi_hash
+puts "Number of sequences in the FASTA file are: #{count_seqs}"
+
+count_tax_id_found = 0
+gi_taxid_nucl_hash = {}
+gi_taxid_nucl_dmp_file.each do |line|
+	line_split = line.split("\t")
+	gi_id = line_split[0].strip
+	tax_id = line_split[1].strip
+	#puts gi_id
+	if microbial_ncbi_hash.key?(gi_id)
+		#puts gi_id, microbial_ncbi_hash[gi_id]
+		count_tax_id_found += 1
+		gi_taxid_nucl_hash[gi_id] = tax_id
+		tax_id_file.puts("#{gi_id}\t#{tax_id}")
+	end
+end
+#puts gi_taxid_nucl_hash
+puts "Number of sequences whose gi id corresponded to tax id which were found in the dump file: #{count_tax_id_found}"
+
+=begin
+gi_taxid_nucl_hash.each do |gi_id, tax_id|
+	ncbi = Bio::NCBI::REST::EFetch.new
+	xml_string = ncbi.taxonomy(tax_id, "xml")
+	xml_file.puts(xml_string)
+end
+=end
diff --git a/lineanator.rb b/lineanator.rb
@@ -0,0 +1,37 @@
+require 'trollop'
+
+opts = Trollop::options do
+	opt :dumpfile, "Provide the dump file with gi id-tax id pairs using the -d argument.", :type => :string, :short => "-d"
+  	opt :seqfile, "Provide the 16s sequences in a FASTA format file using the -s argument.", :type => :string, :short => "-s"
+  	opt :tablineageoutfile, "Provide the name of the output file which will contain the lineage in a tab delimited format using the -t argument.", :type => :string, :short => "-t"
+	opt :fastalineageoutfile, "Provide the name of the output FASTA file in which the headers have the lineage using the -f argument.", :type => :string, :short => "-f"
+	opt :placeholdernamesfile, "Provide the name of the file in which the placeholder names are present using the -p argument.", :type => :string, :short => "-p"
+end
+
+##### Assigning variables to the input and making sure we got all the inputs
+opts[:dumpfile].nil?              == false  ? dump_file = opts[:dumpfile]                    : abort("Must supply a dump file which has all the gi id-tax id pairs using the '-d' argument.")
+opts[:seqfile].nil?               == false  ? seq_file = opts[:seqfile]                      : abort("Must supply a FASTA file which has all the 16s sequences using the '-s' argument.")
+opts[:tablineageoutfile].nil?     == false  ? tab_out_file = opts[:tablineageoutfile]        : abort("Must supply an output file name which will contain the lineage in a tab format using the '-t' argument.")
+opts[:fastalineageoutfile].nil?   == false  ? fasta_out_file = opts[:fastalineageoutfile]    : abort("Must supply an output file name which will contain the lineage in a FASTA format using the '-f' argument.")
+opts[:placeholdernamesfile].nil?  == false  ? ph_names_file = opts[:fastalineageoutfile]     : abort("Must supply a file which contains the place holder names using the '-p' argument.")
+out_fasta_basename = File.basename(fasta_out_file, ".*")
+
+# Run the script which gives a file with all the xmls
+puts "Getting the file with XMLs for all tax id's ready..."
+`ruby get_all_xml.rb -d #{dump_file} -s #{seq_file}`
+
+# Run the script which parses the file with all xmls
+puts "Parsing the file with all XMLs..."
+`ruby parse_all_xml_string_2.rb -s #{seq_file} -t #{tab_out_file} -f #{fasta_out_file}` 
+
+# Run the command to remove place holder names
+puts "Removing place holder names..."
+`usearch -fastx_getseqs #{fasta_out_file} -label_words #{ph_names_file} -label_field tax -notmatched #{out_fasta_basename}_filtered.fasta -fastaout excluded.fa`
+
+# Run the command to train the sequences and obtain confidences
+puts "Training the sequences to give a file with confidences..."
+`usearch -utax_train #{out_fasta_basename}_filtered.fasta -taxconfsout #{out_fasta_basename}_confidence.tc -utax_splitlevels NVpcofgs -utax_trainlevels dpcofgs`
+
+# Run the command to build the DB file
+puts "Making the DB file! Final step..."
+`usearch -makeudb_utax #{fasta_out_file} -taxconfsin #{out_fasta_basename}_confidence.tc -output #{out_fasta_basename}_reference_database.udb`
diff --git a/parse_all_xml_string_2.rb b/parse_all_xml_string_2.rb
@@ -0,0 +1,231 @@
+require 'bio'
+require 'nokogiri'
+require 'trollop'
+
+opts = Trollop::options do
+	opt :seqfile, "Provide the 16s sequences in a FASTA format file using the -s argument.", :type => :string, :short => "-s"
+  	opt :tablineageoutfile, "Provide the name of the output file which will contain the lineage in a tab delimited format using the -t argument.", :type => :string, :short => "-t"
+	opt :fastalineageoutfile, "Provide the name of the output FASTA file in which the headers have the lineage using the -f argument.", :type => :string, :short => "-f"
+end
+
+# NOTE: Got the all_xml_strings file and tax_id_for_16s.txt from get_all_xml.rb
+
+# Read the file which maps the tax_id to gi_id (only for ones in NCBI)
+tax_id_file = File.open("tax_id_for_16s.txt", "r")
+
+# open the doc with all the xml strings as fragments suing nokogiri
+doc = Nokogiri::XML::DocumentFragment.parse File.read("all_xml_strings.txt")
+
+# File with all the origianl NCBI 16s seqs
+ncbi_fasta_file = Bio::FlatFile.auto(opts[:seqfile])
+
+# Write lineage with gi id into a file
+lineage_file = File.open(opts[:tablineageoutfile], "w")
+lineage_file.puts("gi_id\tkingdom\tphylum\tclass\torder\tfamily\tgenus\tspecies")
+
+# Write the lineage along with seqs in this FASTA file
+ncbi_lineage = File.open(opts[:fastalineageoutfile], "w") 
+
+# Get a hash of all gi and tax id pairs
+tax_gi_ids_hash = {}
+tax_id_file.each_line do |line|
+	gi_id = line.split("\t")[0].chomp  
+	tax_id = line.split("\t")[1].chomp ### Has repeats
+ 	tax_gi_ids_hash[gi_id] = tax_id
+end
+#puts tax_gi_ids_hash.length
+
+# Creating hashes which record the lineage
+all_taxa_hash = {}
+no_rank_hash = {}
+
+# loop through each child node from the fragmenting
+doc.children.each do |child|
+
+	#puts child
+
+	# Get the child node for each taxaset
+	if child.to_s.start_with?("<TaxaSet")
+
+		# convert taxaset into an xml string 
+		xml_string = child.to_s
+		xml_string_noko = Nokogiri::XML(xml_string)
+
+		# get taxa ranks and scientific names for each org and store in arrays
+		rank_array = []
+		scientific_name_array = []
+		taxID = ""
+
+		xml_string_noko.xpath('//TaxaSet').each do |taxaSet_element|
+			#puts taxaSet_element
+			taxID = taxaSet_element.xpath('//TaxId').first.text
+			taxaSet_element.xpath('//Rank').each do |rank_element|
+				rank = rank_element.text
+				#puts rank
+				rank_array.push(rank)
+			end
+			taxaSet_element.xpath('//ScientificName').each do |scientific_name_element|
+				scientific_name = scientific_name_element.text
+				#puts scientific_name 
+				scientific_name_array.push(scientific_name)
+			end
+		end
+		#puts taxID
+		#puts rank_array
+		#puts scientific_name_array
+
+		tax_gi_ids_hash.each do |gi_id, tax_id|
+			if tax_id == taxID
+				# Assign null strings to all the attributes 
+				all_taxa_hash[gi_id] = {"kingdom" => "", "phylum" => "", "class" => "", "order" => "", "family" => "", "genus" => "", "species" => ""}
+				# array within the no_rank_hash			
+				array_in_no_rank = []
+
+				(0..rank_array.length-1).each do |each_rank|
+					#puts rank_array[each_rank]
+					if rank_array[each_rank] == "superkingdom"
+						all_taxa_hash[gi_id]["kingdom"] = scientific_name_array[each_rank]
+					elsif rank_array[each_rank] == "phylum"
+						all_taxa_hash[gi_id]["phylum"] = scientific_name_array[each_rank]
+					elsif rank_array[each_rank] == "class"
+						all_taxa_hash[gi_id]["class"] = scientific_name_array[each_rank]
+					elsif rank_array[each_rank] == "order"
+						all_taxa_hash[gi_id]["order"] = scientific_name_array[each_rank]
+					elsif rank_array[each_rank] == "family"
+						all_taxa_hash[gi_id]["family"] = scientific_name_array[each_rank]
+					elsif rank_array[each_rank] == "genus"
+						all_taxa_hash[gi_id]["genus"] = scientific_name_array[each_rank]
+					elsif rank_array[each_rank] == "species"
+						all_taxa_hash[gi_id]["species"] = scientific_name_array[each_rank]
+					end
+
+					# Get the no_rank_hash
+					if rank_array[each_rank] == "no rank"
+						array_in_no_rank.push(scientific_name_array[each_rank])
+						no_rank_hash[gi_id]= array_in_no_rank
+					end
+
+				end
+
+			end
+
+		end
+
+	end
+
+end
+#puts all_taxa_hash.length
+#puts all_taxa_hash
+#puts no_rank_hash 
+
+# Filling the taxa levels which were unavailable from the xml using the no rank levels 
+all_taxa_hash.each do |gi_id, sub_hash|
+	#puts key
+	#puts value
+	counter = 0
+	sub_hash.each do |rank, sc|
+		if sc == ""
+			counter += 1
+			#puts tax_id, rank, sc
+			#puts no_rank_hash[gi_id].length
+
+			# When the first null attribute is getting filled
+			if counter == 1
+				if no_rank_hash[gi_id].length-1 > 1
+					if no_rank_hash[gi_id][0] != "cellular organisms"
+						sub_hash["species"] = no_rank_hash[gi_id][0]
+						sub_hash[rank] = "unsure_"+no_rank_hash[gi_id][2]
+					else
+						sub_hash[rank] = "unsure_"+no_rank_hash[gi_id][1]
+					end
+				elsif no_rank_hash[gi_id].length-1 == 1
+					if no_rank_hash[gi_id][0] != "cellular organisms"
+						sub_hash["species"] = no_rank_hash[gi_id][0]
+						sub_hash[rank] = "unclassified"
+					else
+						sub_hash[rank] = "unsure_"+no_rank_hash[gi_id][1]
+					end
+				else
+					sub_hash[rank] = "unclassified"
+				end	
+				all_taxa_hash[gi_id] = sub_hash
+
+			# When the second null attribute is getting filled
+			elsif counter == 2
+				#puts no_rank_hash[gi_id]
+				if no_rank_hash[gi_id].length-1 > 2
+					if no_rank_hash[gi_id][0] != "cellular organisms"
+						sub_hash[rank] = "unsure_"+no_rank_hash[gi_id][3]
+					else
+						sub_hash[rank] = "unsure_"+no_rank_hash[gi_id][2]
+					end
+				elsif no_rank_hash[gi_id].length-1 == 2
+					if no_rank_hash[gi_id][0] != "cellular organisms"
+						sub_hash["species"] = no_rank_hash[gi_id][0]
+						sub_hash[rank] = "unclassified"
+					else
+						sub_hash[rank] = "unsure_"+no_rank_hash[gi_id][2]
+					end
+				else
+					sub_hash[rank] = "unclassified"
+				end	
+				all_taxa_hash[gi_id] = sub_hash
+
+			# When the third null attribute is getting filled
+			elsif counter == 3
+				#puts no_rank_hash[gi_id]
+				if no_rank_hash[gi_id].length-1 > 3
+					if no_rank_hash[gi_id][0] != "cellular organisms"
+						sub_hash[rank] = "unsure_"+no_rank_hash[gi_id][4]
+					else
+						sub_hash[rank] = "unsure_"+no_rank_hash[gi_id][3]
+					end
+				elsif no_rank_hash[gi_id].length-1 == 3
+					if no_rank_hash[gi_id][0] != "cellular organisms"
+						sub_hash["species"] = no_rank_hash[gi_id][0]
+						sub_hash[rank] = "unclassified"
+					else
+						sub_hash[rank] = "unsure_"+no_rank_hash[gi_id][3]
+					end
+				else
+					sub_hash[rank] = "unclassified"
+				end	
+				all_taxa_hash[gi_id] = sub_hash
+
+			# If there are more than 3 null attributes, just call it "unclassified"
+			else
+				sub_hash[rank] = "unclassified"
+				all_taxa_hash[gi_id] = sub_hash
+			end	
+		end	
+	end	
+end
+#puts all_taxa_hash.length
+#puts all_taxa_hash
+
+# Writing lineage to files 
+ncbi_fasta_file.each do |entry|
+	gi_id = entry.definition.split("|")[1].chomp
+
+	if all_taxa_hash.key?(gi_id)
+
+		# Split and modify the species names and definition lines
+		def_mod_split = entry.definition.chomp.split("|")
+    	def_string = def_mod_split[0]+"|"+def_mod_split[1]+"|"+def_mod_split[2]+"|"+def_mod_split[3]+";"
+    	def_string_2 = def_string.tr("\s","")
+  		species_name = def_mod_split[4].split(" ")[0..1].join("_").tr('^A-Za-z0-9_', '')
+    	species_name_1 = ",s:"+species_name+";"
+
+		# Get the lineage and write in the FASTA file 
+		tax_string = "tax=d:"+all_taxa_hash[gi_id]["kingdom"]+",p:"+all_taxa_hash[gi_id]["phylum"]+",c:"+all_taxa_hash[gi_id]["class"]+",o:"+all_taxa_hash[gi_id]["order"]+",f:"+all_taxa_hash[gi_id]["family"]+",g:"+all_taxa_hash[gi_id]["genus"]
+		to_print = (">"+def_string_2+tax_string+species_name_1)
+    	ncbi_lineage.puts(to_print)
+    	ncbi_lineage.puts(entry.naseq.upcase)
+
+    	# Get the lineage and write in the tab delimited file 
+    	lineage_file.puts("#{gi_id}\t#{all_taxa_hash[gi_id]["kingdom"]}\t#{all_taxa_hash[gi_id]["phylum"]}\t#{all_taxa_hash[gi_id]["class"]}\t#{all_taxa_hash[gi_id]["order"]}\t#{all_taxa_hash[gi_id]["family"]}\t#{all_taxa_hash[gi_id]["genus"]}\t#{species_name}")
+	end
+end
+
+ncbi_lineage.close
+lineage_file.close