split seq function added

novapyth · Aug 18, 2021 · 535bd15 · 535bd15
1 parent d5b8a03
commit 535bd15
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 29 deletions.
diff --git a/README.md b/README.md
@@ -604,6 +604,27 @@ Sequencing coverage of the given FASTQ file
 <a href="https://reneshbedre.github.io/blog/seqcov.html" target="_blank">Description and Working example</a>
 
 
+### Split the sequence into smaller subsequences
+
+latest update v2.0.6
+
+`bioinfokit.analys.Fasta.split_seq(seq, seq_size, seq_overlap, any_cond, outfmt)`
+
+Parameters | Description
+------------ | -------------
+`seq` | Input sequence [string]
+`seq_size` | subsequence size [int][default: 3]
+`seq_overlap` | Split the sequence in overlap mode [bool][default: True]
+`any_cond` | Split sequence based on a condition. Not yet defined.
+`outfmt` | Output format for the subsequences. If set to 'fasta', the subsequences are written to a file named `output_chunks.fasta` in the current working directory ['list' or 'fasta'][default: 'list']
+
+Returns:
+
+Subsequences printed as a list, or written to a FASTA file (`output_chunks.fasta`)
+
+<a href="https://reneshbedre.com/blog/split-seq.html" target="_blank">Description and Working example</a>
+
+
 ### Reverse complement of DNA sequence
 
 latest update v2.0.4

diff --git a/VERSIONLOG.md b/VERSIONLOG.md
@@ -1,3 +1,6 @@
+v2.0.6 has the following updates and changes (August 18, 2021)
+- New function `Fasta.split_seq` added in `analys` module for splitting the sequence into smaller subsequences 
+
 v2.0.5 has the following updates and changes (August 16, 2021)
 - New function `HtsAna.merge_featureCount` added in `analys` module for merging the counts for all samples
   obtained from featureCounts 

diff --git a/bioinfokit/__init__.py b/bioinfokit/__init__.py
@@ -1,5 +1,5 @@
 name = "bioinfokit"
-__version__ = "2.0.5"
+__version__ = "2.0.6"
 __author__ = "Renesh Bedre"
 
 
diff --git a/bioinfokit/analys.py b/bioinfokit/analys.py
@@ -160,38 +160,41 @@ def split_fasta(file="fasta_file", n=2, bases_per_line=60):
             out_file.close()
 
     @staticmethod
-    def split_seq(seq="sequence", seq_size=10, seq_overlap=True, any_cond=False):
+    def split_seq(seq=None, seq_size=3, seq_overlap=True, any_cond=False, outfmt='list'):
         """
         Split a nucleotide sequence into smaller chunks
         Parameters
         seq: Nucleotide sequence to split
         seq_size: Sequence chunk size
         seq_overlap: Split sequence in overlap mode
+        any_cond: any conditions for splitting; not yet defined
+        outfmt: Split sequence output format (list or fasta) [default: list]
         """
+        if outfmt not in ['list', 'fasta']:
+            raise ValueError('Invalid value for outfmt')
+        if seq is None:
+            raise ValueError('Provide the input sequence')
+        chunk_counter = 1
+        temp_chunks = []
         if seq_overlap:
-            seq_chunks = [seq[i:i+seq_size] for i in range(0, len(seq), seq_size-1)]
+            seq_chunks = [seq[i:i+seq_size] for i in range(0, len(seq), seq_size-(seq_size-1))]
             if any_cond:
-
+                for s in seq_chunks:
+                    if s[-1] != 'G':
+                        temp_chunks.append(s[:-1])
         else:
             seq_chunks = [seq[i:i+seq_size] for i in range(0, len(seq), seq_size)]
-
-
-        seq_ids = []
-        fasta_iter = Fasta.fasta_reader(file)
-        for record in fasta_iter:
-            header, seq = record
-            seq_ids.append(header)
-        split_ids_list = np.array_split(seq_ids, n)
-        for ind, i in enumerate(split_ids_list):
-            out_file = open('output_' + str(ind) + '.fasta', 'w')
-            value = [1] * len(i)
-            dict_list = dict(zip(i, value))
-            fasta_iter = Fasta.fasta_reader(file)
-            for record in fasta_iter:
-                fasta_header, seq = record
-                if fasta_header.strip() in dict_list.keys():
-                    out_file.write(">" + fasta_header + "\n" + '\n'.join(wrap(seq, bases_per_line)) + "\n")
-            out_file.close()
+        if any_cond:
+            seq_chunks = temp_chunks
+            seq_size = seq_size-1
+        if outfmt == 'fasta':
+            out_fasta_file = open('output_chunks.fasta', 'w')
+            for s in seq_chunks:
+                if len(s) == seq_size:
+                    out_fasta_file.write(">" + str(chunk_counter) + "\n" + '\n'.join(wrap(s, 60)) + "\n")
+                    chunk_counter += 1
+        elif outfmt == 'list':
+            print([s for s in seq_chunks if len(s)==seq_size])
 
 
 class fastq:
@@ -2222,13 +2225,6 @@ def get_bg_counts(species=None, check_ids=False):
         else:
             return df, bg_gene_count, bg_trn_count, bg_phytid_count, plant_name
 
-        # bg_gene_count, bg_trn_count, bg_phytid_count = genfam.get_bg_counts(df)
-        # plant_name = 'Amaranthus hypochondriacus v2.1 (Amaranth)'
-        # bg_gene_count = df['loc_len'].sum()
-        # bg_trn_count = df['trn_len'].sum()
-        # bg_phytid_count = df['phyt_id_len'].sum()
-        # return bg_gene_count, bg_trn_count, bg_phytid_count
-
     @staticmethod
     def get_rec_dicts(df=None, glist=None, sname=None, loclen=None, gop=None, gof=None, goc=None):
         df1_glist = df[glist]