Skip to content

Commit

Permalink
split seq function added
Browse files Browse the repository at this point in the history
  • Loading branch information
reneshbedre committed Aug 18, 2021
1 parent d5b8a03 commit 535bd15
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 29 deletions.
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,27 @@ Sequencing coverage of the given FASTQ file
<a href="https://reneshbedre.github.io/blog/seqcov.html" target="_blank">Description and Working example</a>


### Split the sequence into smaller subsequences

latest update v2.0.6

`bioinfokit.analys.Fasta.split_seq(seq, seq_size, seq_overlap, any_cond, outfmt)`

Parameters | Description
------------ | -------------
`seq` | Input sequence [string]
`seq_size` | Subsequence size [int][default: 3]
`seq_overlap` | Split the sequence in overlap mode [bool][default: True]
`any_cond` | Split the sequence based on a condition. Not yet defined [bool][default: False]
`outfmt` | Output format for the subsequences. If set to 'fasta', the subsequences are saved to a file named `output_chunks.fasta` in the current working directory ['list' or 'fasta'][default: 'list']

Returns:

Subsequences in list or fasta file (output_chunks.fasta) format

<a href="https://reneshbedre.com/blog/split-seq.html" target="_blank">Description and Working example</a>


### Reverse complement of DNA sequence

latest update v2.0.4
Expand Down
3 changes: 3 additions & 0 deletions VERSIONLOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
v2.0.6 has the following updates and changes (August 18, 2021)
- New function `Fasta.split_seq` added in `analys` module for splitting the sequence into smaller subsequences

v2.0.5 has the following updates and changes (August 16, 2021)
- New function `HtsAna.merge_featureCount` added in `analys` module for merging the counts for all samples
obtained from featureCounts
Expand Down
2 changes: 1 addition & 1 deletion bioinfokit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = "bioinfokit"
__version__ = "2.0.5"
__version__ = "2.0.6"
__author__ = "Renesh Bedre"


52 changes: 24 additions & 28 deletions bioinfokit/analys.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,38 +160,41 @@ def split_fasta(file="fasta_file", n=2, bases_per_line=60):
out_file.close()

@staticmethod
def split_seq(seq="sequence", seq_size=10, seq_overlap=True, any_cond=False):
def split_seq(seq=None, seq_size=3, seq_overlap=True, any_cond=False, outfmt='list'):
"""
Split a nucleotide sequence into smaller chunks
Parameters
seq: Nucleotide sequence to split
seq_size: Sequence chunk size
seq_overlap: Split sequence in overlap mode
any_cond: any conditions for splitting; not yet defined
outfmt: Split sequence output format (list or fasta) [default: list]
"""
if outfmt not in ['list', 'fasta']:
raise ValueError('Invalid value for outfmt')
if seq is None:
raise ValueError('Provide the input sequence')
chunk_counter = 1
temp_chunks = []
if seq_overlap:
seq_chunks = [seq[i:i+seq_size] for i in range(0, len(seq), seq_size-1)]
seq_chunks = [seq[i:i+seq_size] for i in range(0, len(seq), seq_size-(seq_size-1))]
if any_cond:

for s in seq_chunks:
if s[-1] != 'G':
temp_chunks.append(s[:-1])
else:
seq_chunks = [seq[i:i+seq_size] for i in range(0, len(seq), seq_size)]


seq_ids = []
fasta_iter = Fasta.fasta_reader(file)
for record in fasta_iter:
header, seq = record
seq_ids.append(header)
split_ids_list = np.array_split(seq_ids, n)
for ind, i in enumerate(split_ids_list):
out_file = open('output_' + str(ind) + '.fasta', 'w')
value = [1] * len(i)
dict_list = dict(zip(i, value))
fasta_iter = Fasta.fasta_reader(file)
for record in fasta_iter:
fasta_header, seq = record
if fasta_header.strip() in dict_list.keys():
out_file.write(">" + fasta_header + "\n" + '\n'.join(wrap(seq, bases_per_line)) + "\n")
out_file.close()
if any_cond:
seq_chunks = temp_chunks
seq_size = seq_size-1
if outfmt == 'fasta':
out_fasta_file = open('output_chunks.fasta', 'w')
for s in seq_chunks:
if len(s) == seq_size:
out_fasta_file.write(">" + str(chunk_counter) + "\n" + '\n'.join(wrap(s, 60)) + "\n")
chunk_counter += 1
elif outfmt == 'list':
print([s for s in seq_chunks if len(s)==seq_size])


class fastq:
Expand Down Expand Up @@ -2222,13 +2225,6 @@ def get_bg_counts(species=None, check_ids=False):
else:
return df, bg_gene_count, bg_trn_count, bg_phytid_count, plant_name

# bg_gene_count, bg_trn_count, bg_phytid_count = genfam.get_bg_counts(df)
# plant_name = 'Amaranthus hypochondriacus v2.1 (Amaranth)'
# bg_gene_count = df['loc_len'].sum()
# bg_trn_count = df['trn_len'].sum()
# bg_phytid_count = df['phyt_id_len'].sum()
# return bg_gene_count, bg_trn_count, bg_phytid_count

@staticmethod
def get_rec_dicts(df=None, glist=None, sname=None, loclen=None, gop=None, gof=None, goc=None):
df1_glist = df[glist]
Expand Down

0 comments on commit 535bd15

Please sign in to comment.