Skip to content

Commit

Permalink
improved
Browse files: browse the repository at this point in the history
  • Loading branch information
kyclark committed Jan 23, 2021
1 parent 643c4e4 commit 9774b22
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 134 deletions.
4 changes: 2 additions & 2 deletions 19_blastomatic/solution1_manual.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ def get_args():
'--blasthits',
metavar='FILE',
type=argparse.FileType('rt'),
help='BLAST output (-outfmt 6)',
help='BLAST -outfmt 6',
required=True)

parser.add_argument('-a',
'--annotations',
help='Annotation file',
help='Annotations file',
metavar='FILE',
type=argparse.FileType('rt'),
required=True)
Expand Down
14 changes: 7 additions & 7 deletions 19_blastomatic/solution2_dict_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ def get_args():
'--blasthits',
metavar='FILE',
type=argparse.FileType('rt'),
help='BLAST output (-outfmt 6)',
help='BLAST -outfmt 6',
required=True)

parser.add_argument('-a',
'--annotations',
help='Annotation file',
help='Annotations file',
metavar='FILE',
type=argparse.FileType('rt'),
required=True)
Expand Down Expand Up @@ -74,16 +74,16 @@ def main():

args = get_args()
annots_reader = csv.DictReader(args.annotations, delimiter=',')
annots = {row['centroid']: row for row in annots_reader}
annots = {row['seq_id']: row for row in annots_reader}

writer = csv.DictWriter(
args.outfile,
fieldnames=['sseqid', 'pident', 'genus', 'species'],
fieldnames=['qseqid', 'pident', 'genus', 'species'],
delimiter=args.delimiter)
writer.writeheader()

hits = csv.DictReader(args.hits,
delimiter='\t',
delimiter=',',
fieldnames=[
'qseqid', 'sseqid', 'pident', 'length',
'mismatch', 'gapopen', 'qstart', 'qend',
Expand All @@ -95,11 +95,11 @@ def main():
if float(hit.get('pident', -1)) < args.pctid:
continue

if seq_id := hit.get('sseqid'):
if seq_id := hit.get('qseqid'):
if info := annots.get(seq_id):
num_written += 1
writer.writerow({
'sseqid': seq_id,
'qseqid': seq_id,
'pident': hit.get('pident', 'NA'),
'genus': info.get('genus') or 'NA',
'species': info.get('species') or 'NA',
Expand Down
18 changes: 9 additions & 9 deletions 19_blastomatic/solution3_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ def get_args():
'--blasthits',
metavar='FILE',
type=argparse.FileType('rt'),
help='BLAST output (-outfmt 6)',
help='BLAST -outfmt 6',
required=True)

parser.add_argument('-a',
'--annotations',
help='Annotation file',
help='Annotations file',
metavar='FILE',
type=argparse.FileType('rt'),
required=True)
Expand Down Expand Up @@ -75,7 +75,7 @@ def main():
args = get_args()
annots = pd.read_csv(args.annotations)
hits = pd.read_csv(args.hits,
delimiter='\t',
delimiter=',',
names=[
'qseqid', 'sseqid', 'pident', 'length', 'mismatch',
'gapopen', 'qstart', 'qend', 'sstart', 'send',
Expand All @@ -84,14 +84,14 @@ def main():

data = []
for _, hit in hits[hits['pident'] >= args.pctid].iterrows():
centroids = annots[annots['centroid'] == hit['sseqid']]
if not centroids.empty:
for _, centroid in centroids.iterrows():
meta = annots[annots['seq_id'] == hit['qseqid']]
if not meta.empty:
for _, info in meta.iterrows():
data.append({
'sseqid': hit['sseqid'],
'qseqid': hit['qseqid'],
'pident': hit['pident'],
'genus': centroid['genus'] or 'NA',
'species': centroid['species'] or 'NA',
'latitude': info['latitude'] or 'NA',
'longitude': info['longitude'] or 'NA',
})

df = pd.DataFrame.from_records(data=data)
Expand Down
14 changes: 7 additions & 7 deletions 19_blastomatic/solution4_pandas_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ def get_args():
'--blasthits',
metavar='FILE',
type=argparse.FileType('rt'),
help='BLAST output (-outfmt 6)',
help='BLAST -outfmt 6',
required=True)

parser.add_argument('-a',
'--annotations',
help='Annotation file',
help='Annotations file',
metavar='FILE',
type=argparse.FileType('rt'),
required=True)
Expand Down Expand Up @@ -75,24 +75,24 @@ def main():
args = get_args()
annots = pd.read_csv(args.annotations)
hits = pd.read_csv(args.hits,
delimiter='\t',
delimiter=',',
names=[
'qseqid', 'sseqid', 'pident', 'length', 'mismatch',
'gapopen', 'qstart', 'qend', 'sstart', 'send',
'evalue', 'bitscore'
])

joined = hits[hits['pident'] >= args.pctid].join(
annots.set_index('centroid'), on='sseqid', how='inner')
annots.set_index('seq_id'), on='qseqid', how='inner')

# joined = pd.merge(hits[hits['pident'] >= args.pctid],
# annots,
# left_on='sseqid',
# right_on='centroid')
# left_on='qseqid',
# right_on='seq_id')

joined.to_csv(args.outfile,
index=False,
columns=['sseqid', 'pident', 'genus', 'species'],
columns=['qseqid', 'pident', 'latitude', 'longitude'],
sep=args.delimiter)

print(f'Exported {joined.shape[0]:,} to "{args.outfile.name}".')
Expand Down
16 changes: 8 additions & 8 deletions 19_blastomatic/tests/blastomatic_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@ def test_good_input() -> None:
print(cmd)
rv, out = getstatusoutput(cmd)
assert rv == 0
assert out == 'Exported 499 to "out.csv".'
assert out == 'Exported 500 to "out.csv".'
assert os.path.isfile(outfile)

reader = csv.DictReader(open(outfile), delimiter=',')
records = list(reader)
assert len(records) == 499
assert len(records) == 500
assert records[0]['qseqid'] == 'CAM_READ_0234442157'
assert records[-1]['qseqid'] == 'JCVI_READ_1095403503430'
finally:
Expand All @@ -92,12 +92,12 @@ def test_delimiter() -> None:
cmd = f'{RUN} -a {META} -b {HITS1} -d "{delim}" -o {outfile}'
rv, out = getstatusoutput(cmd)
assert rv == 0
assert out == f'Exported 499 to "{outfile}".'
assert out == f'Exported 500 to "{outfile}".'
assert os.path.isfile(outfile)

reader = csv.DictReader(open(outfile), delimiter=delim)
records = list(reader)
assert len(records) == 499
assert len(records) == 500
assert records[0]['qseqid'] == 'CAM_READ_0234442157'
finally:
if os.path.isfile(outfile):
Expand All @@ -120,12 +120,12 @@ def test_guess_delimiter() -> None:
cmd = f'{RUN} -a {META} -b {HITS2} -o {outfile}'
rv, out = getstatusoutput(cmd)
assert rv == 0
assert out == f'Exported 248 to "{outfile}".'
assert out == f'Exported 252 to "{outfile}".'
assert os.path.isfile(outfile)

reader = csv.DictReader(open(outfile), delimiter=delim)
records = list(reader)
assert len(records) == 248
assert len(records) == 252
assert records[-1]['qseqid'] == 'JCVI_READ_1100018174123'
finally:
if os.path.isfile(outfile):
Expand All @@ -144,12 +144,12 @@ def test_pctid() -> None:
cmd = f'{RUN} -a {META} -b {HITS2} -p 90 -o {outfile}'
rv, out = getstatusoutput(cmd)
assert rv == 0
assert out == f'Exported 97 to "{outfile}".'
assert out == f'Exported 101 to "{outfile}".'
assert os.path.isfile(outfile)

reader = csv.DictReader(open(outfile), delimiter='\t')
records = list(reader)
assert len(records) == 97
assert len(records) == 101
assert records[-1]['qseqid'] == 'JCVI_READ_1092343670678'
finally:
if os.path.isfile(outfile):
Expand Down
Loading

0 comments on commit 9774b22

Please sign in to comment.