Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wip: Auto-detect column and row coordinates from niid data #175

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 132 additions & 86 deletions tdb/niid_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
import subprocess
import unicodedata
from parse import parse
import xlrd
from upload import parser
sys.path.append('') # need to import from base
from base.rethink_io import rethink_io
from vdb.flu_upload import flu_upload
import logging
# logger = logging.getLogger()
from titer_block import find_titer_block, find_serum_rows, find_virus_columns

# Register the --assay_type command-line option on the parser imported from
# `upload`; when the flag is omitted the value defaults to 'hi'.
parser.add_argument('--assay_type', default='hi')

Expand All @@ -27,97 +27,143 @@ def read_niid(path, fstem, subtype, assay_type):
if real_file != '':
print("real_file: " + real_file)
ind = '.{}'.format(real_file.split('.')[-1])
convert_xls_to_csv(path, fstem, ind)
fname = "data/tmp/{}.csv".format(fstem)
parse_niid_matrix_to_tsv(fname, path, subtype, assay_type)
convert_niid_xls_to_tsv(path, fstem, ind, subtype, assay_type)

def convert_xls_to_csv(path, fstem, ind):
    """Dump a worksheet of an Excel workbook to ``data/tmp/<fstem>.csv``.

    Parameters:
        path: directory that contains the workbook.
        fstem: workbook file name without its extension; also used as the
            stem of the CSV file that is written.
        ind: workbook extension including the leading dot (e.g. ``'.xls'``).

    Returns:
        None. The only effect is the CSV file written under ``data/tmp``.

    NOTE(review): the indentation of this function was not recoverable from
    the reviewed text. This version assumes the trailing ``return`` belongs
    to the sheet loop, i.e. only the first worksheet is exported and a
    workbook without sheets writes nothing. Confirm against the repository.
    """
    import xlrd
    wb_name = path + '/' + fstem + ind
    workbook = xlrd.open_workbook(filename=wb_name, encoding_override="cp1252")
    for sheet in workbook.sheets():
        # The csv module expects its file object to be opened with
        # newline=''; otherwise the writer's '\r\n' terminator is translated
        # again on platforms whose line separator is not '\n', and newlines
        # embedded in cells are not round-tripped correctly.
        with open('data/tmp/%s.csv' % (fstem), 'w', newline='') as f:
            writer = csv.writer(f)
            # One CSV row per sheet row, cell values taken verbatim.
            writer.writerows(
                [sheet.cell_value(i, j) for j in range(sheet.ncols)]
                for i in range(sheet.nrows)
            )
        return

# Column names of the TSV emitted by both NIID parsers below.
_NIID_TSV_HEADER = [
    "virus_strain", "serum_strain", "serum_id", "titer", "source",
    "virus_passage", "virus_passage_category", "serum_passage",
    "serum_passage_category", "assay_type",
]

# Hard-coded sheet geometry used by the CSV-based parser:
# subtype -> (index of the row holding serum IDs, first row of titer data).
_NIID_CSV_LAYOUT = {
    "h3n2": (5, 7),
    "h1n1pdm": (5, 6),
    "vic": (4, 5),
    "yam": (3, 5),
}


def _niid_flutype(subtype):
    """Return 'A' or 'B' for a lower-case subtype, or '' when it is unknown."""
    if subtype in ("h3n2", "h1n1pdm"):
        return "A"
    if subtype in ("vic", "yam"):
        return "B"
    return ""


def _niid_serum_fields(raw_serum_id, flutype):
    """Return ``(serum_id, serum_strain, serum_passage)`` for a serum header cell."""
    # Header cells may wrap over several lines; drop all spaces/newlines.
    serum_id = re.sub(r'[\r\n ]+', '', raw_serum_id)
    # The strain name is whatever precedes a passage/reagent keyword, or,
    # failing that, whatever precedes the "No." serum number.
    m = re.search(r'^(\S+)(egg|cell|siat|hck|nib121|ivr|\(bvr)', serum_id, re.IGNORECASE)
    if m is None:
        m = re.search(r'^(\S+)(no\.)', serum_id, re.IGNORECASE)
    serum_strain = ""
    if m:
        serum_strain = m.group(1)
        if not serum_strain.startswith(flutype + "/"):
            serum_strain = flutype + "/" + serum_strain
    serum_passage = "unknown"
    m = re.search(r'(egg)', serum_id, re.IGNORECASE)
    if m:
        serum_passage = m.group(1)
    # A cell-line keyword takes precedence over "egg" when both are present.
    m = re.search(r'(cell|siat|hck)', serum_id, re.IGNORECASE)
    if m:
        serum_passage = m.group(1)
    return serum_id, serum_strain, serum_passage


def _niid_titer(raw_titer):
    """Return the titer with compatibility characters folded and '< 10' written as '<10'."""
    # NFKC turns the fullwidth less-than sign (U+FF1C) into ASCII '<' (U+003C).
    titer = unicodedata.normalize('NFKC', raw_titer)
    # Allow either "< 10" or "<10"
    return re.sub(r'< ', '<', titer)


def parse_niid_matrix_to_tsv(fname, original_path, subtype, assay_type):
    """Convert a CSV dump of an NIID titer sheet into ``data/tmp/<stem>.tsv``.

    The sheet geometry is fixed per subtype (see ``_NIID_CSV_LAYOUT``); virus
    names are read from column 1, virus passages from column 2 and titers
    from column 4 onwards.

    Parameters:
        fname: path of the CSV file; its base name minus the last four
            characters names the TSV, and the full base name is recorded in
            the ``source`` column as ``niid_<basename>``.
        original_path: unused; kept so existing callers keep working.
        subtype: one of 'h3n2', 'h1n1pdm', 'vic', 'yam' (case-insensitive).
        assay_type: copied verbatim into the ``assay_type`` column.

    Raises:
        ValueError: if ``subtype`` is not one of the supported values.
    """
    # The original assigned the lower-cased value to a misspelt name
    # ("suptype"), so it was never used; normalise the real variable.
    subtype = subtype.lower()
    if subtype not in _NIID_CSV_LAYOUT:
        # Fail before touching any file instead of hitting an unbound-local
        # error halfway through writing the output.
        raise ValueError("Unsupported NIID subtype: %r" % (subtype,))
    flutype = _niid_flutype(subtype)
    serum_id_row_index, start_row = _NIID_CSV_LAYOUT[subtype]
    virus_id_col_index = 1
    start_col = 4

    src_id = fname.split('/')[-1]
    source = "niid_%s" % (src_id)
    with open(fname) as infile:
        mat = list(csv.reader(infile))
    with open('data/tmp/%s.tsv' % (src_id[:-4]), 'w') as outfile:
        outfile.write("%s\n" % ("\t".join(_NIID_TSV_HEADER)))
        for i in range(start_row, len(mat)):
            for j in range(start_col, len(mat[0])):
                virus_strain = mat[i][virus_id_col_index]
                serum_id, serum_strain, serum_passage = _niid_serum_fields(
                    mat[serum_id_row_index][j], flutype)
                titer = _niid_titer(mat[i][j])
                virus_passage = mat[i][2]
                virus_passage_category = ''
                serum_passage_category = ''
                line = "%s\n" % ("\t".join([virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
                outfile.write(line)


def convert_niid_xls_to_tsv(path, fstem, ind, subtype, assay_type):
    """Convert an NIID Excel workbook into ``data/tmp/<fstem>.tsv``.

    Instead of fixed coordinates, the titer block and the virus/serum
    annotations are located with ``find_titer_block``,
    ``find_virus_columns`` and ``find_serum_rows``; the detected indices are
    printed for inspection before the TSV is written.

    Parameters:
        path: directory that contains the workbook.
        fstem: workbook file name without extension; names the TSV and is
            recorded in the ``source`` column as ``niid_<fstem>``.
        ind: workbook extension including the leading dot.
        subtype: 'h3n2', 'h1n1pdm', 'vic' or 'yam' (case-insensitive); any
            other value yields an empty flu-type prefix.
        assay_type: copied verbatim into the ``assay_type`` column.

    NOTE(review): indentation was not recoverable from the reviewed text.
    This version assumes everything from the detection calls down to the TSV
    writing sits inside the worksheet loop, so each processed worksheet
    rewrites the same TSV file. Confirm against the repository.
    """
    # The original assigned the lower-cased value to a misspelt name
    # ("suptype"), so upper-case subtypes silently got an empty flutype.
    flutype = _niid_flutype(subtype.lower())

    # Set NIID patterns
    virus_pattern = r"[A-Z]/[\w\s-]+/.+/\d{4}"
    virus_passage_pattern = r"(MDCK|SIAT|E\d+|hCK)"
    serum_id_pattern = r".+(No\.|no\.).+"
    serum_passage_pattern = r".+(Egg|Cell).+"
    serum_abbrev_pattern = r"\w+\s{0,1}\w+/\d+.*"
    crick = False

    source = "niid_%s" % fstem.strip()

    # Open workbook
    wb_name = path + '/' + fstem + ind
    workbook = xlrd.open_workbook(filename=wb_name, encoding_override="cp1252")
    for worksheet_index, worksheet in enumerate(workbook.sheets(), start=1):
        print(f"Reading worksheet {worksheet_index} '{worksheet.name}' in file '{fstem}'")
        # autodetecting titer, virus, serum blocks
        titer_block = find_titer_block(worksheet)

        # NOTE(review): `break` ends processing of this and all remaining
        # worksheets; if the intent is to skip only this sheet, `continue`
        # would be needed. Behaviour is left unchanged here.
        if len(titer_block["col_start"]) == 0:
            print("No titer block found.")
            break

        # Each entry is indexed as [candidate][0] for the coordinate and
        # [candidate][1] for the "n" printed below; the first candidate is
        # used (presumably the best-supported one - verify in titer_block).
        titer_coords = {
            'col_start': titer_block["col_start"][0][0],
            'col_end': titer_block["col_end"][0][0],
            'row_start': titer_block["row_start"][0][0],
            'row_end': titer_block["row_end"][0][0]
        }

        virus_block = find_virus_columns(
            worksheet=worksheet,
            titer_coords=titer_coords,
            virus_pattern=virus_pattern,
            virus_passage_pattern=virus_passage_pattern,
        )

        # Without virus names the sheet is probably not a titer table and
        # find_serum_rows could not work; stop here (see the note on
        # `break` above).
        if virus_block["virus_names"] is None:
            print(f"Virus names not found. Check the virus pattern: '{virus_pattern}'")
            break

        serum_block = find_serum_rows(
            worksheet=worksheet,
            titer_coords=titer_coords,
            virus_names=virus_block["virus_names"],
            serum_id_pattern=serum_id_pattern,
            serum_passage_pattern=serum_passage_pattern,
            serum_abbrev_pattern=serum_abbrev_pattern,
            crick=crick,
        )

        # Print the most likely row and column indices for the titer block
        print(f"Titer block: n = {titer_block['row_start'][0][1]}x{titer_block['col_start'][0][1]} = {titer_block['row_start'][0][1]*titer_block['col_start'][0][1]}")
        print(f" Most likely (n={titer_block['col_start'][0][1]}) col_start: {titer_block['col_start'][0][0]}")
        print(f" Most likely (n={titer_block['col_end'][0][1]}) col_end: {titer_block['col_end'][0][0]}")
        print(f" Most likely (n={titer_block['row_start'][0][1]}) row_start: {titer_block['row_start'][0][0]}")
        print(f" Most likely (n={titer_block['row_end'][0][1]}) row_end: {titer_block['row_end'][0][0]}")

        # Print Virus and Serum annotations row and column indices
        print("Virus (antigen) block: left and right of the titer block")
        print(f" virus column index: {virus_block['virus_col_idx']}")
        print(f" virus passage column index: {virus_block['virus_passage_col_idx']}")
        print(f" virus names: {virus_block['virus_names']}")

        print("Serum (antisera) block: above the titer block")
        print(f" serum ID row index: {serum_block['serum_id_row_idx']}")
        print(f" serum passage row index: {serum_block['serum_passage_row_idx']}")
        print(f" serum abbreviated name row index: {serum_block['serum_abbrev_row_idx']}")

        # Show how abbreviated serum names map to full virus names. The
        # mapping is informational only: serum_strain is parsed from the
        # serum ID cell in the loop below.
        if serum_block["serum_abbrev_row_idx"] is not None:
            print("serum_mapping = {")
            for abbrev, full in serum_block["serum_mapping"].items():
                print(f" '{abbrev}': '{full}',")
            print("}")

        print("NOTE: Serum mapping is not used since serum_strain is being parsed in the loop")

        serum_id_row_index = serum_block['serum_id_row_idx']
        row_start = titer_coords['row_start']
        row_end = titer_coords['row_end']
        virus_id_col_index = virus_block['virus_col_idx']
        virus_passage_col_index = virus_block['virus_passage_col_idx']
        col_start = titer_coords['col_start']
        col_end = titer_coords['col_end']

        with open('data/tmp/%s.tsv' % (fstem), 'w') as outfile:
            outfile.write("%s\n" % ("\t".join(_NIID_TSV_HEADER)))

            # Both ends of the detected block are inclusive.
            for i in range(row_start, row_end + 1):
                for j in range(col_start, col_end + 1):
                    virus_strain = str(worksheet.cell_value(i, virus_id_col_index)).strip()
                    serum_id, serum_strain, serum_passage = _niid_serum_fields(
                        str(worksheet.cell_value(serum_id_row_index, j)).strip(), flutype)
                    titer = _niid_titer(str(worksheet.cell_value(i, j)).strip())
                    virus_passage = str(worksheet.cell_value(i, virus_passage_col_index)).strip()
                    virus_passage_category = ''
                    serum_passage_category = ''
                    line = "%s\n" % ("\t".join([virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
                    outfile.write(line)

def determine_subtype(original_path):
original_path = original_path.lower().split('/')
Expand Down