From 3ed9a5488b84561e54e13be85ca9ede4b5637d7f Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Tue, 30 Jul 2024 11:26:05 -0400
Subject: [PATCH 1/2] wip: niid migration

---
 tdb/niid_upload.py | 217 +++++++++++++++++++++++++++------------------
 1 file changed, 131 insertions(+), 86 deletions(-)

diff --git a/tdb/niid_upload.py b/tdb/niid_upload.py
index 89d58a8..25f12a5 100644
--- a/tdb/niid_upload.py
+++ b/tdb/niid_upload.py
@@ -6,12 +6,12 @@
 import subprocess
 import unicodedata
 from parse import parse
+import xlrd
 from upload import parser
 sys.path.append('') # need to import from base
 from base.rethink_io import rethink_io
 from vdb.flu_upload import flu_upload
-import logging
-# logger = logging.getLogger()
+from titer_block import find_titer_block, find_serum_rows, find_virus_columns
 
 parser.add_argument('--assay_type', default='hi')
 
@@ -27,97 +27,142 @@ def read_niid(path, fstem, subtype, assay_type):
     if real_file != '':
         print("real_file: " + real_file)
         ind = '.{}'.format(real_file.split('.')[-1])
-        convert_xls_to_csv(path, fstem, ind)
-        fname = "data/tmp/{}.csv".format(fstem)
-        parse_niid_matrix_to_tsv(fname, path, subtype, assay_type)
+        convert_niid_xls_to_tsv(path, fstem, ind, subtype, assay_type)
 
-def convert_xls_to_csv(path, fstem, ind):
-    import xlrd
-    wb_name = path + '/' + fstem + ind
-    workbook = xlrd.open_workbook(filename=wb_name, encoding_override="cp1252")
-    for sheet in workbook.sheets():
-        with open('data/tmp/%s.csv'%(fstem), 'w') as f:
-            writer = csv.writer(f)
-            rows = []
-            for i in range(sheet.nrows):
-                row = []
-                for j in range(sheet.ncols):
-                    val = sheet.cell_value(i, j)
-                    row.append(val)
-                rows.append(row)
-            writer.writerows(rows)
-    return
-
-def parse_niid_matrix_to_tsv(fname, original_path, subtype, assay_type):
+def convert_niid_xls_to_tsv(path, fstem, ind, subtype, assay_type):
+    # Set flutype
     suptype=subtype.lower()
     flutype = ""
     if subtype == "h3n2" or subtype == "h1n1pdm":
         flutype = "A"
     if subtype == "vic" or subtype == "yam":
         flutype = "B"
-    src_id = fname.split('/')[-1]
-    with open(fname) as infile:
-        csv_reader = csv.reader(infile)
-        mat = list(csv_reader)
-    with open('data/tmp/%s.tsv'%(src_id[:-4]), 'w') as outfile:
-        header = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
-        outfile.write("%s\n" % ("\t".join(header)))
-        original_path = original_path.split('/')
-        try:
-            original_path.remove('')
-        except:
-            pass
-        if subtype == "h3n2":
-            serum_id_row_index = 5 #5
-            start_row = 7
-            virus_id_col_index = 1
-            start_col = 4
-        elif subtype == "h1n1pdm":
-            serum_id_row_index = 5
-            start_row = 6
-            virus_id_col_index = 1
-            start_col = 4
-        elif subtype == "vic":
-            serum_id_row_index = 4
-            start_row = 5
-            virus_id_col_index = 1
-            start_col = 4
-        elif subtype == "yam":
-            serum_id_row_index = 3
-            start_row = 5
-            virus_id_col_index = 1
-            start_col = 4
-        for i in range(start_row, len(mat)):
-            for j in range(start_col, len(mat[0])):
-                virus_strain = mat[i][virus_id_col_index]
-                serum_id = mat[serum_id_row_index][j]
-                serum_id = re.sub(r'[\r\n ]+', '', serum_id)
-                m = re.search(r'^(\S+)(egg|cell|siat|hck|nib121|ivr|\(bvr)', serum_id, re.IGNORECASE)
-                if m is None:
-                    m = re.search(r'^(\S+)(no\.)', serum_id, re.IGNORECASE)
-                serum_strain = ""
-                # import pdb; pdb.set_trace()
-                if m:
-                    serum_strain = m.group(1)
-                if not serum_strain.startswith(flutype + "/"):
-                    serum_strain = flutype + "/" + serum_strain
-                # Normalize U+ff1c '<' to U+003c '<'
-                titer = unicodedata.normalize('NFKC', mat[i][j])
-                # Allow either "< 10" or "<10"
-                titer = re.sub(r'< ', '<', titer)
-                source = "niid_%s"%(src_id)
-                virus_passage = mat[i][2]
-                virus_passage_category = ''
-                serum_passage = "unknown"
-                m = re.search(r'(egg)', serum_id, re.IGNORECASE)
-                if m:
-                    serum_passage = m.group(1)
-                m = re.search(r'(cell|siat|hck)', serum_id, re.IGNORECASE)
-                if m:
-                    serum_passage = m.group(1)
-                serum_passage_category = ''
-                line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
-                outfile.write(line)
+
+    # Set NIID patterns
+    virus_pattern = r"[A-Z]/[\w\s-]+/.+/\d{4}"
+    virus_passage_pattern = r"(MDCK|SIAT|E\d+|hCK)"
+    serum_id_pattern = r".+(No\.|no\.).+"
+    serum_passage_pattern = r".+(Egg|Cell).+"
+    serum_abbrev_pattern = r"\w+\s{0,1}\w+/\d+.*"
+    crick = False
+
+    # Open workbook
+    wb_name = path + '/' + fstem + ind
+    workbook = xlrd.open_workbook(filename=wb_name, encoding_override="cp1252")
+    for worksheet_index, worksheet in enumerate(workbook.sheets(), start=1):
+        print(f"Reading worksheet {worksheet_index} '{worksheet.name}' in file '{fstem}'")
+        # autodetecting titer, virus, serum blocks
+        titer_block = find_titer_block(worksheet)
+
+        if len(titer_block["col_start"]) == 0:
+            print("No titer block found.")
+            break
+
+        titer_coords = {
+            'col_start': titer_block["col_start"][0][0],
+            'col_end': titer_block["col_end"][0][0],
+            'row_start': titer_block["row_start"][0][0],
+            'row_end': titer_block["row_end"][0][0]
+        }
+
+        virus_block = find_virus_columns(
+            worksheet=worksheet,
+            titer_coords=titer_coords,
+            virus_pattern=virus_pattern,
+            virus_passage_pattern=virus_passage_pattern,
+        )
+
+        # If no virus names are found, might not be a valid worksheet, skip worksheet to avoid breaking find_serum_rows
+        if virus_block["virus_names"] is None:
+            print(f"Virus names not found. Check the virus pattern: '{virus_pattern}'")
+            break
+
+        serum_block = find_serum_rows(
+            worksheet=worksheet,
+            titer_coords=titer_coords,
+            virus_names=virus_block["virus_names"],
+            serum_id_pattern=serum_id_pattern,
+            serum_passage_pattern=serum_passage_pattern,
+            serum_abbrev_pattern=serum_abbrev_pattern,
+            crick=crick,
+        )
+
+        # Print the most likely row and column indices for the titer block
+        print(f"Titer block: n = {titer_block['row_start'][0][1]}x{titer_block['col_start'][0][1]} = {titer_block['row_start'][0][1]*titer_block['col_start'][0][1]}")
+        print(f" Most likely (n={titer_block['col_start'][0][1]}) col_start: {titer_block['col_start'][0][0]}")
+        print(f" Most likely (n={titer_block['col_end'][0][1]}) col_end: {titer_block['col_end'][0][0]}")
+        print(f" Most likely (n={titer_block['row_start'][0][1]}) row_start: {titer_block['row_start'][0][0]}")
+        print(f" Most likely (n={titer_block['row_end'][0][1]}) row_end: {titer_block['row_end'][0][0]}")
+
+        # For debugging purposes, print alternative indices (e.g. col_start, col_end, row_start, row_end)
+        # print("Alternative indices:")
+        # for i in range(1, len(titer_block['row_start'])):
+        #     print(f" Alternative (n={titer_block['row_start'][i][1]}) row_start: {titer_block['row_start'][i][0]}")
+
+        # Print Virus and Serum annotations row and column indices
+        print("Virus (antigen) block: left and right of the titer block")
+        print(f" virus column index: {virus_block['virus_col_idx']}")
+        print(f" virus passage column index: {virus_block['virus_passage_col_idx']}")
+        print(f" virus names: {virus_block['virus_names']}")
+
+        print("Serum (antisera) block: above the titer block")
+        print(f" serum ID row index: {serum_block['serum_id_row_idx']}")
+        print(f" serum passage row index: {serum_block['serum_passage_row_idx']}")
+        print(f" serum abbreviated name row index: {serum_block['serum_abbrev_row_idx']}")
+
+        # Match abbreviated names across the top to the full names along the left side and auto convert to full names
+        if serum_block["serum_abbrev_row_idx"] is not None:
+            print("serum_mapping = {")
+            for abbrev, full in serum_block["serum_mapping"].items():
+                print(f" '{abbrev}': '{full}',")
+            print("}")
+
+        serum_mapping = serum_block["serum_mapping"] # This is not used since serum_strain is being parsed in the loop below
+
+        mat = worksheet
+
+        with open('data/tmp/%s.tsv'%(fstem), 'w') as outfile:
+            header = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
+            outfile.write("%s\n" % ("\t".join(header)))
+
+            serum_id_row_index = serum_block['serum_id_row_idx']
+            row_start = titer_coords['row_start']
+            row_end = titer_coords['row_end']
+            virus_id_col_index = virus_block['virus_col_idx']
+            virus_passage_col_index=virus_block['virus_passage_col_idx']
+            col_start = titer_coords['col_start']
+            col_end = titer_coords['col_end']
+
+            for i in range(row_start, row_end+1):
+                for j in range(col_start, col_end+1):
+                    virus_strain = str(mat.cell_value(i,virus_id_col_index)).strip()
+                    serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','')
+                    serum_id = re.sub(r'[\r\n ]+', '', serum_id)
+                    m = re.search(r'^(\S+)(egg|cell|siat|hck|nib121|ivr|\(bvr)', serum_id, re.IGNORECASE)
+                    if m is None:
+                        m = re.search(r'^(\S+)(no\.)', serum_id, re.IGNORECASE)
+                    serum_strain = ""
+                    if m:
+                        serum_strain = m.group(1)
+                    if not serum_strain.startswith(flutype + "/"):
+                        serum_strain = flutype + "/" + serum_strain
+                    # Normalize U+ff1c '<' to U+003c '<'
+                    titer = unicodedata.normalize('NFKC', str(mat.cell_value(i,j)).strip())
+                    # Allow either "< 10" or "<10"
+                    titer = re.sub(r'< ', '<', titer)
+                    source = "niid_%s"%(fstem).strip()
+                    virus_passage = str(mat.cell_value(i,virus_passage_col_index)).strip()
+                    virus_passage_category = ''
+                    serum_passage = "unknown"
+                    m = re.search(r'(egg)', serum_id, re.IGNORECASE)
+                    if m:
+                        serum_passage = m.group(1)
+                    m = re.search(r'(cell|siat|hck)', serum_id, re.IGNORECASE)
+                    if m:
+                        serum_passage = m.group(1)
+                    serum_passage_category = ''
+                    line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
+                    outfile.write(line)
 
 def determine_subtype(original_path):
     original_path = original_path.lower().split('/')

From c89136df3b8389da86a6076bf039a9812d147691 Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Fri, 20 Dec 2024 06:38:08 -0800
Subject: [PATCH 2/2] Print statement about serum mapping

---
 tdb/niid_upload.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tdb/niid_upload.py b/tdb/niid_upload.py
index 25f12a5..3ab7e88 100644
--- a/tdb/niid_upload.py
+++ b/tdb/niid_upload.py
@@ -118,6 +118,7 @@ def convert_niid_xls_to_tsv(path, fstem, ind, subtype, assay_type):
             print("}")
 
         serum_mapping = serum_block["serum_mapping"] # This is not used since serum_strain is being parsed in the loop below
+        print("NOTE: Serum mapping is not used since serum_strain is being parsed in the loop")
 
         mat = worksheet
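
For reference, a minimal usage sketch (not part of either patch) of the titer_block helpers that PATCH 1/2 wires into convert_niid_xls_to_tsv. It assumes the call signatures and return shapes exactly as the diff above uses them; the workbook path is a hypothetical placeholder.

# Minimal sketch, assuming the titer_block helper signatures used in PATCH 1/2.
import xlrd
from titer_block import find_titer_block, find_virus_columns, find_serum_rows

# NIID-specific patterns, copied from convert_niid_xls_to_tsv above.
virus_pattern = r"[A-Z]/[\w\s-]+/.+/\d{4}"
virus_passage_pattern = r"(MDCK|SIAT|E\d+|hCK)"
serum_id_pattern = r".+(No\.|no\.).+"
serum_passage_pattern = r".+(Egg|Cell).+"
serum_abbrev_pattern = r"\w+\s{0,1}\w+/\d+.*"

# Hypothetical workbook path; NIID sheets are read with the cp1252 override as in the patch.
workbook = xlrd.open_workbook(filename="data/tmp/example_niid.xls",
                              encoding_override="cp1252")
worksheet = workbook.sheets()[0]

# Each coordinate list holds ranked (index, n) pairs; [0][0] is the most likely
# index, following the indexing the patch uses.
titer_block = find_titer_block(worksheet)
titer_coords = {key: titer_block[key][0][0]
                for key in ("col_start", "col_end", "row_start", "row_end")}

virus_block = find_virus_columns(
    worksheet=worksheet,
    titer_coords=titer_coords,
    virus_pattern=virus_pattern,
    virus_passage_pattern=virus_passage_pattern,
)
serum_block = find_serum_rows(
    worksheet=worksheet,
    titer_coords=titer_coords,
    virus_names=virus_block["virus_names"],
    serum_id_pattern=serum_id_pattern,
    serum_passage_pattern=serum_passage_pattern,
    serum_abbrev_pattern=serum_abbrev_pattern,
    crick=False,
)

# Inspect the detected layout before converting titers to TSV rows.
print(titer_coords)
print(virus_block["virus_col_idx"], virus_block["virus_passage_col_idx"])
print(serum_block["serum_id_row_idx"], serum_block["serum_passage_row_idx"])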