diff --git a/modelseedpy/community/__init__.py b/modelseedpy/community/__init__.py
index 7b94ceaa..294a8a1e 100644
--- a/modelseedpy/community/__init__.py
+++ b/modelseedpy/community/__init__.py
@@ -1,9 +1,13 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
+
 # import pyximport; pyximport.install(language_level=3)  # improve computational speed
 from modelseedpy.community.mscommunity import *
-from modelseedpy.community.dfbapkg import dFBAPkg
-from modelseedpy.community.mscompatibility import MSCompatibility
-from modelseedpy.community.commkineticpkg import CommKineticPkg
+from modelseedpy.community.datastandardization import *
+from modelseedpy.community.mssteadycom import MSSteadyCom
+from modelseedpy.community.commphitting import CommPhitting
+from modelseedpy.community.commhelper import build_from_species_models, phenotypes
+from modelseedpy.community.mskineticsfba import MSKineticsFBA
+
diff --git a/modelseedpy/community/commhelper.py b/modelseedpy/community/commhelper.py
new file mode 100644
index 00000000..5a8f617c
--- /dev/null
+++ b/modelseedpy/community/commhelper.py
@@ -0,0 +1,371 @@
+from modelseedpy.core.msminimalmedia import minimizeFlux_withGrowth, bioFlux_check
+from modelseedpy.core.exceptions import NoFluxError, ObjectiveError
+from modelseedpy.fbapkg.mspackagemanager import MSPackageManager
+from modelseedpy.core.msmodelutl import MSModelUtil
+from modelseedpy.core.fbahelper import FBAHelper
+from cobra import Model, Reaction, Metabolite
+from cobra.medium import minimal_medium
+# from commscores import GEMCompatibility
+from cobra.flux_analysis import pfba
+from collections import OrderedDict
+from optlang.symbolics import Zero
+from optlang import Constraint
+from math import inf, isclose
+from pandas import DataFrame
+from pprint import pprint
+from numpy import mean
+import re
+
+
+def strip_comp(ID):
+    # remove the compartment suffix (e.g. "_c0") from a ModelSEED ID
+    ID = ID.replace("-", "~")
+    return re.sub(r"(\_\w\d)", "", ID)
+
+def export_lp(model, name):
+    with open(f"{name}.lp", 'w') as out:
+        out.write(model.solver.to_lp())
+
+def correct_nonMSID(nonMSobject, output, model_index):
+    # rename a metabolite or reaction that lacks ModelSEED ID syntax with an explicit compartment suffix
+    name, compartment = output
+    index = 0 if compartment == "e" else model_index
+    nonMSobject.compartment = compartment + str(index)
+    comp = re.search(r"(_[a-z]\d+$)", nonMSobject.id)
+    if comp is None and rf"[{compartment}]" in nonMSobject.id:
+        return nonMSobject.id.replace(rf"[{compartment}]", f"_{nonMSobject.compartment}")
+    elif comp is None:  return nonMSobject.id + f"_{nonMSobject.compartment}"
+    return "_".join([nonMSobject.id.replace(comp.group(), ""), nonMSobject.compartment])
+
+
+def build_from_species_models(org_models, model_id=None, name=None, abundances=None, standardize=False,
+                              MSmodel=True, commkinetics=True, copy_models=True, printing=False):
+    """Merges the input list of single-species metabolic models into a community metabolic model.
+
+    Parameters
+    ----------
+    org_models : list of the member models that will be merged into a community model
+    model_id : string specifying the community model ID
+    name : string specifying the community model name
+    abundances : dict of the relative abundances of the input models in the community model
+    standardize : bool for whether the exchanges of each member model will be standardized (True) or just aligned
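+    MSmodel : bool for whether the member models follow ModelSEED ID conventions, which the renaming logic assumes
+    commkinetics : bool for whether constraints that proportionally limit member fluxes to their abundances are added
+    copy_models : bool for whether the input models are copied before their IDs are edited in-place
+    printing : bool for whether the ID corrections are verbosely printed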
+ + Returns + ------- + Cobra.Model for the desired Community + + Raises + ------ + """ + # construct the new model + models = org_models #if not standardize else GEMCompatibility.standardize( + #org_models, exchanges=True, conflicts_file_name='exchanges_conflicts.json') + biomass_indices = [] + biomass_index = minimal_biomass_index = 2 + new_metabolites, new_reactions = set(), set() + member_biomasses = {} + for model_index, org_model in enumerate(models): + model_util = MSModelUtil(org_model, copy=copy_models) + model_reaction_ids = [rxn.id for rxn in model_util.model.reactions] + model_index += 1 + # if MSmodel: + # Rename metabolites + for met in model_util.model.metabolites: + # Renaming compartments + output = MSModelUtil.parse_id(met) + # if printing: print(met, output) + if output is None: + if printing: print(f"The {met.id} ({output}; {hasattr(met, 'compartment')}) is unpredictable.") + met.id = correct_nonMSID(met, (met.id, "c"), model_index) + elif len(output) == 2: met.id = correct_nonMSID(met, output, model_index) + elif len(output) == 3: + name, compartment, out_index = output + index = 0 if compartment == "e" else model_index + if out_index == "": + met.id += str(index) + met.compartment += str(index) + elif compartment == "e": met.compartment = "e0" + else: + met.compartment = compartment + str(index) + met.id = name + "_" + met.compartment + new_metabolites.add(met) + if "cpd11416_c" in met.id or "biomass" in met.id: member_biomasses[org_model.id] = met + # Rename reactions + for rxn in model_util.model.reactions: # !!! all reactions should have a non-zero compartment index + if rxn.id[0:3] != "EX_": + ## biomass reactions + if re.search('^(bio)(\d+)$', rxn.id): + index = int(re.sub(r"(^bio)", "", rxn.id)) + if biomass_index == 2: + while f"bio{biomass_index}" in model_reaction_ids: biomass_index += 1 + if index not in biomass_indices and index >= minimal_biomass_index: biomass_indices.append(index) + else: # biomass indices can be decoupled from the respective reaction indices of the same model + rxn.id = "bio" + str(biomass_index) + if rxn.id not in model_reaction_ids: biomass_indices.append(biomass_index) + else: + index = minimal_biomass_index + rxn.id = "bio" + str(index) + while rxn.id not in model_reaction_ids and index not in biomass_indices: + index += 1 + rxn.id = "bio" + str(index) + biomass_indices.append(index) + biomass_index += 1 + ## non-biomass reactions + else: + initialID = str(rxn.id) + output = MSModelUtil.parse_id(rxn) + if output is None: + if printing: print(f"The {rxn.id} ({output}; {hasattr(rxn, 'compartment')}) is unpredictable.") + try: + rxn.id = correct_nonMSID(rxn, (rxn.id, "c"), model_index) + output = MSModelUtil.parse_id(rxn) + except ValueError: pass + elif len(output) == 2: + rxn.id = correct_nonMSID(rxn, output, model_index) + if printing: print(f"{output} from {rxn.id}") + output = MSModelUtil.parse_id(rxn) + if len(output) == 3: + name, compartment, index = output + if compartment != "e": + rxn.name = f"{name}_{compartment}{model_index}" + rxn_id = re.search(r"(.+\_\w)(?=\d+)", rxn.id).group() + if index == "": rxn.id += str(model_index) + else: rxn.id = rxn_id + str(model_index) + finalID = str(rxn.id) + string_diff = "" + for index, let in enumerate(finalID): + if index >= len(initialID) or index < len(initialID) and let != initialID[index]: string_diff += let + # if "compartment" not in locals(): print(f"the {rxn.id} with a {output} output is not defined with a compartment.") + if string_diff != f"_{compartment}{model_index}" and 
printing: print(f"The ID {initialID} is changed with {string_diff} to create the final ID {finalID}") + new_reactions.add(rxn) + # else: + # # TODO develop a method for compartmentalizing models without editing all reaction IDs or assuming their syntax + # pass + # adds only unique reactions and metabolites to the community model + newmodel = Model(model_id or "+".join([model.id for model in models]), + name or "+".join([model.name for model in models])) + newmodel.add_reactions(FBAHelper.filter_cobra_set(new_reactions)) + newmodel.add_metabolites(FBAHelper.filter_cobra_set(new_metabolites)) + + # Create community biomass + comm_biomass = Metabolite("cpd11416_c0", None, "Community biomass", 0, "c0") + metabolites = {comm_biomass: 1} + ## constrain the community abundances + if abundances: abundances = {met: -abundances[memberID] for memberID, met in member_biomasses.items() if memberID in abundances} + else: abundances = {met: -1 / len(member_biomasses) for met in member_biomasses.values()} + ## define community biomass components + metabolites.update(abundances) + comm_biorxn = Reaction(id="bio1", name="bio1", lower_bound=0, upper_bound=1000) + comm_biorxn.add_metabolites(metabolites) + print(comm_biorxn) + newmodel.add_reactions([comm_biorxn]) + # update model components + newutl = MSModelUtil(newmodel) + newutl.add_objective(comm_biorxn.flux_expression) + newutl.model.add_boundary(comm_biomass, "sink") # Is a sink reaction for reversible cpd11416_c0 consumption necessary? + print(newutl.model.problem._variables.keys()) + ## proportionally limit the fluxes to their abundances + # print(abundances) + if commkinetics: add_commkinetics(newutl, models, member_biomasses, abundances) + # add the metadata of community composition + if hasattr(newutl.model, "_context"): newutl.model._contents.append(member_biomasses) + elif hasattr(newutl.model, "notes"): newutl.model.notes.update(member_biomasses) + # print([cons.name for cons in newutl.model.constraints]) + return newutl.model + +def add_commkinetics(util, models, member_biomasses, abundances): + # TODO this creates an error with the member biomass reactions not being identified in the model + coef = {} + for model in models: + if member_biomasses[model.id] not in abundances: continue + coef[member_biomasses[model.id]] = -abundances[member_biomasses[model.id]] + for rxn in model.reactions: + if rxn.id[:3] == "rxn": coef[rxn.forward_variable] = coef[rxn.reverse_variable] = 1 + util.create_constraint(Constraint(Zero, name="member_flux_limit"), coef=coef, printing=True) + + +def phenotypes(community_members, phenotype_flux_threshold=.1, solver:str="glpk"): + # log information of each respective model + models = OrderedDict() + solutions = [] + media_conc = set() + # calculate all phenotype profiles for all members + comm_members = community_members.copy() + # print(community_members) + for org_model, content in community_members.items(): # community_members excludes the stationary phenotype + print("\n", org_model.id) + org_model.solver = solver + all_phenotypes = "phenotypes" not in content + model_util = MSModelUtil(org_model, True) + if "org_coef" not in locals(): + org_coef = {model_util.model.reactions.get_by_id("EX_cpd00007_e0").reverse_variable: -1} + model_util.standard_exchanges() + models[org_model.id] = {"exchanges": model_util.exchange_list(), "solutions": {}, "name": content["name"]} + phenotypes = {met.name: {"consumed": met.id.replace("EX_", "").replace("_e0", "")} + for met in 
model_util.carbon_exchange_mets_list(include_unknown=False) + } if all_phenotypes else content["phenotypes"] + # print(phenotypes) + models[org_model.id]["phenotypes"] = ["stationary"] + [ + content["phenotypes"].keys() for member, content in comm_members.items()] + phenoRXNs = [pheno_cpd for pheno, pheno_cpds in content['phenotypes'].items() + for pheno_cpd in pheno_cpds["consumed"]] + media = {cpd: 100 for cpd, flux in model_util.model.medium.items()} + #TODO correct or remove the media, since it seems to be overwritten by the optimization of all carbon exchanges + ### eliminate hydrogen absorption + media.update({"EX_cpd11640_e0": 0}) + past_phenoRXNs = [] + for name, phenoCPDs in phenotypes.items(): + pheno_util = MSModelUtil(model_util.model, True) + metID = phenoCPDs["consumed"][0] + try: + phenoRXN = pheno_util.model.reactions.get_by_id(f'EX_{metID}_e0') + if past_phenoRXNs: + del media[past_phenoRXNs[-1]] + except Exception as e: + print(e, f'\nEX_{metID}_e0 is not in the model {org_model.id}') + continue + media.update({phenoRXN.id: 100}) + pheno_util.add_medium(media) + print(phenoRXN.id) + pheno_util.model.solver = solver + ### define an oxygen absorption relative to the phenotype carbon source + # O2_consumption: EX_cpd00007_e0 <= phenotype carbon source # formerly <= 2 * sum(primary carbon fluxes) + coef = org_coef.copy() + coef.update({phenoRXN.reverse_variable: 1}) + pheno_util.create_constraint(Constraint(Zero, lb=0, ub=None, name="EX_cpd00007_e0_limitation"), coef=coef) + + ## minimize the influx of all carbonaceous exchanges, mostly non-phenotype compounds, at a fixed biomass growth + min_growth = float(1) # arbitrarily assigned minimal growth + pheno_util.add_minimal_objective_cons(min_growth) + phenoRXN.upper_bound = 0 + for ex in pheno_util.carbon_exchange_list(): + exMet = ex.id.replace("EX_", "").replace("_e0", "") + if exMet in phenoRXNs and exMet != metID: ex.lower_bound = 0 + # print(f"The new bounds of {exMet} exchange are: {ex.bounds}") + pheno_util.add_objective(Zero, "min", coef={ + ex.reverse_variable: 1000 if ex.id != phenoRXN.id else 1 + for ex in pheno_util.carbon_exchange_list()}) + # export_lp(pheno_util.model, f"minimize_cInFlux_{phenoRXN.id}") + sol = pheno_util.model.optimize() + if sol.status != "optimal": + pheno_util.model.remove_cons_vars(["EX_cpd00007_e0_limitation"]) + coef.update({phenoRXN.reverse_variable: 5}) + pheno_util.create_constraint( + Constraint(Zero, lb=0, ub=None, name="EX_cpd00007_e0_limitation"), coef=coef) + sol = pheno_util.model.optimize() + bioFlux_check(pheno_util.model, sol) + ### limit maximum consumption to the values from the previous minimization + for ex in pheno_util.carbon_exchange_list(): + #### (limiting the reverse_variable is more restrictive than the net flux variable) + if ex.id != phenoRXN.id: ex.reverse_variable.ub = abs(min(0, sol.fluxes[ex.id])) + + ## maximize the phenotype yield with the previously defined growth and constraints + pheno_util.add_objective(phenoRXN.reverse_variable, "min") + # export_lp(pheno_util.model, f"maximize_phenoYield_{phenoRXN.id}") + pheno_sol = pheno_util.model.optimize() + bioFlux_check(pheno_util.model, pheno_sol) + pheno_influx = pheno_sol.fluxes[phenoRXN.id] + if pheno_influx >= 0: + if not all_phenotypes: + print(f"The phenotype carbon source has a flux of {pheno_sol.fluxes[phenoRXN.id]}.") + pprint({rxn: flux for rxn, flux in pheno_sol.fluxes.items() if flux != 0}) + # TODO gapfill the model in media the non-functioning carbon source + raise NoFluxError(f"The (+) net flux 
of {pheno_influx} for the {phenoRXN.id} phenotype" + f" indicates that it is an implausible phenotype.") + print(f"NoFluxError: The (+) net flux of {pheno_influx} for the {phenoRXN.id}" + " phenotype indicates that it is an implausible phenotype.") + continue + phenoRXN.lower_bound = phenoRXN.upper_bound = pheno_influx + + ## maximize excretion of all potential carbon byproducts whose #C's < phenotype source #C's + phenotype_source_carbons = FBAHelper.rxn_mets_list(phenoRXN)[0].elements["C"] + minimum_fluxes = {} + for carbon_source in pheno_util.carbon_exchange_list(include_unknown=False): + if 0 < FBAHelper.rxn_mets_list(carbon_source)[0].elements["C"] < phenotype_source_carbons: + pheno_util.add_objective(carbon_source.flux_expression, "max") + minObj = pheno_util.model.slim_optimize() + # print(carbon_source.reaction, "\t", carbon_source.flux_expression, "\t", minObj) + if minObj > phenotype_flux_threshold: + minimum_fluxes[carbon_source.id] = minObj + # TODO limit the possible excreted compounds to only those that are defined in the media + excreted_compounds = list([exID for exID in minimum_fluxes.keys() if exID != "EX_cpd00011_e0"]) + # minimum_fluxes_df = DataFrame(data=list(minimum_fluxes.values()), index=excreted_compounds, columns=["min_flux"]) + # max_excretion_cpd = minimum_fluxes_df["minimum"].idxmin() + ### optimize the excretion of the discovered phenotype excreta + if "excreted" in phenoCPDs: + phenoCPDs["excreted"] = [f"EX_{cpd}_e0" for cpd in phenoCPDs["excreted"]] + phenoCPDs["excreted"].extend(excreted_compounds) + else: phenoCPDs["excreted"] = excreted_compounds + pheno_excreta = [pheno_util.model.reactions.get_by_id(excreta) + for excreta in phenoCPDs["excreted"]] + pheno_util.add_objective(sum([ex.flux_expression for ex in pheno_excreta]), "max") + # export_lp(pheno_util.model, "maximize_excreta") + sol = pheno_util.model.optimize() + bioFlux_check(pheno_util.model, sol) + for ex in pheno_excreta: + ex.lower_bound = ex.upper_bound = sol.fluxes[ex.id] + + ## minimize flux of the total simulation flux through pFBA + # TODO discover why some phenotypes are infeasible with pFBA + try: pheno_sol = pfba(pheno_util.model) + # pheno_util.add_objective(sum([rxn.flux_expression for rxn in pheno_util.e]), "min") + # pheno_sol = pheno_util.model.optimize() + except Exception as e: + print(f"The {phenoRXN.id} phenotype of the {pheno_util.model} model is " + f"unable to be simulated with pFBA and yields a < {e} > error.") + sol_dict = FBAHelper.solution_to_variables_dict(pheno_sol, pheno_util.model) + simulated_growth = sum([flux for var, flux in sol_dict.items() if re.search(r"(^bio\d+$)", var.name)]) + if not isclose(simulated_growth, min_growth): + display([(rxn, flux) for rxn, flux in pheno_sol.fluxes.items() if "EX_" in rxn and flux != 0]) + raise ObjectiveError(f"The assigned minimal_growth of {min_growth} was not optimized" + f" during the simulation, where the observed growth was {simulated_growth}.") + + ## store solution fluxes and update the community_members phenotypes + met_name = strip_comp(name).replace(" ", "-") + col = content["name"] + '_' + met_name + models[pheno_util.model.id]["solutions"][col] = pheno_sol + solutions.append(models[pheno_util.model.id]["solutions"][col].objective_value) + met_name = met_name.replace("_", "-").replace("~", "-") + if all_phenotypes: + if "phenotypes" not in comm_members[org_model]: + comm_members[org_model]["phenotypes"] = {met_name: {"consumed": [strip_comp(metID)]}} + if met_name not in comm_members[org_model]["phenotypes"]: 
+ comm_members[org_model]["phenotypes"].update({met_name: {"consumed": [strip_comp(metID)]}}) + else: comm_members[org_model]["phenotypes"][met_name]["consumed"] = [strip_comp(metID)] + met_pheno = content["phenotypes"][met_name] + if "excreted" in met_pheno and strip_comp(metID) in met_pheno["excreted"]: + comm_members[org_model]["phenotypes"][met_name].update({"excreted": met_pheno}) + past_phenoRXNs.append(phenoRXN.id) + + # construct the parsed table of all exchange fluxes for each phenotype + cols = {} + ## biomass row + cols["rxn"] = ["bio"] + for content in models.values(): + for col in content["solutions"]: + cols[col] = [0] + if col not in content["solutions"]: continue + bio_rxns = [x for x in content["solutions"][col].fluxes.index if "bio" in x] + flux = mean([content["solutions"][col].fluxes[rxn] for rxn in bio_rxns + if content["solutions"][col].fluxes[rxn] != 0]) + cols[col] = [flux] + ## exchange reactions rows + looped_cols = cols.copy() + looped_cols.pop("rxn") + for content in models.values(): + for ex_rxn in content["exchanges"]: + cols["rxn"].append(ex_rxn.id) + for col in looped_cols: + ### reactions that are not present in the columns are ignored + flux = 0 if (col not in content["solutions"] or + ex_rxn.id not in list(content["solutions"][col].fluxes.index) + ) else content["solutions"][col].fluxes[ex_rxn.id] + cols[col].append(flux) + ## construct the DataFrame + fluxes_df = DataFrame(data=cols) + fluxes_df.index = fluxes_df['rxn'] + fluxes_df.drop('rxn', axis=1, inplace=True) + fluxes_df = fluxes_df.groupby(fluxes_df.index).sum() + fluxes_df = fluxes_df.loc[(fluxes_df != 0).any(axis=1)] + fluxes_df.astype(str) + # fluxes_df.to_csv("fluxes.csv") + return fluxes_df, comm_members diff --git a/modelseedpy/community/commkineticpkg.py b/modelseedpy/community/commkineticpkg.py deleted file mode 100644 index 9d47f9e5..00000000 --- a/modelseedpy/community/commkineticpkg.py +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import absolute_import - -import logging -from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg -from modelseedpy.community.mscommunity import MSCommunity -from modelseedpy.core.fbahelper import FBAHelper - -#Base class for FBA packages -class CommKineticPkg(BaseFBAPkg): - def __init__(self,model): - BaseFBAPkg.__init__(self,model,"community kinetics",{},{"commkin":"string"}) - - def build_package(self,kinetic_coef,community_model=None): - self.validate_parameters({},[],{ - "kinetic_coef":kinetic_coef, - "community":community_model if community_model else MSCommunity(self.model) - }) - for species in self.parameters["community"].species: - self.build_constraint(species) - - def build_constraint(self,species): - coef = {species.biomasses[0].forward_variable:-1*self.parameters["kinetic_coef"]} - for reaction in self.model.reactions: - if int(FBAHelper.rxn_compartment(reaction)[1:]) == species.index and reaction != species.biomasses[0]: - coef[reaction.forward_variable] = 1 - coef[reaction.reverse_variable] = 1 - return BaseFBAPkg.build_constraint(self,"commkin",None,0,coef,"Species"+str(species.index)) \ No newline at end of file diff --git a/modelseedpy/community/commphitting.py b/modelseedpy/community/commphitting.py new file mode 100644 index 00000000..3399a8f5 --- /dev/null +++ b/modelseedpy/community/commphitting.py @@ -0,0 +1,1345 @@ +# -*- coding: utf-8 -*- +# from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.core.exceptions import FeasibilityError, ParameterError, ObjectAlreadyDefinedError, 
NoFluxError
+from modelseedpy.core.optlanghelper import OptlangHelper, Bounds, tupVariable, tupConstraint, tupObjective, isIterable, define_term
+from modelseedpy.community.datastandardization import GrowthData
+from modelseedpy.core.fbahelper import FBAHelper
+from modelseedpy.biochem import from_local
+from scipy.constants import hour, minute
+from zipfile import ZipFile, ZIP_LZMA
+from optlang import Model, Objective
+from time import sleep, process_time
+from typing import Union, Iterable
+from optlang.symbolics import Zero
+from scipy.optimize import newton
+from matplotlib import pyplot
+from math import inf, isclose
+from deepdiff import DeepDiff
+from pandas import DataFrame
+from itertools import chain
+from pprint import pprint
+from h5py import File
+from icecream import ic
+import numpy as np
+import cobra.io
+# from cplex import Cplex
+import warnings, logging, json, os, re
+
+logger = logging.getLogger(__name__)
+
+def dict_keys_exists(dic, *keys):
+    # returns whether the nested sequence of keys exists in the nested dictionary
+    if keys[0] not in dic:  return False
+    if len(keys) > 1:  return dict_keys_exists(dic[keys[0]], *keys[1:])
+    return True
+
+def find_dic_number(dic):
+    # depth-first search for the first numeric value in a nested dictionary, or None when none exists
+    for k, v in dic.items():
+        if FBAHelper.isnumber(v):  return v
+        if isinstance(v, dict):
+            num = find_dic_number(v)
+            if num is not None:  return num
+    return None
+
+def trial_contents(short_code, indices_tup, values):
+    matches = [ele == short_code for ele in indices_tup]
+    return np.array(values)[matches]
+
+def dic_keys(dic):
+    keys = []
+    if isinstance(dic, dict):
+        for key, value in dic.items():
+            keys.append(key)
+            keys.extend(dic_keys(value))
+    return keys
+
+# define data objects
+def _name(name, suffix, short_code, timestep, names):
+    name = '-'.join([x for x in list(map(str, [name + suffix, short_code, timestep])) if x])
+    if name not in names:  names.append(name) ; return name
+    else:  pprint(names) ; raise ObjectAlreadyDefinedError(f"The object {name} is already defined for the problem.")
+
+def _export_model_json(json_model, path):
+    with open(path, 'w') as lp:  json.dump(json_model, lp, indent=3)
+
+def _met_id_parser(met):
+    met_id = re.sub(r'(\_\w\d+)', '', met)
+    met_id = met_id.replace('EX_', '', 1)
+    met_id = met_id.replace('c_', '', 1)
+    return met_id
+
+# define an entity as a variable or a constant
+def _obj_val(primal, name, pheno, short_code, timestep, bounds, data_timestep_hr, names):
+    time_hr = int(timestep) * data_timestep_hr
+    return tupVariable(_name(name, pheno, short_code, timestep, names),
+                       Bounds=bounds) if not primal else primal[short_code][name+pheno][time_hr]
+
+def _michaelis_menten(conc, vmax, km):
+    return (conc*vmax)/(km+conc)
+
+def clamp(val, minimum, maximum):
+    return min(max(val, minimum), maximum)
+
+# parse primal values for use in the optimization loops
+def parse_primals(primal_values, entity_labels=None, coefs=None, kcat_vals=None):
+    if kcat_vals:
+        kcat_primal = {}
+        for trial, content in primal_values.items():
+            for primal, time_value in content.items():
+                if "bin" not in primal:  continue
+                name, trial = primal.split("-")
+                number = re.search(r"(\d)", name).group()
+                species, pheno = re.sub(r"(bin\d_)", "", name).split("_")
+                if "stationary" in pheno:  continue
+                if species not in kcat_primal:  kcat_primal[species] = {}
+                if pheno not in kcat_primal[species]:  kcat_primal[species][pheno] = 0
+                # kcat_(k,new) = sum_z^Z ( kcat_z * bin_k^z ) * kcat_(k,old) < 10
+                if time_value == 0 and kcat_primal[species][pheno] < 10:
+                    kcat_primal[species][pheno] += coefs[int(number)-1]*kcat_vals[species][pheno]
+
kcat_primal[species][pheno] = clamp(kcat_primal[species][pheno], 1e-4, 10) + return kcat_primal + select_primals = {} + for trial, entities in primal_values.items(): + select_primals[trial] = {} + for entity, times in entities.items(): + # a poor man's dictionary copy + if any([label in entity for label in entity_labels]): select_primals[trial][entity] = dict(list(times.items())) + return select_primals + +def signal_species(signal): + return signal.split(":")[0].replace(" ", "_") + +def _partition_coefs(initial_val, divisor): + return (initial_val, initial_val/divisor, initial_val/divisor**2, initial_val/divisor**3, initial_val/divisor**4) + + +biomass_partition_coefs = [_partition_coefs(10, 10), _partition_coefs(2, 2), _partition_coefs(1, 3)] + + +class CommPhitting: + + def __init__(self, msdb_path, community_members: dict=None, fluxes_df=None, data_df=None, carbon_conc=None, + media_conc=None, experimental_metadata=None, base_media=None, solver: str = 'glpk', all_phenotypes=True, + data_paths: dict = None, species_abundances: str = None, ignore_trials: Union[dict, list] = None, + ignore_timesteps: list = None, species_identities_rows=None, significant_deviation: float = 2, + extract_zip_path: str = None, determine_requisite_biomass:bool = True, consumed_mets:iter=None): + self.msdb = from_local(msdb_path) ; self.msdb_path = msdb_path + self.solver = solver ; self.all_phenotypes = all_phenotypes ; self.data_paths = data_paths + self.species_abundances = species_abundances ; self.ignore_trials = ignore_trials + self.ignore_timesteps = ignore_timesteps ; self.species_identities_rows = species_identities_rows + self.significant_deviation = significant_deviation ; self.extract_zip_path = extract_zip_path + + self.community_members = community_members + self.consumed_mets = consumed_mets or set([ + met for content in community_members.values() for met in content["phenotypes"]]) + if community_members is not None or any([x is None for x in [fluxes_df, data_df]]): + (self.experimental_metadata, data_df, fluxes_df, carbon_conc, self.requisite_biomass, + self.trial_name_conversion, self.data_timestep_hr, simulation_timestep, media_conc + ) = GrowthData.process(community_members, base_media, solver, all_phenotypes, data_paths, + species_abundances, carbon_conc, ignore_trials, ignore_timesteps, + species_identities_rows, significant_deviation, extract_zip_path, + determine_requisite_biomass) + # for content in community_members.values() for met in content["phenotypes"]] + self.fluxes_tup = FBAHelper.parse_df(fluxes_df) + self.fluxes_df = fluxes_df ; self.data_df = data_df + self.default_excreta = [index for index, row in fluxes_df.iterrows() if any(row > 1)] + self.parameters, self.variables, self.constraints = {}, {}, {} + self.zipped_output, self.plots, self.names = [], [], [] + self.experimental_metadata = experimental_metadata + self.carbon_conc = carbon_conc; self.media_conc = media_conc + + #################### FITTING PHASE METHODS #################### + + def fit_kcat(self, parameters: dict = None, mets_to_track: list = None, rel_final_conc: dict = None, + zero_start: list = None, abs_final_conc: dict = None, graphs: list = None, data_timesteps: dict = None, + export_zip_name: str = None, export_parameters: bool = True, requisite_biomass: dict = None, + export_lp:str = f'solveKcat.lp', figures_zip_name:str=None, publishing=True, primals_export_path=None): + if export_zip_name and os.path.exists(export_zip_name): os.remove(export_zip_name) + kcat_primal = None + requisite_biomass = 
requisite_biomass or self.requisite_biomass
+        for index, coefs in enumerate(biomass_partition_coefs):
+            # solve for growth rate constants with the previously solved biomasses
+            newSim = CommPhitting(self.msdb_path, None, self.fluxes_df, self.data_df, self.carbon_conc,
+                                  self.media_conc, self.experimental_metadata, None, self.solver, self.all_phenotypes,
+                                  self.data_paths, self.species_abundances, self.ignore_trials, self.ignore_timesteps,
+                                  self.species_identities_rows, self.significant_deviation, self.extract_zip_path,
+                                  True, self.consumed_mets)
+            newSim.define_problem(parameters, mets_to_track, rel_final_conc, zero_start, abs_final_conc,
+                                  data_timesteps, export_zip_name, export_parameters, export_lp,
+                                  kcat_primal, coefs, requisite_biomass)
+            newSim.compute(graphs, export_zip_name, figures_zip_name, publishing,
+                           primals_export_path or re.sub(r"(\.lp)", ".json", export_lp))
+            kcat_primal = parse_primals(newSim.values, coefs=coefs, kcat_vals=newSim.parameters["kcat"])
+            pprint(kcat_primal)
+            print(f"Iteration {index+1} is complete\n")
+        kcats = {k: val for k, val in newSim.values.items() if "kcat" in k}
+        DataFrame(kcats).T.to_csv("pheno_growth_kcat.tsv", sep="\t")
+        return kcats
+
+    def fit(self, parameters:dict=None, mets_to_track: list = None, rel_final_conc:dict=None, zero_start:list=None,
+            abs_final_conc:dict=None, graphs: list = None, data_timesteps: dict = None,
+            export_zip_name: str = None, export_parameters: bool = True, requisite_biomass: dict = None,
+            export_lp: str = 'CommPhitting.lp', figures_zip_name:str=None, publishing:bool=False, primals_export_path=None):
+        if hasattr(self, "requisite_biomass"):  requisite_biomass = self.requisite_biomass
+        self.define_problem(parameters, mets_to_track, rel_final_conc, zero_start, abs_final_conc,
+                            data_timesteps, export_zip_name, export_parameters, export_lp,
+                            None, None, requisite_biomass)
+        self.compute(graphs, export_zip_name, figures_zip_name, publishing,
+                     primals_export_path or re.sub(r"(\.lp)", ".json", export_lp))
+
+    def define_b_vars(self, pheno, short_code, timestep, variables):
+        self.variables['b_' + pheno][short_code][timestep] = tupVariable(
+            _name("b_", pheno, short_code, timestep, self.names), Bounds(0, 1000))
+        self.variables['b1_' + pheno][short_code][timestep] = tupVariable(
+            _name("b1_", pheno, short_code, timestep, self.names), Bounds(0, 1000))
+        self.variables['b2_' + pheno][short_code][timestep] = tupVariable(
+            _name("b2_", pheno, short_code, timestep, self.names), Bounds(0, 1000))
+        self.variables['b3_' + pheno][short_code][timestep] = tupVariable(
+            _name("b3_", pheno, short_code, timestep, self.names), Bounds(0, 1000))
+        self.variables['b4_' + pheno][short_code][timestep] = tupVariable(
+            _name("b4_", pheno, short_code, timestep, self.names), Bounds(0, 1000))
+        self.variables['b5_' + pheno][short_code][timestep] = tupVariable(
+            _name("b5_", pheno, short_code, timestep, self.names), Bounds(0, 1000))
+        variables.extend([self.variables['b_' + pheno][short_code][timestep],
+                          self.variables['b1_' + pheno][short_code][timestep],
+                          self.variables['b2_' + pheno][short_code][timestep],
+                          self.variables['b3_' + pheno][short_code][timestep],
+                          self.variables['b4_' + pheno][short_code][timestep],
+                          self.variables['b5_' + pheno][short_code][timestep]])
+        if short_code not in self.variables[f"bin1_{pheno}"]:
+            self.variables[f"bin1_{pheno}"][short_code] = tupVariable(
+                _name("bin1_", pheno, short_code, "", self.names), Bounds(0, 1), "binary")
+            self.variables[f"bin2_{pheno}"][short_code] = tupVariable(
+
_name("bin2_", pheno, short_code, "", self.names), Bounds(0, 1), "binary") + self.variables[f"bin3_{pheno}"][short_code] = tupVariable( + _name("bin3_", pheno, short_code, "", self.names), Bounds(0, 1), "binary") + self.variables[f"bin4_{pheno}"][short_code] = tupVariable( + _name("bin4_", pheno, short_code, "", self.names), Bounds(0, 1), "binary") + self.variables[f"bin5_{pheno}"][short_code] = tupVariable( + _name("bin5_", pheno, short_code, "", self.names), Bounds(0, 1), "binary") + variables.extend([self.variables[f"bin1_{pheno}"][short_code], self.variables[f"bin2_{pheno}"][short_code], + self.variables[f"bin3_{pheno}"][short_code], self.variables[f"bin4_{pheno}"][short_code], + self.variables[f"bin5_{pheno}"][short_code]]) + return variables + + def define_b_cons(self, pheno, short_code, timestep, biomass_coefs): + biomass_coefs = biomass_coefs or biomass_partition_coefs[-1] + # define the partitioned biomass groups + ## b_n{pheno,t} <= coef*b_tot{pheno,t} + self.constraints['b1c_' + pheno][short_code][timestep] = tupConstraint( + _name("b1c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + {"elements": [biomass_coefs[0], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b1_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + self.constraints['b2c_' + pheno][short_code][timestep] = tupConstraint( + _name("b2c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + {"elements": [biomass_coefs[1], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b2_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + self.constraints['b3c_' + pheno][short_code][timestep] = tupConstraint( + _name("b3c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + {"elements": [biomass_coefs[2], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b3_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + self.constraints['b4c_' + pheno][short_code][timestep] = tupConstraint( + _name("b4c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + {"elements": [biomass_coefs[3], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b4_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + self.constraints['b5c_' + pheno][short_code][timestep] = tupConstraint( + _name("b5c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + {"elements": [biomass_coefs[4], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b5_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + + # define the comprehensive biomass constraints + ## coef*b{pheno,t} - b_n{pheno,t} - 1000*bin_n{pheno} <= 0 + self.constraints['b1c_control_' + pheno][short_code][timestep] = tupConstraint( + _name("b1c_control_", pheno, short_code, timestep, self.names), Bounds(None, 0), { + "elements": [ + {"elements": [biomass_coefs[0], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b1_' + pheno][short_code][timestep].name], + 
"operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin1_{pheno}"][short_code].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + self.constraints['b2c_control_' + pheno][short_code][timestep] = tupConstraint( + _name("b2c_control_", pheno, short_code, timestep, self.names), Bounds(None, 0), { + "elements": [ + {"elements": [biomass_coefs[1], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b2_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin2_{pheno}"][short_code].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + self.constraints['b3c_control_' + pheno][short_code][timestep] = tupConstraint( + _name("b3c_control_", pheno, short_code, timestep, self.names), Bounds(None, 0), { + "elements": [ + {"elements": [biomass_coefs[2], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b3_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin3_{pheno}"][short_code].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + self.constraints['b4c_control_' + pheno][short_code][timestep] = tupConstraint( + _name("b4c_control_", pheno, short_code, timestep, self.names), Bounds(None, 0), { + "elements": [ + {"elements": [biomass_coefs[3], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b4_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin4_{pheno}"][short_code].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + self.constraints['b5c_control_' + pheno][short_code][timestep] = tupConstraint( + _name("b5c_control_", pheno, short_code, timestep, self.names), Bounds(None, 0), { + "elements": [ + {"elements": [biomass_coefs[4], self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['b5_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin5_{pheno}"][short_code].name], + "operation": "Mul"}, + ], + "operation": "Add" + }) + + # define the binary constraints + ## b_n{pheno,t} <= 1000 - 1000*bin_n{pheno} + self.constraints['bin1c_' + pheno][short_code][timestep] = tupConstraint( + _name("bin1c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + 1000, + {"elements": [-1, self.variables['b1_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin1_{pheno}"][short_code].name], + "operation": "Mul"} + ], + "operation": "Add" + }) + self.constraints['bin2c_' + pheno][short_code][timestep] = tupConstraint( + _name("bin2c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + 1000, + {"elements": [-1, self.variables['b2_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin2_{pheno}"][short_code].name], + "operation": "Mul"} + ], + "operation": "Add" + }) + self.constraints['bin3c_' + pheno][short_code][timestep] = tupConstraint( + _name("bin3c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + 1000, + {"elements": [-1, self.variables['b3_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin3_{pheno}"][short_code].name], + "operation": "Mul"} + ], + 
"operation": "Add" + }) + self.constraints['bin4c_' + pheno][short_code][timestep] = tupConstraint( + _name("bin4c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + 1000, + {"elements": [-1, self.variables['b4_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin4_{pheno}"][short_code].name], + "operation": "Mul"} + ], + "operation": "Add" + }) + self.constraints['bin5c_' + pheno][short_code][timestep] = tupConstraint( + _name("bin5c_", pheno, short_code, timestep, self.names), Bounds(0, None), { + "elements": [ + 1000, + {"elements": [-1, self.variables['b5_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [-1000, self.variables[f"bin5_{pheno}"][short_code].name], + "operation": "Mul"} + ], + "operation": "Add" + }) + + # load the constraints to the model + return [self.constraints['b1c_' + pheno][short_code][timestep], + self.constraints['b2c_' + pheno][short_code][timestep], + self.constraints['b3c_' + pheno][short_code][timestep], + self.constraints['b4c_' + pheno][short_code][timestep], + self.constraints['b5c_' + pheno][short_code][timestep], + self.constraints['b1c_control_' + pheno][short_code][timestep], + self.constraints['b2c_control_' + pheno][short_code][timestep], + self.constraints['b3c_control_' + pheno][short_code][timestep], + self.constraints['b4c_control_' + pheno][short_code][timestep], + self.constraints['b5c_control_' + pheno][short_code][timestep], + self.constraints['bin1c_' + pheno][short_code][timestep], + self.constraints['bin2c_' + pheno][short_code][timestep], + self.constraints['bin3c_' + pheno][short_code][timestep], + self.constraints['bin4c_' + pheno][short_code][timestep], + self.constraints['bin5c_' + pheno][short_code][timestep]] + + def initialize_vars_cons(self, pheno, short_code): + # cvt and cvf + self.variables['cvt_' + pheno] = {}; self.variables['cvf_' + pheno] = {} + self.variables['cvt_' + pheno][short_code] = {}; self.variables['cvf_' + pheno][short_code] = {} + # total biomass and growth + self.variables['b_' + pheno] = {}; self.variables['g_' + pheno] = {} + self.variables['b_' + pheno][short_code] = {}; self.variables['g_' + pheno][short_code] = {} + self.constraints['gc_' + pheno] = {}; self.constraints['cvc_' + pheno] = {} + self.constraints['gc_' + pheno][short_code] = {}; self.constraints['cvc_' + pheno][short_code] = {} + # partitioned biomasses + self.variables['b1_' + pheno] = {}; self.variables['b2_' + pheno] = {}; self.variables['b3_' + pheno] = {} + self.variables['b4_' + pheno] = {}; self.variables['b5_' + pheno] = {} + self.variables['b1_' + pheno][short_code] = {}; self.variables['b2_' + pheno][short_code] = {} + self.variables['b3_' + pheno][short_code] = {}; self.variables['b4_' + pheno][short_code] = {} + self.variables['b5_' + pheno][short_code] = {} + ## biomass binary variables + self.variables[f'bin1_{pheno}'] = {}; self.variables[f'bin2_{pheno}'] = {}; self.variables[f'bin3_{pheno}'] = {} + self.variables[f'bin4_{pheno}'] = {}; self.variables[f'bin5_{pheno}'] = {} + self.variables[f"bin1_{pheno}"][short_code] = {}; self.variables[f"bin2_{pheno}"][short_code] = {} + self.variables[f"bin3_{pheno}"][short_code] = {}; self.variables[f"bin4_{pheno}"][short_code] = {} + self.variables[f"bin5_{pheno}"][short_code] = {} + ## biomass partition constraints + self.constraints['b1c_' + pheno] = {}; self.constraints['b2c_' + pheno] = {}; self.constraints['b3c_' + pheno] = {} + self.constraints['b4c_' + pheno] = {}; 
self.constraints['b5c_' + pheno] = {} + self.constraints['b1c_' + pheno][short_code] = {}; self.constraints['b2c_' + pheno][short_code] = {} + self.constraints['b3c_' + pheno][short_code] = {}; self.constraints['b4c_' + pheno][short_code] = {} + self.constraints['b5c_' + pheno][short_code] = {} + self.constraints['b1c_control_' + pheno] = {}; self.constraints['b2c_control_' + pheno] = {} + self.constraints['b3c_control_' + pheno] = {}; self.constraints['b4c_control_' + pheno] = {} + self.constraints['b5c_control_' + pheno] = {} + self.constraints['b1c_control_' + pheno][short_code] = {}; self.constraints['b2c_control_' + pheno][short_code] = {} + self.constraints['b3c_control_' + pheno][short_code] = {}; self.constraints['b4c_control_' + pheno][short_code] = {} + self.constraints['b5c_control_' + pheno][short_code] = {} + self.constraints[f'binc_{pheno}'] = {}; self.constraints[f'binc_{pheno}'][short_code] = {} + self.constraints['bin1c_' + pheno] = {}; self.constraints['bin2c_' + pheno] = {} + self.constraints['bin3c_' + pheno] = {}; self.constraints['bin4c_' + pheno] = {}; self.constraints['bin5c_' + pheno] = {} + self.constraints['bin1c_' + pheno][short_code] = {}; self.constraints['bin2c_' + pheno][short_code] = {} + self.constraints['bin3c_' + pheno][short_code] = {}; self.constraints['bin4c_' + pheno][short_code] = {} + self.constraints['bin5c_' + pheno][short_code] = {} + + def get_timestep_bin(self, timestep): + if timestep < self.first: return 0 + elif timestep < self.second: return 1 + elif timestep < self.third: return 2 + elif timestep < self.fourth: return 3 + return 4 + + def define_problem(self, parameters=None, mets_to_track=None, rel_final_conc=None, zero_start=None, + abs_final_conc=None, data_timesteps=None, export_zip_name: str=None, + export_parameters: bool=True, export_lp: str='CommPhitting.lp', primal_values=None, + biomass_coefs=None, requisite_biomass:dict=None, biolog_simulation=False, + export_phenotype_profiles=True): + # parse the growth data + growth_tup = FBAHelper.parse_df(self.data_df, False) + self.phenotypes = list(self.fluxes_tup.columns) + self.phenotypes.extend([signal_species(signal)+"_stationary" for signal in growth_tup.columns if ( + ":" in signal and "OD" not in signal)]) + self.species_list = [signal_species(signal) for signal in growth_tup.columns if ":" in signal] + num_sorted = np.sort(np.array([int(obj[1:]) for obj in set(growth_tup.index)])) + # TODO - short_codes must be distinguished for different conditions + unique_short_codes = [f"{growth_tup.index[0][0]}{num}" for num in map(str, num_sorted)] + full_times = growth_tup.values[:, growth_tup.columns.index("Time (s)")] + self.times = {short_code: trial_contents(short_code, growth_tup.index, full_times) + for short_code in unique_short_codes} + average_time_series = np.mean(list(self.times.values()), axis=0) ; points = len(average_time_series) + self.first, self.second, self.third, self.fourth = int(points*0.1), int(points*0.25), int(points*0.45), int(points*0.7) + self.time_ranges = {0: average_time_series[:self.first], 1: average_time_series[self.first:self.second], + 2: average_time_series[self.second:self.third], 3: average_time_series[self.third:self.fourth], + 4: average_time_series[self.fourth:]} + + # define default values + # TODO render bcv and cvmin dependent upon temperature, and possibly trained on Carlson's data + parameters, data_timesteps = parameters or {}, data_timesteps or {} + self.parameters["data_timestep_hr"] = np.mean(np.diff(np.array(list( + 
self.times.values())).flatten()))/hour if not hasattr(self, "data_timestep_hr") else self.data_timestep_hr
+        self.parameters.update({
+            "timestep_hr": self.parameters['data_timestep_hr'],
+            "cvct": 0.01, "cvcf": 0.01,
+            "bcv": 0.01, "cvmin": 0.01,
+            "kcat": 0.33,
+            'diffpos': 1, 'diffneg': 1,  # coefficients that weight the difference between experimental and predicted biomass
+            "stationary": 10,  # the penalty coefficient for the stationary phenotype
+        })
+        self.parameters.update(parameters)
+        # distribute kcat values to all phenotypes of all species and update from previous simulations where necessary
+        self.parameters.update(self._universalize(self.parameters, "kcat", exclude=["stationary"]))
+        if primal_values is not None:
+            for species, content in self.parameters["kcat"].items():
+                if species not in primal_values:  continue
+                for pheno, content2 in content.items():
+                    if pheno not in primal_values[species]:  continue
+                    for time, val in content2.items():
+                        if time not in primal_values[species][pheno]:  continue
+                        self.parameters["kcat"][species][pheno][time] = val
+        print(self.parameters["kcat"])
+        # define the metabolites that are tracked, exchanged, and not available in the media
+        # TODO the default zero_start logic appears to be incorrect
+        self.zero_start = zero_start or [met for met in self.consumed_mets
+                                         if (met not in self.carbon_conc or self.carbon_conc[met] == 0)]
+        self.rel_final_conc = rel_final_conc or {
+            met: 0.1 for met, concs in self.carbon_conc.items() if any(
+                [concs[short_code] > 0 for short_code in self.data_df.index.unique()]
+            ) and met not in self.zero_start}
+        self.abs_final_conc = abs_final_conc or {}
+        if mets_to_track:  self.mets_to_track = mets_to_track
+        elif not isinstance(rel_final_conc, dict):  self.mets_to_track = self.fluxes_tup.index
+        else:  self.mets_to_track = list(self.rel_final_conc.keys()) + self.zero_start
+        print(self.mets_to_track)
+
+        ts_to_delete = {}  # {short_code: full_times for short_code in unique_short_codes}
+        if data_timesteps:  # {short_code:[times]}
+            for short_code, times in data_timesteps.items():
+                ts_to_delete[short_code] = set(list(range(len(full_times)))) - set(times)
+                self.times[short_code] = np.delete(self.times[short_code], list(ts_to_delete[short_code]))
+
+        # construct the problem
+        objective = tupObjective("minimize variance and phenotypic transitions", [], "min")
+        constraints, variables, simulated_mets = [], [], []
+        time_1 = process_time()
+        for exID in self.fluxes_tup.index:
+            if exID == "bio":  continue
+            met_id = re.search(r"(cpd\d{5})", exID).group()
+            met = self.msdb.compounds.get_by_id(met_id)
+            if "C" not in met.elements:  continue
+            concID = f"c_{met_id}_e0"
+            simulated_mets.append(met_id)
+            self.variables[concID] = {};  self.constraints['dcc_' + met_id] = {}
+
+            # define the growth rate for each metabolite and concentrations
+            # TODO the MM parameters may be deletable once the binned kcat method is refined
+            if "Vmax" in self.parameters and "Km" in self.parameters:
+                self.parameters["Vmax"].update(self._universalize(self.parameters["Vmax"], met_id))
+                self.parameters["Km"].update(self._universalize(self.parameters["Km"], met_id))
+            for short_code in unique_short_codes:
+                self.variables[concID][short_code] = {};  self.constraints['dcc_' + met_id][short_code] = {}
+                timesteps = list(range(1, len(self.times[short_code]) + 1))
+                for timestep in timesteps:
+                    ## define the concentration variables
+                    conc_var = tupVariable(_name(concID, "", short_code, timestep, self.names))
+                    ## constrain initial time concentrations to the media or a large default
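+                    ## precedence among the initial-value sources is media_conc < zero_start < carbon_conc,
+                    ## because each later assignment in the block below overwrites the earlier ones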
+ if timestep == timesteps[0]: + initial_val = None + if met_id in self.media_conc: initial_val = self.media_conc[met_id] + if met_id in self.zero_start: initial_val = 0 + if dict_keys_exists(self.carbon_conc, met_id, short_code): + initial_val = self.carbon_conc[met_id][short_code] + if initial_val is not None: + conc_var = conc_var._replace(bounds=Bounds(initial_val, initial_val)) + if biolog_simulation: conc_var = conc_var._replace(bounds=Bounds(1, None)) + ## mandate complete carbon consumption + elif timestep == timesteps[-1] and (met_id in self.rel_final_conc or met_id in self.abs_final_conc): + if met_id in self.rel_final_conc: + final_bound = self.variables[concID][short_code][1].bounds.lb * self.rel_final_conc[met_id] + if met_id in self.abs_final_conc: # this intentionally overwrites rel_final_conc + final_bound = self.abs_final_conc[met_id] + conc_var = conc_var._replace(bounds=Bounds(0, final_bound)) + if met_id in self.zero_start: + conc_var = conc_var._replace(bounds=Bounds(final_bound, final_bound)) + self.variables[concID][short_code][timestep] = conc_var + variables.append(self.variables[concID][short_code][timestep]) + for pheno in self.phenotypes: + self.constraints['dbc_' + pheno] = {short_code: {} for short_code in unique_short_codes} + + # define growth and biomass variables and constraints + for pheno in self.phenotypes: + for short_code in unique_short_codes: + self.initialize_vars_cons(pheno, short_code) + timesteps = list(range(1, len(self.times[short_code]) + 1)) + nth_percentile_timestep = timesteps[int(0.90*len(timesteps))] + penalty_range = np.linspace(self.parameters['stationary'], self.parameters['stationary']/10, + len(timesteps[nth_percentile_timestep:])) + timestep_excess_count = 0 + for timestep in map(int, timesteps): + variables = self.define_b_vars(pheno, short_code, timestep, variables) + if short_code not in self.constraints[f"binc_{pheno}"]: + self.constraints[f"binc_{pheno}"][short_code] = tupConstraint( + _name("binc_", pheno, short_code, "", self.names), Bounds(0, 4), { + "elements": [self.variables[f"bin1_{pheno}"][short_code].name, + self.variables[f"bin2_{pheno}"][short_code].name, + self.variables[f"bin3_{pheno}"][short_code].name, + self.variables[f"bin4_{pheno}"][short_code].name, + self.variables[f"bin5_{pheno}"][short_code].name], + "operation": "Add"}) + constraints.append(self.constraints[f'binc_{pheno}'][short_code]) + constraints.extend(self.define_b_cons(pheno, short_code, timestep, biomass_coefs)) + + ## define the growth rate variable or primal value + species, phenotype = pheno.split("_") + self.variables['g_' + pheno][short_code][timestep] = tupVariable( + _name("g_", pheno, short_code, timestep, self.names)) + variables.append(self.variables['g_' + pheno][short_code][timestep]) + + if 'stationary' in pheno: + weight = self.parameters['stationary'] + if timestep > nth_percentile_timestep: + weight = penalty_range[timestep_excess_count] + timestep_excess_count += 1 + objective.expr.extend([{ + "elements": [{"elements": [weight, self.variables['b_' + pheno][short_code][timestep].name], + "operation": "Mul"}], + "operation": "Add"}]) + continue + # the conversion rates to and from the stationary phase + self.variables['cvt_' + pheno][short_code][timestep] = tupVariable( + _name("cvt_", pheno, short_code, timestep, self.names), Bounds(0, 100)) + self.variables['cvf_' + pheno][short_code][timestep] = tupVariable( + _name("cvf_", pheno, short_code, timestep, self.names), Bounds(0, 100)) + variables.extend([self.variables['cvf_' + 
pheno][short_code][timestep], + self.variables['cvt_' + pheno][short_code][timestep]]) + + # cvt <= bcv*b_{pheno} + cvmin + self.constraints['cvc_' + pheno][short_code][timestep] = tupConstraint( + _name('cvc_', pheno, short_code, timestep, self.names), (0, None), { + "elements": [{"elements": [-1, self.variables['cvt_' + pheno][short_code][timestep].name], + "operation": "Mul"}], + "operation": "Add"}) + # biomass_term = [self.parameters['bcv']*b_value + self.parameters['cvmin']] if FBAHelper.isnumber(b_value) else [ + biomass_term = [self.parameters['cvmin'], + {"elements": [self.parameters['bcv'], + self.variables["b_"+pheno][short_code][timestep].name], + "operation": "Mul"}] + self.constraints['cvc_' + pheno][short_code][timestep].expr["elements"].extend(biomass_term) + + # g_{pheno} = b_{pheno}*v_{pheno} + b_values = [self.variables['b1_' + pheno][short_code][timestep].name, + self.variables['b2_' + pheno][short_code][timestep].name, + self.variables['b3_' + pheno][short_code][timestep].name, + self.variables['b4_' + pheno][short_code][timestep].name, + self.variables['b5_' + pheno][short_code][timestep].name] + self.constraints['gc_' + pheno][short_code][timestep] = tupConstraint( + name=_name('gc_', pheno, short_code, timestep, self.names), + expr={"elements": [*[{"elements": [-self.parameters["kcat"][species][phenotype], b], + "operation": "Mul"} for b in b_values], + self.variables['g_' + pheno][short_code][timestep].name], + "operation": "Add"}) + + constraints.extend([self.constraints['cvc_' + pheno][short_code][timestep], + self.constraints['gc_' + pheno][short_code][timestep]]) + # self.constraints["binTot_" + pheno][short_code]]) + + # define the concentration constraint + half_dt = self.parameters['data_timestep_hr'] / 2 + time_2 = process_time() + print(f'Done with concentrations and biomass loops: {(time_2 - time_1) / 60} min') + for r_index, met in enumerate(self.fluxes_tup.index): + met_id = _met_id_parser(met) + if met_id not in simulated_mets: continue + concID = f"c_{met_id}_e0" + for short_code in unique_short_codes: + timesteps = list(range(1, len(self.times[short_code]) + 1)) + for timestep in timesteps[:-1]: + # c_{met} + dt/2*sum_k^K(n_{k,met} * (g_{pheno}+g+1_{pheno})) = c+1_{met} + next_timestep = timestep + 1 + growth_phenos = [[self.variables['g_' + pheno][short_code][next_timestep].name, + self.variables['g_' + pheno][short_code][timestep].name] + for pheno in self.fluxes_tup.columns] + self.constraints['dcc_' + met_id][short_code][timestep] = tupConstraint( + name=_name("dcc_", met_id, short_code, timestep, self.names), + expr={ + "elements": [ + self.variables[concID][short_code][timestep].name, + {"elements": [-1, self.variables[concID][short_code][next_timestep].name], + "operation": "Mul"}, + *OptlangHelper.dot_product( + growth_phenos, heuns_coefs=half_dt * self.fluxes_tup.values[r_index])], + "operation": "Add"}) + constraints.append(self.constraints['dcc_' + met_id][short_code][timestep]) + + # define the conversion variables of every signal for every phenotype + # for signal in growth_tup.columns[2:]: + # for pheno in self.fluxes_tup.columns: + # conversion_name = "_".join([signal, pheno, "__conversion"]) + # self.variables[conversion_name] = tupVariable(conversion_name) + # variables.append(self.variables[conversion_name]) + + time_3 = process_time() + print(f'Done with DCC loop: {(time_3 - time_2) / 60} min') + species_phenos = {} + self.conversion_bounds = [5e-6, 50] + for index, org_signal in enumerate(growth_tup.columns[2:]): + # signal = 
org_signal.split(":")[1] + signal = org_signal.replace(":", "|") + species = signal_species(org_signal) + species_phenos[species] = {None if "OD" in species else f"{species}_stationary"} + signal_column_index = index + 2 + data_timestep = 1 + self.variables[signal + '|conversion'] = tupVariable( + signal + '|conversion', bounds=Bounds(*self.conversion_bounds)) + variables.append(self.variables[signal + '|conversion']) + + self.variables[signal + '|bio'] = {}; self.variables[signal + '|diffpos'] = {} + self.variables[signal + '|diffneg'] = {}; self.variables['g_' + species] = {} + self.constraints[signal + '|bioc'] = {}; self.constraints[signal + '|diffc'] = {} + self.constraints["gc_" + species] = {}; self.constraints["totVc_" + species] = {} + self.constraints["totGc_" + species] = {}; self.constraints[signal + '|bio_finalc'] = {} + for short_code in unique_short_codes: + self.variables[signal + '|bio'][short_code] = {} + self.variables[signal + '|diffpos'][short_code] = {} + self.variables[signal + '|diffneg'][short_code] = {} + self.variables['g_' + species][short_code] = {} + self.constraints[signal + '|bioc'][short_code] = {} + self.constraints[signal + '|diffc'][short_code] = {} + self.constraints["gc_" + species][short_code] = {} + self.constraints["totVc_" + species][short_code] = {} + self.constraints["totGc_" + species][short_code] = {} + # self.constraints[signal + '|bio_finalc'][short_code] = {} + # the value entries are matched to only the timesteps that are condoned by data_timesteps + values_slice = trial_contents(short_code, growth_tup.index, growth_tup.values) + if ts_to_delete: values_slice = np.delete(values_slice, list(ts_to_delete[short_code]), axis=0) + timesteps = list(range(1, len(values_slice) + 1)) + # the last timestep is omitted since Heun's method in the modelled biomass + ## requires a future timestep, which does not exist for the last timestep + for timestep in timesteps[:-1]: + ## the user timestep and data timestep must be synchronized + if (int(timestep)*self.parameters['timestep_hr'] + < data_timestep*self.parameters['data_timestep_hr']): + print(f"Skipping timestep {timestep} that does not align with the user's timestep") ; continue + data_timestep += 1 + if data_timestep > int(self.times[short_code][-1] / self.parameters["data_timestep_hr"]): + print(f"The user-defined time exceeds the simulation time, so the DBC & diff loop is broken.") + break + next_timestep = int(timestep) + 1 + ## the phenotype transition terms are aggregated + total_biomass, signal_sum, from_sum, to_sum = [], [], [], [] + for pheno_index, pheno in enumerate(self.phenotypes): + ### define the collections of signal and pheno terms + if species in pheno or "OD" in signal: + # if not FBAHelper.isnumber(b_values[pheno][short_code][timestep]): + signal_sum.append({"operation": "Mul", "elements": [ + -1, self.variables['b_' + pheno][short_code][timestep].name]}) + # else: + # signal_sum.append(-b_values[pheno][short_code][timestep]) + ### total_biomass.append(self.variables["b_"+pheno][short_code][timestep].name) + if all(['OD' not in signal, species in pheno, 'stationary' not in pheno]): + species_phenos[species].add(pheno) + from_sum.append({"operation": "Mul", "elements": [ + -1, self.variables["cvf_" + pheno][short_code][timestep].name]}) + to_sum.append(self.variables["cvt_" + pheno][short_code][timestep].name) + for pheno in species_phenos[species]: + if "OD" in signal: continue + # print(pheno, timestep, b_values[pheno][short_code][timestep], 
b_values[pheno][short_code][next_timestep]) + if "stationary" in pheno: + # b_{phenotype} - sum_k^K(es_k*cvf) + sum_k^K(pheno_bool*cvt) = b+1_{phenotype} + self.constraints['dbc_' + pheno][short_code][timestep] = tupConstraint( + name=_name("dbc_", pheno, short_code, timestep, self.names), + expr={"elements": [*from_sum, *to_sum], "operation": "Add"}) + else: + # b_{phenotype} + dt/2*(g_{phenotype} + g+1_{phenotype}) + cvf-cvt = b+1_{phenotype} + self.constraints['dbc_' + pheno][short_code][timestep] = tupConstraint( + name=_name("dbc_", pheno, short_code, timestep, self.names), + expr={ + "elements": [ + self.variables['cvf_' + pheno][short_code][timestep].name, + {"elements": [half_dt, self.variables['g_' + pheno][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [half_dt, self.variables['g_' + pheno][short_code][next_timestep].name], + "operation": "Mul"}, + {"elements": [-1, self.variables['cvt_' + pheno][short_code][timestep].name], + "operation": "Mul"}], + "operation": "Add"}) + # if not FBAHelper.isnumber(self.variables['b_' + pheno][short_code][timestep]): + biomass_term = [self.variables['b_' + pheno][short_code][timestep].name, { + "elements": [-1, self.variables['b_' + pheno][short_code][next_timestep].name], + "operation": "Mul"}] + # else: + # biomass_term = [b_values[pheno][short_code][timestep]-b_values[pheno][short_code][next_timestep]] + self.constraints['dbc_' + pheno][short_code][timestep].expr["elements"].extend(biomass_term) + constraints.append(self.constraints['dbc_' + pheno][short_code][timestep]) + + if not requisite_biomass or any([timestep != timesteps[-2], signal not in requisite_biomass[short_code]]): + self.variables[signal + '|bio'][short_code][timestep] = tupVariable( + _name(signal, '|bio', short_code, timestep, self.names)) + else: + biomass_flux = requisite_biomass[short_code][signal]["bio"] + estimated_biomass = biomass_flux #* int(timestep)*self.parameters['data_timestep_hr'] + self.variables[signal + '|bio'][short_code][timestep] = tupVariable( + _name(signal, '|bio', short_code, timestep, self.names), + Bounds(estimated_biomass, None)) + self.variables[signal + '|diffpos'][short_code][timestep] = tupVariable( + _name(signal, '|diffpos', short_code, timestep, self.names), Bounds(0, 100)) + self.variables[signal + '|diffneg'][short_code][timestep] = tupVariable( + _name(signal, '|diffneg', short_code, timestep, self.names), Bounds(0, 100)) + variables.extend([self.variables[signal + '|bio'][short_code][timestep], + self.variables[signal + '|diffpos'][short_code][timestep], + self.variables[signal + '|diffneg'][short_code][timestep]]) + + # {signal}__conversion*datum = {signal}__bio + # TODO - the conversion variable must be a constant for BIOLOG conditions + self.constraints[signal + '|bioc'][short_code][timestep] = tupConstraint( + name=_name(signal, '|bioc', short_code, timestep, self.names), + expr={ + "elements": [ + {"elements": [-1, self.variables[signal + '|bio'][short_code][timestep].name], + "operation": "Mul"}, + {"elements": [self.variables[signal + '|conversion'].name, + values_slice[timestep, signal_column_index]], + "operation": "Mul"}], + "operation": "Add"}) + constraints.append(self.constraints[signal + '|bioc'][short_code][timestep]) + + # {speces}_bio + {signal}_diffneg-{signal}_diffpos = sum_k^K(es_k*b_{phenotype}) + self.constraints[signal + '|diffc'][short_code][timestep] = tupConstraint( + name=_name(signal, '|diffc', short_code, timestep, self.names), + expr={ + "elements": [ + self.variables[signal + 
'|bio'][short_code][timestep].name,
+                            self.variables[signal + '|diffneg'][short_code][timestep].name,
+                            {"elements": [-1, self.variables[signal + '|diffpos'][short_code][timestep].name],
+                             "operation": "Mul"}],
+                        "operation": "Add"})
+                    if all([isinstance(val, dict) for val in signal_sum]):
+                        self.constraints[signal + "|diffc"][short_code][timestep].expr["elements"].extend(signal_sum)
+                    else: raise ValueError(f"The {signal_sum} value has unexpected contents.")
+                    constraints.append(self.constraints[signal + '|diffc'][short_code][timestep])
+
+                    objective.expr.extend([{
+                        "elements": [
+                            {"elements": [self.parameters['diffpos'],
+                                          self.variables[f'{signal}|diffpos'][short_code][timestep].name],
+                             "operation": "Mul"},
+                            {"elements": [self.parameters['diffneg'],
+                                          self.variables[f'{signal}|diffneg'][short_code][timestep].name],
+                             "operation": "Mul"}],
+                        "operation": "Add"}])
+
+        time_4 = process_time()
+        print(f'Done with the DBC & diffc loop: {(time_4 - time_3) / 60} min')
+
+        # construct the problem
+        self.problem = OptlangHelper.define_model("CommPhitting model", variables, constraints, objective, True)
+        self.hdf5_name = export_lp.replace(".lp", ".h5")
+        self.hdf5_file = File(self.hdf5_name, 'w')
+        time_5 = process_time()
+        print(f'Done with constructing the {type(self.problem)} model: {(time_5 - time_4) / 60} min')
+
+        # export contents
+        if export_phenotype_profiles:
+            phenotype_profiles_name = 'phenotype_profiles.tsv'
+            self.fluxes_df.to_csv(phenotype_profiles_name, sep="\t")
+            self.zipped_output.append(phenotype_profiles_name)
+        if export_parameters:
+            parameter_name = 'parameters.tsv'
+            DataFrame(data=list(self.parameters.values()), index=list(self.parameters.keys()),
+                      columns=['values']).to_csv(parameter_name, sep="\t")
+            self.zipped_output.append(parameter_name)
+        if export_lp:
+            # a path separator in the LP path indicates a directory that may need to be created
+            if re.search(r"[\\/]", export_lp): os.makedirs(os.path.dirname(export_lp), exist_ok=True)
+            with open(export_lp, 'w') as lp: lp.write(self.problem.to_lp())
+            model_name = 'CommPhitting.json'
+            _export_model_json(self.problem.to_json(), model_name)
+            self.zipped_output.extend([export_lp, model_name])
+        if export_zip_name:
+            self.zip_name = export_zip_name
+            sleep(2)
+            with ZipFile(self.zip_name, 'a', compression=ZIP_LZMA) as zp:
+                # iterate over a copy, since the list is mutated inside the loop
+                for file in list(self.zipped_output):
+                    zp.write(file) ; os.remove(file) ; self.zipped_output.remove(file)
+        time_6 = process_time()
+        print(f'Done exporting the content: {(time_6 - time_5) / 60} min')
+
+    def compute(self, graphs: list = None, export_zip_name=None, figures_zip_name=None, publishing=False,
+                primals_export_path:str = "primal_values.json", remove_empty_plots=False):
+        print("starting optimization")
+        time1 = process_time()
+        self.values = {}
+        solution = self.problem.optimize()
+        timesteps = min(list(map(len, self.times.values())))
+        fit_quality = self.problem.objective.value/timesteps
+        print(f"The optimization fit quality is {fit_quality}")
+        if "parameters.tsv" in self.zipped_output:
+            self.parameters["fit"] = fit_quality
+            parameter_name = 'parameters.tsv'
+            DataFrame(data=list(self.parameters.values()), index=list(self.parameters.keys()),
+                      columns=['values']).to_csv(parameter_name, sep="\t")
+            with ZipFile(self.zip_name, 'a', compression=ZIP_LZMA) as zp:
+                for file in self.zipped_output:
+                    zp.write(file) ; os.remove(file)
+
+        # TODO approximate a threshold of good fits, and trigger black box optimization for bad fits
+        ## that iteratively adjusts parameters until the fit metric surmounts the threshold.
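+        # NOTE (editorial, illustrative): the primal variables parsed below appear to be
+        # named with the f"{basename}-{short_code}-{timestep}" convention of _name(), so
+        # split('-') recovers the three fields; e.g., with a hypothetical variable name:
+        #   basename, short_code, timestep = "b_ecoli_maltose-G1-12".split('-')
+        #   time_hr = int(timestep) * self.parameters['data_timestep_hr']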
+ + # categorize the primal values by trial and time + if "optimal" not in solution: + raise FeasibilityError(f'The solution is sub-optimal, with a(n) {solution} status.') + if all(np.array(list(self.problem.primal_values.values())) == 0): + raise NoFluxError("The simulation lacks any flux.") + for variable, value in self.problem.primal_values.items(): + if "v_" in variable: self.values[variable] = value + elif 'conversion' in variable or re.search(r"(bin\d)", variable): + self.values[short_code].update({variable: value}) + if value in self.conversion_bounds: + warnings.warn(f"The conversion factor {value} optimized to a bound, which may be " + f"indicative of an error, such as improper kinetic rates.") + else: + basename, short_code, timestep = variable.split('-') + time_hr = int(timestep) * self.parameters['data_timestep_hr'] + self.values[short_code] = self.values.get(short_code, {}) + self.values[short_code][basename] = self.values[short_code].get(basename, {}) + self.values[short_code][basename][time_hr] = value + + # export the processed primal values for graphing + # with open(primals_export_path, 'w') as out: + # json.dump(self.values, out, indent=3) + # if not export_zip_name and hasattr(self, 'zip_name'): + # export_zip_name = self.zip_name + # if export_zip_name: + # with ZipFile(export_zip_name, 'a', compression=ZIP_LZMA) as zp: + # zp.write(primals_export_path) + # os.remove(primals_export_path) + # visualize the specified information + time2 = process_time() + if graphs: self.graph(graphs, export_zip_name=figures_zip_name or export_zip_name, + publishing=publishing, remove_empty_plots=remove_empty_plots) + + # parse the primal values + values_df = DataFrame(self.values) + values_index = values_df.index.tolist() + for col in values_df.columns: + trial_values = values_df[col].tolist() + ## process the times + times = [list(ele.keys()) for ele in trial_values if isinstance(ele, dict)] + max_time = max(list(map(len, times))) + for max_time_series in times: + if len(max_time_series) == max_time: break + trial_path = f'results/primals/{col}/' + self.hdf5_file.create_dataset(f'{trial_path}/times', data=max_time_series) + ## process the data values + for index, ele in enumerate(trial_values): + dataset_name = f'{trial_path}/{values_index[index]}' + if FBAHelper.isnumber(ele): self.hdf5_file.create_dataset(dataset_name, data=[float(ele)]) + elif isinstance(ele, dict): + self.hdf5_file.create_dataset(dataset_name, data=list(map(float, ele.values()))) + self.hdf5_file[dataset_name].attrs["full_time"] = (len(ele.values()) == max_time) + + self.hdf5_file.close() + with ZipFile(self.zip_name, 'a', compression=ZIP_LZMA) as zp: + zp.write(self.hdf5_name) ; os.remove(self.hdf5_name) + + time3 = process_time() + print(f"Optimization completed in {(time2-time1)/60} minutes") + print(f"Graphing completed in {(time3-time2)/60} minutes") + + def load_model(self, mscomfit_json_path: str = None, zip_name: str = None, model_to_load: dict = None): + if zip_name: + with ZipFile(zip_name, 'r') as zp: zp.extract(mscomfit_json_path) + if mscomfit_json_path: + with open(mscomfit_json_path, 'r') as mscmft: return json.load(mscmft) + if model_to_load: self.problem = Model.from_json(model_to_load) + + @staticmethod + def assign_values(param, var, next_dimension, kcat=True): + dic = {var: {}} + for dim1, dim2_list in next_dimension.items(): + if isinstance(dim2_list, dict): dic[var].update(CommPhitting.assign_values(param, dim1, dim2_list)) + else: + if kcat: dic[var][dim1] = param + else: dic[var][dim1] = 
{dim2: param for dim2 in dim2_list}
+        return dic
+
+    def _universalize(self, param, var, next_dimension=None, exclude=None, tsBin=False):
+        exclude = exclude or []
+        if not next_dimension:
+            next_dimension = {}
+            for organism in self.fluxes_tup.columns:
+                species, pheno = organism.split("_")
+                if pheno in exclude: continue
+                if not tsBin:
+                    if species in next_dimension: next_dimension[species].append(pheno)
+                    else: next_dimension[species] = [pheno]
+                else:
+                    if species in next_dimension: next_dimension[species].update({pheno: self.time_ranges})
+                    else: next_dimension[species] = {pheno: self.time_ranges}
+        if FBAHelper.isnumber(param): return CommPhitting.assign_values(param, var, next_dimension)
+        elif FBAHelper.isnumber(param[var]): return CommPhitting.assign_values(param[var], var, next_dimension)
+        elif isinstance(param[var], dict):
+            return {var: {dim1: {dim2: param[var][dim1] for dim2 in dim2_list}
+                          for dim1, dim2_list in next_dimension.items()}}
+        else: logger.critical(f"The param (with keys {list(param.keys())}) and var {var} are not amenable"
+                              " to parameterizing a universal value.")
+        # {short_code: {list(timestep_info.keys())[0]: find_dic_number(param)} for short_code, timestep_info in variable.items()}}
+
+    def adjust_color(self, color, amount=0.5):
+        """
+        Adjusts the luminosity of the given color by scaling it with the given amount.
+        Input can be a matplotlib color string, hex string, or RGB tuple.
+
+        Examples:
+        >> adjust_color('g', 0.3)
+        >> adjust_color('#F034A3', 0.6)
+        >> adjust_color((.3,.55,.1), 0.5)
+        """
+        import colorsys
+        import matplotlib.colors as mc
+        try: c = mc.cnames[color]
+        except KeyError: c = color
+        c = colorsys.rgb_to_hls(*mc.to_rgb(c))
+        return colorsys.hls_to_rgb(c[0], max(0, min(1, amount * c[1])), c[2])
+
+    def _add_plot(self, ax, labels, label, basename, trial, x_axis_split, linestyle="solid",
+                  scatter=False, color=None, xs=None, ys=None):
+        labels.append(label or basename.split('-')[-1])
+        xs = xs if xs is not None else list(map(float, self.values[trial][basename].keys()))
+        ys = ys if ys is not None else list(map(float, self.values[trial][basename].values()))
+        if scatter: ax.scatter(xs, ys, s=10, label=labels[-1], color=color or None)
+        else: ax.plot(xs, ys, label=labels[-1], linestyle=linestyle, color=color or None)
+        ax.set_xticks(list(map(int, xs))[::x_axis_split])
+        return ax, labels
+
+    def graph(self, graphs, primal_values_filename: str = None, primal_values_zip_path: str = None,
+              export_zip_name: str = None, data_timestep_hr: float = 0.163, publishing: bool = False,
+              title: str = None, remove_empty_plots:bool = False):
+        print(export_zip_name)
+        # define the default timestep ratio as 1
+        data_timestep_hr = self.parameters.get('data_timestep_hr', data_timestep_hr)
+        timestep_ratio = data_timestep_hr / self.parameters.get('timestep_hr', data_timestep_hr)
+        if primal_values_filename:
+            if primal_values_zip_path:
+                with ZipFile(primal_values_zip_path, 'r') as zp: zp.extract(primal_values_filename)
+            with open(primal_values_filename, 'r', encoding='utf-8') as primal: self.values = json.load(primal)
+
+        # plot the content for desired trials
+        x_axis_split = int(3 / data_timestep_hr / timestep_ratio)
+        self.plots = set()
+        contents = {"biomass": 'b_', "all_biomass": 'b_', "growth": 'g_', "conc": "c_"}
+        mM_threshold = 1e-3
+        for graph_index, graph in enumerate(graphs):
+            content = contents.get(graph['content'], graph['content'])
+            y_label = 'Variable value'; x_label = r'Time ($hr$)'
+            if any([x in graph['content'] for x in ['biomass', 'OD']]):
+                total_biomasses = {name: [] 
for name in self.species_list} + total_biomasses.update({"OD":[]}) + if "species" not in graph: graph['species'] = self.species_list + if "biomass" in graph['content']: y_label = r'Biomass ($\frac{g}{L}$)' + elif 'growth' in graph['content']: y_label = r'Biomass growth ($\frac{g}{hr}$)' + graph["experimental_data"] = graph.get("experimental_data", False) + if "painting" not in graph: + graph["painting"] = { + "OD": { + "color": "blue", + "linestyle": "solid", + "name": "Total biomass" + }, + "ecoli": { + "color": "red", + "linestyle": "dashed", + "name": "E. coli" + }, + "pf": { + "color": "green", + "linestyle": "dotted", + "name": "P. fluorescens" + }} + graph["parsed"] = graph.get("parsed", False) + if 'phenotype' in graph and graph['phenotype'] == '*': + if "species" not in graph: graph['species'] = self.species_list + graph['phenotype'] = set([pheno.split("_")[-1] for pheno in self.phenotypes + if pheno.split("_")[0] in graph["species"]]) + # TODO - a species-resolved option must be developed for the paper figure + if 'species' in graph and graph['species'] == '*': graph['species'] = self.species_list + elif content == "c_" and 'mets' not in graph: + print(self.mets_to_track) + graph["mets"] = self.mets_to_track + elif not any(["species" in graph, "mets" in graph]): + raise ValueError(f"The specified graph {graph} must define species for which data will be plotted.") + print(f"graph_{graph_index}") ; pprint(graph) + + # define figure specifications + if publishing: + pyplot.rc('axes', titlesize=22, labelsize=28) + pyplot.rc('xtick', labelsize=24) + pyplot.rc('ytick', labelsize=24) + pyplot.rc('legend', fontsize=18) + if graph["parsed"]: + parsed_graphs = {} + for species in graph["species"]: + parsed_graphs[species] = pyplot.subplots(dpi=200, figsize=(11, 7)) + else: fig, ax = pyplot.subplots(dpi=200, figsize=(11, 7)) + yscale = "linear" + + # populate the figures + for trial, basenames in self.values.items(): + if trial not in graph['trial']: continue + labels = [] + for basename, values in basenames.items(): + # graph experimental and total simulated biomasses + if any([x in graph['content'] for x in ['biomass', 'OD']]): + if 'b_' in basename: + vals = list(map(float, values.values())) + var_name, species, phenotype = basename.split('_') + # ic(basename) + label = f'{species}_biomass (model)' + if publishing: + species_name = graph["painting"][species]["name"] + label = f'{species_name} total (model)' + labels.append({species: label}) + if remove_empty_plots and all([v == 0 for v in vals]): + print(f"The {basename} is empty and thus is removed.") + continue + if (any([x in graph['content'] for x in ["total", "biomass", 'OD']]) or + graph['species'] == self.species_list): # and not graph["parsed"]: + total_biomasses['OD'].append(vals) + if "OD" not in graph['content']: total_biomasses[species].append(vals) + if all([graph['experimental_data'], '|bio' in basename, ]): + # any([content in basename])]): # TODO - any() must include all_biomass and total + species, signal, phenotype = basename.split('|') + label = basename + if publishing: + species_name = "total" if "OD" in signal else graph["painting"][species]["name"] + label = f'Experimental {species_name} (from {signal})' + # print(basename, label, self.values[trial][basename].values()) + if remove_empty_plots and all(self.values[trial][basename].values() == 0): + print(f"The {basename} is empty and thus is removed.") + continue + ax, labels = self._add_plot(ax, labels, label, basename, trial, x_axis_split, scatter=True, + 
color=self.adjust_color(graph["painting"][species]["color"], 1.5)) + + if content not in basename: continue + # graph individual phenotypes + if "phenotype" in graph: + # print(graph['phenotype']) + for specie in graph["species"]: + if specie not in basename: continue + if not any([p in basename for p in graph['phenotype']]): + print(f"{basename} data with unknown phenotype.") + continue + if remove_empty_plots and all(self.values[trial][basename].values() == 0): + print(f"The {specie} is empty and thus is removed.") + continue + if graph["parsed"]: fig, ax = parsed_graphs[specie] + ## define graph characteristics + label = basename.split("_")[-1] + style = "solid" + if len(graph["species"]) > 1: + label = re.sub(r"(^[a-b]+\_)", "", basename) + style = graph["painting"][specie]["linestyle"] + ax, labels = self._add_plot(ax, labels, label, basename, trial, x_axis_split, style) + if graph["parsed"]: parsed_graphs[specie] = (fig, ax) + # graph media concentration plots + elif "mets" in graph and all([any([x in basename for x in graph["mets"]]), 'c_cpd' in basename]): + if not any(np.array(list(self.values[trial][basename].values())) > mM_threshold): continue + if remove_empty_plots and all(self.values[trial][basename].values() == 0): continue + label=self.msdb.compounds.get_by_id(re.search(r"(cpd\d+)", basename).group()).name + ax, labels = self._add_plot(ax, labels, label, basename, trial, x_axis_split) + yscale = "log" + y_label = r'Concentration ($mM$)' + + if labels: # assesses whether graph(s) were created + ## graph all of the total biomasses + if any([x in graph['content'] for x in ['OD', 'biomass', 'total']]): + labeled_species = [label for label in labels if isinstance(label, dict)] + for name, vals in total_biomasses.items(): + # ic(name) + if not vals or (len(total_biomasses) == 2 and "OD" not in name): continue + if len(total_biomasses) == 2: + specie_label = [graph["painting"][name]["name"] for name in total_biomasses + if "OD" not in name][0] + label = f"{graph['painting'][name]['name']} ({specie_label})" + else: + label = f'{name}_biomass (model)' + if labeled_species: + for label_specie in labeled_species: + if name in label_specie: label = label_specie[name] ; break + style = "solid" if (len(graph["species"]) < 1 or name not in graph["painting"] + ) else graph["painting"][name]["linestyle"] + style = "dashdot" if "model" in label else style + style = "solid" if ("OD" in name and not graph["experimental_data"] + or "total" in graph["content"]) else style + total_biomass = sum(np.array(vals))[:-1] + xs = list(map(float, values.keys())) + if graph["parsed"]: fig, ax = parsed_graphs[name] + self._add_plot(ax, labels, label, None, None, x_axis_split, style, False, + graph["painting"][name]["color"], xs, total_biomass) + if graph["parsed"]: + ## process and export the parsed figures + ax.set_xlabel(x_label) ; ax.set_ylabel(y_label) ; ax.grid(axis="y") + ax.set_yscale(yscale) ; ax.legend() + phenotype_id = graph.get('phenotype', "") + if "phenotype" in graph and not isinstance(graph['phenotype'], str): + phenotype_id = f"{','.join(graph['phenotype'])} phenotypes" + fig_name = f'{"_".join([trial, name, phenotype_id, content])}.jpg' + fig.savefig(fig_name, bbox_inches="tight", transparent=True) + self.plots.add(fig_name) + + if graph["parsed"]: continue + ## process and export the non-parsed figures + phenotype_id = graph.get('phenotype', "") + if "phenotype" in graph and not isinstance(graph['phenotype'], str): + phenotype_id = f"{','.join(graph['phenotype'])} phenotypes" + + 
species_id = "" + if "mets" not in graph and content != "c_": + species_id = graph["species"] if isinstance(graph["species"], str) else ",".join(graph["species"]) + if "species" in graph and graph['species'] == self.species_list: species_id = 'all species' + else: phenotype_id = f"{','.join(graph['species'])} species" + if species_id == "all species" and not phenotype_id: phenotype_id = ','.join(graph['species']) + + ax.set_xlabel(x_label) ; ax.set_ylabel(y_label) + if "mets" in graph: ax.set_ylim(mM_threshold) + ax.grid(axis="y") + if len(labels) > 1: ax.legend() + else: yscale = "linear" + ax.set_yscale(yscale) + if not publishing: + if not title: + org_content = content if content not in contents.values() else list( + contents.keys())[list(contents.values()).index(content)] + this_title = f'{org_content} of {species_id} ({phenotype_id}) in the {trial} trial' + if content == "c_": this_title = f"{org_content} in the {trial} trial" + ax.set_title(this_title) + else: ax.set_title(title) + fig_name = f'{"_".join([trial, species_id, phenotype_id, content])}.jpg' + if "mets" in graph: fig_name = f"{trial}_{','.join(graph['mets'])}_c.jpg" + fig.savefig(fig_name, bbox_inches="tight", transparent=True) + + self.plots.add(fig_name) + + # export the figures with other simulation content + if export_zip_name: + with ZipFile(export_zip_name, 'a', compression=ZIP_LZMA) as zp: + for plot in self.plots: + zp.write(plot) ; os.remove(plot) + + + #################### ENGINEERING PHASE METHODS #################### + + def engineering(self): + if not hasattr(self, "problem"): + self.fit() # TODO - accommodate both fitting a new model and loading an existing model + + # This will capture biomass variables at all times and trials, which seems undesirable + self.problem.objective = Objective(sum([x for x in self.problem.variables if "bio" in x.name])) + + # Use a community COBRA model and CommKinetics with the fitted kinetic parameters? 
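Editor's note: for orientation, a hedged sketch of the `graphs` specification that graph() consumes; the keys mirror those read in the method above, while the trial code "G1" and metabolite "cpd00027" are hypothetical placeholders:

example_graphs = [
    {"content": "biomass", "trial": "G1", "species": "*", "phenotype": "*", "parsed": False},
    {"content": "conc", "mets": ["cpd00027"], "trial": "G1"},
]
print(example_graphs)  # passed as, e.g., simulation.graph(example_graphs, export_zip_name="figures.zip")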
+
+    def _add_phenotypes(self):
+        pass
+
+
+    def _change_obj(self):
+        pass
+
+
+class BIOLOGPhitting(CommPhitting):
+    def __init__(self, carbon_conc, media_conc, biolog_df, fluxes_df,
+                 experimental_metadata, msdb_path, community_members):
+        self.biolog_df = biolog_df; self.experimental_metadata = experimental_metadata
+        self.carbon_conc = carbon_conc; self.media_conc = media_conc or []
+        self.fluxes_df = fluxes_df ; self.phenotypes = list(self.fluxes_df.columns)
+        self.phenotypes.extend([signal_species(signal)+"_stationary"
+                                for signal in self.biolog_df if ":" in signal])
+        self.community_members = community_members
+        from modelseedpy.biochem import from_local
+        self.msdb_path = msdb_path ; self.msdb = from_local(msdb_path)
+
+    def fitAll(self, parameters: dict = None, rel_final_conc: float = None,
+               abs_final_conc: dict = None, graphs: list = None, data_timesteps: dict = None,
+               export_zip_name: str = None, export_parameters: bool = True, requisite_biomass: dict = None,
+               figures_zip_name: str = None, publishing: bool = False):
+        # simulate each condition
+        if export_zip_name and os.path.exists(export_zip_name):
+            os.remove(export_zip_name)
+        org_rel_final_conc = rel_final_conc
+        # total_reactions = set(list(chain.from_iterable([model.reactions for model in models_dict.values()])))
+        model_abbreviations = ','.join([content["name"] for content in self.community_members.values()])
+        for exp_index, experiment in self.experimental_metadata.iterrows():
+            print(f"\n{exp_index} {experiment}")
+            display(experiment)
+            pheno = experiment["ModelSEED_ID"]
+            if not pheno:
+                print("The BIOLOG condition is not defined.")
+                continue
+            for model in self.community_members:
+                cpd = self.msdb.compounds.get_by_id(pheno)
+                if "C" not in cpd.elements or not any([re.search(pheno, rxn.id) for rxn in model.reactions]):
+                    if "valid_condition" not in locals():
+                        valid_condition = False
+                    continue
+                exp_list = [pheno] if isinstance(pheno, str) else pheno
+                self.community_members[model].update({"phenotypes": {
+                    re.sub(r"(-|\s)", "", experiment["condition"]): {"consumed": exp_list} }})
+                # determine the requisite biomass for each condition based on which member consumes the compound
+                valid_condition = True
+            # skip the condition when none of the members can utilize its phenotype
+            if not valid_condition:
+                print(f"The BIOLOG condition with {experiment['ModelSEED_ID']} is not"
+                      f" absorbed by the {model_abbreviations} model(s).")
+                continue
+            print(f"The {experiment['ModelSEED_ID']} ({cpd.formula}) metabolite of the "
+                  f"{experiment['condition']} condition may feed the {model_abbreviations} model(s).")
+            if not any([experiment["ModelSEED_ID"] in pheno for pheno in self.phenotypes]):
+                print(f"The {experiment['ModelSEED_ID']} ({cpd.formula}) metabolite of the "
+                      f"{experiment['condition']} condition is not a suitable phenotype for "
+                      f"the {model_abbreviations} model(s).")
+                continue
+
+            # for exp_index, experiment in self.experimental_metadata.iterrows():
+            # the model(s) for which the condition is a suitable carbon source must be defined here
+            # simulate through the kinetics ranges with conditions that can be used by one of the members
+            rel_final_conc = {experiment["ModelSEED_ID"]: org_rel_final_conc}
+            export_path = os.path.join(os.getcwd(), "BIOLOG_LPs", f"{exp_index}_{','.join(exp_list)}.lp")
+            kcat_primal = None
+            for coef_index, coefs in enumerate(biomass_partition_coefs):
+                # solve for growth rate constants with the previously solved biomasses
+                new_simulation = CommPhitting(self.fluxes_df, self.carbon_conc, self.media_conc,
+                                              self.msdb_path, self.biolog_df.loc[exp_index,:],
+                                              self.experimental_metadata)
+                new_simulation.define_problem(
+                    parameters, exp_list, rel_final_conc,
+                    set(list(chain.from_iterable([
+                        content["excretions"] for content in self.community_members.values()]))),
+                    abs_final_conc, data_timesteps, export_zip_name, export_parameters, export_path,
+                    kcat_primal, coefs, requisite_biomass, True)
+                time1 = process_time()
+                primals_export_path = f"BIOLOG_{experiment['ModelSEED_ID']}.json"
+                try:
+                    new_simulation.compute(graphs, export_zip_name, None, publishing, primals_export_path, True)
+                except NoFluxError as e:
+                    print(e)
+                kcat_primal = parse_primals(new_simulation.values, coefs=coefs,
+                                            kcat_vals=new_simulation.parameters["kcat"])
+                time2 = process_time()
+                print(f"Done simulating with the coefficients for biomass partitions: {coef_index}"
+                      f"\n{(time2 - time1) / 60} minutes")
+                pprint(kcat_primal)
+            print("\n\n\n")
+        return {k: val for k, val in new_simulation.values.items() if "kcat" in k}
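Editor's note: the loop above alternates between fitting and re-seeding the kinetic constants from the previous pass's primal values; a minimal runnable schematic of that pattern, with every name hypothetical:

def fit_once(coefs, kcat_seed=None):
    """Hypothetical stand-in for define_problem(...) followed by compute(...)."""
    return {"kcat": (kcat_seed or 1.0) * sum(coefs)}

kcat_primal = None
for coefs in [(0.5, 0.5), (0.75, 0.25)]:  # placeholder biomass partition coefficients
    seed = kcat_primal["kcat"] if kcat_primal else None
    kcat_primal = fit_once(coefs, seed)   # parse_primals(...) supplies the seed in the real workflow
print(kcat_primal)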
diff --git a/modelseedpy/community/commscores_old.py b/modelseedpy/community/commscores_old.py
new file mode 100644
index 00000000..3f81119b
--- /dev/null
+++ b/modelseedpy/community/commscores_old.py
@@ -0,0 +1,918 @@
+from modelseedpy.core.exceptions import ObjectiveError, ParameterError
+from modelseedpy.community.commhelper import build_from_species_models
+from modelseedpy.community.mscompatibility import MSCompatibility
+from modelseedpy.core.msminimalmedia import MSMinimalMedia
+from modelseedpy.community.mscommunity import MSCommunity
+from modelseedpy.core.msmodelutl import MSModelUtil
+from modelseedpy.core.fbahelper import FBAHelper
+from modelseedpy.core.msgapfill import MSGapfill
+from itertools import combinations, permutations, chain
+from optlang import Variable, Constraint, Objective
+from numpy import array, unique, ndarray, where, sort, array_split, nan
+from collections import Counter
+from deepdiff import DeepDiff  # (old, new)
+from typing import Iterable, Union
+from pprint import pprint
+from numpy.random import shuffle
+from multiprocess import current_process
+from math import inf
+import sigfig
+# from icecream import ic
+import re
+# from math import prod
+
+# silence deprecation warnings from DeepDiff parsing the syntrophy
+import warnings
+warnings.simplefilter("ignore", category=DeprecationWarning)
+
+rm_comp = FBAHelper.remove_compartment
+
+def _compatibilize(member_models: Iterable, printing=False):
+    # return member_models
+    models = MSCompatibility.standardize(member_models, conflicts_file_name='exchanges_conflicts.json', printing=printing)
+    if not isinstance(member_models, (set, list, tuple)): return models[0]
+    return models
+
+def _load_models(member_models: Iterable, com_model=None, compatibilize=True, printing=False):
+    # ic(member_models, com_model, compatibilize)
+    if not com_model and member_models:
+        model = build_from_species_models(member_models, name="SMETANA_pair")
+        return member_models, model  # (model, names=names, abundances=abundances)
+    # models = PARSING_FUNCTION(community_model)  # TODO the individual models of a community model can be parsed
+    if compatibilize: return _compatibilize(member_models, printing), _compatibilize([com_model], printing)[0]
+    return member_models, com_model
+
+def _get_media(media=None, com_model=None, model_s_=None, min_growth=None, environment=None,
+               interacting=True, printing=False, minimization_method="minFlux", skip_bad_media=False):
+    # ic(media, com_model, model_s_)
+    if com_model is None and model_s_ is None: raise TypeError("< com_model > or < model_s_ > must be parameterized.")
+    if media is not None:
+        if model_s_ is not None and not isinstance(model_s_, (list,set,tuple)):
+            return media["members"][model_s_.id]["media"]
+        elif com_model is not None: return media["community_media"]
+        return media
+    # model_s_ is either a singular model or a list of models
+    if com_model is not None:
+        try:
+            com_media, media_sol = MSMinimalMedia.determine_min_media(
+                com_model, minimization_method, min_growth, None, interacting, 5, printing)
+        except Exception as e:
+            # ensure that com_media and media_sol are always bound for the returns below
+            if not skip_bad_media: print(e)
+            com_media, media_sol = None, None
+    if model_s_ is not None:
+        if not isinstance(model_s_, (list,set,tuple,ndarray)):
+            try:
+                return MSMinimalMedia.determine_min_media(
+                    model_s_, minimization_method, min_growth, environment, interacting, printing)
+            except Exception as e:
+                if not skip_bad_media: print(e)
+                return None
+        members_media = {}
+        for model in model_s_:
+            try:
+                members_media[model.id] = {"media": MSMinimalMedia.determine_min_media(
+                    model, minimization_method, min_growth, environment, interacting, printing)[0]}
+                continue
+            except Exception as e:
+                if skip_bad_media: continue
+                else: print(e)
+        # print(members_media)
+        if com_model is None: return members_media
+    else: return com_media, media_sol
+    return {"community_media":com_media, "members":members_media}
+
+
+def _sigfig_check(value, sigfigs, default):
+    if str(value) in ["inf", "nan"]: value = ""
+    if FBAHelper.isnumber(value): return sigfig.round(value, sigfigs)
+    else: return default
+
+
+def nanFilter(value, string=True):
+    if isinstance(value, str) or value is None:
+        if string: return value
+        else: return nan
+    if any([value < 0, value > 1e5]): return "" if string else nan
+    return value
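Editor's note: a quick illustration of the nanFilter helper defined above (assumes this module's imports; the outcomes follow directly from the code):

assert nanFilter(0.5) == 0.5       # plausible numbers pass through unchanged
assert nanFilter(2e6) == ""        # out-of-range numbers are blanked for report tables
assert nanFilter("inf") == "inf"   # strings are returned as-is when string=True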
+class CommScores:
+    def __init__(self, member_models, min_growth=0.1, n_solutions=100, environment=None,
+                 abstol=1e-3, media_dict=None, printing=True, raw_content=False, antismash_json_path:str=None,
+                 antismash_zip_path:str=None, minimal_media_method="minFlux"):
+        self.min_growth = min_growth ; self.abstol = abstol ; self.n_solutions = n_solutions
+        self.printing = printing ; self.raw_content = raw_content
+        self.antismash_json_path = antismash_json_path ; self.antismash_zip_path = antismash_zip_path
+
+        # process the models
+        self.models = _compatibilize(member_models)
+        self.community = MSModelUtil(build_from_species_models(self.models))
+        ## define the environment
+        if environment:
+            if hasattr(environment, "get_media_constraints"):
+                ### standardize modelseed media into COBRApy media
+                environment = {"EX_" + exID: -bound[0] for exID, bound in environment.get_media_constraints().items()}
+            self.community.add_medium(environment)
+        self.environment = environment
+        ## test growth
+        for model in self.models:
+            if model.slim_optimize() == 0:
+                raise ObjectiveError(f"The model {model.id} possesses an objective value of 0 in complete media, "
+                                     "which is incompatible with minimal media computations and hence SMETANA.")
+        if self.community.model.slim_optimize() == 0:
+            raise ObjectiveError(f"The community model {self.community.model.id} possesses an objective "
+                                 "value of 0 in complete media, which is incompatible with minimal "
+                                 "media computations and hence SMETANA.")
+        ## determine the minimal media for each model, including the community
+        self.media = media_dict if media_dict else MSMinimalMedia.comm_media_est(
+            member_models, self.community.model, minimal_media_method,
+            min_growth, self.environment, True, n_solutions, printing)
+
+    def all_scores(self, mp_score=True, kbase_obj=None, cobrakbase_path:str=None,
+                   kbase_token_path:str=None, annotated_genomes:dict=None):
+        mro = self.mro_score()
+        mip = self.mip_score(interacting_media=self.media)
+        mp = None if not mp_score else self.mp_score()
+        mu = None  # self.mu_score()
+        sc = None  # self.sc_score()
+        smetana = None  # self.smetana_score()
+        gyd = self.gyd_score()
+        fs = self.fs_score() if any([kbase_obj is not None, annotated_genomes != [], cobrakbase_path is not None
+                                     and kbase_token_path is not None]) else None
+        return {"mro": mro, "mip": mip, "mp": mp, "mu": mu, "sc": sc, "smetana": smetana,
+                "gyd":gyd, "fs":fs}
+
+    def mro_score(self):
+        self.mro_val = CommScores.mro(self.models, self.media["members"], self.min_growth,
+                                      self.media, self.raw_content, self.environment, self.printing, True)
+        if not self.printing: return self.mro_val
+        if self.raw_content:
+            for pair, (interaction, media) in self.mro_val.items():
+                newcomer, established = pair.split('---')
+                print(f"\n(MRO) The {newcomer} media {media} possesses {interaction} shared "
+                      f"requirements with the {established} established member.")
+            return self.mro_val
+        for pair, mro in self.mro_val.items():
+            if pair == "mets": continue
+            newcomer, established = pair.split('---')
+            print(f"\nThe {newcomer} on {established} MRO score: {mro[0]:.2f}%. "
+                  f"This is the percent of nutritional requirements in {newcomer} "
+                  f"that overlap with {established} ({mro[1]}/{mro[2]}).")
+        return self.mro_val
+
+    def mip_score(self, interacting_media:dict=None, noninteracting_media:dict=None):
+        interacting_media = interacting_media or self.media or None
+        diff, self.mip_val = CommScores.mip(self.models, self.community.model, self.min_growth, interacting_media,
+                                            noninteracting_media, self.environment, self.printing, True)
+        if not self.printing: return self.mip_val
+        print(f"\nMIP score: {self.mip_val}\t\t\t{self.mip_val} required compound(s) can be sourced via syntrophy:")
+        if self.raw_content: pprint(diff)
+        return self.mip_val
+
+    def gyd_score(self, coculture_growth=False):
+        self.gyd_val = CommScores.gyd(self.models, environment=self.environment, coculture_growth=coculture_growth)
+        if not self.printing: return self.gyd_val
+        growth_type = "monocultural" if not coculture_growth else "cocultural"
+        for pair, score in self.gyd_val.items():
+            print(f"\nGYD score: The {growth_type} growth difference between the {pair} member models"
+                  f" is {score} times greater than the growth of the slower member.")
+        return self.gyd_val
+
+    def fs_score(self, kbase_obj=None, cobrakbase_path:str=None, kbase_token_path:str=None, annotated_genomes:dict=None):
+        self.fs_val = CommScores.fs(self.models, kbase_obj, cobrakbase_path, kbase_token_path, annotated_genomes)
+        if not self.printing: return self.fs_val
+        for pair, score in self.fs_val.items():
+            print(f"\nFS Score: The similarity of RAST functional SSO ontology "
+                  f"terms between the {pair} members is {score}.")
+        return self.fs_val
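Editor's note: a worked example of the MRO arithmetic reported above, using two hypothetical minimal media:

m1_media = {"EX_cpd00027_e0", "EX_cpd00001_e0", "EX_cpd00013_e0"}  # hypothetical member 1
m2_media = {"EX_cpd00027_e0", "EX_cpd00013_e0"}                    # hypothetical member 2
inter = m1_media & m2_media
mro_1_on_2 = 100 * len(inter) / len(m1_media)  # percent of member 1's needs shared with member 2
print(f"{mro_1_on_2:.2f}% ({len(inter)}/{len(m1_media)})")  # 66.67% (2/3)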
+    def mp_score(self):
+        print("executing MP")
+        self.mp_val = CommScores.mp(self.models, self.environment, self.community.model, None, self.abstol, self.printing)
+        if not self.printing: return self.mp_val
+        if self.raw_content:
+            print("\n(MP) The possible contributions of each member in the member media include:\n")
+            pprint(self.mp_val)
+        else:
+            print("\nMP score:\t\t\tEach member can possibly contribute the following to the community:\n")
+            for member, contributions in self.mp_val.items():
+                print(member, "\t", len(contributions))
+        return self.mp_val
+
+    def mu_score(self):
+        member_excreta = self.mp_score() if not hasattr(self, "mp_val") else self.mp_val
+        self.mu_val = CommScores.mu(self.models, self.environment, member_excreta, self.n_solutions,
+                                    self.abstol, True, self.printing)
+        if not self.printing: return self.mu_val
+        print("\nMU score:\t\t\tThe fraction of solutions in which each member is the "
+              "syntrophic receiver that contain a respective metabolite:\n")
+        pprint(self.mu_val)
+        return self.mu_val
+
+    def sc_score(self):
+        self.sc_val = CommScores.sc(self.models, self.community.model, self.min_growth,
+                                    self.n_solutions, self.abstol, True, self.printing)
+        if not self.printing: return self.sc_val
+        print("\nSC score:\t\t\tThe fraction of community members who syntrophically contribute to each species:\n")
+        pprint(self.sc_val)
+        return self.sc_val
+
+    def smetana_score(self):
+        if not hasattr(self, "sc_val"): self.sc_val = self.sc_score()
+        sc_coupling = all(v is not None for v in self.sc_val.values())
+        if not hasattr(self, "mu_val"): self.mu_val = self.mu_score()
+        if not hasattr(self, "mp_val"): self.mp_val = self.mp_score()
+
+        self.smetana = CommScores.smetana(
+            self.models, self.community.model, self.min_growth, self.n_solutions, self.abstol,
+            (self.sc_val, self.mu_val, self.mp_val), True, sc_coupling, self.printing)
+        if self.printing: print("\nsmetana score:\n") ; pprint(self.smetana)
+        return self.smetana
+
+    def antiSMASH_scores(self, antismash_json_path=None):
+        self.antismash = CommScores.antiSMASH(antismash_json_path or self.antismash_json_path)
+        if not self.printing: return self.antismash
+        if self.raw_content:
+            print("\n(antismash) The biosynthetic_areas, BGCs, protein_annotations, clusterBlast, and "
+                  "num_clusterBlast from the provided antiSMASH results:\n")
+            print("The 'areas' that antiSMASH determines produce biosynthetic products:")
+            pprint(self.antismash[0])
+            print("The set of biosynthetic gene clusters:")
+            pprint(self.antismash[1])
+            print("The set of clusterblast protein annotations:")
+            pprint(self.antismash[2])
+            print("Resistance information from clusterblast")
+            pprint(self.antismash[3])
+            print("The number of proteins associated with resistance")
+            pprint(self.antismash[4])
+            return self.antismash
+        print("\nantiSMASH scores:\n")
+        print("The community exhibited:\n"
+              f"- {len(self.antismash[0])} 'areas' that antiSMASH determines produce biosynthetic products.\n"
+              f"- {len(self.antismash[1])} biosynthetic gene clusters.\n"
+              f"- {len(self.antismash[2])} clusterblast protein annotations.\n"
+              f"- {len(self.antismash[3])} parcels of resistance information from clusterblast.\n"
+ f"- {self.antismash[4]} proteins associated with resistance.") + return list(map(len, self.antismash[:4]))+[self.antismash[4]] + + + ###### STATIC METHODS OF THE SMETANA SCORES, WHICH ARE APPLIED IN THE ABOVE CLASS OBJECT ###### + + @staticmethod + def _check_model(model_util, media, model_str, skip_bad_media): + default_media = model_util.model.medium + if media is not None: model_util.add_medium(media) + obj_val = model_util.model.slim_optimize() + if obj_val == 0 or not FBAHelper.isnumber(obj_val): + print(f"The {model_str} model input does not yield an operational model, and will therefore be gapfilled.") + # if not skip_bad_media: return MSGapfill.gapfill(model_util.model, media) + model_util.add_medium(default_media) + return model_util.model + + @staticmethod + def _load(model, kbase_obj): + model_str = model + if len(model) == 2: model = kbase_obj.get_from_ws(*model) + else: model = kbase_obj.get_from_ws(model) + return model, model_str + + @staticmethod + def _determine_growths(modelUtils): + return [util.model.slim_optimize() for util in modelUtils] + + + @staticmethod + def calculate_scores(pairs, models_media=None, environments=None, annotated_genomes=True, lazy_load=False, + kbase_obj=None, cip_score=True, costless=True, skip_bad_media=False, anme_comm=False, + print_progress=False): + from pandas import Series + + if isinstance(pairs, list): pairs, models_media, environments, annotated_genomes, lazy_load, kbase_obj = pairs + series, mets = [], [] + if not isinstance(environments, (list, tuple)): environments = [environments] + if isinstance(environments, (list, tuple)) and hasattr(environments[0], "name"): + environments = {m.name:FBAHelper.convert_kbase_media(m, 1000) for m in environments} + elif not isinstance(environments, dict): environments = {f"media{i}":m for i,m in enumerate(environments)} + pid = current_process().name + model_utils = {} + count = 0 + for model1, models in pairs.items(): + if model1.id == "": model1.id = "model1" + if lazy_load: model1, model1_str = CommScores._load(model1, kbase_obj) + else: model1_str = model1.id + if model1.id not in models_media: + models_media[model1.id] = {"media": _get_media(model_s_=model1, skip_bad_media=skip_bad_media)} + if models_media[model1.id] is None: continue + if model1.id not in model_utils: model_utils[model1.id] = MSModelUtil(model1) + # print(pid, model1) + for model_index, model2 in enumerate(models): + if model2.id == "": model2.id = "model2" + if lazy_load: model2, model2_str = CommScores._load(model2, kbase_obj) + else: model2_str = model2.id + if model2.id not in models_media: + models_media[model2.id] = {"media": _get_media(model_s_=model2, skip_bad_media=skip_bad_media)} + if models_media[model2.id] is None: continue + if model2.id not in model_utils: model_utils[model2.id] = MSModelUtil(model2) + grouping = [model1, model2] ; grouping_utils = [model_utils[model1.id], model_utils[model2.id]] + modelIDs = [model.id for model in grouping] + comm_model = build_from_species_models(grouping) + community = MSCommunity(comm_model, ids=modelIDs) + comm_sol = comm_model.optimize() + print(f"{pid}~~{count}\t{modelIDs}") + for environName, environ in environments.items(): + if print_progress: print(f"\tEnvironment\t{environName}", end="\t") + if not anme_comm: + model1 = CommScores._check_model(model_utils[model1.id], environ, model1_str, skip_bad_media) + model2 = CommScores._check_model(model_utils[model2.id], environ, model2_str, skip_bad_media) + # initiate the KBase output + report_dic = {f"model{i+1}": 
modelID for i,modelID in enumerate(modelIDs)} + g1, g2, comm = CommScores._determine_growths([model_utils[model1.id], model_utils[model2.id], community.util]) + g1, g2, comm = _sigfig_check(g1, 5, ""), _sigfig_check(g2, 5, ""), _sigfig_check(comm, 5, "") + report_dic.update({"media": environName, "model1 growth": g1, "model2 growth": g2, "community growth": comm}) + coculture_growths = {mem.id: comm_sol.fluxes[mem.primary_biomass.id] for mem in community.members} + report_dic.update({f"coculture growth model{modelIDs.index(memID)}": growth for memID, growth in coculture_growths.items()}) + # define the MRO content + mro_values = CommScores.mro(grouping, models_media, raw_content=True, environment=environ) + report_dic.update({f"MRO_model{modelIDs.index(models_string.split('--')[0])+1}": + f"{100*len(intersection)/len(memMedia):.3f}% ({len(intersection)}/{len(memMedia)})" + for models_string, (intersection, memMedia) in mro_values.items()}) + mets.append({"MRO metabolites": list(mro_values.values())[0][0]}) + if print_progress: print("MRO done", end="\t") + # define the CIP content + if cip_score: + cip_values = CommScores.cip(modelutils=[model_utils[mem.id] for mem in grouping]) + report_dic.update({"CIP": cip_values[1]}) + mets[-1].update({"CIP metabolites": list(cip_values[0])}) + if print_progress: print("CIP done", end="\t") + # define the MIP content + mip_values = CommScores.mip(grouping, comm_model, 0.1, None, None, environ, print_progress, True, + costless, costless, skip_bad_media) + # print(mip_values) + if mip_values is not None: + report_dic.update({f"MIP_model{modelIDs.index(models_name)+1}": str(len(received)) + for models_name, received in mip_values[0].items()}) + mets[-1].update({"MIP model1 metabolites": list(mip_values[0].values())[0], + "MIP model2 metabolites": list(mip_values[0].values())[1]}) + if costless: + for models_name, received in mip_values[1].items(): + report_dic[f"MIP_model{modelIDs.index(models_name)+1} (costless)"] = report_dic[ + f"MIP_model{modelIDs.index(models_name)+1}"] + f" ({len(received)})" + del report_dic[f"MIP_model{modelIDs.index(models_name)+1}"] + if print_progress: print("costless_MIP done", end="\t") + else: + report_dic.update({f"MIP_model1 (costless)": "", f"MIP_model2 (costless)": ""}) + mets[-1].update({"MIP model1 metabolites": [None], "MIP model2 metabolites": [None]}) + if print_progress: print("MIP done", end="\t") + # define the BSS content + bss_values = CommScores.bss(grouping, grouping_utils, environments, models_media, skip_bad_media) + report_dic.update({f"BSS_model{modelIDs.index(name.split(' supporting ')[0])+1}": + f"{_sigfig_check(100*val, 5, '')}%" for name, (mets, val) in bss_values.items()}) + mets[-1].update({"BSS model1 metabolites": [met_set for met_set, val in bss_values.values()][0], + "BSS model2 metabolites": [met_set for met_set, val in bss_values.values()][1]}) + # mets[-1].update({"bss_mets": list(bss_values[0].values())}) + if print_progress: print("BSS done", end="\t") + # define the PC content + pc_values = CommScores.pc(grouping, grouping_utils, comm_model, None, comm_sol, environ, True, community) + report_dic.update({"PC_comm": _sigfig_check(pc_values[0], 5, ""), + "PC_model1": _sigfig_check(list(pc_values[1].values())[0], 5, ""), + "PC_model2": _sigfig_check(list(pc_values[1].values())[1], 5, ""), + "BIT": pc_values[3]}) + if print_progress: print("PC done\tBIT done", end="\t") + # print([mem.slim_optimize() for mem in grouping]) + # define the GYD content + gyd1, gyd2, g1, g2 = 
list(CommScores.gyd(grouping, grouping_utils, environ, False, community, anme_comm).values())[0]
+                report_dic.update({"GYD1": _sigfig_check(gyd1, 5, ""), "GYD2": _sigfig_check(gyd2, 5, "")})
+                if print_progress: print("GYD done\t\t", end="\t" if annotated_genomes else "\n")
+                # define the FS content
+                if kbase_obj is not None and annotated_genomes and not anme_comm:
+                    fs_values = list(CommScores.fs(grouping, kbase_obj, annotated_genomes=annotated_genomes).values())[0]
+                    print(len(fs_values[0]) if fs_values[0] is not None else "NaN", fs_values[1])
+                    report_dic.update({"FS": sigfig.round(fs_values[1], 5)})
+                    if fs_values is not None: mets[-1].update({"FS features": fs_values[0]})
+                    if print_progress: print("FS done\t\t")
+                # return a pandas Series, which can be easily aggregated with other results into a DataFrame
+                series.append(Series(report_dic))
+            count += 1
+        return series, mets
+
+    @staticmethod
+    def html_report(df, mets, export_html_path="commscores_report.html", msdb_path=None):
+        from modelseedpy.core.report import commscores_report
+        return commscores_report(df, mets, export_html_path, msdb_path)
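Editor's note: a minimal sketch of how the per-pair Series returned by calculate_scores aggregate into the report DataFrame via concat(series, axis=1).T, with toy values:

from pandas import Series, concat

s1 = Series({"model1": "A", "model2": "B", "MIP_model1": "2"})  # toy per-pair records
s2 = Series({"model1": "A", "model2": "C", "MIP_model1": "0"})
report = concat([s1, s2], axis=1).T  # one row per model pair
print(report)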
f"model_index{index}" + new_models.append(model) + all_models = new_models[:] + if not mem_media: models_media = _get_media(model_s_=all_models, skip_bad_media=skip_bad_media) + else: + models_media = mem_media.copy() + missing_models = set() + missing_modelID = [] + for model in all_models: + if model is not None and model.id not in models_media: + missing_models.add(model) + missing_modelID.append(model if not hasattr(model, "id") else model.id) + if missing_models != set(): + print(f"Media of the {missing_modelID} models are not defined, and will be calculated separately.") + models_media.update(_get_media(model_s_=missing_models), skip_bad_media=skip_bad_media) + if see_media: print(f"The minimal media of all members:\n{models_media}") + print(f"\nExamining the {len(list(model_pairs))} model pairs") + if pool_size is not None: + from datetime import datetime ; from multiprocess import Pool + print(f"Loading {int(pool_size)} workers and computing the scores", datetime.now()) + pool = Pool(int(pool_size)) #.map(calculate_scores, [{k: v} for k,v in pairs.items()]) + args = [[dict([pair]), models_media, environments, annotated_genomes, lazy_load, kbase_obj] + for pair in list(pairs.items())] + output = pool.map(CommScores.calculate_scores, args) + series = chain.from_iterable([ele[0] for ele in output]) + mets = chain.from_iterable([ele[1] for ele in output]) + else: series, mets = CommScores.calculate_scores(pairs, models_media, environments, annotated_genomes, lazy_load, + kbase_obj, cip_score, costless, skip_bad_media, anme_comm, print_progress) + return concat(series, axis=1).T, mets + + @staticmethod + def mro(member_models:Iterable=None, mem_media:dict=None, min_growth=0.1, media_dict=None, + raw_content=False, environment=None, skip_bad_media=False, printing=False, compatibilized=False): + """Determine the overlap of nutritional requirements (minimal media) between member organisms.""" + # determine the member minimal media if they are not parameterized + if not mem_media: + if not member_models: + raise ParameterError("The either member_models or minimal_media parameter must be defined.") + member_models = member_models if compatibilized else _compatibilize(member_models, printing) + mem_media = _get_media(media_dict, None, member_models, min_growth, environment, printing=printing, + skip_bad_media=skip_bad_media) + if "community_media" in mem_media: mem_media = mem_media["members"] + # MROs = array(list(map(len, pairs.values()))) / array(list(map(len, mem_media.values()))) + mro_values = {} + for model1, model2 in combinations(member_models, 2): + intersection = set(mem_media[model1.id]["media"].keys()) & set(mem_media[model2.id]["media"].keys()) + inter = [ex.replace("EX_", "").replace("_e0", "") for ex in intersection] + m1_media = mem_media[model1.id]["media"] ; m2_media = mem_media[model2.id]["media"] + if raw_content: mro_values.update({f"{model1.id}---{model2.id})": (inter, m1_media), + f"{model2.id}---{model1.id})": (inter, m2_media)}) + else: + mro_values.update({f"{model1.id}---{model2.id})": 100*(len(inter) / len(m1_media), len(inter), len(m1_media)), + f"{model2.id}---{model1.id})": 100*(len(inter) / len(m2_media), len(inter), len(m2_media)), + "mets": inter}) + return mro_values + # return mean(list(map(len, pairs.values()))) / mean(list(map(len, mem_media.values()))) + + @staticmethod + def mip(member_models: Iterable, com_model=None, min_growth=0.1, interacting_media_dict=None, + noninteracting_media_dict=None, environment=None, printing=False, compatibilized=False, 
+ costless=False, multi_output=False, skip_bad_media=False): + """Determine the quantity of nutrients that can be potentially sourced through syntrophy""" + member_models, community = _load_models(member_models, com_model, not compatibilized, printing=printing) + # determine the interacting and non-interacting media for the specified community .util.model + noninteracting_medium, noninteracting_sol = _get_media( + noninteracting_media_dict, community, None, min_growth, environment, False, skip_bad_media=skip_bad_media) + if noninteracting_medium is None: return None + if "community_media" in noninteracting_medium: noninteracting_medium = noninteracting_medium["community_media"] + interacting_medium, interacting_sol = _get_media( + interacting_media_dict, community, None, min_growth, environment, True, skip_bad_media=skip_bad_media) + if interacting_medium is None: return None + if "community_media" in interacting_medium: interacting_medium = interacting_medium["community_media"] + interact_diff = DeepDiff(noninteracting_medium, interacting_medium) + if "dictionary_item_removed" not in interact_diff: return None + cross_fed_exIDs = [re.sub("(root\['|'\])", "", x) for x in interact_diff["dictionary_item_removed"]] + # Determine each direction of the MIP score interactions + comm_util = MSModelUtil(community) + cross_fed_metIDs = [ex.replace("EX_", "").replace("_e0", "") for ex in cross_fed_exIDs] + cross_fed_copy = cross_fed_metIDs[:] + directionalMIP = {mem.id:[] for mem in member_models} + for rxn in comm_util.transport_list(): + # print(rxn.reaction, "\t", [met.id for met in rxn.metabolites if "_e0" in met.id]) + metIDs = list(set([met.id.split("_")[0] for met in rxn.reactants]).intersection( + set([met.id.split("_")[0] for met in rxn.products]))) + if len(metIDs) == 1: metID = metIDs[0] + else: + if "cpd00067" in metIDs: metIDs.remove("cpd00067") + metID = metIDs[0] + if metID not in cross_fed_metIDs: continue + rxn_index = FBAHelper.compartment_index(rxn.id.split("_")[-1]) + if rxn_index == 0: continue + mets = [met for met in rxn.metabolites if met.id == f"{metID}_c{rxn_index}"] + if mets == []: print(f"The {metID}_c{rxn_index} is missing in {rxn.reaction}.") ; continue + rxn_model = member_models[rxn_index-1] + # comm_trans[metID] = comm_trans.get(f"{metID}_c{rxn_index}", {}) + if (rxn.metabolites[mets[0]] > 0 and interacting_sol.fluxes[rxn.id] > 0 + or rxn.metabolites[mets[0]] < 0 and interacting_sol.fluxes[rxn.id] < 0): # donor + directionalMIP[rxn_model.id].append(metID) + if metID in cross_fed_copy: cross_fed_copy.remove(metID) ; continue + # if printing: print(f"{mets[0]} in {rxn.id} ({rxn.reaction}) is not assigned a receiving member.") + if cross_fed_copy != [] and printing: print(f"Missing directions for the {cross_fed_copy} cross-fed metabolites") + outputs = [directionalMIP] + # TODO categorize all of the cross-fed substrates to examine potential associations of specific compounds + if costless: + costless_mets, numExs = CommScores.cip(member_models=member_models) + # print(list(directionalMIP.values()), costless_mets) + costlessDirectionalMIP = {member_name: set(receive_mets).intersection(costless_mets) + for member_name, receive_mets in directionalMIP.items()} + if not multi_output: return costlessDirectionalMIP + outputs.append(costlessDirectionalMIP) + return outputs + + @staticmethod + def cip(modelutils=None, member_models=None): # costless interaction potential + if not modelutils: modelutils = {MSModelUtil(model) for model in member_models} + costless_mets = 
set(chain.from_iterable([modelutil.costless_excreta() for modelutil in modelutils])) + return costless_mets, len(costless_mets) + + @staticmethod + def contributions(org_possible_contributions, scores, model_util, abstol): + # identify and log excreta from the solution + model_util.add_objective(sum(ex_rxn.flux_expression for ex_rxn in org_possible_contributions)) + sol = model_util.model.optimize() + if sol.status != "optimal": + # exit the while loop by returning the original possible_contributions, + ## hence DeepDiff == {} and the while loop terminates + return scores, org_possible_contributions + # identify and log excreta from the solution + possible_contributions = org_possible_contributions[:] + for ex in org_possible_contributions: + if ex.id in sol.fluxes.keys() and sol.fluxes[ex.id] >= abstol: + possible_contributions.remove(ex) + scores[model_util.model.id].update([met.id for met in ex.metabolites]) + return scores, possible_contributions + + @staticmethod + def mp(member_models:Iterable, environment, com_model=None, minimal_media=None, abstol=1e-3, printing=False): + """Discover the metabolites that each species can contribute to a community""" + community = _compatibilize(com_model) if com_model else build_from_species_models(member_models,standardize=True) + community.medium = minimal_media or MSMinimalMedia.minimize_flux(community) + scores = {} + for org_model in member_models: # TODO support parsing the individual members through the MSCommunity object + model_util = MSModelUtil(org_model) + model_util.compatibilize(printing=printing) + if environment: model_util.add_medium(environment) + scores[model_util.model.id] = set() + # determines possible member contributions in the community environment, where the excretion of media compounds is irrelevant + org_possible_contr = [ex_rxn for ex_rxn in model_util.exchange_list() + if (ex_rxn.id not in community.medium and ex_rxn.upper_bound > 0)] + # ic(org_possible_contributions, len(model_util.exchange_list()), len(community.medium)) + scores, possible_contr = CommScores.contributions(org_possible_contr, scores, model_util, abstol) + while DeepDiff(org_possible_contr, possible_contr): + print("remaining possible_contributions", len(possible_contr), end="\r") + ## optimize the sum of the remaining exchanges that have not surpassed the abstol + org_possible_contr = possible_contr[:] + scores, possible_contr = CommScores.contributions(org_possible_contr, scores, model_util, abstol) + + ## individually checks the remaining possible contributions + for ex_rxn in possible_contr: + model_util.model.objective = Objective(ex_rxn.flux_expression) + sol = model_util.model.optimize() + if sol.status == 'optimal' or sol.objective_value > abstol: + for met in ex_rxn.metabolites: + if met.id in scores[model_util.model.id]: + scores[model_util.model.id].remove(met.id) ; print("removing", met.id) + return scores + + @staticmethod + def mu(member_models:Iterable, environment=None, member_excreta=None, n_solutions=100, abstol=1e-3, + compatibilized=False, printing=True): + """the fractional frequency of each received metabolite amongst all possible alternative syntrophic solutions""" + # member_solutions = member_solutions if member_solutions else {model.id: model.optimize() for model in member_models} + scores = {} + member_models = member_models if compatibilized else _compatibilize(member_models, printing) + if member_excreta: + missing_members = [model for model in member_models if model.id not in member_excreta] + if missing_members: + 
print(f"The {','.join(missing_members)} members are missing from the defined " + f"excreta list and will therefore be determined through an additional MP simulation.") + member_excreta.update(CommScores.mp(missing_members, environment)) + else: member_excreta = CommScores.mp(member_models, environment, None, abstol, printing) + for org_model in member_models: + other_excreta = set(chain.from_iterable([excreta for model, excreta in member_excreta.items() + if model != org_model.id])) + print(f"\n{org_model.id}\tOther Excreta", other_excreta) + model_util = MSModelUtil(org_model, True) + if environment: + model_util.add_medium(environment) + ex_rxns = {ex_rxn: list(ex_rxn.metabolites)[0] for ex_rxn in model_util.exchange_list()} + print(f"\n{org_model.id}\tExtracellular reactions", ex_rxns) + variables = {ex_rxn.id: Variable('___'.join([model_util.model.id, ex_rxn.id]), + lb=0, ub=1, type="binary") for ex_rxn in ex_rxns} + model_util.add_cons_vars(list(variables.values())) + media, solutions = [], [] + sol = model_util.model.optimize() + while sol.status == "optimal" and len(solutions) < n_solutions: + solutions.append(sol) + medium = set([ex for ex in ex_rxns if sol.fluxes[ex.id] < -abstol and ex in other_excreta]) + model_util.create_constraint(Constraint(sum([variables[ex.id] for ex in medium]), + ub=len(medium)-1, name=f"iteration_{len(solutions)}")) + media.append(medium) + sol = model_util.model.optimize() + counter = Counter(chain(*media)) + scores[model_util.model.id] = {met.id: counter[ex] / len(media) + for ex, met in ex_rxns.items() if counter[ex] > 0} + return scores + + @staticmethod + def sc(member_models:Iterable=None, com_model=None, min_growth=0.1, n_solutions=100, + abstol=1e-6, compatibilized=True, printing=False): + """Calculate the frequency of interspecies dependency in a community""" + member_models, community = _load_models( + member_models, com_model, not compatibilized, printing=printing) + for rxn in com_model.reactions: + rxn.lower_bound = 0 if 'bio' in rxn.id else rxn.lower_bound + + # c_{rxn.id}_lb: rxn < 1000*y_{species_id} + # c_{rxn.id}_ub: rxn > -1000*y_{species_id} + variables = {} + constraints = [] + # TODO this can be converted to an MSCommunity object by looping through each index + # leverage CommKinetics + for org_model in member_models: + model_util = MSModelUtil(org_model, True) + variables[model_util.model.id] = Variable(name=f'y_{model_util.model.id}', lb=0, ub=1, type='binary') + model_util.add_cons_vars([variables[model_util.model.id]]) + for rxn in model_util.model.reactions: + if "bio" not in rxn.id: + # print(rxn.flux_expression) + lb = Constraint(rxn.flux_expression + 1000*variables[model_util.model.id], + name="_".join(["c", model_util.model.id, rxn.id, "lb"]), lb=0) + ub = Constraint(rxn.flux_expression - 1000*variables[model_util.model.id], + name="_".join(["c", model_util.model.id, rxn.id, "ub"]), ub=0) + constraints.extend([lb, ub]) + + # calculate the SCS + scores = {} + for model in member_models: + com_model_util = MSModelUtil(com_model) + com_model_util.add_cons_vars(constraints, sloppy=True) + # model growth is guaranteed while minimizing the growing members of the community + ## SMETANA_Biomass: {biomass_reactions} > {min_growth} + com_model_util.create_constraint(Constraint(sum(rxn.flux_expression for rxn in model.reactions + if "bio" in rxn.id), name='SMETANA_Biomass', lb=min_growth)) # sloppy = True) + other_members = [other for other in member_models if other.id != model.id] + 
com_model_util.add_objective(sum([variables[other.id] for other in other_members]), "min") + previous_constraints, donors_list = [], [] + for i in range(n_solutions): + sol = com_model.optimize() # FIXME The solution is not optimal + if sol.status != 'optimal': + scores[model.id] = None + break + donors = [o for o in other_members if com_model.solver.primal_values[f"y_{o.id}"] > abstol] + donors_list.append(donors) + previous_con = f'iteration_{i}' + previous_constraints.append(previous_con) + com_model_util.add_cons_vars([Constraint(sum(variables[o.id] for o in donors), name=previous_con, + ub=len(previous_constraints)-1)], sloppy=True) + if i != 0: + donors_counter = Counter(chain(*donors_list)) + scores[model.id] = {o.id: donors_counter[o] / len(donors_list) for o in other_members} + return scores + + @staticmethod + def gyd(member_models:Iterable=None, model_utils:Iterable=None, environment=None, coculture_growth=False, + community=None, anme_comm=False): + gyds = {} + for combination in combinations(model_utils or member_models, 2): + if model_utils is None: + model1_util = MSModelUtil(combination[0], True) ; model2_util = MSModelUtil(combination[1], True) + print(f"{model1_util.model.id} ++ {model2_util.model.id}", model1_util.model.slim_optimize(), model2_util.model.slim_optimize()) + if environment and not anme_comm: model1_util.add_medium(environment); model2_util.add_medium(environment) + else: model1_util = combination[0] ; model2_util = combination[1] + if not coculture_growth: + G_m1, G_m2 = CommScores._determine_growths([model1_util, model2_util]) + G_m1, G_m2 = G_m1 if FBAHelper.isnumber(str(G_m1)) else 0, G_m2 if FBAHelper.isnumber(str(G_m2)) else 0 + else: + community = community or MSCommunity(member_models=[model1_util.model, model2_util.model], + ids=[mem.id for mem in member_models]) + community.run_fba() + member_growths = community.parse_member_growths() + G_m1, G_m2 = member_growths[model1_util.model.id], member_growths[model2_util.model.id] + if G_m2 <= 0 or G_m1 <= 0: gyds[f"{model1_util.model.id} ++ {model2_util.model.id}"] = ("", "", G_m1, G_m2) ; continue + gyds[f"{model1_util.model.id} ++ {model2_util.model.id}"] = (abs(G_m1-G_m2)/G_m1, abs(G_m2-G_m1)/G_m2, G_m1, G_m2) + return gyds + + @staticmethod + def pc(member_models=None, modelutils=None, com_model=None, isolate_growths=None, comm_sol=None, + environment=None, comm_effects=True, community=None, interaction_threshold=0.1, compatibilized=False): + assert member_models or modelutils or community, "Members must be defined through either < member_models >" \ + "or < modelutils > or < community >." 
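+        # PC = community growth / the summed monoculture growths; when comm_effects is
+        # requested, each member's coculture:monoculture growth ratio is additionally
+        # classified against 1 +/- interaction_threshold into a qualitative interaction type.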
+        member_models = member_models or (None if modelutils is None else [mem.model for mem in modelutils]) or community.members
+        if com_model is None: member_models, com_model = _load_models(member_models, None, not compatibilized, printing=False)
+        community = community or MSCommunity(com_model, member_models)
+        if comm_sol is None: community.util.add_medium(environment) ; comm_sol = community.util.model.optimize()
+        model_utils = modelutils or [MSModelUtil(mem, True) for mem in member_models] ; modelutils = []
+        for mem in model_utils:
+            mem.add_medium(environment) ; modelutils.append(mem)
+        if isolate_growths is None: isolate_growths = {mem.id: mem.model.slim_optimize() for mem in modelutils}
+        pc_score = (comm_sol.objective_value/sum(list(isolate_growths.values())))
+        if not comm_effects: return pc_score
+
+        comm_member_growths = {mem.id: comm_sol.fluxes[mem.primary_biomass.id] for mem in community.members}
+        comm_growth_effect = {memID: nanFilter(comm_environ/isolate_growths[memID])
+                              for memID, comm_environ in comm_member_growths.items()}
+        growth_diffs = array([nanFilter(x, False) for x in list(comm_growth_effect.values())])
+        th_pos, th_neg = 1+interaction_threshold, 1-interaction_threshold
+        if all(growth_diffs > th_pos): bit = "mutualism"
+        elif all(growth_diffs < th_neg): bit = "competitive"
+        elif ((th_pos > growth_diffs) & (growth_diffs > th_neg)).all(): bit = "neutral"
+        elif all(growth_diffs > th_neg) and any(growth_diffs > th_pos): bit = "commensalism"
+        elif all(growth_diffs < th_pos) and any(growth_diffs < th_neg): bit = "amensalism"
+        elif any(growth_diffs > th_pos) and any(growth_diffs < th_neg): bit = "parasitism"
+        else: print(f"The relative growths {comm_growth_effect} from the {comm_member_growths} coculture and"
+                    f" {isolate_growths} monocultures are not captured.") ; bit = ""
+        return (pc_score, comm_growth_effect, comm_member_growths, bit)
+
+    @staticmethod
+    def bss(member_models:Iterable=None, model_utils:Iterable=None, environments=None, minMedia=None, skip_bad_media=False):
+        def compute_score(minMedia, environment=None, index=0):
+            minMedia = minMedia or _get_media(model_s_=[modelUtil.model for modelUtil in model_utils],
+                                              environment=environment, skip_bad_media=skip_bad_media)
+            model1_media = set([re.sub(r"(\_\w\d+$)", "", rxnID.replace("EX_", ""))
+                                for rxnID in minMedia[model1_util.id]["media"].keys()])
+            model2_media = set([re.sub(r"(\_\w\d+$)", "", rxnID.replace("EX_", ""))
+                                for rxnID in minMedia[model2_util.id]["media"].keys()])
+            model1_internal = {rm_comp(met.id) for rxn in model1_util.internal_list() for met in rxn.products}
+            model2_internal = {rm_comp(met.id) for rxn in model2_util.internal_list() for met in rxn.products}
+            bss_scores[f"{model1_util.id} supporting {model2_util.id} in media{index}"] = (model1_internal,
+                len(model2_media.intersection(model1_internal)) / len(model2_media))
+            bss_scores[f"{model2_util.id} supporting {model1_util.id} in media{index}"] = (model2_internal,
+                len(model1_media.intersection(model2_internal)) / len(model1_media))
+
+        bss_scores = {}
+        wrap_models = model_utils is None
+        for combination in combinations(model_utils or member_models, 2):
+            if wrap_models:  # raw models must be wrapped on every iteration, not only the first
+                model1_util = MSModelUtil(combination[0], True) ; model2_util = MSModelUtil(combination[1], True)
+                model_utils = [model1_util, model2_util]
+            else: model1_util = combination[0] ; model2_util = combination[1]
+            if environments:
+                for index, environment in enumerate(environments):
+                    compute_score(minMedia, environment, index)
+            else: compute_score(minMedia)
+        return bss_scores
+
+    @staticmethod
+    def mqs():
+        pass
+
+
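+    # A minimal usage sketch of the pairwise scores above, assuming two hypothetical
+    # member models `modelA` and `modelB`; the import path is illustrative:
+    #     from modelseedpy.community.commscores import CommScores
+    #     mro = CommScores.mro([modelA, modelB], min_growth=0.1)    # nutritional overlap
+    #     mip = CommScores.mip([modelA, modelB], min_growth=0.1)    # potential cross-feeding
+    #     mp = CommScores.mp([modelA, modelB], environment=None)    # member contributions
+    #     mu = CommScores.mu([modelA, modelB], member_excreta=mp)   # uptake frequencies
+    #     smetana = CommScores.smetana([modelA, modelB], None, prior_values=(mu, mp))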
@staticmethod + def _calculate_jaccard_score(set1, set2): + if set1 == set2: print(f"The sets are identical, with a length of {len(set1)}.") + if len(set1.union(set2)) == 0: return (None, None) + return (set1.intersection(set2), len(set1.intersection(set2)) / len(set1.union(set2))) + + @staticmethod + def get_all_genomes_from_ws(ws_id, kbase_object=None, cobrakbase_repo_path:str=None, kbase_token_path:str=None): + def get_genome(genome_name): + return kbase_object.ws_client.get_objects2( + {'objects': [{'ref': f"{ws_id}/{genome_name}"}]})["data"][0]['data'] + # load the kbase client instance + if not kbase_object: + import os + os.environ["HOME"] = cobrakbase_repo_path + import cobrakbase + with open(kbase_token_path) as token_file: kbase_object = cobrakbase.KBaseAPI(token_file.readline()) + + # calculate the complementarity + genome_list = kbase_object.ws_client.list_objects( + {"ids": [ws_id], "type": 'KBaseGenomes.Genome', 'minObjectID': 0, 'maxObjectID': 10000}) + genome_names = [g[1] for g in genome_list if g[1].endswith("RAST")] + return {genome_name: set([sso for j in get_genome(genome_name)['cdss'] + for sso in j['ontology_terms']['SSO'].keys()]) + for genome_name in genome_names} + + @staticmethod + def fs(models:Iterable=None, kbase_object=None, cobrakbase_repo_path:str=None, + kbase_token_path:str=None, annotated_genomes:dict=None, printing=False): + if not isinstance(annotated_genomes, dict): + if not kbase_object: + import os ; os.environ["HOME"] = cobrakbase_repo_path ; import cobrakbase + with open(kbase_token_path) as token_file: kbase_object = cobrakbase.KBaseAPI(token_file.readline()) + annotated_genomes = {model.id: kbase_object.get_from_ws(model.genome_ref) + for model in models if hasattr(model, "genome_ref")} + elif isinstance(annotated_genomes, list): annotated_genomes = dict(zip([model.id for model in models], annotated_genomes)) + elif models is not None: + annotated_genomes = {k:v for k,v in annotated_genomes.items() if k in [model.id for model in models]} + genome_combinations = list(combinations(annotated_genomes.keys(), 2)) + if printing: print(f"The Functionality Score (FS) will be calculated for {len(genome_combinations)} pairs.") + if not isinstance(list(annotated_genomes.values())[0], dict): + genome1_set, genome2_set = set(), set() + distances = {} + for genome1, genome2 in genome_combinations: + for j in annotated_genomes[genome1].features: + for key, val in j.ontology_terms.items(): + if key == 'SSO': genome1_set.update(val) + for j in annotated_genomes[genome2].features: + for key, val in j.ontology_terms.items(): + if key == 'SSO': genome2_set.update(val) + distances[f"{genome1} ++ {genome2}"] = CommScores._calculate_jaccard_score(genome1_set, genome2_set) + else: + distances = {f"{genome1} ++ {genome2}": CommScores._calculate_jaccard_score( + set(list(content["SSO"].keys())[0] for dic in annotated_genomes[genome1]["cdss"] + for x, content in dic.items() if x == "ontology_terms" and len(content["SSO"].keys()) > 0), + set(list(content["SSO"].keys())[0] for dic in annotated_genomes[genome2]["cdss"] + for x, content in dic.items() if x == "ontology_terms" and len(content["SSO"].keys()) > 0)) + for genome1, genome2 in combinations(annotated_genomes.keys(), 2)} + return distances + + @staticmethod + def smetana(member_models: Iterable, environment, com_model=None, min_growth=0.1, n_solutions=100, + abstol=1e-6, prior_values=None, compatibilized=False, sc_coupling=False, printing=False): + """Quantifies the extent of syntrophy as the sum of all exchanges in 
a given nutritional environment"""
+        member_models, community = _load_models(
+            member_models, com_model, not compatibilized, printing=printing)
+        sc = None
+        if not prior_values:
+            mp = CommScores.mp(member_models, environment, com_model, None, abstol)
+            mu = CommScores.mu(member_models, environment, mp, n_solutions, abstol, compatibilized)
+            if sc_coupling:
+                sc = CommScores.sc(member_models, com_model, min_growth, n_solutions, abstol, compatibilized)
+        elif len(prior_values) == 3: sc, mu, mp = prior_values
+        else: mu, mp = prior_values
+
+        smetana_scores = {}
+        for pairs in combinations(member_models, 2):
+            for model1, model2 in permutations(pairs):
+                if model1.id not in smetana_scores:
+                    smetana_scores[model1.id] = {}
+                if mu[model1.id] and mp[model1.id]:
+                    sc_score = 1 if not sc_coupling else sc[model1.id][model2.id]
+                    models_mets = list(model1.metabolites)+list(model2.metabolites)
+                    unique_mets = set([met.id for met in models_mets])
+                    smetana_scores[model1.id][model2.id] = 0
+                    for metID in unique_mets:  # each metabolite is counted once per pair
+                        mp_score = 0 if metID not in mp[model1.id] else 1
+                        smetana_scores[model1.id][model2.id] += mu[model1.id].get(metID, 0)*sc_score*mp_score
+        return smetana_scores
+
+    @staticmethod
+    def antiSMASH(json_path=None, zip_path=None):
+        # TODO Scores 2, 4, and 5 are being explored for relevance to community formation and reveal specific member interactions/targets
+        # load the antiSMASH report from either the JSON or the raw ZIP, or both
+        from os import mkdir, listdir, path
+        from zipfile import ZipFile
+        from json import load
+        if json_path:
+            cwd_files = listdir()
+            if json_path not in cwd_files and zip_path:
+                with ZipFile(zip_path, "r") as zip_file:
+                    zip_file.extract(json_path)
+            with open(json_path, "r") as json_file:
+                data = load(json_file)
+        elif zip_path:
+            mkdir("extracted_antiSMASH")
+            with ZipFile(zip_path, "r") as zip_file:
+                zip_file.extractall("extracted_antiSMASH")
+            json_files = [x for x in listdir("extracted_antiSMASH") if x.endswith("json")]
+            if len(json_files) > 1:
+                print(f"The antiSMASH report contains {len(json_files)} JSON files; the first ({json_files[0]}) is "
+                      f"selected for analysis. Specify the json_path parameter to analyze a different file.")
+            with open(path.join("extracted_antiSMASH", json_files[0]), "r") as json_file:
+                data = load(json_file)
+        else:
+            raise ParameterError("Either the json_path or zip_path from the antiSMASH analysis must be provided"
+                                 " for these scores to be determined.")
+        # parse the data and scores from the antiSMASH report
+        biosynthetic_areas = data["records"][0]['areas']
+        BGCs = set(array([biosynthetic_areas[i]['products'] for i in range(len(biosynthetic_areas))]).flatten())
+        len_proteins = len(data["records"][0]['modules']['antismash.modules.clusterblast']['knowncluster']['proteins'])
+        protein_annotations = [data["records"][0]['modules']['antismash.modules.clusterblast']['knowncluster']['proteins'][i]['annotations']
+                               for i in range(len_proteins)]
+        clusterBlast = [s for s in protein_annotations if "resistance" in s]
+        num_clusterBlast = sum([item.count("resistance") for item in protein_annotations])
+
+        return biosynthetic_areas, BGCs, protein_annotations, clusterBlast, num_clusterBlast
diff --git a/modelseedpy/community/commscores_template.html b/modelseedpy/community/commscores_template.html
new file mode 100644
index 00000000..b379568a
--- /dev/null
+++ b/modelseedpy/community/commscores_template.html
@@ -0,0 +1,157 @@
+ <!-- HTML template that renders the "CommScores Results" report table (markup elided) -->
\ No newline at end of file
diff --git a/modelseedpy/community/datastandardization.py b/modelseedpy/community/datastandardization.py
new file mode 100644
index 00000000..932ae461
--- /dev/null
+++ b/modelseedpy/community/datastandardization.py
@@ -0,0 +1,770 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Aug 1 11:44:07 2022
+
+@author: Andrew Freiburger
+"""
+from modelseedpy.community.commhelper import phenotypes
+from modelseedpy.core.exceptions import ParameterError
+from modelseedpy.core.optlanghelper import isIterable
+from modelseedpy.core.fbahelper import FBAHelper
+from optlang import Constraint
+from optlang.symbolics import Zero
+from scipy.constants import hour
+from zipfile import ZipFile, ZIP_LZMA
+from itertools import chain
+from typing import Union, Iterable
+from copy import deepcopy
+from icecream import ic
+# from cplex import Cplex
+import logging, json, os, re
+from pandas import read_csv, DataFrame, ExcelFile
+import numpy as np
+
+
+logger = logging.getLogger(__name__)
+
+def isnumber(string):
+    try:
+        float(string)
+    except (ValueError, TypeError):
+        return False
+    return True
+
+def _findDate(string, numerical=False):
+    monthNames = ["January", "February", "March", "April", "May", "June", "July",
+                  "August", "September", "October", "November", "December"]
+    monthNums = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+    days = list(range(31, 0, -1))  # [f"{num}-" for num in list(range(31,0,-1))]
+    years = list(range(2010, 2025))+list(range(10, 25))  # [f"-{num}" for num in list(range(2000, 2100))]
+    americanDates = [f"{mon}-{day}-{year}" for mon in monthNums for day in days for year in years]
+
+    for date in americanDates:
+        if re.search(date, string):
+            month, day, year = date.split("-")
+            if numerical:
+                return "-".join([day, month, year])
+            return f"{monthNames[int(month)-1][:3]} {day}, {year}"
+    # # determine the month
+    # for monName in monthNames:
+    #     if re.search(monName, string):
+    #         month = monName
+    #         break
+    # if not month:
+    #     for monNum in monthNums:
+    #         if re.search(monNum, string):
+    #             month = monNum  # maybe should be converted to the Name for standardization
+    # # determine the day
+    # for dayNum in days:
+    #     if re.search(dayNum, string):
+    #         day = dayNum
+    #         break
+    # # determine the year
+    # for yearNum in years:
+    #     if re.search(yearNum, string):
+    #         year = yearNum
+    #         break
+    # return day+month+year
+
+def dict_keys_exists(dic, *keys):
+    if keys[0] in dic:
+        remainingKeys = keys[1:]
+        if len(remainingKeys) > 0:
+            return dict_keys_exists(dic[keys[0]], *remainingKeys)
+        return True
+    return False
+
+def find_dic_number(dic):
+    for k, v in dic.items():
+        if isnumber(v):
+            return v
+        num = find_dic_number(dic[k])
+        return num
+
+def default_dict_values(dic, key, default):
+    return default if not key in dic else dic[key]
+
+def trial_contents(short_code, indices_tup, values):
+    matches = [ele == short_code for ele in indices_tup]
+    return np.array(values)[matches]
+
+def _spreadsheet_extension_load(path):
+    if ".csv" in path:
+        return read_csv(path)
+    elif ".xls" in path:
+        return ExcelFile(path)
+
+def _spreadsheet_extension_parse(path, raw_data, org_sheet):
+    if ".csv" in path:
+        return raw_data
+    elif ".xls" in path:
+        return raw_data.parse(org_sheet)
+
+def _met_id_parser(met):
+    met_id = re.sub('(\_\w\d+)', '', met)
+    met_id = met_id.replace('EX_', '', 1)
+    met_id = met_id.replace('c_', '', 1)
+    return met_id
+
+def _column_reduction(org_df):
+    dataframe = org_df.copy()  # this prevents an irrelevant warning from pandas
+    dataframe.columns = map(str, dataframe.columns)
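+    # index by well ID, drop the metadata columns, and coerce the timestep labels
+    # to integers (casting through float tolerates "1.0"-style spreadsheet headers)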
+    dataframe.index = dataframe['Well']
+    dataframe.drop('Well', axis=1, inplace=True)
+    for col in dataframe.columns:
+        if any([x in col for x in ['Plate', 'Well', 'Cycle']]):
+            dataframe.drop(col, axis=1, inplace=True)
+    dataframe.columns = list(map(int, list(map(float, dataframe.columns))))
+    return dataframe
+
+def _remove_trials(org_df, ignore_trials, signal, name, significant_deviation):
+    # refine the ignore_trials parameter
+    if isinstance(ignore_trials, dict):
+        ignore_trials['columns'] = list(map(str, ignore_trials['columns'])) if 'columns' in ignore_trials else []
+        ignore_trials['rows'] = list(map(str, ignore_trials['rows'])) if 'rows' in ignore_trials else []
+        ignore_trials['wells'] = ignore_trials['wells'] if 'wells' in ignore_trials else []
+    elif isIterable(ignore_trials):
+        if ignore_trials[0][0].isalpha() and isnumber(ignore_trials[0][1:]):
+            short_code = True  # TODO - drop trials with respect to the short codes, and not the full codes
+
+    dataframe = org_df.copy()  # this prevents an irrelevant warning from pandas
+    dropped_trials = []
+    for trial in dataframe.index:
+        if isinstance(ignore_trials, dict) and any(
+                [trial[0] in ignore_trials['rows'], trial[1:] in ignore_trials['columns'], trial in ignore_trials['wells']]
+                ) or isIterable(ignore_trials) and trial in ignore_trials:
+            dataframe.drop(trial, axis=0, inplace=True)
+            dropped_trials.append(trial)
+    removed_trials = []
+    if 'OD' not in signal:
+        for trial, row in dataframe.iterrows():
+            row_array = np.array(row.to_list())
+            ## remove trials for which the biomass growth did not change by the determined minimum deviation
+            if row_array[-1] / row_array[0] < significant_deviation:
+                dataframe.drop(trial, axis=0, inplace=True)
+                removed_trials.append(trial)
+        if removed_trials:
+            print(f'The {removed_trials} trials were removed from the {name} measurements, '
+                  f'since their deviation over time was less than the threshold of {significant_deviation}.')
+    if dropped_trials:
+        print(f'The {dropped_trials} trials were dropped from the {name} measurements '
+              'per the ignore_trials parameter.')
+    return dataframe, dropped_trials+removed_trials

+def _check_plateau(org_df, signal, name, significant_deviation, timesteps_len):
+    significant_deviation = max([2, significant_deviation])
+    dataframe = org_df.copy()  # this prevents an irrelevant warning from pandas
+    dropped = []
+    for trial, row in dataframe.iterrows():
+        row_array = np.array(row.to_list())
+        values = []
+        tracking = False
+        ## track the values after the deviation threshold is surpassed, which detects a plateau
+        for index, val in enumerate(row_array):
+            if val / row_array[0] >= significant_deviation or tracking:
+                tracking = True
+                values.append(val)
+                if len(values) > timesteps_len:
+                    del values[0]
+                remaining_values = list(dataframe.columns[index-timesteps_len+1:])
+                if all([len(values) == timesteps_len, values[-1] <= values[0],
+                        remaining_values[0] <= remaining_values[-1]*1.1]):
+                    # the entire plateau, except the first plateaued point, is removed
+                    dropped = remaining_values
+                    break
+        if dropped:
+            break
+    if dropped:
+        content = f"{name} {signal}" if name != signal else signal
+        print(f"The {dropped} timesteps (with {row_array[index-len(values)+1:]} values) were removed "
+              f"from the {content} data since the OD plateaued and is no longer valid.")
+    return dropped
+
+def _remove_timesteps(org_df, 
ignore_timesteps, name, signal): + dataframe = org_df.copy() # this prevents an irrelevant warning from pandas + if ignore_timesteps: + dropped = [] + for col in dataframe: + if col in ignore_timesteps: + dataframe.drop(col, axis=1, inplace=True) + dropped.append(col) + if dropped == ignore_timesteps: + print(f"The ignore_timesteps columns were dropped for the {name} {signal} data.") + else: + raise ParameterError(f"The ignore_timesteps values {ignore_timesteps} " + f"were unsuccessfully dropped for the {name} {signal} data.") + return dataframe, ignore_timesteps + +def _df_construction(name, df_name, ignore_trials, ignore_timesteps, + significant_deviation, dataframe, row_num, buffer_col1=True): + # refine the DataFrames + time_df = _column_reduction(dataframe.iloc[0::2]) + values_df = _column_reduction(dataframe.iloc[1::2]) + # display(name, time_df, values_df) + + # remove specified data trials + if ignore_trials: + values_df, removed_trials = _remove_trials( + values_df, ignore_trials, df_name, name, significant_deviation) + for row in removed_trials: + time_df.drop(row, axis=0, inplace=True) + + # remove specified data timesteps + if ignore_timesteps: + values_df, removed_timesteps = _remove_timesteps( + values_df, ignore_timesteps, name, df_name) + for col in list(map(int, removed_timesteps)): + time_df.drop(col, axis=1, inplace=True) + + # remove undefined trials + if buffer_col1: + possible_rows = [chr(ord("A")+row) for row in range(1, row_num+1)] + for trial_code in values_df.index: + if trial_code[0] not in possible_rows: + values_df.drop(trial_code, axis=0, inplace=True) + time_df.drop(trial_code, axis=0, inplace=True) + + # process the data for subsequent operations and optimal efficiency + values_df.astype(str); time_df.astype(str) + return time_df, values_df + +def _find_culture(string): + matches = re.findall(r"([A-Z]{2}\+?[A-Z]*)", string) + return [m for m in matches if not any([x in m for x in ["BIOLOG", "III"]])] + +def reverse_strip_comp(ID): + return ID.replace("~", "-") + +def _process_csv(self, csv_path, index_col): + self.zipped_output.append(csv_path) + csv = read_csv(csv_path) ; csv.index = csv[index_col] + csv.drop(index_col, axis=1, inplace=True) + csv.astype(str) + return csv + +def add_rel_flux_cons(model, ex, phenoRXN, carbon_ratio, rel_flux=0.2): + # {ex.id}_uptakeLimit: {net_{carbonous_ex}} >= {net_{carbon_source}}*{rel_flux}*{carbon_ratio} + # The negative flux sign of influxes specifies that the carbon_source value must be lesser than the other + # carbon influx that is being constrained. 
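+    # With v_net(r) = v_forward(r) - v_reverse(r), the coefficients below encode
+    #     v_net(ex) - rel_flux*carbon_ratio*v_net(phenoRXN) >= 0
+    # so an alternative carbon influx (a negative v_net) can draw at most
+    # rel_flux*carbon_ratio times the phenotype's carbon-source uptake.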
+ cons = Constraint(Zero, lb=0, ub=None, name=f"{ex.id}_uptakeLimit") + model.add_cons_vars(cons) + cons.set_linear_coefficients({ + ex.forward_variable:1, ex.reverse_variable:-1, + phenoRXN.forward_variable:-rel_flux*carbon_ratio, phenoRXN.reverse_variable:rel_flux*carbon_ratio}) + return model, cons + + +class GrowthData: + + @staticmethod + def process(community_members: dict, base_media=None, solver: str = 'glpk', all_phenotypes=True, + data_paths: dict = None, species_abundances: str = None, carbon_conc_series: dict = None, + ignore_trials: Union[dict, list] = None, ignore_timesteps: list = None, species_identities_rows=None, + significant_deviation: float = 2, extract_zip_path: str = None, determine_requisite_biomass=False): #, msdb_path:str=None): + # define the number of rows in the experimental data + row_num = len(species_identities_rows) + if "rows" in carbon_conc_series and carbon_conc_series["rows"]: + row_num = len(list(carbon_conc_series["rows"].values())[0]) + # load and parse data and metadata + (media_conc, data_timestep_hr, simulation_time, dataframes, trials, fluxes_df + ) = GrowthData.load_data( + base_media, community_members, solver, data_paths, ignore_trials, all_phenotypes, + ignore_timesteps, significant_deviation, row_num, extract_zip_path) + experimental_metadata, standardized_carbon_conc, trial_name_conversion = GrowthData.metadata( + base_media, community_members, species_abundances, carbon_conc_series, + species_identities_rows, row_num, _findDate(data_paths["path"])) + data_df = GrowthData.data_process(dataframes, trial_name_conversion) + requisite_biomass = {} if not determine_requisite_biomass else GrowthData.biomass_growth( + carbon_conc_series, fluxes_df, data_df.index.unique(), trial_name_conversion, + data_paths, community_members if all_phenotypes else None) + return (experimental_metadata, data_df, fluxes_df, standardized_carbon_conc, requisite_biomass, + trial_name_conversion, np.mean(data_timestep_hr), simulation_time, media_conc) + + @staticmethod + def load_data(base_media, community_members, solver, data_paths, ignore_trials, all_phenotypes, + ignore_timesteps, significant_deviation, row_num, extract_zip_path, min_timesteps=False): + # define default values + significant_deviation = significant_deviation or 0 + data_paths = data_paths or {} + ignore_timesteps = ignore_timesteps or "0:0" + start, end = ignore_timesteps.split(':') + raw_data = _spreadsheet_extension_load(data_paths['path']) + for org_sheet, name in data_paths.items(): + if org_sheet == 'path': + continue + df = _spreadsheet_extension_parse(data_paths['path'], raw_data, org_sheet) + df.columns = df.iloc[6] + df.drop(df.index[:7], inplace=True) + ## acquire the default start and end indices of ignore_timesteps + start = int(start or df.columns[0]) + end = int(end or df.columns[-1]) + break + ignore_timesteps = list(range(start, end+1)) if start != end else None + if extract_zip_path: + with ZipFile(extract_zip_path, 'r') as zp: + zp.extractall() + + # define only species for which data is defined + fluxes_df, comm_members = phenotypes(community_members, all_phenotypes, solver=solver) + modeled_species = list(v for v in data_paths.values() if ("OD" not in v and " " not in v)) + removed_phenotypes = [col for col in fluxes_df if not any([species in col for species in modeled_species])] + fluxes_df.drop(removed_phenotypes, axis=1, inplace=True) + if removed_phenotypes: + print(f'The {removed_phenotypes} phenotypes were removed ' + f'since their species is not among those with data: 
{modeled_species}.')
+
+        # determine the time range in which all datasets are significant
+        data_timestep_hr = []
+        dataframes = {}
+        max_timestep_cols = []
+        if min_timesteps:
+            for org_sheet, name in data_paths.items():
+                if org_sheet == 'path' or "OD" in name: continue
+                ## define the DataFrame
+                sheet = org_sheet.replace(' ', '_')
+                df_name = f"{name}:{sheet}"
+                dataframes[df_name] = _spreadsheet_extension_parse(data_paths['path'], raw_data, org_sheet)
+                dataframes[df_name].columns = dataframes[df_name].iloc[6]
+                dataframes[df_name].drop(dataframes[df_name].index[:7], inplace=True)
+                ## parse the timesteps from the DataFrame
+                drop_timestep_range = GrowthData._min_significant_timesteps(
+                    dataframes[df_name], ignore_timesteps, significant_deviation, ignore_trials, df_name, name)
+                max_timestep_cols.append(drop_timestep_range)
+            ## the timesteps that must be dropped for the most restrictive dataset are acquired
+            max_cols = max(list(map(len, max_timestep_cols)))
+            for ignore_timesteps in max_timestep_cols:
+                if len(ignore_timesteps) == max_cols: break
+
+        # remove trials for which the OD has plateaued
+        # TODO - this somehow seems to break when the requisite_biomass is ignored
+        plateaued_times = []  # remains empty when an OD signal is not defined
+        for org_sheet, name in data_paths.items():
+            if "OD" not in name: continue
+            ## load the OD DataFrame
+            sheet = org_sheet.replace(' ', '_')
+            df_name = f"{name}:{sheet}"
+            dataframes[df_name] = _spreadsheet_extension_parse(data_paths['path'], raw_data, org_sheet)
+            dataframes[df_name].columns = dataframes[df_name].iloc[6]
+            dataframes[df_name].drop(dataframes[df_name].index[:7], inplace=True)
+            ## process the OD DataFrame
+            data_times_df, data_values_df = _df_construction(
+                name, df_name, ignore_trials, ignore_timesteps,
+                significant_deviation, dataframes[df_name], row_num)
+            plateaued_times = _check_plateau(data_values_df, name, name, significant_deviation, 3)
+            ## define and store the final DataFrames
+            for col in plateaued_times:
+                if col in data_times_df.columns: data_times_df.drop(col, axis=1, inplace=True)
+                if col in data_values_df.columns: data_values_df.drop(col, axis=1, inplace=True)
+            dataframes[df_name] = (data_times_df, data_values_df)
+            break
+
+        # refine the non-OD signals
+        for org_sheet, name in data_paths.items():
+            if org_sheet == 'path' or "OD" in name: continue
+            sheet = org_sheet.replace(' ', '_')
+            df_name = f"{name}:{sheet}"
+            if df_name not in dataframes:
+                dataframes[df_name] = _spreadsheet_extension_parse(
+                    data_paths['path'], raw_data, org_sheet)
+                dataframes[df_name].columns = dataframes[df_name].iloc[6]
+                dataframes[df_name].drop(dataframes[df_name].index[:7], inplace=True)
+            # parse the DataFrame for values
+            simulation_time = dataframes[df_name].iloc[0, -1] / hour
+            data_timestep_hr.append(simulation_time / int(dataframes[df_name].columns[-1]))
+            # define the times and data
+            data_times_df, data_values_df = _df_construction(
+                name, df_name, ignore_trials, ignore_timesteps, significant_deviation,
+                dataframes[df_name], row_num)
+            # display(data_times_df) ; display(data_values_df)
+            for col in plateaued_times:
+                if col in data_times_df.columns: data_times_df.drop(col, axis=1, inplace=True)
+                if col in data_values_df.columns: data_values_df.drop(col, axis=1, inplace=True)
+            dataframes[df_name] = (data_times_df, data_values_df)
+
+        # differentiate the phenotypes for each species
+        trials = set(chain.from_iterable([list(times.index) for times, values in dataframes.values()]))
+        media_conc = {} if not base_media else {cpd.id: cpd.concentration for cpd in base_media.mediacompounds}
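+        # `trials` collects every well code that survived refinement, and `media_conc`
+        # records the initial concentration of each base-media compound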
return (media_conc, data_timestep_hr, simulation_time, dataframes, trials, fluxes_df) + + @staticmethod + def _min_significant_timesteps(full_df, ignore_timesteps, significant_deviation, ignore_trials, df_name, name): + # refine the DataFrames + values_df = _column_reduction(full_df.iloc[1::2]) + values_df, removed_trials = _remove_trials(values_df, ignore_trials, df_name, name, significant_deviation) + timestep_range = list(set(list(values_df.columns)) - set(ignore_timesteps)) + start, end = ignore_timesteps[0], ignore_timesteps[-1] + start_index = list(values_df.columns).index(start) + end_index = list(values_df.columns).index(end) + ## adjust the customized range such that the threshold is reached. + for trial, row in values_df.iterrows(): + row_array = np.delete(np.array(row.to_list()), list(range(start_index, end_index + 1))) + ## remove trials for which the biomass growth did not change by the determined minimum deviation + while all([row_array[-1] / row_array[0] < significant_deviation, + end <= values_df.columns[-1], start >= values_df.columns[0]]): + # print(timestep_range[0], values_df.columns[0], values_df.columns[-1], end, start) + if timestep_range[0] == values_df.columns[0] and start != values_df.columns[-1]: + timestep_range.append(timestep_range[-1] + 1) + start += 1 + print(f"The end boundary for {name} is increased to {timestep_range[-1]}", end="\r") + elif timestep_range[-1] == values_df.columns[-1] and end != values_df.columns[0]: + timestep_range.append(timestep_range[0] - 1) + end -= 1 + print(f"The start boundary for {name} is decreased to {timestep_range[0]}", end="\r") + else: + raise ParameterError(f"All of the timesteps were omitted for {name}.") + row_array = np.delete(np.array(row.to_list()), list(range( + list(values_df.columns).index(start), list(values_df.columns).index(end) + 1))) + print("\n") + return list(range(start, end+1)) + + @staticmethod + def metadata(base_media, community_members, species_abundances, + carbon_conc, species_identities_rows, row_num, date): + # define carbon concentrations for each trial + carbon_conc = carbon_conc or {} + carbon_conc['columns'] = default_dict_values(carbon_conc, "columns", {}) + carbon_conc['rows'] = default_dict_values(carbon_conc, "rows", {}) + column_num = len(species_abundances) + + # define the metadata DataFrame and a few columns + constructed_experiments = DataFrame(index = [f"G{x+1}" for x in list(range(column_num*row_num))]) + constructed_experiments.index.name = "short_code" + base_media_path = "minimal components media" if not base_media else base_media.path[0] + constructed_experiments["base_media"] = [base_media_path] * (column_num*row_num) + + # define community content + # species_mets = {mem["name"]: np.array([mets["consumed"] for mets in mem["phenotypes"].values()]).flatten() + # for mem in community_members.values()} + # define the strains column + strains, additional_compounds, experiment_ids = [], [], [] + trial_name_conversion = {} + count = 1 + ## apply universal values to all trials + base_row_conc = [] if '*' not in carbon_conc else [ + ':'.join([met, str(carbon_conc['*'][met][0]), str(carbon_conc['*'][met][1])]) for met in carbon_conc['*']] + members = list(mem["name"] for mem in community_members.values()) + for row in range(1, row_num+1): + row_conc = base_row_conc[:] + trial_letter = chr(ord("A") + row) + trial_name_conversion[trial_letter] = {} + ## add rows where the initial concentration in the first trial is non-zero + for met, conc_dict in carbon_conc["rows"].items(): + if 
conc_dict[sorted(list(conc_dict.keys()))[row-1]] > 0: + row_conc.append(':'.join([ + met, str(conc_dict[sorted(list(conc_dict.keys()))[row-1]]), + str(conc_dict[sorted(list(conc_dict.keys()), reverse=True)[-row]])])) + + row_concentration = ';'.join(row_conc) + composition = {} + for col in range(1, column_num+1): + ## construct the columns of information + additional_compounds.append(row_concentration) + experiment_id = [] + for member in members: + ### define the relative community abundances + composition[member] = [member, f"r{species_abundances[col][member]}"] + ### define the member strain, where it is appropriate + if member in species_identities_rows[row]: + composition[member][0] += f"_{species_identities_rows[row][member]}" + ### the experimental ID is abundance+memberID + if int(composition[member][1][1:]) != 0: + experiment_id.append(f"{composition[member][1]}_{composition[member][0]}") + composition[member] = ':'.join(composition[member]) + strains.append(';'.join(composition[member] for member in members)) + # for row2 in row_conc: + # metID, init, end = row2.split(':') + # ### get the met_name for the corresponding match in values + # met_name = None + # for index, mets in enumerate(species_mets.values()): + # if metID in mets: + # met_name = list(species_mets.keys())[index] + # break + # if "met_name" not in locals() or not met_name: + # logger.critical(f"The specified phenotypes {species_mets} for the {members} members" + # f" does not include the consumption of the available sources" + # f" {row_conc}; hence, the model cannot grow.") + # content = "" + # else: + # content = f"{init}_{met_name}" + # experiment_id.append(content) + experiment_id.extend([":".join(row.split(":")[:2]) for row in row_conc]) + experiment_id = '-'.join(experiment_id) + experiment_ids.append(experiment_id) + trial_name_conversion[trial_letter][str(col+1)] = ("G"+str(count), experiment_id) + count += 1 + + # convert the variable concentrations to short codes + standardized_carbon_conc = {} + for met, conc in carbon_conc["rows"].items(): + standardized_carbon_conc[met] = {} + for row, val in conc.items(): + standardized_carbon_conc[met].update({short_code:val for ( + short_code, expID) in trial_name_conversion[row].values()}) + for met, conc in carbon_conc["columns"].items(): + standardized_carbon_conc[met] = default_dict_values(standardized_carbon_conc, met, {}) + for col, val in conc.items(): + for row in trial_name_conversion: + standardized_carbon_conc[met][trial_name_conversion[row][str(col)][0]] = val + + # add columns to the exported dataframe + constructed_experiments.insert(0, "trial_IDs", experiment_ids) + constructed_experiments["additional_compounds"] = additional_compounds + constructed_experiments["strains"] = strains + constructed_experiments["date"] = [date] * (column_num*row_num) + constructed_experiments.to_csv("growth_metadata.tsv", sep="\t") + return constructed_experiments, standardized_carbon_conc, trial_name_conversion + + @staticmethod + def biomass_growth(carbon_conc, fluxes_df, data_df_trials, trial_name_conversion, + data_paths, community_members=None, pheno_info=None): + # TODO - leverage cFBA to partition metabolite consumption between the defined phenotypes + pheno_info = pheno_info or {f"{content['name']}_{pheno}": mets + for model, content in community_members.items() + for pheno, mets in content["phenotypes"].items()} + # invert the trial_name_conversion and data_paths keys and values + short_code_trials = {contents[0]: row+col for row in trial_name_conversion + for 
col, contents in trial_name_conversion[row].items()} + # short_code_trials = {contents[0]:contents[1] for contents in trial_name_conversion[row].values()} + name_signal = {name: signal for signal, name in data_paths.items()} + + # calculate the 90% concentration for each carbon source + requisite_fluxes = {} + for trial in [short_code_trials[ID] for ID in data_df_trials]: + row_letter = trial[0] ; col_number = trial[1:] + ## add rows where the initial concentration in the first trial is non-zero + utilized_phenos = {} + food_gradient = carbon_conc.copy() + for dimension, content in food_gradient.items(): + for met, conc_dict in content.items(): + source_conc = conc_dict[row_letter if dimension == "rows" else int(col_number)] + # print(met, source_conc) + if source_conc == 0 or f"EX_{met}_e0" not in fluxes_df.index: continue + for pheno, val in fluxes_df.loc[f"EX_{met}_e0"].items(): + # print(pheno, val) + if val < 0: utilized_phenos[pheno] = source_conc*0.9 / val + total_consumed = sum(list(utilized_phenos.values())) + # print(utilized_phenos) + + display(fluxes_df) + short_code = trial_name_conversion[row_letter][col_number][0] + requisite_fluxes[short_code] = {} + excreta = {} + for pheno, flux_conversion in utilized_phenos.items(): + species, phenotype = pheno.split("_", 1) + fluxes = fluxes_df.loc[:, pheno]*abs(flux_conversion) * abs(flux_conversion/total_consumed) + requisite_fluxes[short_code][f"{species}|{name_signal[species]}"] = fluxes[fluxes != 0] + pheno = reverse_strip_comp(pheno) + if "excreted" in pheno_info[pheno]: + # print(pheno_info[pheno]["excreted"]) + excreta.update({met:fluxes.loc[met] for met in pheno_info[pheno]["excreted"]}) + ## determine the fluxes for the other members of the community through cross-feeding + participated_species = [] + for pheno, mets in pheno_info.items(): + species, phenotype = pheno.split("_", 1) + if any([species in ph for ph in utilized_phenos]) or species in participated_species: continue + for met in mets["consumed"]: + exMet = f"EX_{met}_e0" + if exMet not in excreta: continue + fluxes = abs(excreta[exMet] * 0.99 / fluxes_df.loc[exMet, pheno]) * fluxes_df.loc[:, pheno] + requisite_fluxes[short_code][f"{species}|{name_signal[species]}"] = fluxes[fluxes != 0] + participated_species.append(species) + # print(requisite_fluxes) + return requisite_fluxes + + @staticmethod + def data_process(dataframes, trial_name_conversion): + short_codes, trials_list = [], [] + values, times = {}, {} # The times must capture upstream + first = True + for df_name, (times_df, values_df) in dataframes.items(): + # print(df_name) + # display(times_df) ; display(values_df) + times_tup = FBAHelper.parse_df(times_df) + average_times = np.mean(times_tup.values, axis=0) + values[df_name], times[df_name] = [], [] + for trial_code in values_df.index: + row_let, col_num = trial_code[0], trial_code[1:] + # print(trial_code, row_let, col_num) + for trial_row_values in trial_contents(trial_code, values_df.index, values_df.values): + if first: + short_code, experimentalID = trial_name_conversion[row_let][col_num] + trials_list.extend([experimentalID] * len(values_df.columns)) + short_codes.extend([short_code] * len(values_df.columns)) + values[df_name].extend(trial_row_values) + times[df_name].extend(average_times) + first = False + # process the data to the smallest dataset, to accommodate heterogeneous data sizes + minVal = min(list(map(len, values.values()))) + for df_name, data in values.items(): + values[df_name] = data[:minVal] + times2 = times.copy() + for df_name, 
data in times2.items(): + times[df_name] = data[:minVal] + # construct the growth DataFrame + df_data = {"trial_IDs": trials_list[:minVal], "short_codes": short_codes[:minVal]} + df_data.update({"Time (s)": np.mean(list(times.values()), axis=0)}) # element-wise average + df_data.update({df_name:vals for df_name, vals in values.items()}) + data_df = DataFrame(df_data) + data_df.index = data_df["short_codes"] + data_df = data_df.drop(["short_codes"], axis=1) + data_df.to_csv("growth_spectra.tsv", sep="\t") + return data_df + + +class BiologData: + + @staticmethod + def process(data_paths, trial_conditions_path, community_members, col_row_num, member_conversions, + culture=None, date=None, significant_deviation=None, solver="glpk", msdb_path:str=None): + row_num = 8 ; column_num = 12 + (zipped_output, data_timestep_hr, simulation_time, dataframes, trials, culture, date, fluxes_df + ) = BiologData.load_data(data_paths, significant_deviation, community_members, + col_row_num, row_num, culture, date, solver) + experimental_metadata, standardized_carbon_conc, trial_name_conversion = BiologData.metadata( + trial_conditions_path, row_num, column_num, culture, date) + biolog_df = BiologData.data_process(dataframes, trial_name_conversion) + requisite_biomass = BiologData.biomass_growth(biolog_df, member_conversions) + return (experimental_metadata, biolog_df, fluxes_df, standardized_carbon_conc, requisite_biomass, + trial_name_conversion, np.mean(data_timestep_hr), simulation_time) + + @staticmethod + def load_data(data_paths, significant_deviation, community_members, col_row_num, + row_num, culture, date, solver): + zipped_output = [data_paths['path'], "fluxes.tsv"] + # determine the metabolic fluxes for each member and phenotype + # import and parse the raw CSV data + # TODO - this may be capable of emulating leveraged functions from the GrowthData object + fluxes_df = phenotypes(community_members, solver=solver) + # fluxes_df = None + data_timestep_hr = [] + dataframes = {} + raw_data = _spreadsheet_extension_load(data_paths['path']) + significant_deviation = significant_deviation or 2 + # culture = culture or _find_culture(data_paths['path']) + culture = culture or ",".join([x for x in data_paths.values() if (x not in ["OD"] and not re.search(r"\w\.\w", x))]) + date = date or _findDate(data_paths['path']) + for org_sheet, name in data_paths.items(): + if org_sheet == 'path': + continue + sheet = org_sheet.replace(" ", "_") + df_name = f"{name}:{sheet}" + if df_name not in dataframes: + dataframes[df_name] = _spreadsheet_extension_parse( + data_paths['path'], raw_data, org_sheet) + dataframes[df_name].columns = dataframes[df_name].iloc[col_row_num] + dataframes[df_name].drop(dataframes[df_name].index[:col_row_num+1], inplace=True) + dataframes[df_name].dropna(inplace=True) + # parse the DataFrame for values + dataframes[df_name].columns = [str(x).strip() for x in dataframes[df_name].columns] + simulation_time = dataframes[df_name].iloc[0, -1] / hour + # display(dataframes[df_name]) + data_timestep_hr.append(simulation_time / int(float(dataframes[df_name].columns[-1]))) + # define the times and data + data_times_df, data_values_df = _df_construction( + name, df_name, None, None, significant_deviation, + dataframes[df_name], row_num, False) + # display(data_times_df) ; display(data_values_df) + dataframes[df_name] = (data_times_df, data_values_df) + + # differentiate the phenotypes for each species + trials = set(chain.from_iterable([list(df.index) for df, times in dataframes.values()])) + return 
(zipped_output, data_timestep_hr, simulation_time, dataframes, trials, culture, date, fluxes_df) + + @staticmethod + def metadata(trial_conditions_path, row_num, column_num, culture, date): + # define the conditions for each trial + with open(trial_conditions_path) as trials: + trial_conditions = json.load(trials) + + # define the metadata DataFrame and a few columns + constructed_experiments = DataFrame() + ex_prefix = "B" + constructed_experiments.index = [f"{ex_prefix}{x+1}" for x in list(range(row_num*column_num))] + constructed_experiments.index.name = "short_code" + + # define the strains column + experiment_ids, trial_names = [], [] + trial_name_conversion, trial_mets = {}, {} + count = 1 + ## apply universal values to all trials + for row in range(row_num): + trial_letter = chr(ord("A") + row) + trial_name_conversion[trial_letter] = {} + ## add rows where the initial concentration in the first trial is non-zero + for col in range(1, column_num+1): + ## construct the columns of information + dataID = trial_letter+str(col) + MSID = trial_conditions[dataID]["ModelSEED_ID"] + short_code = ex_prefix+str(count) + + experiment_ids.append(MSID) + trial_names.append(trial_conditions[dataID]["name"]) + trial_name_conversion[trial_letter][str(col)] = (short_code, MSID) + trial_mets[MSID] = {short_code:trial_conditions[dataID]["mM"]} + count += 1 + + # add columns to the exported dataframe + constructed_experiments.insert(0, "ModelSEED_ID", experiment_ids) + constructed_experiments.insert(0, "condition", trial_names) + constructed_experiments["strain"] = [culture] * (column_num*row_num) + constructed_experiments["date"] = [date] * (column_num*row_num) + constructed_experiments.to_csv("growth_metadata.tsv", sep="\t") + return constructed_experiments, trial_mets, trial_name_conversion + + @staticmethod + def data_process(dataframes, trial_name_conversion): + short_codes, trials_list = [], [] + values, times = {}, {} # The times must capture upstream + first = True + for df_name, (times_df, values_df) in dataframes.items(): + # display(df_name, times_df, values_df) + times_tup = FBAHelper.parse_df(times_df) + # display(DataFrame(times_tup.values)) + average_times = list(np.mean(times_tup.values, axis=0)) + # print(average_times) + # print(len(average_times)) + values[df_name], times[df_name] = [], [] + for exprID in values_df.index: + row_let, col_num = exprID[0], exprID[1:] + for trial_row_values in trial_contents(exprID, values_df.index, values_df.values): + if first: + short_code, experimentalID = trial_name_conversion[row_let][col_num] + trials_list.extend([experimentalID] * len(values_df.columns)) + short_codes.extend([short_code] * len(values_df.columns)) + if len(trial_row_values) != len(average_times): + print(f"The length of the trial data {len(trial_row_values)} " + f"exceeds that of the timesteps {len(average_times)} " + f"which creates an incompatible DataFrame.") + values[df_name].extend(trial_row_values) + times[df_name].extend(average_times) + first = False + # process the data to the smallest dataset, to accommodate heterogeneous data sizes + minVal = min(list(map(len, values.values()))) + for df_name, data in values.items(): + values[df_name] = data[:minVal] + times2 = times.copy() + for df_name, data in times2.items(): + times[df_name] = data[:minVal] + df_data = {"trial_IDs": trials_list, "short_codes": short_codes} + df_data.update({"Time (s)": list(np.mean(list(times.values()), axis=0))}) # element-wise average + df_data.update({df_name:vals for df_name, vals in 
values.items()}) + biolog_df = DataFrame(df_data) + biolog_df.index = biolog_df["short_codes"] + del biolog_df["short_codes"] + biolog_df.to_csv("growth_spectra.tsv", sep="\t") + + return biolog_df + + @staticmethod + def biomass_growth(biolog_df, member_conversions): + requisite_biomass = {} + for short_code in biolog_df.index.unique(): + requisite_biomass[short_code] = {} + for signal, conversion in member_conversions.items(): + short_code_df = biolog_df[biolog_df.index == short_code] + requisite_biomass[short_code][signal] = conversion * short_code_df[ + signal.replace("|", ":").replace(" ", "_")].iloc[-1] + return requisite_biomass diff --git a/modelseedpy/community/dfbapkg.py b/modelseedpy/community/dfbapkg.py deleted file mode 100644 index 8dcc5819..00000000 --- a/modelseedpy/community/dfbapkg.py +++ /dev/null @@ -1,505 +0,0 @@ -# -*- coding: utf-8 -*- - -from scipy.constants import milli, hour, minute, day, femto -from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg -# from modelseedpy.core.fbahelper import FBAHelper -from collections import OrderedDict -from optlang.symbolics import Zero -from numpy import log10, nan, mean -from warnings import warn -from matplotlib import pyplot -from pprint import pprint -from datetime import date -from math import inf -import pandas -# import cython -import json, re, os - - -def isnumber(string): - try: - float(string) - remainder = re.sub('([0-9.\-eE])', '', str(string)) - except: - try: - int(string) - remainder = re.sub('[0-9.-eE])', '', str(string)) - except: - return False - if remainder == '': - return True - -class dFBAPkg(BaseFBAPkg): - def __init__(self, - model, # Cobrakbase model - modelseed_db_path: str, # local path to the ModelSEED Database - solver: str = 'glpk', # specifies the LP solver - warnings: bool = True, verbose: bool = False, printing: bool = False, jupyter: bool = False - ): - # define the parameter and variable dictionaries - BaseFBAPkg.__init__(self, model, "BasedFBA", {"met": "metabolite"}, {"conc": 'metabolite'}) - self.pkgmgr.addpkgs(["FullThermoPkg"]) - # self.parameters["modelseed_api"] = FBAHelper.get_modelseed_db_api(self.parameters["modelseed_db_path"]) - - # define simulation conditions - self.warnings: bool = warnings; self.verbose: bool = verbose; self.printing: bool = printing; self.jupyter: bool = jupyter - self.model = model - self.model.solver = solver - - # define a list of metabolite ids - self.met_ids, self.met_names = OrderedDict(), [] - for met in self.model.metabolites: - self.met_ids[met.id] = met.name - self.met_names.append(met.name) - - # user functions - def print_lp(self, filename=None): - BaseFBAPkg.print_lp(self, filename.removesuffix('.lp')) - - def simulate(self, - kinetics_path: str = None, # the path of the kinetics data JSON file - initial_concentrations_M: dict = {}, # a dictionary of the initial metabolic concentrations , which supplants concentrations from the defined kinetics data - total_time: float = 200, timestep: float = 20, # total simulation time and the simulation timestep in minutes - export_name: str = None, export_directory: str = None, # the location to which simulation content will be exported - chemostat_L: float = None, feed_profile: dict = {}, # the volume (l) and feed profile for a chemostat simulation, where None ignores a chemostat - exchange_rate: float = None, # the flow rate (Molar/Liter) of addition to and removal from the chemostat system - thermo_constraints: bool = False, # specifies whether thermodynamic constraints will be layered with the kinetic 
constraints - kinetics_data: dict = {}, # A dictionary of custom kinetics data - temperature: float = 25, p_h: float = 7, # simulation conditions - cellular_dry_mass_fg: float = 222, # mass of the simulated cell in femtograms - cellular_fL: float = 1, # volume of the simulated cell in femtoliters - figure_title: str = 'Metabolic perturbation', # title of the concentrations figure - included_metabolites: list = [], # A list of the metabolites that will be graphically displayed - labeled_plots: bool = True, # specifies whether plots will be individually labeled - visualize: bool = True, export: bool = True # specifies whether simulation content will be visualized or exported, respectively - ): - # define the dataframe for the time series content - self.cellular_dry_mass_fg: float = cellular_dry_mass_fg*femto; self.cellular_fL: float = cellular_fL*femto - self.parameters['timesteps'] = int(total_time/timestep) - self.timestep_value: float = timestep; self.total_time: float = total_time - self.constrained = OrderedDict() - self.solutions = [] - self.minimum = inf - - # define experimental conditions - self.parameters['pH'], self.parameters['temperature'] = p_h, temperature - self.variables['elapsed_time'] = 0 - - # define initial concentrations - self.initial_concentrations_M = initial_concentrations_M - self._initial_concentrations(kinetics_path, kinetics_data) - - # apply constraints and system specifications - chemostat_requirements = [isnumber(chemostat_L), feed_profile != {}, isnumber(exchange_rate)] - if any(chemostat_requirements) and not all(chemostat_requirements): - warn(f'The chemostat_L ({chemostat_L}), feed_profile ({feed_profile}), and exchange_rate ({exchange_rate}) parameters must all be defined to simulate a chemostat.') - if thermo_constraints: - self.pkgmgr.addpkgs(["FullThermoPkg"]) - self.pkgmgr.getpkg("FullThermoPkg").build_package() - - # determine the reactions for which kinetics are predefined - self.defined_reactions = {} - for rxn in self.model.reactions: - if rxn.name in kinetics_data: - self.defined_reactions[rxn.name] = rxn - - # execute FBA for each timestep - for self.timestep in range(1,self.parameters['timesteps']+1): - # calculate custom fluxes, constrain the model, and update concentrations - self._define_timestep() - self._build_constraints() - self._calculate_kinetics() - self._execute_cobra() - self._update_concentrations() - if all(chemostat_requirements): - self.chemical_moles[self.col] = self.concentrations[self.col]*milli * chemostat_L - self._chemostat(feed_profile, exchange_rate, chemostat_L) - - self.variables['elapsed_time'] += self.timestep - if self.printing: - print(f'\nobjective value for timestep {self.timestep}: ', self.solutions[-1].objective_value) - - # identify the chemicals that dynamically changed in concentrations - self.changed, self.unchanged = set(), set() - for met_name in self.met_names: - first = self.concentrations.at[met_name, '0 min'] - final = self.concentrations.at[met_name, self.col] - if first != final: - self.changed.add(met_name) - if first == final: - self.unchanged.add(met_name) - - # visualize concentration changes over time - if visualize: - self._visualize(figure_title,included_metabolites,labeled_plots) - if export: - self._export(export_name, export_directory) - - # view calculations and results - if self.verbose: - print('\n\n', 'Changed metabolite concentrations\n', '='*2*len('changed metabolites'), '\n', self.changed) - print('\nConstrained reactions:', self.constrained.keys()) - elif self.printing: - if 
self.jupyter: - pandas.set_option('max_rows', None) - display(self.concentrations) - display(self.fluxes) - if self.unchanged == set(): - print('\nAll of the metabolites changed concentration over the simulation') - else: - print('\n\nUnchanged metabolite concentrations', '\n', '='*2*len('unchanged metabolites'), '\n', self.unchanged) - - return self.concentrations, self.fluxes - - #utility functions - def _initial_concentrations(self, - kinetics_path: str = None, # the absolute path to a JSON file of kinetics data - kinetics_data: dict = {}, # a dictionary of kinetics data, which supplants imported data from the kinetics_path - ): - # define kinetics of the system - self.kinetics_data = {} - if kinetics_path: - if not os.path.exists(kinetics_path): - raise ValueError('The path {kinetics_data} is not a valid path') - with open(kinetics_path) as data: - self.kinetics_data = json.load(data) - if kinetics_data != {}: - for reaction in kinetics_data: - self.kinetics_data[reaction] = kinetics_data[reaction] - if self.kinetics_data == {}: - raise ValueError('Kinetics data must be defined.') - - # define the DataFrames - self.col = '0 min' - self.concentrations = pandas.DataFrame(index=set(self.met_names), columns=[self.col]) - self.chemical_moles = pandas.DataFrame(index=set(self.met_names), columns=[self.col]) - self.concentrations.index.name = self.chemical_moles.index.name = 'metabolite (\u0394mM)' - - self.fluxes = pandas.DataFrame(index = set(rxn.name for rxn in self.model.reactions), columns = [self.col]) - self.fluxes.index.name = 'reactions (mmol/g_(dw)/hr)' - - # parse the kinetics data - initial_concentrations = {} - for met in self.met_names: - self.concentrations.at[str(met), self.col] = float(0) - for reaction_name in self.kinetics_data: - for condition, datum in self.kinetics_data[reaction_name].items(): - for var in datum['initial_concentrations_M']: - met_id = datum['met_id'][var] - if met_id in self.met_ids: - name = self.met_ids[met_id] - if name in self.met_names: - self.concentrations.at[name, self.col] += datum['initial_concentrations_M'][var]/milli - initial_concentrations[met_id] = self.concentrations.at[name, self.col] - else: - if self.warnings: - warn(f"KineticsError: The {name} reagent ({var}) in the {datum['substituted_rate_law']} rate law is not defined by the model.") - else: - if self.warnings: - warn(f"KineticsError: The {name} reagent ({var}) in the {datum['substituted_rate_law']} rate law is not recognized by the ModelSEED Database.") - - # incorporate custom initial concentrations - if isinstance(self.initial_concentrations_M, dict) and self.initial_concentrations_M != {}: - for met_id in self.initial_concentrations_M: - met_name = self.met_ids[met_id] - if met_name not in self.concentrations.index: - if self.warnings: - warn(f'InitialConcError: The {met_id} ({met_name}) metabolite is not defined by the model.') - else: - self.concentrations.at[met_name, self.col] = self.initial_concentrations_M[met_id]*milli - initial_concentrations[met_id] = self.concentrations.at[name, self.col] - self.initial_concentrations_M = initial_concentrations - - - def _define_timestep(self,): - self.col = f'{self.timestep*self.timestep_value} min' - self.previous_col = f'{(self.timestep-1)*self.timestep_value} min' - self.concentrations[self.col] = [float(0) for ind in self.concentrations.index] #!!! 
- self.fluxes[self.col] = [nan for ind in self.fluxes.index] - - - def _calculate_kinetics(self): - for reaction_name in self.kinetics_data: - fluxes = [] - for source in self.kinetics_data[reaction_name]: - incalculable = False - datum = self.kinetics_data[reaction_name][source] - if "substituted_rate_law" in datum: #!!! Statistics of aggregating each condition should be provided for provenance. - remainder = re.sub('([0-9A-Za-z/()e\-\+\.\*])', '', datum["substituted_rate_law"]) - if remainder == '': - # define each variable concentration - conc_dict = {} - for var in datum['met_id']: - met_id = datum['met_id'][var] - if len(var) == 1: - conc_dict[var] = self.concentrations.at[self.met_ids[met_id], self.previous_col]*milli # concentrations are mM - # warn(f'MetaboliteError: The {self.met_ids[met_id]} chemical is not recognized by the ModelSEED Database.') - - if conc_dict != {}: - locals().update(conc_dict) - flux = eval(datum["substituted_rate_law"]) - - # average or overwrite flux calculations based upon the alignment of the data conditions with the simulation conditions - add_or_write = 'a' - if 'metadata' in self.kinetics_data[reaction_name][source]: - add_or_write = self.__find_data_match(reaction_name, source) - if add_or_write == 'a': - fluxes.append(flux) - elif add_or_write == 'w': - fluxes = [flux] - else: - if self.warnings: - warn(f'MetaboliteError: The {reaction_name} reaction possesses unpredictable chemicals.') - else: - if self.warnings: - warn('RateLawError: The {datum["substituted_rate_law"]} rate law contains unknown characters: {remainder}') - else: - if self.warnings: - warn(f'RateLawError: The {datum} datum lacks a rate law.') - - flux = mean(fluxes) - if isnumber(flux): - if reaction_name in self.defined_reactions: - self.__set_constraints(reaction_name, flux) - self.fluxes.at[reaction_name, self.col] = flux - if self.printing: - print('\n') - else: - if self.warnings: - warn(f'ReactionError: The {reaction_name} reaction, with a flux of {flux}, is not described by the model.') - else: - if self.warnings: - warn(f'FluxError: The {reaction_name} reaction flux {datum["substituted_rate_law"]} value {flux} is not numberic.') - - def _execute_cobra(self): - # execute the COBRA model - solution = self.model.optimize() - self.solutions.append(solution) - for rxn in self.model.reactions: - if not isnumber(self.fluxes.at[rxn.name, self.col]): - self.fluxes.at[rxn.name, self.col] = solution.fluxes[rxn.id] - - def _update_concentrations(self,): - for met in self.model.metabolites: - self.concentrations.at[self.met_ids[met.id], self.col] = 0 - for rxn in met.reactions: # flux units: mmol/(g_(dry weight)*hour) - stoich = rxn.metabolites[met] - flux = self.fluxes.at[rxn.name, self.col] - delta_conc = stoich * (flux * self.timestep_value*(minute/hour) * self.cellular_dry_mass_fg/self.cellular_fL) - self.concentrations.at[self.met_ids[met.id], self.col] += delta_conc - - def _visualize(self, - figure_title, # defines the title of the concentrations figure - included_metabolites, # specifies which metabolites will be included in the figure - labeled_plots # specifies which plots will be labeled in the figure - ): - # define the figure - pyplot.rcParams['figure.figsize'] = (11, 7) - pyplot.rcParams['figure.dpi'] = 150 - - self.figure, ax = pyplot.subplots() - ax.set_title(figure_title) - ax.set_ylabel('Concentrations (mM)') - - x_axis_scalar, unit = self.__x_axis_determination() - ax.set_xlabel('Time '+unit) - legend_list, times = [], [t*self.timestep_value*x_axis_scalar for t in 
range(self.parameters['timesteps']+1)] - - # determine the plotted metabolites and the scale of the figure axis - bbox = (1,1) - if included_metabolites == []: - bbox = (1.7,1) - for chem in self.changed: - if max(self.concentrations.loc[[chem]].values[0].tolist()) > 1e-2: # an arbitrary concentration threshold for plotting on the figure - included_metabolites.append(chem) - - log_axis = False - minimum, maximum = inf, -inf - printed_concentrations = {} - for chem in self.changed: - if chem in included_metabolites: - concentrations = self.concentrations.loc[[chem]].values[0].tolist() # molar - - # determine the concentration range - max_conc = max([x if x > 1e-9 else 0 for x in concentrations]) - maximum = max(maximum, max_conc) - min_conc = min([x if x > 1e-9 else 0 for x in concentrations]) - minimum = min(minimum, min_conc) - - # plot chemicals with perturbed concentrations - ax.plot(times, concentrations) - if len(chem) > 25: - chem = list(self.met_ids.keys())[self.met_names.index(chem)] - if not concentrations[0] < 1e-9: - legend_list.append(chem) - else: - legend_list.append(f'(rel) {chem}') - - # design the proper location of the overlaid labels in the figure - if labeled_plots: - for i, conc in enumerate(concentrations): - if conc > 1e-9: - x_value = i*self.timestep_value - vertical_adjustment = 0 - if x_value in printed_concentrations: - vertical_adjustment = (maximum - minimum)*.05 - if log_axis: - vertical_adjustment = log10(maximum - minimum)/3 - ax.text(x_value, conc+vertical_adjustment, f"{chem} - {round(conc, 4)}", ha="left") - printed_concentrations[x_value] = conc - break - - # finalize figure details - if maximum > 10*minimum: - log_axis = True - ax.set_yscale('log') - ax.set_xticks(times) - ax.grid(True) - ax.legend(legend_list, title = 'Changed chemicals', loc='upper right', bbox_to_anchor = bbox, title_fontsize = 'x-large', fontsize = 'large') - - - def _export(self, - export_name: str, # the folder name to which the simulation content will be exported - export_directory: str # the directory within which the simulation folder will be created - ): - # define a unique simulation name - directory = os.getcwd() - if export_directory is not None: - directory = os.path.dirname(export_directory) - if export_name is None: - export_name = '-'.join([re.sub(' ', '_', str(x)) for x in [date.today(), 'dFBA', self.model.name, f'{self.total_time} min']]) - - simulation_number = -1 - while os.path.exists(os.path.join(directory, export_name)): - simulation_number += 1 - export_name = re.sub('(\-\d+$)', '', export_name) - export_name = '-'.join([export_name, str(simulation_number)]) - - self.parameters['simulation_path'] = self.simulation_path = os.path.join(directory, export_name) - os.mkdir(self.simulation_path) - - # export simulation content - self.fluxes.to_csv(os.path.join(self.simulation_path, 'fluxes.csv')) - self.concentrations.to_csv(os.path.join(self.simulation_path, 'concentrations.csv')) - - times = self.fluxes.columns - with open(os.path.join(self.simulation_path, 'objective_values.csv'), 'w') as obj_val: - obj_val.write('min,objective_value') - for sol in self.solutions: - index = self.solutions.index(sol) - time = re.sub('(\smin)', '', times[index]) - obj_val.write(f'\n{time},{sol.objective_value}') - - # export the parameters - parameters = {'parameter':[], 'value':[]} - for parameter in self.parameters: - parameters['parameter'].append(parameter) - parameters['value'].append(self.parameters[parameter]) - - parameters_table = pandas.DataFrame(parameters) - 
parameters_table.to_csv(os.path.join(self.simulation_path, 'parameters.csv')) - - # export the figure - self.figure.savefig(os.path.join(self.simulation_path, 'changed_concentrations.svg')) - if self.verbose: - if not self.jupyter: - self.figure.show() - - def _build_constraints(self): - # create a metabolite variable that prevents negative concentrations - timestep_hr = self.timestep_value * (minute/hour) - for met in self.model.metabolites: - if met.id in self.initial_concentrations_M: - coef = {} - for rxn in met.reactions: - stoich = timestep_hr*rxn.metabolites[met] # The product of the reaction stoichiometry and the timestep, for the integration of the steady-state - coef[rxn.forward_variable] = stoich - coef[rxn.reverse_variable] = -stoich - if stoich < 0: - coef[rxn.forward_variable] = -stoich - coef[rxn.reverse_variable] = stoich - - # build the metabolite constraint - if met.id in self.constraints["conc"]: - self.model.remove_cons_vars(self.constraints["conc"][met.id]) - self.constraints["conc"][met.id] = self.model.problem.Constraint( - Zero, lb=-self.concentrations.at[met.name, self.previous_col], ub=None, name=f'{met.id}_conc' - ) - self.model.add_cons_vars(self.constraints["conc"][met.id]) - self.model.solver.update() - - self.constraints["conc"][met.id].set_linear_coefficients(coef) - self.model.solver.update() - - # var = BaseFBAPkg.build_variable(self,"met",0,None,"continuous",met) - # BaseFBAPkg.build_constraint(self,"conc",0,None,{met:1},met) - - def _chemostat(self, feed_profile, # a dictionary of the chemicals and their concentrations in the influent feed - exchange_rate, # the L/hr flow rate of the feed and extraction of the chemostat - chemostat_L # the volume (l) of the chemostat - ): - L_changed = exchange_rate*self.timestep_value - # chemostat addition - for chem_id, conc in feed_profile.items(): - chem_name = self.met_ids[chem_id] - self.chemical_moles.at[chem_name, self.col] += conc*L_changed - self.concentrations.at[chem_name, self.col] = (self.chemical_moles.at[chem_name, self.col]/milli/chemostat_L) # normalize to the chemostat volume - - # chemostat subtraction - for met in self.model.metabolites: - if met.compartment[0] == 'e': - self.chemical_moles.at[met.name, self.col] -= (self.concentrations.at[met.name, self.col]*L_changed) - self.concentrations.at[met.name, self.col] = (self.chemical_moles.at[met.name, self.col]/milli/chemostat_L) # normalize to the chemostat volume - - - # nested functions - def __find_data_match(self, - reaction_name: str, # specifies the name of the given reaction - source: str # specifies which datum of the enzymatic data will be used, where multiple data entries are present - ): - # identifies the datum whose experimental conditions most closely matches the simulation conditions - temperature_deviation = ph_deviation = 0 - if isnumber(self.kinetics_data[reaction_name][source]['metadata']["Temperature"]): - temperature_deviation = abs(self.parameters['temperature'] - float(self.kinetics_data[reaction_name][source]['metadata']["Temperature"]))/self.parameters['temperature'] - if isnumber(self.kinetics_data[reaction_name][source]['metadata']["pH"]): - ph_deviation = abs(self.parameters['pH'] - float(self.kinetics_data[reaction_name][source]['metadata']["pH"]))/self.parameters['pH'] - - # equally weight between temperature and pH deviation from the simulation conditions - old_minimum = self.minimum - deviation = mean(temperature_deviation, ph_deviation) - self.minimum = min(deviation, self.minimum) - - if old_minimum == self.minimum: 
- return 'a' # append to an existing list of data - elif deviation == self.minimum: - return 'w' # construct a new list of data - - def __set_constraints(self, - reaction_name: str, flux: float # specify the name and flux of the given reaction, respectively - ): - rxn = self.defined_reactions[reaction_name] - rxn_name = re.sub(' ', '_', rxn.name) - if rxn_name in self.constrained: - self.model.remove_cons_vars(self.constrained[rxn_name]) - self.model.solver.update() - self.constrained[rxn_name] = self.model.problem.Constraint(rxn.flux_expression, lb=flux, ub=flux, name=f'{rxn_name}_kinetics') - self.model.add_cons_vars(self.constrained[rxn_name]) - self.model.solver.update() - if self.verbose: - print(self.model.constraints[f'{rxn_name}_kinetics']) - - - def __x_axis_determination(self,): - scalar = minute - time = self.total_time*scalar - unit = 's' - if time > 600: - unit = 'min' - scalar = 1 - if time > 7200: - unit = 'hr' - scalar = 1/hour - if time > 2e5: - scalar = 1/day - unit = 'days' - return scalar, unit \ No newline at end of file diff --git a/modelseedpy/community/get_ncbi_gbff.pl b/modelseedpy/community/get_ncbi_gbff.pl new file mode 100644 index 00000000..cbeddcfc --- /dev/null +++ b/modelseedpy/community/get_ncbi_gbff.pl @@ -0,0 +1,13 @@ +use strict; + +while (<>){ + chomp ($_); + next if ($_=~/^\s*$/); + my $val = `grep $_ assembly_summary_refseq.txt |cut -f 20`; + chomp ($val); + my @p = split ("/", $val); + my $n = $p[-1]; + my $url = "${val}/${n}_genomic.gbff.gz"; + my $fpath = "${n}_genomic.gbff.gz "; + print "curl $url -o $fpath" . "\n"; +} diff --git a/modelseedpy/community/metquest_code.py b/modelseedpy/community/metquest_code.py new file mode 100644 index 00000000..d2001edb --- /dev/null +++ b/modelseedpy/community/metquest_code.py @@ -0,0 +1,858 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +from collections import deque, defaultdict +import os +import glob +import sys +import warnings +from itertools import combinations +import re +import pandas as pd +import numpy as np +import cobra +import networkx as nx + +from modelseedpy.community import commhelper +from modelseedpy import MSModelUtil + +warnings.filterwarnings("ignore") + + +def _create_graph_with_internal_reaction(organismsdata): + """ + This function creates a NetworkX DiGraph object which consists of + reactions and metabolites happening inside the organisms in a community. + This makes use of the reaction information i.e., irreversible and + reversible, which is obtained from another script fetch_reactions. 
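+
+    An illustrative (hypothetical) shape of the input, matching the keys
+    accessed below and the naming produced by segregate_reactions_from_models:
+
+        {'org1': {'irreversible_rxn_no': ['Org_org1 IR1'],
+                  'irreversible_lhs_nodes': [['org1 A']],
+                  'irreversible_rhs_nodes': [['org1 B']],
+                  'reversible_rxn_no': ['Org_org1 RR1'],
+                  'reversible_back_rxn_no': ['Org_org1 RevBR1'],
+                  'reversible_lhs_nodes': [['org1 B']],
+                  'reversible_rhs_nodes': [['org1 C']]}}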
+ + Parameters + ---------- + organismsdata : dict + Dictionary containing the reaction information about organisms + + Returns + ------- + G : NetworkX DiGraph Object + Bipartite graph consisting of internal reactions in organisms + """ + G = nx.DiGraph() + for modelname in organismsdata: + G.add_nodes_from(organismsdata[modelname]['irreversible_rxn_no'], bipartite=1) + G.add_nodes_from(organismsdata[modelname]['reversible_rxn_no'], bipartite=1) + G.add_nodes_from(organismsdata[modelname]['reversible_back_rxn_no'], bipartite=1) + irrev_lhs_nodes = list(set( + [item for sublist in organismsdata[modelname]['irreversible_lhs_nodes'] for item in sublist])) + irrev_rhs_nodes = list(set( + [item for sublist in organismsdata[modelname]['irreversible_rhs_nodes'] for item in sublist])) + rev_lhs_nodes = list(set( + [item for sublist in organismsdata[modelname]['reversible_lhs_nodes'] for item in sublist])) + rev_rhs_nodes = list(set( + [item for sublist in organismsdata[modelname]['reversible_rhs_nodes'] for item in sublist])) + G.add_nodes_from(irrev_lhs_nodes, bipartite=0) + G.add_nodes_from(irrev_rhs_nodes, bipartite=0) + G.add_nodes_from(rev_lhs_nodes, bipartite=0) + G.add_nodes_from(rev_rhs_nodes, bipartite=0) + for irrevidx in range(len(organismsdata[modelname]['irreversible_rxn_no'])): + for lhsmetidx in range(len(organismsdata[modelname]['irreversible_lhs_nodes'][irrevidx])): + G.add_edges_from([(organismsdata[modelname]['irreversible_lhs_nodes'][irrevidx][lhsmetidx], + organismsdata[modelname]['irreversible_rxn_no'][irrevidx])]) + for rhsmetidx in range(len(organismsdata[modelname]['irreversible_rhs_nodes'][irrevidx])): + G.add_edges_from([(organismsdata[modelname]['irreversible_rxn_no'][irrevidx], + organismsdata[modelname]['irreversible_rhs_nodes'][irrevidx][rhsmetidx])]) + for revidx in range(len(organismsdata[modelname]['reversible_rxn_no'])): + for lhsmetidxrev in range(len(organismsdata[modelname]['reversible_lhs_nodes'][revidx])): + G.add_edges_from([(organismsdata[modelname]['reversible_lhs_nodes'][revidx][lhsmetidxrev], + organismsdata[modelname]['reversible_rxn_no'][revidx])]) + G.add_edges_from([(organismsdata[modelname]['reversible_back_rxn_no'][revidx], + organismsdata[modelname]['reversible_lhs_nodes'][revidx][lhsmetidxrev])]) + for rhsmetidxrev in range(len(organismsdata[modelname]['reversible_rhs_nodes'][revidx])): + G.add_edges_from([(organismsdata[modelname]['reversible_rxn_no'][revidx], + organismsdata[modelname]['reversible_rhs_nodes'][revidx][rhsmetidxrev])]) + G.add_edges_from([(organismsdata[modelname]['reversible_rhs_nodes'][revidx][rhsmetidxrev], + organismsdata[modelname]['reversible_back_rxn_no'][revidx])]) + return G + + +def _create_graph_with_exchange_reactions(G, orgs, namemap): + """ + This function first identifies the common exchange metabolites + and the non-common exchange metabolites and adds them to the + DiGraph object generated above. + + Parameters + ---------- + G : NetworkX DiGraph Object + Bipartite graph of reaction network from organisms + orgs : dict + Dictionary consisting of irreversible, reversible and exchange + reactions pertaining to the organisms. If more than one organism + is used, this dictionary consists of information about all the + organisms. 
+ namemap : dict + Dictionary mapping the adhoc reaction names to reaction names in + the model + + Returns + ------- + G : NetworkX DiGraph Object + Bipartite graph consisting of internal and exchange reactions in organisms + namemap : dict + Dictionary mapping the adhoc exchange reaction names to reaction names in + the model + """ + metabolite_exchanged = [] + for orgnames in orgs: + exc_met = orgs[orgnames]['exchange_metab_nodes'] + metabolite_exchanged.append(exc_met) + # Common exchange metabolites in different organisms + common_exchange_metabolite = list(set.intersection(*list(map(set, metabolite_exchanged)))) + common_exchange_metabolite.sort() + # Adding the common exchange metabolites to the graph + for orgnames in orgs: + renamed_exc_met = [f"{orgnames} {comexcmet}" for comexcmet in common_exchange_metabolite] + number_exc_met = list(range(0, len(common_exchange_metabolite))) + mod_exc_rxn_number = [f'Org_{orgnames} ER{str(num + 1)}' for num in number_exc_met] + mod_exc_rev_rxn_number = [f'Org_{orgnames} ERR{str(num + 1)}' for num in number_exc_met] + G.add_nodes_from(mod_exc_rxn_number, bipartite=1) + G.add_nodes_from(mod_exc_rev_rxn_number, bipartite=1) + G.add_nodes_from(common_exchange_metabolite, bipartite=0) + G.add_nodes_from(renamed_exc_met, bipartite=0) + for k in range(len(renamed_exc_met)): + namemap[mod_exc_rxn_number[k]] = common_exchange_metabolite[k] + namemap[mod_exc_rev_rxn_number[k]] = common_exchange_metabolite[k] + G.add_edges_from([(renamed_exc_met[k], mod_exc_rxn_number[k])]) + G.add_edges_from([(mod_exc_rxn_number[k], common_exchange_metabolite[k])]) + G.add_edges_from([(common_exchange_metabolite[k], mod_exc_rev_rxn_number[k])]) + G.add_edges_from([(mod_exc_rev_rxn_number[k], renamed_exc_met[k])]) + # Adding the uncommon exchange metabolites to the graph + for orgnames in orgs: + metitems = orgs[orgnames]['exchange_metab_nodes'] + non_common_exc_met = list(set(metitems) - set(common_exchange_metabolite)) + non_common_exc_met.sort() + renamed_non_common_exc_met = [f"{orgnames} {s}" for s in non_common_exc_met] + number_non_common_exc_met = list(range(0, len(non_common_exc_met))) + mod_non_common_exc_rxn_number = [f"Org_{orgnames} NCER{str(num + 1)}" for num in number_non_common_exc_met] + mod_non_common_exc_rev_rxn_number = [f"Org_{orgnames} NCERR{str(num + 1)}" + for num in number_non_common_exc_met] + G.add_nodes_from(mod_non_common_exc_rxn_number, bipartite=1) + G.add_nodes_from(mod_non_common_exc_rev_rxn_number, bipartite=1) + G.add_nodes_from(non_common_exc_met, bipartite=0) + G.add_nodes_from(renamed_non_common_exc_met, bipartite=0) + for k in range(len(renamed_non_common_exc_met)): + namemap[mod_non_common_exc_rxn_number[k]] = non_common_exc_met[k] + namemap[mod_non_common_exc_rev_rxn_number[k]] = non_common_exc_met[k] + G.add_edges_from([(renamed_non_common_exc_met[k], mod_non_common_exc_rxn_number[k])]) + G.add_edges_from([(mod_non_common_exc_rxn_number[k], non_common_exc_met[k])]) + G.add_edges_from([(non_common_exc_met[k], mod_non_common_exc_rev_rxn_number[k])]) + G.add_edges_from([(mod_non_common_exc_rev_rxn_number[k], renamed_non_common_exc_met[k])]) + return G, namemap + +def create_graph(file_names, no_of_orgs): + """ + This function creates bipartite graph of the organisms based on the + path provided and the number of organsisms. For instance, if a folder + has 3 model files, and the number of organisms is 2, 3 (3C2) different + bipartite graphs are created. 
The graph objects and the dictionary + are saved as gpickle and pickle files respectively. + + Parameters + ---------- + file_names : list + List containing the file names of models + no_of_orgs : int + Number of organisms to be used for creating the DiGraph. + + Returns + ------- + H : NetworkX DiGraph Object + Bipartite graph consisting of internal and exchange reactions in organisms + full_name_map : dict + Dictionary mapping the adhoc reaction names to reaction names in + the model + """ + + H=[] + organisms_reaction_data, partial_name_map = segregate_reactions_from_models(file_names) + if organisms_reaction_data: + organisms_names = list(organisms_reaction_data.keys()) + all_possible_combis = list(combinations(list(range(len(organisms_names))), int(no_of_orgs))) + if int(no_of_orgs)>1 and sorted(organisms_names)[0][0]=='0': + all_possible_combis = all_possible_combis[:len(organisms_names)-1] + if all_possible_combis: + for ncom in range(len(all_possible_combis)): + file_name = '' + current_combination = {} + for numincom in range(len(all_possible_combis[ncom])): + current_combination[organisms_names[all_possible_combis[ncom][numincom]]] = \ + organisms_reaction_data[organisms_names[all_possible_combis[ncom][numincom]]] + file_name = file_name + organisms_names[all_possible_combis[ncom][numincom]] + '_' + H.append(_create_graph_with_internal_reaction(current_combination)) + temp, full_name_map = _create_graph_with_exchange_reactions( + H[ncom], current_combination, partial_name_map) + H[ncom]=temp + print(len(H), H[ncom]) + print('Number of edges in graph', len(H[ncom].edges())) + print('Number of nodes in graph', len(H[ncom].nodes())) + + # Uncomment the following code to save the graph files externally in your machine + # Note: Graph files can occupy a large space for large datasets + ''' + if os.access(path_name_with_models, os.W_OK): + with open(file_name + 'namemap' + '.pickle', 'wb') as filetodump: + dump(full_name_map, filetodump) + nx.write_gpickle(H[ncom], file_name + '.gpickle') + print('Graph and namemap saved for file(s) in', path_name_with_models) + ''' + else: + print( + 'Number of organisms for creating a consortium graph is more than the models given') + print('Program will now exit') + sys.exit() + else: + print("Cannot create graph") + sys.exit() + return H, full_name_map + + +def forward_pass(graph_object, media): + """ + This function carries out the Guided Breadth First Search on a directed + bipartite graph starting from the entries in seed metabolite set. + + Parameters + ---------- + graph_object : NetworkX DiGraph Object + Bipartite graph of the metabolic network + + seedmet : set + Set of seed metabolites including the source + + Returns + ------- + lower_bound_metabolite : defaultdict + Minimum number of steps required to reach a metabolite + status_dict : defaultdict + Dictionary pertaining to the status of every reaction - whether it + has been visited or not + scope : set + Set of metabolites that can be produced from the given set of + seed metabolites + + Notes + ----- + Starting with the set of seed metabolites S, the algorithm first finds + all the reactions from the set R, whose precursor metabolites are in S. + Such reactions are marked visited and added to the visited reaction set. + Metabolites produced by these reactions are checked. The reactions where + these metabolites participate are then checked for the presence of all its + predecessors and are added to the queue. 
This traversal continues in a
+    breadth-first manner and stops when there are no further reactions to
+    be visited.
+    """
+    pred = graph_object.predecessors
+    succ = graph_object.successors
+    # defaultdict(list) guards against KeyErrors when newly produced metabolites
+    # (absent from the seed set) are first encountered during the traversal
+    lower_bound_metabolite = defaultdict(list, {cpd: [0] for cpd in media})
+    lower_bound_reaction = defaultdict(list)
+    status_dict = defaultdict(str)
+    # Using a deque since deques have O(1) speed for appendleft() and popleft()
+    # while lists have O(n) performance for inserting and popping.
+    queue = deque([])
+    # All seed metabolites are always present, hence require 0 steps
+    stage = 1
+    mediaMets = list(media.keys())
+    scope = set(media.keys())  # a set, since members are added via scope.add()
+    starting_rxn_node = []
+    # First stage where starting_rxn_node list contains all the reactions
+    # which require only the seed metabolites as input
+    for starting_met_nodes in mediaMets:
+        # Essential when analysing multiple networks with same seed metabolite
+        # set, although would be redundant in case of single network
+        if starting_met_nodes in graph_object:
+            for startingrxns in succ(starting_met_nodes):
+                if set(pred(startingrxns)).issubset(mediaMets):
+                    if startingrxns not in starting_rxn_node:
+                        starting_rxn_node.append(startingrxns)
+                    for metsprod in succ(startingrxns):
+                        scope.add(metsprod)
+                        if stage not in lower_bound_metabolite[metsprod]:
+                            lower_bound_metabolite[metsprod].append(stage)
+                    if stage not in lower_bound_reaction[startingrxns]:
+                        lower_bound_reaction[startingrxns].append(stage)
+    for rxn in starting_rxn_node:
+        for metabs in succ(rxn):
+            for nextrxn in succ(metabs):
+                if set(pred(nextrxn)).issubset(scope):
+                    if nextrxn not in queue: queue.append(nextrxn)
+        status_dict[rxn] = 'V'
+    while queue:
+        stage += 1
+        for parentrxn in list(queue):
+            if status_dict[parentrxn] == '':
+                if stage not in lower_bound_reaction[parentrxn]:
+                    lower_bound_reaction[parentrxn].append(stage)
+                for mets in succ(parentrxn):
+                    scope.add(mets)
+                    if stage not in lower_bound_metabolite[mets]:
+                        lower_bound_metabolite[mets].append(stage)
+                    for progeny in succ(mets):
+                        if set(pred(progeny)).issubset(scope):
+                            if status_dict[progeny] != 'V':
+                                if progeny not in queue: queue.append(progeny)
+                status_dict[parentrxn] = 'V'
+            elif status_dict[parentrxn] == 'V':
+                for mets in succ(parentrxn):
+                    if stage not in lower_bound_metabolite[mets]: lower_bound_metabolite[mets].append(stage)
+            queue.popleft()
+    return lower_bound_metabolite, status_dict, scope
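+
+# A toy illustration of the traversal above (hypothetical graph; not part of
+# this module's API):
+#
+#     import networkx as nx
+#     G = nx.DiGraph()
+#     G.add_nodes_from(["A", "B", "C"], bipartite=0)  # metabolites
+#     G.add_nodes_from(["R1", "R2"], bipartite=1)     # reactions
+#     G.add_edges_from([("A", "R1"), ("R1", "B"),     # A -> R1 -> B
+#                       ("B", "R2"), ("R2", "C")])    # B -> R2 -> C
+#     lbm, status, scope = forward_pass(G, {"A": 1})
+#     # scope == {"A", "B", "C"}: B becomes reachable at stage 1 and C at
+#     # stage 2, which lbm records as {"A": [0], "B": [1], "C": [2]}.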
+
+def find_different_reaction_types(stoi_matrix, model, current_model_name):
+    """
+    This function finds the exchange, irreversible and the reversible reactions
+    from the model.
+
+    Parameters
+    ----------
+    stoi_matrix : numpy array
+        transposed stoichiometric matrix of the model (reactions x metabolites)
+    model : COBRA model object
+        COBRA model object created from SBML models
+    current_model_name : str
+        Name which is to be prefixed against every
+        reaction/metabolite (to differentiate the entries in multiple organisms,
+        when a community model is built)
+    Returns
+    -------
+    exchange_met_ids : list
+        Metabolite identifiers of exchange metabolites
+    irrev_lhs_nodes : list
+        Metabolite identifiers of reactants of irreversible reactions
+    irrev_rhs_nodes : list
+        Metabolite identifiers of products of irreversible reactions
+    rev_lhs_nodes : list
+        Metabolite identifiers of reactants of reversible reactions
+    rev_rhs_nodes : list
+        Metabolite identifiers of products of reversible reactions
+    exchange_rxn_ids : list
+        Reaction identifiers of exchange reactions
+    irrev_rxn_ids : list
+        Reaction identifiers of irreversible reactions
+    rev_rxn_ids : list
+        Reaction identifiers of reversible reactions
+
+    """
+
+    xdim = np.shape(stoi_matrix)
+    reactants_of_reaction, total_metabolites_in_reaction, products_of_reaction = [], [], []
+    number_of_reactants_in_reaction, total_number_of_metabs_in_reaction = [], []
+    number_of_products_in_reaction, exchange_reaction_idx = [], []
+    reaction_identifiers, reaction_in_model, metabolite_identifiers = [], [], []
+    for metab in model.metabolites:
+        metabolite_identifiers.append(metab.id)
+    for rxns in model.reactions:
+        reaction_identifiers.append(rxns.id)
+        reaction_in_model.append(rxns.reaction)
+    for rxnidx in range(xdim[0]):
+        reactants_of_reaction.append(np.where(stoi_matrix[rxnidx] == -1))
+        total_metabolites_in_reaction.append(np.where(stoi_matrix[rxnidx] != 0))
+        products_of_reaction.append(np.where(stoi_matrix[rxnidx] == 1))
+        number_of_reactants_in_reaction.append(len(reactants_of_reaction[rxnidx][0]))
+        total_number_of_metabs_in_reaction.append(len(total_metabolites_in_reaction[rxnidx][0]))
+        number_of_products_in_reaction.append(len(products_of_reaction[rxnidx][0]))
+
+        # Case 1 - Presence of bulk metabolites in the medium
+        if reaction_in_model[rxnidx][-1] == 'b':  # Assuming the bulk metabolites end in 'b'
+            if number_of_reactants_in_reaction[rxnidx] == 1 and number_of_products_in_reaction[rxnidx] == 1:
+                exchange_reaction_idx.append(rxnidx)
+        # Case 2 - Presence of exchange metabolites
+        elif number_of_reactants_in_reaction[rxnidx] == 1 and total_number_of_metabs_in_reaction[rxnidx] == 1:
+            exchange_reaction_idx.append(rxnidx)
+        elif number_of_products_in_reaction[rxnidx] == 1 and total_number_of_metabs_in_reaction[rxnidx] == 1:
+            exchange_reaction_idx.append(rxnidx)
+    exchange_met_ids, exchange_met_index, exchange_rxn_ids = [], [], []
+    for excentry in exchange_reaction_idx:
+        exchange_rxn_ids.append(reaction_identifiers[excentry])
+        if reaction_in_model[excentry][-1] == 'b':
+            exchange_met_ids.append(metabolite_identifiers[np.nonzero(stoi_matrix[excentry])[0][0]])
+        else: exchange_met_index.append(np.nonzero(stoi_matrix[excentry])[0].tolist()[0])
+    if exchange_met_index:
+        for metind in exchange_met_index:
+            exchange_met_ids.append(metabolite_identifiers[metind])
+    all_rxn_idx = list(range(len(reaction_in_model)))
+    internal_rxns = list(set(all_rxn_idx) ^ set(exchange_reaction_idx))
+    reversible_rxns, irreversible_rxns, rxns_lowerbound, rxns_upperbound = [], [], [], []
+    for rxns in model.reactions:
+        rxns_lowerbound.append(rxns.lower_bound) ; rxns_upperbound.append(rxns.upper_bound)
+    for idxint in internal_rxns:
+        if rxns_lowerbound[idxint] < 0
and rxns_upperbound[idxint] >= 0: reversible_rxns.append(idxint)
+        elif rxns_lowerbound[idxint] >= 0 and rxns_upperbound[idxint] >= 0: irreversible_rxns.append(idxint)
+    # Irreversible reaction nodes
+    irrev_lhs_temporary, irrev_rhs_temporary, irrev_lhs_nodes, irrev_rhs_nodes, irrev_rxn_ids = [], [], [], [], []
+    for irridx in irreversible_rxns:
+        irrev_rxn_ids.append(reaction_identifiers[irridx])
+        irrev_lhs_temporary.append(np.where(stoi_matrix[irridx] < 0)[0].tolist())
+        irrev_rhs_temporary.append(np.where(stoi_matrix[irridx] > 0)[0].tolist())
+    for lhsirridx in range(len(irrev_lhs_temporary)):
+        temp_metab_list_lhs = []
+        for met_idx_lhs in irrev_lhs_temporary[lhsirridx]:
+            met_namech_lhs = f"{current_model_name} {metabolite_identifiers[met_idx_lhs]}"
+            temp_metab_list_lhs.append(met_namech_lhs)
+        irrev_lhs_nodes.append(temp_metab_list_lhs)
+    for rhsirridx in range(len(irrev_rhs_temporary)):
+        temp_metab_list_rhs = []
+        for met_idx_rhs in irrev_rhs_temporary[rhsirridx]:
+            met_namech_rhs = f"{current_model_name} {metabolite_identifiers[met_idx_rhs]}"
+            temp_metab_list_rhs.append(met_namech_rhs)
+        irrev_rhs_nodes.append(temp_metab_list_rhs)
+
+    # Reversible reaction nodes
+    rev_lhs_temporary, rev_rhs_temporary, rev_lhs_nodes, rev_rhs_nodes, rev_rxn_ids = [], [], [], [], []
+    for rridx in reversible_rxns:
+        rev_rxn_ids.append(reaction_identifiers[rridx])
+        rev_lhs_temporary.append(np.where(stoi_matrix[rridx] < 0)[0].tolist())
+        rev_rhs_temporary.append(np.where(stoi_matrix[rridx] > 0)[0].tolist())
+    for lhsrevidx in range(len(rev_lhs_temporary)):
+        temp_metab_list_lhs_rev = []
+        for met_idx_lhs in rev_lhs_temporary[lhsrevidx]:
+            met_namech_lhs = "%s %s" % (current_model_name, metabolite_identifiers[met_idx_lhs])
+            temp_metab_list_lhs_rev.append(met_namech_lhs)
+        rev_lhs_nodes.append(temp_metab_list_lhs_rev)
+    for rhsrevidx in range(len(rev_rhs_temporary)):
+        temp_metab_list_rhs_rev = []
+        for met_idx_rhs in rev_rhs_temporary[rhsrevidx]:
+            met_namech_rhs = "%s %s" % (current_model_name, metabolite_identifiers[met_idx_rhs])
+            temp_metab_list_rhs_rev.append(met_namech_rhs)
+        rev_rhs_nodes.append(temp_metab_list_rhs_rev)
+    return (exchange_met_ids, irrev_lhs_nodes, irrev_rhs_nodes, rev_lhs_nodes,
+            rev_rhs_nodes, exchange_rxn_ids, irrev_rxn_ids, rev_rxn_ids)
+
+
+def segregate_reactions_from_models(models):
+    """
+    This function gets the data pertaining to the reactions and the
+    metabolites from the models of multiple organisms.
+    It accepts a list of COBRA model objects, derives the stoichiometric
+    matrix of each model, and classifies the reactions of each model.
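+
+    For example (illustrative; ``modelA`` and ``modelB`` are any two COBRA
+    model objects):
+
+        >>> info, namemap = segregate_reactions_from_models([modelA, modelB])
+        >>> info[modelA.id]['exch_rxn_name']     # exchange reaction ids
+        >>> namemap[f'Org_{modelA.id} IR1']      # adhoc name -> model rxn id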
+ + Parameters + ---------- + models : list + List of model objects + + Returns + ------- + all_organisms_info : dict + Dictionary of all model data (reaction information about all the + organisms) + namemap : dict + Dictionary mapping the adhoc reaction names to reaction names in + the model + + """ + all_organisms_info = {} + namemap = {} + for model in models: + stoi = cobra.util.array.create_stoichiometric_matrix(model) + current_organisms_info = {} + rxns_in_model, mets_in_model = [], [] + for metab in model.metabolites: + mets_in_model.append(metab.id) + for reac in model.reactions: + rxns_in_model.append(reac.id) + stoi_matrix = stoi.T + (exchange_nodes, irrev_lhs_nodes, irrev_rhs_nodes, rev_lhs_nodes, rev_rhs_nodes, + exc_name, irrev_rxn_name, rev_rxn_name + ) = find_different_reaction_types(stoi_matrix, model, model.id) + current_organisms_info[model.id] = { + 'exchange_metab_nodes': exchange_nodes, 'irreversible_lhs_nodes': irrev_lhs_nodes, + 'irreversible_rhs_nodes': irrev_rhs_nodes, 'reversible_lhs_nodes': rev_lhs_nodes, + 'reversible_rhs_nodes': rev_rhs_nodes, 'exch_rxn_name': exc_name, 'irrev_rxn_name': irrev_rxn_name, + 'rev_rxn_name': rev_rxn_name} + + irrev_rxn_number = [] + for num in range(len(irrev_lhs_nodes)): + modified_name_irrev = f'Org_{model.id} IR' + str(num + 1) + irrev_rxn_number.append(modified_name_irrev) + namemap[modified_name_irrev] = irrev_rxn_name[num] + + rev_rxn_number = [] + for num in range(len(rev_lhs_nodes)): + modified_name_rev = f'Org_{model.id} RR' + str(num + 1) + rev_rxn_number.append(modified_name_rev) + namemap[modified_name_rev] = rev_rxn_name[num] + + rev_back_rxn_number = [] + for num in range(len(rev_lhs_nodes)): + modified_name_back_rev = f'Org_{model.id} RevBR' + str(num + 1) + rev_back_rxn_number.append(modified_name_back_rev) + namemap[modified_name_back_rev] = rev_rxn_name[num] + + current_organisms_info[model.id]['reversible_rxn_no'] = rev_rxn_number + current_organisms_info[model.id]['irreversible_rxn_no'] = irrev_rxn_number + current_organisms_info[model.id]['total_nodes'] = len( + exchange_nodes) + len(irrev_lhs_nodes) + len(rev_lhs_nodes) + current_organisms_info[model.id]['model_rxns'] = rxns_in_model + current_organisms_info[model.id]['reversible_back_rxn_no'] = rev_back_rxn_number + current_organisms_info[model.id]['metabolites'] = mets_in_model + all_organisms_info.update(current_organisms_info) + return all_organisms_info, namemap + +def find_relievedrxns(model, org_info, org_info_pert): + relieved = {i: list(set(org_info_pert[i]) - set(org_info[i])) for i in org_info_pert} + detailed_rel_rxns, rel_rxns_name = {}, {} + + for i in model: + j = i.id + detailed_rel_rxns[j] = [] + rel_rxns_name[j] = [] + if len(relieved[j]): + rxn_ids = [] + for r in i.reactions: + rxn_ids.append(r.id) + for rel in relieved[j]: + rel_rxn = i.reactions[rxn_ids.index(rel)].reaction + detailed_rel_rxns[j].append(rel_rxn) + rel_rxns_name[j].append(i.reactions[rxn_ids.index(rel)].name) + + return relieved, detailed_rel_rxns, rel_rxns_name + +def find_stuckrxns(model, community, media, no_of_orgs): + # Constructing graphs + warnings.filterwarnings("ignore") + G, full_name_map = create_graph(community, no_of_orgs) + if not os.path.exists('results'): os.makedirs('results') + all_possible_combis = list(combinations(list(range(len(community))), int(no_of_orgs))) + if no_of_orgs > 1 and sorted(community)[0][0] == '0': + all_possible_combis = all_possible_combis[:len(community) - 1] + org_info = {} + scope = {} + print('No. 
of graphs constructed: ', len(G)) + + # This loop finds all the stuck reaction + for i in range(len(all_possible_combis)): + lbm, sd, s = forward_pass(G[i], media) + for j in range(len(all_possible_combis[i])): + stuck, rxnNode = [], [] + model1 = model[all_possible_combis[i][j]].id + visited = list(sd.keys()) + for r in G[i].nodes: + if r.find(model1) >= 0: rxnNode.append(r) + for rxn in rxnNode: + if rxn in visited: continue + elif rxn.find('ERR') >= 0: continue + elif rxn.find('Org') >= 0: + if (rxn[len(model1) + 5] == 'I') or (rxn[len(model1) + 5] == 'R'): stuck.append(rxn) + org_info[model1] = stuck + scope[model1] = s + return org_info, scope, full_name_map + +def decrypt_orginfo(org_info, namemap): + """ + This function decrypts the rxn ids using the data in corresponding namemaps + :param org_info: + :param namemap: + :return: + org_info: An dictionary of decrypted rxn ids for each community + """ + for i in org_info: + for j in range(len(org_info[i])): + org_info[i][j] = namemap[org_info[i][j]] + return org_info + +def make_perturbed_community(rem_org, pert_models, pert_community): + pert_model_ids = [i.id for i in pert_models] + for i in rem_org: + if i in pert_model_ids: + pert_models.remove(pert_models[pert_model_ids.index(i)]) + pert_community.remove(pert_community[pert_model_ids.index(i)]) + pert_model_ids.remove(i) + + return pert_models, pert_community, pert_model_ids + +def perform_task(media, model, transport_rxns, pert_community, + org_info_wo_trans_rxn, rem_org_list, n): + org_info_pert, scope_pert, namemap_pert = find_stuckrxns(model, pert_community, media, len(pert_community)) + org_info_pert = decrypt_orginfo(org_info_pert, namemap_pert) + org_info_pert_wo_trans_rxn = {i:list(set(org_info_pert[i]) - set(transport_rxns)) for i in org_info_pert} + + with open(f"results/Community_without_clus{str(n)}.csv", "w") as g: + for m in org_info_pert_wo_trans_rxn: + g.write(m + ',' + str(len(org_info_pert_wo_trans_rxn[m])) + '\n') + stuck_com = stuck_pert_com = 0 + for i in org_info_wo_trans_rxn: + if i not in rem_org_list: stuck_com += len(org_info_wo_trans_rxn[i]) + for i in org_info_pert_wo_trans_rxn: + stuck_pert_com += len(org_info_pert_wo_trans_rxn[i]) + msi = 1 - (stuck_com / stuck_pert_com) + print(n, 'th cluster') + return org_info_pert, org_info_pert_wo_trans_rxn, msi + +def write_relieved_rxns(g, relieved, detailed_rel_rxns, rel_rxns_name): + g.write('acceptor\trelieved reactions\n') + for i in relieved: + g.write(i + '\t') + for j in list(set(relieved[i])): + g.write(j + '\t\n\t') + for d in list(set(rel_rxns_name[i])): + g.write(d + '\t\n\t') + for k in list(set(detailed_rel_rxns[i])): + g.write(k + '\t\n') + +def write_relieved_rxn_metadata(h, org_info_wo_trans_rxn, org_info_pert_wo_trans_rxn): + nrelieved = {} + for i in org_info_pert_wo_trans_rxn: + nrelieved[i] = len(org_info_pert_wo_trans_rxn[i]) - len(org_info_wo_trans_rxn[i]) + if nrelieved[i]: + h.write(i + ',' + str(len(org_info_wo_trans_rxn[i])) + ',' + str( + len(org_info_pert_wo_trans_rxn[i])) + ',' + str(nrelieved[i]) + '\n') + +def find_relieved_rxn(model, media_name, org_info_single, org_info_pair): + """ + This function extracts and writes the relieved rxns into a tsv file + :param model: + :param media_name: name of the media used (identifer to know what media is used when analysis is done using multiple media) + :param org_info_single: Dictionary containing stuck reactions of all microbes in the community + :param org_info_pair: Dictionary containing stuck reactions of all microbes in the community 
+    :return: None
+    """
+    relieved = {}
+    for org1 in model:
+        for org2 in model:
+            if org1.id + '_' + org2.id in org_info_pair.keys():
+                relieved[org1.id + '_' + org2.id] = []
+                temp = list(set(org_info_single[org1.id + '_' + org1.id]) - set(org_info_pair[org1.id + '_' + org2.id]))
+                for j in temp:
+                    relieved[org1.id + '_' + org2.id].append(j)
+            else: continue
+
+    rel_rxns_name, detailed_rel_rxns = {}, {}
+    for i in model:
+        rxn_ids = [r.id for r in i.reactions]
+        for j in model:
+            org1 = i.id ; org2 = j.id
+            if org1 + '_' + org2 in relieved.keys():
+                detailed_rel_rxns[org1 + '_' + org2] = []
+                rel_rxns_name[org1 + '_' + org2] = []
+                for rel in relieved[org1 + '_' + org2]:
+                    rel_rxn = i.reactions[rxn_ids.index(rel)].reaction
+                    detailed_rel_rxns[org1 + '_' + org2].append(rel_rxn)
+                    rel_rxns_name[org1 + '_' + org2].append(i.reactions[rxn_ids.index(rel)].name)
+
+    relieved_rxn_output_file = f'results/relieved_rxns_{media_name}_w_excrxns.tsv'
+    with open(relieved_rxn_output_file, 'w') as g:
+        header = 'acceptor\tdonor\trelieved reactions\n'
+        g.write(header)
+        for i in model:
+            for j in model:
+                org1 = i.id ; org2 = j.id
+                if org1 + '_' + org2 in relieved.keys():
+                    g.write(org1 + '\t' + org2 + '\t')
+                    rel_rxns = list(set(relieved[org1 + '_' + org2]))
+                    det_rel_rxns = list(set(detailed_rel_rxns[org1 + '_' + org2]))
+                    rel_rxn_nam = list(set(rel_rxns_name[org1 + '_' + org2]))
+                    for x in rel_rxns:
+                        g.write(x + '\t\n\t\t')
+                    for d in rel_rxn_nam:
+                        g.write(d + '\t\n\t\t')
+                    for k in det_rel_rxns:
+                        g.write(k + '\t\n')
+    print('relieved reactions are written at:\n', relieved_rxn_output_file)
+
+def find_stuck_rxns(models, community, media, comm_size):
+    """
+    Constructs graphs using MetQuest and finds all stuck reactions in the cellular compartment
+    :param models: list of GEMs
+    :param community: list of member models from which the graphs are built
+    :param media: dictionary of seed metabolites
+    :param comm_size: number of organisms in a community
+    :return:
+    org_info: Dictionary containing stuck reactions of all microbes in the community
+    scope: Dictionary containing all the metabolites that can be produced by the microbes in the community
+    namemap: Dictionary containing all the decrypted rxn ids
+    """
+    warnings.filterwarnings("ignore")
+    G, full_name_map = create_graph(community, comm_size)
+    if not os.path.exists('results'): os.makedirs('results')
+
+    # combinations() returns a lazy iterator; materialize index combinations so
+    # that len() and the repeated indexing below are valid
+    all_possible_combis = list(combinations(range(len(models)), comm_size))
+    org_info, scope, vis = {}, {}, {}
+    print('No.
of graphs constructed: ', len(G))
+
+    # This loop finds all the stuck reactions
+    for i in range(len(all_possible_combis)):
+        lbm, sd, s = forward_pass(G[i], media)
+        for j in range(len(all_possible_combis[i])):
+            stuck, rxnNode = [], []
+            model1 = models[all_possible_combis[i][j]].id
+            visited = list(sd.keys())
+            for r in G[i].nodes:
+                if r.find(model1) >= 0: rxnNode.append(r)
+            for rxn in rxnNode:
+                if rxn in visited or rxn.find('ERR') >= 0: continue
+                elif rxn.find('Org') >= 0:
+                    if (rxn[len(model1) + 5] == 'I') or (rxn[len(model1) + 5] == 'R'): stuck.append(rxn)
+            model2 = models[all_possible_combis[i][j - 1]].id
+            org_info[model1 + '_' + model2] = stuck
+            scope[model1 + '_' + model2] = s
+            vis[model1 + '_' + model2] = visited
+    return org_info, scope, full_name_map, vis

+def decrypt_org_info(org_info, namemap):
+    """
+    This function decrypts the rxn ids using the data in corresponding namemaps
+    :param org_info:
+    :param namemap:
+    :return:
+    org_info: A dictionary of decrypted rxn ids for each community
+    """
+    for i in org_info:
+        for j in range(len(org_info[i])):
+            org_info[i][j] = namemap[org_info[i][j]]
+    return org_info
+
+def pMSI(models, media):
+    """
+    Calculates MSI for CarveMe models
+    Extracts and writes relieved reactions in every pair
+    :param models: list of member GEMs
+    :param media: dictionary of seed metabolites
+    :return: msi: Dictionary containing MSI values for every pair
+    """
+    # find all transport reactions
+    community_model = commhelper.build_from_species_models(models)
+    comm_util = MSModelUtil(community_model)
+    # find stuck reactions; the graphs are built from the member models
+    org_info_single, scope_sin, namemap_sin, vis = find_stuck_rxns(models, models, media, 1)
+    org_info_pair, scope_pair, namemap_pair, vis = find_stuck_rxns(models, models, media, 2)
+    # decrypt the stuck reactions
+    org_info_single = decrypt_org_info(org_info_single, namemap_sin)
+    org_info_pair = decrypt_org_info(org_info_pair, namemap_pair)
+    # Filter out the transport reactions from every stuck reaction list
+    org_info_single_wo_trans_rxn, org_info_pair_wo_trans_rxn = {}, {}
+    for i in org_info_single:
+        org_info_single_wo_trans_rxn[i] = list(set(org_info_single[i]) - set(comm_util.transport_list()))
+    for i in org_info_pair:
+        org_info_pair_wo_trans_rxn[i] = list(set(org_info_pair[i]) - set(comm_util.transport_list()))
+    # find all the relieved reactions in every pair
+    find_relieved_rxn(models, "relieved_rxns", org_info_single, org_info_pair)
+    # calculate MSI for every pair
+    msi = {}
+    for org1 in models:
+        stuck_A = len(org_info_single_wo_trans_rxn[org1.id + '_' + org1.id])
+        for org2 in models:
+            if org1.id + '_' + org2.id in org_info_pair_wo_trans_rxn.keys():
+                stuck_AUB = len(org_info_pair_wo_trans_rxn[org1.id + '_' + org2.id])
+                if stuck_A == 0: msi[org1.id + '_' + org2.id] = 0
+                else: msi[org1.id + '_' + org2.id] = 1 - (stuck_AUB / stuck_A)
+    return msi, community_model
+
+def calculate_pairwiseMSI(models, media):
+    """
+    This function calculates pairwise-MSI for all given microbes.
+
+    Creates a csv file containing the MSI values of all pairs.
+
+    Creates a tsv file containing the list of reactions relieved
+    in all acceptor microbes in the presence of corresponding donor microbes.
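+
+    As computed in pMSI above, MSI(A, B) = 1 - |stuck(A with B)| / |stuck(A alone)|.
+    For instance (numbers purely illustrative), if organism A has 40 stuck
+    reactions alone and 30 of them remain stuck when B is present, then
+    MSI(A, B) = 1 - 30/40 = 0.25.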
+ + :param path: path to all xml files + :param sd_file: path to txt file containing seed metabolites + """ + + warnings.filterwarnings("ignore") + msi, community_model = pMSI(models, media) + msi_output_file = f"results/MSI_{os.path.basename(media).replace('.txt', '')}.csv" + with open(msi_output_file, 'w') as f: + header = 'organism,in_the_presence,msi_value\n' + f.write(header) + for org1, org2 in combinations(models, 2): + if org1.id + '_' + org2.id in msi.keys(): + f.write(f"{org1.id},{org2.id},{str(msi[org1.id + '_' + org2.id])}\n") + print('MSI values are written at:\n', msi_output_file) + +def calculate_higherorderMSI(models, media, clusters = 'individual_clusters'): + community_model = commhelper.build_from_species_models(models) + comm_util = MSModelUtil(community_model) + org_info, scope, namemap = find_stuckrxns(model, community, media, len(community)) + org_info = decrypt_orginfo(org_info, namemap) + org_info_wo_trans_rxn = {i: list(set(org_info[i]) - set(comm_util.transport_list())) for i in org_info} + + with open(f"results/community_unperturbed.csv", 'w') as f: + for i, diff in org_info_wo_trans_rxn.items(): + f.write(i + ',' + str(len(diff)) + '\n') + + if clusters == 'individual_clusters': + rem_org_list1, rem_org_list2 = {}, {} + for i, model in enumerate(models): + rem_org_list1[i] = model.id ; rem_org_list2[i] = model.id + else: + cluster_data = pd.read_csv(clusters, sep=',') + rem_org_list1 = cluster_data.set_index('Cluster').T.to_dict('list') + for n in rem_org_list1: + rem_org_list1[n] = [j for j in rem_org_list1[n] if pd.isna(j) is False] + for n in rem_org_list1: + rem_org_list1[n] = [cobra.io.read_sbml_model(i).id for i in rem_org_list1[n]] + # rem_org_list1[n] = [model_ids[model_ids.index(i)] for i in rem_org_list1[n]] + rem_org_list2 = rem_org_list1.copy() + + for nclus in rem_org_list2: + rem_org_list2[nclus] = [x.replace('.xml', '') for x in rem_org_list2[nclus]] + + with open(f"results/higher_order_msi.csv", 'w') as f: + for n in rem_org_list1: + # os.chdir(path) + # new_models = model.copy() + # new_community = glob.glob('*.xml') + # if not new_community: + # new_community = glob.glob('*.sbml') + # new_community.sort() + + pert_models, pert_community, pert_model_ids = make_perturbed_community(rem_org_list1[n], new_models, + new_community) + + org_info_pert, org_info_pert_wo_trans_rxn, msi = perform_task( + media, pert_models, transport_rxns, pert_community, org_info_wo_trans_rxn, rem_org_list2[n], n) + for i in rem_org_list2[n]: + f.write('Comm,clus_' + str(n) + '#' + i + ',' + str(msi) + '\n') + + if msi: + relieved, detailed_rel_rxns, rel_rxns_name = find_relievedrxns(pert_models, org_info, org_info_pert) + with open(f'results/clusterKO_/data_analysis/relieved_rxns_Comm--clus{n}.tsv', 'w') as g: + write_relieved_rxns(g, relieved, detailed_rel_rxns, rel_rxns_name) + with open(f'results/clusterKO_/data_analysis/Comm--clus{n}.tsv', 'w') as h: + h.write('Comm--clus' + str(n) + '\n') + for i in rem_org_list2[n]: + h.write(i + '\n') + h.write('num of rxns relieved in the below orgs in the presence of clust' + str(n) + '\n') + h.write('org,unpert,clust_' + str(n) + 'KO,rxns relieved\n') + write_relieved_rxn_metadata(h, org_info_wo_trans_rxn, org_info_pert_wo_trans_rxn) + print('Comm--clus' + str(n)) + + new_models = model.copy() + new_community = glob.glob('*.xml') + if not new_community: + new_community = glob.glob('*.sbml') + new_community.sort() + ko_models, ko_community, model_ids = make_perturbed_community(pert_model_ids, new_models, new_community) + 
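+        # The mirrored perturbation: having scored the community without
+        # cluster n above, the complementary community (cluster n alone, with
+        # all other members removed) is scored next, which writes the reverse
+        # dependency direction ('clus_n#member,Comm') to higher_order_msi.csv.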
ko_org_list = [x for x in pert_model_ids] + if len(ko_org_list) < len(model): + org_info_pert, org_info_pert_wo_trans_rxn, msi = perform_task( + media, ko_models, transport_rxns, ko_community, org_info_wo_trans_rxn, ko_org_list, n) + for i in ko_community: + f.write('clus_' + str(n) + '#' + i + ',Comm,' + str(msi) + '\n') + + if msi: + relieved, detailed_rel_rxns, rel_rxns_name = find_relievedrxns(ko_models, org_info, org_info_pert) + with open(f'results/clusterKO_/data_analysis/relieved_rxns_Comm--clus{n}.tsv', 'w') as g: + write_relieved_rxns(g, relieved, detailed_rel_rxns, rel_rxns_name) + with open(f'results/clusterKO_/data_analysis/Comm{n}--clus.tsv', 'w') as h: + h.write('clus' + str(n) + '--Comm\n') + for i in ko_org_list: + h.write(i + '\n') + h.write('num of rxns relieved in the below orgs in the presence of Comm') + h.write('org,unpert,commKO,rxns relieved\n') + write_relieved_rxn_metadata(h, org_info_wo_trans_rxn, org_info_pert_wo_trans_rxn) + print('clus' + str(n) + '--Comm') diff --git a/modelseedpy/community/mscommunity.py b/modelseedpy/community/mscommunity.py index 4c2b1a50..90dfdb3a 100644 --- a/modelseedpy/community/mscommunity.py +++ b/modelseedpy/community/mscommunity.py @@ -1,534 +1,248 @@ +# -*- coding: utf-8 -*- from modelseedpy.fbapkg.mspackagemanager import MSPackageManager from modelseedpy.core.msmodelutl import MSModelUtil +from modelseedpy.community.mssteadycom import MSSteadyCom +from modelseedpy.community.commhelper import build_from_species_models +from modelseedpy.core.exceptions import ObjectAlreadyDefinedError, FeasibilityError, NoFluxError from modelseedpy.core.msgapfill import MSGapfill from modelseedpy.core.fbahelper import FBAHelper #from modelseedpy.fbapkg.gapfillingpkg import default_blacklist -from modelseedpy.core import MSATPCorrection -from cobra import Model, Reaction, Metabolite +from modelseedpy.core.msatpcorrection import MSATPCorrection +from cobra.io import save_matlab_model, write_sbml_model from cobra.core.dictlist import DictList -from cobra.io import save_matlab_model -from itertools import combinations from optlang.symbolics import Zero -from matplotlib import pyplot +from optlang import Constraint from pandas import DataFrame +from pprint import pprint +from cobra import Reaction import logging -#import itertools -# import reframed -import cobra -import networkx -import sigfig -import re, os logger = logging.getLogger(__name__) -class CommunityModelSpecies: - def __init__(self, - community, # MSCommunity environment - biomass_cpd, # metabolite in the biomass reaction - names=[] # names of the community species - ): + +class CommunityMember: + def __init__(self, community, biomass_cpd, name=None, index=None, abundance=0): self.community, self.biomass_cpd = community, biomass_cpd - self.index = int(self.biomass_cpd.compartment[1:]) - self.abundance = 0 + self.index = index or int(self.biomass_cpd.compartment[1:]) + self.abundance = abundance if self.biomass_cpd in self.community.primary_biomass.metabolites: self.abundance = abs(self.community.primary_biomass.metabolites[self.biomass_cpd]) - if self.index <= len(names) and names[self.index-1]: - self.id = names[self.index-1] - else: - if "species_name" in self.biomass_cpd.annotation: - self.id = self.biomass_cpd.annotation["species_name"] - else: - self.id = "Species"+str(self.index) - + if name: self.id = name + elif "species_name" in self.biomass_cpd.annotation: + self.id = self.biomass_cpd.annotation["species_name"] + else: self.id = "Species"+str(self.index) + logger.info("Making atp 
hydrolysis reaction for species: "+self.id) - atp_rxn = FBAHelper.add_atp_hydrolysis(self.community.model,"c"+str(self.index)) - # FBAHelper.add_autodrain_reactions_to_self.community_model(self.community.model) # !!! FIXME This FBAHelper function is not defined. + atp_rxn = self.community.util.add_atp_hydrolysis("c"+str(self.index)) self.atp_hydrolysis = atp_rxn["reaction"] self.biomass_drain = None - self.biomasses = [] - for reaction in self.community.model.reactions: - if self.biomass_cpd in reaction.metabolites: - if reaction.metabolites[self.biomass_cpd] == 1 and len(reaction.metabolites) > 1: - self.biomasses.append(reaction) - elif len(reaction.metabolites) == 1 and reaction.metabolites[self.biomass_cpd] < 0: - self.biomass_drain = reaction - - if len(self.biomasses) == 0: - logger.critical("No biomass reaction found for species "+self.id) + self.biomasses, self.reactions = [], [] + self.primary_biomass = None + for rxn in self.community.util.model.reactions: + rxnComp = FBAHelper.rxn_compartment(rxn) + if not rxnComp: + print(f"The reaction {rxn.id} strangely lacks a compartment.") + elif int(rxnComp[1:]) == self.index and 'bio' not in rxn.name: + self.reactions.append(rxn) + if self.biomass_cpd in rxn.metabolites: + if rxn.metabolites[self.biomass_cpd] == 1 and len(rxn.metabolites) > 1: + self.biomasses.append(rxn) + if len(self.biomasses) == 1: # TODO make this condition more reflective of primary biomass + self.primary_biomass = rxn + elif len(rxn.metabolites) == 1 and rxn.metabolites[self.biomass_cpd] < 0: + self.biomass_drain = rxn + + if self.biomasses == []: logger.critical("No biomass reaction found for species " + self.id) if not self.biomass_drain: logger.info("Making biomass drain reaction for species: "+self.id) - self.biomass_drain = Reaction(id="DM_"+self.biomass_cpd.id, - name="DM_" + self.biomass_cpd.name, - lower_bound=0, upper_bound=100) - self.community.model.add_reactions([self.biomass_drain]) + self.biomass_drain = Reaction( + id="DM_"+self.biomass_cpd.id, name="DM_" + self.biomass_cpd.name, lower_bound=0, upper_bound=100) + self.community.util.model.add_reactions([self.biomass_drain]) self.biomass_drain.add_metabolites({self.biomass_cpd: -1}) self.biomass_drain.annotation["sbo"] = 'SBO:0000627' - + def disable_species(self): for reaction in self.community.model.reactions: - if int(FBAHelper.rxn_compartment(reaction)[1:]) == self.index: - reaction.upper_bound = reaction.lower_bound = 0 - + reaction_index = FBAHelper.rxn_compartment(reaction)[1:] + if int(reaction_index) == self.index: reaction.upper_bound = reaction.lower_bound = 0 + def compute_max_biomass(self): - if len(self.biomasses) == 0: - logger.critical("No biomass reaction found for species "+self.id) - self.community.model.objective = self.community.model.problem.Objective(Zero,direction="max") - self.community.model.objective.set_linear_coefficients({self.biomasses[0].forward_variable:1}) - if self.community.lp_filename != None: - self.community.print_lp(self.community.lp_filename+"_"+self.id+"_Biomass") + if len(self.biomasses) == 0: logger.critical("No biomass reaction found for species "+self.id) + self.community.util.add_objective(self.primary_biomass.flux_expression) + if self.community.lp_filename: self.community.print_lp(f"{self.community.lp_filename}_{self.id}_Biomass") return self.community.model.optimize() - + def compute_max_atp(self): - if not self.atp_hydrolysis: - logger.critical("No ATP hydrolysis found for species:"+self.id) - self.community.model.objective = 
self.community.model.problem.Objective(Zero,direction="max") - self.community.model.objective.set_linear_coefficients({self.atp_hydrolysis.forward_variable:1}) - if self.community.lp_filename: - self.community.print_lp(self.community.lp_filename+"_"+self.id+"_ATP") + if not self.atp_hydrolysis: logger.critical("No ATP hydrolysis found for species:" + self.id) + self.community.util.add_objective(Zero, coef={self.atp_hydrolysis.forward_variable: 1}) + if self.community.lp_filename: self.community.print_lp(f"{self.community.lp_filename}_{self.id}_ATP") return self.community.model.optimize() + class MSCommunity: - def __init__(self, model, - names=[], abundances=None, # names and abundances of the community species - pfba = True, # specify whether parsimonious FBA will be simulated - lp_filename = None # specify a filename to create an lp file - ): - #Setting model and package manager - self.model, self.lp_filename, self.pfba = model, lp_filename, pfba - self.pkgmgr = MSPackageManager.get_pkg_mgr(model) + def __init__(self, model=None, member_models: list = None, ids=None, abundances=None, kinetic_coeff=2000, + flux_limit=300, lp_filename=None, printing=False): + self.lp_filename = lp_filename self.gapfillings = {} + #Define Data attributes as None - self.solution = self.biomass_cpd = self.primary_biomass = self.biomass_drain = self.msgapfill = self.element_uptake_limit = self.kinetic_coeff = self.modelseed_db_path = None - self.species = DictList() - #Computing data from model - msid_cobraid_hash = FBAHelper.msid_hash(model) - if "cpd11416" not in msid_cobraid_hash: - logger.critical("Could not find biomass compound") + self.solution = self.biomass_cpd = self.primary_biomass = self.biomass_drain = None + self.msgapfill = self.element_uptake_limit = self.kinetic_coeff = self.msdb_path = None + # defining the models + if member_models is not None and model is None: + model = build_from_species_models(member_models, abundances=abundances, printing=printing) + if ids is None and member_models is not None: ids = [mem.id for mem in member_models] + self.id = model.id + self.util = MSModelUtil(model, True) + self.pkgmgr = MSPackageManager.get_pkg_mgr(self.util.model) + msid_cobraid_hash = self.util.msid_hash() + # print(msid_cobraid_hash) + write_sbml_model(model, "test_comm.xml") + + if "cpd11416" not in msid_cobraid_hash: raise KeyError("Could not find biomass compound for the model.") other_biomass_cpds = [] - for biomass_cpd in msid_cobraid_hash["cpd11416"]: - if biomass_cpd.compartment == "c0": - self.biomass_cpd = biomass_cpd - for reaction in model.reactions: - if self.biomass_cpd in reaction.metabolites: - if reaction.metabolites[self.biomass_cpd] == 1 and len(reaction.metabolites) > 1: - self.primary_biomass = reaction - elif reaction.metabolites[self.biomass_cpd] < 0 and len(reaction.metabolites) == 1: - self.biomass_drain = reaction - else: - other_biomass_cpds.append(biomass_cpd) - for biomass_cpd in other_biomass_cpds: - species_obj = CommunityModelSpecies(self,biomass_cpd,names) - self.species.append(species_obj) - if abundances: - self.set_abundance(abundances) - - @staticmethod - def build_from_species_models(models,mdlid=None,name=None,names=[],abundances=None): - """Merges the input list of single species metabolic models into a community metabolic model - - Parameters - ---------- - models : list - List of models to be merged into a community model - mdlid : string - String specifying community model ID - name : string - String specifying community model name - names : list - List 
of human readable names for models being merged - abundances : dict - Hash of relative abundances for input models in community model - - Returns - ------- - Cobra.Model - Community model object - - Raises - ------ - """ - newmodel = Model(mdlid,name) - newutl = MSModelUtil(newmodel) - biomass_compounds = [] - index = 1 - biomass_index = 2 - for model in models: - new_metabolites = [] - new_reactions = [] - #Rename metabolites - for met in model.metabolites: - #Renaming compartments - if re.search('[a-z+](\d*)$', met.compartment): - m = re.search('([a-z]+)(\d*)$', met.compartment) - if len(m[2]) == 0: - if m[1] == "e": - met.compartment += "0" - else: - met.compartment += str(index) - elif m[1] == "e": - met.compartment = m[1]+"0" - else: - met.compartment = m[1]+str(index) - #Processing metabolite ID - output = MSModelUtil.parse_id(met) - if output == None: - if met.compartment[0] != "e": - met.id += str(index) - elif output[1] != "e": - if len(output[2]) == 0: - met.id = met.id+str(index) - else: - met.id = output[0]+"_"+output[1]+str(index) - if met.id not in newmodel.metabolites: - new_metabolites.append(met) - if met.id == "cpd11416": - biomass_compounds.append(met) - #Rename reactions - for rxn in model.reactions: - if rxn.id[0:3] != "EX_": - if re.search('^(bio)(\d+)$', rxn.id) != None: - rxn.id = "bio"+str(biomass_index) - biomass_index += 1 - else: - output = MSModelUtil.parse_id(rxn) - if output == None: - if rxn.compartment.id[0] != "e": - rxn.id += str(index) - elif output[1] != "e": - if len(output[2]) == 0: - rxn.id = rxn.id+str(index) - else: - rxn.id = output[0]+"_"+output[1]+str(index) - if rxn.id not in newmodel.reactions: - new_reactions.append(rxn) - #Adding new reactions and compounds to base model - newmodel.add_reactions(new_reactions) - newmodel.add_metabolites(new_metabolites) - index += 1 - #Create community biomass - comm_biomass = Metabolite("cpd11416_c0", None, "Community biomass", 0, "c0") - metabolites = {comm_biomass : 1} - comm_biorxn = Reaction(id="bio1", name= "bio1", lower_bound=0, upper_bound=100) - count = len(biomass_compounds) - for cpd in biomass_compounds: - metabolites[cpd] = -1/count - comm_biorxn.add_metabolites(metabolites) - newmodel.add_reactions([comm_biorxn]) - newutl.add_exchanges_for_metabolites([comm_biomass],0,100,'SK_') - return MSCommunity(newmodel,names,abundances) - + for self.biomass_cpd in msid_cobraid_hash["cpd11416"]: + if "c0" in self.biomass_cpd.id: + for rxn in self.util.model.reactions: + if self.biomass_cpd not in rxn.metabolites: continue + print(self.biomass_cpd, rxn, end=";\t") + if rxn.metabolites[self.biomass_cpd] == 1 and len(rxn.metabolites) > 1: + if self.primary_biomass: raise ObjectAlreadyDefinedError( + f"The primary biomass {self.primary_biomass} is already defined," + f"hence, the {rxn.id} cannot be defined as the model primary biomass.") + if printing: print('primary biomass defined', rxn.id) + self.primary_biomass = rxn + elif rxn.metabolites[self.biomass_cpd] < 0 and len(rxn.metabolites) == 1: self.biomass_drain = rxn + elif 'c' in self.biomass_cpd.compartment: + other_biomass_cpds.append(self.biomass_cpd) + # assign community members and their abundances + print() # this returns the carriage after the tab-ends in the biomass compound printing + abundances = abundances or [1/len(other_biomass_cpds)]*len(other_biomass_cpds) + self.members = DictList( + CommunityMember(community=self, biomass_cpd=biomass_cpd, name=ids[memIndex], abundance=abundances[memIndex]) + for memIndex, biomass_cpd in 
enumerate(other_biomass_cpds)) + # assign the MSCommunity constraints and objective + self.abundances_set = False + if isinstance(abundances, dict): self.set_abundance(abundances) + self.pkgmgr.getpkg("CommKineticPkg").build_package(kinetic_coeff, self) + for member in self.members: + vars_coef = {} + for rxn in self.util.model.reactions: + if "EX_" not in rxn.id and member.index == FBAHelper.rxn_compartment(rxn)[1:]: + vars_coef[rxn.forward_variable] = vars_coef[rxn.reverse_variable] = 1 + print(member.id, flux_limit, member.abundance) + self.util.create_constraint(Constraint(Zero, lb=0, ub=flux_limit*member.abundance, + name=f"{member.id}_resource_balance"), coef=vars_coef) + #Manipulation functions - def set_abundance(self,abundances): - #ensure normalization - total_abundance = sum([abundances[species] for species in abundances]) - #map abundances to all species - for species in abundances: - abundances[species] = abundances[species]/total_abundance - if species in self.species: - self.species.get_by_id(species).abundance = abundances[species] + def set_abundance(self, abundances): + #calculate the normalized biomass + total_abundance = sum(list(abundances.values())) + # map abundances to all species + for species, abundance in abundances.items(): + if species in self.members: self.members.get_by_id(species).abundance = abundance/total_abundance #remake the primary biomass reaction based on abundances - if self.primary_biomass == None: - logger.critical("Primary biomass reaction not found in community model") - all_metabolites = {self.biomass_cpd:1} - for species in self.species: - all_metabolites[species.biomass_cpd] = -1*abundances[species.id] - self.primary_biomass.add_metabolites(all_metabolites,combine=False) - - def set_objective(self,target = None,minimize = False): #!!! Mustn't a multilevel objective be set for community models? 
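A note on the per-member resource-balance constraint assembled in `__init__` above: it caps the summed forward and reverse flux of a member's reactions at `flux_limit * abundance`. Below is a minimal standalone sketch of that pattern, using raw cobra/optlang calls rather than the `MSModelUtil.create_constraint` helper from the patch; the function and argument names are illustrative. Note also that `FBAHelper.rxn_compartment(rxn)[1:]` yields a string, so it should be cast to `int` before comparison with `member.index` when collecting a member's reactions.

    from optlang import Constraint
    from optlang.symbolics import Zero

    def add_resource_balance(model, member_rxns, member_id, flux_limit, abundance):
        # 0 <= sum of forward + reverse fluxes over the member's reactions <= flux_limit * abundance
        cons = Constraint(Zero, lb=0, ub=flux_limit * abundance, name=f"{member_id}_resource_balance")
        model.add_cons_vars(cons)
        model.solver.update()  # the constraint must be registered before coefficients are assigned
        cons.set_linear_coefficients({var: 1 for rxn in member_rxns
                                      for var in (rxn.forward_variable, rxn.reverse_variable)})
        return cons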
- if target == None: - target = self.primary_biomass.id - sense = "max" - if minimize: - sense = "min" - self.model.objective = self.model.problem.Objective( - self.model.reactions.get_by_id(target).flux_expression, - direction=sense - ) - - def constrain(self,element_uptake_limit = None, kinetic_coeff = None,modelseed_db_path = None): - # applying uptake constraints - self.element_uptake_limit = element_uptake_limit + if self.primary_biomass is None: logger.critical("Primary biomass reaction not found in community model") + all_metabolites = {self.primary_biomass.products[0]: 1} + all_metabolites.update({mem.biomass_cpd: -abundances[mem.id]/total_abundance for mem in self.members}) + self.primary_biomass.add_metabolites(all_metabolites, combine=False) + self.abundances_set = True + + def set_objective(self, target=None, targets=None, minimize=False): + targets = targets or [self.util.model.reactions.get_by_id(target or self.primary_biomass.id).flux_expression] + self.util.model.objective = self.util.model.problem.Objective( + sum(targets), direction="max" if not minimize else "min") + + def constrain(self, element_uptake_limit=None, thermo_params=None, msdb_path=None): if element_uptake_limit: + self.element_uptake_limit = element_uptake_limit self.pkgmgr.getpkg("ElementUptakePkg").build_package(element_uptake_limit) - # applying kinetic constraints - self.kinetic_coeff = kinetic_coeff - if kinetic_coeff: - self.pkgmgr.getpkg("CommKineticPkg").build_package(kinetic_coeff,self) - # applying FullThermo constraints - self.modelseed_db_path = modelseed_db_path - if modelseed_db_path: - self.pkgmgr.getpkg("FullThermoPkg").build_package({'modelseed_db_path':modelseed_db_path}) - - #Utility functions - def print_lp(self,filename = None): - if not filename: - filename = self.lp_filename - if filename: - with open(filename+".lp", 'w') as out: - out.write(str(self.model.solver)) - out.close() - - def compute_interactions(self, solution = None, # the COBRA simulation solution that will be parsed and visualized - threshold: int = 1, #!!! What is this threshold? 
- visualize: bool = True, # specifies whether the net flux will be depicted in a network diagram - export_directory: str = None, # specifies the directory to which the network diagram and associated datatable will be exported, where None does not export the content - node_metabolites: bool = True, # specifies whether the metabolites of each node will be printed - x_offset: float = 0.15, # specifies the x-axis buffer between each species node and its metabolite list in the network diagram - show_figure: bool = True # specifies whether the figure will be printed to the console - ): - #Check for solution - if not solution: - solution = self.solution - if not solution: - logger.warning("No feasible solution!") - return None - - #Initialize data - metabolite_data, species_data, species_collection = {}, {"Environment":{}}, {"Environment":{}} - data = {"IDs":[],"Metabolites/Donor":[], "Environment":[]} - met_list, species_list = [], [None for i in range(1000)] + if thermo_params: + if msdb_path: + self.msdb_path = msdb_path + thermo_params.update({'modelseed_db_path':msdb_path}) + self.pkgmgr.getpkg("FullThermoPkg").build_package(thermo_params) + else: self.pkgmgr.getpkg("SimpleThermoPkg").build_package(thermo_params) + + def interactions(self, solution=None, media=None, msdb=None, msdb_path=None, filename=None, figure_format="svg", + node_metabolites=True, flux_threshold=1, visualize=True, ignore_mets=None): + return MSSteadyCom.interactions(self, solution or self.solution, media, flux_threshold, msdb, msdb_path, + visualize, filename, figure_format, node_metabolites, True, ignore_mets) - #establish spreadsheet infrastructure for only extracellular metabolites - for met in self.model.metabolites: - if met.compartment == "e0": - met_list.append(met) - data["IDs"].append(met.id) - data["Metabolites/Donor"].append(met.name) - - metabolite_data[met] = {} - metabolite_data[met]["Environment"] = 0 - for individual in self.species: - metabolite_data[met][individual.id] = 0 - - for individual in self.species: - species_data[individual.id], species_collection[individual.id] = {}, {} - species_list[individual.index] = individual - data[individual.id] = [] - data["IDs"].append(individual.index) - data["Metabolites/Donor"].append(individual.id) - for other in self.species: - species_data[individual.id][other.id] = 0 - species_collection[individual.id][other.id] = [] - - species_data["Environment"][individual.id] = species_data[individual.id]["Environment"] = 0 - species_collection["Environment"][individual.id], species_collection[individual.id]["Environment"] = [], [] - - data["IDs"].append("Environment") - data["Metabolites/Donor"].append("Environment") - for individual in self.species: - data["IDs"].append(individual.index) - data["Metabolites/Donor"].append(individual.id+" list") - - # computing net metabolite flux from each reaction - for rxn in self.model.reactions: - if rxn.id[0:3] == "EX_" and abs(solution.fluxes[rxn.id]) > Zero: - cpd = list(rxn.metabolites.keys())[0] - if cpd in metabolite_data: - metabolite_data[cpd]["Environment"] += -1*solution.fluxes[rxn.id] - if len(rxn.id.split("_")) > 1: - comp_index = int(rxn.id.split("_")[-1][1:]) - for metabolite in rxn.metabolites: - if metabolite in metabolite_data: - if species_list[comp_index] != None: - metabolite_data[metabolite][species_list[comp_index].id] += solution.fluxes[rxn.id]*rxn.metabolites[metabolite] - - # translating net metbaolite flux into species interaction flux - for met in metabolite_data: - #Iterating through the metabolite 
producers - total = sum([metabolite_data[met][individual.id] for individual in self.species if metabolite_data[met][individual.id] > Zero]) - if metabolite_data[met]["Environment"] > Zero: - total += metabolite_data[met]["Environment"] - for individual in self.species: - if metabolite_data[met][individual.id] > Zero: - # calculate the total net flux between each combination of species, and track the involved metabolites - for other in self.species: - if metabolite_data[met][other.id] < Zero: - normalized_flux = abs(metabolite_data[met][individual.id]*metabolite_data[met][other.id])/total - species_data[individual.id][other.id] += normalized_flux - if normalized_flux > threshold: - species_collection[individual.id][other.id].append(met.name) - # calculate the total net flux between the species and the environment, and track the involved metabolites - if metabolite_data[met]["Environment"] < Zero: - normalized_flux = abs(metabolite_data[met][individual.id]*metabolite_data[met]["Environment"])/total - species_data[individual.id]["Environment"] += normalized_flux - if normalized_flux > threshold: - species_collection[individual.id]["Environment"].append(met.name) - if metabolite_data[met]["Environment"] > Zero: - for individual in self.species: - if metabolite_data[met][individual.id] < Zero: - normalized_flux = abs(metabolite_data[met]["Environment"]*metabolite_data[met][individual.id])/total - species_data["Environment"][individual.id] += normalized_flux - if normalized_flux > threshold: - species_collection["Environment"][individual.id].append(met.name) - - # construct a dataframe - for met in met_list: - for individual in self.species: - data[individual.id].append(metabolite_data[met][individual.id]) - data["Environment"].append(metabolite_data[met]["Environment"]) - for individual in self.species: - for other in self.species: - data[individual.id].append(species_data[individual.id][other.id]) - data[individual.id].append(species_data[individual.id]["Environment"]) - for individual in self.species: - data["Environment"].append(species_data["Environment"][individual.id]) - data["Environment"].append(0) - for individual in self.species: - for other in self.species: - data[individual.id].append("; ".join(species_collection[individual.id][other.id])) - data[individual.id].append("; ".join(species_collection[individual.id]["Environment"])) - for individual in self.species: - data["Environment"].append("; ".join(species_collection["Environment"][individual.id])) - data["Environment"].append(0), data["IDs"].append("Environment list"), data["Metabolites/Donor"].append("Environment list") - - self.cross_feeding_df = DataFrame(data) - logger.info(self.cross_feeding_df) - - # graph the network diagram - if visualize: - self._visualize_cross_feeding(export_directory, node_metabolites, x_offset, show_figure) - - return self.cross_feeding_df - - def _visualize_cross_feeding(self, export_directory, node_metabolites = True, x_offset = 0.15, show_figure = True): - # construct an efficient DataFrame of the cross-feeding interactions - net_cross_feeding = {} - for index, row in self.cross_feeding_df.iterrows(): - if re.search('Species\d+', row["Metabolites/Donor"]): - net_cross_feeding[row["Metabolites/Donor"]] = row[len(self.species):] - - # define species and the metabolite fluxes - net_cross_feeding = DataFrame(net_cross_feeding) - self.graph = networkx.Graph() - species_nums = {} - for species in self.species: - species_nums[species.index]= set() - self.graph.add_node(species.index) - for index, entry in 
net_cross_feeding[f'Species{species.index} list'].iteritems(): - if 'Species' in index and re.search('(\d+)', index).group() != species.index: - species_nums[species.index].update(entry.split('; ')) - - # define the net fluxes for each combination of two species - for species_1, species_2 in combinations(list(species_nums.keys()), 2): - species_2_to_1 = net_cross_feeding.at[f'Species{species_2}', f'Species{species_1}'] - species_1_to_2 = net_cross_feeding.at[f'Species{species_1}', f'Species{species_2}'] - interaction_net_flux = sigfig.round(species_2_to_1 - species_1_to_2, 3) - self.graph.add_edge(species_1,species_2,flux = interaction_net_flux) # The graph plots directionally toward the larger numbered species + #Utility functions + def print_lp(self, filename=None): + filename = filename or self.lp_filename + with open(filename+".lp", 'w') as out: out.write(str(self.util.model.solver)) ; out.close() - # compose the nextwork diagram of net fluxes - self.pos = networkx.circular_layout(self.graph) - if node_metabolites: - for species in self.pos: - x, y = self.pos[species] - metabolites = '\n'.join(species_nums[species]) - pyplot.text(x+x_offset, y, metabolites) - networkx.draw_networkx(self.graph,self.pos) - self.labels = networkx.get_edge_attributes(self.graph,'flux') - networkx.draw_networkx_edge_labels(self.graph,self.pos,edge_labels=self.labels) - - if export_directory: - pyplot.savefig(os.path.join(export_directory, 'cross_feeding_diagram.svg')) - self.cross_feeding_df.to_csv(os.path.join(export_directory, 'cross_feeding.csv')) - - if show_figure: - pyplot.show() - #Analysis functions - def gapfill(self, media = None, target = None, minimize = False,default_gapfill_templates = [], default_gapfill_models = [], test_conditions = [], reaction_scores = {}, blacklist = [], suffix = None, solver = 'glpk'): - if not target: - target = self.primary_biomass.id - self.set_objective(target,minimize) - gfname = FBAHelper.medianame(media)+"-"+target - if suffix: - gfname += f"-{suffix}" - self.gapfillings[gfname] = MSGapfill(self.model, default_gapfill_templates, default_gapfill_models, test_conditions, reaction_scores, blacklist) - gfresults = self.gapfillings[gfname].run_gapfilling(media,target, solver = solver) - if not gfresults: - logger.critical("Gapfilling failed with the specified model, media, and target reaction.") - return None + def gapfill(self, media = None, target = None, minimize = False, default_gapfill_templates=None, default_gapfill_models=None, + test_conditions=None, reaction_scores=None, blacklist=None, suffix = None, solver:str="glpk"): + default_gapfill_templates = default_gapfill_templates or [] + default_gapfill_models = default_gapfill_models or [] + test_conditions, blacklist = test_conditions or [], blacklist or [] + reaction_scores = reaction_scores or {} + if not target: target = self.primary_biomass.id + self.set_objective(target, minimize) + gfname = FBAHelper.mediaName(media) + "-" + target + if suffix: gfname += f"-{suffix}" + self.gapfillings[gfname] = MSGapfill(self.util.model, default_gapfill_templates, default_gapfill_models, + test_conditions, reaction_scores, blacklist, solver) + gfresults = self.gapfillings[gfname].run_gapfilling(media, target) + assert gfresults, f"Gapfilling of {self.util.model.id} in {gfname} towards {target} failed." 
return self.gapfillings[gfname].integrate_gapfill_solution(gfresults) - - def test_individual_species(self,media = None,allow_cross_feeding=True,run_atp=True,run_biomass=True): - self.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) - #Iterating over species and running tests - data = {"Species":[],"Biomass":[],"ATP":[]} - for individual in self.species: + + def test_individual_species(self, media=None, interacting=True, run_atp=True, run_biomass=True): + assert run_atp or run_biomass, ValueError("Either the run_atp or run_biomass arguments must be True.") + # self.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) + if media is not None: self.util.add_medium(media) + data = {"Species": [], "Biomass": [], "ATP": []} + for individual in self.members: data["Species"].append(individual.id) - with self.model: # WITH, here, discards changes after each simulation - #If no interaction allowed, iterate over all other species and disable them - if not allow_cross_feeding: - for indtwo in self.species: - if indtwo != individual: - indtwo.disable_species() - if run_biomass: #If testing biomass, setting objective to individual species biomass and optimizing - data["Biomass"].append(individual.compute_max_biomass()) - if run_atp: #If testing atp, setting objective to individual species atp and optimizing - data["ATP"].append(individual.compute_max_atp()) - df = DataFrame(data) - logger.info(df) - return df - - def atp_correction(self,core_template, atp_medias, atp_objective="bio2", max_gapfilling=None, gapfilling_delta=0): - self.atpcorrect = MSATPCorrection(self.model,core_template, atp_medias, atp_objective="bio2", max_gapfilling=None, gapfilling_delta=0) - - def predict_abundances(self,media=None,pfba=True,kinetic_coeff = None): - with self.model: # WITH, here, discards changes after each simulation - if not kinetic_coeff: - kinetic_coeff = self.kinetic_coeff - if not kinetic_coeff: #Kinetic coefficients must be used for this formulation to work - kinetic_coeff = 2000 - self.pkgmgr.getpkg("CommKineticPkg").build_package(kinetic_coeff,self) - - objcoef = {} - for species in self.species: - objcoef[species.biomasses[0].forward_variable] = 1 - new_objective = self.model.problem.Objective(Zero,direction="max") - self.model.objective = new_objective - new_objective.set_linear_coefficients(objcoef) - self.run(media,pfba) + with self.util.model: + if not interacting: + for other in self.members: + if other != individual: other.disable_species() + if run_biomass: data["Biomass"].append(individual.compute_max_biomass()) + if run_atp: data["ATP"].append(individual.compute_max_atp()) + return DataFrame(data) + + def atp_correction(self, core_template, atp_medias, max_gapfilling=None, gapfilling_delta=0): + self.atp = MSATPCorrection(self.util.model, core_template, atp_medias, "c0", max_gapfilling, gapfilling_delta) + + # TODO evaluate the comparison of this method with MICOM + def predict_abundances(self, media=None, pfba=True): + with self.util.model: + self.util.model.objective = self.util.model.problem.Objective( + sum([species.primary_biomass.forward_variable for species in self.members]), direction="max") + self.run_fba(media, pfba) return self._compute_relative_abundance_from_solution() - return None - - def run(self,media,pfba = None): - self.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) - self.print_lp() - save_matlab_model(self.model, self.model.name+".mat") - if pfba or self.pfba: - self._set_solution(cobra.flux_analysis.pfba(self.model)) - else: - self._set_solution(self.model.optimize()) 
- if not self.solution: - return None - logger.info(self.model.summary()) - return self.solution - - #Internal functions - def _compute_relative_abundance_from_solution(self,solution = None): - if not solution and not self.solution: - logger.warning("No feasible solution!") - return None - data = {"Species":[],"Abundance":[]} - totalgrowth = sum([self.solution.fluxes[species.biomasses[0].id] for species in self.species]) - if totalgrowth == 0: - logger.warning("The community did not grow!") - return None - for species in self.species: - data["Species"].append(species.id) - data["Abundance"].append(self.solution.fluxes[species.biomasses[0].id]/totalgrowth) - df = DataFrame(data) - logger.info(df) - return df - - def _set_solution(self,solution): - self.solution = None - if solution.status != 'optimal': - logger.warning("No solution found for the simulation.") - return + def run_fba(self, media=None, pfba=False, fva_reactions=None): + if media is not None: self.util.add_medium(media) + return self._set_solution(self.util.run_fba(None, pfba, fva_reactions)) + + def _compute_relative_abundance_from_solution(self, solution=None): + if not solution and not self.solution: logger.warning("The simulation lacks any flux.") ; return None + comm_growth = sum([self.solution.fluxes[member.primary_biomass.id] for member in self.members]) + assert comm_growth > 0, NoFluxError(f"The total community growth is {comm_growth}") + return {member.id: self.solution.fluxes[member.primary_biomass.id]/comm_growth for member in self.members} + + def _set_solution(self, solution): + if solution.status != "optimal": + FeasibilityError(f'The solution is sub-optimal, with a(n) {solution} status.') + self.solution = None + self.print_lp() + save_matlab_model(self.util.model, self.util.model.name + ".mat") self.solution = solution + logger.info(self.util.model.summary()) + return self.solution + + def parse_member_growths(self): + # f"cpd11416_c{member.index}" + return {member.name: self.solution.fluxes[member.primary_biomass.id] for member in self.members} - def steady_com(self,): - from reframed.community import SteadyCom, SteadyComVA - - reframed_model = FBAHelper.get_reframed_model(self.model) - \ No newline at end of file + def return_member_models(self): + # TODO return a list of member models that is parsed from the .members attribute + ## which will have applicability in disaggregating community models that do not have member models + ## such as Filipe's Nitrate reducing community model for the SBI ENIGMA team. 
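# One possible shape for this disaggregation (a sketch, not part of this patch):
# each CommunityMember already collects its compartment-indexed reactions, so a
# naive split could copy them into per-member models, e.g.:
#
#     from cobra import Model
#     submodels = []
#     for member in self.members:
#         sub = Model(member.id)
#         sub.add_reactions([rxn.copy() for rxn in member.reactions])
#         submodels.append(sub)
#     return submodels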
+ return diff --git a/modelseedpy/community/mscompatibility.py b/modelseedpy/community/mscompatibility.py deleted file mode 100644 index 02fdbe8c..00000000 --- a/modelseedpy/community/mscompatibility.py +++ /dev/null @@ -1,452 +0,0 @@ -from collections import OrderedDict -from cobra.core.metabolite import Metabolite -from cobra.io.json import save_json_model -from zipfile import ZipFile, ZIP_LZMA -from warnings import warn -from pprint import pprint -import json, lzma, re, os - -class MSCompatibility(): - def __init__(self, - modelseed_db_path: str, # the local path to the ModelSEEDDatabase repository - printing = True # specifies whether results are printed - ): - self.printing = printing - - # import and parse ModelSEED Database reactions and compounds - with open(os.path.join(modelseed_db_path, 'Biochemistry', 'reactions.json'), 'r') as rxns: - self.reactions = json.load(rxns) - self.reaction_ids = OrderedDict() - for rxn in self.reactions: - self.reaction_ids[rxn['id']] = rxn['name'] - - with open(os.path.join(modelseed_db_path, 'Biochemistry', 'compounds.json'), 'r') as rxns: - self.compounds = json.load(rxns) - self.compounds_cross_references, self.compound_names = OrderedDict(), OrderedDict() - for cpd in self.compounds: - self.compounds_cross_references[cpd['id']] = {} - if cpd['aliases'] is not None: - for category in cpd['aliases']: - content = category.split(';') - if 'Name' in category: - content[0] = content[0].split(':')[0].strip() - names = [name.strip() for name in content] - names.append(cpd['name']) - for name in names: - if name not in self.compound_names: - self.compound_names[name] = cpd['id'] - else: - first = content[0].split(':') - db = first[0].strip() - content[0] = first[1] - self.compounds_cross_references[cpd['id']][db] = [x.strip() for x in content] - - - # def _parse_modelReactionReagents(self, modelReactionReagents, model_metabolites): - # rxn_dict = {} - # for cpd in modelReactionReagents: - # met = re.search('(?<=id\/)(.+)', cpd['modelcompound_ref']).group() - # stoich = float(cpd['coefficient']) - # if met in model_metabolites: - # met = model_metabolites[met] - # elif re.sub('_\w\d', '', met) in model_metabolites: - # met = model_metabolites[re.sub('_\w\d', '', met)] - # else: - # KeyError(f'ModelSEEDError: The metabolite {met} in the reactions is not in the modelreactions.') - # rxn_dict[met] = stoich - - # return rxn_dict - - def standardize(self, models, # the collection of cobrakbase models that will be compared - metabolites: bool = True, # specifies whether metabolites or reactions (FALSE) will be standardized - exchanges: bool = True, # specifies whether only the exchange reaction will be standardized - conflicts_file_name: str = None, # the metabolite conflicts are stored and organized, where None does not export - model_names: list = None, # specifies the export names of the models - model_format: str = 'json', # specifies to which format the model will be exported - export_directory: str = None # specifies the directory to which all of the content will be exported - ): - self.models = models - self.unique_mets, self.met_conflicts = OrderedDict(), OrderedDict() - self.unknown_met_ids, self.changed_metabolites, self.changed_reactions= [], [], [] - self.changed_ids_count = self.changed_rxn_count = 0 - for self.model_index, self.model in enumerate(self.models): - # standardize metabolites - if metabolites: - if exchanges: - model_metabolites = [met.id for met in self.model.metabolites] - for ex_rxn in self.model.exchanges: - for met in 
ex_rxn.metabolites: - met, new_met_id, success = self._fix_met(met) - try: - ex_rxn.id = 'EX_'+met.id - except: - ex_rxn.id = 'EX_'+new_met_id - if 'cpd' not in met.id and success and new_met_id not in model_metabolites: - self.unknown_met_ids.append(met.id) - warn(f'CodeError: The metabolite {met.id} | {met.name} was not corrected to a ModelSEED metabolite.') - else: - for met in self.model.metabolites: - met, new_met_id, success = self._fix_met(met) - if 'cpd' not in met.id: - self.unknown_met_ids.append(met.id) - warn(f'CodeError: The metabolite {met.id} | {met.name} was not corrected to a ModelSEED metabolite.') - - if conflicts_file_name is not None: - self._export({'metabolite_changes':self.changed_metabolites, 'reaction_changes':self.changed_reactions}, - conflicts_file_name, model_names, model_format, export_directory - ) - - # standardize reactions - # else: #!!! The modelreactions appear to be incorrect - # modelreactions_ids = {re.sub('(_\w\d$)', '', rxn['id']).removeprefix('R-'):rxn for rxn in model.modelreactions} - # with open(os.path.join(export_directory, 'modelreactions.json'), 'w') as out: - # json.dump(modelreactions_ids, out, indent = 3) - # model_metabolites = {met.id:met for met in model.metabolites} - # missed_reactions = 0 - # for rxn in model.reactions: - # if 'EX_' in rxn.id: - # continue - # original_reaction = rxn.reaction - # rxn.add_metabolites({rxn_met:0 for rxn_met in rxn.metabolites}, combine = False) - - # if re.sub('(_\w\d$)', '', rxn.id) in modelreactions_ids: - # reaction_dict = self._parse_modelReactionReagents( - # modelreactions_ids[re.sub('(_\w\d$)', '', rxn.id)]['modelReactionReagents'], model_metabolites - # ) - # elif rxn.id in modelreactions_ids: - # reaction_dict = self._parse_modelReactionReagents( - # modelreactions_ids[rxn.id]['modelReactionReagents'], model_metabolites - # ) - # else: - # warn(f'ModelSEEDError: The reaction ID {rxn.id} is not captured by the modelreactions.') - - # try: - # rxn.add_metabolites(reaction_dict, combine = False) - # except: - # new_reaction_dict = {} - # for met, content in reaction_dict.items(): - # if isinstance(met, str): - # met = re.sub('_\w\d', '', met) - # else: - # if re.sub('_\w\d', '', met.id) not in model.metabolites: - # met.id = re.sub('_\w\d', '', met.id) - # new_reaction_dict[met] = content - # reaction_dict = new_reaction_dict - # if rxn.id not in self.reaction_ids: - # missed_reactions += 1 - # # warn(f'ModelSEEDError: The {rxn.id} | {rxn.name} reaction is not recognized by the ModelSEED Database') - - # # describe the change - # if original_reaction != rxn.reaction: - # change = { - # 'original': { - # 'reaction': original_reaction - # }, - # 'new': { - # 'reaction': rxn.reaction - # }, - # 'explanation': f'The reaction {rxn.id} was reconstructed from the ModelSEED Database.' 
- # } - # self.changed_reactions.append(change) - - # if export_directory is not None: - # with open(os.path.join(export_directory, 'standardized_reactions.txt'), 'w') as out: - # json.dump(self.changed_reactions, out, indent = 3) - - # total_reactions = 0 - # for model in models: - # total_reactions += len(model.reactions) - - # warn(f'\nModelSEEDError: {missed_reactions}/{total_reactions} reactions were not captured by the ModelSEED modelreaction IDs.') - - self.models[self.model_index] = self.model - print(f'\n\n{self.changed_rxn_count} reactions were substituted and {self.changed_ids_count} metabolite IDs were redefined.') - return self.models - - def align_exchanges(self, models, # the collection of cobrakbase models that will be compared - standardize: bool = False, # standardize the model names and reactions to the ModelSEED Database - conflicts_file_name: str = None, # the metabolite conflicts are stored and organized, where None does not the conflicts - model_names: list = None, # specifies the name of the exported model, where None does not export the models - model_format: str = 'json', # specifies to which format the model will be exported - export_directory: str = None # specifies the directory to which all of the content will be exported - ): - self.models = models - self.changed_ids_count = self.changed_rxn_count = 0 - if standardize: - self.standardize_MSD(self.models) - - unique_names, established_mets, self.unknown_met_ids, self.changed_metabolites, self.changed_reactions = [], [], [], [], [] - self.unique_mets, self.met_conflicts = OrderedDict(), OrderedDict() - for self.model_index, self.model in enumerate(self.models): - model_metabolites = {met.id:met for met in self.model.metabolites} - for ex_rxn in self.model.exchanges: - for met in ex_rxn.metabolites: - met_name = re.sub('_\w\d$', '', met.name) - if met.id not in self.unique_mets and met.id not in established_mets: - if met_name not in unique_names: - # identify the unique metabolite - self.unique_mets[met.id] = { - f'model{self.model_index}_id': met.id, - f'model{self.model_index}_met': met - } - unique_names.append(met_name) - else: - # describe the metabolite conflict between the ID and name - former_id = list(self.unique_mets.keys())[unique_names.index(met_name)] - former_model_index = list(self.unique_mets[former_id].keys())[0].split('_')[0].removeprefix('model') - if met.name not in self.met_conflicts: - self.met_conflicts[met_name] = { - f'model{former_model_index}_id': former_id, - f'model{former_model_index}_met': self.unique_mets[former_id][f'model{former_model_index}_met'], - f'model{self.model_index}_id': met.id, - f'model{self.model_index}_met': met - } - else: - self.met_conflicts[met_name].update({ - f'model{self.model_index}_id': met.id, - f'model{self.model_index}_met': met - }) - met, new_met_id, success = self._fix_met(met) - else: - former_name = unique_names[list(self.unique_mets.keys()).index(met.id)] - former_model_index = list(self.unique_mets[met.id].keys())[0].split('_')[0].removeprefix('model') - if met_name == former_name: - # remove the metabolite that is no longer unique - del unique_names[list(self.unique_mets.keys()).index(met.id)] - self.unique_mets.pop(met.id) - established_mets.append(met.id) - else: - # describe the conflicting metabolite names - if met.id not in self.met_conflicts: - self.met_conflicts[met.id] = { - f'model{former_model_index}_name': former_name, - f'model{former_model_index}_met': self.unique_mets[former_id][f'model{former_model_index}_met'], - 
f'model{self.model_index}_name': met.name, - f'model{self.model_index}_met': met - } - else: - if f'model{self.model_index}_name' not in self.met_conflicts[met.id]: - self.met_conflicts[met.id].update({ - f'model{self.model_index}_name': met.name, - f'model{self.model_index}_met': met - }) - else: - iteration = 0 - while f'model{self.model_index}_{iteration}_name' in self.met_conflicts[met.id]: - iteration += 1 - - self.met_conflicts[met.id].update({ - f'model{self.model_index}_{iteration}_name': met.name, - f'model{self.model_index}_{iteration}_met': met - }) - met, new_met_id, success = self._fix_met(met) - - self.models[self.model_index] = self.model - - # correct the reaction ID - if re.sub('(_\w\d$)', '', ex_rxn.id).removeprefix('EX_') in model_metabolites: - suffix = re.search('(_\w\d$)', ex_rxn.id).group() - rxn_met, new_met_id, success = self._fix_met(re.sub('(_\w\d$)', '', ex_rxn.id).removeprefix('EX_')) - ex_rxn.id = 'EX_'+new_met_id+suffix - - if conflicts_file_name: - export_met_conflicts = {} - for met_id, content in self.met_conflicts.items(): - export_met_conflicts[met_id] = {} - for key, val in content.items(): - if '_met' not in key: - export_met_conflicts[met_id][key] = val - else: - export_met_conflicts[met_id][key.replace('_met','_formula')] = val.formula - - self._export(export_met_conflicts, conflicts_file_name, model_names, model_format, export_directory) - - print(f'\n\n{self.changed_rxn_count} exchange reactions were substituted and {self.changed_ids_count} exchange metabolite IDs were redefined.') - return self.models - - def _fix_met(self,met): - # correct the conflict - base_name = ''.join(met.name.split('-')[1:]).capitalize() - met_name = re.sub('_\w\d$', '', met.name) - new_met_id = met.id - success = True - if met.name in self.compound_names: - met, new_met_id = self.__correct_met(met, met.name) - elif met.name.capitalize() in self.compound_names: - met, new_met_id = self.__correct_met(met, met.name.capitalize()) - elif met_name in self.compound_names: - met, new_met_id = self.__correct_met(met, met_name) - elif met_name.capitalize() in self.compound_names: - met, new_met_id = self.__correct_met(met, met_name.capitalize()) - elif base_name in self.compound_names and base_name != '': - met, new_met_id = self.__correct_met(met, base_name) - else: - self.unknown_met_ids.append(met.id) - success = False - warn(f'ModelSEEDError: The metabolite ({" | ".join([x for x in [met.id, met.name, base_name, met_name] if x != ""])}) is not recognized by the ModelSEED Database') - return met, new_met_id, success - - def _export(self, conflicts, # the conflicts dictionary that will be exported - conflicts_file_name, # the metabolite conflicts are stored and organized, where None does not the conflicts - model_names, # specifies the name of the exported model, where None does not export the models - model_format, # specifies to which format the model will be exported - export_directory # specifies the directory to which all of the content will be exported - ): - if export_directory is None: - export_directory = os.getcwd() - - file_paths = [] - if conflicts_file_name is not None: - path = os.path.join(export_directory,conflicts_file_name) - file_paths.append(os.path.relpath(path, export_directory)) - with open(path, 'w') as out: - json.dump(conflicts, out, indent = 3) - if model_names is not None: - for index, model in enumerate(self.models): - path = os.path.join(export_directory,f'{model_names[index]}.{model_format}') - file_paths.append(os.path.relpath(path, 
export_directory)) - save_json_model(model, path) - with ZipFile('_'.join(model_names[:4])+'.zip', 'w', compression = ZIP_LZMA) as zip: - for file in file_paths: - zip.write(file) - os.remove(file) - - def __correct_met(self, met, met_name, standardize = False): - def check_cross_references(met, general_met): - for db in self.compounds_cross_references[general_met]: - for cross_ref in self.compounds_cross_references[general_met][db]: - if cross_ref in self.compounds_cross_references[self.compound_names[met_name]][db]: - match = True - break - if match: - break - return match, db - - original_id = new_met_id = met.id - compartment = re.search('(_\w\d$)', met.id).group() - if met.id.removesuffix(compartment) != self.compound_names[met_name]: # If the ID associated with the name deviates from that in the ModelSEED Database - new_met_id = self.compound_names[met_name]+compartment - if new_met_id in met.model.metabolites: - # replace the undesirable isomer in every instance, since it cannot be renamed - for rxn in met.reactions: - double_reagent = False - original_reaction = rxn.reaction - removal_dict, reaction_dict = {}, {} - for rxn_met in rxn.reactants+rxn.products: # The REACTANTS+PRODUCTS may resolve metabolites that are both, more than the METABOLITES attribute - match = False - stoich = float(rxn.metabolites[rxn_met]) - compartment = re.search('(_\w\d$)', rxn_met.id).group() - new_met = rxn_met - if rxn_met.id == met.id: - if new_met_id in [old_met.id for old_met in rxn.metabolites]: - double_reagent = True - warn(f'CodeError: The metabolite {new_met_id} replacement for {met.id} already exists in the reaction {rxn.id}, thus the reaction cannot be updated.') - break - - # affirm the match with cross-references, where it is possible for ModelSEED compounds - general_met = re.sub("(_\w\d$)", "", met.id) - if 'cpd' in met.id and self.compounds_cross_references[general_met] != {}: - match, db = check_cross_references(met, general_met) - if not match: - warn(f'ModelSEEDError: The old metabolite {met.id} cross-references ({self.compounds_cross_references[general_met]}) do not overlap with those ({self.compounds_cross_references[self.compound_names[met_name]]}) of the new metabolite {new_met_id}.') - - # remove duplicate exchange reaction - if 'EX_' in rxn.id and 'EX_'+new_met_id in [ex_rxn.id for ex_rxn in self.model.exchanges]: - change = { - 'original': { - 'reaction': original_reaction - }, - 'new': { - 'reaction': None - }, - 'justification': f'A {new_met_id} exchange reaction already exists in model {self.model_index}, thus this duplicative exchange reaction ({rxn.id}) is deleted.' - } - if match: - change['justification'] += f' The ID match was verified with {db} cross-references.' 
- self.model.remove_reactions([rxn.id]) - self.changed_reactions.append(change) - if self.printing: - print('\n') - pprint(change, sort_dicts=False) - self.changed_rxn_count += 1 - double_reagent = True - break - - # define the metabolite with the new name - new_met = Metabolite( - id = new_met_id, - name = met_name, - formula = met.formula, - charge = met.charge, - compartment = met.compartment - ) - - removal_dict[rxn_met] = 0 - reaction_dict[new_met] = stoich - - # reconstruct the reactions - if double_reagent: - continue - new_reactants = 0 - for key, val in reaction_dict.items(): - new_reactants += 1 if val < 0 else 0 - new_products = len(reaction_dict) - new_reactants - num_reactants, num_products = len(rxn.reactants), len(rxn.products) - if num_reactants == new_reactants and num_products == new_products: - rxn.add_metabolites(removal_dict, combine = False) - rxn.add_metabolites(reaction_dict, combine = False) - change = { - 'original': { - 'reaction': original_reaction - }, - 'new': { - 'reaction': rxn.reaction - }, - 'justification': f'The {new_met_id} replacement for {met.id} already exists in model {self.model_index}, so each reaction (here {rxn.id}) must be updated.' - } - if match: - change['justification'] += f' The ID match was verified with {db} cross-references.' - self.changed_reactions.append(change) - if self.printing: - print('\n') - pprint(change, sort_dicts=False) - - self.changed_rxn_count += 1 - else: - warn(f'CodeError: The reaction {reaction_dict} | {new_reactants} {new_products} possesses a different number of reagents than the original reaction {original_reaction} | {num_reactants} {num_products}, and is skipped.') - else: - # affirm the match with cross-references, where it is possible for ModelSEED compounds - match = False - general_met = re.sub("(_\w\d$)", "", met.id) - if 'cpd' in met.id and self.compounds_cross_references[general_met] != {}: - match, db = check_cross_references(met, general_met) - if not match: - warn(f'ModelSEEDError: The old metabolite {met.id} cross-references ({self.compounds_cross_references[general_met]}) do not overlap with those ({self.compounds_cross_references[self.compound_names[met_name]]}) of the new metabolite {new_met_id}.') - - # rename the undesirable isomer - met.id = self.compound_names[met_name]+compartment - change = { - 'original': { - 'id': original_id, - 'name': met.name - }, - 'new': { - 'id': met.id, - 'name': met_name+compartment - }, - 'justification': f'The {original_id} and {met.id} distinction in {self.model_index} is incompatible.' - } - if 'cpd' not in original_id: - change['justification'] = f'The {original_id} ID is not a ModelSEED Database ID.' - if standardize: - change['justification'] = f'The {original_id} and {met.id} metabolites were matched via their name.' - if match: - change['justification'] += f' The ID match was verified with {db} cross-references.' 
- self.changed_metabolites.append(change) - if self.printing: - print('\n') - pprint(change, sort_dicts=False) - self.changed_ids_count += 1 - - return met, new_met_id - \ No newline at end of file diff --git a/modelseedpy/community/mskineticsfba.py b/modelseedpy/community/mskineticsfba.py new file mode 100644 index 00000000..7a5e92b8 --- /dev/null +++ b/modelseedpy/community/mskineticsfba.py @@ -0,0 +1,281 @@ +# -*- coding: utf-8 -*- + +from scipy.constants import milli, hour, minute, day, femto +from modelseedpy.fbapkg.basefbapkg import BaseFBAPkg +from modelseedpy import MSModelUtil +from optlang import Constraint +from modelseedpy.core.fbahelper import FBAHelper +from collections import OrderedDict +from optlang.symbolics import Zero +from numpy import log10, nan, mean +from warnings import warn +from matplotlib import pyplot +from pprint import pprint +from datetime import date +from math import inf +import pandas +import json, re, os + +def _x_axis_determination(total_time): + time = total_time * minute + if time <= 600: return minute, "s" + if time > 600: return 1, "min" + if time > 7200: return 1/hour, "hr" + return 1/day, "days" + +def _check_datum(datum): + if "substituted_rate_law" not in datum: + print(f"RateLawError: The {datum} datum lacks a rate law.") + return False + remainder = re.sub("([0-9A-Za-z/()e\-\+\.\*\_])", "", datum["substituted_rate_law"]) + if remainder != "": + print(f'RateLawError: The {datum["substituted_rate_law"]}' + f' rate law contains unknown characters: {remainder}') + return False + return True + + +class MSKineticsFBA: + def __init__(self, model, warnings: bool = True, verbose: bool = False, + printing: bool = False, jupyter: bool = False): + self.warnings, self.verbose, self.printing, self.jupyter = warnings, verbose, printing, jupyter + self.model_util = MSModelUtil(model) + self.met_ids = OrderedDict({met.id: met.id for met in self.model_util.model.metabolites}) + + def baseKinFBA(self, kinetics_path: str = None, kinetics_data: dict = None, + initial_M: dict = None, # a dictionary of the initial metabolic concentrations, which supplants concentrations from the defined kinetics data + total_min: float = 200, ts_min: float = 20, export_name = None, export_directory = None, + chemostat_L: float = None, feed_profile: dict = None, chemostat_L_hr: float = None, + temperature: float = 25, p_h: float = 7, cell_dry_g: float = 1.44e-13, cellular_L: float = 1e-18, + conc_figure_title = "Metabolic perturbation", included_mets: list = None, labeled_plots = True, + visualize = True, export = True): + # define the dataframe for the time series content + feed_profile, constrained, self.constraints = feed_profile or {}, {}, {} + included_mets, self.sols = included_mets or [], [] + self.parameters = {"timesteps": int(total_min/ts_min), "pH": p_h, "temperature": temperature} + self.variables = {"elapsed_time": 0} + self.ts_min, self.minimum = ts_min, inf + timestep_hr = self.ts_min / (hour / minute) + self.constrained = OrderedDict() + cell_g_L = (cell_dry_g/cellular_L) # https://journals.asm.org/doi/full/10.1128/AEM.64.2.688-694.1998 + + # define reaction kinetics and initial concentrations + assert kinetics_path or kinetics_data, "Either < kinetics_path > or < kinetics_data > must be provided" + if kinetics_path: + with open(kinetics_path) as data: self.kinetics_data = json.load(data) + elif kinetics_data: self.kinetics_data = kinetics_data.copy() + ## define the concentration, moles, and fluxes DataFrames + self.time = "0 min" + self.conc = 
pandas.DataFrame([0]*len(self.met_ids), index=list(self.met_ids.keys()), columns=[self.time])
+        self.conc.index.name = "metabolite (mM)"
+        self.moles = self.conc.copy(deep=True)
+        self.fluxes = pandas.DataFrame(index=[rxn.id for rxn in self.model_util.model.reactions], columns=[self.time])
+        self.fluxes.index.name = "reaction (\u0394mmol/hr*g_(dw))"  # Delta
+        ## parse the kinetics data
+        for content in self.kinetics_data.values():
+            for condition, datum in content.items():
+                if "initial_M" not in datum: continue
+                for var, conc in datum["initial_M"].items():
+                    met_id = datum["met_id"][var]
+                    if met_id in self.met_ids: self.conc.at[met_id, self.time] += conc/milli
+                    elif self.warnings: warn(f"KineticsError: The {met_id} reagent ({var}) in the"
+                                             f" {datum['substituted_rate_law']} rate law is not defined by the model.")
+        ## incorporate custom initial concentrations, which overwrite values from the kinetics data
+        for met_id in (initial_M or {}):  # tolerates the default initial_M=None
+            self.conc.at[met_id, self.time] = initial_M[met_id] / milli
+        defined_concs = self.conc[self.conc[self.time] != 0][self.time].to_dict()
+        chemostat_requirements = [chemostat_L is not None, feed_profile != {}, chemostat_L_hr is not None]
+        # execute FBA for each timestep, then calculate custom fluxes, constrain the model, and update concentrations
+        model_rxns = [rxn.id for rxn in self.model_util.model.reactions]
+        newTime = 0
+        for timestep in range(1, self.parameters["timesteps"] + 1):
+            oldTime = newTime ; newTime = timestep * self.ts_min ; t = timestep * timestep_hr
+            self.previous_time = f"{oldTime} min" ; self.time = f"{newTime} min"
+            self.conc[self.time] = [float(0)] * len(self.conc.index)
+            self.fluxes[self.time] = [0] * len(self.fluxes.index)
+            ## create a metabolite constraint that prevents negative concentrations
+            for met in self.model_util.model.metabolites:
+                if met.id not in defined_concs: continue
+                if met.id not in self.constraints: self.constraints[met.id] = {}
+                coef = {}
+                for rxn in met.reactions:
+                    ### the product of the reaction stoichiometry and the timestep
+                    stoich = abs(timestep_hr * rxn.metabolites[met])
+                    coef[rxn.forward_variable], coef[rxn.reverse_variable] = stoich, -stoich
+                ### build the metabolite constraint
+                if newTime-self.ts_min in self.constraints[met.id]:
+                    self.model_util.remove_cons_vars([self.constraints[met.id][newTime-self.ts_min]])
+                self.constraints[met.id][newTime] = Constraint(Zero, lb=0, ub=None, name=f"{met.id}_conc")
+                self.model_util.create_constraint(self.constraints[met.id][newTime], coef)
+            ## calculate the fluxes from the kinetic rate laws
+            for rxnID in self.kinetics_data:
+                # TODO allocate the following code into a function and recursively reduce the timestep until
+                ## the concentration becomes non-negative, following the model of microBialSim. This may require
+                ## time dependency in the kinetics expression to achieve the desired behavior.
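# A sketch of the TODO above (not part of this patch): recursively halve the
# candidate timestep until no tracked concentration is projected to go negative,
# in the spirit of microBialSim. `projected_delta` is a hypothetical callable
# that returns {met_id: concentration_change} for a trial timestep dt.
#
#     def safe_timestep(conc_now, projected_delta, dt, dt_min=1e-3):
#         delta = projected_delta(dt)
#         if dt <= dt_min or all(conc_now[m] + delta[m] >= 0 for m in conc_now):
#             return dt
#         return safe_timestep(conc_now, projected_delta, dt / 2, dt_min)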
+ if rxnID not in model_rxns and self.warnings: + warn(f"ReactionError: {rxnID} is not in the model.") ; continue + fluxes = [] + for source in self.kinetics_data[rxnID]: + datum = self.kinetics_data[rxnID][source] + if not _check_datum(datum): continue + ### define rate law variables; calculate flux; average or overwrite the flux based on data criteria + locals().update({metID: self.conc.at[metID, self.previous_time]*milli for metID in datum["mets"]}) + flux = eval(datum["substituted_rate_law"]) + print(datum["substituted_rate_law"], flux) + if ("metadata" not in self.kinetics_data[rxnID][source] + or self.__find_data_match(rxnID, source) == 'a'): fluxes.append(flux) + else: fluxes = [flux] + + flux = mean(fluxes) + rxn = self.model_util.model.reactions.get_by_id(rxnID) + rxn.lb = rxn.ub = flux + self.fluxes.at[rxnID, self.time] = flux + ## execute the COBRA model + sol = self.model_util.model.optimize() + self.sols.append(sol) + ## add previously undefined fluxes and concentrations + for rxnID in self.fluxes.index: + if self.fluxes.at[rxnID, self.time] == 0: + self.fluxes.at[rxnID, self.time] = sol.fluxes[rxnID] + for met in self.model_util.model.metabolites: + self.conc.at[met.id, self.time] = 0 + for rxn in met.reactions: + flux = self.fluxes.at[rxn.id, self.time] + if flux == 0: continue + # print(rxn.metabolites[met], flux, timestep_hr, cell_g_L) + self.conc.at[met.id, self.time] += rxn.metabolites[met] * flux * timestep_hr * cell_g_L + if all(chemostat_requirements): + self.moles[self.time] = (self.conc[self.time] * milli * chemostat_L) + self._chemostat(feed_profile, chemostat_L_hr, chemostat_L) + elif any(chemostat_requirements): warn("The < chemostat_L > , < feed_profile >, and < chemostat_L_hr >" + " parameters must all be defined to simulate a chemostat.") + self.variables["elapsed_time"] += self.ts_min + if self.printing: print(f"\nObjective value (\u0394t{self.ts_min}): ", self.sols[-1].objective_value) + + # identify the chemicals that dynamically changed in concentrations + self.changed = set([met_id for met_id in self.met_ids + if self.conc.at[met_id, "0 min"] != self.conc.at[met_id, self.time]]) + self.unchanged = set(self.met_ids.keys()) - self.changed + + # visualize concentration changes over time + if visualize: self._visualize(conc_figure_title, included_mets, labeled_plots) + if export: self._export(export_name, export_directory, total_min) + if self.verbose: print(f"\nChanged concentrations:\t{self.changed}", + f"\nConstrained reactions:\t{constrained.keys()}") + elif self.printing: + if self.jupyter: pandas.set_option("max_rows", None) ; display(self.conc, self.fluxes) + if self.unchanged == set(): print("All of the metabolites changed concentration over the simulation") + else: print(f"\nUnchanged metabolite concentrations\t{self.unchanged}") + return self.conc, self.fluxes + + def _chemostat(self, feed_profile:dict, chemostat_L_hr, chemostat_L): + L_changed = chemostat_L_hr * self.ts_min + # chemostat addition + for met_id, conc in feed_profile.items(): + self.moles.at[met_id, self.time] += conc * L_changed + self.conc.at[met_id, self.time] = ( + self.moles.at[met_id, self.time] / milli / chemostat_L) # normalize to the chemostat volume + # chemostat subtraction + for met in self.model_util.model.metabolites: + if met.compartment[0] != "e": continue + ## update the chemical moles + self.moles.at[met.id, self.time] -= (self.conc.at[met.id, self.time] * L_changed) + ## define the chemical concentration + self.conc.at[met.id, self.time] = ( + self.moles.at[met.id, 
+
+    # nested functions
+    def __find_data_match(self, rxnID: str, source: str):
+        # identifies the datum whose experimental conditions most closely match the simulation conditions
+        temperature_deviation = ph_deviation = 0
+        if FBAHelper.isnumber(self.kinetics_data[rxnID][source]["metadata"]["Temperature"]):
+            temp = float(self.kinetics_data[rxnID][source]["metadata"]["Temperature"])
+            temperature_deviation = (abs(self.parameters["temperature"] - temp) / self.parameters["temperature"])
+        if FBAHelper.isnumber(self.kinetics_data[rxnID][source]["metadata"]["pH"]):
+            pH = float(self.kinetics_data[rxnID][source]["metadata"]["pH"])
+            ph_deviation = (abs(self.parameters["pH"] - pH) / self.parameters["pH"])
+
+        # equally weight the temperature and pH deviations from the simulation conditions
+        old_minimum = self.minimum
+        deviation = mean([temperature_deviation, ph_deviation])
+        self.minimum = min(deviation, self.minimum)
+        return "a" if old_minimum == self.minimum else "w"  # append or overwrite the list of data
+
+    def _visualize(self, conc_fig_title, included_mets, labeled_plots):
+        # TODO construct a Vega visualization with a range bind that permits scanning over a time series
+        ## and accordingly adjusting arrowhead widths to reflect flux at the particular timestep.
+        ## The heatmap may likewise be dynamic for each timestep over a bind range.
+
+        # define the figure
+        pyplot.rcParams['figure.figsize'] = (11, 7)
+        pyplot.rcParams['figure.dpi'] = 150
+        self.figure, ax = pyplot.subplots()
+        ax.set_title(conc_fig_title)
+        ax.set_ylabel("Concentrations (mM)")
+        x_axis_scalar, unit = _x_axis_determination(self.total_min)
+        ax.set_xlabel("Time " + unit)
+        legend_list = []
+        times = [t * self.ts_min * x_axis_scalar for t in range(self.parameters["timesteps"] + 1)]
+
+        # determine the plotted metabolites and the scale of the figure axis
+        bbox = (1, 1)
+        if not included_mets:
+            bbox = (1.7, 1)
+            # 1e-2 is an arbitrary concentration threshold for plotting on the figure
+            included_mets = [chem for chem in self.changed
+                             if max(self.conc.loc[[chem]].values[0].tolist()) > 1e-2]
+
+        log_axis = False
+        minimum, maximum = inf, -inf
+        printed_concentrations = {}
+        for chem in self.changed:
+            if chem not in included_mets: continue
+            concentrations = self.conc.loc[[chem]].values[0].tolist()
+            maximum = max(maximum, max([x if x > 1e-9 else 0 for x in concentrations]))
+            minimum = min(minimum, min([x if x > 1e-9 else 0 for x in concentrations]))
+            # plot chemicals with perturbed concentrations
+            ax.plot(times, concentrations)
+            if len(chem) > 25: chem = self.met_ids[chem]  # substitute the shorter mapped name
+            if not concentrations[0] < 1e-9: legend_list.append(chem)
+            else: legend_list.append(f"(rel) {chem}")
+
+            # position the overlaid labels in the figure
+            if not labeled_plots: continue
+            for i, conc in enumerate(concentrations):
+                if conc <= 1e-9: continue
+                x_value = i * self.ts_min * x_axis_scalar  # match the scaling of the plotted times
+                vertical_adjustment = 0
+                if x_value in printed_concentrations:
+                    vertical_adjustment = (maximum - minimum) * 0.05
+                    if log_axis: vertical_adjustment = log10(maximum - minimum) / 3
+                ax.text(x_value, conc + vertical_adjustment, f"{chem} - {round(conc, 4)}", ha="left")
+                printed_concentrations[x_value] = conc
+                break
+
+        # finalize figure details
+        if maximum > 10 * minimum: ax.set_yscale("log")
+        ax.set_xticks(times)
+        ax.grid(True)
+        ax.legend(legend_list, title="Changed chemicals", loc="upper right",
+                  bbox_to_anchor=bbox, title_fontsize="x-large", fontsize="large")
+
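+    # A sketch of the kinetics_data layout that the simulation loop and __find_data_match()
+    # above assume; the reaction ID, source label, and rate-law constants are hypothetical:
+    #   self.kinetics_data = {
+    #       "rxn00148": {                                   # reaction ID
+    #           "source_1": {
+    #               "mets": ["cpd00020_c0"],                # IDs that are bound to concentrations
+    #               "substituted_rate_law": "(50.0*cpd00020_c0)/(0.5+cpd00020_c0)",
+    #               "metadata": {"Temperature": "25", "pH": "7.0"}}}}
+    # Each ID in "mets" is bound to its current concentration (scaled by the module's milli
+    # factor) via locals().update(), after which the rate-law string is eval()'d into a flux
+    # estimate for the current timestep.
+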
+    def _export(self, export_name="kineticsFBA", export_directory: str = None):
+        # define a unique simulation path
+        directory = os.path.dirname(export_directory) if export_directory else os.getcwd()
+        self.parameters["simulation_path"] = self.simulation_path = os.path.join(directory, export_name)
+        os.makedirs(self.simulation_path, exist_ok=True)
+        # export simulation content
+        self.fluxes.to_csv(os.path.join(self.simulation_path, "fluxes.csv"))
+        self.conc.to_csv(os.path.join(self.simulation_path, "concentrations.csv"))
+        obj_vals_df = pandas.DataFrame([(self.fluxes.columns[index].replace(' min', ''), sol.objective_value)
+                                        for index, sol in enumerate(self.sols)], columns=["min", "objective_value"])
+        obj_vals_df.index = obj_vals_df["min"] ; obj_vals_df.drop(["min"], axis=1, inplace=True)
+        obj_vals_df.to_csv(os.path.join(self.simulation_path, "objective_values.csv"))
+        # export the parameters
+        parameters_table = pandas.DataFrame(self.parameters.items(), columns=["parameter", "value"])
+        parameters_table.to_csv(os.path.join(self.simulation_path, "parameters.csv"))
+        # export the figure
+        self.figure.savefig(os.path.join(self.simulation_path, "changed_concentrations.svg"))
+        if self.verbose and not self.jupyter: self.figure.show()
diff --git a/modelseedpy/community/mssteadycom.py b/modelseedpy/community/mssteadycom.py
new file mode 100644
index 00000000..62988f11
--- /dev/null
+++ b/modelseedpy/community/mssteadycom.py
@@ -0,0 +1,282 @@
+from icecream import ic
+
+from modelseedpy import FBAHelper
+from modelseedpy.core.exceptions import ObjectAlreadyDefinedError, ParameterError, NoFluxError
+# from modelseedpy.community.commhelper import build_from_species_models, CommHelper
+from optlang import Constraint, Variable
+from itertools import combinations
+from optlang.symbolics import Zero
+from pandas import DataFrame, concat
+from matplotlib import pyplot
+from numpy import array
+from IPython.display import display  # display() is called in visual_interactions()
+import networkx
+import sigfig
+import os, re
+
+
+def add_collection_item(met_name, normalized_flux, flux_threshold, ignore_mets,
+                        species_collection, first, second):
+    if flux_threshold and normalized_flux <= flux_threshold: return species_collection
+    if not any([re.search(x, met_name, flags=re.IGNORECASE) for x in ignore_mets]):
+        species_collection[first][second].append(re.sub(r"(_\w\d$)", "", met_name))
+    return species_collection
+
+
+class MSSteadyCom:
+
+    @staticmethod
+    def run_fba(mscommodel, media, pfba=False, fva_reactions=None, ava=False,
+                minMemGrowth: float = 1, interactions=True):
+        # minGrowth = Constraint(name="minMemGrowth", lb=, ub=None)
+        # mscommodel.model.add_cons_vars
+
+        # fix member abundances
+        if not mscommodel.abundances_set:
+            for member in mscommodel.members:
+                member.biomass_cpd.lb = minMemGrowth
+            all_metabolites = {mscommodel.primary_biomass.products[0]: 1}
+            all_metabolites.update({mem.biomass_cpd: 1 / len(mscommodel.members) for mem in mscommodel.members})
+            mscommodel.primary_biomass.add_metabolites(all_metabolites, combine=False)
+        # TODO constrain fluxes to be proportional to the relative abundance
+
+        # TODO constrain the sum of fluxes to be proportional with the abundance
+        sol = mscommodel.run_fba(media, pfba, fva_reactions)
+        if interactions: return MSSteadyCom.interactions(mscommodel, sol)
+        if ava: return MSSteadyCom.abundance_variability_analysis(mscommodel, media)
+        return sol
+
+    @staticmethod
+    def abundance_variability_analysis(mscommodel, media):
+        variability = {}
+        for mem in mscommodel.members:
+            variability[mem.id] = {}
+            # minimal variability
+            mscommodel.set_objective(mem.biomasses, minimize=True)
+            variability[mem.id]["minVar"] = mscommodel.run_fba(media)
+            # maximal variability
+            mscommodel.set_objective(mem.biomasses, minimize=False)
+            variability[mem.id]["maxVar"] = mscommodel.run_fba(media)
+        return variability
+
+    @staticmethod
+    def interactions(
+            mscommodel,  # The MSCommunity object of the model (mandatory to prevent circular imports)
+            solution=None,  # the COBRA simulation solution that will be parsed and visualized
+            media=None,  # The media in which the community model will be simulated
+            # names=None, abundances=None,  # names and abundances of the community species
+            flux_threshold: int = 1,  # The threshold of normalized flux below which a reaction is not plotted
+            msdb=None, msdb_path: str = None,
+            visualize: bool = True,  # specifies whether the net flux will be depicted in a network diagram
+            filename: str = 'cross_feeding',  # Cross-feeding figure export name
+            export_format: str = "svg",
+            node_metabolites: bool = True,  # specifies whether the metabolites of each node will be printed
+            show_figure: bool = True,  # specifies whether the figure will be printed to the console
+            ignore_mets=None  # cross-fed exchanges that will not be displayed in the graphs
+    ):
+        # verify that the model has a solution, and enable multithreading where the solver permits it
+        solver = str(type(mscommodel.util.model.solver))
+        print(f"{solver} model loaded")
+        if "gurobi" in solver: mscommodel.util.model.problem.Params.Threads = os.cpu_count() // 2
+        solution = solution or mscommodel.run_fba(media)
+        if not solution: raise ParameterError("A solution must be provided, from which interactions are computed.")
+        if all(array(list(solution.fluxes.values)) == 0):
+            print(list(solution.fluxes.values))
+            raise NoFluxError("The simulation lacks any flux.")
+
+        # initialize data
+        metabolite_data, species_data, species_collection = {}, {"Environment": {}}, {"Environment": {}}
+        data = {"IDs": [], "Metabolites/Donor": [], "Environment": []}
+        species_list = {}
+
+        # track extracellularly exchanged metabolites
+        exchange_mets_list = mscommodel.util.exchange_mets_list()
+        for met in exchange_mets_list:
+            data["IDs"].append(met.id)
+            data["Metabolites/Donor"].append(re.sub(r"(_\w\d$)", "", met.name))
+            metabolite_data[met.id] = {"Environment": 0}
+            metabolite_data[met.id].update({individual.id: 0 for individual in mscommodel.members})
+
+        # computing net metabolite flux from each reaction
+        # print([mem.id for mem in mscommodel.members])
+        for individual in mscommodel.members:
+            species_data[individual.id], species_collection[individual.id] = {}, {}
+            species_list[individual.index] = individual
+            data[individual.id] = []
+            for other in mscommodel.members:
+                species_data[individual.id][other.id] = 0
+                species_collection[individual.id][other.id] = []
+            species_data["Environment"][individual.id] = species_data[individual.id]["Environment"] = 0
+            species_collection["Environment"][individual.id] = []
+            species_collection[individual.id]["Environment"] = []
+
+        for rxn in mscommodel.util.model.reactions:
+            if rxn.id[0:3] == "EX_":
+                cpd = list(rxn.metabolites.keys())[0]
+                # the Environment takes the opposite perspective to the members
+                metabolite_data[cpd.id]["Environment"] += -solution.fluxes[rxn.id]
+            rxn_index = int(FBAHelper.rxn_compartment(rxn)[1:])
+            if not any([met not in exchange_mets_list for met in rxn.metabolites]
+                       ) or rxn_index not in species_list: continue
+            for met in rxn.metabolites:
+                if met.id not in metabolite_data: continue
+                metabolite_data[met.id][species_list[rxn_index].id] += solution.fluxes[rxn.id]*rxn.metabolites[met]
+
+        # 
translating net metabolite flux into species interaction flux + ignore_mets = ignore_mets if ignore_mets is not None else ["h2o_e0", "co2_e0"] + for met in exchange_mets_list: + #Iterating through the metabolite producers + # TODO Why are fluxes normalized? + total = sum([max([metabolite_data[met.id][individual.id], 0]) for individual in mscommodel.members + ]) + max([metabolite_data[met.id]["Environment"], 0]) + for individual in mscommodel.members: + ## calculate metabolic consumption of a species from the environment + if metabolite_data[met.id][individual.id] < Zero: + if metabolite_data[met.id]["Environment"] <= Zero: continue + normalized_flux = abs(metabolite_data[met.id][individual.id] + * metabolite_data[met.id]["Environment"]) / total + species_data["Environment"][individual.id] += normalized_flux + species_collection = add_collection_item(met.name, normalized_flux, flux_threshold, ignore_mets, + species_collection, "Environment", individual.id) + ## calculate and track metabolic donations between a member and another or the environment + elif metabolite_data[met.id][individual.id] > Zero: + for other in mscommodel.members: + ### filter against organisms that do not consume + if metabolite_data[met.id][other.id] >= Zero: continue + normalized_flux = abs(metabolite_data[met.id][individual.id] + * metabolite_data[met.id][other.id])/total + species_data[individual.id][other.id] += normalized_flux + species_collection = add_collection_item(met.name, normalized_flux, flux_threshold, ignore_mets, + species_collection, individual.id, other.id) + ## calculate donations to the environment + if metabolite_data[met.id]["Environment"] >= Zero: continue + normalized_flux = abs(metabolite_data[met.id][individual.id] + * metabolite_data[met.id]["Environment"])/total + species_data[individual.id]["Environment"] += normalized_flux + species_collection = add_collection_item(met.name, normalized_flux, flux_threshold, ignore_mets, + species_collection, individual.id, "Environment") + + # construct the dataframes + for metID in metabolite_data: + for individual in mscommodel.members: + data[individual.id].append(metabolite_data[metID][individual.id]) + data["Environment"].append(metabolite_data[metID]["Environment"]) + + ## process the fluxes dataframe + data["IDs"].append("zz_Environment") + data["Metabolites/Donor"].append(0) + for individual in mscommodel.members: + data[individual.id].append(species_data["Environment"][individual.id]) + data["Environment"].append(0) + for individual in mscommodel.members: + for other in mscommodel.members: + data[individual.id].append(species_data[individual.id][other.id]) + data["Environment"].append(species_data[individual.id]["Environment"]) + data["IDs"].append(f"zz_Species{individual.index}") + data["Metabolites/Donor"].append(individual.id) + + # if len(set(list(map(len, list(data.values()))))) != 1: + # print([(col, len(content)) for col, content in data.items()]) + cross_feeding_df = DataFrame(data) + cross_feeding_df.index = [ID.replace("_e0", "") for ID in map(str, cross_feeding_df["IDs"])] + cross_feeding_df.index.name = "Metabolite/Donor ID" + cross_feeding_df.drop(['IDs', "Metabolites/Donor"], axis=1, inplace=True) + cross_feeding_df = cross_feeding_df.loc[(cross_feeding_df != 0).any(axis=1)] + cross_feeding_df.sort_index(inplace=True) + + ## process the identities dataframe + exchanged_mets = {"Environment": [" "], "Donor ID": ["Environment"]} + exchanged_mets.update({ind.id: [] for ind in mscommodel.members}) + for individual in mscommodel.members: 
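+            ## each appended entry is a "; "-joined list of the metabolites that pass between
+            ## this member, every other member, and the Environment, mirroring the flux totals above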
+            ### environment exchanges
+            exchanged_mets[individual.id].append("; ".join(species_collection["Environment"][individual.id]))
+            exchanged_mets["Environment"].append("; ".join(species_collection[individual.id]["Environment"]))
+            ### member exchanges
+            exchanged_mets["Donor ID"].append(individual.id)
+            for other in mscommodel.members:
+                exchanged_mets[individual.id].append("; ".join(species_collection[individual.id][other.id]))
+
+        # if len(set(list(map(len, list(exchanged_mets.values()))))) != 1:
+        #     print([(col, len(content)) for col, content in exchanged_mets.items()])
+        exMets_df = DataFrame(exchanged_mets)
+        exMets_df.index = [ID.replace("_e0", "") for ID in map(str, exMets_df["Donor ID"])]
+        exMets_df.index.name = "Donor ID"
+        exMets_df.drop(["Donor ID"], axis=1, inplace=True)
+        exMets_df.sort_index(inplace=True)
+        exMets_df = exMets_df.fillna(" ")
+
+        # graph the network diagram
+        if visualize: MSSteadyCom.visual_interactions(cross_feeding_df, filename, export_format,
+                                                      msdb, msdb_path, show_figure, node_metabolites)
+
+        return cross_feeding_df, exMets_df
+
+    @staticmethod
+    def visual_interactions(cross_feeding_df, filename="cross_feeding", export_format="svg", msdb=None,
+                            msdb_path=None, view_figure=True, node_metabolites=True):
+        # load the MSDB
+        assert msdb or msdb_path, "Either the MSDB object or the local MSDB path must be provided"
+        from modelseedpy.biochem import from_local
+        msdb = msdb or from_local(msdb_path)
+        # construct the structure of the cross-feeding DataFrame
+        if "Metabolite/Donor ID" in cross_feeding_df.columns:
+            cross_feeding_df.index = [metID.replace("_e0", "") for metID in cross_feeding_df["Metabolite/Donor ID"].values]
+            cross_feeding_df.index.name = "Metabolite/Donor ID"
+            cross_feeding_df.drop([col for col in cross_feeding_df.columns if "ID" in col], axis=1, inplace=True)
+        else: cross_feeding_df.index = [metID.replace("_e0", "") for metID in cross_feeding_df.index]
+        # define the cross-fed metabolites
+        cross_feeding_rows = []
+        for index, row in cross_feeding_df.iterrows():
+            positive = negative = False
+            for col, val in row.items():
+                if col not in ["Environment"]:
+                    if val > 1e-4: positive = True
+                    elif val < -1e-4: negative = True
+                if negative and positive: cross_feeding_rows.append(row) ; break
+        metabolites_df = concat(cross_feeding_rows, axis=1).T
+        metabolites_df.index.name = "Metabolite ID"
+        display(metabolites_df)
+        metabolites = [msdb.compounds.get_by_id(metID.replace("_e0", "")) for metID in metabolites_df.index.tolist()
+                       if metID not in ["cpdETCM", "cpdETCMe"]]
+        # define the community members that participate in cross-feeding
+        members = metabolites_df.loc[:, (metabolites_df != 0).any(axis=0)].columns.tolist()
+        members.remove("Environment")
+        members_cluster1, members_cluster2 = members[:int(len(members) / 2)], members[int(len(members) / 2):]
+
+        # TODO define a third node tier of just the environment as a rectangle that spans the width of the members,
+        ## which may alleviate much of the ambiguity about mass imbalance between the member fluxes
+        import graphviz
+        dot = graphviz.Digraph(filename, format=export_format)  # directed graph
+        # define nodes
+        ## top-layer members
+        # TODO hyperlink the member nodes with their Narrative link
+        dot.attr('node', shape='rectangle', color="lightblue2", style="filled")
+        for mem in members_cluster1:
+            index = members.index(mem)
+            dot.node(f"S{index}", mem)
+        ## mets in the middle layer
+        with dot.subgraph(name="mets") as mets_subgraph:
+            mets_subgraph.attr(rank="same")
+            mets_subgraph.attr('node', shape='circle', color="green", style="filled")
+            for met in metabolites:
+                mets_subgraph.node(met.abbr[:3], fixedsize="true", height="0.4", tooltip=f"{met.id} ; {met.name}",
+                                   URL=f"https://modelseed.org/biochem/compounds/{met.id}")
+        ## bottom-layer members
+        with dot.subgraph(name="members") as members_subgraph:
+            members_subgraph.attr(rank="same")
+            for mem in members_cluster2:
+                index = members.index(mem)
+                members_subgraph.node(f"S{index}", mem)
+        # define the edges by parsing the interaction DataFrame
+        for met in metabolites:
+            row = metabolites_df.loc[met.id]
+            maxVal = max(list(row.to_numpy()))
+            for col, val in row.items():
+                if col == "Environment": continue
+                index = members.index(col)
+                # TODO color carbon sources red
+                if val > 0: dot.edge(f"S{index}", met.abbr[:3], arrowsize=f"{val / maxVal}", edgetooltip=str(val))
+                if val < 0: dot.edge(met.abbr[:3], f"S{index}", arrowsize=f"{abs(val / maxVal)}", edgetooltip=str(val))
+
+        # render and export the source
+        dot.render(filename, view=view_figure)
+        return dot.source
diff --git a/modelseedpy/community/steadycom_template.html b/modelseedpy/community/steadycom_template.html
new file mode 100644
index 00000000..b894c7f7
--- /dev/null
+++ b/modelseedpy/community/steadycom_template.html
@@ -0,0 +1,54 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>SteadyCom Results</title>
+</head>
+<body>
+</body>
+</html>
+ + + + \ No newline at end of file diff --git a/modelseedpy/core/msmodelutl.py b/modelseedpy/core/msmodelutl.py index da65a4f3..d8c0a6c9 100644 --- a/modelseedpy/core/msmodelutl.py +++ b/modelseedpy/core/msmodelutl.py @@ -1,76 +1,305 @@ +# -*- coding: utf-8 -*- import logging import re +import time +import json +import sys +import pandas as pd +import cobra from cobra import Model, Reaction, Metabolite +from cobra.io.json import from_json, to_json from modelseedpy.fbapkg.mspackagemanager import MSPackageManager +from modelseedpy.core.exceptions import * +from modelseedpy.core.fbahelper import FBAHelper +from itertools import chain +from optlang.symbolics import Zero +from optlang import Constraint, Objective +from math import isclose +from multiprocessing import Value + +# from builtins import None logger = logging.getLogger(__name__) +logger.setLevel( + logging.INFO +) # When debugging - set this to INFO then change needed messages below from DEBUG to INFO -def search_name(name): - name = name.lower() - name = re.sub(r'_[a-z]\d*$', '', name) - name = re.sub(r'\W+', '', name) - return name class MSModelUtil: + mdlutls = {} + + @staticmethod + def metabolite_msid(metabolite): + if re.search("^(cpd\d+)", metabolite.id): + m = re.search("^(cpd\d+)", metabolite.id) + return m[1] + for anno in metabolite.annotation: + if isinstance(metabolite.annotation[anno], list): + for item in metabolite.annotation[anno]: + if re.search("^(cpd\d+)", item): + m = re.search("^(cpd\d+)", item) + return m[1] + elif re.search("^(cpd\d+)", metabolite.annotation[anno]): + m = re.search("^(cpd\d+)", metabolite.annotation[anno]) + return m[1] + return None + + @staticmethod + def reaction_msid(reaction): + if re.search("^(rxn\d+)", reaction.id): + m = re.search("^(rxn\d+)", reaction.id) + return m[1] + for anno in reaction.annotation: + if isinstance(reaction.annotation[anno], list): + for item in reaction.annotation[anno]: + if re.search("^(rxn\d+)", item): + m = re.search("^(rxn\d+)", item) + return m[1] + elif re.search("^(rxn\d+)", reaction.annotation[anno]): + m = re.search("^(rxn\d+)", reaction.annotation[anno]) + return m[1] + return None + + @staticmethod + def stoichiometry_to_string(stoichiometry): + reactants, products = [], [] + for met in stoichiometry: + stoich = stoichiometry[met] + if not isinstance(met, str): + met = ( + None + if FBAHelper.modelseed_id_from_cobra_metabolite(met) == "cpd00067" + else met.id + ) + if met: + if stoich < 0: + reactants.append(met) + else: + products.append(met) + return [ + "+".join(sorted(reactants)) + "=" + "+".join(sorted(products)), + "+".join(sorted(products)) + "=" + "+".join(sorted(reactants)), + ] + + @staticmethod + def search_name(name): + name = name.lower() + name = re.sub(r"_[a-z]\d*$", "", name) + name = re.sub(r"\W+", "", name) + return name + + @staticmethod + def get(model, create_if_missing=True): + if isinstance(model, MSModelUtil): + return model + if model in MSModelUtil.mdlutls: + return MSModelUtil.mdlutls[model] + elif create_if_missing: + MSModelUtil.mdlutls[model] = MSModelUtil(model) + return MSModelUtil.mdlutls[model] + else: + return None + - def __init__(self,model): + ########################### CLASS METHODS ########################### + + def __init__(self, model, copy=False, environment=None): self.model = model - self.pkgmgr = MSPackageManager.get_pkg_mgr(model) + if environment is not None: self.add_medium(environment) + self.id = model.id + if copy: + org_obj_val = model.slim_optimize() + self.model = model.copy() ; 
self.model.objective = model.objective
+            new_obj_val = self.model.slim_optimize()
+            if not isclose(org_obj_val, new_obj_val, rel_tol=1e-2) and org_obj_val > 1e-2:
+                raise ModelError(f"The {model.id} objective value was corrupted by the copy,"
+                                 f" where the original objective value is {org_obj_val}"
+                                 f" and the new objective value is {new_obj_val}.")
+        self.pkgmgr = MSPackageManager.get_pkg_mgr(self.model)
+        self.wsid = None
+        self.atputl = None
+        self.gfutl = None
         self.metabolite_hash = None
         self.search_metabolite_hash = None
+        self.test_objective = None
+        self.reaction_scores = None
+        self.score = None
+        self.integrated_gapfillings = []
+        self.attributes = {}
+        if hasattr(self.model, "computed_attributes"):
+            if self.model.computed_attributes:
+                self.attributes = self.model.computed_attributes
+        if "pathways" not in self.attributes:
+            self.attributes["pathways"] = {}
+        if "auxotrophy" not in self.attributes:
+            self.attributes["auxotrophy"] = {}
+        if "fbas" not in self.attributes:
+            self.attributes["fbas"] = {}
+
+    def compute_automated_reaction_scores(self):
+        """
+        Computes reaction scores automatically from model data
+        :return:
+        """
+        self.reaction_scores = {}
+
+    def add_timeout(self, timeout_s=10):
+        # optlang exposes solver timeouts through the model's configuration object
+        self.model.solver.configuration.timeout = timeout_s
+
+    def printlp(self, lpfilename="debug.lp"):
+        with open(lpfilename, "w") as out:
+            out.write(str(self.model.solver))
+
     def build_metabolite_hash(self):
         self.metabolite_hash = {}
         self.search_metabolite_hash = {}
         for met in self.model.metabolites:
-            self.add_name_to_metabolite_hash(met.id,met)
-            self.add_name_to_metabolite_hash(met.name,met)
+            self.add_name_to_metabolite_hash(met.id, met)
+            self.add_name_to_metabolite_hash(met.name, met)
             for anno in met.annotation:
                 if isinstance(met.annotation[anno], list):
                     for item in met.annotation[anno]:
-                        self.add_name_to_metabolite_hash(item,met)
+                        self.add_name_to_metabolite_hash(item, met)
                 else:
-                    self.add_name_to_metabolite_hash(met.annotation[anno],met)
-
-    def add_name_to_metabolite_hash(self,name,met):
+                    self.add_name_to_metabolite_hash(met.annotation[anno], met)
+
+    def add_name_to_metabolite_hash(self, name, met):
         if name not in self.metabolite_hash:
             self.metabolite_hash[name] = []
         self.metabolite_hash[name].append(met)
-        sname = search_name(name)
+        sname = MSModelUtil.search_name(name)
         if sname not in self.search_metabolite_hash:
             self.search_metabolite_hash[sname] = []
         self.search_metabolite_hash[sname].append(met)
-
-    def find_met(self,name):
+
+    def find_met(self, name, compartment=None):
         if self.metabolite_hash == None:
             self.build_metabolite_hash()
         if name in self.metabolite_hash:
-            return self.metabolite_hash[name]
-        sname = search_name(name)
+            if not compartment: return self.metabolite_hash[name]
+            for met in self.metabolite_hash[name]:
+                array = met.id.split("_")
+                if (len(array) > 1 and array[1] == compartment) or met.compartment == compartment: return [met]
+            return []
+        sname = MSModelUtil.search_name(name)
         if sname in self.search_metabolite_hash:
-            return self.search_metabolite_hash[sname]
-        logger.info(name," not found in model!")
+            if not compartment: return self.search_metabolite_hash[sname]
+            for met in self.search_metabolite_hash[sname]:
+                array = met.id.split("_")
+                if (len(array) > 1 and array[1] == compartment) or met.compartment == compartment: return [met]
+            return []
+        logger.info(name + " not found in model!")
         return []
-
+
+    def rxn_hash(self):
+        output = {}
+        for rxn in self.model.reactions:
+            strings = MSModelUtil.stoichiometry_to_string(rxn.metabolites)
+            output[strings[0]] = [rxn, 1]
+            output[strings[1]] = [rxn, -1]
+        return output
+
+    def find_reaction(self, stoichiometry):
+        output = MSModelUtil.stoichiometry_to_string(stoichiometry)
+        atpstring = output[0]
+        rxn_hash = self.rxn_hash()
+        if atpstring in rxn_hash:
+            return rxn_hash[atpstring]
+        return None
+
+    def msid_hash(self):
+        output = {}
+        for cpd in self.model.metabolites:
+            msid = MSModelUtil.metabolite_msid(cpd)
+            if msid != None:
+                if msid not in output: output[msid] = []
+                output[msid].append(cpd)
+        return output
+
     def exchange_list(self):
-        exchange_reactions = []
-        for reaction in self.model.reactions:
-            if reaction.id[:3] == 'EX_':
-                exchange_reactions.append(reaction)
-        return exchange_reactions
+        return [rxn for rxn in self.model.reactions if rxn.id.startswith('EX_')]
 
+    def internal_list(self):
+        exchanges, transports = self.exchange_list(), self.transport_list()
+        return [rxn for rxn in self.model.reactions if rxn not in exchanges and rxn not in transports]
+
+    def transport_list(self):
+        all_transports = [rxn for rxn in self.model.reactions if len(set([
+            met.id.split("_")[0] for met in rxn.reactants]).intersection(set([
+                met.id.split("_")[0] for met in rxn.products]))) > 0]
+        # TODO look for compounds that have compounds in different compartments
+        # TODO PTS transporters would fail this logic
+        # remove biomass reactions, iterating over a copy so that removal is safe
+        for rxn in list(all_transports):
+            if "cpd11416" in [met.id.split("_")[0] for met in rxn.metabolites]: all_transports.remove(rxn)
+        return all_transports
+
+    def carbon_mets(self):
+        return [met for met in self.model.metabolites if 'C' in met.elements]
+
+    def carbon_exchange_list(self, include_unknown=True):
+        if not include_unknown:
+            return [ex for ex in self.exchange_list() if "C" in ex.reactants[0].elements]
+        return [ex for ex in self.exchange_list() if not ex.reactants[0].elements or "C" in ex.reactants[0].elements]
+
+    def carbon_exchange_mets_list(self, include_unknown=True):
+        return self.metabolites_set(self.carbon_exchange_list(include_unknown))
+
+    def exchange_mets_list(self):
+        return self.metabolites_set(self.exchange_list())
+
+    def media_exchanges_list(self):
+        return [exRXN for exRXN in self.exchange_list() if exRXN.id in self.model.medium]
+
+    def metabolites_set(self, reactions_set=None, ids=False):
+        rxns = reactions_set or self.model.reactions
+        if ids: return {met.id for rxn in rxns for met in rxn.metabolites}
+        return {met for rxn in rxns for met in rxn.metabolites}
+
+    def bio_rxns_list(self):
+        return [rxn for rxn in self.model.reactions if re.search(r"(^bio\d+)", rxn.id)]
+
+    def compatibilize(self, conflicts_file_name="orig_conflicts.json", printing=False):
+        from commscores import GEMCompatibility
+        self.model = GEMCompatibility.standardize(
+            [self.model], conflicts_file_name=conflicts_file_name, printing=printing)[0]
+        return self.model
+
+    def standard_exchanges(self):
+        for ex in self.exchange_list():
+            if len(ex.reactants) != 1 or len(ex.products) != 0:
+                raise ModelError(f"The exchange {ex.id} possesses {len(ex.reactants)} reactants and "
+                                 f"{len(ex.products)} products, which are non-standard and are incompatible"
+                                 f" with various ModelSEED operations.")
+
+    def nonexchange_reaction_count(self):
+        count = 0
+        for reaction in self.model.reactions:
+            if (
+                reaction.id[:3] != "EX_"
+                and reaction.id[:3] != "SK_"
+                and reaction.id[:3] != "DM_"
+                and reaction.id[:3] != "bio"
+            ):
+                if reaction.upper_bound > 0 or reaction.lower_bound < 0:
+                    count += 1
+        return count
+
+    def exchange_hash(self):
         exchange_reactions = {}
-        exlist = self.exchange_list()
-        for reaction in exlist:
-            for met in reaction.metabolites:
-                if reaction.metabolites[met] == -1:
-                    exchange_reactions[met] = reaction
+        for ex_rxn in self.exchange_list():
+            for met in ex_rxn.metabolites:
+                if ex_rxn.metabolites[met] == -1:
+                    exchange_reactions[met] = ex_rxn
                 else:
-                    logger.warn("Nonstandard exchange reaction ignored:"+reaction.id)
+                    logger.warning("Nonstandard exchange reaction ignored:" + ex_rxn.id)
         return exchange_reactions
-
-    def add_missing_exchanges(self,media):
+
+    def var_names_list(self):
+        return [var.name for var in self.model.variables]
+
+    def add_missing_exchanges(self, media):
         output = []
         exchange_hash = self.exchange_hash()
         exchange_list = []
@@ -78,69 +307,289 @@ def add_missing_exchanges(self,media):
         for mediacpd in media.mediacompounds:
             mets = self.find_met(mediacpd.id)
             if len(mets) > 0:
-                found = 0
+                found = False
                 cpd = None
                 for met in mets:
                     if met in exchange_hash:
-                        found = 1
+                        found = True
                     elif met.compartment[0:1] == "c":
-                        #We prefer to add a transport for the cytosol compound
+                        # We prefer to add a transport for the cytosol compound
                         cpd = met
                 if cpd == None:
-                    #No cytosol compound exists so choosing the first version we found that does exist
+                    # No cytosol compound exists so choosing the first version we found that does exist
                     cpd = mets[0]
-                if found == 0:
+                if not found:
                     #No transporter currently exists - adding exchange reaction for the compound that does exist
                     output.append(cpd.id)
                     exchange_list.append(cpd)
         if len(exchange_list) > 0:
             self.add_exchanges_for_metabolites(exchange_list)
         return output
-
-    def add_exchanges_for_metabolites(self,cpds,uptake=0,excretion=0,prefix='EX_', prefix_name='Exchange for '):
+
+    def add_exchanges_for_metabolites(
+        self, cpds, uptake=0, excretion=0, prefix="EX_", prefix_name="Exchange for "
+    ):
         drains = []
         for cpd in cpds:
-            drain_reaction = Reaction(id=f'{prefix}{cpd.id}',
-                                      name=prefix_name + cpd.name,
-                                      lower_bound=-1*uptake,
-                                      upper_bound=excretion)
-            drain_reaction.add_metabolites({cpd : -1})
-            drain_reaction.annotation["sbo"] = 'SBO:0000627'
-            drains.append(drain_reaction)
+            drain_reaction = Reaction(
+                id=f"{prefix}{cpd.id}",
+                name=prefix_name + cpd.name,
+                lower_bound=-1 * uptake,
+                upper_bound=excretion,
+            )
+            drain_reaction.add_metabolites({cpd: -1})
+            drain_reaction.annotation["sbo"] = "SBO:0000627"
+            if drain_reaction.id not in self.model.reactions:
+                drains.append(drain_reaction)
         self.model.add_reactions(drains)
         return drains
-
-    def reaction_scores(self):
+
+    def reaction_scores(self):  #!!! Can this be deleted?
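+        # NOTE: the reaction_scores attribute assigned in __init__ shadows this method on
+        # instances, so attribute access returns that value instead of calling this stub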
         return {}
-
-    #Required this function to add gapfilled compounds to a KBase model for saving gapfilled model
-    def convert_cobra_compound_to_kbcompound(self,cpd,kbmodel,add_to_model = 1):
+
+    #################################################################################
+    # Functions related to editing the model
+    #################################################################################
+    def get_attributes(self, key=None, default=None):
+        if not key:
+            return self.attributes
+        if key not in self.attributes:
+            self.attributes[key] = default
+        return self.attributes[key]
+
+    def save_attributes(self, value=None, key=None):
+        if value:
+            if key:
+                self.attributes[key] = value
+            else:
+                self.attributes = value
+        if hasattr(self.model, "computed_attributes"):
+            logger.info(
+                "Setting FBAModel computed_attributes to mdlutl attributes"
+            )
+            self.attributes["gene_count"] = len(self.model.genes)
+            self.model.computed_attributes = self.attributes
+
+    def add_ms_reaction(self, rxn_dict, msdb_path=None, msdb_object=None, comp_trans=["c0", "e0"]):
+        if msdb_object: modelseed = msdb_object
+        else:
+            # from modelseedpy.biochem.modelseed_biochem import ModelSEEDBiochem
+            from modelseedpy.biochem import from_local
+            # modelseed = ModelSEEDBiochem.get()
+            modelseed = from_local(msdb_path)
+        output = []
+        model_mets = self.metabolites_set(ids=True)
+        for rxnid, comp in rxn_dict.items():
+            fullid = f"{rxnid}_{comp}"
+            rxn = modelseed.reactions.get_by_id(rxnid)
+            new_reaction = Reaction(id=fullid, name=f"{rxn.name}_{comp}")
+            metabolites_to_add = {}
+            for met, stoich in rxn.metabolites.items():
+                comp_num = FBAHelper.compartment_index(met.id)
+                if comp_num >= len(comp_trans):
+                    logger.critical(f"The compartment index {comp_num} is out of range")
+                comp_str = comp_trans[comp_num]
+                met_output = self.find_met(met.id, comp_str)
+                new_met = Metabolite(f"{met.id}_{comp_str}", name=f"{met.name}_{comp_str}",
+                                     compartment=comp_str) if not met_output else met_output[0]
+                metabolites_to_add[new_met] = stoich
+                if new_met.id not in model_mets: self.model.add_metabolites([new_met])
+            new_reaction.add_metabolites(metabolites_to_add)
+            output.append(new_reaction)
+            print(f"The {new_reaction.id} reaction is defined.")
+        self.model.add_reactions(output)
+        print(f"{len(output)} reactions and {len(self.model.metabolites)-len(model_mets)} metabolites"
+              f" were added to the model.")
+        return output
+
+    def create_constraint(self, constraint, coef=None, sloppy=False, printing=False):
+        if printing: print(coef)
+        self.model.add_cons_vars(constraint, sloppy=sloppy)
+        self.model.solver.update()
+        if coef: constraint.set_linear_coefficients(coef)
+        self.model.solver.update()
+
+        # self.model.solver.update()
+        # for cons in self.model.constraints:
+        #     if cons.name == constraint.name:
+        #         cons.set_linear_coefficients(coef)
+        #         self.model.solver.update()
+
+        # self.model.add_cons_vars(constraint, sloppy=sloppy)
+        # self.model.solver.update()
+
+    def add_cons_vars(self, vars_cons, sloppy=False):
+        self.model.add_cons_vars(vars_cons, sloppy=sloppy)
+        self.model.solver.update()
+
+    def remove_cons_vars(self, vars_cons):
+        self.model.remove_cons_vars(vars_cons)
+        self.model.solver.update()
+
+    def add_objective(self, objective, direction="max", coef=None):
+        self.model.objective = Objective(objective, direction=direction)
+        self.model.solver.update()
+        if coef:
+            self.model.objective.set_linear_coefficients(coef)
+            self.model.solver.update()
+
+    def set_objective_from_target_reaction(self, target_reaction, minimize=False):
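+        # resolve the reaction ID to its object, then set the model objective to maximize
+        # (or minimize) that reaction's flux expression
+        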
target_reaction = self.model.reactions.get_by_id(target_reaction) + sense = "max" if not minimize else "min" + self.model.objective = self.model.problem.Objective( + target_reaction.flux_expression, direction=sense + ) + return target_reaction + + def biomass_expression(self): + for met in self.model.metabolites: + if "cpd11416" in met.id: + # returns the biomass expression of the lowest cytoplasmic compartment + return met.constraint.expression + + def add_minimal_objective_cons(self, min_value=0.1, objective_expr=None): + if "min_value" not in self.model.constraints: + objective_expr = objective_expr or self.model.objective.expression + self.create_constraint(Constraint(objective_expr, lb=min_value, ub=None, name="min_value")) + # print(self.model.constraints["min_value"]) + else: + print(f"The min_value constraint already exists in {self.model.id}, " + f"hence the lb is simply updated from" + f" {self.model.constraints['min_value'].lb} to {min_value}.\n") + self.model.constraints["min_value"].lb = min_value + + def add_exchange_to_model(self, cpd, rxnID): + self.model.add_boundary(metabolite=Metabolite(id=cpd.id, name=cpd.name, compartment="e0"), + reaction_id=rxnID, type="exchange", lb=cpd.minFlux, ub=cpd.maxFlux) + + def update_model_media(self, media): + medium = self.model.medium + model_reactions = [rxn.id for rxn in self.model.reactions] + for cpd in media.data["mediacompounds"]: + ex_rxn = f"EX_{cpd.id}_e0" + if ex_rxn not in model_reactions: + self.add_exchange_to_model(cpd, ex_rxn) + medium[ex_rxn] = cpd.maxFlux + self.model.medium = medium + return self.model + + ################################################################################# + # Functions related to utility functions + ################################################################################# + def build_model_data_hash(self): + data = { + "Model": self.id, + "Genome": self.genome.info.metadata["Name"], + "Genes": self.genome.info.metadata["Number of Protein Encoding Genes"], + } + return data + + def compare_reactions(self, reaction_list, filename): + data = {} + for rxn in reaction_list: + for met in rxn.metabolites: + if met.id not in data: + data[met.id] = {} + for other_rxn in reaction_list: + data[met.id][other_rxn.id] = 0 + data[met.id][rxn.id] = rxn.metabolites[met] + df = pd.DataFrame(data) + df = df.transpose() + df.to_csv(filename) + + ################################################################################# + # Functions related to managing biomass reactions + ################################################################################# + def evaluate_biomass_reaction_mass(self, biomass_rxn_id, normalize=False): + biorxn = self.model.reactions.get_by_id(biomass_rxn_id) + # First computing energy biosynthesis coefficients + atp = None + atp_compounds = { + "cpd00002": -1, + "cpd00001": -1, + "cpd00008": 1, + "cpd00009": 1, + "cpd00067": 1, + } + mass_compounds = {"cpd11463": 1, "cpd11461": 1, "cpd11462": 1} + process_compounds = {"cpd17041": 1, "cpd17042": 1, "cpd17043": 1} + for met in biorxn.metabolites: + msid = self.metabolite_msid(met) + if msid == "cpd00008": + atp = abs(biorxn.metabolites[met]) + # Computing non ATP total mass + total = 0 + for met in biorxn.metabolites: + msid = self.metabolite_msid(met) + if msid == "cpd11416": + continue + coef = biorxn.metabolites[met] + if msid in mass_compounds: + total += coef + elif msid in process_compounds: + total += 0 + else: + mw = FBAHelper.metabolite_mw(met) + if msid in atp_compounds: + if coef < 0: + coef += atp + else: + 
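# cancel the growth-associated ATP hydrolysis so that only each energy compound's
+                        # residual biosynthetic coefficient contributes to the mass total
+                        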
coef += -1 * atp + total += mw * coef / 1000 + return {"ATP": atp, "Total": total} + + # Required this function to add gapfilled compounds to a KBase model for saving gapfilled model + # adding gapfilling compounds to a KBase model saves gapfilled models + def convert_cobra_compound_to_kbcompound(self, cpd, kbmodel, add_to_model=1): refid = "cpd00000" - if re.search('cpd\d+_[a-z]+',cpd.id): + if re.search("cpd\d+_[a-z]+", cpd.id): refid = cpd.id - refid = re.sub("_[a-z]\d+$","",refid) + refid = re.sub("_[a-z]\d+$", "", refid) cpd_data = { "aliases": [], "charge": cpd.charge, - "compound_ref": "~/template/compounds/id/"+refid, + "compound_ref": "~/template/compounds/id/" + refid, "dblinks": {}, "formula": cpd.formula, "id": cpd.id, - "modelcompartment_ref": "~/modelcompartments/id/"+cpd.id.split("_").pop(), + "modelcompartment_ref": "~/modelcompartments/id/" + cpd.id.split("_").pop(), "name": cpd.name, "numerical_attributes": {}, - "string_attributes": {} + "string_attributes": {}, } if add_to_model == 1: kbmodel["modelcompounds"].append(cpd_data) return cpd_data - #Required this function to add gapfilled reactions to a KBase model for saving gapfilled model - def convert_cobra_reaction_to_kbreaction(self,rxn,kbmodel,cpd_hash,direction = "=",add_to_model = 1,reaction_genes = None): + def compute_flux_values_from_variables(self): + """Returns a hash of reaction fluxes from model object + + Parameters + ---------- + + Returns + ------- + dict + Hash of reactions and their associated flux values + + Raises + ------ + """ + flux_values = {} + for rxn in self.model.reactions: + flux_values[rxn.id] = { + "reverse": rxn.reverse_variable.primal, + "forward": rxn.forward_variable.primal, + } + return flux_values + + # Required this function to add gapfilled reactions to a KBase model for saving gapfilled model + def convert_cobra_reaction_to_kbreaction( + self, rxn, kbmodel, cpd_hash, direction="=", add_to_model=1, reaction_genes=None + ): rxnref = "~/template/reactions/id/rxn00000_c" - if re.search('rxn\d+_[a-z]+',rxn.id): - rxnref = "~/template/reactions/id/"+rxn.id - rxnref = re.sub("\d+$","",rxnref) + if re.search("rxn\d+_[a-z]+", rxn.id): + rxnref = re.sub("\d+$","",f"~/template/reactions/id/{rxn.id}") rxn_data = { "id": rxn.id, "aliases": [], @@ -152,84 +601,723 @@ def convert_cobra_reaction_to_kbreaction(self,rxn,kbmodel,cpd_hash,direction = " "maxrevflux": 1000000, "modelReactionProteins": [], "modelReactionReagents": [], - "modelcompartment_ref": "~/modelcompartments/id/"+rxn.id.split("_").pop(), + "modelcompartment_ref": "~/modelcompartments/id/" + rxn.id.split("_").pop(), "name": rxn.name, "numerical_attributes": {}, "probability": 0, "protons": 0, "reaction_ref": rxnref, - "string_attributes": {} + "string_attributes": {}, } for cpd in rxn.metabolites: if cpd.id not in kbmodel["modelcompounds"]: - cpd_hash[cpd.id] = self.convert_cobra_compound_to_kbcompound(cpd,kbmodel,1) - rxn_data["modelReactionReagents"].append({ - "coefficient" : rxn.metabolites[cpd], - "modelcompound_ref" : "~/modelcompounds/id/"+cpd.id - }) + cpd_hash[cpd.id] = self.convert_cobra_compound_to_kbcompound( + cpd, kbmodel, 1 + ) + rxn_data["modelReactionReagents"].append( + { + "coefficient": rxn.metabolites[cpd], + "modelcompound_ref": "~/modelcompounds/id/" + cpd.id, + } + ) if reaction_genes != None and rxn.id in reaction_genes: best_gene = None for gene in reaction_genes[rxn.id]: - if best_gene == None or reaction_genes[rxn.id][gene] > reaction_genes[rxn.id][best_gene]: + if ( + best_gene == None + or 
reaction_genes[rxn.id][gene] > reaction_genes[rxn.id][best_gene] + ): best_gene = gene - rxn_data["modelReactionProteins"] = [{"note":"Added from gapfilling","modelReactionProteinSubunits":[],"source":"Unknown"}] - rxn_data["modelReactionProteins"][0]["modelReactionProteinSubunits"] = [{"note":"Added from gapfilling","optionalSubunit":0,"triggering":1,"feature_refs":["~/genome/features/id/"+best_gene],"role":"Unknown"}] + rxn_data["modelReactionProteins"] = [ + { + "note": "Added from gapfilling", + "modelReactionProteinSubunits": [], + "source": "Unknown", + } + ] + rxn_data["modelReactionProteins"][0]["modelReactionProteinSubunits"] = [ + { + "note": "Added from gapfilling", + "optionalSubunit": 0, + "triggering": 1, + "feature_refs": ["~/genome/features/id/" + best_gene], + "role": "Unknown", + } + ] if add_to_model == 1: kbmodel["modelreactions"].append(rxn_data) return rxn_data - - def add_gapfilling_solution_to_kbase_model(self,newmodel,gapfilled_reactions,gfid=None,media_ref = None,reaction_genes = None): - rxn_table = [] - gapfilling_obj = None - if gfid == None: - largest_index = 0 - for gapfilling in newmodel["gapfillings"]: - current_index = int(gapfilling["id"].split(".").pop()) - if largest_index == 0 or largest_index < current_index: - largest_index = current_index - largest_index += 1 - gfid = "gf."+str(largest_index) - else: - for gapfilling in newmodel["gapfillings"]: - if gapfilling["id"] == gfid: - gapfilling_obj = gapfilling - if gapfilling_obj == None: + + ################################################################################# + # Functions related to gapfilling of models + ################################################################################# + """Tests if every reaction in a given gapfilling solution is actually needed for growth + Optionally can remove unneeded reactions from the model AND the solution object. + Note, this code assumes the gapfilling solution is already integrated. 
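+    Each reaction in the solution is knocked out in turn (its upper bound for ">", otherwise
+    its lower bound) and the model is re-optimized; the reaction is reported as unneeded
+    only when the objective remains at or above solution["minobjective"].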
+
+    Parameters
+    ----------
+    {"new":{string reaction_id: string direction},"reversed":{string reaction_id: string direction}} - solution
+        Data for gapfilling solution to be tested
+    bool - keep_changes
+        Set this bool to True to remove the unneeded reactions from the solution and model
+    Returns
+    -------
+    list<[string reaction_id, string direction, string solution_key]>
+        List of unneeded reactions
+
+    Raises
+    ------
+    """
+
+    def test_solution(self, solution, keep_changes=False):
+        unneeded = []
+        removed_rxns = []
+        tempmodel = self.model
+        if not keep_changes:
+            tempmodel = from_json(to_json(self.model))
+        tempmodel.objective = solution["target"]
+        pkgmgr = MSPackageManager.get_pkg_mgr(tempmodel)
+        pkgmgr.getpkg("KBaseMediaPkg").build_package(solution["media"])
+        objective = tempmodel.slim_optimize()
+        logger.debug("Starting objective:" + str(objective))
+        types = ["new", "reversed"]
+        for key in types:
+            for rxn_id in solution[key]:
+                rxnobj = tempmodel.reactions.get_by_id(rxn_id)
+                if solution[key][rxn_id] == ">":
+                    original_bound = rxnobj.upper_bound
+                    rxnobj.upper_bound = 0
+                    objective = tempmodel.slim_optimize()
+                    if objective < solution["minobjective"]:
+                        logger.info(
+                            rxn_id
+                            + solution[key][rxn_id]
+                            + " needed:"
+                            + str(objective)
+                            + " with min obj:"
+                            + str(solution["minobjective"])
+                        )
+                        rxnobj.upper_bound = original_bound
+                    else:
+                        removed_rxns.append(rxnobj)
+                        unneeded.append([rxn_id, solution[key][rxn_id], key])
+                        logger.info(
+                            rxn_id
+                            + solution[key][rxn_id]
+                            + " not needed:"
+                            + str(objective)
+                        )
+                else:
+                    original_bound = rxnobj.lower_bound
+                    rxnobj.lower_bound = 0
+                    objective = tempmodel.slim_optimize()
+                    if objective < solution["minobjective"]:
+                        logger.info(
+                            rxn_id
+                            + solution[key][rxn_id]
+                            + " needed:"
+                            + str(objective)
+                            + " with min obj:"
+                            + str(solution["minobjective"])
+                        )
+                        rxnobj.lower_bound = original_bound
+                    else:
+                        removed_rxns.append(rxnobj)
+                        unneeded.append([rxn_id, solution[key][rxn_id], key])
+                        logger.info(
+                            rxn_id
+                            + solution[key][rxn_id]
+                            + " not needed:"
+                            + str(objective)
+                        )
+        if keep_changes:
+            tempmodel.remove_reactions(removed_rxns)
+            for items in unneeded:
+                del solution[items[2]][items[0]]
+        return unneeded
+
+    def add_gapfilling(self, solution):
+        self.integrated_gapfillings.append(solution)
+
+    def create_kb_gapfilling_data(self, kbmodel, atpmedia_ws="94026"):
+        gapfilling_hash = {}
+        if "gapfillings" not in kbmodel:
+            kbmodel["gapfillings"] = []
+        for gapfilling in kbmodel["gapfillings"]:
+            gapfilling_hash[gapfilling["id"]] = gapfilling
+        rxn_hash = {}
+        for rxn in kbmodel["modelreactions"]:
+            rxn_hash[rxn["id"]] = rxn
+        for gf in self.integrated_gapfillings:
+            media_ref = "KBaseMedia/Empty"
+            gfid = gf["media"].id.replace("/", ".")
+            if self.atputl:
+                for item in self.atputl.atp_medias:
+                    if item[0] == gf["media"]:
+                        gfid = "ATP-" + gfid
+                        media_ref = atpmedia_ws + "/" + gf["media"].id + ".atp"
+                        break
+            if hasattr(gf["media"], "info"):
+                media_ref = gf["media"].info.workspace_id + "/" + gf["media"].info.id
+            suffix = 0
+            while gfid in gapfilling_hash:
+                suffix += 1
+                gfid += "."
+ str(suffix) + gapfilling_hash[gfid] = 1 gapfilling_obj = { - "gapfill_id": newmodel["id"]+"."+gfid, + "gapfill_id": gfid, "id": gfid, "integrated": 1, "integrated_solution": "0", - "media_ref": media_ref + "target": gf["target"], + "minobjective": gf["minobjective"], + "binary_check": gf["binary_check"], + "media_ref": media_ref, } - newmodel["gapfillings"].append(gapfilling_obj) - cpd_hash = {} - for cpd in newmodel["modelcompounds"]: - cpd_hash[cpd["id"]] = cpd - for rxn in gapfilled_reactions["new"]: - reaction = self.model.reactions.get_by_id(rxn) - kbrxn = self.convert_cobra_reaction_to_kbreaction(reaction,newmodel,cpd_hash,gapfilled_reactions["new"][rxn],1,reaction_genes) - kbrxn["gapfill_data"][gfid] = dict() - kbrxn["gapfill_data"][gfid]["0"] = [gapfilled_reactions["new"][rxn],1,[]] - rxn_table.append({ - 'id':kbrxn["id"], - 'name':kbrxn["name"], - 'direction':format_direction(kbrxn["direction"]), - 'gene':format_gpr(kbrxn), - 'equation':format_equation(kbrxn,cpd_hash), - 'newrxn':1 - }) - for rxn in gapfilled_reactions["reversed"]: - for kbrxn in newmodel["modelreactions"]: - if kbrxn["id"] == rxn: - kbrxn["direction"] = "=" - rxn_table.append({ - 'id':kbrxn["id"], - 'name':kbrxn["name"], - 'direction':format_direction(kbrxn["direction"]), - 'gene':format_gpr(kbrxn), - 'equation':format_equation(kbrxn,cpd_hash), - 'newrxn':0 - }) - kbrxn["gapfill_data"][gfid] = dict() - kbrxn["gapfill_data"][gfid]["0"] = [gapfilled_reactions["reversed"][rxn],1,[]] - return rxn_table \ No newline at end of file + kbmodel["gapfillings"].append(gapfilling_obj) + for rxn in gf["new"]: + if rxn in rxn_hash: + rxnobj = rxn_hash[rxn] + if "gapfill_data" not in rxnobj: + rxnobj["gapfill_data"] = {} + if gfid not in rxnobj["gapfill_data"]: + rxnobj["gapfill_data"][gfid] = {"0": [gf["new"][rxn], 1, []]} + for rxn in gf["reversed"]: + if rxn in rxn_hash: + rxnobj = rxn_hash[rxn] + if "gapfill_data" not in rxnobj: + rxnobj["gapfill_data"] = {} + if gfid not in rxnobj["gapfill_data"]: + rxnobj["gapfill_data"][gfid] = { + "0": [gf["reversed"][rxn], 1, []] + } + + ################################################################################# + # Functions related to applying, running, and expanding with test conditions + ################################################################################# + def run_fba(self, media=None, pfba=False, fva_reactions=None): + from cobra import flux_analysis + if media: + self.pkgmgr.getpkg("KBaseMediaPkg").build_package(media) + if pfba: + return flux_analysis.pfba(self.model) + if fva_reactions is not None: + return flux_analysis.variability.flux_variability_analysis(self.model, fva_reactions) + return self.model.optimize() + + def resource_balance_constraint(self, flux_limit=140): + vars_coef = {} + for rxn in self.model.reactions: + if "EX_" not in rxn.id: + vars_coef[rxn.forward_variable] = vars_coef[rxn.reverse_variable] = 1 + self.create_constraint(Constraint(Zero, lb=0, ub=flux_limit, name="resource_balance_limit"), coef=vars_coef) + + def apply_test_condition(self, condition, model=None): + """Applies constraints and objective of specified condition to model + + Parameters + ---------- + condition : dict + Specifies condition to be tested with media, objective, is_max_threshold, threshold. 
+        model : cobra.Model, optional
+            Specific instance of model to apply conditions to (useful if using "with model")
+
+        Returns
+        -------
+        None
+            The condition's objective and media are applied to the model in place
+
+        Raises
+        ------
+        """
+        if model is None:
+            model = self.model
+            pkgmgr = self.pkgmgr
+        else:
+            pkgmgr = MSPackageManager.get_pkg_mgr(model)
+        model.objective = condition["objective"]
+        if condition["is_max_threshold"]:
+            model.objective.direction = "max"
+        else:
+            model.objective.direction = "min"
+        pkgmgr.getpkg("KBaseMediaPkg").build_package(condition["media"])
+
+    def test_single_condition(self, condition, apply_condition=True, model=None):
+        """Runs a single test condition to determine if objective value on set media exceeds threshold
+
+        Parameters
+        ----------
+        condition : dict
+            Specifies condition to be tested with media, objective, is_max_threshold, threshold.
+        apply_condition : bool,optional
+            Indicates if condition constraints and objective should be applied.
+        model : cobra.Model, optional
+            Specific instance of model to apply tests to (useful if using "with model")
+
+        Returns
+        -------
+        boolean
+            True if threshold is NOT exceeded, False if threshold is exceeded
+
+        Raises
+        ------
+        """
+        if model is None:
+            model = self.model
+        if apply_condition:
+            logger.debug("applying the test condition")
+            self.apply_test_condition(condition, model)
+        new_objective = model.slim_optimize()
+        value = new_objective
+        if "change" in condition and condition["change"]:
+            if self.test_objective:
+                value = new_objective - self.test_objective
+                logger.debug(
+                    condition["media"].id
+                    + " testing for change:"
+                    + str(value)
+                    + "="
+                    + str(new_objective)
+                    + "-"
+                    + str(self.test_objective)
+                )
+        self.score = value
+        if model.solver.status != "optimal":
+            self.printlp(condition["media"].id + "-Testing-Infeasible.lp")
+            logger.critical(
+                condition["media"].id
+                + "testing leads to infeasible problem. LP file printed to debug!"
+            )
+            return False
+        if value >= condition["threshold"] and condition["is_max_threshold"]:
+            # logger.debug("Failed high:"+condition["media"].id+":"+str(new_objective)+";"+str(condition["threshold"]))
+            return False
+        elif value <= condition["threshold"] and not condition["is_max_threshold"]:
+            # logger.debug("Failed low:"+condition["media"].id+":"+str(new_objective)+";"+str(condition["threshold"]))
+            return False
+        self.test_objective = new_objective
+        return True
+
+    def test_condition_list(self, condition_list, model=None):
+        """Runs a set of test conditions to determine if objective values on set medias exceed thresholds
+
+        Parameters
+        ----------
+        condition_list : list
+            Specifies set of conditions to be tested with media, objective, is_max_threshold, threshold.
+        model : cobra.Model, optional
+            Specific instance of model to apply tests to (useful if using "with model")
+
+        Returns
+        -------
+        boolean
+            True if ALL tests pass, False if any test returns false
+
+        Raises
+        ------
+        """
+        if model is None:
+            model = self.model
+        for condition in condition_list:
+            if not self.test_single_condition(condition, True, model):
+                return False
+        return True
+
+    def linear_expansion_test(self, reaction_list, condition, currmodel):
+        """Tests addition of reactions one at a time
+
+        Parameters
+        ----------
+        reaction_list : list<[obj reaction,{<|>}]>
+            List of reactions and directions to test for addition in the model (should already be in model)
+
+        Returns
+        -------
+        list<[obj reaction,{<|>}]>
+            List of reactions and directions filtered because they fail tests when in the model
+
+        Raises
+        ------
+        """
+        # First run the full test
+        if self.test_single_condition(condition, False, currmodel):
+            return []
+        # First knock out all reactions in the input list and save the original bounds
+        filtered_list = []
+        original_bound = []
+        for item in reaction_list:
+            if item[1] == ">":
+                original_bound.append(item[0].upper_bound)
+                item[0].upper_bound = 0
+            else:
+                original_bound.append(item[0].lower_bound)
+                item[0].lower_bound = 0
+        # Now restore reactions one at a time
+        count = 0
+        for item in reaction_list:
+            if item[1] == ">":
+                item[0].upper_bound = original_bound[count]
+                if not self.test_single_condition(condition, False, currmodel):
+                    # logger.debug(item[0].id+":"+item[1])
+                    item[0].upper_bound = 0
+                    if item not in filtered_list:
+                        item.append(original_bound[count])
+                        item.append(self.score)
+                        filtered_list.append(item)
+            else:
+                item[0].lower_bound = original_bound[count]
+                if not self.test_single_condition(condition, False, currmodel):
+                    # logger.debug(item[0].id+":"+item[1])
+                    item[0].lower_bound = 0
+                    if item not in filtered_list:
+                        item.append(original_bound[count])
+                        item.append(self.score)
+                        filtered_list.append(item)
+            count += 1
+        return filtered_list
+
+    def binary_expansion_test(self, reaction_list, condition, currmodel, depth=0):
+        """Conducts a binary search for bad reaction combinations
+
+        Parameters
+        ----------
+        reaction_list : list<[obj reaction,{<|>}]>
+            List of reactions and directions to test for addition in the model (should already be in model)
+        condition_list : list
+            Specifies set of conditions to be tested with media, objective, is_max_threshold, threshold.
+
+        Returns
+        -------
+        list<[obj reaction,{<|>}]>
+            List of reactions and directions filtered because they fail tests when in the model
+
+        Raises
+        ------
+        """
+        newdepth = depth + 1
+        filtered_list = []
+        # First run the full test
+        if self.test_single_condition(condition, False, currmodel):
+            return []
+        # Check if input list contains only one reaction:
+        if len(reaction_list) == 1:
+            if reaction_list[0][1] == ">":
+                reaction_list[0].append(reaction_list[0][0].upper_bound)
+                reaction_list[0][0].upper_bound = 0
+            else:
+                reaction_list[0].append(reaction_list[0][0].lower_bound)
+                reaction_list[0][0].lower_bound = 0
+            reaction_list[0].append(self.score)
+            filtered_list.append(reaction_list[0])
+            return filtered_list
+        # Break reaction list into two
+        original_bound = []
+        sub_lists = [[], []]
+        midway_point = int(len(reaction_list) / 2)
+        for i, item in enumerate(reaction_list):
+            if item[1] == ">":
+                original_bound.append(item[0].upper_bound)
+            else:
+                original_bound.append(item[0].lower_bound)
+            if i < midway_point:
+                sub_lists[0].append(item)
+            else:
+                sub_lists[1].append(item)
+                if item[1] == ">":
+                    item[0].upper_bound = 0
+                else:
+                    item[0].lower_bound = 0
+        # Submitting first half of reactions for testing
+        new_filter = self.binary_expansion_test(
+            sub_lists[0], condition, currmodel, newdepth
+        )
+        for item in new_filter:
+            filtered_list.append(item)
+        # Submitting second half of reactions for testing - now only breaking reactions are removed from the first list
+        for i, item in enumerate(reaction_list):
+            if i >= midway_point:
+                if item[1] == ">":
+                    item[0].upper_bound = original_bound[i]
+                else:
+                    item[0].lower_bound = original_bound[i]
+        new_filter = self.binary_expansion_test(
+            sub_lists[1], condition, currmodel, newdepth
+        )
+        for item in new_filter:
+            filtered_list.append(item)
+        return filtered_list
+
+    def reaction_expansion_test(
+        self, reaction_list, condition_list, binary_search=True
+    ):
+        """Adds reactions in reaction list one by one and applies tests, filtering out the reactions that fail
+
+        Parameters
+        ----------
+        reaction_list : list<[obj reaction,{<|>}]>
+            List of reactions and directions to test for addition in the model (should already be in model)
+        condition_list : list
+            Specifies set of conditions to be tested with media, objective, is_max_threshold, threshold.
+
+        Returns
+        -------
+        list<[obj reaction,{<|>}]>
+            List of reactions and directions filtered because they fail tests when in the model
+
+        Raises
+        ------
+        """
+        logger.debug(f"Expansion started! Binary = {binary_search}")
+    def reaction_expansion_test(
+        self, reaction_list, condition_list, binary_search=True
+    ):
+        """Adds the reactions in reaction_list one by one and applies the tests, filtering out reactions that fail
+
+        Parameters
+        ----------
+        reaction_list : list<[obj reaction, {">"|"<"}]>
+            List of reactions and directions to test for addition in the model (should already be in the model)
+        condition_list : list
+            Specifies the set of conditions to be tested, each with media, objective, is_max_threshold, and threshold keys
+
+        Returns
+        -------
+        list<[obj reaction, {">"|"<"}]>
+            List of reactions and directions filtered out because the model fails tests when they are active
+
+        Raises
+        ------
+        """
+        logger.debug(f"Expansion started! Binary = {binary_search}")
+        filtered_list = []
+        for condition in condition_list:
+            logger.debug(f"testing condition {condition}")
+            currmodel = self.model
+            tic = time.perf_counter()
+            new_filtered = []
+            with currmodel:
+                self.apply_test_condition(condition)
+                if binary_search:
+                    new_filtered = self.binary_expansion_test(
+                        reaction_list, condition, currmodel
+                    )
+                else:
+                    new_filtered = self.linear_expansion_test(
+                        reaction_list, condition, currmodel
+                    )
+                for item in new_filtered:
+                    if item not in filtered_list:
+                        filtered_list.append(item)
+            # Reapplying the knockouts of the newly filtered reactions, because the bound changes are reverted upon exiting the "with" block above
+            for item in new_filtered:
+                if item[1] == ">":
+                    item[0].upper_bound = 0
+                else:
+                    item[0].lower_bound = 0
+            toc = time.perf_counter()
+            logger.info(
+                "Expansion time:" + condition["media"].id + ":" + str(toc - tic)
+            )
+            logger.info(
+                "Filtered count:"
+                + str(len(filtered_list))
+                + " out of "
+                + str(len(reaction_list))
+            )
+            # Adding the filter results to the model attributes
+            gf_filter_att = self.get_attributes("gf_filter", {})
+            if condition["media"].id not in gf_filter_att:
+                gf_filter_att[condition["media"].id] = {}
+            if condition["objective"] not in gf_filter_att[condition["media"].id]:
+                gf_filter_att[condition["media"].id][condition["objective"]] = {}
+            if (
+                condition["threshold"]
+                not in gf_filter_att[condition["media"].id][condition["objective"]]
+            ):
+                gf_filter_att[condition["media"].id][condition["objective"]][
+                    condition["threshold"]
+                ] = {}
+            # Alias for the innermost dictionary; mutating it updates gf_filter_att in place
+            threshold_att = gf_filter_att[condition["media"].id][
+                condition["objective"]
+            ][condition["threshold"]]
+            for item in new_filtered:
+                if item[0].id not in threshold_att:
+                    threshold_att[item[0].id] = {}
+                if item[1] not in threshold_att[item[0].id]:
+                    threshold_att[item[0].id][item[1]] = (
+                        item[2] if len(item) >= 3 else None
+                    )
+            gf_filter_att = self.save_attributes(gf_filter_att, "gf_filter")
+        return filtered_list
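+    # Usage sketch (the media object, gapfilled reaction list, and threshold are
+    # illustrative placeholders):
+    #     conditions = [{"media": media, "objective": "bio1", "is_max_threshold": False, "threshold": 0.1}]
+    #     pairs = [[rxn, ">"] for rxn in gapfilled_reactions]
+    #     removed = mdlutl.reaction_expansion_test(pairs, conditions, binary_search=True)
+    #     for rxn, direction, original_bound, score in removed:
+    #         print(rxn.id, direction)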
"add_total_biomass_constraint": False, + } + ) + + # Creating min flex objective + min_flex_obj = tempmodel.problem.Objective(Zero, direction="min") + obj_coef = dict() + for reaction in tempmodel.reactions: + if reaction.id[0:5] == "FLEX_" or reaction.id[0:6] == "energy": + obj_coef[reaction.forward_variable] = 1 + obj_coef[reaction.reverse_variable] = 1 + # Temporarily setting flex objective so I can set coefficients + tempmodel.objective = min_flex_obj + min_flex_obj.set_linear_coefficients(obj_coef) + if not ko_list: + return self.run_biomass_dependency_test( + target_rxn_obj, tempmodel, original_objective, min_flex_obj, rxn_list + ) + else: + output = {} + for item in ko_list: + logger.debug("KO:" + item[0] + item[1]) + rxnobj = tempmodel.reactions.get_by_id(item[0]) + if item[1] == ">": + original_bound = rxnobj.upper_bound + rxnobj.upper_bound = 0 + if item[0] not in output: + output[item[0]] = {} + output[item[0]][item[1]] = self.run_biomass_dependency_test( + target_rxn_obj, + tempmodel, + original_objective, + min_flex_obj, + rxn_list, + ) + rxnobj.upper_bound = original_bound + else: + original_bound = rxnobj.lower_bound + rxnobj.lower_bound = 0 + if item[0] not in output: + output[item[0]] = {} + output[item[0]][item[1]] = self.run_biomass_dependency_test( + target_rxn_obj, + tempmodel, + original_objective, + min_flex_obj, + rxn_list, + ) + rxnobj.lower_bound = original_bound + return output + + def run_biomass_dependency_test( + self, target_rxn, tempmodel, original_objective, min_flex_obj, rxn_list + ): + tempmodel.objective = original_objective + objective = tempmodel.slim_optimize() + if objective > 0: + target_rxn.lower_bound = 0.1 + tempmodel.objective = min_flex_obj + solution = tempmodel.optimize() + biocpds = [] + for reaction in tempmodel.reactions: + if reaction.id[0:5] == "FLEX_" and ( + reaction.forward_variable.primal > Zero + or reaction.reverse_variable.primal > Zero + ): + logger.debug("Depends on:" + reaction.id) + label = reaction.id[5:] + for item in rxn_list: + if label[0 : len(item)] == item: + biocpds.append(label[len(item) + 1 :]) + target_rxn.lower_bound = 0 + return biocpds + else: + logger.debug("Cannot grow") + return None + + def add_atp_hydrolysis(self, compartment): + # Searching for ATP hydrolysis compounds + coefs = { + "cpd00002": [-1, compartment], + "cpd00001": [-1, compartment], + "cpd00008": [1, compartment], + "cpd00009": [1, compartment], + "cpd00067": [1, compartment], + } + stoichiometry = {} + id_hash = self.msid_hash() + for msid, content in coefs.items(): + if msid not in id_hash: + logger.warning("Compound " + msid + " not found in model!") + return None + else: + for cpd in id_hash[msid]: + if cpd.compartment == coefs[msid][1]: + stoichiometry[cpd] = coefs[msid][0] + output = self.find_reaction(stoichiometry) + if output != None and output[1] == ">": + return {"reaction": output[0], "direction": ">", "new": False} + cobra_reaction = Reaction( + "rxn00062_" + compartment, + name="ATP hydrolysis", + lower_bound=0, + upper_bound=1000, + ) + cobra_reaction.annotation["sbo"] = "SBO:0000176" # biochemical reaction + cobra_reaction.annotation["seed.reaction"] = "rxn00062" + cobra_reaction.add_metabolites(stoichiometry) + self.model.add_reactions([cobra_reaction]) + return {"reaction": cobra_reaction, "direction": ">", "new": True} + + def costless_excreta(self, pfba=False): + # the double optimization is intended to truly find the maximal biomass growth + original_objective = self.model.objective + minObj_cons = 
+    def costless_excreta(self, pfba=False):
+        # Fix the objective at its maximal value so that excreta are evaluated at maximal growth
+        original_objective = self.model.objective
+        minObj_cons = Constraint(
+            self.model.objective.expression, lb=self.model.slim_optimize(), name="minObj"
+        )
+        self.add_cons_vars([minObj_cons])
+        if pfba:
+            # Minimize the total flux while the original objective remains fixed at its maximum
+            reaction_variables = (
+                (rxn.forward_variable, rxn.reverse_variable) for rxn in self.model.reactions
+            )
+            self.model.objective = self.model.problem.Objective(
+                sum(chain(*reaction_variables)), direction="min"
+            )
+        sol = self.model.optimize()
+        # Revert the model to its state before the simulation
+        self.model.objective = original_objective
+        self.remove_cons_vars([minObj_cons])
+        return [rxnID.replace("EX_", "").replace("_e0", "") for rxnID, flux in sol.fluxes.items()
+                if "EX_" in rxnID and flux > 0]
+
+    @staticmethod
+    def parse_id(cobra_obj):
+        # ModelSEED-style IDs: <name>_<compartment letter><model index>, e.g. cpd00002_c0
+        MSID = re.search(r"(.+)_([a-z])(\d+)$", cobra_obj.id)
+        if MSID is not None: return (MSID[1], MSID[2], int(MSID[3]))
+        # Non-ModelSEED IDs with bracketed compartments, e.g. atp[c]
+        nonMSID = re.search(r"(.+)\[([a-z])\]$", cobra_obj.id)
+        if nonMSID is not None: return (nonMSID[1], nonMSID[2])
+        return (cobra_obj.id.replace("EX_", ""), "e" if "EX_" in cobra_obj.id else "c")
+
+    def add_kbase_media(self, kbase_media):
+        # Apply a KBase media object as exchange constraints, where the negated lower bound is the maximal uptake
+        exIDs = [exRXN.id for exRXN in self.exchange_list()]
+        self.model.medium = {"EX_" + exID: -bound[0] for exID, bound in kbase_media.get_media_constraints().items()
+                             if "EX_" + exID in exIDs}
+        return self.model.medium
+
+    def add_medium(self, media, uniform_uptake=None):
+        # Add the new media and its flux constraints
+        if media is None: return self.model.medium
+        exIDs = [exRXN.id for exRXN in self.exchange_list()]
+        if not hasattr(media, "items"): media = FBAHelper.convert_kbase_media(media)
+        self.model.medium = {ex: uptake for ex, uptake in media.items() if ex in exIDs}
+        if uniform_uptake is not None:
+            self.model.medium = {ex: uniform_uptake for ex in self.model.medium}
+        return self.model.medium
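+
+# Usage sketch of the medium and excreta helpers (the exchange ID and uptake
+# value are illustrative placeholders):
+#     mdlutl = MSModelUtil(model)
+#     mdlutl.add_medium({"EX_cpd00027_e0": 5})  # cap glucose uptake at 5 mmol/gDW/hr
+#     print(mdlutl.costless_excreta())  # ModelSEED IDs of compounds secreted at maximal growth
+#     print(MSModelUtil.parse_id(model.metabolites.get_by_id("cpd00002_c0")))  # ("cpd00002", "c", 0)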