Merge pull request #129 from datamol-org/reaction

Reaction
datamol-io · Oct 14, 2022 · 6d1539f · 6d1539f
2 parents 81aed56 + a5fb1c0
commit 6d1539f
Show file tree

Hide file tree

Showing 14 changed files with 1,004 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -62,3 +62,6 @@ dev.ipynb
 
 # MkDocs
 site/
+
+.idea/
+__pycache__
diff --git a/datamol/__init__.py b/datamol/__init__.py
@@ -1,6 +1,13 @@
 from ._version import __version__
 
+from ._version import is_lower_than_current_rdkit_version
+from ._version import is_greater_than_current_rdkit_version
+from ._version import is_lower_eq_than_current_rdkit_version
+from ._version import is_greater_eq_than_current_rdkit_version
+
 from .types import Mol
+from .types import BondType
+from .types import ChemicalReaction
 
 from . import utils
 
@@ -72,6 +79,7 @@
 from . import molar
 from . import descriptors
 from . import predictors
+from . import reactions
 
 from .convert import to_smiles
 from .convert import to_selfies

diff --git a/datamol/_version.py b/datamol/_version.py
@@ -1 +1,25 @@
 __version__ = "0.7.16"
+
+
+import rdkit
+import packaging.version
+
+
+CURRENT_RDKIT_VERSION = rdkit.__version__
+CURRENT_RDKIT_VERSION_OBJ = packaging.version.parse(CURRENT_RDKIT_VERSION)
+
+
+def is_lower_than_current_rdkit_version(rdkit_version: str):
+    return CURRENT_RDKIT_VERSION_OBJ < packaging.version.parse(rdkit_version)
+
+
+def is_greater_than_current_rdkit_version(rdkit_version: str):
+    return CURRENT_RDKIT_VERSION_OBJ > packaging.version.parse(rdkit_version)
+
+
+def is_lower_eq_than_current_rdkit_version(rdkit_version: str):
+    return CURRENT_RDKIT_VERSION_OBJ <= packaging.version.parse(rdkit_version)
+
+
+def is_greater_eq_than_current_rdkit_version(rdkit_version: str):
+    return CURRENT_RDKIT_VERSION_OBJ >= packaging.version.parse(rdkit_version)
diff --git a/datamol/reactions/__init__.py b/datamol/reactions/__init__.py
@@ -0,0 +1,18 @@
+from ._reactions import is_reaction_ok
+from ._reactions import select_reaction_output
+from ._reactions import apply_reaction
+from ._reactions import can_react
+from ._reactions import inverse_reaction
+from ._reactions import find_reactant_position
+from ._reactions import ATTACHING_RXN
+from ._reactions import rxn_from_smarts
+from ._reactions import rxn_to_smarts
+from ._reactions import rxn_from_block
+from ._reactions import rxn_from_block_file
+from ._reactions import rxn_to_block
+from ._reactions import rxn_to_block_file
+
+from ._attachments import add_brackets_to_attachment_points
+from ._attachments import convert_attach_to_isotope
+from ._attachments import num_attachment_points
+from ._attachments import open_attach_points
diff --git a/datamol/reactions/_attachments.py b/datamol/reactions/_attachments.py
@@ -0,0 +1,132 @@
+from typing import cast
+from typing import Union
+
+import re
+import operator
+
+import datamol as dm
+from rdkit import Chem
+
+ATTACHMENT_POINT_TOKEN = "*"
+ATTACHMENT_POINT_NUM_REGEXP = r"\[{}:?(\d*)\]".format(re.escape(ATTACHMENT_POINT_TOKEN))
+ATTACHMENT_POINT_REGEXP = r"(?:{0}|\[{0}[^\]]*\])".format(re.escape(ATTACHMENT_POINT_TOKEN))
+ATTACHMENT_POINT_NO_BRACKETS_REGEXP = r"(?<![:\[]){0}(?![:\]])".format(
+    re.escape(ATTACHMENT_POINT_TOKEN)
+)
+ALL_POSSIBLE_ATTACHMENTS = r"\[(\d*){}:?(\d*)\]".format(re.escape(ATTACHMENT_POINT_TOKEN))
+
+
+def add_brackets_to_attachment_points(smiles: str) -> str:
+    """
+    Adds brackets to the attachment points (if they don't have them).
+    Example: "CC(C)CO*" to "CC(C)CO[*]"
+
+    Args:
+        smiles: A smiles string.
+
+    Returns:
+        A smiles string with brackets.
+    """
+    return re.sub(
+        ATTACHMENT_POINT_NO_BRACKETS_REGEXP,
+        "[{}]".format(ATTACHMENT_POINT_TOKEN),
+        smiles,
+    )
+
+
+def convert_attach_to_isotope(
+    mol_or_smiles: Union[dm.Mol, str],
+    same_isotope: bool = False,
+    as_smiles: bool = False,
+) -> Union[dm.Mol, str]:
+    """Convert attachment to isotope mapping.
+
+    Examples: "O=C(NCc1cnc([*])c1)[*]" to  "O=C(NCc1cnc([1*])c1)[2*]"
+
+    Args:
+        mol_or_smiles: A Mol object or a smiles to be converted
+        same_isotope: Whether convert to the same isotope.
+            Example: "O=C(NCc1cnc([*])c1)[*]" to  "O=C(NCc1cnc([1*])c1)[1*]"
+
+    Returns:
+        Converted Mol object or SMILES.
+    """
+    mol = dm.to_mol(mol_or_smiles)
+    smiles = dm.to_smiles(mol)
+    smiles = cast(str, smiles)
+
+    smiles = add_brackets_to_attachment_points(smiles)
+
+    # reg matching seems to be the most effective
+    subs_reg = r"[\g<1>{}]"
+    if same_isotope:
+        subs_reg = "[1{}]"
+
+    smiles = re.sub(ATTACHMENT_POINT_NUM_REGEXP, subs_reg.format(ATTACHMENT_POINT_TOKEN), smiles)
+
+    if as_smiles:
+        return smiles
+    return dm.to_mol(smiles)
+
+
+def num_attachment_points(mol_or_smiles: Union[dm.Mol, str]) -> int:
+    """
+    Get the number of attachment point in the
+
+    Args:
+        mol_or_smiles: A Mol object or a smiles to be converted
+
+    Returns:
+        Number of attachment points of the given molecule.
+    """
+    if isinstance(mol_or_smiles, dm.Mol):
+        mol = cast(dm.Mol, mol_or_smiles)
+        n_points = len(
+            [atom for atom in mol.GetAtoms() if atom.GetSymbol() == ATTACHMENT_POINT_TOKEN]
+        )
+    else:
+        n_points = len(re.findall(ATTACHMENT_POINT_REGEXP, mol_or_smiles))
+
+    return n_points
+
+
+def open_attach_points(
+    mol: dm.Mol,
+    fix_atom_map: bool = False,
+    bond_type: dm.BondType = dm.SINGLE_BOND,
+) -> dm.Mol:
+    """Compute attachment points on a molecule.
+    This will highlight all valid attachment point on the current molecule instead.
+
+    Args:
+        mol: A Mol object to be processed.
+        fix_atom_map: Whether fix the atom mapping of the molecule.
+        bond_type: The bond type to be opened.
+
+    Returns:
+        Molecule with open attachment points
+    """
+
+    emol = Chem.rdchem.RWMol(dm.to_mol(mol))
+    with dm.log.without_rdkit_log():
+        atoms = [
+            (a.GetIdx(), a)
+            for a in emol.GetAtoms()
+            if a.GetSymbol() != ATTACHMENT_POINT_TOKEN
+            and a.GetImplicitValence() > 0
+            and (not a.HasProp("_protected") or a.GetProp("_protected") != "1")
+        ]
+        atoms.sort(reverse=True, key=operator.itemgetter(0))
+
+        for atom in atoms:
+            new_atom = Chem.rdchem.Atom(ATTACHMENT_POINT_TOKEN)
+            new_atom.SetAtomMapNum(1 if fix_atom_map else atom[0])
+            new_index = emol.AddAtom(new_atom)
+            emol.UpdatePropertyCache(strict=False)
+            if bond_type is not None:
+                emol.AddBond(atom[0], new_index, bond_type)
+            else:
+                emol.AddBond(atom[0], new_index)
+
+    mol = dm.sanitize_mol(emol)
+    return mol
-Original file line number
+Diff line change
@@ Expand Up / @@ -62,3 +62,6 @@ dev.ipynb @@
     # MkDocs
     site/
+    .idea/
+    __pycache__