PAINS filtering for Large_Scale_Chemical_Screens.ipynb#1355

AWS-BugBust-75 · Nov 7, 2021 · 64c86d6 · 64c86d6
1 parent 224d90e
commit 64c86d6
Show file tree

Hide file tree

Showing 2 changed files with 1,302 additions and 1 deletion.
diff --git a/examples/tutorials/Large_Scale_Chemical_Screens.ipynb b/examples/tutorials/Large_Scale_Chemical_Screens.ipynb
@@ -716,6 +716,55 @@
     "The screen seems to favor molecules with one or multiple sulfur trioxides. The top scoring molecules also have low diversity.  When creating a \"buy list\" we want to optimize for more things than just activity, for instance diversity and drug like MPO. "
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#We use the code from https://github.com/PatWalters/rd_filters; a detailed explanation is here: http://practicalcheminformatics.blogspot.com/2018/08/filtering-chemical-libraries.html\n",
+    "#We will run the PAINS filter on best_mols, as suggested by Issue 1355 (https://github.com/deepchem/deepchem/issues/1355)\n",
+    "import os\n",
+    "\n",
+    "import pandas as pd\n",
+    "from rdkit import Chem\n",
+    "from rdkit.Chem.Descriptors import MolWt, MolLogP, NumHDonors, NumHAcceptors, TPSA\n",
+    "from rdkit.Chem.rdMolDescriptors import CalcNumRotatableBonds\n",
+    "\n",
+    "#Load the structural-alert rules from alert_collection.csv and keep only the PAINS rule set.\n",
+    "#A notebook kernel does not define __file__, so the CSV is resolved against the current working\n",
+    "#directory (normally the notebook's own directory when it is opened from Jupyter).\n",
+    "rule_df = pd.read_csv(os.path.join(os.getcwd(), 'alert_collection.csv'))\n",
+    "rule_df = rule_df[rule_df['rule_set_name']=='PAINS']\n",
+    "#Each entry of rule_list is (compiled SMARTS pattern, maximum allowed match count, description).\n",
+    "rule_list = []\n",
+    "for smarts, max_val, desc in rule_df[[\"smarts\", \"max\", \"description\"]].values.tolist():\n",
+    "    smarts_mol = Chem.MolFromSmarts(smarts)\n",
+    "    #MolFromSmarts returns None for a pattern it cannot parse; such rules are skipped.\n",
+    "    if smarts_mol is not None:\n",
+    "        rule_list.append((smarts_mol, max_val, desc))\n",
+    "\n",
+    "def evaluate(smile):\n",
+    "    \"\"\"Run the PAINS rules on one SMILES string and compute its descriptors.\n",
+    "\n",
+    "    Returns a list [smile, status, MW, LogP, HBD, HBA, TPSA, Rot]. The status is\n",
+    "    \"INVALID\" (with every descriptor set to -999) when RDKit cannot parse the\n",
+    "    SMILES, \"<description> > <max>\" for the first rule in rule_list whose match\n",
+    "    count exceeds its limit, and \"OK\" when no rule is violated.\n",
+    "    \"\"\"\n",
+    "    mol = Chem.MolFromSmiles(smile)\n",
+    "    if mol is None:\n",
+    "        return [smile, \"INVALID\", -999, -999, -999, -999, -999, -999]\n",
+    "    desc_list = [MolWt(mol), MolLogP(mol), NumHDonors(mol), NumHAcceptors(mol), TPSA(mol), CalcNumRotatableBonds(mol)]\n",
+    "    for patt, max_val, desc in rule_list:\n",
+    "        if len(mol.GetSubstructMatches(patt)) > max_val:\n",
+    "            #Report the molecule's own SMILES (the parameter), not the global list of inputs.\n",
+    "            return [smile, desc + \" > %d\" % (max_val)] + desc_list\n",
+    "    return [smile, \"OK\"] + desc_list\n",
+    "\n",
+    "#Read the SMILES (first whitespace-separated token) from the first 100 lines of the screen output.\n",
+    "#The context manager closes the file, and iterating lazily avoids loading the whole file into memory.\n",
+    "with open('/tmp/zinc/screen/top_100k.smi') as smi_file:\n",
+    "    head_lines = [line for _, line in zip(range(100), smi_file)]\n",
+    "smiles = [line.split()[0] for line in head_lines if line.strip()] # blank lines are skipped\n",
+    "res = list(map(evaluate, smiles)) # here we apply the PAINS filter\n",
+    "\n",
+    "df = pd.DataFrame(res, columns=[\"SMILES\", \"FILTER\", \"MW\", \"LogP\", \"HBD\", \"HBA\", \"TPSA\", \"Rot\"])\n",
+    "#Keep molecules that pass the PAINS filter and fall inside every property window below\n",
+    "#(Series.between includes both end points by default).\n",
+    "df_ok = df[\n",
+    "    (df.FILTER == \"OK\") &\n",
+    "    df.MW.between(0, 500) & # MW\n",
+    "    df.LogP.between(-5, 5) & #LogP\n",
+    "    df.HBD.between(0, 5) & #HBD\n",
+    "    df.HBA.between(0, 10) & #HBA\n",
+    "    df.TPSA.between(0, 200) & #TPSA\n",
+    "    df.Rot.between(0, 10) #Rot\n",
+    "    ]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -748,7 +797,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.10"
+   "version": "3.7.6"
   }
  },
  "nbformat": 4,