-
Notifications
You must be signed in to change notification settings - Fork 9
/
score.py
70 lines (60 loc) · 2.05 KB
/
score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import argparse
import pandas as pd
from molskill.data.featurizers import AVAILABLE_FEATURIZERS, get_featurizer
from molskill.helpers.cleaners import ensure_readability_and_remove
from molskill.helpers.logging import get_logger
from molskill.models.ranknet import LitRankNet
from molskill.scorer import MolSkillScorer
# Module-level logger, created through molskill's `get_logger` helper and
# named after this module; used below to report scoring progress.
LOGGER = get_logger(__name__)
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the MolSkill scoring script.

    Returns:
        A parser exposing ``--model_ckpt``, ``--featurizer_name``,
        ``--compound_csv``, ``--smiles_col`` and ``--output_csv``.
    """
    parser = argparse.ArgumentParser(
        prog=__file__,
        description="Scoring module for MolSkill.",
        add_help=True,
    )
    parser.add_argument(
        "--model_ckpt",
        type=str,
        default=None,
        required=False,
        help="Path to model checkpoint (`.ckpt`) file.",
    )
    parser.add_argument(
        "--featurizer_name",
        choices=list(AVAILABLE_FEATURIZERS.keys()),
        default="morgan_count_rdkit_2d",
        help="Molecular representation to use.",
    )
    parser.add_argument(
        "--compound_csv",
        type=str,
        required=True,
        help="Path to a `.csv` file separated by commas with compounds to be scored.",
    )
    parser.add_argument(
        "--smiles_col",
        type=str,
        default="smiles",
        help="Column name containing SMILES strings.",
    )
    parser.add_argument(
        "--output_csv",
        type=str,
        required=True,
        help="Output `.csv` file, which will contain a column of SMILES and another of corresponding scores.",
    )
    return parser


def load_smiles(csv_path: str, smiles_col: str) -> list:
    """Read one column of a comma-separated file into a list.

    Args:
        csv_path: Path to the `.csv` file holding the compounds.
        smiles_col: Name of the column containing SMILES strings.

    Returns:
        The values of ``smiles_col``, in file order.

    Raises:
        ValueError: If ``smiles_col`` is not a column of the file. The
            message lists the columns that are available.
    """
    cpd_df = pd.read_csv(csv_path)
    # Fail with an actionable message rather than pandas' bare KeyError.
    if smiles_col not in cpd_df.columns:
        available = ", ".join(map(str, cpd_df.columns))
        raise ValueError(
            f"Column '{smiles_col}' not found in {csv_path}; "
            f"available columns: {available}"
        )
    return cpd_df[smiles_col].tolist()


def main(argv=None) -> None:
    """Score the compounds of a CSV file and write the scores to a CSV file.

    The output file has two columns, ``smiles`` and ``score``, built from the
    cleaned compound list and the scorer's result.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read them from ``sys.argv[1:]``.

    Raises:
        ValueError: If ``--smiles_col`` is not a column of ``--compound_csv``.
    """
    args = build_parser().parse_args(argv)

    molrpr = load_smiles(args.compound_csv, args.smiles_col)
    # The list returned by the cleaner is what gets scored and written out.
    # NOTE(review): presumably it drops entries that cannot be parsed, so the
    # output may hold fewer rows than the input - confirm in
    # molskill.helpers.cleaners.
    molrpr = ensure_readability_and_remove(molrpr)

    featurizer = get_featurizer(args.featurizer_name)

    # With no checkpoint given, the scorer is constructed with model=None.
    # NOTE(review): presumably MolSkillScorer then falls back to a default
    # model - confirm in molskill.scorer.
    model = None
    if args.model_ckpt is not None:
        model = LitRankNet.load_from_checkpoint(
            args.model_ckpt, input_size=featurizer.dim()
        )
    scorer = MolSkillScorer(model=model, featurizer=featurizer)

    LOGGER.info("Now predicting...")
    scores = scorer.score(molrpr=molrpr)

    score_df = pd.DataFrame({"smiles": molrpr, "score": scores})
    score_df.to_csv(args.output_csv, index=False)


if __name__ == "__main__":
    main()