make BaseFeaturizer an ABC with abstractmethods
janosh committed Jul 22, 2020
1 parent 66573ca commit 33a9c48
Showing 1 changed file with 12 additions and 7 deletions.
19 changes: 12 additions & 7 deletions matminer/featurizers/base.py
@@ -3,19 +3,20 @@
 import sys
 import traceback
 import warnings
-from multiprocessing import Pool, cpu_count
+from abc import ABC, abstractmethod
 from functools import partial
+from multiprocessing import Pool, cpu_count
 
 import numpy as np
 import pandas as pd
-from six import string_types, reraise
-from sklearn.base import TransformerMixin, BaseEstimator, is_classifier
+from six import reraise, string_types
+from sklearn.base import BaseEstimator, TransformerMixin, is_classifier
 from tqdm.auto import tqdm
 
 from matminer.utils.utils import homogenize_multiindex
 
 
-class BaseFeaturizer(BaseEstimator, TransformerMixin):
+class BaseFeaturizer(BaseEstimator, TransformerMixin, ABC):
     """
     Abstract class to calculate features from raw materials input data
     such a compound formula or a pymatgen crystal structure or
@@ -99,7 +100,7 @@ class BaseFeaturizer(BaseEstimator, TransformerMixin):
     An additional factor to consider is the chunksize for data parallelisation.
     For lightweight computational tasks, the overhead associated with passing
-    data from `multiprocessing.Pool.map()` to the function being parallelised
+    data from `multiprocessing.Pool.map()` to the function being parallelized
     can increase the time taken for all tasks to be completed. By setting
     the `self._chunksize` argument, the overhead associated with passing data
     to the tasks can be reduced. Note that there is only an advantage to using
@@ -108,7 +109,7 @@ class BaseFeaturizer(BaseEstimator, TransformerMixin):
     itself. By default, we allow the Python multiprocessing library to determine
     the chunk size automatically based on the size of the list being featurized.
     You may want to specify a small chunk size for computationally-expensive
-    featurizers, which will enable better distribution of taks across threads.
+    featurizers, which will enable better distribution of tasks across threads.
     In contrast, for more lightweight featurizers, it is recommended that
     the implementor trial a range of chunksize values to find the optimum.
     As a general rule of thumb, if the featurize function takes 0.1 seconds or
@@ -140,7 +141,7 @@ class BaseFeaturizer(BaseEstimator, TransformerMixin):
     """
 
     def set_n_jobs(self, n_jobs):
-        """Set the number of threads for this """
+        """Set the number of threads for this."""
         self._n_jobs = n_jobs
 
     @property
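
Usage note (not part of this diff): the parallelism settings described in the docstring above are applied directly to a featurizer instance. A minimal sketch, assuming an already-constructed featurizer `f`:

    f.set_n_jobs(4)       # featurize across 4 worker processes instead of all available cores
    f._chunksize = 100    # pass 100 entries per task to reduce Pool.map() overhead;
                          # leaving it at its default lets multiprocessing pick the
                          # chunk size automatically, as the docstring notes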
@@ -503,6 +504,7 @@ def featurize_wrapper(self, x, return_errors=False, ignore_errors=False):
                 "featurize_many(), featurize_dataframe(), etc.)."
             reraise(type(e), type(e)(msg), sys.exc_info()[2])
 
+    @abstractmethod
     def featurize(self, *x):
         """
         Main featurizer function, which has to be implemented
@@ -517,6 +519,7 @@ def featurize(self, *x):
 
         raise NotImplementedError("featurize() is not defined!")
 
+    @abstractmethod
     def feature_labels(self):
         """
         Generate attribute names.
@@ -527,6 +530,7 @@ def feature_labels(self):
 
         raise NotImplementedError("feature_labels() is not defined!")
 
+    @abstractmethod
     def citations(self):
         """
         Citation(s) and reference(s) for this feature.
@@ -538,6 +542,7 @@ def citations(self):
 
         raise NotImplementedError("citations() is not defined!")
 
+    @abstractmethod
     def implementors(self):
         """
         List of implementors of the feature.
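
For context, a minimal sketch (not part of this commit) of what the change means for downstream code: a concrete subclass must now override all four abstract methods, otherwise Python refuses to instantiate it with a TypeError at construction time rather than raising NotImplementedError later when the missing method is called. All names below other than BaseFeaturizer and its methods are hypothetical:

    from matminer.featurizers.base import BaseFeaturizer

    class ElementCountFeaturizer(BaseFeaturizer):
        # hypothetical featurizer: counts the elements in a pymatgen Composition
        def featurize(self, comp):
            return [len(comp.elements)]

        def feature_labels(self):
            return ["n_elements"]

        def citations(self):
            return []

        def implementors(self):
            return ["example contributor"]

    f = ElementCountFeaturizer()  # works: every abstract method is implemented

    # A subclass that omits any of featurize / feature_labels / citations / implementors
    # now fails immediately at instantiation: TypeError: Can't instantiate abstract class ...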
