diff --git a/skills_ml/algorithms/embedding/train.py b/skills_ml/algorithms/embedding/train.py
index 5b3d2c6fe..929db9eb0 100644
--- a/skills_ml/algorithms/embedding/train.py
+++ b/skills_ml/algorithms/embedding/train.py
@@ -142,6 +142,7 @@ def save_model(self, storage=None):
         ms = self.model_storage
         for model in self._models:
+            model.storage = ms.storage
             ms.save_model(model, model.model_name)
             logging.info(f"{model.model_name} has been stored to {ms.storage.path}.")
diff --git a/skills_ml/algorithms/occupation_classifiers/__init__.py b/skills_ml/algorithms/occupation_classifiers/__init__.py
index 2e160e678..3662520d7 100644
--- a/skills_ml/algorithms/occupation_classifiers/__init__.py
+++ b/skills_ml/algorithms/occupation_classifiers/__init__.py
@@ -78,7 +78,7 @@ class FullSOC(TargetVariable):
     def __init__(self, filters=None, onet_cache=None):
         super().__init__(filters)
         self.default_filters = [unknown_soc_filter, empty_soc_filter]
-        self.choices = Onet().all_soc
+        self.choices = onet_cache.all_soc if onet_cache else Onet().all_soc
         self.encoder = SocEncoder(self.choices)

     def extract_occupation_from_jobposting(self, job_posting):
@@ -120,6 +120,7 @@ def _combine_pipelines(self):
         return combined

     def build(self):
+        logging.info("Building matrix")
        for i, item in enumerate(self._combine_pipelines()):
            self._X.append(item[0])
            self._y.append(item[1])
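The `onet_cache` parameter above lets callers share one already-built ontology across target variables instead of constructing `Onet()` on every `FullSOC` instantiation. A minimal sketch of the intended call pattern; the reuse scenario is my assumption, only the `FullSOC(onet_cache=...)` signature and the `Onet().all_soc` fallback come from the diff:

```python
from skills_ml.algorithms.occupation_classifiers import FullSOC
from skills_ml.ontologies.onet import Onet

onet = Onet()                         # build the ontology once; this is the expensive step
full_soc = FullSOC(onet_cache=onet)   # reuses onet.all_soc instead of calling Onet() again
another = FullSOC(onet_cache=onet)    # no second ontology build
```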
diff --git a/skills_ml/algorithms/occupation_classifiers/train.py b/skills_ml/algorithms/occupation_classifiers/train.py
index 9b3c92e23..c5dabb1e2 100644
--- a/skills_ml/algorithms/occupation_classifiers/train.py
+++ b/skills_ml/algorithms/occupation_classifiers/train.py
@@ -68,6 +68,9 @@ def train(self, save=True):
            is vailable in this package's environment and implements .fit
         """
         logging.info(f"Start training {self.train_time}")
+        if len(self.matrix.X) == 0:
+            self.matrix.build()
+
         X = self.matrix.X
         y = self.matrix.y
         store_path = os.path.join(self.storage.path, self.train_time)
@@ -82,7 +85,7 @@ def train(self, save=True):
         kf = StratifiedKFold(n_splits=self.k_folds, random_state=self.random_state_for_split)
         model_hash = self._model_hash(self.matrix.metadata, class_name, parameter_config)
         trained_model_name = class_name.lower() + "_" + model_hash
-        self.storage.path = os.path.join(store_path, score, trained_model_name)
+        self.storage.path = os.path.join(store_path, score)
         if 'n_jobs' in inspect.signature(cls).parameters.keys():
             cls_cv = ProxyObjectWithStorage(
                 model_obj=GridSearchCV(
diff --git a/skills_ml/algorithms/sampling/methods.py b/skills_ml/algorithms/sampling/methods.py
index 1c32567ad..182efef17 100644
--- a/skills_ml/algorithms/sampling/methods.py
+++ b/skills_ml/algorithms/sampling/methods.py
@@ -30,7 +30,7 @@ def reservoir(it, k):
         yield result.pop()


-def reservoir_weighted(it, k, weights):
+def reservoir_weighted(it, k, weights, key):
     """Weighted reservoir Sampling from job posting iterator

     Randomly choosing a sample of k items from a streaming iterator based on the weights.
@@ -51,7 +51,7 @@ def reservoir_weighted(it, k, weights):
     heap = []
     hkey = lambda w: np.power(np.random.uniform(0.0, 1.0), 1.0 / w)
     for i, datum in enumerate(it):
-        weight = weights[datum[1]]
+        weight = weights[key(datum)]
         score = hkey(weight)
         if len(heap) < k:
             hq.heappush(heap, (hkey(weight), datum))
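For review context: `reservoir_weighted` implements the Efraimidis-Spirakis weighted reservoir scheme (A-ES), in which each item draws u ~ Uniform(0, 1), scores u ** (1/weight), and the k highest-scoring items survive in a min-heap. The self-contained sketch below mirrors that logic and shows how the new `key` callable replaces the old hard-coded `datum[1]` lookup; the postings and weights are hypothetical:

```python
import heapq
import random

def weighted_reservoir(it, k, weights, key):
    """Keep the k items with the highest u ** (1/w) scores (A-ES)."""
    heap = []
    for i, datum in enumerate(it):
        w = weights[key(datum)]                      # KeyError if a label is missing from weights
        score = random.random() ** (1.0 / w)
        if len(heap) < k:
            heapq.heappush(heap, (score, i, datum))  # i breaks score ties so dicts never compare
        elif score > heap[0][0]:
            heapq.heapreplace(heap, (score, i, datum))
    for _, _, datum in heap:
        yield datum

# Sample 2 of 3 hypothetical postings, favoring major group '11' two-to-one.
postings = [{'onet_soc_code': c} for c in ('11-1011.00', '13-1071.00', '11-2021.00')]
sample = list(weighted_reservoir(postings, 2, {'11': 2, '13': 1},
                                 key=lambda job: job['onet_soc_code'][:2]))
```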
diff --git a/skills_ml/evaluation/embedding_metrics.py b/skills_ml/evaluation/embedding_metrics.py
index 156b9d30d..d9277a579 100644
--- a/skills_ml/evaluation/embedding_metrics.py
+++ b/skills_ml/evaluation/embedding_metrics.py
@@ -42,7 +42,7 @@ def eval(self, vectorization: Callable) -> Dict:
         result = {}
         for concept, entities in self.clustering.items():
             centroid = np.average([vectorization(entity[1]) for entity in entities], axis=0)
-            result[concept] = distance.cosine(vectorization(concept), centroid)
+            result[concept] = distance.cosine(vectorization(concept), centroid).astype(float)
         self.eval_result = result
         return result

@@ -65,7 +65,7 @@ def eval(self, vectorization: Callable) -> Dict:
         result = {}
         for concept, entities in self.clustering.items():
             entities_vec = [vectorization(entity[1]) for entity in entities]
             centroid = np.average(entities_vec, axis=0)
-            result[concept] = np.sum((entities_vec - centroid)**2)
+            result[concept] = np.sum((entities_vec - centroid)**2).astype(float)
         self.eval_result = result
         return result
diff --git a/skills_ml/evaluation/occ_cls_evaluator.py b/skills_ml/evaluation/occ_cls_evaluator.py
index 0b3f5cc21..ceb80fede 100644
--- a/skills_ml/evaluation/occ_cls_evaluator.py
+++ b/skills_ml/evaluation/occ_cls_evaluator.py
@@ -12,7 +12,7 @@ def __init__(self, result_generator):
         else:
             self.target_variable = self.result_generator.target_variable
             self.labels = self.target_variable.choices
-        self.result = np.array(list(result_generator))
+        self.result = np.array(list(self.result_generator))

     @cachedproperty
     def y_pred(self):
@@ -68,14 +68,13 @@ def micro_f1(self):


 class OnetOccupationClassificationEvaluator(ClassificationEvaluator):
-    def __init__(self,result_generator):
+    def __init__(self, result_generator):
         super().__init__(result_generator)
-        if not hasattr(self.result_generator,'target_variable'):
+        if not hasattr(self.result_generator, 'target_variable'):
             raise AttributeError("the result_generator should have target_variable property")
         else:
             self.target_variable = self.result_generator.target_variable
             self.labels = self.target_variable.choices
-        self.result = np.array(list(result_generator))

     @cachedproperty
     def _result_for_major_group(self):
diff --git a/skills_ml/job_postings/sample.py b/skills_ml/job_postings/sample.py
index 83730dd7c..4426805c3 100644
--- a/skills_ml/job_postings/sample.py
+++ b/skills_ml/job_postings/sample.py
@@ -14,47 +14,26 @@ class JobSampler(object):
     Attributes:
         job_posting_generator (iterator): Job posting iterator to sample from.
         k (int): number of documents to sample
-        major_group (bool): A flag for using major_group as a label or not
-        keys (list|str): a key or keys(for nested dictionary) indicates the label which should exist in common schema
-              of job posting.
         weights (dict): a dictionary that has key-value pairs as label-weighting pairs. It expects every label
             in the iterator to be present as a key in the weights dictionary For example, weights = {'11': 2, '13', 1}.
            In this case, the label/key is the occupation major group and the value is the weight you want to sample with.
+        key (callable): a function called on each element to produce the label used as a key into the weights dictionary
         random_state (int): the seed used by the random number generator

     """
-    def __init__(self, job_posting_generator, k, major_group=False, keys=None, weights=None, random_state=None):
+    def __init__(self, job_posting_generator, k, weights=None, key=lambda x: x, random_state=None):
         self.job_posting_generator = job_posting_generator
         self.k = k
-        self.major_group = major_group
+        self.key = key
         self.weights = weights
-        self.keys = keys
         self.random_state = random_state
         if random_state:
             np.random.seed(random_state)
             random.seed(random_state)

-    def _transform_generator(self, job_posting_generator):
-        if isinstance(self.keys, list):
-            for job in job_posting_generator:
-                yield (job, safe_get(job, *self.keys))
-        elif isinstance(self.keys, str):
-            for job in job_posting_generator:
-                yield (job, job[self.keys])
-        elif self.major_group:
-            for job in job_posting_generator:
-                try:
-                    yield (job, job['onet_soc_code'][:2])
-                except TypeError:
-                    yield (job, None)
-        else:
-            for job in job_posting_generator:
-                yield (job, )
-
     def __iter__(self):
-        it = self._transform_generator(self.job_posting_generator)
         if self.weights:
-            yield from reservoir_weighted(it, self.k, self.weights)
+            yield from reservoir_weighted(self.job_posting_generator, self.k, self.weights, self.key)
         else:
-            yield from reservoir(it, self.k)
+            yield from reservoir(self.job_posting_generator, self.k)
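The `JobSampler` refactor above removes the `(job, label)` tuple plumbing: the sampler now yields plain postings, and label extraction belongs to the caller via `key`. A before/after sketch of a call site, assuming a hypothetical `postings` iterable of job dicts:

```python
from skills_ml.job_postings.sample import JobSampler

# Before this change, label logic was baked into the sampler:
# js = JobSampler(postings, k=100, major_group=True, weights={'11': 2, '13': 1})

# After, the caller passes the label function explicitly:
js = JobSampler(
    job_posting_generator=postings,
    k=100,
    weights={'11': 2, '13': 1},
    key=lambda job: job['onet_soc_code'][:2],
)
sample = list(js)  # items are the postings themselves, not (job, label) tuples

# Unweighted sampling needs no key; the identity default is never consulted.
uniform = JobSampler(job_posting_generator=postings, k=100)
```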
diff --git a/skills_ml/ontologies/onet.py b/skills_ml/ontologies/onet.py
index 2999bb2bc..58dc79e67 100644
--- a/skills_ml/ontologies/onet.py
+++ b/skills_ml/ontologies/onet.py
@@ -5,29 +5,29 @@ import logging

 majorgroupname = {
-    '11': 'Management Occupations',
-    '13': 'Business and Financial Operations Occupations',
-    '15': 'Computer and Mathematical Occupations',
-    '17': 'Architecture and Engineering Occupations',
-    '19': 'Life, Physical, and Social Science Occupations',
-    '21': 'Community and Social Service Occupations',
-    '23': 'Legal Occupations',
-    '25': 'Education, Training, and Library Occupations',
-    '27': 'Arts, Design, Entertainment, Sports, and Media Occupations',
-    '29': 'Healthcare Practitioners and Technical Occupations',
-    '31': 'Healthcare Support Occupations',
-    '33': 'Protective Service Occupations',
-    '35': 'Food Preparation and Serving Related Occupations',
+    '11': 'Management',
+    '13': 'Business and Financial Operations',
+    '15': 'Computer and Mathematical',
+    '17': 'Architecture and Engineering',
+    '19': 'Life, Physical, and Social Science',
+    '21': 'Community and Social Service',
+    '23': 'Legal',
+    '25': 'Education, Training, and Library',
+    '27': 'Arts, Design, Entertainment, Sports, and Media',
+    '29': 'Healthcare Practitioners and Technical',
+    '31': 'Healthcare Support',
+    '33': 'Protective Service',
+    '35': 'Food Preparation and Serving Related',
     '37': 'Building and Grounds Cleaning and Maintenance',
-    '39': 'Personal Care and Service Occupations',
-    '41': 'Sales and Related Occupations',
-    '43': 'Office and Administrative Support Occupations',
-    '45': 'Farming, Fishing, and Forestry Occupations',
-    '47': 'Construction and Extraction Occupations',
-    '49': 'Installation, Maintenance, and Repair Occupations',
-    '51': 'Production Occupations',
-    '53': 'Transportation and Material Moving Occupations',
-    '55': 'Military Specific Occupations'
+    '39': 'Personal Care and Service',
+    '41': 'Sales and Related',
+    '43': 'Office and Administrative Support',
+    '45': 'Farming, Fishing, and Forestry',
+    '47': 'Construction and Extraction',
+    '49': 'Installation, Maintenance, and Repair',
+    '51': 'Production',
+    '53': 'Transportation and Material Moving',
+    '55': 'Military Specific'
 }
diff --git a/tests/job_postings/test_job_sampler.py b/tests/job_postings/test_job_sampler.py
index 0be655e1c..86bb77ae6 100644
--- a/tests/job_postings/test_job_sampler.py
+++ b/tests/job_postings/test_job_sampler.py
@@ -1,4 +1,5 @@
 from skills_ml.job_postings.sample import JobSampler
+from skills_ml.job_postings.filtering import JobPostingFilterer
 from skills_utils.common import safe_get
 import gensim
 from collections import Counter
@@ -85,35 +86,35 @@ def test_soc(self):
         result = []
         for i in range(self.num_loops):
-            result.extend(list(map(lambda x: x[0]['onet_soc_code'], js)))
+            result.extend(list(map(lambda x: x['onet_soc_code'], js)))

         counts = dict(Counter(result))
         assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / self.occ_num

     def test_state(self):
+        transformer = lambda job: safe_get(job, 'jobLocation', 'address', 'addressRegion')
         js = JobSampler(
             job_posting_generator=self.fake_corpus_train,
             k=self.sample_size,
-            keys=['jobLocation', 'address', 'addressRegion']
         )
         result = []
         for i in range(self.num_loops):
-            result.extend(list(map(lambda x: x[1], js)))
+            result.extend(list(map(lambda x: transformer(x), js)))

         counts = dict(Counter(result))
         assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / len(self.states)

     def test_employment_type(self):
+        transformer = lambda job: safe_get(job, 'employmentType')
         js = JobSampler(
             job_posting_generator=self.fake_corpus_train,
             k=self.sample_size,
-            keys='employmentType'
         )
         result = []
         for i in range(self.num_loops):
-            result.extend(list(map(lambda x: x[1], js)))
+            result.extend(list(map(lambda x: transformer(x), js)))

         counts = dict(Counter(result))
         assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / len(self.employment_type)

@@ -131,15 +132,23 @@ def test_major_group(self):
         ratio = self.weights['13'] / self.weights['11']

+        major_group_filter = lambda job: job['onet_soc_code'][:2] in ['11', '13']
+
+        filtered_jobposting = JobPostingFilterer(
+            self.fake_corpus_train,
+            [major_group_filter]
+        )
+
         js = JobSampler(
-            job_posting_generator=self.fake_corpus_train,
+            job_posting_generator=filtered_jobposting,
             k=self.sample_size,
             weights=self.weights,
-            major_group=True)
+            key=lambda job: job['onet_soc_code'][:2]
+        )
         result = []
         for i in range(self.num_loops):
-            r = list(map(lambda x: x[1][:2], js))
+            r = list(map(lambda x: x['onet_soc_code'][:2], js))
             counts = dict(Counter(r))
             result.append(counts['13'] / counts['11'])
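Note the division of labor the updated test encodes: since `reservoir_weighted` looks each sampled posting's label up in `weights` (and would raise `KeyError` on an unknown label), postings are first narrowed to the labels of interest with `JobPostingFilterer`. A condensed sketch of that composition, again with a hypothetical `postings` iterable:

```python
from skills_ml.job_postings.filtering import JobPostingFilterer
from skills_ml.job_postings.sample import JobSampler

major_group = lambda job: job['onet_soc_code'][:2]

# Keep only groups that appear in the weights dict, then sample with ~2:1 odds.
filtered = JobPostingFilterer(postings, [lambda job: major_group(job) in ['11', '13']])
js = JobSampler(
    job_posting_generator=filtered,
    k=100,
    weights={'11': 2, '13': 1},
    key=major_group,
)
sample = list(js)  # group '11' should appear roughly twice as often as '13'
```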
diff --git a/tests/ontologies/test_clustering.py b/tests/ontologies/test_clustering.py
index 213734a27..3ac9a8005 100644
--- a/tests/ontologies/test_clustering.py
+++ b/tests/ontologies/test_clustering.py
@@ -34,11 +34,11 @@ def test_basic(self):
         assert set(occupation_clustering.keys()) == set([major_group_37_concept.name, major_group_35_concept.name])
         assert occupation_clustering["Building and Grounds Cleaning and Maintenance"] == major_group_37_entities
         assert occupation_clustering.map_raw_key["Building and Grounds Cleaning and Maintenance"] == major_group_37_concept
-        assert occupation_clustering["Food Preparation and Serving Related Occupations"] == major_group_35_entities
-        assert occupation_clustering.map_raw_key["Food Preparation and Serving Related Occupations"] == major_group_35_concept
+        assert occupation_clustering["Food Preparation and Serving Related"] == major_group_35_entities
+        assert occupation_clustering.map_raw_key["Food Preparation and Serving Related"] == major_group_35_concept

         # Delete
-        del occupation_clustering["Food Preparation and Serving Related Occupations"]
+        del occupation_clustering["Food Preparation and Serving Related"]
         assert len(occupation_clustering) == 1

         # Iterable