Fix Train Test Samples Mix (deepchecks#695)
* Fix Train Test Samples Mix

* Fix test

* Add nan to spelling list

* test tests

* Fix nan problem

* Fix typo
matanper authored Jan 19, 2022
1 parent deb4e11 commit eb49b84
Showing 3 changed files with 74 additions and 79 deletions.
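
The key API change in this commit: the check's CheckResult value becomes a dict instead of a bare float, so downstream code must read the 'ratio' key. A minimal sketch of the new contract (the `check`, `train_dataset`, and `test_dataset` names are placeholders, set up as in the tests further down; only the dict shape comes from the diff itself):

    # `check` stands for an instance of the Train Test Samples Mix check
    value = check.run(test_dataset=test_dataset, train_dataset=train_dataset).value
    dup_ratio = value['ratio']       # float: share of test samples that also appear in train
    duplicates_df = value['data']    # pandas DataFrame listing the duplicated samples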
143 changes: 68 additions & 75 deletions deepchecks/checks/methodology/train_test_samples_mix.py
@@ -9,10 +9,7 @@
 # ----------------------------------------------------------------------------
 #
 """The data_sample_leakage_report check module."""
-from typing import Dict, List
-import re
-
-import numpy as np
+from typing import List
 import pandas as pd
 
 from deepchecks import Dataset
@@ -55,54 +52,37 @@ def _data_sample_leakage_report(self, test_dataset: Dataset, train_dataset: Dataset):

         columns = features + [label_name]
 
-        train_f = train_dataset.data.copy()
-        test_f = test_dataset.data.copy()
-
-        train_dups = _get_dup_indexes_map(train_f, columns)
-        train_f.index = [f'Train indices: {_get_dup_txt(i, train_dups)}' for i in train_f.index]
-        train_f.drop_duplicates(columns, inplace=True)
-
-        test_dups = _get_dup_indexes_map(test_f, columns)
-        test_f.index = [f'Test indices: {_get_dup_txt(i, test_dups)}' for i in test_f.index]
-        test_f.drop_duplicates(columns, inplace=True)
+        # For pandas.groupby in python 3.6 there is a problem comparing numpy nan, so replace it with a placeholder
+        na_filler = '__deepchecks_na_filler__'
+        train_df = train_dataset.data.fillna(value=na_filler)
+        test_df = test_dataset.data.fillna(value=na_filler)
 
-        appended_df = train_f.append(test_f)
-        duplicate_rows_df = appended_df[appended_df.duplicated(columns, keep=False)]
-        duplicate_rows_df.sort_values(columns, inplace=True)
+        train_uniques = _create_unique_frame(train_df, columns, text_prefix='Train indices: ')
+        test_uniques = _create_unique_frame(test_df, columns, text_prefix='Test indices: ')
 
-        count_val_array = np.zeros((duplicate_rows_df.shape[0],))
-        idx_in_array = 0
-        for index in duplicate_rows_df.index:
-            if index.startswith('Test'):
-                if 'Tot.' not in index:
-                    count_val_array[idx_in_array] = len(index.split(','))
-                else:
-                    count_val_array[idx_in_array] = int(re.findall(r'Tot. (\d+)', index)[0])
-                count_val_array[idx_in_array + 1] = count_val_array[idx_in_array]
-                idx_in_array += 2
+        duplicates_df, test_dup_count = _create_train_test_joined_duplicate_frame(train_uniques, test_uniques, columns)
 
-        duplicate_rows_df = duplicate_rows_df.iloc[np.flip(count_val_array.argsort()), :]
-
-        count_dups = count_val_array.sum() // 2
-
-        dup_ratio = count_dups / test_dataset.n_samples
-        user_msg = f'{format_percent(dup_ratio)} ({count_dups} / {test_dataset.n_samples}) \
+        # Replace the filler back with None
+        duplicates_df = duplicates_df.applymap(lambda x: None if x == na_filler else x)
+        dup_ratio = test_dup_count / test_dataset.n_samples
+        user_msg = f'{format_percent(dup_ratio)} ({test_dup_count} / {test_dataset.n_samples}) \
 of test data samples appear in train data'
-        display = [user_msg, duplicate_rows_df.head(10)] if dup_ratio else None
-
-        return CheckResult(dup_ratio, header='Train Test Samples Mix', display=display)
+        display = [user_msg, duplicates_df.head(10)] if dup_ratio else None
+        result = {'ratio': dup_ratio, 'data': duplicates_df}
+        return CheckResult(result, header='Train Test Samples Mix', display=display)

     def add_condition_duplicates_ratio_not_greater_than(self, max_ratio: float = 0.1):
         """Add condition - require max allowed ratio of test data samples to appear in train data.
 
         Args:
             max_ratio (float): Max allowed ratio of test data samples to appear in train data
         """
-        def condition(result: float) -> ConditionResult:
-            if result > max_ratio:
+        def condition(result: dict) -> ConditionResult:
+            ratio = result['ratio']
+            if ratio > max_ratio:
                 return ConditionResult(False,
                                        f'Percent of test data samples that appear in train data: '
-                                       f'{format_percent(result)}')
+                                       f'{format_percent(ratio)}')
             else:
                 return ConditionResult(True)

@@ -111,41 +91,54 @@ def condition(result: float) -> ConditionResult:
                                   condition)
 
 
-def _get_dup_indexes_map(df: pd.DataFrame, columns: List[Hashable]) -> Dict:
-    """Find duplicated indexes in the dataframe.
-
-    Args:
-        df: a Dataframe object of the dataset
-        columns: list of columns that duplicates are defined by
-
-    Returns:
-        dictionary mapping each first index to its duplicated indexes
-    """
-    dup = df[df.duplicated(columns, keep=False)].groupby(columns).groups.values()
-    dup_map = {}
-    for i_arr in dup:
-        key = i_arr[0]
-        dup_map[key] = [int(i) for i in i_arr[1:]]
-    return dup_map
-
-
-def _get_dup_txt(i: int, dup_map: Dict) -> str:
-    """Return a prettified text for a key in the dict.
-
-    Args:
-        i: the index key
-        dup_map: the dict of the duplicated indexes
-
-    Returns:
-        prettified text for a key in the dict
-    """
-    val = dup_map.get(i)
-    if not val:
-        return str(i)
-    txt = f'{i}, '
-    for j in val:
-        txt += f'{j}, '
-    txt = txt[:-2]
-    if len(txt) < 30:
-        return txt
-    return f'{txt[:30]}.. Tot. {(1 + len(val))}'
+def _create_train_test_joined_duplicate_frame(first: pd.DataFrame, second: pd.DataFrame, columns: List[Hashable]):
+    """Create a duplicates dataframe out of 2 unique dataframes.
+
+    This function accepts 2 dataframes produced by `_create_unique_frame`, so each dataframe has no
+    duplicates within itself; any duplicates found in their concatenation are therefore necessarily
+    between the two.
+    """
+    columns_data = []
+    index_text = []
+    total_test_count = 0
+    group_unique_data: dict = pd.concat([first, second]).groupby(columns, dropna=False).groups
+    # The group data is backward (the columns are the indexes, and the indexes are the values)
+    for duplicate_columns, indexes in group_unique_data.items():
+        # If length is 1, then no duplicate was found between first and second
+        if len(indexes) == 1:
+            continue
+        # The indexes hold the train & test info dicts created by `_get_dup_info`
+        text = indexes[0]['text'] + '\n' + indexes[1]['text']
+        # Take the count only of test
+        test_count = indexes[0]['count'] if indexes[0]['text'].startswith('Test') else indexes[1]['count']
+        total_test_count += test_count
+        # Save the duplicate's index text and its columns data
+        columns_data.append([*duplicate_columns, test_count])
+        index_text.append(text)
+
+    count_column_name = '_value_to_sort_by_'
+    duplicates = pd.DataFrame(columns_data, index=index_text, columns=[*columns, count_column_name])
+    duplicates = duplicates.sort_values(by=count_column_name, ascending=False)
+    duplicates = duplicates.drop(count_column_name, axis=1)
+    return duplicates, total_test_count
+
+
+def _create_unique_frame(df: pd.DataFrame, columns: List[Hashable], text_prefix: str = '') -> pd.DataFrame:
+    """For a given dataframe and columns, create a dataframe containing only unique combinations of the columns."""
+    columns_data = []
+    index_text = []
+    group_unique_data: dict = df.groupby(columns, dropna=False).groups
+    # The group data is backward (the columns are the indexes, and the indexes are the values)
+    for duplicate_columns, indexes in group_unique_data.items():
+        # Save the combination's index text and its columns data
+        columns_data.append(duplicate_columns)
+        index_text.append(_get_dup_info(indexes, text_prefix))
+
+    return pd.DataFrame(columns_data, index=index_text, columns=columns)
+
+
+def _get_dup_info(index_arr: list, text_prefix: str) -> dict:
+    text = ', '.join([str(i) for i in index_arr])
+    if len(text) > 30:
+        text = f'{text[:30]}.. Tot. {(len(index_arr))}'
+
+    return {'text': f'{text_prefix}{text}', 'count': len(index_arr)}
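
The na_filler placeholder above works around NaN handling in groupby keys: NaN compares unequal to itself, and with the default dropna=True pandas drops NaN-keyed groups entirely, so duplicate rows containing missing values could go uncounted. A standalone sketch of the failure mode and the workaround, using toy data rather than anything from the repo:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [1, 1, np.nan, np.nan], 'b': ['x', 'x', 'y', 'y']})

    # Default groupby silently drops the NaN-keyed group, losing rows 2 and 3:
    print(df.groupby(['a', 'b']).groups)  # only the (1.0, 'x') group remains

    # Replacing NaN with a sentinel keeps those rows groupable on any pandas version:
    filler = '__deepchecks_na_filler__'
    print(df.fillna(value=filler).groupby(['a', 'b']).groups)  # both groups appear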
1 change: 1 addition & 0 deletions spelling-allowlist.txt
@@ -31,3 +31,4 @@ dtypes
 groupby
 r2
 tqdm
+nan
9 changes: 5 additions & 4 deletions tests/checks/methodology/train_test_samples_mix_test.py
@@ -14,7 +14,7 @@
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import train_test_split
-from hamcrest import assert_that, calling, raises, equal_to, has_items
+from hamcrest import assert_that, calling, raises, equal_to, has_items, has_entry
 
 from deepchecks.base import Dataset
 from deepchecks.errors import DeepchecksValueError
@@ -49,7 +49,8 @@ def test_no_leakage(iris_clean):
     # Act X
     result = check.run(test_dataset=test_dataset, train_dataset=train_dataset).value
     # Assert
-    assert_that(result, equal_to(0))
+    assert_that(result, has_entry('ratio', 0))
+
 
 def test_leakage(iris_clean):
     x = iris_clean.data
@@ -70,7 +71,7 @@ def test_leakage(iris_clean):
     # Act X
     result = check.run(test_dataset=test_dataset, train_dataset=train_dataset).value
     # Assert
-    assert_that(result, equal_to(0.1))
+    assert_that(result, has_entry('ratio', 0.1))
 
 
 def test_nan():
@@ -83,7 +84,7 @@ def test_nan():
     # Act X
     result = check.run(test_dataset=test_dataset, train_dataset=train_dataset).value
     # Assert
-    assert_that(result, equal_to(0.5))
+    assert_that(result, has_entry('ratio', 0.5))
 
 
 def test_condition_ratio_not_greater_than_not_passed(iris_clean):
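The test updates mirror the new value shape: equal_to on a bare float becomes has_entry on the result dict. PyHamcrest's has_entry(key, value) matches any mapping that contains the given key/value pair, for example:

    from hamcrest import assert_that, has_entry

    assert_that({'ratio': 0.5, 'data': None}, has_entry('ratio', 0.5))  # passes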
