From 11ff08f3b5bd7ae24b19fa408790caa09a34780f Mon Sep 17 00:00:00 2001
From: Simon Stiebellehner
Date: Sun, 22 Oct 2017 16:18:14 +0200
Subject: [PATCH 1/2] added MSLR-WEB dataset compatibility

---
NOTE(review): line breaks, indentation and blank lines of this series were
reconstructed from a whitespace-collapsed copy; confirm the context lines
against pyltr/data/pandas_converter.py before applying.
NOTE(review): the added code uses the label-based .loc indexer instead of the
deprecated .ix indexer, so the post-image blob id on the index line below is
stale.

 pyltr/data/pandas_converter.py | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/pyltr/data/pandas_converter.py b/pyltr/data/pandas_converter.py
index a657cf8..35069cc 100644
--- a/pyltr/data/pandas_converter.py
+++ b/pyltr/data/pandas_converter.py
@@ -1,11 +1,10 @@
-#%%
 import pandas as pd
 
 
 class PandasLetorConverter(object):
     '''
-    Class Converter implements parsing from original letor txt files to
-    pandas data frame representation.
+    Class Converter implements parsing from original MSLR-WEB and LETOR
+    txt files to pandas data frame representation.
     '''
 
     def __init__(self, path):
@@ -31,7 +30,6 @@ def path(self, p):
     def _load_file(self):
         '''
         Loads and parses raw letor txt file.
-
         Return:
             letor txt file parsed to csv in raw format
         '''
@@ -42,7 +40,6 @@ def _drop_col(self, df):
         '''
         Drops last column, which was added in the parsing procedure due to a
         trailing white space for each sample in the text file
-
         Arguments:
             df: pandas dataframe
         Return:
@@ -56,25 +53,45 @@ def _split_colon(self, df):
         Splits the data on the colon and transforms it into a tabular format
         where columns are features and rows samples. Cells represent feature
         values per sample.
-
         Arguments:
             df: pandas dataframe object
         Return:
             df: original df with string pattern ':' removed; columns named
             appropriately
         '''
+
+        tracker = 0
+
+        # Ensures compatibility with MSLR-WEBK datasets
         for col in range(1,len(df.columns)):
-            df.loc[:,col] = df.loc[:,col].apply(lambda x: str(x).split(':')[1])
+            tracker += 1
+            if ':' in str(df.loc[0,col]):
+                df.loc[:,col] = df.loc[:,col].apply(lambda x: str(x).split(':')[1])
+            else:
+                break
+                #tracker = col
+
         df.columns = ['rel', 'qid'] + [str(x) for x in range(1,len(df.columns)-1)] # renaming cols
+
+        # Ensures compatibility with LETOR datasets
+        if tracker != len(df.columns)-1:
+            newcols = []
+            for col in df.columns:
+                test = df.loc[0,col]
+                if ('docid' in str(test)) or ('inc' in str(test)) or ('prob' in str(test)) or ('=' in str(test)):
+                    newcols.append(test)
+                    df = df.drop(str(col), axis=1)
+            newcols = [x for x in newcols if '=' not in x]
+            df.columns.values[-len(newcols):] = newcols
+
         return df
 
     def convert(self):
         '''
         Performs final conversion.
-
         Return:
             fully converted pandas dataframe
         '''
         df_raw = self._load_file()
         df_drop = self._drop_col(df_raw)
-        return self._split_colon(df_drop)
+        return self._split_colon(df_drop)
\ No newline at end of file

From 22f0f588e9372f5e04f8dec286cbc680660f324b Mon Sep 17 00:00:00 2001
From: Simon Stiebellehner
Date: Sat, 4 Nov 2017 20:30:41 +0100
Subject: [PATCH 2/2] added pairwise_transform function

---
NOTE(review): pairwise_transform was reworked during review (lazy iteration
over the index pairs, sample-count check, shaped empty result, file ends with
a newline); the blob id on the index line below is stale.

 pyltr/data/pairwise_transform.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 pyltr/data/pairwise_transform.py

diff --git a/pyltr/data/pairwise_transform.py b/pyltr/data/pairwise_transform.py
new file mode 100644
index 0000000..1af6c44
--- /dev/null
+++ b/pyltr/data/pairwise_transform.py
@@ -0,0 +1,55 @@
+import itertools
+
+import numpy as np
+
+
+def pairwise_transform(x, y):
+    '''
+    Performs the pairwise transformation of the input data as described in
+    Herbrich, R., Graepel, T., & Obermayer, K. (1999).
+    Support vector learning for ordinal regression.
+
+    Every pair of samples (i, j), i before j, with different labels yields
+    one new sample (x_i - x_j, sign(y_i - y_j)). Pairs with equal labels are
+    skipped. Every other retained pair, starting with the first, is negated
+    (difference and label alike), which is meant to balance the label signs.
+
+    WARNING: the number of pairs grows quadratically with num_samples.
+
+    Arguments:
+        x: input data as list, pandas dataframe or numpy array of shape
+           (num_samples, num_features)
+        y: labels as list, pandas dataframe or numpy array of shape
+           (num_samples)
+    Return:
+        xpair: input data after pairwise transform as numpy array of shape
+               (num_pairs, num_features)
+        ypair: labels after pairwise transform (values either -1 or 1) as
+               numpy array of shape (num_pairs)
+    Raises:
+        ValueError: if x and y do not contain the same number of samples
+    '''
+    x = np.asarray(x)
+    y = np.asarray(y)
+    if x.shape[0] != y.shape[0]:
+        raise ValueError('x and y must contain the same number of samples')
+
+    xpair, ypair = [], []
+    negate = True  # the first retained pair is negated, then every other one
+    # Pairs are consumed lazily; only the retained differences are stored.
+    for i, j in itertools.combinations(range(x.shape[0]), 2):
+        if y[i] == y[j]:
+            continue  # ties carry no ordering information
+        diff = x[i] - x[j]
+        label = np.sign(y[i] - y[j])
+        if negate:
+            diff, label = np.negative(diff), np.negative(label)
+        xpair.append(diff)
+        ypair.append(label)
+        negate = not negate
+
+    if not xpair:
+        # keep the trailing axes so an empty result has a consistent shape
+        return (np.empty((0,) + x.shape[1:], dtype=x.dtype),
+                np.empty((0,) + y.shape[1:], dtype=y.dtype))
+    return np.asarray(xpair), np.asarray(ypair)