Merge pull request jma127#10 from stiebels/master

added MSLR-WEB dataset compatibility to LETOR converter; added pairwise transformation function
kretes · Nov 5, 2017 · 78fa0eb · 78fa0eb
2 parents b13d330 + 22f0f58
commit 78fa0eb
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 9 deletions.
diff --git a/pyltr/data/pairwise_transform.py b/pyltr/data/pairwise_transform.py
@@ -0,0 +1,47 @@
+import numpy as np
+import itertools
+
+
+def pairwise_transform(x, y):
+
+    '''
+    Performs the pairwise transformation of the input data as described in 
+    Herbrich, R., Graepel, T., & Obermayer, K. (1999). Support vector learning for ordinal regression.
+    Each pair (i, j) with y_i != y_j yields (x_i - x_j, sign(y_i - y_j)); every other such pair is negated to balance labels.
+    
+    WARNING: may be computationally expensive, because all index pairs from
+    itertools.combinations are materialized and their number grows quadratically with num_samples
+    
+    Arguments:
+        x: input data as list, pandas dataframe or numpy array of shape (num_samples, num_features)
+        y: labels as list, pandas dataframe or numpy array of shape (num_samples)
+    Return:
+        xpair: input data after pairwise transform as numpy array of shape (num_pairs, num_features)
+        ypair: labels after pairwise transform (values either -1 or 1) as numpy array of shape (num_pairs)
+    '''
+
+
+    x = np.asarray(x)
+    y = np.asarray(y)
+
+    comb_iter = itertools.combinations(range(x.shape[0]), 2)
+    comb_vals = [row for row in comb_iter]
+
+    xpair, ypair = list(), list()
+    balance = False
+
+    for i,j in comb_vals:
+        if y[i] == y[j]:
+            continue
+        else:
+            xpair.append(x[i] - x[j])
+            ypair.append(np.sign(y[i] - y[j]))
+        if balance == True:
+            balance = False
+            continue
+        else:
+            balance = True
+            xpair[-1] = np.negative(xpair[-1])
+            ypair[-1] = np.negative(ypair[-1])
+
+    return np.asarray(xpair), np.asarray(ypair)
diff --git a/pyltr/data/pandas_converter.py b/pyltr/data/pandas_converter.py
@@ -1,11 +1,10 @@
-#%%
 import pandas as pd
 
 
 class PandasLetorConverter(object):
     '''
-    Class Converter implements parsing from original letor txt files to
-    pandas data frame representation.
+    Class Converter implements parsing from original MSLR-WEB and LETOR
+    txt files to pandas data frame representation.
     '''
 
     def __init__(self, path):
@@ -31,7 +30,6 @@ def path(self, p):
     def _load_file(self):
         '''
         Loads and parses raw letor txt file.
-
         Return:
             letor txt file parsed to csv in raw format
         '''
@@ -42,7 +40,6 @@ def _drop_col(self, df):
         '''
         Drops last column, which was added in the parsing procedure due to a
         trailing white space for each sample in the text file
-
         Arguments:
             df: pandas dataframe
         Return:
@@ -56,25 +53,45 @@ def _split_colon(self, df):
         Splits the data on the colon and transforms it into a tabular format
         where columns are features and rows samples. Cells represent feature
         values per sample.
-
         Arguments:
             df: pandas dataframe object
         Return:
             df: original df with string pattern ':' removed; columns named appropriately
         '''
+
+        tracker = 0
+
+        # Ensures compatibility with MSLR-WEB datasets
         for col in range(1,len(df.columns)):
-            df.loc[:,col] = df.loc[:,col].apply(lambda x: str(x).split(':')[1])
+            tracker += 1
+            if ':' in str(df.ix[:,col][0]):
+                df.ix[:,col] = df.ix[:,col].apply(lambda x: str(x).split(':')[1])
+            else:
+                break
+                #tracker = col
+
         df.columns = ['rel', 'qid'] + [str(x) for x in range(1,len(df.columns)-1)] # renaming cols
+
+        # Ensures compatibility with LETOR datasets
+        if tracker != len(df.columns)-1:
+            newcols = []
+            for col in df.columns:
+                test = df.ix[0,col]
+                if ('docid' in str(test)) or ('inc' in str(test)) or ('prob' in str(test)) or ('=' in str(test)):
+                    newcols.append(test)
+                    df = df.drop(str(col), axis=1)
+            newcols = [x for x in newcols if '=' not in x]
+            df.columns.values[-len(newcols):] = newcols
+
         return df
 
 
     def convert(self):
         '''
         Performs final conversion.
-
         Return:
             fully converted pandas dataframe
         '''
         df_raw = self._load_file()
         df_drop = self._drop_col(df_raw)
-        return self._split_colon(df_drop)
+        return self._split_colon(df_drop)