Skip to content

Commit

Permalink
Merge pull request jma127#10 from stiebels/master
Browse files Browse the repository at this point in the history
added MSLR-WEB dataset compatibility to LETOR converter; added pairwise transformation function
  • Loading branch information
jma127 authored Nov 5, 2017
2 parents b13d330 + 22f0f58 commit 78fa0eb
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 9 deletions.
47 changes: 47 additions & 0 deletions pyltr/data/pairwise_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import numpy as np
import itertools


def pairwise_transform(x, y):
    '''
    Performs the pairwise transformation of the input data as described in
    Herbrich, R., Graepel, T., & Obermayer, K. (1999). Support vector
    learning for ordinal regression.

    Every pair of samples (i, j) with i < j and y_i != y_j yields one
    output sample:

        (x_i - x_j, sign(y_i - y_j))

    Pairs with equal labels are dropped. To keep the two output classes
    balanced, every other retained pair (starting with the first one) is
    mirrored, i.e. both its feature difference and its label are negated.

    WARNING: may be computationally expensive, since the number of pairs
    grows quadratically with the number of samples.

    Arguments:
        x: input data as list, pandas dataframe or numpy array of shape
           (num_samples, num_features)
        y: labels as list, pandas dataframe or numpy array of shape
           (num_samples)

    Return:
        xpair: input data after pairwise transform as numpy array of shape
               (num_pairs, num_features)
        ypair: labels after pairwise transform (values either -1 or 1) as
               numpy array of shape (num_pairs)

        If no pair has differing labels, empty arrays of shape
        (0, num_features) and (0,) are returned.
    '''
    x = np.asarray(x)
    y = np.asarray(y)

    xpair, ypair = [], []

    # The first retained pair is mirrored, the second is kept as-is, and so
    # on, so that roughly half of the output labels end up in each class.
    mirror = True

    # Consume the combinations lazily; materialising them first would hold
    # O(num_samples ** 2) index tuples in memory for no benefit.
    for i, j in itertools.combinations(range(x.shape[0]), 2):
        if y[i] == y[j]:
            # Equal labels carry no ordering information.
            continue

        diff = x[i] - x[j]
        label = np.sign(y[i] - y[j])
        if mirror:
            diff = np.negative(diff)
            label = np.negative(label)
        mirror = not mirror

        xpair.append(diff)
        ypair.append(label)

    if not xpair:
        # np.asarray([]) would lose the feature dimension; keep the
        # documented (num_pairs, num_features) layout for zero pairs too.
        return np.empty((0,) + x.shape[1:]), np.empty((0,))

    return np.asarray(xpair), np.asarray(ypair)
35 changes: 26 additions & 9 deletions pyltr/data/pandas_converter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#%%
import pandas as pd


class PandasLetorConverter(object):
'''
Class Converter implements parsing from original letor txt files to
pandas data frame representation.
Class Converter implements parsing from original MSLR-WEB and LETOR
txt files to pandas data frame representation.
'''

def __init__(self, path):
Expand All @@ -31,7 +30,6 @@ def path(self, p):
def _load_file(self):
'''
Loads and parses raw letor txt file.
Return:
letor txt file parsed to csv in raw format
'''
Expand All @@ -42,7 +40,6 @@ def _drop_col(self, df):
'''
Drops last column, which was added in the parsing procedure due to a
trailing white space for each sample in the text file
Arguments:
df: pandas dataframe
Return:
Expand All @@ -56,25 +53,45 @@ def _split_colon(self, df):
Splits the data on the colon and transforms it into a tabular format
where columns are features and rows samples. Cells represent feature
values per sample.
Arguments:
df: pandas dataframe object
Return:
df: original df with string pattern ':' removed; columns named appropriately
'''

tracker = 0

# Ensures compatibility with MSLR-WEBK datasets
for col in range(1,len(df.columns)):
df.loc[:,col] = df.loc[:,col].apply(lambda x: str(x).split(':')[1])
tracker += 1
if ':' in str(df.ix[:,col][0]):
df.ix[:,col] = df.ix[:,col].apply(lambda x: str(x).split(':')[1])
else:
break
#tracker = col

df.columns = ['rel', 'qid'] + [str(x) for x in range(1,len(df.columns)-1)] # renaming cols

# Ensures compatibility with LETOR datasets
if tracker != len(df.columns)-1:
newcols = []
for col in df.columns:
test = df.ix[0,col]
if ('docid' in str(test)) or ('inc' in str(test)) or ('prob' in str(test)) or ('=' in str(test)):
newcols.append(test)
df = df.drop(str(col), axis=1)
newcols = [x for x in newcols if '=' not in x]
df.columns.values[-len(newcols):] = newcols

return df


def convert(self):
    '''
    Performs final conversion.

    Chains the three private steps in order: the file is loaded with
    _load_file, passed through _drop_col, and the result is passed
    through _split_colon.

    Return:
        fully converted pandas dataframe
    '''
    df_raw = self._load_file()
    df_drop = self._drop_col(df_raw)
    return self._split_colon(df_drop)

0 comments on commit 78fa0eb

Please sign in to comment.