initial checkin

fettermania · Mar 6, 2016 · 685a018 · 685a018
commit 685a018
Show file tree

Hide file tree

Showing 16 changed files with 41,944 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,33 @@
+Pivoted Document Length Normalization
+===
+
+To Use:
+---
+
+- choose dataset, among:
+  - full crowdflower data (use import_data.create_query_datasets_crowdflower(small=False), run_pdln_test:25)
+    - Full evaluation can take a few hours.
+  - partial crowdflower data (use import_data.create_query_datasets_crowdflower(small=True), run_pdln_test:25)
+    - Quick evaluation
+  - toy example data (use import_data.create_query_datasets_toy(), run_pdln_test:24)
+    - Quick evaluation
+- python run_pdln_test.py
+- Can also play with queries in SECTION: DEBUG in run_pdln_test
+
+Results
+---
+- results/crowdflower_results_large.csv: 
+  - Low accuracy, very low recall suggests our thresholds might be too high.
+  - TODO: Actually look at test data.
+  - TODO: Actually confine to smaller region
+
+
+Considerations
+---
+- Crowdflower data: Right now, we are looking at accuracy of "relevance" (a crowdflower "relevance score" of 4 means "relevant retrieval", anything else means "irrelevant retrieval").  It would probably be better to learn this function (range: 0-4) than to collapse it into a binary classification.
+
+- More things to consider in docs/notes.txt
+
+
+
+
diff --git a/clean_text.py b/clean_text.py
@@ -0,0 +1,19 @@
+import re
+
+from nltk.stem.porter import *
+
+stemmer = PorterStemmer()
+
+def clean_query_string(query):
+  """Normalize a raw query string and stem each of its tokens.
+
+  Surrounding whitespace is stripped, every run of non-word characters is
+  collapsed into a single space, and the result is passed to
+  porter_stem_pass. Unlike clean_text, this function does not lower-case
+  the input and does not remove HTML tags or apostrophes.
+  """
+  query = query.strip()
+  # Raw string: "\W" inside a plain literal is an invalid escape sequence.
+  query = re.sub(r"[\W]+", " ", query)
+  return porter_stem_pass(query)
+
+def clean_text(text):
+  """Clean a document body and stem each of its tokens.
+
+  Steps, in order: strip surrounding whitespace, remove anything that looks
+  like an HTML/XML tag, delete apostrophes, collapse every run of non-word
+  characters into a single space, lower-case, then pass the result to
+  porter_stem_pass.
+  """
+  text = text.strip()
+  text = re.sub(r'<[^>]*>', '', text)
+  # Apostrophes are deleted (not turned into spaces) so "don't" stays one token.
+  text = text.replace("'", '')
+  # Raw string: "\W" inside a plain literal is an invalid escape sequence.
+  text = re.sub(r'[\W]+', ' ', text)
+  text = text.lower()
+  return porter_stem_pass(text)
+
+def porter_stem_pass(text):
+  """Stem every whitespace-separated token of text with the module-level stemmer.
+
+  Returns the stemmed tokens joined by single spaces.
+  """
+  # split() with no argument skips the empty tokens that split(' ') produces
+  # for leading, trailing or repeated spaces, so no empty string reaches the
+  # stemmer and the result carries no stray spaces.
+  return ' '.join(stemmer.stem(token) for token in text.split())
diff --git a/data/crowdflower.csv b/data/crowdflower.csv
diff --git a/docs/cosine_tf_idf_example.pdf b/docs/cosine_tf_idf_example.pdf
diff --git a/docs/lec3.pdf b/docs/lec3.pdf
diff --git a/docs/sbm96.pdf b/docs/sbm96.pdf
diff --git a/import_data.py b/import_data.py
@@ -0,0 +1,94 @@
+#import_data.py
+
+# SECTION : LOAD DATA
+import pandas as pd
+import os
+import re
+import numpy as np
+
+# Fettermania libraries
+import clean_text
+
+
+def create_query_datasets_crowdflower(small=False):
+  """Load ./data/crowdflower.csv and split it into the inputs the ranker needs.
+
+  Args:
+    small: when true, only the first 100 rows (after dropping rows with
+      missing values) are used, for a quick run.
+
+  Returns:
+    A 3-tuple (normalization_corpus, input_docs, relevance_results):
+      - the 'cleaned_text' Series of the last third of the rows,
+      - the 'cleaned_text', 'original_text' and 'doc_index' columns of the
+        remaining rows,
+      - the full frame of the remaining rows, including the binary
+        'relevant' label.
+  """
+  df = pd.read_csv('./data/crowdflower.csv').dropna()
+  if small:
+    # Own copy, so the column assignments below do not write into a slice.
+    df = df[:100].copy()
+  df['original_text'] = df['product_title'] + " " + df['product_description']
+  df = import_preprocess(df)
+  #df['query'] is already in there
+  # Fettermania TODO: not strictly classification.  Can be 0-4.
+  df['relevant'] = df['median_relevance'].map(lambda x: 1 if x == 4 else 0)
+  del df['product_description']
+  del df['median_relevance']
+  del df['relevance_variance']
+
+  normalization_sample_ct = round(df.shape[0] / 3)
+  # Split at an explicit position. With a negative-offset slice a sample count
+  # of 0 would hand the whole frame to the normalization sample (df[-0:]) and
+  # leave no documents (df[0:-0]).
+  split_at = df.shape[0] - normalization_sample_ct
+  df_normalization_sample = df.iloc[split_at:]
+  df = df.iloc[:split_at].copy()
+  df['doc_index'] = np.arange(df.shape[0])  # Fettermania TODO: This is not random-ish.
+
+  return (df_normalization_sample['cleaned_text'], df[['cleaned_text', 'original_text', 'doc_index']], df)
+
+
+
+
+def import_data_frame_toy():
+  """Build the toy corpus.
+
+  Returns a DataFrame with a single column, "original_text", holding twelve
+  short English passages about cats and dogs.
+  """
+  df = pd.DataFrame(np.array([
+    'Cats are similar in anatomy to the other felids, with a strong, flexible body, quick reflexes, sharp retractable claws, and teeth adapted to killing small prey. Cat senses fit a crepuscular and predatory ecological niche. Cats can hear sounds too faint or too high in frequency for human ears, such as those made by mice and other small animals. They can see in near darkness. Like most other mammals, cats have poorer color vision and a better sense of smell than humans. Cats, despite being solitary hunters, are a social species and cat communication includes the use of a variety of vocalizations (mewing, purring, trilling, hissing, growling, and grunting), as well as cat pheromones and types of cat-specific body language.[8]',
+    'Dogs love to eat and run around.',
+    'It was raining cats and dogs the other night... so bad that I couldn\'t go outside. Sometimes I would come to the window and just stare at the rain. It was very depressing, but in the morning, I felt better!',
+    'It\'s a dog-eat-dog world out there. From puppies to big hounds, everyone struggles to survive, to avoid his superior and to beat up on his inferior. That\'s just how it is.',
+    'Cats are cool, soft, fuzzy and bouncy!',
+    'Cats and dogs are two common types of household animals. There are many species of cats and dogs - from the common house cat, to the Blue Russian, from bulldog to shepherd. Both cats and dogs have been domesticated by man many thousands of years ago and are loved and cared for by many pet owners today. There are even urban legends of cat owners having statistically better health than non-cat owners - and everyone knows how useful a dog can be, for protecting the house, for instance! There are many more things to say about cats and dogs, but I think I\'ve run out of time, so I have to go. Thank you for listening!',
+    'The domesticated cat (Latin: Felis catus) or the undomesticated cat (Latin: Felis silvestris catus) is a small, typically furry, carnivorous mammal',
+    'In comparison to dogs, cats have not undergone major changes during the domestication process, as the form and behavior of the domestic cat is not radically different from those of wildcats and domestic cats are perfectly capable of surviving in the wild',
+    'Cats, like dogs, are digitigrades',
+    'Cats do eat grass occasionally.',
+    'Cats can hear higher-pitched sounds than either dogs or humans, detecting frequencies from 55 Hz to 79,000 Hz, a range of 10.5 octaves, while humans and dogs both have ranges of about 9 octaves.',
+    'The average lifespan of pet cats has risen in recent years. In the early 1980s it was about seven years,[96]:33[97] rising to 9.4 years in 1995[96]:33 and 12–15 years in 2014.[98] However, cats have been reported as surviving into their 30s,[99] with the oldest known cat, Creme Puff, dying at a verified age of 38.[100] Spaying or neutering increases life expectancy: one study found neutered male cats live twice as long as intact males, while spayed female cats live 62% longer than intact females.[96]:35 Having a cat neutered confers health benefits, because castrated males cannot develop testicular cancer, spayed females cannot develop uterine or ovarian cancer, and both have a reduced risk of mammary cancer.[101]']))
+  # The frame is built from a 1-D array, so its only column is renamed here.
+  df.columns = ["original_text"]
+  return df
+
+
+def import_query_data_frame_toy():
+  """Return the toy queries as a DataFrame with a single "query" column."""
+  toy_queries = ['cat', 'dog', 'cat dog']
+  return pd.DataFrame(np.array(toy_queries), columns=["query"])
+
+# Fettermania: Cartesian product - might want to reduce repeated
+# text payloads with keys instead
+def create_query_datasets_toy(simulated_relevant_p=.5):
+  """Build the toy corpus, toy queries and simulated relevance labels.
+
+  Args:
+    simulated_relevant_p: a (document, query) pair is labelled relevant when
+      its uniform random draw in [0, 1) is below this value.
+
+  Returns:
+    A 3-tuple (normalization_corpus, input_docs, relevance_results):
+      - the 'cleaned_text' Series of the last third of the toy documents,
+      - the remaining documents with 'original_text', 'cleaned_text' and
+        'doc_index' columns,
+      - every (remaining document, toy query) pair with a 0/1 'relevant'
+        column. The labels are random but reproducible (fixed seed).
+  """
+  df = import_data_frame_toy()
+  df = import_preprocess(df)  # cleaned_text, original_text
+
+  normalization_sample_ct = round(df.shape[0] / 3)
+  # Split at an explicit position. With a negative-offset slice a sample count
+  # of 0 would hand the whole frame to the normalization sample (df[-0:]) and
+  # leave no documents (df[0:-0]).
+  split_at = df.shape[0] - normalization_sample_ct
+  df_normalization_sample = df.iloc[split_at:]
+  # Own copy, so the column assignments below do not write into a slice.
+  df = df.iloc[:split_at].copy()
+  df['doc_index'] = np.arange(df.shape[0])  # Fettermania TODO: This is not random-ish.
+  qdf = import_query_data_frame_toy()
+
+  # Fettermania: Hack cartesian product
+  # Every row gets the same key, so the merge pairs each doc with each query.
+  df['join_key'] = 1
+  qdf['join_key'] = 1
+  cartesian = pd.merge(df, qdf, on='join_key')
+  del df['join_key']
+  del qdf['join_key']
+  del cartesian['join_key']
+
+  np.random.seed(5) # wide "looking" sample
+  relevant = np.random.rand(cartesian.shape[0])
+  relevant = (relevant < simulated_relevant_p) + 0  # bool -> 0/1
+  cartesian['relevant'] = relevant
+  return (df_normalization_sample['cleaned_text'], df, cartesian)
+
+
+# preprocess
+def import_preprocess(df):
+  """Add a 'cleaned_text' column computed from 'original_text'.
+
+  The frame passed in is modified in place and the same object is returned.
+  """
+  cleaned_column = df['original_text'].apply(clean_text.clean_text)
+  df['cleaned_text'] = cleaned_column
+  return df # passed by ref, TODO fix
+
diff --git a/notes.txt b/notes.txt
@@ -0,0 +1,16 @@
+
+
+TODO 
+---
+- SBM section 3: idf is used in query terms, not in doc weights
+- BM25 - whole system; https://en.wikipedia.org/wiki/Okapi_BM25
+  - need to learn more core constants
+- To optimize:
+  - Use PAIRWISE rankings?
+    - Note: This exponentially increases the number of "pivots" because there's no P(ret) and P(rel) curves that cross anymore.
+  - Metrics (later): Old: precision, recall, f-score.  New: ROC, DCG and variants.
+  - Chapelle: Judgement metric for the contest was NDCG 
+- TODO: Finding threshold: n-fold cross validation?
+  - Probably just a param, like slope
+- TODO: Dump to .csv
+
diff --git a/plot_result_surface.py b/plot_result_surface.py
@@ -0,0 +1,20 @@
+from matplotlib import cm
+import matplotlib.pyplot as plt
+import numpy as np
+
+def plot_result_surface(z_label, result_matrix, THRESHOLD_MAX, THRESHOLD_POINTS, SLOPE_MAX, SLOPE_POINTS):
+  """Show a 3D surface of one metric over the (threshold, slope) grid.
+
+  Args:
+    z_label: name of the metric; used for the z-axis label and the title.
+    result_matrix: 2D array-like indexed [threshold_index][slope_index].
+    THRESHOLD_MAX: exclusive upper bound of the threshold axis.
+    THRESHOLD_POINTS: number of samples along the threshold axis.
+    SLOPE_MAX: exclusive upper bound of the slope axis.
+    SLOPE_POINTS: number of samples along the slope axis.
+
+  Blocks in plt.show() until the window is closed.
+  """
+  fig = plt.figure()
+  ax = fig.add_subplot(111, projection='3d')
+  # linspace returns exactly *_POINTS samples at i * MAX / POINTS. arange with
+  # a float step can return one sample too many, and mapping a float
+  # coordinate back to an index with int() can truncate to the wrong cell.
+  thresholds = np.linspace(0, THRESHOLD_MAX, THRESHOLD_POINTS, endpoint=False)
+  slopes = np.linspace(0, SLOPE_MAX, SLOPE_POINTS, endpoint=False)
+  X, Y = np.meshgrid(thresholds, slopes)
+  # meshgrid arrays are laid out [slope_index][threshold_index], i.e. the
+  # transpose of result_matrix, so the values are taken by index directly.
+  Z = np.asarray(result_matrix)[:THRESHOLD_POINTS, :SLOPE_POINTS].T
+  surface = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0, antialiased=False)
+  ax.set_xlabel('Threshold')
+  ax.set_ylabel('Slope')
+  ax.set_zlabel(z_label)
+  ax.set_title('PDLN ' + z_label + ' vs. hyperparams')
+  ax.set_zlim(np.min(result_matrix), np.max(result_matrix))
+  fig.colorbar(surface, shrink=0.5, aspect=5)
+  plt.show()
diff --git a/results/crowdflower_results_large.csv b/results/crowdflower_results_large.csv
diff --git a/run_pdln_test.py b/run_pdln_test.py
@@ -0,0 +1,90 @@
+#pivot_norm.py - main file
+
+# SECTION : LOAD DATA
+import pandas as pd
+import re
+import numpy as np
+import sklearn.feature_extraction.text as sktext
+import sklearn.preprocessing as skpre
+import functools
+from sklearn.cross_validation import train_test_split
+from mpl_toolkits.mplot3d import Axes3D
+import datetime
+
+# Fettermania libraries
+import import_data
+import tfidf_pdln
+import plot_result_surface
+
+# ===== SECTION: Get normalization data and test/train set =====
+
+# normalization_corpus: Series (array-like)
+# input_docs: DF with "cleaned_text", "original_text", "doc_index"
+# relevance_results: DF: "cleaned_text", "original_text", "doc_index", "query", "relevant"
+(normalization_corpus, input_docs, relevance_results) = import_data.create_query_datasets_toy()
+#(normalization_corpus, input_docs, relevance_results) = import_data.create_query_datasets_crowdflower(small=True)
+
+# Features are the (query, doc_index) pairs; the label is the binary "relevant" flag.
+X_train, X_test, y_train, y_test = train_test_split(
+  relevance_results[['query', 'doc_index']], relevance_results['relevant'], test_size = .3, random_state=0)
+
+# Fettermania TODO Clean when you learn pandas mo betta
+X_train_array = np.array(X_train)
+y_train_array = np.array(y_train)
+X_test_array = np.array(X_test)
+y_test_array = np.array(y_test)
+
+# ====== SECTION: Create model =====
+# The slope and threshold given here are only starting values; run_test
+# overwrites both on every call.
+tfidf_ranker = tfidf_pdln.TFIDFRanker(input_docs, slope=.75, relevance_threshold=.05)
+tfidf_ranker.add_normalization_corpus(normalization_corpus)
+print ("DEBUG: Pivot is calculated at ", tfidf_ranker.pivot)
+
+# ===== SECTION: Run test ======
+# Grid searched below: THRESHOLD_POINTS thresholds in [0, THRESHOLD_MAX) by
+# SLOPE_POINTS slopes in [0, SLOPE_MAX).
+THRESHOLD_MAX = .4
+THRESHOLD_POINTS = 20
+SLOPE_MAX = 1.5
+SLOPE_POINTS = 30
+
+def run_test(slope, threshold, X_array, y_array):
+  """Score the module-level tfidf_ranker for one (slope, threshold) setting.
+
+  Args:
+    slope: value passed to tfidf_ranker.set_slope.
+    threshold: value passed to tfidf_ranker.set_relevance_threshold.
+    X_array: 2D array; column 0 and column 1 are passed to
+      tfidf_ranker.predict as its two arguments (the caller in this file
+      builds them from the 'query' and 'doc_index' columns).
+    y_array: ground-truth 0/1 labels, one per row of X_array.
+
+  Returns:
+    (accuracy, precision, recall). The f-score is printed but not returned.
+  """
+  tfidf_ranker.set_slope(slope)
+  tfidf_ranker.set_relevance_threshold(threshold)
+  predicted = tfidf_ranker.predict(X_array[:,0], X_array[:,1])
+  # Fettermania TODO: Is there a different precision/recall definition for IR?
+  # https://en.wikipedia.org/wiki/Precision_and_recall
+  # Score against the labels that were passed in. The earlier code read the
+  # global y_train_array here, so any other data set was scored against the
+  # training labels.
+  tp = np.sum(np.logical_and(predicted, y_array))
+  fp = np.sum(np.logical_and(predicted, np.logical_not(y_array)))
+  fn = np.sum(np.logical_and(np.logical_not(predicted), y_array))
+  accuracy = np.sum(predicted == y_array) / len(X_array)
+  # Each ratio falls back to 0 when its denominator would be 0.
+  precision = 0 if tp + fp == 0 else tp / (tp + fp)
+  recall = 0 if tp + fn == 0 else tp / (tp + fn)
+  fscore = 0 if precision + recall == 0 else 2*(precision * recall)/(precision + recall)
+  print ("(DEBUG: %s) Run test: train = [a: %f, p: %f, r: %f, f: %f], slope=(%f), threshold=(%f)" % (
+    datetime.datetime.now(), accuracy, precision, recall, fscore, slope, threshold))
+  return (accuracy, precision, recall)
+# One cell per (threshold, slope) combination; every cell is filled below.
+accuracy_matrix = np.empty([THRESHOLD_POINTS, SLOPE_POINTS])
+precision_matrix = np.empty([THRESHOLD_POINTS, SLOPE_POINTS])
+recall_matrix = np.empty([THRESHOLD_POINTS, SLOPE_POINTS])
+# TODO: Replace with grid thing or cross-validation
+for threshold_ix in range(THRESHOLD_POINTS):
+  threshold = threshold_ix * THRESHOLD_MAX / THRESHOLD_POINTS
+  for slope_ix in range(SLOPE_POINTS):
+    slope = slope_ix * SLOPE_MAX / SLOPE_POINTS
+    (acc, prec, rec) = run_test(slope, threshold, X_train_array, y_train_array)
+    accuracy_matrix[threshold_ix, slope_ix] = acc
+    precision_matrix[threshold_ix, slope_ix] = prec
+    recall_matrix[threshold_ix, slope_ix] = rec
+
+
+# ==== SECTION: Show results ======
+# One surface plot per metric, in the order accuracy, precision, recall.
+for metric_name, metric_matrix in (
+    ("Accuracy", accuracy_matrix),
+    ("Precision", precision_matrix),
+    ("Recall", recall_matrix)):
+  plot_result_surface.plot_result_surface(
+    metric_name, metric_matrix, THRESHOLD_MAX, THRESHOLD_POINTS, SLOPE_MAX, SLOPE_POINTS)
+
+# ====== SECTION: DEBUG ======
+print ("DEBUG QUERIES")
+debug_queries = np.array(['cat', 'cat dog', 'felid', 'and', 'fasdf'])
+top_matches = tfidf_ranker.get_top_document_matches(debug_queries, 3)
+print(list(zip(debug_queries, top_matches)))
+over_threshold = tfidf_ranker.get_documents_over_threshold(debug_queries)
+print(list(zip(debug_queries, over_threshold)))
+
+
+
+
diff --git a/snippets/count_vectorizer.py b/snippets/count_vectorizer.py
@@ -0,0 +1,45 @@
+import numpy as np
+import sklearn.feature_extraction.text as sktext
+import sklearn.preprocessing as skpre
+
+# Fettermania: Can switch grams
+#cv = textlib.CountVectorizer(ngram_range=(1,2))
+cv = sktext.CountVectorizer(ngram_range=(1,1))
+
+# Note: Doesn't convert very small words or punctuation
+doc_freqs = np.array( ['The sun is shining', 'The weather is sweet', 'The sun is shining and the weather is sweet'])
+
+# Raw term counts: one row per document, one column per vocabulary term.
+tf = cv.fit_transform(doc_freqs)
+tf_array = tf.toarray()
+
+print ("VOCAB")
+print (cv.vocabulary_)
+
+print ("tf")
+print (tf_array)
+
+# df_array[j]: number of documents that contain vocabulary term j.
+# n_d: number of documents.
+df_array = np.sum(tf_array > 0, axis=0)
+n_d = doc_freqs.size
+#idf_1 = np.log(n_d / (1 + df_array))
+#tfidf_1 = tf_array * (idf_1 + 1)
+# Hand-rolled smoothed idf, then L2-normalize each document row. The final
+# DIFF print checks this against TfidfTransformer with default settings.
+idf_2 = np.log((1 + n_d)/ (1 + df_array))
+tfidf_2 = tf_array * (idf_2 + 1)
+tfidf_2_normalized = skpre.normalize(tfidf_2, axis=1)
+
+
+from sklearn.feature_extraction.text import TfidfTransformer
+tfidf_builtin_transformer = TfidfTransformer()
+# Reuse the counts computed above instead of fitting the vectorizer again.
+tfidf_builtin = tfidf_builtin_transformer.fit_transform(tf)
+tfidf_builtin_array = tfidf_builtin.toarray()
+
+# Only the difference is expected to be 0; the two matrices themselves are not.
+print ("TFIDF HANDROLL")
+print (tfidf_2_normalized)
+print ("TFIDF BUILTIN")
+print (tfidf_builtin_array)
+
+print ("DEBUG: DIFF should be 0")
+print (tfidf_builtin_array - tfidf_2_normalized)
+
+