parser and associated unit tests, done

Ironholds · Aug 21, 2015 · 0e13601 · 0e13601
1 parent f5267c9
commit 0e13601
Show file tree

Hide file tree

Showing 7 changed files with 137 additions and 14 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,4 +1,5 @@
 # Generated by roxygen2 (4.1.1): do not edit by hand
 
+export(humaniformat)
 importFrom(Rcpp,sourceCpp)
 useDynLib(humaniformat)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,8 @@
+# This file was generated by Rcpp::compileAttributes
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#' @export
+humaniformat <- function(names) {
+    .Call('humaniformat_humaniformat', PACKAGE = 'humaniformat', names)
+}
+
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -0,0 +1,18 @@
+// This file was generated by Rcpp::compileAttributes
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// humaniformat
+DataFrame humaniformat(std::vector < std::string > names);
+RcppExport SEXP humaniformat_humaniformat(SEXP namesSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject __result;
+    Rcpp::RNGScope __rngScope;
+    Rcpp::traits::input_parameter< std::vector < std::string > >::type names(namesSEXP);
+    __result = Rcpp::wrap(humaniformat(names));
+    return __result;
+END_RCPP
+}
diff --git a/src/human_parse.cpp b/src/human_parse.cpp
@@ -29,7 +29,7 @@ std::string human_parse::erase_periods(std::string part){
 }
 
 // See if a chunk matches a component
-bool human_parse::match_component(std::string part, std::set < std::string >& set_ref){
+bool human_parse::match_component(std::string part, std::set < std::string > set_ref){
 
   // Clean up - erase periods and lowercase
   part = erase_periods(part);
@@ -56,9 +56,8 @@ std::vector < std::string > human_parse::parse_single(std::string name){
 
   // Split and create output object.
   std::deque < std::string > split_name = split_parts(name);
-  std::vector < std::string > output(6);
-  output[5] = name;
-
+  std::vector < std::string > output(5);
+
   // If there's only one element we assume it is a first name and return it.
   if(split_name.size() == 1){
     output[1] = split_name[0];
@@ -69,7 +68,7 @@ std::vector < std::string > human_parse::parse_single(std::string name){
   if(split_name.size() > 1 && match_component(split_name[0], salutations)){
     output[0] = split_name[0];
     split_name.pop_front();
-    output[1] = split_name[1];
+    output[1] = split_name[0];
     split_name.pop_front();
   } else {
     output[1] = split_name[0];
@@ -78,16 +77,68 @@ std::vector < std::string > human_parse::parse_single(std::string name){
 
   // If there is still > 1 element and we find a suffix, pop those two elements. Otherwise just one.
   if(split_name.size() > 1 && match_component(split_name[split_name.size() - 1], suffixes)){
-    output[5] = split_name[split_name.size() - 1];
-    split_name.pop_back();
     output[4] = split_name[split_name.size() - 1];
+    split_name.pop_back();
+    output[3] = split_name[split_name.size() - 1];
+    split_name.pop_back();
   } else if(split_name.size() > 0){
-    output[5] = split_name[split_name.size() - 1];
+    output[3] = split_name[split_name.size() - 1];
+    split_name.pop_back();
+  } else {
+    return output;
+  }
+
+  // If there are still one or more elements we test for compounds
+  while(split_name.size() > 0 && match_component(split_name[split_name.size() - 1], compounds)){
+    output[3] = split_name[split_name.size() - 1] + " " + output[3];
     split_name.pop_back();
   }
 
+  // If we still have elements, those are middle names.
+  if(split_name.size() > 0){
+    output[2].append(split_name[0]);
+    for(unsigned int i = 1; i < split_name.size(); i++){
+      output[2].append(" " + split_name[i]);
+    }
+  }
+
+  return output;
 }
 
+DataFrame human_parse::parse_vector(std::vector < std::string > names){
+
+  // Allocate one output column per name component (one entry per input name), plus a 5-slot buffer for each parse result
+  unsigned int input_size = names.size();
+  std::vector < std::string > salutation(input_size);
+  std::vector < std::string > first_name(input_size);
+  std::vector < std::string > middle_name(input_size);
+  std::vector < std::string > last_name(input_size);
+  std::vector < std::string > suffix(input_size);
+  std::vector < std::string > holding(5);
+
+  // Parse each name with parse_single and copy its five components into the matching columns
+  for(unsigned int i = 0; i < input_size; i++){
+    if((i % 10000) == 0){
+      Rcpp::checkUserInterrupt(); // polled on every 10,000th name, including the first
+    }
+
+    holding = parse_single(names[i]); // slots 0-4: salutation, first, middle, last, suffix
+    salutation[i] = holding[0];
+    first_name[i] = holding[1];
+    middle_name[i] = holding[2];
+    last_name[i] = holding[3];
+    suffix[i] = holding[4];
+
+  }
+
+  return DataFrame::create(_["salutation"] = salutation,
+                           _["first_name"] = first_name,
+                           _["middle_name"] = middle_name,
+                           _["last_name"] = last_name,
+                           _["suffix"] = suffix,
+                           _["full_name"] = names,
+                           _["stringsAsFactors"] = false);
+}
 
 // Constructor
 human_parse::human_parse(){

diff --git a/src/human_parse.h b/src/human_parse.h
@@ -18,13 +18,13 @@ class human_parse {
 
   std::string erase_periods(std::string part);
 
-  bool match_component(std::string part, std::set < std::string > & set_ref);
+  bool match_component(std::string part, std::set < std::string > set_ref);
 
   std::vector < std::string > parse_single(std::string name);
 
 public:
 
-  DataFrame parse_vector(std::vector < std::string >& names);
+  DataFrame parse_vector(std::vector < std::string > names);
 
   human_parse();
 

diff --git a/src/humaniformat.cpp b/src/humaniformat.cpp
@@ -1,5 +1,8 @@
 #include "human_parse.h"
 
+//' @export
+// [[Rcpp::export]]
 DataFrame humaniformat(std::vector < std::string > names){
-
+  human_parse parse_inst;
+  return parse_inst.parse_vector(names);
 }
diff --git a/tests/testthat/test.R b/tests/testthat/test.R
@@ -1,8 +1,50 @@
+context("Name parsing")
 
-context("humaniformat")
+test_that("Simple {first, last} names can be parsed", {
+  result <- unlist(humaniformat("Jim Jeffries"))
+  expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
+  expect_true(result["first_name"] == "Jim")
+  expect_true(result["last_name"] == "Jeffries")
+})
+
+test_that("Names with salutations can be parsed", {
+  result <- unlist(humaniformat("Dr. Jim Jeffries"))
+  expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
+  expect_true(result["first_name"] == "Jim")
+  expect_true(result["last_name"] == "Jeffries")
+  expect_true(result["salutation"] == "Dr.")
+})
 
-test_that("humaniformat works", {
+test_that("Names with suffixes can be parsed", {
+  result <- unlist(humaniformat("Jim Jeffries PhD"))
+  expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
+  expect_true(result["first_name"] == "Jim")
+  expect_true(result["last_name"] == "Jeffries")
+  expect_true(result["suffix"] == "PhD")
+})
 
-  expect_true(TRUE)
+test_that("Names with middle names can be parsed", {
+  result <- unlist(humaniformat("Jim Schmidt Jeffries"))
+  expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
+  expect_true(result["first_name"] == "Jim")
+  expect_true(result["middle_name"] == "Schmidt")
+  expect_true(result["last_name"] == "Jeffries")
+})
+
+test_that("Names with compound surnames can be parsed", {
+  result <- unlist(humaniformat("Jim de la ben Jeffries"))
+  expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
+  expect_true(result["first_name"] == "Jim")
+  expect_true(result["last_name"] == "de la ben Jeffries")
+})
 
+test_that("Names with all elements can be parsed", {
+  result <- unlist(humaniformat("Rev Jim Schmidt de la ben Jeffries PhD"))
+  expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
+  expect_true(result["salutation"] == "Rev")
+  expect_true(result["first_name"] == "Jim")
+  expect_true(result["middle_name"] == "Schmidt")
+  expect_true(result["last_name"] == "de la ben Jeffries")
+  expect_true(result["suffix"] == "PhD")
+
 })