switch names, fix tests, add start of experimental formatter

Ironholds · Aug 21, 2015 · 706afe8 · 706afe8
1 parent efb6d7c
commit 706afe8
Show file tree

Hide file tree

Showing 11 changed files with 210 additions and 28 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,5 @@
 # Generated by roxygen2 (4.1.1): do not edit by hand
 
-export(humaniformat)
+export(parse_names)
 importFrom(Rcpp,sourceCpp)
 useDynLib(humaniformat)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,8 +1,35 @@
 # This file was generated by Rcpp::compileAttributes
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
+#' @title Parse Human Names
+#' @description human names are complex things; sometimes people have honorifics, or not. Or a single middle name, or many. Or
+#' a compound surname, or not a compound surname but 'PhD' at the end of their name, and augh.
+#' 
+#' \code{parse_names} provides a simple
+#' function for taking consistently formatted human names and splitting them into \code{salutation}, \code{first_name},
+#' \code{middle_name}, \code{last_name} and \code{suffix}. It is capable of dealing with compound surnames, multiple middle names,
+#' and similar variations, and is fully vectorised.
+#' 
+#' @param names a character vector of names to parse.
+#' 
+#' @return a data.frame with the columns \code{salutation}, \code{first_name},
+#' \code{middle_name}, \code{last_name}, \code{suffix} and \code{full_name} (which contains the original name). In the
+#' event that a name doesn't \emph{have} a salutation, middle name, suffix, or so on, an empty string will be in that
+#' field instead.
+#' 
+#' @examples
+#' # Parse a simple name
+#' parse_names("Oliver Keyes")
+#' 
+#' # Parse a more complex name
+#' parse_names("Hon. Oliver Timothy Keyes Esq.")
+#' 
 #' @export
-humaniformat <- function(names) {
-    .Call('humaniformat_humaniformat', PACKAGE = 'humaniformat', names)
+parse_names <- function(names) {
+    .Call('humaniformat_parse_names', PACKAGE = 'humaniformat', names)
+}
+
+format_names <- function(names) {
+    .Call('humaniformat_format_names', PACKAGE = 'humaniformat', names)
 }
 
diff --git a/humaniformat.Rproj b/humaniformat.Rproj
@@ -17,4 +17,5 @@ AutoAppendNewline: Yes
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
+PackageCheckArgs: --as-cran
 PackageRoxygenize: rd,collate,namespace,vignette
diff --git a/man/parse_names.Rd b/man/parse_names.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{parse_names}
+\alias{parse_names}
+\title{Parse Human Names}
+\usage{
+parse_names(names)
+}
+\arguments{
+\item{names}{a character vector of names to parse.}
+}
+\value{
+a data.frame with the columns \code{salutation}, \code{first_name},
+\code{middle_name}, \code{last_name}, \code{suffix} and \code{full_name} (which contains the original name). In the
+event that a name doesn't \emph{have} a salutation, middle name, suffix, or so on, an empty string will be in that
+field instead.
+}
+\description{
+human names are complex things; sometimes people have honorifics, or not. Or a single middle name, or many. Or
+a compound surname, or not a compound surname but 'PhD' at the end of their name, and augh.
+
+\code{parse_names} provides a simple
+function for taking consistently formatted human names and splitting them into \code{salutation}, \code{first_name},
+\code{middle_name}, \code{last_name} and \code{suffix}. It is capable of dealing with compound surnames, multiple middle names,
+and similar variations, and is fully vectorised.
+}
+\examples{
+# Parse a simple name
+parse_names("Oliver Keyes")
+
+# Parse a more complex name
+parse_names("Hon. Oliver Timothy Keyes Esq.")
+}
+
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -5,14 +5,25 @@
 
 using namespace Rcpp;
 
-// humaniformat
-DataFrame humaniformat(std::vector < std::string > names);
-RcppExport SEXP humaniformat_humaniformat(SEXP namesSEXP) {
+// parse_names
+DataFrame parse_names(std::vector < std::string > names);
+RcppExport SEXP humaniformat_parse_names(SEXP namesSEXP) {
 BEGIN_RCPP
     Rcpp::RObject __result;
     Rcpp::RNGScope __rngScope;
     Rcpp::traits::input_parameter< std::vector < std::string > >::type names(namesSEXP);
-    __result = Rcpp::wrap(humaniformat(names));
+    __result = Rcpp::wrap(parse_names(names));
+    return __result;
+END_RCPP
+}
+// format_names
+std::vector < std::string > format_names(std::vector < std::string > names);
+RcppExport SEXP humaniformat_format_names(SEXP namesSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject __result;
+    Rcpp::RNGScope __rngScope;
+    Rcpp::traits::input_parameter< std::vector < std::string > >::type names(namesSEXP);
+    __result = Rcpp::wrap(format_names(names));
     return __result;
 END_RCPP
 }
diff --git a/src/human_format.cpp b/src/human_format.cpp
@@ -0,0 +1,54 @@
+#include "human_format.h"
+
+// Rewrites a comma-delimited name such as "Last, First" or
+// "Last, First, Suffix" as a single space-separated string.
+//
+// name: the name to reformat. It is returned unchanged when splitting it on
+//       "," yields fewer than two pieces.
+//
+// Returns the pieces that are not recognised as suffixes, in the reverse of
+// their input order, followed by the pieces that match_component() reports as
+// belonging to `suffixes`, in their input order. Pieces are joined by single
+// spaces; pieces that are empty after trimming are dropped.
+//
+// NOTE(review): the "First Last Suffix" ordering is inferred from the shape of
+// the original loop - confirm against the intended formatter behaviour.
+std::string human_format::comma_format(std::string name){
+
+  // Split on commas. If there are no commas, return.
+  std::deque < std::string > split_string = split_parts(name, ",");
+  if(split_string.size() < 2){
+    return name;
+  }
+
+  std::string output;  // Suffix pieces, kept in input order.
+  std::string holding; // Remaining pieces, in reverse input order.
+
+  // Walk the pieces from last to first.
+  while(split_string.size() > 0){
+    std::string piece = split_string.back();
+    split_string.pop_back();
+
+    // Trim surrounding blanks so that the space left behind after each comma
+    // is neither handed to match_component() nor copied into the result.
+    const size_t first_kept = piece.find_first_not_of(" \t");
+    if(first_kept == std::string::npos){
+      continue; // Nothing but whitespace between two commas.
+    }
+    const size_t last_kept = piece.find_last_not_of(" \t");
+    piece = piece.substr(first_kept, (last_kept - first_kept) + 1);
+
+    if(match_component(piece, suffixes)){
+      // Prepending while walking backwards preserves the input order.
+      if(output.size() == 0){
+        output = piece;
+      } else {
+        output = piece + " " + output;
+      }
+    } else {
+      if(holding.size() > 0){
+        holding.append(" ");
+      }
+      holding.append(piece);
+    }
+  }
+
+  // Only insert the joining space when both halves have content.
+  if(holding.size() == 0){
+    return output;
+  }
+  if(output.size() == 0){
+    return holding;
+  }
+  return holding + " " + output;
+}
+
+// Runs comma_format() over every element of `names` and returns the result.
+//
+// names: the names to reformat; taken by value, so the caller's vector is
+//        not modified.
+//
+// Rcpp::checkUserInterrupt() is called at every index that is a multiple of
+// 10000 (index 0 included).
+std::vector < std::string > human_format::format_vector(std::vector < std::string > names){
+
+  // Captured once; the loop never changes the length of the vector.
+  const unsigned int total = names.size();
+  unsigned int idx = 0;
+
+  while(idx < total){
+    if((idx % 10000) == 0){
+      Rcpp::checkUserInterrupt();
+    }
+
+    // Overwrite the element in place with its reformatted version.
+    names[idx] = comma_format(names[idx]);
+    ++idx;
+  }
+
+  return names;
+}
diff --git a/src/human_format.h b/src/human_format.h
@@ -0,0 +1,19 @@
+#include "human_parse.h"
+
+
+#ifndef __HUMAN_FORMAT__
+#define __HUMAN_FORMAT__
+
+// Name formatter that publicly derives from human_parse, which gives it
+// access to that class's protected members.
+class human_format: public human_parse {
+
+public:
+
+  // Applies comma_format() to each element of `names` and returns the
+  // resulting vector.
+  std::vector < std::string > format_vector(std::vector < std::string > names);
+
+private:
+
+  // Reformats one comma-delimited name; a name that does not split into at
+  // least two pieces on "," is returned unchanged.
+  std::string comma_format(std::string name);
+
+};
+
+#endif 
diff --git a/src/human_parse.cpp b/src/human_parse.cpp
@@ -1,15 +1,16 @@
 #include "human_parse.h"
 
-std::deque < std::string > human_parse::split_parts(std::string name){
-
+
+std::deque < std::string > human_parse::split_parts(std::string name, std::string split_on){
+
   std::deque < std::string > output;
   size_t last    = 0;
-  size_t current = name.find(" ");
+  size_t current = name.find(split_on);
 
   while(current != std::string::npos){
     output.push_back(name.substr(last, (current - last)));
     last = ++current;
-    current = name.find(" ", current);
+    current = name.find(split_on, current);
     if(current == std::string::npos){
       output.push_back(name.substr(last, name.size()));
     }
@@ -19,10 +20,12 @@ std::deque < std::string > human_parse::split_parts(std::string name){
 }
 
 // Erase periods
-std::string human_parse::erase_periods(std::string part){
+std::string human_parse::erase_char(std::string part, std::string char_to_erase){
+
+  unsigned int erase_size = char_to_erase.size();
 
-  for(size_t i = part.find("."); i != std::string::npos; i = part.find(".")){
-    part.erase(i, 1);
+  for(size_t i = part.find(char_to_erase); i != std::string::npos; i = part.find(char_to_erase)){
+    part.erase(i, erase_size);
   }
 
   return part;
@@ -32,7 +35,7 @@ std::string human_parse::erase_periods(std::string part){
 bool human_parse::match_component(std::string part, std::set < std::string > set_ref){
 
   // Clean up - erase periods and lowercase
-  part = erase_periods(part);
+  part = erase_char(part, ".");
   unsigned int input_size = part.size();
   for(unsigned int i = 0; i < input_size; i++){
     part[i] = tolower(part[i]);
@@ -55,7 +58,7 @@ std::vector < std::string > human_parse::parse_single(std::string name){
   }
 
   // Split and create output object.
-  std::deque < std::string > split_name = split_parts(name);
+  std::deque < std::string > split_name = split_parts(name, " ");
   std::vector < std::string > output(5);
 
   // If there's only one element we assume it is a first name and return it.
@@ -175,13 +178,15 @@ human_parse::human_parse(){
   suffixes.insert("ma");
   suffixes.insert("dmd");
   suffixes.insert("cme");
+  suffixes.insert("esq");
 
   // Compounds
   compounds.insert("vere");
   compounds.insert("von");
   compounds.insert("van");
   compounds.insert("del");
   compounds.insert("de");
+  compounds.insert("den");
   compounds.insert("della");
   compounds.insert("der");
   compounds.insert("di");

diff --git a/src/human_parse.h b/src/human_parse.h
@@ -5,21 +5,23 @@ using namespace Rcpp;
 #define __HUMAN_PARSE__
 
 class human_parse {
-  
-private:
+
+protected:
 
   std::set < std::string > salutations;
 
   std::set < std::string > compounds;
 
   std::set < std::string > suffixes;
 
-  std::deque < std::string > split_parts(std::string name);
+  std::deque < std::string > split_parts(std::string name, std::string split_on);
 
-  std::string erase_periods(std::string part);
+  std::string erase_char(std::string part, std::string char_to_erase);
 
   bool match_component(std::string part, std::set < std::string > set_ref);
 
+private:
+
   std::vector < std::string > parse_single(std::string name);
 
 public:

diff --git a/src/humaniformat.cpp b/src/humaniformat.cpp
@@ -1,8 +1,37 @@
-#include "human_parse.h"
+#include "human_format.h"
 
+//' @title Parse Human Names
+//' @description human names are complex things; sometimes people have honorifics, or not. Or a single middle name, or many. Or
+//' a compound surname, or not a compound surname but 'PhD' at the end of their name, and augh.
+//' 
+//' \code{parse_names} provides a simple
+//' function for taking consistently formatted human names and splitting them into \code{salutation}, \code{first_name},
+//' \code{middle_name}, \code{last_name} and \code{suffix}. It is capable of dealing with compound surnames, multiple middle names,
+//' and similar variations, and is fully vectorised.
+//' 
+//' @param names a character vector of names to parse.
+//' 
+//' @return a data.frame with the columns \code{salutation}, \code{first_name},
+//' \code{middle_name}, \code{last_name}, \code{suffix} and \code{full_name} (which contains the original name). In the
+//' event that a name doesn't \emph{have} a salutation, middle name, suffix, or so on, an empty string will be in that
+//' field instead.
+//' 
+//' @examples
+//' # Parse a simple name
+//' parse_names("Oliver Keyes")
+//' 
+//' # Parse a more complex name
+//' parse_names("Hon. Oliver Timothy Keyes Esq.")
+//' 
 //' @export
 // [[Rcpp::export]]
-DataFrame humaniformat(std::vector < std::string > names){
+DataFrame parse_names(std::vector < std::string > names){
   human_parse parse_inst;
   return parse_inst.parse_vector(names);
 }
+
+// [[Rcpp::export]]
+std::vector < std::string > format_names(std::vector < std::string > names){
+  // Thin entry point: build a formatter for this call and hand the whole
+  // vector to human_format::format_vector().
+  human_format formatter;
+  std::vector < std::string > formatted = formatter.format_vector(names);
+  return formatted;
+}
diff --git a/tests/testthat/test.R b/tests/testthat/test.R
@@ -1,45 +1,45 @@
 context("Name parsing")
 
 test_that("Simple {first, last} names can be parsed", {
-  result <- unlist(humaniformat("Jim Jeffries"))
+  result <- unlist(parse_names("Jim Jeffries"))
   expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
   expect_true(result["first_name"] == "Jim")
   expect_true(result["last_name"] == "Jeffries")
 })
 
 test_that("Names with salutations can be parsed", {
-  result <- unlist(humaniformat("Dr. Jim Jeffries"))
+  result <- unlist(parse_names("Dr. Jim Jeffries"))
   expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
   expect_true(result["first_name"] == "Jim")
   expect_true(result["last_name"] == "Jeffries")
   expect_true(result["salutation"] == "Dr.")
 })
 
 test_that("Names with suffixes can be parsed", {
-  result <- unlist(humaniformat("Jim Jeffries PhD"))
+  result <- unlist(parse_names("Jim Jeffries PhD"))
   expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
   expect_true(result["first_name"] == "Jim")
   expect_true(result["last_name"] == "Jeffries")
   expect_true(result["suffix"] == "PhD")
 })
 
 test_that("Names with middle names can be parsed", {
-  result <- unlist(humaniformat("Jim Schmidt Jeffries"))
+  result <- unlist(parse_names("Jim Schmidt Jeffries"))
   expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
   expect_true(result["first_name"] == "Jim")
   expect_true(result["middle_name"] == "Schmidt")
   expect_true(result["last_name"] == "Jeffries")
 })
 
 test_that("Names with compound surnames can be parsed", {
-  result <- unlist(humaniformat("Jim de la ben Jeffries"))
+  result <- unlist(parse_names("Jim de la ben Jeffries"))
   expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
   expect_true(result["first_name"] == "Jim")
   expect_true(result["last_name"] == "de la ben Jeffries")
 })
 
 test_that("Names with all elements can be parsed", {
-  result <- unlist(humaniformat("Rev Jim Schmidt de la ben Jeffries PhD"))
+  result <- unlist(parse_names("Rev Jim Schmidt de la ben Jeffries PhD"))
   expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
   expect_true(result["salutation"] == "Rev")
   expect_true(result["first_name"] == "Jim")