Skip to content

Commit

Permalink
parser and associated unit tests, done
Browse files Browse the repository at this point in the history
  • Loading branch information
ironholds committed Aug 21, 2015
1 parent f5267c9 commit 0e13601
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 14 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Generated by roxygen2 (4.1.1): do not edit by hand

export(humaniformat)
importFrom(Rcpp,sourceCpp)
useDynLib(humaniformat)
8 changes: 8 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' @export
humaniformat <- function(names) {
  # Pass the input straight to the compiled routine 'humaniformat_humaniformat',
  # looked up in the 'humaniformat' package, and return whatever it produces.
  .Call('humaniformat_humaniformat', names, PACKAGE = 'humaniformat')
}

18 changes: 18 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// This file was generated by Rcpp::compileAttributes
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include <Rcpp.h>

using namespace Rcpp;

// humaniformat
// Forward declaration of the exported C++ function that this glue code calls.
DataFrame humaniformat(std::vector < std::string > names);
// Entry point invoked from R via .Call('humaniformat_humaniformat', ...):
// converts the incoming SEXP to a vector of strings, runs humaniformat() on
// it and hands the wrapped result back.
RcppExport SEXP humaniformat_humaniformat(SEXP namesSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
// Convert the R-level argument into the C++ parameter type.
Rcpp::traits::input_parameter< std::vector < std::string > >::type names(namesSEXP);
// Call the parser and wrap its DataFrame so it can be returned as a SEXP.
__result = Rcpp::wrap(humaniformat(names));
return __result;
END_RCPP
}
67 changes: 59 additions & 8 deletions src/human_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ std::string human_parse::erase_periods(std::string part){
}

// See if a chunk matches a component
bool human_parse::match_component(std::string part, std::set < std::string >& set_ref){
bool human_parse::match_component(std::string part, std::set < std::string > set_ref){

// Clean up - erase periods and lowercase
part = erase_periods(part);
Expand All @@ -56,9 +56,8 @@ std::vector < std::string > human_parse::parse_single(std::string name){

// Split and create output object.
std::deque < std::string > split_name = split_parts(name);
std::vector < std::string > output(6);
output[5] = name;

std::vector < std::string > output(5);

// If there's only one element we assume it is a first name and return it.
if(split_name.size() == 1){
output[1] = split_name[0];
Expand All @@ -69,7 +68,7 @@ std::vector < std::string > human_parse::parse_single(std::string name){
if(split_name.size() > 1 && match_component(split_name[0], salutations)){
output[0] = split_name[0];
split_name.pop_front();
output[1] = split_name[1];
output[1] = split_name[0];
split_name.pop_front();
} else {
output[1] = split_name[0];
Expand All @@ -78,16 +77,68 @@ std::vector < std::string > human_parse::parse_single(std::string name){

// If there is still > 1 element and we find a suffix, pop those two elements. Otherwise just one.
if(split_name.size() > 1 && match_component(split_name[split_name.size() - 1], suffixes)){
output[5] = split_name[split_name.size() - 1];
split_name.pop_back();
output[4] = split_name[split_name.size() - 1];
split_name.pop_back();
output[3] = split_name[split_name.size() - 1];
split_name.pop_back();
} else if(split_name.size() > 0){
output[5] = split_name[split_name.size() - 1];
output[3] = split_name[split_name.size() - 1];
split_name.pop_back();
} else {
return output;
}

// If there is still 1 or more elements we test for compounds
while(split_name.size() > 0 && match_component(split_name[split_name.size() - 1], compounds)){
output[3] = split_name[split_name.size() - 1] + " " + output[3];
split_name.pop_back();
}

// If we still have elements, those are middle names.
if(split_name.size() > 0){
output[2].append(split_name[0]);
for(unsigned int i = 1; i < split_name.size(); i++){
output[2].append(" " + split_name[i]);
}
}

return output;
}

// Parse every name in `names` and assemble the components into a data frame
// with one row per input name.
//
// names: the raw name strings; element i produces row i of the result.
// Returns a DataFrame with the columns salutation, first_name, middle_name,
// last_name, suffix and full_name (the unmodified input), created with
// stringsAsFactors = false.
DataFrame human_parse::parse_vector(std::vector < std::string > names){

  // Use the container's own size type for the length and the loop counter;
  // an unsigned int would silently truncate names.size() on very large inputs
  // and leave the output columns shorter than the full_name column.
  typedef std::vector < std::string >::size_type index_type;
  const index_type input_size = names.size();

  // One output column per name component, each pre-sized to the input length.
  std::vector < std::string > salutation(input_size);
  std::vector < std::string > first_name(input_size);
  std::vector < std::string > middle_name(input_size);
  std::vector < std::string > last_name(input_size);
  std::vector < std::string > suffix(input_size);

  for(index_type i = 0; i < input_size; i++){

    // Check for a pending user interrupt once every 10,000 names.
    if((i % 10000) == 0){
      Rcpp::checkUserInterrupt();
    }

    // The parsed pieces are consumed positionally:
    // 0 = salutation, 1 = first, 2 = middle, 3 = last, 4 = suffix.
    const std::vector < std::string > holding = parse_single(names[i]);
    salutation[i] = holding[0];
    first_name[i] = holding[1];
    middle_name[i] = holding[2];
    last_name[i] = holding[3];
    suffix[i] = holding[4];
  }

  return DataFrame::create(_["salutation"] = salutation,
                           _["first_name"] = first_name,
                           _["middle_name"] = middle_name,
                           _["last_name"] = last_name,
                           _["suffix"] = suffix,
                           _["full_name"] = names,
                           _["stringsAsFactors"] = false);
}

// Constructor
human_parse::human_parse(){
Expand Down
4 changes: 2 additions & 2 deletions src/human_parse.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ class human_parse {

std::string erase_periods(std::string part);

bool match_component(std::string part, std::set < std::string > & set_ref);
bool match_component(std::string part, std::set < std::string > set_ref);

std::vector < std::string > parse_single(std::string name);

public:

DataFrame parse_vector(std::vector < std::string >& names);
DataFrame parse_vector(std::vector < std::string > names);

human_parse();

Expand Down
5 changes: 4 additions & 1 deletion src/humaniformat.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include "human_parse.h"

//' @export
// [[Rcpp::export]]
DataFrame humaniformat(std::vector < std::string > names){
  // Build a parser for this call only, let it process the whole vector of
  // names, and return the data frame it produces.
  human_parse name_parser;
  DataFrame parsed = name_parser.parse_vector(names);
  return parsed;
}
48 changes: 45 additions & 3 deletions tests/testthat/test.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,50 @@
context("Name parsing")

context("humaniformat")
test_that("Simple {first, last} names can be parsed", {
  # Columns the parser is expected to return, in order.
  expected_fields <- c("salutation","first_name","middle_name","last_name","suffix","full_name")

  # Parse one name and flatten the result into a named character vector.
  parsed <- unlist(humaniformat("Jim Jeffries"))

  expect_true(all(names(parsed) == expected_fields))
  expect_true(parsed["first_name"] == "Jim")
  expect_true(parsed["last_name"] == "Jeffries")
})

test_that("Names with salutations can be parsed", {
  # Columns the parser is expected to return, in order.
  expected_fields <- c("salutation","first_name","middle_name","last_name","suffix","full_name")

  # A leading "Dr." should land in the salutation column, not the first name.
  parsed <- unlist(humaniformat("Dr. Jim Jeffries"))

  expect_true(all(names(parsed) == expected_fields))
  expect_true(parsed["first_name"] == "Jim")
  expect_true(parsed["last_name"] == "Jeffries")
  expect_true(parsed["salutation"] == "Dr.")
})

test_that("humaniformat works", {
test_that("Names with suffixes can be parsed", {
  # Columns the parser is expected to return, in order.
  expected_fields <- c("salutation","first_name","middle_name","last_name","suffix","full_name")

  # A trailing "PhD" should land in the suffix column, not the last name.
  parsed <- unlist(humaniformat("Jim Jeffries PhD"))

  expect_true(all(names(parsed) == expected_fields))
  expect_true(parsed["first_name"] == "Jim")
  expect_true(parsed["last_name"] == "Jeffries")
  expect_true(parsed["suffix"] == "PhD")
})

expect_true(TRUE)
test_that("Names with middle names can be parsed", {
  # Columns the parser is expected to return, in order.
  expected_fields <- c("salutation","first_name","middle_name","last_name","suffix","full_name")

  # The token between the first and last name should become the middle name.
  parsed <- unlist(humaniformat("Jim Schmidt Jeffries"))

  expect_true(all(names(parsed) == expected_fields))
  expect_true(parsed["first_name"] == "Jim")
  expect_true(parsed["middle_name"] == "Schmidt")
  expect_true(parsed["last_name"] == "Jeffries")
})

test_that("Names with compound surnames can be parsed", {
  # Columns the parser is expected to return, in order.
  expected_fields <- c("salutation","first_name","middle_name","last_name","suffix","full_name")

  # The particles "de la ben" should be kept as part of the last name.
  parsed <- unlist(humaniformat("Jim de la ben Jeffries"))

  expect_true(all(names(parsed) == expected_fields))
  expect_true(parsed["first_name"] == "Jim")
  expect_true(parsed["last_name"] == "de la ben Jeffries")
})

test_that("Names with all elements can be parsed", {
  # Columns the parser is expected to return, in order.
  expected_fields <- c("salutation","first_name","middle_name","last_name","suffix","full_name")

  # One name that exercises every component at once.
  parsed <- unlist(humaniformat("Rev Jim Schmidt de la ben Jeffries PhD"))

  expect_true(all(names(parsed) == expected_fields))
  expect_true(parsed["salutation"] == "Rev")
  expect_true(parsed["first_name"] == "Jim")
  expect_true(parsed["middle_name"] == "Schmidt")
  expect_true(parsed["last_name"] == "de la ben Jeffries")
  expect_true(parsed["suffix"] == "PhD")
})

0 comments on commit 0e13601

Please sign in to comment.