Skip to content

Commit

Permalink
switch names, fix tests, add start of experimental formatter
Browse files Browse the repository at this point in the history
  • Loading branch information
ironholds committed Aug 21, 2015
1 parent efb6d7c commit 706afe8
Show file tree
Hide file tree
Showing 11 changed files with 210 additions and 28 deletions.
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Generated by roxygen2 (4.1.1): do not edit by hand

export(humaniformat)
export(parse_names)
importFrom(Rcpp,sourceCpp)
useDynLib(humaniformat)
31 changes: 29 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,35 @@
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' @title Parse Human Names
#' @description human names are complex things; sometimes people have honorifics, or not. Or a single middle name, or many. Or
#' a compound surname, or not a compound surname but 'PhD' at the end of their name, and augh.
#'
#' \code{parse_names} provides a simple
#' function for taking consistently formatted human names and splitting them into \code{salutation}, \code{first_name},
#' \code{middle_name}, \code{last_name} and \code{suffix}. It is capable of dealing with compound surnames, multiple middle names,
#' and similar variations, and is fully vectorised.
#'
#' @param names a character vector of names to parse.
#'
#' @return a data.frame with the columns \code{salutation}, \code{first_name},
#' \code{middle_name}, \code{last_name}, \code{suffix} and \code{full_name} (which contains the original name). In the
#' event that a name doesn't \emph{have} a salutation, middle name, suffix, or so on, an empty string will be in that
#' field instead.
#'
#' @examples
#' # Parse a simple name
#' parse_names("Oliver Keyes")
#'
#' # Parse a more complex name
#' parse_names("Hon. Oliver Timothy Keyes Esq.")
#'
#' @export
humaniformat <- function(names) {
.Call('humaniformat_humaniformat', PACKAGE = 'humaniformat', names)
parse_names <- function(names) {
.Call('humaniformat_parse_names', PACKAGE = 'humaniformat', names)
}

# Thin R-level wrapper: forwards `names` unchanged to the compiled routine
# 'humaniformat_format_names' in the 'humaniformat' package via .Call and
# returns whatever that routine returns.
# NOTE(review): the NAMESPACE shown for this commit has no export(format_names)
# entry, so this is presumably internal/experimental for now -- confirm.
format_names <- function(names) {
.Call('humaniformat_format_names', PACKAGE = 'humaniformat', names)
}

1 change: 1 addition & 0 deletions humaniformat.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ AutoAppendNewline: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageCheckArgs: --as-cran
PackageRoxygenize: rd,collate,namespace,vignette
34 changes: 34 additions & 0 deletions man/parse_names.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{parse_names}
\alias{parse_names}
\title{Parse Human Names}
\usage{
parse_names(names)
}
\arguments{
\item{names}{a character vector of names to parse.}
}
\value{
a data.frame with the columns \code{salutation}, \code{first_name},
\code{middle_name}, \code{last_name}, \code{suffix} and \code{full_name} (which contains the original name). In the
event that a name doesn't \emph{have} a salutation, middle name, suffix, or so on, an empty string will be in that
field instead.
}
\description{
human names are complex things; sometimes people have honorifics, or not. Or a single middle name, or many. Or
a compound surname, or not a compound surname but 'PhD' at the end of their name, and augh.
\code{parse_names} provides a simple
function for taking consistently formatted human names and splitting them into \code{salutation}, \code{first_name},
\code{middle_name}, \code{last_name} and \code{suffix}. It is capable of dealing with compound surnames, multiple middle names,
and similar variations, and is fully vectorised.
}
\examples{
# Parse a simple name
parse_names("Oliver Keyes")
# Parse a more complex name
parse_names("Hon. Oliver Timothy Keyes Esq.")
}
19 changes: 15 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,25 @@

using namespace Rcpp;

// humaniformat
DataFrame humaniformat(std::vector < std::string > names);
RcppExport SEXP humaniformat_humaniformat(SEXP namesSEXP) {
// parse_names
DataFrame parse_names(std::vector < std::string > names);
RcppExport SEXP humaniformat_parse_names(SEXP namesSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< std::vector < std::string > >::type names(namesSEXP);
__result = Rcpp::wrap(humaniformat(names));
__result = Rcpp::wrap(parse_names(names));
return __result;
END_RCPP
}
// format_names
// NOTE(review): this file is presumably regenerated by Rcpp::compileAttributes
// (as R/RcppExports.R states for itself), so manual edits here may be overwritten.
// Forward declaration of the C++ implementation that the wrapper below calls.
std::vector < std::string > format_names(std::vector < std::string > names);
// Entry point that R/RcppExports.R reaches through
// .Call('humaniformat_format_names', ...). Takes the R-side `names` argument as a
// raw SEXP and returns the result as a SEXP.
RcppExport SEXP humaniformat_format_names(SEXP namesSEXP) {
// BEGIN_RCPP / END_RCPP are Rcpp-provided macros bracketing the body
// (presumably translating C++ exceptions into R errors -- see the Rcpp docs).
BEGIN_RCPP
Rcpp::RObject __result;
// NOTE(review): RNGScope looks like standard generated boilerplate; format_names
// itself does not appear to use R's random number generator.
Rcpp::RNGScope __rngScope;
// Convert the incoming SEXP into the std::vector<std::string> the implementation expects.
Rcpp::traits::input_parameter< std::vector < std::string > >::type names(namesSEXP);
// Call the implementation and wrap its std::vector<std::string> result for R.
__result = Rcpp::wrap(format_names(names));
return __result;
END_RCPP
}
54 changes: 54 additions & 0 deletions src/human_format.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include "human_format.h"

// Strip leading and trailing whitespace from one comma-delimited fragment.
// Returns an empty string when the fragment is empty or whitespace-only.
static std::string trim_whitespace(const std::string& text){

  const char* const whitespace = " \t\n\r";
  const size_t first = text.find_first_not_of(whitespace);
  if(first == std::string::npos){
    return std::string();
  }
  const size_t last = text.find_last_not_of(whitespace);
  return text.substr(first, (last - first) + 1);
}

// Rewrite a comma-delimited name into a single space-delimited name.
//
// The name is split on commas and the fragments are visited from last to
// first. Fragments recognised as suffixes (via match_component against the
// inherited `suffixes` set) are collected at the end of the result, in the
// order they were written; every other fragment is emitted in reverse order,
// so "Last, First" becomes "First Last" and "Last, First, PhD" becomes
// "First Last PhD".
//
// @param name a single name, possibly containing commas.
// @return the reformatted name. A name without commas is returned unchanged.
std::string human_format::comma_format(std::string name){

  // Split on commas. If there are no commas, return.
  std::deque < std::string > split_string = split_parts(name, ",");
  if(split_string.size() < 2){
    return name;
  }

  std::string output;  // suffix fragments, kept in their original order
  std::string holding; // non-suffix fragments, in reversed order

  // Comma formatting: consume fragments from the back of the deque.
  while(!split_string.empty()){

    // split_parts keeps the whitespace that follows each comma; trim it so that
    // suffix matching sees "PhD" rather than " PhD" and the output carries no
    // stray spaces.
    const std::string part = trim_whitespace(split_string.back());
    split_string.pop_back();

    // Empty fragments (e.g. from ",," or a trailing comma) contribute nothing.
    if(part.empty()){
      continue;
    }

    if(match_component(part, suffixes)){
      // Fragments arrive back-to-front, so prepend to preserve the written order.
      if(output.empty()){
        output = part;
      } else {
        output = part + " " + output;
      }
    } else {
      if(!holding.empty()){
        holding.append(" ");
      }
      holding.append(part);
    }
  }

  // Join the name and the suffixes, adding a separator only when both are present.
  if(holding.empty()){
    return output;
  }
  if(output.empty()){
    return holding;
  }
  return holding + " " + output;
}

// Reformat every element of `names` with comma_format and return the result.
//
// `names` is taken by value, so the caller's vector is left untouched; the
// local copy is overwritten element by element and then returned.
//
// @param names the names to reformat.
// @return a vector of the same length holding the reformatted names.
std::vector < std::string > human_format::format_vector(std::vector < std::string > names){

  unsigned int position = 0;
  const unsigned int total = names.size();

  while(position < total){

    // Give the user a chance to interrupt once every 10000 elements
    // (including the very first one).
    if((position % 10000) == 0){
      Rcpp::checkUserInterrupt();
    }

    std::string& current = names[position];
    current = comma_format(current);

    ++position;
  }

  return names;
}
19 changes: 19 additions & 0 deletions src/human_format.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#include "human_parse.h"


#ifndef __HUMAN_FORMAT__
#define __HUMAN_FORMAT__

// Name formatter built on top of human_parse: it derives publicly from
// human_parse and so can use that class's protected lookup sets and helpers.
class human_format: public human_parse {

public:

  // Apply comma_format to every element of `names` and return the resulting
  // vector (same length as the input).
  std::vector < std::string > format_vector(std::vector < std::string > names);

private:

  // Rewrite a single comma-delimited name. A name that contains no comma is
  // returned unchanged.
  std::string comma_format(std::string name);

};

#endif
23 changes: 14 additions & 9 deletions src/human_parse.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#include "human_parse.h"

std::deque < std::string > human_parse::split_parts(std::string name){


std::deque < std::string > human_parse::split_parts(std::string name, std::string split_on){

std::deque < std::string > output;
size_t last = 0;
size_t current = name.find(" ");
size_t current = name.find(split_on);

while(current != std::string::npos){
output.push_back(name.substr(last, (current - last)));
last = ++current;
current = name.find(" ", current);
current = name.find(split_on, current);
if(current == std::string::npos){
output.push_back(name.substr(last, name.size()));
}
Expand All @@ -19,10 +20,12 @@ std::deque < std::string > human_parse::split_parts(std::string name){
}

// Erase every occurrence of a given substring (e.g. periods) from a name part
std::string human_parse::erase_periods(std::string part){
std::string human_parse::erase_char(std::string part, std::string char_to_erase){

unsigned int erase_size = char_to_erase.size();

for(size_t i = part.find("."); i != std::string::npos; i = part.find(".")){
part.erase(i, 1);
for(size_t i = part.find(char_to_erase); i != std::string::npos; i = part.find(char_to_erase)){
part.erase(i, erase_size);
}

return part;
Expand All @@ -32,7 +35,7 @@ std::string human_parse::erase_periods(std::string part){
bool human_parse::match_component(std::string part, std::set < std::string > set_ref){

// Clean up - erase periods and lowercase
part = erase_periods(part);
part = erase_char(part, ".");
unsigned int input_size = part.size();
for(unsigned int i = 0; i < input_size; i++){
part[i] = tolower(part[i]);
Expand All @@ -55,7 +58,7 @@ std::vector < std::string > human_parse::parse_single(std::string name){
}

// Split and create output object.
std::deque < std::string > split_name = split_parts(name);
std::deque < std::string > split_name = split_parts(name, " ");
std::vector < std::string > output(5);

// If there's only one element we assume it is a first name and return it.
Expand Down Expand Up @@ -175,13 +178,15 @@ human_parse::human_parse(){
suffixes.insert("ma");
suffixes.insert("dmd");
suffixes.insert("cme");
suffixes.insert("esq");

// Compounds
compounds.insert("vere");
compounds.insert("von");
compounds.insert("van");
compounds.insert("del");
compounds.insert("de");
compounds.insert("den");
compounds.insert("della");
compounds.insert("der");
compounds.insert("di");
Expand Down
10 changes: 6 additions & 4 deletions src/human_parse.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,23 @@ using namespace Rcpp;
#define __HUMAN_PARSE__

class human_parse {
private:

protected:

std::set < std::string > salutations;

std::set < std::string > compounds;

std::set < std::string > suffixes;

std::deque < std::string > split_parts(std::string name);
std::deque < std::string > split_parts(std::string name, std::string split_on);

std::string erase_periods(std::string part);
std::string erase_char(std::string part, std::string char_to_erase);

bool match_component(std::string part, std::set < std::string > set_ref);

private:

std::vector < std::string > parse_single(std::string name);

public:
Expand Down
33 changes: 31 additions & 2 deletions src/humaniformat.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,37 @@
#include "human_parse.h"
#include "human_format.h"

//' @title Parse Human Names
//' @description human names are complex things; sometimes people have honorifics, or not. Or a single middle name, or many. Or
//' a compound surname, or not a compound surname but 'PhD' at the end of their name, and augh.
//'
//' \code{parse_names} provides a simple
//' function for taking consistently formatted human names and splitting them into \code{salutation}, \code{first_name},
//' \code{middle_name}, \code{last_name} and \code{suffix}. It is capable of dealing with compound surnames, multiple middle names,
//' and similar variations, and is fully vectorised.
//'
//' @param names a character vector of names to parse.
//'
//' @return a data.frame with the columns \code{salutation}, \code{first_name},
//' \code{middle_name}, \code{last_name}, \code{suffix} and \code{full_name} (which contains the original name). In the
//' event that a name doesn't \emph{have} a salutation, middle name, suffix, or so on, an empty string will be in that
//' field instead.
//'
//' @examples
//' # Parse a simple name
//' parse_names("Oliver Keyes")
//'
//' # Parse a more complex name
//' parse_names("Hon. Oliver Timothy Keyes Esq.")
//'
//' @export
// [[Rcpp::export]]
DataFrame humaniformat(std::vector < std::string > names){
DataFrame parse_names(std::vector < std::string > names){
human_parse parse_inst;
return parse_inst.parse_vector(names);
}

// Reformat a vector of names: builds a temporary human_format object and hands
// `names` to its format_vector method, returning that method's result.
// [[Rcpp::export]]
std::vector < std::string > format_names(std::vector < std::string > names){
  return human_format().format_vector(names);
}
12 changes: 6 additions & 6 deletions tests/testthat/test.R
Original file line number Diff line number Diff line change
@@ -1,45 +1,45 @@
context("Name parsing")

test_that("Simple {first, last} names can be parsed", {
result <- unlist(humaniformat("Jim Jeffries"))
result <- unlist(parse_names("Jim Jeffries"))
expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
expect_true(result["first_name"] == "Jim")
expect_true(result["last_name"] == "Jeffries")
})

test_that("Names with salutations can be parsed", {
result <- unlist(humaniformat("Dr. Jim Jeffries"))
result <- unlist(parse_names("Dr. Jim Jeffries"))
expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
expect_true(result["first_name"] == "Jim")
expect_true(result["last_name"] == "Jeffries")
expect_true(result["salutation"] == "Dr.")
})

test_that("Names with suffixes can be parsed", {
result <- unlist(humaniformat("Jim Jeffries PhD"))
result <- unlist(parse_names("Jim Jeffries PhD"))
expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
expect_true(result["first_name"] == "Jim")
expect_true(result["last_name"] == "Jeffries")
expect_true(result["suffix"] == "PhD")
})

test_that("Names with middle names can be parsed", {
result <- unlist(humaniformat("Jim Schmidt Jeffries"))
result <- unlist(parse_names("Jim Schmidt Jeffries"))
expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
expect_true(result["first_name"] == "Jim")
expect_true(result["middle_name"] == "Schmidt")
expect_true(result["last_name"] == "Jeffries")
})

test_that("Names with compound surnames can be parsed", {
result <- unlist(humaniformat("Jim de la ben Jeffries"))
result <- unlist(parse_names("Jim de la ben Jeffries"))
expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
expect_true(result["first_name"] == "Jim")
expect_true(result["last_name"] == "de la ben Jeffries")
})

test_that("Names with all elements can be parsed", {
result <- unlist(humaniformat("Rev Jim Schmidt de la ben Jeffries PhD"))
result <- unlist(parse_names("Rev Jim Schmidt de la ben Jeffries PhD"))
expect_true(all(names(result) == c("salutation","first_name","middle_name","last_name","suffix","full_name")))
expect_true(result["salutation"] == "Rev")
expect_true(result["first_name"] == "Jim")
Expand Down

0 comments on commit 706afe8

Please sign in to comment.