From b666e809effe2d0f867157e6feef61ff8be7292e Mon Sep 17 00:00:00 2001 From: endyul Date: Sun, 4 Jan 2015 15:58:58 +0800 Subject: [PATCH 1/2] [add] capture special tokens in preprocessing --- src/segmentor/rulebase.cpp | 73 ++++++++++++++++++++++++++++----- src/segmentor/rulebase.h | 3 +- src/segmentor/special_lexicon.h | 19 +++++++++ 3 files changed, 84 insertions(+), 11 deletions(-) create mode 100644 src/segmentor/special_lexicon.h diff --git a/src/segmentor/rulebase.cpp b/src/segmentor/rulebase.cpp index 3b1a37e1b..c05d355b6 100644 --- a/src/segmentor/rulebase.cpp +++ b/src/segmentor/rulebase.cpp @@ -1,4 +1,5 @@ #include "segmentor/rulebase.h" +#include "segmentor/special_lexicon.h" namespace ltp { namespace segmentor { @@ -27,6 +28,23 @@ int preprocess(const std::string & sentence, flags[i] = 0; } + int pos = 0; + for (int i = 0; i < ltp::segmentor::special_lexicon_size; ++i){ + pos = 0; + const std::string& special_word = ltp::segmentor::special_lexicon[i]; + while((pos = sent.find(special_word, pos)) != std::string::npos){ + int pos_end = pos + special_word.length(); + if (flags_clear_check(flags, pos, pos_end)){ + flags[pos] = SPECIAL_TOKEN_BEG; + if(pos_end -1 > pos){ + flags_color(flags, pos+1, pos_end-1, SPECIAL_TOKEN_MID); + flags[pos_end-1] = SPECIAL_TOKEN_END; + } + } + pos = pos_end; + } + } + start = sent.begin(); end = sent.end(); @@ -36,7 +54,10 @@ int preprocess(const std::string & sentence, if (flags_clear_check(flags, left, right)) { flags[left] = URI_BEG; - flags_color(flags, left+1, right, URI_MID); + if(right-1 > left){ + flags_color(flags, left+1, right-1, URI_MID); + flags[right-1] = URI_END; + } } start = what[0].second; @@ -48,10 +69,12 @@ int preprocess(const std::string & sentence, while (boost::regex_search(start, end, what, engpattern, boost::match_default)) { int left = what[0].first - sent.begin(); int right = what[0].second - sent.begin(); - if (flags_clear_check(flags, left, right)) { flags[left] = ENG_BEG; - flags_color(flags, left+1, right, ENG_MID); + if(right-1 > left){ + flags_color(flags, left+1, right-1, ENG_MID); + flags[right-1]=ENG_END; + } } start = what[0].second; @@ -62,15 +85,36 @@ int preprocess(const std::string & sentence, for (int i = 0; i < len; ) { int flag = 0; - if ((flag = flags[i])) { - form = ""; - for (; i 0) { + chartypes.back() |= HAVE_ENG_ON_RIGHT; + } - if (flag == ENG_BEG) { + chartypes.push_back(CHAR_ENG); + chartypes.back() |= left; + left = HAVE_ENG_ON_LEFT; + ++ret; + } else if((flag = flags[i]) == ENG_BEG){ + form = ""; + form += sent[i++]; + for (; i 0) { chartypes.back() |= HAVE_ENG_ON_RIGHT; @@ -79,7 +123,17 @@ int preprocess(const std::string & sentence, chartypes.push_back(CHAR_ENG); chartypes.back() |= left; left = HAVE_ENG_ON_LEFT; - } else if (flag == URI_BEG) { + ++ret; + } else if ((flag = flags[i]) == URI_BEG){ + form = ""; + form += sent[i++]; + for (; i 0) { chartypes.back() |= HAVE_URI_ON_RIGHT; @@ -88,8 +142,7 @@ int preprocess(const std::string & sentence, chartypes.push_back(CHAR_URI); chartypes.back() |= left; left = HAVE_URI_ON_LEFT; - } - ++ ret; + ++ret; } else { if ((sent[i]&0x80)==0) { if ((sent[i] != ' ') && (sent[i] != '\t')) { diff --git a/src/segmentor/rulebase.h b/src/segmentor/rulebase.h index 2f18cb173..45640e9b4 100644 --- a/src/segmentor/rulebase.h +++ b/src/segmentor/rulebase.h @@ -22,10 +22,11 @@ namespace ltp { namespace segmentor { namespace rulebase { -enum { URI_BEG=1, URI_MID, URI_END, ENG_BEG, ENG_MID, ENG_END }; +enum { URI_BEG=1, URI_MID, URI_END, ENG_BEG, ENG_MID, ENG_END, SPECIAL_TOKEN_BEG, SPECIAL_TOKEN_MID, SPECIAL_TOKEN_END }; const int CHAR_ENG = strutils::chartypes::CHAR_PUNC+1; const int CHAR_URI = strutils::chartypes::CHAR_PUNC+2; + const unsigned HAVE_SPACE_ON_LEFT = (1<<3); const unsigned HAVE_SPACE_ON_RIGHT = (1<<4); const unsigned HAVE_ENG_ON_LEFT = (1<<5); diff --git a/src/segmentor/special_lexicon.h b/src/segmentor/special_lexicon.h new file mode 100644 index 000000000..a9be61371 --- /dev/null +++ b/src/segmentor/special_lexicon.h @@ -0,0 +1,19 @@ +#ifndef __LTP_SPECIAL_LEXICON_H__ +#define __LTP_SPECIAL_LEXICON_H__ + +namespace ltp{ +namespace segmentor{ +const static std::string special_lexicon[] = { +"AT&T", +"c#", +"C#", +"c++", +"C++", +}; + +const static size_t special_lexicon_size = sizeof(special_lexicon) / sizeof(std::string); +} +} + + +#endif From 487b8cf4ebd03e895f08449cc02b3d2ec3ff245d Mon Sep 17 00:00:00 2001 From: endyul Date: Sun, 18 Jan 2015 22:46:07 +0800 Subject: [PATCH 2/2] [fix] change name --- src/segmentor/rulebase.cpp | 10 +++++----- src/segmentor/special_lexicon.h | 19 ------------------- src/segmentor/special_tokens.h | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+), 24 deletions(-) delete mode 100644 src/segmentor/special_lexicon.h create mode 100644 src/segmentor/special_tokens.h diff --git a/src/segmentor/rulebase.cpp b/src/segmentor/rulebase.cpp index c05d355b6..0a516df88 100644 --- a/src/segmentor/rulebase.cpp +++ b/src/segmentor/rulebase.cpp @@ -1,5 +1,5 @@ #include "segmentor/rulebase.h" -#include "segmentor/special_lexicon.h" +#include "segmentor/special_tokens.h" namespace ltp { namespace segmentor { @@ -29,11 +29,11 @@ int preprocess(const std::string & sentence, } int pos = 0; - for (int i = 0; i < ltp::segmentor::special_lexicon_size; ++i){ + for (int i = 0; i < ltp::segmentor::special_tokens_size; ++i){ pos = 0; - const std::string& special_word = ltp::segmentor::special_lexicon[i]; - while((pos = sent.find(special_word, pos)) != std::string::npos){ - int pos_end = pos + special_word.length(); + const std::string& special_token = ltp::segmentor::special_tokens[i]; + while((pos = sent.find(special_token, pos)) != std::string::npos){ + int pos_end = pos + special_token.length(); if (flags_clear_check(flags, pos, pos_end)){ flags[pos] = SPECIAL_TOKEN_BEG; if(pos_end -1 > pos){ diff --git a/src/segmentor/special_lexicon.h b/src/segmentor/special_lexicon.h deleted file mode 100644 index a9be61371..000000000 --- a/src/segmentor/special_lexicon.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __LTP_SPECIAL_LEXICON_H__ -#define __LTP_SPECIAL_LEXICON_H__ - -namespace ltp{ -namespace segmentor{ -const static std::string special_lexicon[] = { -"AT&T", -"c#", -"C#", -"c++", -"C++", -}; - -const static size_t special_lexicon_size = sizeof(special_lexicon) / sizeof(std::string); -} -} - - -#endif diff --git a/src/segmentor/special_tokens.h b/src/segmentor/special_tokens.h new file mode 100644 index 000000000..6d056da5f --- /dev/null +++ b/src/segmentor/special_tokens.h @@ -0,0 +1,19 @@ +#ifndef __LTP_SPECIAL_TOKENS_H__ +#define __LTP_SPECIAL_TOKENS_H__ + +namespace ltp{ +namespace segmentor{ +const static std::string special_tokens[] = { +"AT&T", +"c#", +"C#", +"c++", +"C++", +}; + +const static size_t special_tokens_size = sizeof(special_tokens) / sizeof(std::string); +} +} + + +#endif