Skip to content

Commit

Permalink
[add] capture special tokens in preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
endyul committed Jan 4, 2015
1 parent 89465ae commit b666e80
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 11 deletions.
73 changes: 63 additions & 10 deletions src/segmentor/rulebase.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "segmentor/rulebase.h"
#include "segmentor/special_lexicon.h"

namespace ltp {
namespace segmentor {
Expand Down Expand Up @@ -27,6 +28,23 @@ int preprocess(const std::string & sentence,
flags[i] = 0;
}

// Pass 1: mark every occurrence of a special-lexicon entry (e.g. "C++", "AT&T")
// in `sent` so that it is later emitted as a single token.
// NOTE(review): `pos` is an int while std::string::find returns size_type; the
// `!= std::string::npos` test below relies on npos surviving the round trip
// through int. Confirm this is acceptable for the sentence lengths handled here.
int pos = 0;
// NOTE(review): `i` is int but special_lexicon_size is declared as size_t, so
// this is a signed/unsigned comparison.
for (int i = 0; i < ltp::segmentor::special_lexicon_size; ++i){
// Restart the search from the beginning of the sentence for each entry.
pos = 0;
const std::string& special_word = ltp::segmentor::special_lexicon[i];
// Visit each occurrence of the current entry, scanning left to right.
while((pos = sent.find(special_word, pos)) != std::string::npos){
// Index one past the last byte of the match.
int pos_end = pos + special_word.length();
// Only mark the span when flags_clear_check accepts it -- presumably meaning
// no byte in [pos, pos_end) has been flagged yet; verify against its definition.
if (flags_clear_check(flags, pos, pos_end)){
flags[pos] = SPECIAL_TOKEN_BEG;
// A one-byte match gets only the BEG flag; longer matches also get MID
// flags and an END flag on their last byte.
if(pos_end -1 > pos){
// presumably colors [pos+1, pos_end-1) with MID -- TODO confirm the range
// convention of flags_color.
flags_color(flags, pos+1, pos_end-1, SPECIAL_TOKEN_MID);
flags[pos_end-1] = SPECIAL_TOKEN_END;
}
}
// Resume searching right after this match, whether or not it was marked.
pos = pos_end;
}
}

start = sent.begin();
end = sent.end();

Expand All @@ -36,7 +54,10 @@ int preprocess(const std::string & sentence,

if (flags_clear_check(flags, left, right)) {
flags[left] = URI_BEG;
flags_color(flags, left+1, right, URI_MID);
if(right-1 > left){
flags_color(flags, left+1, right-1, URI_MID);
flags[right-1] = URI_END;
}
}

start = what[0].second;
Expand All @@ -48,10 +69,12 @@ int preprocess(const std::string & sentence,
while (boost::regex_search(start, end, what, engpattern, boost::match_default)) {
int left = what[0].first - sent.begin();
int right = what[0].second - sent.begin();

if (flags_clear_check(flags, left, right)) {
flags[left] = ENG_BEG;
flags_color(flags, left+1, right, ENG_MID);
if(right-1 > left){
flags_color(flags, left+1, right-1, ENG_MID);
flags[right-1]=ENG_END;
}
}

start = what[0].second;
Expand All @@ -62,15 +85,36 @@ int preprocess(const std::string & sentence,

for (int i = 0; i < len; ) {
int flag = 0;
if ((flag = flags[i])) {
form = "";

for (; i<len && flags[i]; ++ i) {
if((flag = flags[i]) == SPECIAL_TOKEN_BEG){
form = "";
form += sent[i++];
for (; i<len && flags[i]==SPECIAL_TOKEN_MID; ++ i) {
form += sent[i];
}
if(i < len && flags[i]==SPECIAL_TOKEN_END){
form += sent[i++];
}
raw_forms.push_back(form);
forms.push_back( __eng__ );
if (chartypes.size() > 0) {
chartypes.back() |= HAVE_ENG_ON_RIGHT;
}

if (flag == ENG_BEG) {
chartypes.push_back(CHAR_ENG);
chartypes.back() |= left;
left = HAVE_ENG_ON_LEFT;
++ret;
} else if((flag = flags[i]) == ENG_BEG){
form = "";
form += sent[i++];
for (; i<len && flags[i]==ENG_MID; ++ i) {
form += sent[i];
}
if(i < len && flags[i]==ENG_END){
form += sent[i++];
}
raw_forms.push_back(form);
forms.push_back( __eng__ );
if (chartypes.size() > 0) {
chartypes.back() |= HAVE_ENG_ON_RIGHT;
Expand All @@ -79,7 +123,17 @@ int preprocess(const std::string & sentence,
chartypes.push_back(CHAR_ENG);
chartypes.back() |= left;
left = HAVE_ENG_ON_LEFT;
} else if (flag == URI_BEG) {
++ret;
} else if ((flag = flags[i]) == URI_BEG){
form = "";
form += sent[i++];
for (; i<len && flags[i]==URI_MID; ++ i) {
form += sent[i];
}
if(i < len && flags[i]==URI_END){
form += sent[i++];
}
raw_forms.push_back(form);
forms.push_back( __uri__ );
if (chartypes.size() > 0) {
chartypes.back() |= HAVE_URI_ON_RIGHT;
Expand All @@ -88,8 +142,7 @@ int preprocess(const std::string & sentence,
chartypes.push_back(CHAR_URI);
chartypes.back() |= left;
left = HAVE_URI_ON_LEFT;
}
++ ret;
++ret;
} else {
if ((sent[i]&0x80)==0) {
if ((sent[i] != ' ') && (sent[i] != '\t')) {
Expand Down
3 changes: 2 additions & 1 deletion src/segmentor/rulebase.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@ namespace ltp {
namespace segmentor {
namespace rulebase {

enum { URI_BEG=1, URI_MID, URI_END, ENG_BEG, ENG_MID, ENG_END };
enum { URI_BEG=1, URI_MID, URI_END, ENG_BEG, ENG_MID, ENG_END, SPECIAL_TOKEN_BEG, SPECIAL_TOKEN_MID, SPECIAL_TOKEN_END };
const int CHAR_ENG = strutils::chartypes::CHAR_PUNC+1;
const int CHAR_URI = strutils::chartypes::CHAR_PUNC+2;


const unsigned HAVE_SPACE_ON_LEFT = (1<<3);
const unsigned HAVE_SPACE_ON_RIGHT = (1<<4);
const unsigned HAVE_ENG_ON_LEFT = (1<<5);
Expand Down
19 changes: 19 additions & 0 deletions src/segmentor/special_lexicon.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#ifndef __LTP_SPECIAL_LEXICON_H__
#define __LTP_SPECIAL_LEXICON_H__

// Built-in lexicon of "special" tokens whose spelling mixes letters and
// punctuation. The segmentor's preprocessing step searches the input for these
// entries.
//
// The header includes what it uses so that it can be included on its own,
// independent of include order in the translation unit.
#include <cstddef>
#include <string>

namespace ltp{
namespace segmentor{

// Entries are matched literally (case-sensitively), which is why both
// lower- and upper-case spellings are listed.
static const std::string special_lexicon[] = {
  "AT&T",
  "c#",
  "C#",
  "c++",
  "C++",
};

// Number of entries in special_lexicon. Dividing by the size of the array's
// own first element keeps the count correct even if the element type changes.
static const size_t special_lexicon_size =
  sizeof(special_lexicon) / sizeof(special_lexicon[0]);

} // namespace segmentor
} // namespace ltp


#endif // end for __LTP_SPECIAL_LEXICON_H__

0 comments on commit b666e80

Please sign in to comment.