forked from aymara/lima
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecognizerCompiler.h
153 lines (129 loc) · 5.35 KB
/
recognizerCompiler.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/*
Copyright 2002-2013 CEA LIST
This file is part of LIMA.
LIMA is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
LIMA is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with LIMA. If not, see <http://www.gnu.org/licenses/>
*/
/************************************************************************
*
* File : recognizerCompiler.h
* Project : Named Entities
* Author : Besancon Romaric ([email protected])
* Created on : Mon Apr 7 2003
* Copyright : (c) 2003 by CEA
* Version : $Id: recognizerCompiler.h 9081 2008-02-25 18:34:51Z de-chalendarg $
*
* Description : a class for the compilation of a recognizer based on a text file
*
************************************************************************/
#ifndef RECOGNIZERCOMPILER_H
#define RECOGNIZERCOMPILER_H
#include "AutomatonCompilerExport.h"
#include "linguisticProcessing/core/Automaton/recognizer.h"
#include "gazeteer.h"
#include "subAutomaton.h"
namespace Lima {
namespace LinguisticProcessing {
namespace Automaton {
class LIMA_AUTOMATONCOMPILER_EXPORT RecognizerCompiler
{
public:
RecognizerCompiler(const std::string& filename);
~RecognizerCompiler();
/**
* encoding of the recognizer compiler (encoding of the file
* containing the rules)
*
* @param encoding the encoding to set (if not set, is "latin1",
* can be set to "utf8", other encodings are ignored)
*/
static void setRecognizerEncoding(const std::string& encoding) {
m_recognizerEncoding = encoding;
}
/**
* read a line from a text file into a LimaString, according
* to the encoding set by setRecognizerEncoding() function
*
* @param file the file to read from
* @param line the line read
*/
void readline(LimaString& line);
int getLineNumber() const { return m_lineNumber; }
const std::string& getFilename() const { return m_filename; }
/**
* build a recognizer from a file
*
* @param reco the recognizer build
* @param filename the name of the text file containing the rules
*/
void buildRecognizer(Recognizer& reco,
MediaId language,
const std::string& filename="");
bool endOfFile() { return m_stream->eof(); }
/**
* function to log error properly (filename, line number...)
* and throw exception
*
* @param error
* @param ruleString
*/
void throwError(const std::string& error,
const LimaString& line=LimaString()) const;
void printWarning(const std::string& error,
const LimaString& line=LimaString()) const;
private:
static std::string m_recognizerEncoding;
LimaString m_defaultAction;
std::vector<LimaString> m_activeEntityGroups;
int m_lineNumber; // just kept for better error identification
std::string m_filename; // just kept for better error identification
LimaString m_currentLine; // just kept for better error identification
std::ifstream* m_stream;
uint64_t m_nbRule;
// private functions
void expandGazeteersInRule(LimaString& s,
const std::vector<Gazeteer>& gazeteers);
void expandSubAutomatonsInRule(LimaString& s,
const std::vector<SubAutomaton>& subAutomatons);
LimaString peekConstraints(std::ifstream& file);
void parseTypeDefinition(const LimaString& str,
int& offset,
std::string& typeName,
std::string& openingTag,
std::string& closingTag,
std::vector<std::string>& attributes);
std::string nextFieldTypeDefinition(const LimaString& str,
int& offset);
void readSubAutomaton(const LimaString& line,
const std::vector<Gazeteer>& gazeteers,
std::vector<SubAutomaton>& subAutomatons);
void readGazeteers(const std::string& filename,
std::vector<Gazeteer>& gazeteers,
const std::vector<SubAutomaton>& subAutomatons);
void addRuleWithGazeteerTrigger(const LimaString& gazeteerName,
LimaString& ruleString,
const std::vector<Gazeteer>& gazeteers,
const std::vector<SubAutomaton>& subAutomatons,
const LimaString& defaultAction,
Recognizer& reco,
MediaId language,const std::string& currentId,
const bool keepTrigger=true,
const bool headTrigger=false);
double currentRuleWeight();
// bool checkRule(const Rule& rule,
// const TransitionUnit* trigger,
// MediaId language,
// std::ostringstream& message) const;
};
} // end namespace
} // end namespace
} // end namespace
#endif