// FreeLing 3.1
00001 00002 // 00003 // FreeLing - Open Source Language Analyzers 00004 // 00005 // Copyright (C) 2004 TALP Research Center 00006 // Universitat Politecnica de Catalunya 00007 // 00008 // This library is free software; you can redistribute it and/or 00009 // modify it under the terms of the GNU General Public 00010 // License as published by the Free Software Foundation; either 00011 // version 3 of the License, or (at your option) any later version. 00012 // 00013 // This library is distributed in the hope that it will be useful, 00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public 00019 // License along with this library; if not, write to the Free Software 00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00021 // 00022 // contact: Lluis Padro (padro@lsi.upc.es) 00023 // TALP Research Center 00024 // despatx C6.212 - Campus Nord UPC 00025 // 08034 Barcelona. 
SPAIN 00026 // 00028 00029 #ifndef _AUTOMAT 00030 #define _AUTOMAT 00031 00032 #include <set> 00033 00034 #include "freeling/windll.h" 00035 #include "freeling/morfo/language.h" 00036 #include "freeling/morfo/processor.h" 00037 #include "freeling/morfo/traces.h" 00038 00039 namespace freeling { 00040 00041 #define MAX_STATES 100 00042 #define MAX_TOKENS 50 00043 00044 #define MOD_TRACENAME L"AUTOMAT" 00045 #define MOD_TRACECODE AUTOMAT_TRACE 00046 00050 00051 class automat_status : public processor_status { 00052 public: 00053 // shift beggining of multiword by N words (in_1982 -> 1982) 00054 int shiftbegin; 00055 }; 00056 00103 00104 template <class T> 00105 class WINDLL automat : public processor { 00106 private: 00109 virtual int ComputeToken(int, sentence::iterator &, sentence &) const =0; 00112 virtual void ResetActions(T *) const =0; 00116 virtual void StateActions(int, int, int, sentence::const_iterator, T *) const =0; 00119 virtual void SetMultiwordAnalysis(sentence::iterator, int, const T *) const =0; 00122 virtual bool ValidMultiWord(const word &w, T *st) const { return(true); } 00123 00125 virtual sentence::iterator BuildMultiword(sentence &se, sentence::iterator start, sentence::iterator end, int fs, bool &built) const { 00126 sentence::iterator i; 00127 std::list<word> mw; 00128 std::wstring form; 00129 00130 TRACE(3,L"Building multiword"); 00131 00132 processor_status *pst = se.get_processing_status(); 00133 T *st = (T*)pst; 00134 00135 // ignore initial tokens, if needed (e.g. 
in_1982 -> 1982) 00136 for (int i=0; i<((automat_status*)st)->shiftbegin && start!=end; i++) start++; 00137 00138 for (i=start; i!=end; i++){ 00139 mw.push_back(*i); 00140 form += i->get_form()+L"_"; 00141 TRACE(3,L"added next ["+form+L"]"); 00142 } 00143 // don't forget last word 00144 mw.push_back(*i); 00145 form += i->get_form(); 00146 TRACE(3,L"added last ["+form+L"]"); 00147 00148 // build new word with the mw list, and check whether it is acceptable 00149 word w(form,mw); 00150 00151 if (ValidMultiWord(w,st)) { 00152 TRACE(3,L"Valid Multiword. Modifying the sentence"); 00153 00154 // erasing from the sentence the words that composed the multiword 00155 end++; 00156 i=se.erase(start, end); 00157 // insert new multiword it into the sentence 00158 i=se.insert(i,w); 00159 00160 TRACE(3,L"New word inserted"); 00161 // Set morphological info for new MW 00162 SetMultiwordAnalysis(i,fs,st); 00163 built=true; 00164 } 00165 else { 00166 TRACE(3,L"Multiword found, but rejected. Sentence untouched"); 00167 ResetActions(st); 00168 i=start; 00169 built=false; 00170 } 00171 00172 return(i); 00173 } 00174 00175 00176 protected: 00178 int initialState; 00180 int stopState; 00182 int trans[MAX_STATES][MAX_TOKENS]; 00184 std::set<int> Final; 00185 00186 public: 00188 automat<T>() {}; 00190 virtual ~automat<T>() {}; 00191 00193 bool matching(sentence &se, sentence::iterator &i) const { 00194 sentence::iterator j,sMatch,eMatch; 00195 int newstate, state, token, fstate; 00196 bool found=false; 00197 00198 TRACE(3,L"Checking for mw starting at word '"+i->get_form()+L"'"); 00199 00200 T *pst = new T(); 00201 se.set_processing_status((processor_status *)pst); 00202 00203 // reset automaton 00204 state=initialState; 00205 fstate=0; 00206 ResetActions(pst); 00207 ((automat_status *)pst)->shiftbegin=0; 00208 00209 sMatch=i; eMatch=se.end(); 00210 for (j=i;state != stopState && j!=se.end(); j++) { 00211 // request the child class to compute the token 00212 // code for current word in 
current state 00213 token = ComputeToken(state,j,se); 00214 // do the transition to new state 00215 newstate = trans[state][token]; 00216 // let the child class perform any actions 00217 // for the new state (e.g. computing date value...) 00218 StateActions(state, newstate, token, j, pst); 00219 // change state 00220 state = newstate; 00221 // if the state codes a valid match, remember it 00222 // as the longest match found so long. 00223 if (Final.find(state)!=Final.end()) { 00224 eMatch=j; 00225 fstate=state; 00226 TRACE(3,L"New candidate found"); 00227 } 00228 } 00229 00230 TRACE(3,L"STOP state reached. Check longest match"); 00231 // stop state reached. find longest match (if any) and build a multiword 00232 if (eMatch!=se.end()) { 00233 TRACE(3,L"Match found"); 00234 i=BuildMultiword(se,sMatch,eMatch,fstate,found); 00235 TRACE_SENTENCE(3,se); 00236 } 00237 00238 se.clear_processing_status(); 00239 return(found); 00240 } 00241 00242 00244 void analyze(sentence &se) const { 00245 sentence::iterator i; 00246 bool found=false; 00247 00248 // check whether there is a match starting at each position i 00249 for (i=se.begin(); i!=se.end(); i++) { 00250 if (not i->is_locked()) { 00251 if (matching(se, i)) found=true; 00252 } 00253 else TRACE(3,L"Word '"+i->get_form()+L"' is locked. Skipped."); 00254 } 00255 00256 if (found) se.rebuild_word_index(); 00257 00258 // Printing module results 00259 TRACE_SENTENCE(1,se); 00260 } 00261 00263 using processor::analyze; 00264 }; 00265 00266 #undef MOD_TRACENAME 00267 #undef MOD_TRACECODE 00268 00269 } // namespace 00270 00271 #endif 00272