FreeLing  3.1
automat.h
Go to the documentation of this file.
00001 
00002 //
00003 //    FreeLing - Open Source Language Analyzers
00004 //
00005 //    Copyright (C) 2004   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This library is free software; you can redistribute it and/or
00009 //    modify it under the terms of the GNU General Public
00010 //    License as published by the Free Software Foundation; either
00011 //    version 3 of the License, or (at your option) any later version.
00012 //
00013 //    This library is distributed in the hope that it will be useful,
00014 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 //    General Public License for more details.
00017 //
00018 //    You should have received a copy of the GNU General Public
00019 //    License along with this library; if not, write to the Free Software
00020 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00021 //
00022 //    contact: Lluis Padro (padro@lsi.upc.es)
00023 //             TALP Research Center
00024 //             despatx C6.212 - Campus Nord UPC
00025 //             08034 Barcelona.  SPAIN
00026 //
00028 
00029 #ifndef _AUTOMAT
00030 #define _AUTOMAT
00031 
00032 #include <set>
00033 
00034 #include "freeling/windll.h"
00035 #include "freeling/morfo/language.h"
00036 #include "freeling/morfo/processor.h"
00037 #include "freeling/morfo/traces.h"
00038 
00039 namespace freeling {
00040 
/// Dimensions of the automat transition table `trans[MAX_STATES][MAX_TOKENS]`:
/// upper bound on the number of states ...
#define MAX_STATES 100
/// ... and on the number of distinct token codes.
#define MAX_TOKENS 50

/// Module name and trace code for this header. They are #undef'd again at
/// the end of the file; presumably they are consumed by the TRACE /
/// TRACE_SENTENCE macros from traces.h -- confirm there.
#define MOD_TRACENAME L"AUTOMAT"
#define MOD_TRACECODE AUTOMAT_TRACE
00046 
00050 
00051   class automat_status : public processor_status {
00052   public:
00053     // shift beggining of multiword by N words (in_1982 -> 1982)
00054     int shiftbegin; 
00055   };
00056 
00103 
00104   template <class T>
00105     class WINDLL automat : public processor {
00106   private:
00109     virtual int ComputeToken(int, sentence::iterator &, sentence &) const =0;
00112     virtual void ResetActions(T *) const =0;
00116     virtual void StateActions(int, int, int, sentence::const_iterator, T *) const =0;
00119     virtual void SetMultiwordAnalysis(sentence::iterator, int, const T *) const =0;
00122     virtual bool ValidMultiWord(const word &w, T *st) const { return(true); }
00123 
00125     virtual sentence::iterator BuildMultiword(sentence &se, sentence::iterator start, sentence::iterator end, int fs, bool &built) const {
00126       sentence::iterator i;
00127       std::list<word> mw;
00128       std::wstring form;
00129     
00130       TRACE(3,L"Building multiword");
00131         
00132       processor_status *pst = se.get_processing_status();
00133       T *st = (T*)pst;
00134 
00135       // ignore initial tokens, if needed (e.g. in_1982 -> 1982)
00136       for (int i=0; i<((automat_status*)st)->shiftbegin && start!=end; i++) start++;
00137     
00138       for (i=start; i!=end; i++){
00139         mw.push_back(*i);           
00140         form += i->get_form()+L"_";
00141         TRACE(3,L"added next ["+form+L"]");
00142       } 
00143       // don't forget last word
00144       mw.push_back(*i);           
00145       form += i->get_form();
00146       TRACE(3,L"added last ["+form+L"]");
00147     
00148       // build new word with the mw list, and check whether it is acceptable
00149       word w(form,mw);
00150     
00151       if (ValidMultiWord(w,st)) {  
00152         TRACE(3,L"Valid Multiword. Modifying the sentence");
00153       
00154         // erasing from the sentence the words that composed the multiword
00155         end++;
00156         i=se.erase(start, end);
00157         // insert new multiword it into the sentence
00158         i=se.insert(i,w); 
00159       
00160         TRACE(3,L"New word inserted");
00161         // Set morphological info for new MW
00162         SetMultiwordAnalysis(i,fs,st);
00163         built=true;
00164       }
00165       else {
00166         TRACE(3,L"Multiword found, but rejected. Sentence untouched");
00167         ResetActions(st);
00168         i=start;
00169         built=false;
00170       }
00171     
00172       return(i);
00173     }
00174   
00175   
00176   protected:
00178     int initialState;
00180     int stopState;
00182     int trans[MAX_STATES][MAX_TOKENS];
00184     std::set<int> Final;
00185 
00186   public:
00188     automat<T>() {};
00190     virtual ~automat<T>() {};
00191 
00193     bool matching(sentence &se, sentence::iterator &i) const {
00194       sentence::iterator j,sMatch,eMatch; 
00195       int newstate, state, token, fstate;
00196       bool found=false;
00197 
00198       TRACE(3,L"Checking for mw starting at word '"+i->get_form()+L"'");
00199 
00200       T *pst = new T();
00201       se.set_processing_status((processor_status *)pst);  
00202     
00203       // reset automaton
00204       state=initialState;
00205       fstate=0;
00206       ResetActions(pst);
00207       ((automat_status *)pst)->shiftbegin=0;
00208     
00209       sMatch=i; eMatch=se.end();
00210       for (j=i;state != stopState && j!=se.end(); j++) {
00211         // request the child class to compute the token
00212         // code for current word in current state
00213         token = ComputeToken(state,j,se);
00214         // do the transition to new state
00215         newstate = trans[state][token];
00216         // let the child class perform any actions 
00217         // for the new state (e.g. computing date value...)
00218         StateActions(state, newstate, token, j, pst);
00219         // change state
00220         state = newstate;
00221         // if the state codes a valid match, remember it
00222         //  as the longest match found so long.
00223         if (Final.find(state)!=Final.end()) {
00224           eMatch=j;
00225           fstate=state;
00226           TRACE(3,L"New candidate found");
00227         }
00228       }
00229     
00230       TRACE(3,L"STOP state reached. Check longest match");
00231       // stop state reached. find longest match (if any) and build a multiword
00232       if (eMatch!=se.end()) {
00233         TRACE(3,L"Match found");
00234         i=BuildMultiword(se,sMatch,eMatch,fstate,found);
00235         TRACE_SENTENCE(3,se);
00236       }
00237     
00238       se.clear_processing_status();
00239       return(found);
00240     }
00241   
00242 
00244     void analyze(sentence &se) const {
00245       sentence::iterator i;
00246       bool found=false;
00247 
00248       // check whether there is a match starting at each position i
00249       for (i=se.begin(); i!=se.end(); i++) {
00250         if (not i->is_locked()) {
00251           if (matching(se, i)) found=true;
00252         }
00253         else TRACE(3,L"Word '"+i->get_form()+L"' is locked. Skipped.");
00254       }
00255     
00256       if (found) se.rebuild_word_index();
00257     
00258       // Printing module results
00259       TRACE_SENTENCE(1,se);
00260     }
00261 
00263     using processor::analyze;
00264   };
00265 
00266 #undef MOD_TRACENAME
00267 #undef MOD_TRACECODE
00268 
00269 } // namespace
00270 
00271 #endif
00272