// FreeLing 3.1
00001 00002 // 00003 // FreeLing - Open Source Language Analyzers 00004 // 00005 // Copyright (C) 2004 TALP Research Center 00006 // Universitat Politecnica de Catalunya 00007 // 00008 // This library is free software; you can redistribute it and/or 00009 // modify it under the terms of the GNU General Public 00010 // License as published by the Free Software Foundation; either 00011 // version 3 of the License, or (at your option) any later version. 00012 // 00013 // This library is distributed in the hope that it will be useful, 00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public 00019 // License along with this library; if not, write to the Free Software 00020 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00021 // 00022 // contact: Lluis Padro (padro@lsi.upc.es) 00023 // TALP Research Center 00024 // despatx C6.212 - Campus Nord UPC 00025 // 08034 Barcelona. 
SPAIN 00026 // 00028 00029 #ifndef _LANGUAGE 00030 #define _LANGUAGE 00031 00032 #include <string> 00033 #include <list> 00034 #include <vector> 00035 #include <set> 00036 #include <map> 00037 00038 #include "freeling/regexp.h" 00039 #include "freeling/windll.h" 00040 #include "freeling/tree.h" 00041 00042 namespace freeling { 00043 00044 class word; // predeclaration 00045 00050 00051 class WINDLL analysis { 00052 00053 private: 00055 std::wstring lemma; 00057 std::wstring tag; 00059 double prob; 00061 double distance; 00063 std::list<std::pair<std::wstring,double> > senses; 00065 std::list<word> retok; 00066 00067 // store which sequences --among the kbest proposed by 00068 // the tagger-- contain this analysis 00069 std::set<int> selected_kbest; 00070 00071 public: 00073 std::vector<std::wstring> user; 00074 00076 analysis(); 00078 analysis(const std::wstring &, const std::wstring &); 00080 analysis& operator=(const analysis&); 00081 00082 void init(const std::wstring &l, const std::wstring &t); 00083 void set_lemma(const std::wstring &); 00084 void set_tag(const std::wstring &); 00085 void set_prob(double); 00086 void set_distance(double); 00087 void set_retokenizable(const std::list<word> &); 00088 00089 bool has_prob() const; 00090 bool has_distance() const; 00091 std::wstring get_lemma() const; 00092 std::wstring get_tag() const; 00093 double get_prob() const; 00094 double get_distance() const; 00095 bool is_retokenizable() const; 00096 std::list<word>& get_retokenizable(); 00097 const std::list<word>& get_retokenizable() const; 00098 00099 const std::list<std::pair<std::wstring,double> > & get_senses() const; 00100 std::list<std::pair<std::wstring,double> > & get_senses(); 00101 void set_senses(const std::list<std::pair<std::wstring,double> > &); 00102 // useful for java API 00103 std::wstring get_senses_string() const; 00104 00105 // get the largest kbest sequence index the analysis is selected in. 
00106 int max_kbest() const; 00107 // find out whether the analysis is selected in the tagger k-th best sequence 00108 bool is_selected(int k=0) const; 00109 // mark this analysis as selected in k-th best sequence 00110 void mark_selected(int k=0); 00111 // unmark this analysis as selected in k-th best sequence 00112 void unmark_selected(int k=0); 00113 00115 bool operator>(const analysis &) const; 00117 bool operator<(const analysis &) const; 00119 bool operator==(const analysis &) const; 00120 }; 00121 00122 00127 00128 class WINDLL word : public std::list<analysis> { 00129 private: 00131 std::wstring form; 00133 std::wstring lc_form; 00135 std::wstring ph_form; 00137 std::list<word> multiword; 00139 bool ambiguous_mw; 00141 std::list<std::pair<std::wstring,int> > alternatives; 00143 unsigned long start, finish; 00145 bool in_dict; 00147 bool locked; 00149 void clone(const word &); 00151 size_t position; 00152 00154 static const int SELECTED=0; 00155 static const int UNSELECTED=1; 00156 static const int ALL=2; 00157 00158 public: 00159 // predeclarations 00160 class iterator; 00161 class const_iterator; 00162 00164 std::vector<std::wstring> user; 00165 00167 word(); 00169 word(const std::wstring &); 00171 word(const std::wstring &, const std::list<word> &); 00173 word(const std::wstring &, const std::list<analysis> &, const std::list<word> &); 00175 word(const word &); 00177 word& operator=(const word&); 00178 00180 void copy_analysis(const word &); 00182 int get_n_selected(int k=0) const; 00184 int get_n_unselected(int k=0) const; 00186 bool is_multiword() const; 00188 bool is_ambiguous_mw() const; 00190 void set_ambiguous_mw(bool); 00192 int get_n_words_mw() const; 00194 const std::list<word>& get_words_mw() const; 00196 std::wstring get_form() const; 00198 std::wstring get_lc_form() const; 00200 std::wstring get_ph_form() const; 00202 word::iterator selected_begin(int k=0); 00204 word::const_iterator selected_begin(int k=0) const; 00206 word::iterator 
selected_end(int k=0); 00208 word::const_iterator selected_end(int k=0) const; 00210 word::iterator unselected_begin(int k=0); 00212 word::const_iterator unselected_begin(int k=0) const; 00214 word::iterator unselected_end(int k=0); 00216 word::const_iterator unselected_end(int k=0) const; 00218 unsigned int num_kbest() const; 00220 std::wstring get_lemma(int k=0) const; 00222 std::wstring get_tag(int k=0) const; 00224 std::wstring get_short_tag(int k=0) const; 00226 std::wstring get_short_tag(const std::wstring &,int k=0) const; 00227 00229 const std::list<std::pair<std::wstring,double> >& get_senses(int k=0) const; 00230 std::list<std::pair<std::wstring,double> >& get_senses(int k=0); 00231 // useful for java API 00232 std::wstring get_senses_string(int k=0) const; 00234 void set_senses(const std::list<std::pair<std::wstring,double> > &, int k=0); 00235 00237 unsigned long get_span_start() const; 00238 unsigned long get_span_finish() const; 00239 00241 bool found_in_dict() const; 00243 void set_found_in_dict(bool); 00245 bool has_retokenizable() const; 00247 void lock_analysis(); 00249 bool is_locked() const; 00250 00252 void add_alternative(const std::wstring &, int); 00254 void set_alternatives(const std::list<std::pair<std::wstring,int> > &); 00256 void clear_alternatives(); 00258 bool has_alternatives() const; 00260 const std::list<std::pair<std::wstring,int> >& get_alternatives() const; 00262 std::list<std::pair<std::wstring,int> >& get_alternatives(); 00264 std::list<std::pair<std::wstring,int> >::iterator alternatives_begin(); 00266 std::list<std::pair<std::wstring,int> >::iterator alternatives_end(); 00268 std::list<std::pair<std::wstring,int> >::const_iterator alternatives_begin() const; 00270 std::list<std::pair<std::wstring,int> >::const_iterator alternatives_end() const; 00271 00273 void add_analysis(const analysis &); 00275 void set_analysis(const analysis &); 00277 void set_analysis(const std::list<analysis> &); 00279 void set_form(const 
std::wstring &); 00281 void set_ph_form(const std::wstring &); 00283 void set_span(unsigned long, unsigned long); 00284 00285 // get/set word position 00286 void set_position(size_t); 00287 size_t get_position() const; 00288 00290 bool find_tag_match(const freeling::regexp &) const; 00291 00293 int get_n_analysis() const; 00295 void unselect_all_analysis(int k=0); 00297 void select_all_analysis(int k=0); 00299 void select_analysis(word::iterator, int k=0); 00301 void unselect_analysis(word::iterator, int k=0); 00303 std::list<analysis> get_analysis() const; 00305 word::iterator analysis_begin(); 00306 word::const_iterator analysis_begin() const; 00308 word::iterator analysis_end(); 00309 word::const_iterator analysis_end() const; 00310 00312 class WINDLL iterator : public std::list<analysis>::iterator { 00313 friend class word::const_iterator; 00314 private: 00316 std::list<analysis>::iterator ibeg; 00318 std::list<analysis>::iterator iend; 00320 int type; 00322 int kbest; 00323 00324 public: 00326 iterator(); 00328 iterator(const word::iterator &); 00330 iterator(const std::list<analysis>::iterator &); 00332 iterator(const std::list<analysis>::iterator &, 00333 const std::list<analysis>::iterator &, 00334 const std::list<analysis>::iterator &, int,int k=0); 00336 iterator& operator++(); 00337 iterator operator++(int); 00338 }; 00339 00341 class WINDLL const_iterator : public std::list<analysis>::const_iterator { 00342 private: 00344 std::list<analysis>::const_iterator ibeg; 00346 std::list<analysis>::const_iterator iend; 00348 int type; 00350 int kbest; 00351 00352 public: 00354 const_iterator(); 00356 const_iterator(const word::const_iterator &); 00358 const_iterator(const word::iterator &); 00360 const_iterator(const std::list<analysis>::const_iterator &); 00362 const_iterator(const std::list<analysis>::iterator &); 00364 const_iterator(const std::list<analysis>::const_iterator &, 00365 const std::list<analysis>::const_iterator &, 00366 const 
std::list<analysis>::const_iterator &, int, int k=0); 00368 const_iterator& operator++(); 00369 const_iterator operator++(int); 00370 }; 00371 00372 }; 00373 00374 00375 00381 00382 class WINDLL node { 00383 protected: 00385 std::wstring nodeid; 00387 bool head; 00389 int chunk; 00391 std::wstring label; 00393 word * w; 00394 00395 public: 00397 std::vector<std::wstring> user; 00398 00400 node(); 00401 node(const std::wstring &); 00403 std::wstring get_node_id() const; 00405 void set_node_id(const std::wstring &); 00407 std::wstring get_label() const; 00409 const word & get_word() const; 00411 word & get_word(); 00413 void set_label(const std::wstring &); 00415 void set_word(word &); 00417 bool is_head() const; 00419 void set_head(const bool); 00421 bool is_chunk() const; 00423 void set_chunk(const int); 00425 int get_chunk_ord() const; 00426 00427 }; 00428 00432 00433 class WINDLL parse_tree : public tree<node> { 00434 private: 00435 // access nodes by id 00436 std::map<std::wstring,parse_tree::iterator> node_index; 00437 // acces leaf nodes by word position 00438 std::vector<parse_tree::iterator> word_index; 00439 00440 public: 00441 parse_tree(); 00442 parse_tree(parse_tree::iterator p); 00443 parse_tree(const node &); 00444 00446 void build_node_index(const std::wstring &); 00448 void rebuild_node_index(); 00450 parse_tree::const_iterator get_node_by_id(const std::wstring &) const; 00452 parse_tree::const_iterator get_node_by_pos(size_t) const; 00454 parse_tree::iterator get_node_by_id(const std::wstring &); 00456 parse_tree::iterator get_node_by_pos(size_t); 00457 00458 }; 00459 00460 00465 00466 class WINDLL depnode : public node { 00467 00468 private: 00470 parse_tree::iterator link; 00471 00472 public: 00473 depnode(); 00474 depnode(const std::wstring &); 00475 depnode(const node &); 00476 00478 void set_link(const parse_tree::iterator); 00480 parse_tree::iterator get_link(); 00481 parse_tree::const_iterator get_link() const; 00483 tree<node>& 
get_link_ref(); 00484 }; 00485 00486 00487 00491 00492 class WINDLL dep_tree : public tree<depnode> { 00493 00494 private: 00495 // acces nodes by word position 00496 std::vector<dep_tree::iterator> word_index; 00497 00498 public: 00499 dep_tree(); 00500 dep_tree(const depnode &); 00501 00503 dep_tree::const_iterator get_node_by_pos(size_t) const; 00505 dep_tree::iterator get_node_by_pos(size_t); 00507 void rebuild_node_index(); 00508 }; 00509 00510 00511 00517 00518 class processor_status { 00519 public: 00520 processor_status(); 00521 virtual ~processor_status() {}; 00522 }; 00523 00524 00530 00531 class WINDLL sentence : public std::list<word> { 00532 private: 00533 // sentence identifier, in case user application wants to set it. 00534 std::wstring sent_id; 00535 // vector with pointers to sentence words, for fast access by position 00536 std::vector<word*> wpos; 00537 // parse tree (if sentence parsed) 00538 std::map<int,parse_tree> pts; 00539 // dependencey tree (if sentence dep. parsed) 00540 std::map<int,dep_tree> dts; 00541 // clone sentence (used by assignment/copy constructors) 00542 void clone(const sentence &); 00543 // stack processing status for processor currently analyzing the sentence 00544 // (there might be a hierarchy of embeeded processors, thus the stack) 00545 std::list<processor_status*> status; 00546 00547 public: 00548 typedef std::pair<std::wstring, std::map<int,std::wstring> > pred_arg_set; 00549 std::map<int,pred_arg_set> pred_args; 00550 00551 sentence(); 00552 sentence(const std::list<word>&); 00554 sentence(const sentence &); 00556 sentence& operator=(const sentence&); 00558 const word& operator[](size_t) const; 00559 word& operator[](size_t); 00561 unsigned int num_kbest() const; 00563 void push_back(const word &); 00565 void rebuild_word_index(); 00566 00567 void clear(); 00568 00569 void set_sentence_id(const std::wstring &); 00570 std::wstring get_sentence_id(); 00571 00572 void set_parse_tree(const parse_tree &, int k=0); 00573 
parse_tree & get_parse_tree(int k=0); 00574 const parse_tree & get_parse_tree(int k=0) const; 00575 bool is_parsed() const; 00576 00577 void set_dep_tree(const dep_tree &, int k=0); 00578 dep_tree & get_dep_tree(int k=0); 00579 const dep_tree & get_dep_tree(int k=0) const; 00580 bool is_dep_parsed() const; 00581 00583 processor_status* get_processing_status(); 00584 const processor_status* get_processing_status() const; 00586 void set_processing_status(processor_status *); 00588 void clear_processing_status(); 00589 00591 std::vector<word> get_words() const; 00593 sentence::iterator words_begin(); 00594 sentence::const_iterator words_begin() const; 00595 sentence::iterator words_end(); 00596 sentence::const_iterator words_end() const; 00597 }; 00598 00603 00604 class WINDLL paragraph : public std::list<sentence> { 00605 public: 00606 paragraph() {} 00607 virtual ~paragraph() {} 00608 }; 00609 00614 00615 class WINDLL document : public std::list<paragraph> { 00616 00617 private: 00618 paragraph title; 00619 std::multimap<int,std::wstring> group2node; 00620 std::map<std::wstring,int> node2group; 00621 int last_group; 00622 00623 public: 00624 document(); 00626 void add_positive(const std::wstring &node, int group); 00628 void add_positive(const std::wstring &node1, const std::wstring &node2); 00630 int get_coref_group(const std::wstring&) const; 00632 std::list<std::wstring> get_coref_nodes(int) const; 00634 bool is_coref(const std::wstring &, const std::wstring &) const; 00635 }; 00636 00637 } // namespace 00638 00639 #endif 00640