FreeLing  3.1
language.h
Go to the documentation of this file.
00001 
00002 //
00003 //    FreeLing - Open Source Language Analyzers
00004 //
00005 //    Copyright (C) 2004   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This library is free software; you can redistribute it and/or
00009 //    modify it under the terms of the GNU General Public
00010 //    License as published by the Free Software Foundation; either
00011 //    version 3 of the License, or (at your option) any later version.
00012 //
00013 //    This library is distributed in the hope that it will be useful,
00014 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 //    General Public License for more details.
00017 //
00018 //    You should have received a copy of the GNU General Public
00019 //    License along with this library; if not, write to the Free Software
00020 //    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00021 //
00022 //    contact: Lluis Padro (padro@lsi.upc.es)
00023 //             TALP Research Center
00024 //             despatx C6.212 - Campus Nord UPC
00025 //             08034 Barcelona.  SPAIN
00026 //
00028 
00029 #ifndef _LANGUAGE
00030 #define _LANGUAGE
00031 
00032 #include <string>
00033 #include <list>
00034 #include <vector>
00035 #include <set>
00036 #include <map>
00037 
00038 #include "freeling/regexp.h"
00039 #include "freeling/windll.h"
00040 #include "freeling/tree.h"
00041 
00042 namespace freeling {
00043 
00044   class word; // forward declaration: analysis (below) declares std::list<word> members/parameters before word is defined
00045 
00050 
00051   class WINDLL analysis {
          // One analysis attached to a word: a lemma/tag pair plus numeric
          // scores, a sense list, an optional list of words, and the set of
          // tagger k-best sequences that selected it. `word` (declared later in
          // this header) derives from std::list<analysis>.
          // Only declarations appear here; the definitions are not in this header.
          // WINDLL comes from freeling/windll.h -- presumably a DLL
          // export/import decoration; confirm there.
00052 
00053   private:
00055     std::wstring lemma;      // see get_lemma() / set_lemma()
00057     std::wstring tag;        // see get_tag() / set_tag()
00059     double prob;             // see has_prob() / get_prob(); how "no value" is encoded is not visible here
00061     double distance;         // see has_distance() / get_distance(); how "no value" is encoded is not visible here
          // (string, double) pairs; the meaning of the double is not stated in this header.
00063     std::list<std::pair<std::wstring,double> > senses;
          // Presumably the list exchanged through set_retokenizable() /
          // get_retokenizable() -- confirm in the implementation.
00065     std::list<word> retok;
00066 
00067     // Indices of the sequences --among the k-best proposed by
00068     // the tagger-- that contain this analysis.
00069     std::set<int> selected_kbest;
00070 
00071   public:
          // Public vector of strings; name suggests free-form client data.
          // No semantics are defined for it in this header.
00073     std::vector<std::wstring> user;
00074 
          // Default constructor, two-string constructor, and user-declared copy
          // assignment. No copy constructor or destructor is declared here.
          // NOTE(review): the two unnamed string parameters are presumably
          // (lemma, tag), mirroring init(l, t) -- confirm.
00076     analysis();
00078     analysis(const std::wstring &, const std::wstring &);
00080     analysis& operator=(const analysis&);
00081 
          // Mutators. Strings and the word list are taken by const reference,
          // scores by value.
00082     void init(const std::wstring &l, const std::wstring &t);
00083     void set_lemma(const std::wstring &);
00084     void set_tag(const std::wstring &);
00085     void set_prob(double);
00086     void set_distance(double);
00087     void set_retokenizable(const std::list<word> &);
00088 
          // Read accessors (all const). Strings and doubles are returned by
          // value; get_retokenizable() returns a reference in both a mutable
          // and a const overload.
00089     bool has_prob() const;
00090     bool has_distance() const;
00091     std::wstring get_lemma() const;
00092     std::wstring get_tag() const;
00093     double get_prob() const;
00094     double get_distance() const;
00095     bool is_retokenizable() const;
00096     std::list<word>& get_retokenizable();
00097     const std::list<word>& get_retokenizable() const;
00098 
          // Sense list access: const and mutable reference getters, plus a
          // setter taking the whole list.
00099     const std::list<std::pair<std::wstring,double> > & get_senses() const;
00100     std::list<std::pair<std::wstring,double> > & get_senses();
00101     void set_senses(const std::list<std::pair<std::wstring,double> > &);
00102     // String form of the senses, useful for the Java API (format not shown here).
00103     std::wstring get_senses_string() const;
00104 
00105     // Get the largest k-best sequence index this analysis is selected in.
00106     int max_kbest() const;
00107     // Find out whether the analysis is selected in the tagger's k-th best sequence (k defaults to 0).
00108     bool is_selected(int k=0) const;
00109     // Mark this analysis as selected in the k-th best sequence (k defaults to 0).
00110     void mark_selected(int k=0);
00111     // Unmark this analysis as selected in the k-th best sequence (k defaults to 0).
00112     void unmark_selected(int k=0);
00113 
          // Comparison operators; the comparison criteria are defined in the
          // implementation and are not visible in this header.
00115     bool operator>(const analysis &) const;
00117     bool operator<(const analysis &) const;
00119     bool operator==(const analysis &) const;
00120   };
00121 
00122 
00127 
00128   class WINDLL word : public std::list<analysis> {
          // A word together with its analyses: the object itself IS the list of
          // analysis objects (public inheritance from std::list<analysis>), and
          // it adds form strings, span/position data, multiword components,
          // alternatives and selection helpers.
          // Only declarations appear here; the definitions are not in this header.
          // NOTE: word::iterator / word::const_iterator declared below hide the
          // std::list<analysis> typedefs of the same names inside this class.
00129   private:
00131     std::wstring form;       // see get_form() / set_form()
00133     std::wstring lc_form;    // see get_lc_form(); presumably a lowercased form -- confirm
00135     std::wstring ph_form;    // see get_ph_form() / set_ph_form(); meaning of "ph" not stated here
00137     std::list<word> multiword;   // see is_multiword() / get_words_mw()
00139     bool ambiguous_mw;       // see is_ambiguous_mw() / set_ambiguous_mw()
          // (string, int) pairs; the meaning of the int is not stated in this header.
00141     std::list<std::pair<std::wstring,int> > alternatives;
00143     unsigned long start, finish;   // see set_span() / get_span_start() / get_span_finish()
00145     bool in_dict;            // see found_in_dict() / set_found_in_dict()
00147     bool locked;             // see lock_analysis() / is_locked()
          // Private helper taking another word; presumably shared by the copy
          // constructor and operator= (the analogous sentence::clone is
          // documented that way) -- confirm.
00149     void clone(const word &);
00151     size_t position;         // see set_position() / get_position()
00152  
          // Private integer constants. Presumably the values stored in the
          // `type` field of word::iterator / word::const_iterator -- confirm.
00154     static const int SELECTED=0;
00155     static const int UNSELECTED=1;
00156     static const int ALL=2;
00157 
00158   public:
00159     // Forward declarations; both classes are defined at the end of this class.
00160     class iterator; 
00161     class const_iterator; 
00162 
          // Public vector of strings; name suggests free-form client data.
          // No semantics are defined for it in this header.
00164     std::vector<std::wstring> user;
00165 
          // Constructors: default; from one string; from a string plus a list
          // of words; from a string, a list of analyses and a list of words;
          // copy constructor; copy assignment.
          // NOTE(review): word(const std::wstring &) is not `explicit`, so a
          // std::wstring converts implicitly to word.
00167     word();
00169     word(const std::wstring &);
00171     word(const std::wstring &, const std::list<word> &);
00173     word(const std::wstring &, const std::list<analysis> &, const std::list<word> &);
00175     word(const word &);
00177     word& operator=(const word&);
00178 
          // Const queries (copy_analysis and set_ambiguous_mw are the only
          // mutators in this group). k defaults to 0 wherever it appears.
00180     void copy_analysis(const word &);
00182     int get_n_selected(int k=0) const;
00184     int get_n_unselected(int k=0) const;
00186     bool is_multiword() const;
00188     bool is_ambiguous_mw() const;
00190     void set_ambiguous_mw(bool);
00192     int get_n_words_mw() const;
00194     const std::list<word>& get_words_mw() const;
00196     std::wstring get_form() const;
00198     std::wstring get_lc_form() const;
00200     std::wstring get_ph_form() const;
          // Iterator ranges over the selected / unselected analyses for index k,
          // each in a mutable and a const overload.
00202     word::iterator selected_begin(int k=0);
00204     word::const_iterator selected_begin(int k=0) const;
00206     word::iterator selected_end(int k=0);
00208     word::const_iterator selected_end(int k=0) const;
00210     word::iterator unselected_begin(int k=0);
00212     word::const_iterator unselected_begin(int k=0) const;
00214     word::iterator unselected_end(int k=0);
00216     word::const_iterator unselected_end(int k=0) const;
00218     unsigned int num_kbest() const;
          // Per-k string accessors, returned by value. Which analysis they read
          // when several are selected is not visible in this header.
00220     std::wstring get_lemma(int k=0) const;
00222     std::wstring get_tag(int k=0) const;
00224     std::wstring get_short_tag(int k=0) const;
00226     std::wstring get_short_tag(const std::wstring &,int k=0) const;
00227 
          // Sense list access for index k: const and mutable reference getters,
          // a string rendering, and a setter.
00229     const std::list<std::pair<std::wstring,double> >& get_senses(int k=0) const;
00230     std::list<std::pair<std::wstring,double> >& get_senses(int k=0);
00231     // String form of the senses, useful for the Java API (format not shown here).
00232     std::wstring get_senses_string(int k=0) const;
00234     void set_senses(const std::list<std::pair<std::wstring,double> > &, int k=0);
00235 
          // Span accessors (unsigned long); units are not stated in this header.
00237     unsigned long get_span_start() const;
00238     unsigned long get_span_finish() const;
00239 
          // Dictionary / retokenization / lock flags.
00241     bool found_in_dict() const;
00243     void set_found_in_dict(bool);
00245     bool has_retokenizable() const;
00247     void lock_analysis();
00249     bool is_locked() const;
00250 
          // Alternatives: add one (string, int) entry, replace or clear the
          // list, query it, and obtain references or begin/end iterators in
          // mutable and const overloads.
00252     void add_alternative(const std::wstring &, int);
00254     void set_alternatives(const std::list<std::pair<std::wstring,int> > &);
00256     void clear_alternatives();
00258     bool has_alternatives() const;
00260     const std::list<std::pair<std::wstring,int> >& get_alternatives() const;
00262     std::list<std::pair<std::wstring,int> >& get_alternatives();
00264     std::list<std::pair<std::wstring,int> >::iterator alternatives_begin();
00266     std::list<std::pair<std::wstring,int> >::iterator alternatives_end();
00268     std::list<std::pair<std::wstring,int> >::const_iterator alternatives_begin() const;
00270     std::list<std::pair<std::wstring,int> >::const_iterator alternatives_end() const;
00271 
          // Mutators for the analysis list and basic fields. set_analysis is
          // overloaded for a single analysis and for a whole list.
00273     void add_analysis(const analysis &);
00275     void set_analysis(const analysis &);
00277     void set_analysis(const std::list<analysis> &);
00279     void set_form(const std::wstring &);
00281     void set_ph_form(const std::wstring &);
00283     void set_span(unsigned long, unsigned long);
00284 
00285     // Get/set the word position (a size_t; what it indexes is not stated here).
00286     void set_position(size_t);
00287     size_t get_position() const;
00288 
          // Takes a freeling::regexp (from freeling/regexp.h) and returns bool;
          // which tags are tested is not visible in this header.
00290     bool find_tag_match(const freeling::regexp &) const;
00291 
          // Analysis counting, (un)selection for index k, a by-value copy of the
          // analysis list, and begin/end iterators over it.
00293     int get_n_analysis() const;
00295     void unselect_all_analysis(int k=0);
00297     void select_all_analysis(int k=0);
00299     void select_analysis(word::iterator, int k=0);
00301     void unselect_analysis(word::iterator, int k=0);
00303     std::list<analysis> get_analysis() const;
00305     word::iterator analysis_begin();
00306     word::const_iterator analysis_begin() const;
00308     word::iterator analysis_end();
00309     word::const_iterator analysis_end() const;
00310 
            // Mutable iterator over a word's analyses. Extends
            // std::list<analysis>::iterator with two extra list iterators and
            // two ints, and redeclares both operator++ forms. Presumably this
            // lets ++ skip analyses that do not match `type` for sequence
            // `kbest` -- confirm in the implementation.
00312     class WINDLL iterator : public std::list<analysis>::iterator {
00313       friend class word::const_iterator;   // const_iterator is constructible from iterator (see below)
00314     private:
00316       std::list<analysis>::iterator ibeg;   // presumably start of the underlying range -- confirm
00318       std::list<analysis>::iterator iend;   // presumably end of the underlying range -- confirm
00320       int type;    // presumably one of word::SELECTED / UNSELECTED / ALL -- confirm
00322       int kbest;   // presumably the k-best sequence index -- confirm
00323 
00324     public:
            // Default, copy, conversion from a plain list iterator, and a full
            // constructor taking three list iterators, an int and k (default 0).
            // NOTE(review): order/meaning of the three iterator arguments is
            // not visible in this header.
00326       iterator();
00328       iterator(const word::iterator &);
00330       iterator(const std::list<analysis>::iterator &);
00332       iterator(const std::list<analysis>::iterator &, 
00333                const std::list<analysis>::iterator &, 
00334                const std::list<analysis>::iterator &, int,int k=0);  
            // Pre-increment returns a reference; post-increment returns a copy.
00336       iterator& operator++();
00337       iterator operator++(int);
00338     };
00339   
            // Const counterpart of word::iterator, built on
            // std::list<analysis>::const_iterator. Additionally constructible
            // from word::iterator and from a mutable list iterator.
00341     class WINDLL const_iterator : public std::list<analysis>::const_iterator {
00342     private:
00344       std::list<analysis>::const_iterator ibeg;   // presumably start of the underlying range -- confirm
00346       std::list<analysis>::const_iterator iend;   // presumably end of the underlying range -- confirm
00348       int type;    // presumably one of word::SELECTED / UNSELECTED / ALL -- confirm
00350       int kbest;   // presumably the k-best sequence index -- confirm
00351 
00352     public:
00354       const_iterator();
00356       const_iterator(const word::const_iterator &);
00358       const_iterator(const word::iterator &);
00360       const_iterator(const std::list<analysis>::const_iterator &);
00362       const_iterator(const std::list<analysis>::iterator &);
00364       const_iterator(const std::list<analysis>::const_iterator &,
00365                      const std::list<analysis>::const_iterator &, 
00366                      const std::list<analysis>::const_iterator &, int, int k=0);
            // Pre-increment returns a reference; post-increment returns a copy.
00368       const_iterator& operator++();  
00369       const_iterator operator++(int);  
00370     };
00371 
00372   };
00373 
00374 
00375 
00381 
00382   class WINDLL node {
          // Payload type of parse_tree (declared below as tree<node>) and base
          // class of depnode. Holds an id, a label, head/chunk markers and a
          // raw pointer to a word. Members are `protected`, so derived classes
          // can reach them directly.
00383   protected:
00385     std::wstring nodeid;   // see get_node_id() / set_node_id()
00387     bool head;             // see is_head() / set_head()
00389     int chunk;             // see is_chunk() / set_chunk() / get_chunk_ord()
00391     std::wstring label;    // see get_label() / set_label()
          // Raw pointer to a word. set_word() takes a word& and get_word()
          // returns a word&; ownership and lifetime are not expressed in this
          // header -- presumably non-owning, confirm in the implementation.
00393     word * w;
00394 
00395   public:
          // Public vector of strings; name suggests free-form client data.
          // No semantics are defined for it in this header.
00397     std::vector<std::wstring> user;
00398 
          // Default constructor and a constructor from one string (which field
          // the string initialises is not visible here).
          // NOTE(review): node(const std::wstring &) is not `explicit`.
00400     node();
00401     node(const std::wstring &);
00403     std::wstring get_node_id() const;
00405     void set_node_id(const std::wstring &);
00407     std::wstring get_label() const;
          // get_word() returns a reference in const and mutable overloads;
          // behaviour when no word has been set is not visible in this header.
00409     const word & get_word() const;
00411     word & get_word();
00413     void set_label(const std::wstring &);
00415     void set_word(word &);
00417     bool is_head() const;
00419     void set_head(const bool);
          // is_chunk() reports a bool while set_chunk()/get_chunk_ord() work
          // with an int; how the int maps to the bool is defined in the
          // implementation.
00421     bool is_chunk() const;
00423     void set_chunk(const int);
00425     int  get_chunk_ord() const;
00426 
00427   };
00428 
00432 
00433   class WINDLL parse_tree : public tree<node> {
          // A tree<node> (template from freeling/tree.h) extended with two
          // lookup indexes over its own nodes: by node id and by word position.
          // The indexes store iterators, so they must be (re)built after the
          // tree changes -- see build_node_index() / rebuild_node_index().
00434   private:
00435     // access nodes by id
00436     std::map<std::wstring,parse_tree::iterator> node_index;
00437     // access leaf nodes by word position
00438     std::vector<parse_tree::iterator> word_index;
00439 
00440   public:
          // Constructors: default, from an iterator (taken by value), and from
          // a node.
          // NOTE(review): the single-argument constructors are not `explicit`.
00441     parse_tree();
00442     parse_tree(parse_tree::iterator p);
00443     parse_tree(const node &);
00444 
          // Index maintenance. The role of the string passed to
          // build_node_index() is not visible in this header.
00446     void build_node_index(const std::wstring &);
00448     void rebuild_node_index();
          // Lookups by id or by position, in const and mutable overloads.
          // What is returned for an unknown id / out-of-range position is not
          // visible in this header.
00450     parse_tree::const_iterator get_node_by_id(const std::wstring &) const;
00452     parse_tree::const_iterator get_node_by_pos(size_t) const;
00454     parse_tree::iterator get_node_by_id(const std::wstring &);
00456     parse_tree::iterator get_node_by_pos(size_t);
00457 
00458   };
00459 
00460 
00465 
00466   class WINDLL depnode : public node {
          // A node that additionally carries a parse_tree iterator ("link").
          // Payload type of dep_tree (declared below as tree<depnode>).
00467 
00468   private:
          // Iterator into a parse_tree; it is only meaningful while that tree
          // is alive and unmodified (general iterator validity -- this header
          // adds no guarantee).
00470     parse_tree::iterator link;
00471 
00472   public:
          // Constructors: default, from a string, and from a plain node.
          // NOTE(review): the single-argument constructors are not `explicit`,
          // so a node converts implicitly to depnode.
00473     depnode();
00474     depnode(const std::wstring &);
00475     depnode(const node &);
00476 
00478     void set_link(const parse_tree::iterator);
          // Link access as a mutable iterator, as a const_iterator, and as a
          // tree<node>& (presumably the subtree the link points at -- confirm).
00480     parse_tree::iterator get_link();
00481     parse_tree::const_iterator get_link() const;
00483     tree<node>& get_link_ref();  
00484   };
00485 
00486 
00487 
00491 
00492   class WINDLL dep_tree :  public tree<depnode> {
          // A tree<depnode> extended with a by-position index over its nodes.
          // The index stores iterators, so it must be rebuilt after the tree
          // changes -- see rebuild_node_index().
00493 
00494   private:
00495     // access nodes by word position
00496     std::vector<dep_tree::iterator> word_index;
00497 
00498   public:
          // Default constructor and construction from a single depnode.
          // NOTE(review): dep_tree(const depnode &) is not `explicit`.
00499     dep_tree();
00500     dep_tree(const depnode &);
00501 
          // Lookup by position in const and mutable overloads; the result for
          // an out-of-range position is not visible in this header.
00503     dep_tree::const_iterator get_node_by_pos(size_t) const;
00505     dep_tree::iterator get_node_by_pos(size_t);
00507     void rebuild_node_index();
00508   };
00509 
00510 
00511 
00517 
00518   class processor_status {
          // Polymorphic base with no data members: a constructor declared here
          // (defined elsewhere) and an empty virtual destructor, so objects can
          // be deleted through a processor_status*. `sentence` stores pointers
          // to this type.
          // NOTE(review): unlike the other classes in this header, this one is
          // not marked WINDLL -- confirm that is intentional.
00519   public:
00520     processor_status();
          // NOTE(review): the ';' after '{}' is a redundant empty declaration.
00521     virtual ~processor_status() {};
00522   };
00523 
00524 
00530 
00531   class WINDLL sentence : public std::list<word> {
          // A sentence: the object itself IS the list of words (public
          // inheritance from std::list<word>), plus an id, a by-position index,
          // parse/dependency trees keyed by an int, and a stack of processor
          // status objects.
          // NOTE: push_back() and clear() declared below hide the std::list
          // members of the same name; std::list's are not virtual, so calls
          // made through a std::list<word>& bypass these versions.
00532   private:
00533     // sentence identifier, in case user application wants to set it.
00534     std::wstring sent_id;
00535     // vector with pointers to sentence words, for fast access by position
          // (raw pointers into this list; see rebuild_word_index())
00536     std::vector<word*> wpos; 
00537     // parse tree (if sentence parsed); keyed by an int, presumably the
          // k passed to set_parse_tree() -- confirm
00538     std::map<int,parse_tree> pts;
00539     // dependency tree (if sentence dep. parsed); keyed by an int,
          // presumably the k passed to set_dep_tree() -- confirm
00540     std::map<int,dep_tree> dts;
00541     // clone sentence (used by assignment/copy constructors)
00542     void clone(const sentence &);
00543     // stack processing status for processor currently analyzing the sentence
00544     // (there might be a hierarchy of embedded processors, thus the stack)
          // Raw pointers: ownership is not expressed in this header.
00545     std::list<processor_status*> status;
00546 
00547   public:
          // pred_arg_set pairs a string with an int->string map; pred_args maps
          // an int to one such set. Public data; key meanings are not stated in
          // this header.
00548     typedef std::pair<std::wstring, std::map<int,std::wstring> > pred_arg_set;
00549     std::map<int,pred_arg_set> pred_args;
00550 
          // Constructors: default, from a list of words, copy; plus copy
          // assignment.
          // NOTE(review): sentence(const std::list<word>&) is not `explicit`.
00551     sentence();
00552     sentence(const std::list<word>&);
00554     sentence(const sentence &);
00556     sentence& operator=(const sentence&);
          // Positional access in const and mutable overloads; bounds handling
          // is not visible in this header.
00558     const word& operator[](size_t) const;
00559     word& operator[](size_t);
00561     unsigned int num_kbest() const;
          // Hides std::list<word>::push_back (see class note above).
00563     void push_back(const word &);
00565     void rebuild_word_index();
00566  
          // Hides std::list<word>::clear (see class note above).
00567     void clear();
00568 
00569     void set_sentence_id(const std::wstring &);
          // NOTE(review): getter is not declared const, so it cannot be called
          // on a const sentence; changing it would also require changing the
          // definition.
00570     std::wstring get_sentence_id();
00571 
          // Parse tree for index k (default 0): setter, mutable and const
          // reference getters, and a parsed-or-not query. Behaviour of the
          // getters when no tree exists for k is not visible in this header.
00572     void set_parse_tree(const parse_tree &, int k=0);
00573     parse_tree & get_parse_tree(int k=0);
00574     const parse_tree & get_parse_tree(int k=0) const;
00575     bool is_parsed() const;
00576 
          // Dependency tree for index k (default 0): same shape as the
          // parse-tree accessors above.
00577     void set_dep_tree(const dep_tree &, int k=0);
00578     dep_tree & get_dep_tree(int k=0);
00579     const dep_tree & get_dep_tree(int k=0) const;
00580     bool is_dep_parsed() const;
00581 
          // Processing-status accessors working on raw processor_status
          // pointers. Whether clear_processing_status() deletes the object is
          // not visible in this header -- check the implementation before
          // relying on ownership.
00583     processor_status* get_processing_status();
00584     const processor_status* get_processing_status() const;
00586     void set_processing_status(processor_status *);
00588     void clear_processing_status();
00589 
          // get_words() returns a std::vector<word> by value (a copy of the
          // words); the words_begin()/words_end() pairs return list iterators
          // in mutable and const overloads.
00591     std::vector<word> get_words() const;
00593     sentence::iterator words_begin();
00594     sentence::const_iterator words_begin() const;
00595     sentence::iterator words_end();
00596     sentence::const_iterator words_end() const;
00597   };
00598 
00603 
00604   class WINDLL paragraph : public std::list<sentence> {
          // A paragraph is a std::list<sentence> with no extra data members.
          // It adds an empty default constructor and an empty virtual
          // destructor, both defined inline.
00605   public:
00606     paragraph() {}
00607     virtual ~paragraph() {}
00608   };
00609 
00614 
00615   class WINDLL document : public std::list<paragraph> {
          // A document: the object itself IS the list of paragraphs (public
          // inheritance from std::list<paragraph>), plus a title paragraph and
          // a two-way association between string node ids and int groups,
          // exposed through the *coref* methods below.
00616 
00617   private:
00618     paragraph title;   // no accessor for it is declared in this class
00619     std::multimap<int,std::wstring> group2node;   // group -> node ids (several per group)
00620     std::map<std::wstring,int> node2group;        // node id -> group (one per node)
00621     int last_group;    // presumably the most recently allocated group number -- confirm
00622 
00623   public:
00624     document();
          // add_positive is overloaded: (node, group) and (node1, node2).
          // Presumably the first adds a node to a group and the second puts
          // two nodes in the same group -- confirm in the implementation.
00626     void add_positive(const std::wstring &node, int group);
00628     void add_positive(const std::wstring &node1, const std::wstring &node2);
          // Const queries. The value returned by get_coref_group() for an
          // unknown node is not visible in this header.
00630     int get_coref_group(const std::wstring&) const;
00632     std::list<std::wstring> get_coref_nodes(int) const;
00634     bool is_coref(const std::wstring &, const std::wstring &) const;
00635   };
00636 
00637 } // namespace
00638 
00639 #endif
00640