FreeLing  3.1
util.h
Go to the documentation of this file.
00001 
00002 //
00003 //    Fries - Feature Retriever for Intensional Encoding of Sentences
00004 //
00005 //    Copyright (C) 2006   TALP Research Center
00006 //                         Universitat Politecnica de Catalunya
00007 //
00008 //    This file is part of the Fries library
00009 //
00010 //    The Fries library is free software; you can redistribute it 
00011 //    and/or modify it under the terms of the GNU General Public
00012 //    License as published by the Free Software Foundation; either
00013 //    version 3 of the License, or (at your option) any later version.
00014 //
00015 //    This library is distributed in the hope that it will be useful,
00016 //    but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 //    General Public License for more details.
00019 //
00020 //    You should have received a copy of the GNU General Public
00021 //    License along with this library; if not, write to the Free Software
00022 //    Foundation, Inc., 51 Franklin St, 5th Floor, Boston, MA 02110-1301 USA
00023 //
00024 //    contact: Lluis Padro (padro@lsi.upc.es)
00025 //             TALP Research Center
00026 //             despatx Omega.S112 - Campus Nord UPC
00027 //             08034 Barcelona.  SPAIN
00028 //
00030 
00031 #ifndef _UTIL
00032 #define _UTIL
00033 
00034 #include <cstdio>
00035 #include <list>
00036 #include <string>
00037 #include <vector>
00038 #include <set>
00039 
00040 #include <locale>
00041 #include <iostream>
00042 #include "freeling/utf8/utf8.h"
00043 
00044 #include "freeling/regexp.h"
00045 #include "freeling/windll.h"
00046 #include "freeling/morfo/traces.h"
00047 
// Platform shims used by util::new_tempfile_name(): they hide the differences
// between the Win32 and the C-library ways of obtaining a process id and a
// temporary file name.
#ifdef WIN32
#include <windows.h>
// Provide POSIX-looking getpid()/pid_t on top of the Win32 API.
#define getpid() GetCurrentProcessId()
#define pid_t DWORD
// tmpnam_s() reports its status as an errno_t; any non-zero value is treated as failure.
#define err_type errno_t
#define NEW_TMPNAME(buf,sz) tmpnam_s(buf,sz)
#define TMPNAME_FAILED(x) x
#else
// tmpnam() returns a char*; NULL is treated as failure. The size argument is ignored here.
#define err_type char*
#define NEW_TMPNAME(buf,sz) tmpnam(buf)
#define TMPNAME_FAILED(x) (x==NULL)
#endif
00060 
00061 
// Capitalization patterns.
// NOTE(review): presumably the values returned by util::capitalization() and
// accepted as the int argument of util::capitalize() -- confirm against the
// implementation, which is not part of this header.
#define UPPER_NONE 0
#define UPPER_1ST 1
#define UPPER_ALL 2
00066  
00067 namespace freeling {
00068 
// Trace identification for the code in this header. Both macros are #undef'd
// again at the end of the namespace.
// NOTE(review): presumably consumed by the ERROR_CRASH/WARNING macros from
// freeling/morfo/traces.h -- confirm there.
#define MOD_TRACENAME L"UTIL"
#define MOD_TRACECODE UTIL_TRACE
00071 
00077 
00078   class WINDLL util {
00079 
00080   public:
00082     static freeling::regexp RE_has_lowercase;   // wstring contains lowercase chars
00083     static freeling::regexp RE_has_alphanum;    // wstring contains alphanum chars
00084     static freeling::regexp RE_is_capitalized;  // wstring is capitalized
00085     static freeling::regexp RE_all_digits;      // wstring is all digits
00086     static freeling::regexp RE_all_caps;        // wstring is uppercase
00087     static freeling::regexp RE_initial_dot;     // wstring is an initial plus optional dot
00088     static freeling::regexp RE_all_caps_dot;    // wstring is uppercase plus optional dot
00089     static freeling::regexp RE_capitalized_dot; // wstring is capitalized plus optional dot
00090     static freeling::regexp RE_has_digits;      // wstring contains digits
00091     static freeling::regexp RE_lowercase_dot;   // wstring is lowercase plus optional dot
00092 
00093     static freeling::regexp RE_win_absolute_path; // to detect absolute paths in windows
00094 
00096     static void init_locale(const std::wstring &s=L"default");
00098     static void open_utf8_file(std::wifstream &, const std::wstring &);
00100     static void open_utf8_file(std::wofstream &, const std::wstring &);
00102     static std::wstring lowercase(const std::wstring &);
00104     static std::wstring uppercase(const std::wstring &);
00106     static bool is_absolute(const std::string &p);
00108     static bool is_absolute(const std::wstring &p);
00110     static std::string get_current_path(); 
00112     static std::string absolute(const std::string &, const std::string &);
00114     static std::wstring absolute(const std::wstring &, const std::wstring &);
00116     static std::string expand_filename(const std::string &);
00118     static std::wstring expand_filename(const std::wstring &);
00120     static std::wstring new_tempfile_name();
00122     static std::wstring remove_chars(const std::wstring &, const std::wstring &);
00124     static void find_and_replace(std::wstring &, const std::wstring &, const std::wstring &);
00125 
00127     static int wstring2int(const std::wstring &);
00128     static double wstring2double(const std::wstring &);
00129     static long double wstring2longdouble(const std::wstring &);
00130 
00131     template<class C> static std::wstring wstring_from(const C&, const std::wstring &);
00132     template<class C> static std::wstring wstring_from(const C&);
00133     template<class C> static C wstring_to(const std::wstring &, const std::wstring &, bool mcsep=true);
00134     template<class C> static C wstring_to(const std::wstring &);
00135 
00136     template<class P1,class P2> static std::wstring pairlist2wstring(const std::list<std::pair<P1,P2> > &, const std::wstring &, const std::wstring &);
00137     template<class P1,class P2> static std::list<std::pair<P1,P2> > wstring2pairlist(const std::wstring &, const std::wstring &, const std::wstring &);
00138 
00139     static int capitalization(const std::wstring &);
00140     static std::wstring capitalize(const std::wstring &, int, bool);
00141 
00143     template<class T1,class T2> static bool ascending_first(const std::pair<T1,T2> &, const std::pair<T1,T2> &);
00144     template<class T1,class T2> static bool ascending_second(const std::pair<T1,T2> &, const std::pair<T1,T2> &);
00145     template<class T1,class T2> static bool descending_first(const std::pair<T1,T2> &, const std::pair<T1,T2> &);
00146     template<class T1,class T2> static bool descending_second(const std::pair<T1,T2> &, const std::pair<T1,T2> &);
00147   };
00148 
00149 
00153 
00154   inline std::wstring util::new_tempfile_name() {
00155     char* tempfile = new char[L_tmpnam+1]; 
00156     err_type err = NEW_TMPNAME(tempfile,L_tmpnam+1);
00157     if (TMPNAME_FAILED(err))
00158       ERROR_CRASH(L"Error occurred creating unique filename.");
00159     std::wstring fname = wstring_from(tempfile)+L"-FL-"+wstring_from(getpid());
00160     delete[] tempfile;
00161     return fname;
00162   }
00163 
00167 
00168   template<class C>
00169     inline std::wstring util::wstring_from(const C& ls, const std::wstring &sep) {
00170     // if nothing to convert, we are done
00171     if (ls.empty()) return L"";  
00172     // print first element to output
00173     typename C::const_iterator i=ls.begin();
00174     std::wstring sn;  sn=(*i);  
00175     // print all remaining elements, adding separators
00176     while (++i!=ls.end()) sn += sep+(*i);
00177     // return resulting string
00178     return(sn); 
00179   }
00180  
00184 
00185   template<class C>
00186     inline std::wstring util::wstring_from(const C & x) {
00187     std::wostringstream ss;
00188     ss<<std::fixed<<x;
00189     return ss.str();
00190   }
00191 
00195 
00196   template<>
00197     inline std::wstring util::wstring_from(const long double &x) {
00198     std::wostringstream ss;
00199     ss<<std::fixed<<x;
00200     // remove decimal digits if all zeros.
00201     std::wstring s(ss.str());
00202     std::wstring::size_type pos = s.find(L'.');
00203     std::wstring::size_type posLast = s.find_last_not_of(L"0");
00204     if ((pos != s.npos) && (posLast != s.npos) && (posLast >= pos)) {
00205       if (posLast == pos) s.erase(pos);
00206       else s.erase(posLast+1);
00207     }
00208     return s;
00209   }
00210 
00214 
00215   template<>
00216     inline std::wstring util::wstring_from(const std::string &s) {
00217     std::wstring ws;
00218     if (sizeof(std::wstring::value_type)==2) 
00219       utf8::utf8to16(s.begin(), s.end(), back_inserter(ws));
00220     else if (sizeof(std::wstring::value_type)==4) 
00221       utf8::utf8to32(s.begin(), s.end(), back_inserter(ws));
00222     else 
00223       WARNING(L"Unexpected wchar size "+wstring_from<int>(sizeof(std::wstring::value_type)));
00224     return ws; 
00225   }
00226 
00227 
00233 
00234   template<class C>
00235     inline C util::wstring_to(const std::wstring &ws, const std::wstring &sep, bool mcsep) {
00236     C ls;
00237     std::wstring::size_type p,q;
00238     // at each occurence of separator "sep" in string "s", cut and insert at the end of the container
00239     p=0; q = (mcsep? ws.find(sep) : ws.find_first_of(sep));
00240     while(q!=std::wstring::npos){
00241       ls.insert(ls.end(),ws.substr(p,q-p));
00242       p = q+sep.size();
00243       q = (mcsep? ws.find(sep,p) : ws.find_first_of(sep,p));
00244     }
00245     // piece remaining after last separator, if any.
00246     if (not ws.empty()) ls.insert(ls.end(),ws.substr(p,ws.size()-p));
00247     return(ls);    
00248   }
00249 
00253 
00254   template<class C>
00255     inline C util::wstring_to(const std::wstring &ws) {
00256     long double x;
00257     std::wistringstream ss; ss.str(ws); 
00258     ss>>x;
00259     // if original wstring hasn't been fully emptied return default value
00260     std::wstring r;
00261     if (ss>>r) x= -99999;
00262     return static_cast<C>(x);
00263   }
00264 
00265 
00269 
00270   template<>
00271     inline std::string util::wstring_to(const std::wstring &ws) {
00272     std::string s;
00273     if (sizeof(std::wstring::value_type)==2) 
00274       utf8::utf16to8(ws.begin(), ws.end(), back_inserter(s));
00275     else if (sizeof(std::wstring::value_type)==4) 
00276       utf8::utf32to8(ws.begin(), ws.end(), back_inserter(s));
00277     else 
00278       WARNING(L"Unexpected wchar size "+wstring_from<int>(sizeof(std::wstring::value_type)));
00279 
00280     return s;
00281   }
00282 
00283 
00288 
00289   template<class P1,class P2> 
00290     inline std::wstring util::pairlist2wstring(const std::list<std::pair<P1,P2> > &ls, const std::wstring &sep_pair, const std::wstring &sep_list) {
00291     // if nothing to convert, we are done
00292     if (ls.empty()) return L"";  
00293     // print first element to output
00294     typename std::list<std::pair<P1,P2> >::const_iterator i=ls.begin();
00295     std::wstringstream ss;  ss << i->first << sep_pair << i->second;
00296     // concatenate elements in list<pair>
00297     while (++i!=ls.end()) ss << sep_list << i->first << sep_pair << i->second;
00298     // return resulting string
00299     return(ss.str());
00300   }
00301 
00305 
00306   template<class P1,class P2> 
00307     inline std::list<std::pair<P1,P2> > util::wstring2pairlist(const std::wstring &s, const std::wstring &sep_pair, const std::wstring &sep_list) {
00308     // split string at sep_list
00309     std::list<std::wstring> ls = util::wstring_to<std::list<std::wstring> >(s,sep_list);
00310     // split each pair in ls at sep_pair, and store to lps
00311     std::list<std::pair<P1,P2> > lps;
00312     P1 elem1;
00313     P2 elem2;
00314     for (std::list<std::wstring>::const_iterator i=ls.begin(); i!=ls.end(); i++) {
00315       std::wstring::size_type p = i->find(sep_pair);
00316       std::wstringstream ss1(i->substr(0,p)); ss1 >> elem1;
00317       std::wstringstream ss2(i->substr(p+1)); ss2 >> elem2;
00318       lps.push_back(make_pair(elem1,elem2));
00319     }
00320 
00321     return(lps);
00322   }
00323 
00324 
00328 
00329   template<class T1,class T2> inline bool util::ascending_first(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) {
00330     return (p1.first<p2.first or (p1.first==p2.first and p1.second<p2.second));
00331   }
00332 
00336 
00337   template<class T1,class T2> inline bool util::ascending_second(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) {
00338     return (p1.second<p2.second or (p1.second==p2.second and p1.first<p2.first));
00339   }
00340 
00344 
00345   template<class T1,class T2> inline bool util::descending_first(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) {
00346     return (p1.first>p2.first or (p1.first==p2.first and p1.second>p2.second));
00347   }
00348 
00349 
00353 
00354   template<class T1,class T2> inline bool util::descending_second(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) {
00355     return (p1.second>p2.second or (p1.second==p2.second and p1.first>p2.first));
00356   }
00357 
00358 
00359 
00362 
// Convenience macros: conversion names of the form "<from>2<to>" mapped onto
// the generic wstring_to / wstring_from templates defined above.

// Split a wstring x at separator y into a vector, list or set of wstrings.
#define wstring2vector(x,y) wstring_to<std::vector<std::wstring> >(x,y)
#define wstring2list(x,y) wstring_to<std::list<std::wstring> >(x,y)
#define wstring2set(x,y) wstring_to<std::set<std::wstring> >(x,y)

// Convert a wstring x to a UTF-8 std::string or to a number.
// Note: the three numeric names below are also declared as static members of
// class util; after this point the macros take over any use of those names.
#define wstring2string(x) wstring_to<std::string>(x)
#define wstring2int(x) wstring_to<int>(x) 
#define wstring2double(x) wstring_to<double>(x) 
#define wstring2longdouble(x) wstring_to<long double>(x) 

// Join a container x with separator y, or convert a single value x, into a wstring.
#define vector2wstring(x,y) wstring_from(x,y)
#define list2wstring(x,y) wstring_from(x,y)
#define set2wstring(x,y) wstring_from(x,y)
#define string2wstring(x) wstring_from(x)
#define int2wstring(x) wstring_from(x)
#define double2wstring(x) wstring_from(x)
#define longdouble2wstring(x) wstring_from(x)

// Fixes both pair member types of the wstring2pairlist template to std::wstring.
#define wstring2pairlist(x,y,z) wstring2pairlist<std::wstring,std::wstring>(x,y,z)


// The trace identification macros defined at the top of the namespace are
// removed again so they do not leak into files including this header.
#undef MOD_TRACENAME
#undef MOD_TRACECODE
00385 
00386 } //namespace
00387 
00388 #endif