// FreeLing 3.1
00001 00002 // 00003 // Fries - Feature Retriever for Intensional Encoding of Sentences 00004 // 00005 // Copyright (C) 2006 TALP Research Center 00006 // Universitat Politecnica de Catalunya 00007 // 00008 // This file is part of the Fries library 00009 // 00010 // The Fries library is free software; you can redistribute it 00011 // and/or modify it under the terms of the GNU General Public 00012 // License as published by the Free Software Foundation; either 00013 // version 3 of the License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, 00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 51 Franklin St, 5th Floor, Boston, MA 02110-1301 USA 00023 // 00024 // contact: Lluis Padro (padro@lsi.upc.es) 00025 // TALP Research Center 00026 // despatx Omega.S112 - Campus Nord UPC 00027 // 08034 Barcelona. 
SPAIN 00028 // 00030 00031 #ifndef _UTIL 00032 #define _UTIL 00033 00034 #include <cstdio> 00035 #include <list> 00036 #include <string> 00037 #include <vector> 00038 #include <set> 00039 00040 #include <locale> 00041 #include <iostream> 00042 #include "freeling/utf8/utf8.h" 00043 00044 #include "freeling/regexp.h" 00045 #include "freeling/windll.h" 00046 #include "freeling/morfo/traces.h" 00047 00048 #ifdef WIN32 00049 #include <windows.h> 00050 #define getpid() GetCurrentProcessId() 00051 #define pid_t DWORD 00052 #define err_type errno_t 00053 #define NEW_TMPNAME(buf,sz) tmpnam_s(buf,sz) 00054 #define TMPNAME_FAILED(x) x 00055 #else 00056 #define err_type char* 00057 #define NEW_TMPNAME(buf,sz) tmpnam(buf) 00058 #define TMPNAME_FAILED(x) (x==NULL) 00059 #endif 00060 00061 00062 // Capitalization patterns 00063 #define UPPER_NONE 0 00064 #define UPPER_1ST 1 00065 #define UPPER_ALL 2 00066 00067 namespace freeling { 00068 00069 #define MOD_TRACENAME L"UTIL" 00070 #define MOD_TRACECODE UTIL_TRACE 00071 00077 00078 class WINDLL util { 00079 00080 public: 00082 static freeling::regexp RE_has_lowercase; // wstring contains lowercase chars 00083 static freeling::regexp RE_has_alphanum; // wstring contains alphanum chars 00084 static freeling::regexp RE_is_capitalized; // wstring is capitalized 00085 static freeling::regexp RE_all_digits; // wstring is all digits 00086 static freeling::regexp RE_all_caps; // wstring is uppercase 00087 static freeling::regexp RE_initial_dot; // wstring is an initial plus optional dot 00088 static freeling::regexp RE_all_caps_dot; // wstring is uppercase plus optional dot 00089 static freeling::regexp RE_capitalized_dot; // wstring is capitalized plus optional dot 00090 static freeling::regexp RE_has_digits; // wstring contains digits 00091 static freeling::regexp RE_lowercase_dot; // wstring is lowercase plus optional dot 00092 00093 static freeling::regexp RE_win_absolute_path; // to detect absolute paths in windows 00094 00096 static 
void init_locale(const std::wstring &s=L"default"); 00098 static void open_utf8_file(std::wifstream &, const std::wstring &); 00100 static void open_utf8_file(std::wofstream &, const std::wstring &); 00102 static std::wstring lowercase(const std::wstring &); 00104 static std::wstring uppercase(const std::wstring &); 00106 static bool is_absolute(const std::string &p); 00108 static bool is_absolute(const std::wstring &p); 00110 static std::string get_current_path(); 00112 static std::string absolute(const std::string &, const std::string &); 00114 static std::wstring absolute(const std::wstring &, const std::wstring &); 00116 static std::string expand_filename(const std::string &); 00118 static std::wstring expand_filename(const std::wstring &); 00120 static std::wstring new_tempfile_name(); 00122 static std::wstring remove_chars(const std::wstring &, const std::wstring &); 00124 static void find_and_replace(std::wstring &, const std::wstring &, const std::wstring &); 00125 00127 static int wstring2int(const std::wstring &); 00128 static double wstring2double(const std::wstring &); 00129 static long double wstring2longdouble(const std::wstring &); 00130 00131 template<class C> static std::wstring wstring_from(const C&, const std::wstring &); 00132 template<class C> static std::wstring wstring_from(const C&); 00133 template<class C> static C wstring_to(const std::wstring &, const std::wstring &, bool mcsep=true); 00134 template<class C> static C wstring_to(const std::wstring &); 00135 00136 template<class P1,class P2> static std::wstring pairlist2wstring(const std::list<std::pair<P1,P2> > &, const std::wstring &, const std::wstring &); 00137 template<class P1,class P2> static std::list<std::pair<P1,P2> > wstring2pairlist(const std::wstring &, const std::wstring &, const std::wstring &); 00138 00139 static int capitalization(const std::wstring &); 00140 static std::wstring capitalize(const std::wstring &, int, bool); 00141 00143 template<class T1,class T2> static bool 
ascending_first(const std::pair<T1,T2> &, const std::pair<T1,T2> &); 00144 template<class T1,class T2> static bool ascending_second(const std::pair<T1,T2> &, const std::pair<T1,T2> &); 00145 template<class T1,class T2> static bool descending_first(const std::pair<T1,T2> &, const std::pair<T1,T2> &); 00146 template<class T1,class T2> static bool descending_second(const std::pair<T1,T2> &, const std::pair<T1,T2> &); 00147 }; 00148 00149 00153 00154 inline std::wstring util::new_tempfile_name() { 00155 char* tempfile = new char[L_tmpnam+1]; 00156 err_type err = NEW_TMPNAME(tempfile,L_tmpnam+1); 00157 if (TMPNAME_FAILED(err)) 00158 ERROR_CRASH(L"Error occurred creating unique filename."); 00159 std::wstring fname = wstring_from(tempfile)+L"-FL-"+wstring_from(getpid()); 00160 delete[] tempfile; 00161 return fname; 00162 } 00163 00167 00168 template<class C> 00169 inline std::wstring util::wstring_from(const C& ls, const std::wstring &sep) { 00170 // if nothing to convert, we are done 00171 if (ls.empty()) return L""; 00172 // print first element to output 00173 typename C::const_iterator i=ls.begin(); 00174 std::wstring sn; sn=(*i); 00175 // print all remaining elements, adding separators 00176 while (++i!=ls.end()) sn += sep+(*i); 00177 // return resulting string 00178 return(sn); 00179 } 00180 00184 00185 template<class C> 00186 inline std::wstring util::wstring_from(const C & x) { 00187 std::wostringstream ss; 00188 ss<<std::fixed<<x; 00189 return ss.str(); 00190 } 00191 00195 00196 template<> 00197 inline std::wstring util::wstring_from(const long double &x) { 00198 std::wostringstream ss; 00199 ss<<std::fixed<<x; 00200 // remove decimal digits if all zeros. 
00201 std::wstring s(ss.str()); 00202 std::wstring::size_type pos = s.find(L'.'); 00203 std::wstring::size_type posLast = s.find_last_not_of(L"0"); 00204 if ((pos != s.npos) && (posLast != s.npos) && (posLast >= pos)) { 00205 if (posLast == pos) s.erase(pos); 00206 else s.erase(posLast+1); 00207 } 00208 return s; 00209 } 00210 00214 00215 template<> 00216 inline std::wstring util::wstring_from(const std::string &s) { 00217 std::wstring ws; 00218 if (sizeof(std::wstring::value_type)==2) 00219 utf8::utf8to16(s.begin(), s.end(), back_inserter(ws)); 00220 else if (sizeof(std::wstring::value_type)==4) 00221 utf8::utf8to32(s.begin(), s.end(), back_inserter(ws)); 00222 else 00223 WARNING(L"Unexpected wchar size "+wstring_from<int>(sizeof(std::wstring::value_type))); 00224 return ws; 00225 } 00226 00227 00233 00234 template<class C> 00235 inline C util::wstring_to(const std::wstring &ws, const std::wstring &sep, bool mcsep) { 00236 C ls; 00237 std::wstring::size_type p,q; 00238 // at each occurence of separator "sep" in string "s", cut and insert at the end of the container 00239 p=0; q = (mcsep? ws.find(sep) : ws.find_first_of(sep)); 00240 while(q!=std::wstring::npos){ 00241 ls.insert(ls.end(),ws.substr(p,q-p)); 00242 p = q+sep.size(); 00243 q = (mcsep? ws.find(sep,p) : ws.find_first_of(sep,p)); 00244 } 00245 // piece remaining after last separator, if any. 
00246 if (not ws.empty()) ls.insert(ls.end(),ws.substr(p,ws.size()-p)); 00247 return(ls); 00248 } 00249 00253 00254 template<class C> 00255 inline C util::wstring_to(const std::wstring &ws) { 00256 long double x; 00257 std::wistringstream ss; ss.str(ws); 00258 ss>>x; 00259 // if original wstring hasn't been fully emptied return default value 00260 std::wstring r; 00261 if (ss>>r) x= -99999; 00262 return static_cast<C>(x); 00263 } 00264 00265 00269 00270 template<> 00271 inline std::string util::wstring_to(const std::wstring &ws) { 00272 std::string s; 00273 if (sizeof(std::wstring::value_type)==2) 00274 utf8::utf16to8(ws.begin(), ws.end(), back_inserter(s)); 00275 else if (sizeof(std::wstring::value_type)==4) 00276 utf8::utf32to8(ws.begin(), ws.end(), back_inserter(s)); 00277 else 00278 WARNING(L"Unexpected wchar size "+wstring_from<int>(sizeof(std::wstring::value_type))); 00279 00280 return s; 00281 } 00282 00283 00288 00289 template<class P1,class P2> 00290 inline std::wstring util::pairlist2wstring(const std::list<std::pair<P1,P2> > &ls, const std::wstring &sep_pair, const std::wstring &sep_list) { 00291 // if nothing to convert, we are done 00292 if (ls.empty()) return L""; 00293 // print first element to output 00294 typename std::list<std::pair<P1,P2> >::const_iterator i=ls.begin(); 00295 std::wstringstream ss; ss << i->first << sep_pair << i->second; 00296 // concatenate elements in list<pair> 00297 while (++i!=ls.end()) ss << sep_list << i->first << sep_pair << i->second; 00298 // return resulting string 00299 return(ss.str()); 00300 } 00301 00305 00306 template<class P1,class P2> 00307 inline std::list<std::pair<P1,P2> > util::wstring2pairlist(const std::wstring &s, const std::wstring &sep_pair, const std::wstring &sep_list) { 00308 // split string at sep_list 00309 std::list<std::wstring> ls = util::wstring_to<std::list<std::wstring> >(s,sep_list); 00310 // split each pair in ls at sep_pair, and store to lps 00311 std::list<std::pair<P1,P2> > lps; 00312 
P1 elem1; 00313 P2 elem2; 00314 for (std::list<std::wstring>::const_iterator i=ls.begin(); i!=ls.end(); i++) { 00315 std::wstring::size_type p = i->find(sep_pair); 00316 std::wstringstream ss1(i->substr(0,p)); ss1 >> elem1; 00317 std::wstringstream ss2(i->substr(p+1)); ss2 >> elem2; 00318 lps.push_back(make_pair(elem1,elem2)); 00319 } 00320 00321 return(lps); 00322 } 00323 00324 00328 00329 template<class T1,class T2> inline bool util::ascending_first(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) { 00330 return (p1.first<p2.first or (p1.first==p2.first and p1.second<p2.second)); 00331 } 00332 00336 00337 template<class T1,class T2> inline bool util::ascending_second(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) { 00338 return (p1.second<p2.second or (p1.second==p2.second and p1.first<p2.first)); 00339 } 00340 00344 00345 template<class T1,class T2> inline bool util::descending_first(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) { 00346 return (p1.first>p2.first or (p1.first==p2.first and p1.second>p2.second)); 00347 } 00348 00349 00353 00354 template<class T1,class T2> inline bool util::descending_second(const std::pair<T1,T2> &p1, const std::pair<T1,T2> &p2) { 00355 return (p1.second>p2.second or (p1.second==p2.second and p1.first>p2.first)); 00356 } 00357 00358 00359 00362 00363 #define wstring2vector(x,y) wstring_to<std::vector<std::wstring> >(x,y) 00364 #define wstring2list(x,y) wstring_to<std::list<std::wstring> >(x,y) 00365 #define wstring2set(x,y) wstring_to<std::set<std::wstring> >(x,y) 00366 00367 #define wstring2string(x) wstring_to<std::string>(x) 00368 #define wstring2int(x) wstring_to<int>(x) 00369 #define wstring2double(x) wstring_to<double>(x) 00370 #define wstring2longdouble(x) wstring_to<long double>(x) 00371 00372 #define vector2wstring(x,y) wstring_from(x,y) 00373 #define list2wstring(x,y) wstring_from(x,y) 00374 #define set2wstring(x,y) wstring_from(x,y) 00375 #define string2wstring(x) wstring_from(x) 00376 #define 
int2wstring(x) wstring_from(x) 00377 #define double2wstring(x) wstring_from(x) 00378 #define longdouble2wstring(x) wstring_from(x) 00379 00380 #define wstring2pairlist(x,y,z) wstring2pairlist<std::wstring,std::wstring>(x,y,z) 00381 00382 00383 #undef MOD_TRACENAME 00384 #undef MOD_TRACECODE 00385 00386 } //namespace 00387 00388 #endif