00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00035 
00036 
00037 #include "OW_String.hpp"
00038 #include "OW_Array.hpp"
00039 #include "OW_StringStream.hpp"
00040 #include "OW_UTF8Utils.hpp"
00041 #include <fstream>
00042 #include <cctype> 
00043 #include <algorithm>
00044 #include <vector>
00045 #include <iostream>
00046 #include <cassert>
00047 #include <map>
00048 #include <set>
00049 
00050 using namespace std;
00051 using namespace OpenWBEM;
00052 
00053 #if 1
00054 #define DEBUG(x) cout << x
00055 #else
00056 #define DEBUG(x)
00057 #endif
00058 
00059 class StateMachine
00060 {
00061 public:
00062    static const int start = 0;
00063    static const int invalid = -1;
00064 
00065    StateMachine()
00066    {
00067       
00068       m_states.resize(1);
00069    }
00070    int getTransition(int state, UInt8 input) const
00071    {
00072       assert(m_states.size() > state);
00073       return m_states[state].transitions[input].toState;
00074    }
00075 
00076    int getStateStr(int state, UInt8 input) const
00077    {
00078       assert(m_states.size() > state);
00079       return m_states[state].transitions[input].inputSelection;
00080    }
00081 
00082    int addState()
00083    {
00084       m_states.resize(m_states.size() + 1);
00085       int state = m_states.size() - 1;
00086       m_states[state].acceptState = false;
00087       return state;
00088    }
00089 
00090    void setStateAccept(int state)
00091    {
00092       assert(m_states.size() > state);
00093       m_states[state].acceptState = true;
00094    }
00095 
00096    
00097    
00098    void addTransition(int transitionsState, UInt8 input, int nextState, int inputSelection)
00099    {
00100       assert(m_states.size() > transitionsState);
00101       assert(m_states.size() > nextState);
00102       assert(inputSelection == 1 || inputSelection == 2);
00103       assert(m_states[transitionsState].transitions[input].toState == invalid);
00104       assert(m_states[transitionsState].transitions[input].inputSelection == -1);
00105       m_states[transitionsState].transitions[input].toState = nextState;
00106       m_states[transitionsState].transitions[input].inputSelection = inputSelection;
00107    }
00108 
00109    void debug() const
00110    {
00111       DEBUG("# states = " << m_states.size() << '\n');
00112    }
00113 
00114    void removeDuplicateState(int state1, int state2)
00115    {
00116       assert(state1 < state2);
00117       m_states.erase(m_states.begin() + state2);
00118       for (int i = 0; i < m_states.size(); ++i)
00119       {
00120          for (int j = 0; j < m_states[i].transitions.size(); ++j)
00121          {
00122             if (m_states[i].transitions[j].toState == state2)
00123             {
00124                m_states[i].transitions[j].toState = state1;
00125             }
00126             else if (m_states[i].transitions[j].toState > state2)
00127             {
00128                
00129                --m_states[i].transitions[j].toState;
00130             }
00131          }
00132       }
00133    }
00134    
00135    
00136    struct transition_t
00137    {
00138       transition_t()
00139       : inputSelection(-1)
00140       , toState(invalid)
00141       {
00142       }
00143       int inputSelection;
00144       int toState;
00145 
00146       friend bool operator==(const transition_t& x, const transition_t& y)
00147       {
00148          return (x.inputSelection == y.inputSelection) && (x.toState == y.toState);
00149       }
00150    };
00151 
00152    struct state_t
00153    {
00154       state_t() 
00155       : transitions()
00156       , acceptState(false)
00157       {
00158          transitions.resize(256);
00159       }
00160       vector<transition_t> transitions;
00161       bool acceptState;
00162 
00163       friend bool operator==(const state_t& x, const state_t& y)
00164       {
00165          return (x.transitions == y.transitions) && (x.acceptState == y.acceptState);
00166       }
00167    };
00168    vector<state_t> m_states;
00169 };
00170 
00171 StateMachine stateMachine;
00172 typedef std::multimap<String, String> mmap_t;
00173 typedef mmap_t::const_iterator ci_t;
00174 std::multimap<String, String> caseFoldingEntries;
00175 
00176 int followOrAddTransition(int curTransition, UInt8 input, int aux)
00177 {
00178    int nextTransition = stateMachine.getTransition(curTransition, input);
00179    if (nextTransition == StateMachine::invalid)
00180    {
00181       
00182       int newState = stateMachine.addState();
00183       DEBUG("added new state " << newState);
00184       
00185       stateMachine.addTransition(curTransition, input, newState, aux);
00186       DEBUG(" and transition from " << curTransition << " to " << newState << " on input " << aux << " = \\x" << int(input) << endl);
00187       nextTransition = newState;
00188    }
00189    else
00190    {
00191       if (stateMachine.getStateStr(curTransition, input) == -1)
00192       {
00193          DEBUG("?? stateMachine.getStateStr(curTransition, input) == -1 ??" << endl);
00194          
00195       }
00196       DEBUG("transition already exists.  curTransition = " << curTransition << " aux = " << aux << " getStateStr = " << stateMachine.getStateStr(curTransition, input) << endl);
00197       
00198       assert(stateMachine.getStateStr(curTransition, input) == aux);
00199    }
00200    return nextTransition;
00201 }
00202 
00203 void printStrings(const String& str1, const String& str2)
00204 {
00205    DEBUG("buildTransitions: str1 = \"");
00206    for (int i = 0; i < str1.length(); ++i)
00207    {
00208       DEBUG( hex << "\\x" << (int)(unsigned char)str1[i]);
00209    }
00210    DEBUG("\" str2 = \"");
00211    for (int i = 0; i < str2.length(); ++i)
00212    {
00213       DEBUG( hex << "\\x" << (int)(unsigned char)str2[i]);
00214    }
00215    DEBUG( "\"\n");
00216 }
00217 
00218 
00219 void buildTransitions(const String& str1, const String& str2)
00220 {
00221    printStrings(str1, str2);
00222 
00223    
00224    int trans = StateMachine::start;
00225    int pos1 = 0;
00226    int pos2 = 0;
00227    while (pos1 < str1.length() || pos2 < str2.length())
00228    {
00229       if (str1[pos1])
00230          trans = followOrAddTransition(trans, str1[pos1++], 1);
00231 
00232       if (str2[pos2])
00233          trans = followOrAddTransition(trans, str2[pos2++], 2);
00234    }
00235    DEBUG( "setStateAccept(" << trans << ")\n");
00236    stateMachine.setStateAccept(trans);
00237 }
00238 
00239 struct processLine
00240 {
00241    void operator()(const String& s) const
00242    {
00243       if (s.empty() || !isxdigit(s[0]))
00244          return;
00245 
00246       DEBUG("processLine: " << s << endl);
00247       StringArray a = s.tokenize(";"); 
00248       assert(a.size() >= 3);
00249       UInt32 c1 = a[0].toUInt32(16);
00250       StringArray a2 = a[2].tokenize(" "); 
00251       Array<UInt32> c2chars(a2.size());
00252       for (size_t i = 0; i < a2.size(); ++i)
00253       {
00254          c2chars[i] = a2[i].toUInt32(16);
00255       }
00256       String str1 = UTF8Utils::UCS4toUTF8(c1);
00257       String str2;
00258       for (size_t i = 0; i < c2chars.size(); ++i)
00259       {
00260          str2 += UTF8Utils::UCS4toUTF8(c2chars[i]);
00261       }
00262 
00263       caseFoldingEntries.insert(std::make_pair(str1, str2));
00264       caseFoldingEntries.insert(std::make_pair(str2, str1));
00265       
00266       
00267    }
00268 };
00269 
00270 struct isForInput : public unary_function<StateMachine::transition_t, bool>
00271 {
00272    isForInput(int input) : m_input(input) {}
00273    int m_input;
00274    bool operator()(const StateMachine::transition_t& t)
00275    {
00276       return t.inputSelection == m_input;
00277    }
00278 };
00279 
00280 void outputHeader()
00281 {
00282    cout << "/*******************************************************************************\n";
00283    cout << "* Copyright (C) 2003-2004 Vintela, Inc All rights reserved.\n";
00284    cout << "*\n";
00285    cout << "* Redistribution and use in source and binary forms, with or without\n";
00286    cout << "* modification, are permitted provided that the following conditions are met:\n";
00287    cout << "*\n";
00288    cout << "*  - Redistributions of source code must retain the above copyright notice,\n";
00289    cout << "*    this list of conditions and the following disclaimer.\n";
00290    cout << "*\n";
00291    cout << "*  - Redistributions in binary form must reproduce the above copyright notice,\n";
00292    cout << "*    this list of conditions and the following disclaimer in the documentation\n";
00293    cout << "*    and/or other materials provided with the distribution.\n";
00294    cout << "*\n";
00295    cout << "*  - Neither the name of Vintela, Inc nor the names of its\n";
00296    cout << "*    contributors may be used to endorse or promote products derived from this\n";
00297    cout << "*    software without specific prior written permission.\n";
00298    cout << "*\n";
00299    cout << "* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''\n";
00300    cout << "* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n";
00301    cout << "* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n";
00302    cout << "* ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc OR THE CONTRIBUTORS\n";
00303    cout << "* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\n";
00304    cout << "* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n";
00305    cout << "* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n";
00306    cout << "* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n";
00307    cout << "* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n";
00308    cout << "* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n";
00309    cout << "* POSSIBILITY OF SUCH DAMAGE.\n";
00310    cout << "*******************************************************************************/\n";
00311    cout << "\n// Do NOT modify this file.  It was generated by OW_GenCaseFoldingCompare.cpp\n";
00312    cout << "// If this file needs to be modified, change the generator and regenerate it.\n";
00313 }
00314 
00315 void outputTransitions(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00316 {
00317    for (int j = 0; j < state.transitions.size(); ++j)
00318    {
00319       if (state.transitions[j].toState != -1 && state.transitions[j].inputSelection == inputSelection)
00320       {
00321          cout <<  "\t\tcase 0x" << hex << j << ": goto state"  << state.transitions[j].toState << ";\n";
00322       }
00323    }
00324    if (outputDefault)
00325    {
00326       cout << "\t\tdefault: goto no_match;\n";
00327    }
00328    cout << "\t}\n";
00329 }
00330 
00331 void outputFirstState(const StateMachine::state_t& state)
00332 {
00333    cout << "\tswitch (*(str1++)){\n";
00334    
00335    cout << "\t\tcase 0x0: goto zero;\n";
00336    outputTransitions(state, 1, true);
00337 }
00338 
00339 void outputSwitch(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00340 {
00341    cout << "\tswitch (*(str" << inputSelection << "++)){\n";
00342    outputTransitions(state, inputSelection, outputDefault);
00343 }
00344 
00345 void outputCode()
00346 {
00347    outputHeader();
00348 
00349    cout << "#include \"OW_config.h\"\n";
00350    cout << "#include \"OW_UTF8Utils.hpp\"\n";
00351    cout << "\nnamespace OpenWBEM\n{\n";
00352    cout << "namespace UTF8Utils\n{\n";
00353    cout << "\n/////////////////////////////////////////////////////////////////////////////\n";
00354 
00355    cout << "int compareToIgnoreCase(const char* cstr1, const char* cstr2)\n";
00356    cout << "{\n";
00357    cout << "\tconst unsigned char* str1 = reinterpret_cast<const unsigned char*>(cstr1);\n";
00358    cout << "\tconst unsigned char* str2 = reinterpret_cast<const unsigned char*>(cstr2);\n";
00359    cout << "\tconst unsigned char* str1marker = 0;\n";
00360    cout << "\tconst unsigned char* str2marker = 0;\n";
00361    cout << "\tgoto state0;\n";
00362    cout << "no_match:\n";
00363    cout << "\tif (str1marker) {\n";
00364    cout << "\t\tstr1 = str1marker; str1marker = 0;\n";
00365    cout << "\t\tstr2 = str2marker; str2marker = 0;\n";
00366    cout << "\t\tgoto state0;\n";
00367    cout << "\t}\n";
00368    cout << "\treturn *(str1-1) - *(str2-1);\n";
00369    cout << "zero:\n";
00370    cout << "\treturn 0 - *str2;\n";
00371 
00372    for (int i = 0; i < stateMachine.m_states.size(); ++i)
00373    {
00374 
00375       cout << "state" << i << ":\n";
00376       int c1 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(1));
00377       int c2 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(2));
00378 
00379       if (i == 0)
00380       {
00381          outputFirstState(stateMachine.m_states[0]);
00382       }
00383       else
00384       {
00385          
00386          if ((c1 || c2) && stateMachine.m_states[i].acceptState) 
00387          {
00388             cout << "\tstr1marker = str1;\n";
00389             cout << "\tstr2marker = str2;\n";
00390          }
00391          if (c1)
00392          {
00393             outputSwitch(stateMachine.m_states[i], 1, c2 == 0);
00394          }
00395          
00396          if (c1 && c2)
00397          {
00398             cout << "\t--str1;\n";
00399          }
00400          if (c2)
00401          {
00402             outputSwitch(stateMachine.m_states[i], 2, true); 
00403          }
00404          if (c1 == 0 && c2 == 0)
00405          {
00406             if (stateMachine.m_states[i].acceptState)
00407             {
00408                cout << "\tgoto state0;\n";
00409             }
00410             else
00411             {
00412                cout << "\tgoto no_match;\n";
00413             }
00414          }
00415       }
00416 
00417 
00418    }
00419    cout << "}\n\n";
00420    cout << "} // end namespace UTF8Utils\n";
00421    cout << "} // end namespace OW_NAMESPACE\n\n";
00422 }
00423 
00424 bool findDuplicateStates(int& state1, int& state2)
00425 {
00426    for (int i = stateMachine.m_states.size() - 2; i > -1; --i)
00427    {
00428       for (int j = stateMachine.m_states.size() -1; j > i; --j)
00429       {
00430          if (stateMachine.m_states[i] == stateMachine.m_states[j])
00431          {
00432             state1 = i;
00433             state2 = j;
00434             DEBUG("found duplicate states: " << i << " and " << j << endl);
00435             return true;
00436          }
00437       }
00438    }
00439    return false;
00440 }
00441 
00442 void minimizeStateMachine()
00443 {
00444    
00445    
00446    DEBUG("minimizing state machine\n");
00447    int state1 = StateMachine::invalid;
00448    int state2 = StateMachine::invalid;
00449    while (findDuplicateStates(state1, state2))
00450    {
00451       stateMachine.removeDuplicateState(state1, state2);
00452    }
00453 }
00454 
00455 void getEntriesFor(const String& key, set<String>& rval)
00456 {
00457    for (ci_t ci = caseFoldingEntries.lower_bound(key);
00458       ci->first == key;
00459       ++ci)
00460    {
00461       if (rval.find(ci->second) == rval.end())
00462       {
00463          rval.insert(ci->second);
00464          getEntriesFor(ci->second, rval);
00465       }
00466    }
00467 }
00468 
00469 bool haveEntry(const String& key, const String& val)
00470 {
00471    for (ci_t ci = caseFoldingEntries.lower_bound(key);
00472       ci->first == key;
00473       ++ci)
00474    {
00475       if (ci->second == val)
00476          return true;
00477    }
00478    return false;
00479 }
00480 
00481 void calculateTransitiveClosure()
00482 {
00483    DEBUG("calculateTransitiveClosure\n");
00484 start_over:
00485    for (ci_t ci = caseFoldingEntries.begin();
00486       ci != caseFoldingEntries.end();
00487       ++ci)
00488    {
00489       set<String> newEntries;
00490       getEntriesFor(ci->second, newEntries);
00491       bool addedAnEntry = false;
00492       String key = ci->first;  
00493       for (set<String>::const_iterator curEntry = newEntries.begin(); curEntry != newEntries.end(); ++curEntry)
00494       {
00495          if (!haveEntry(key, *curEntry))
00496          {
00497             caseFoldingEntries.insert(std::make_pair(key, *curEntry));
00498             addedAnEntry = true;
00499          }
00500       }
00501 
00502       if (addedAnEntry)
00503          goto start_over; 
00504    }
00505 }
00506 
00507 void buildStateMachine()
00508 {
00509    for (ci_t ci = caseFoldingEntries.begin();
00510       ci != caseFoldingEntries.end();
00511       ++ci)
00512    {
00513       buildTransitions(ci->first, ci->second);
00514    }
00515 }
00516 
00517 int main(int argc, char** argv)
00518 {
00519    if (argc != 2)
00520    {
00521       cerr << "must pass filename (to CaseFolding.txt)" << endl;
00522       return 1;
00523    }
00524 
00525    ifstream in(argv[1]);
00526    if (!in)
00527    {
00528       cerr << "could not open " << argv[1] << endl;
00529       return 1;
00530    }
00531 
00532    
00533    for (int i = 1; i < 256; ++i)
00534    {
00535       String s = String(char(i));
00536       caseFoldingEntries.insert(std::make_pair(s, s));
00537       
00538    }
00539 
00540    
00541    OStringStream ss;
00542    ss << in.rdbuf();
00543    String s = ss.toString();
00544    StringArray sa = s.tokenize("\n");
00545    for_each(sa.begin(), sa.end(), processLine());
00546 
00547    calculateTransitiveClosure();
00548    buildStateMachine();
00549 
00550    
00551    minimizeStateMachine();
00552 
00553    
00554    outputCode();
00555 }
00556