OW_GenCaseFoldingCompare.cpp

Go to the documentation of this file.
00001 /*******************************************************************************
00002 * Copyright (C) 2003-2004 Vintela, Inc All rights reserved.
00003 *
00004 * Redistribution and use in source and binary forms, with or without
00005 * modification, are permitted provided that the following conditions are met:
00006 *
00007 *  - Redistributions of source code must retain the above copyright notice,
00008 *    this list of conditions and the following disclaimer.
00009 *
00010 *  - Redistributions in binary form must reproduce the above copyright notice,
00011 *    this list of conditions and the following disclaimer in the documentation
00012 *    and/or other materials provided with the distribution.
00013 *
00014 *  - Neither the name of Vintela, Inc. nor the names of its
00015 *    contributors may be used to endorse or promote products derived from this
00016 *    software without specific prior written permission.
00017 *
00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
00019 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 * ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc. OR THE CONTRIBUTORS
00022 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00028 * POSSIBILITY OF SUCH DAMAGE.
00029 *******************************************************************************/
00030 
00035 // The source of the Unicode data is: http://www.unicode.org/Public/UNIDATA/
00036 
00037 #include "OW_String.hpp"
00038 #include "OW_Array.hpp"
00039 #include "OW_StringStream.hpp"
00040 #include "OW_UTF8Utils.hpp"
00041 #include <fstream>
00042 #include <cctype> // for isxdigit
00043 #include <algorithm>
00044 #include <vector>
00045 #include <iostream>
00046 #include <cassert>
00047 #include <map>
00048 #include <set>
00049 
00050 using namespace std;
00051 using namespace OpenWBEM;
00052 
00053 #if 1
00054 #define DEBUG(x) cout << x
00055 #else
00056 #define DEBUG(x)
00057 #endif
00058 
00059 class StateMachine
00060 {
00061 public:
00062    static const int start = 0;
00063    static const int invalid = -1;
00064 
00065    StateMachine()
00066    {
00067       // create the start state
00068       m_states.resize(1);
00069    }
00070    int getTransition(int state, UInt8 input) const
00071    {
00072       assert(m_states.size() > state);
00073       return m_states[state].transitions[input].toState;
00074    }
00075 
00076    int getStateStr(int state, UInt8 input) const
00077    {
00078       assert(m_states.size() > state);
00079       return m_states[state].transitions[input].inputSelection;
00080    }
00081 
00082    int addState()
00083    {
00084       m_states.resize(m_states.size() + 1);
00085       int state = m_states.size() - 1;
00086       m_states[state].acceptState = false;
00087       return state;
00088    }
00089 
00090    void setStateAccept(int state)
00091    {
00092       assert(m_states.size() > state);
00093       m_states[state].acceptState = true;
00094    }
00095 
00096    // val.inputSelection == 1, increment the pointer to str1
00097    // val.inputSelection == 2, increment the pointer to str2
00098    void addTransition(int transitionsState, UInt8 input, int nextState, int inputSelection)
00099    {
00100       assert(m_states.size() > transitionsState);
00101       assert(m_states.size() > nextState);
00102       assert(inputSelection == 1 || inputSelection == 2);
00103       assert(m_states[transitionsState].transitions[input].toState == invalid);
00104       assert(m_states[transitionsState].transitions[input].inputSelection == -1);
00105       m_states[transitionsState].transitions[input].toState = nextState;
00106       m_states[transitionsState].transitions[input].inputSelection = inputSelection;
00107    }
00108 
00109    void debug() const
00110    {
00111       DEBUG("# states = " << m_states.size() << '\n');
00112    }
00113 
00114    void removeDuplicateState(int state1, int state2)
00115    {
00116       assert(state1 < state2);
00117       m_states.erase(m_states.begin() + state2);
00118       for (int i = 0; i < m_states.size(); ++i)
00119       {
00120          for (int j = 0; j < m_states[i].transitions.size(); ++j)
00121          {
00122             if (m_states[i].transitions[j].toState == state2)
00123             {
00124                m_states[i].transitions[j].toState = state1;
00125             }
00126             else if (m_states[i].transitions[j].toState > state2)
00127             {
00128                // we removed a state, so we just decrement it by one.
00129                --m_states[i].transitions[j].toState;
00130             }
00131          }
00132       }
00133    }
00134    
00135    // transitions, and whether state applies to str1 or str2
00136    struct transition_t
00137    {
00138       transition_t()
00139       : inputSelection(-1)
00140       , toState(invalid)
00141       {
00142       }
00143       int inputSelection;
00144       int toState;
00145 
00146       friend bool operator==(const transition_t& x, const transition_t& y)
00147       {
00148          return (x.inputSelection == y.inputSelection) && (x.toState == y.toState);
00149       }
00150    };
00151 
00152    struct state_t
00153    {
00154       state_t() 
00155       : transitions()
00156       , acceptState(false)
00157       {
00158          transitions.resize(256);
00159       }
00160       vector<transition_t> transitions;
00161       bool acceptState;
00162 
00163       friend bool operator==(const state_t& x, const state_t& y)
00164       {
00165          return (x.transitions == y.transitions) && (x.acceptState == y.acceptState);
00166       }
00167    };
00168    vector<state_t> m_states;
00169 };
00170 
00171 StateMachine stateMachine;
00172 typedef std::multimap<String, String> mmap_t;
00173 typedef mmap_t::const_iterator ci_t;
00174 std::multimap<String, String> caseFoldingEntries;
00175 
00176 int followOrAddTransition(int curTransition, UInt8 input, int aux)
00177 {
00178    int nextTransition = stateMachine.getTransition(curTransition, input);
00179    if (nextTransition == StateMachine::invalid)
00180    {
00181       // no transition, add one
00182       int newState = stateMachine.addState();
00183       DEBUG("added new state " << newState);
00184       //stateMachine.setInputSelection(curTransition, input, aux);
00185       stateMachine.addTransition(curTransition, input, newState, aux);
00186       DEBUG(" and transition from " << curTransition << " to " << newState << " on input " << aux << " = \\x" << int(input) << endl);
00187       nextTransition = newState;
00188    }
00189    else
00190    {
00191       if (stateMachine.getStateStr(curTransition, input) == -1)
00192       {
00193          DEBUG("?? stateMachine.getStateStr(curTransition, input) == -1 ??" << endl);
00194          //stateMachine.setInputSelection(curTransition, input, aux);
00195       }
00196       DEBUG("transition already exists.  curTransition = " << curTransition << " aux = " << aux << " getStateStr = " << stateMachine.getStateStr(curTransition, input) << endl);
00197       // found a transition
00198       assert(stateMachine.getStateStr(curTransition, input) == aux);
00199    }
00200    return nextTransition;
00201 }
00202 
00203 void printStrings(const String& str1, const String& str2)
00204 {
00205    DEBUG("buildTransitions: str1 = \"");
00206    for (int i = 0; i < str1.length(); ++i)
00207    {
00208       DEBUG( hex << "\\x" << (int)(unsigned char)str1[i]);
00209    }
00210    DEBUG("\" str2 = \"");
00211    for (int i = 0; i < str2.length(); ++i)
00212    {
00213       DEBUG( hex << "\\x" << (int)(unsigned char)str2[i]);
00214    }
00215    DEBUG( "\"\n");
00216 }
00217 
00218 
00219 void buildTransitions(const String& str1, const String& str2)
00220 {
00221    printStrings(str1, str2);
00222 
00223    // do it transitions for str1/str2
00224    int trans = StateMachine::start;
00225    int pos1 = 0;
00226    int pos2 = 0;
00227    while (pos1 < str1.length() || pos2 < str2.length())
00228    {
00229       if (str1[pos1])
00230          trans = followOrAddTransition(trans, str1[pos1++], 1);
00231 
00232       if (str2[pos2])
00233          trans = followOrAddTransition(trans, str2[pos2++], 2);
00234    }
00235    DEBUG( "setStateAccept(" << trans << ")\n");
00236    stateMachine.setStateAccept(trans);
00237 }
00238 
00239 struct processLine
00240 {
00241    void operator()(const String& s) const
00242    {
00243       if (s.empty() || !isxdigit(s[0]))
00244          return;
00245 
00246       DEBUG("processLine: " << s << endl);
00247       StringArray a = s.tokenize(";"); // split up fields
00248       assert(a.size() >= 3);
00249       UInt32 c1 = a[0].toUInt32(16);
00250       StringArray a2 = a[2].tokenize(" "); // split up chars are separated by spaces
00251       Array<UInt32> c2chars(a2.size());
00252       for (size_t i = 0; i < a2.size(); ++i)
00253       {
00254          c2chars[i] = a2[i].toUInt32(16);
00255       }
00256       String str1 = UTF8Utils::UCS4toUTF8(c1);
00257       String str2;
00258       for (size_t i = 0; i < c2chars.size(); ++i)
00259       {
00260          str2 += UTF8Utils::UCS4toUTF8(c2chars[i]);
00261       }
00262 
00263       caseFoldingEntries.insert(std::make_pair(str1, str2));
00264       caseFoldingEntries.insert(std::make_pair(str2, str1));
00265       //buildTransitions(str1, str2);
00266       //buildTransitions(str2, str1);
00267    }
00268 };
00269 
00270 struct isForInput : public unary_function<StateMachine::transition_t, bool>
00271 {
00272    isForInput(int input) : m_input(input) {}
00273    int m_input;
00274    bool operator()(const StateMachine::transition_t& t)
00275    {
00276       return t.inputSelection == m_input;
00277    }
00278 };
00279 
00280 void outputHeader()
00281 {
00282    cout << "/*******************************************************************************\n";
00283    cout << "* Copyright (C) 2003-2004 Vintela, Inc All rights reserved.\n";
00284    cout << "*\n";
00285    cout << "* Redistribution and use in source and binary forms, with or without\n";
00286    cout << "* modification, are permitted provided that the following conditions are met:\n";
00287    cout << "*\n";
00288    cout << "*  - Redistributions of source code must retain the above copyright notice,\n";
00289    cout << "*    this list of conditions and the following disclaimer.\n";
00290    cout << "*\n";
00291    cout << "*  - Redistributions in binary form must reproduce the above copyright notice,\n";
00292    cout << "*    this list of conditions and the following disclaimer in the documentation\n";
00293    cout << "*    and/or other materials provided with the distribution.\n";
00294    cout << "*\n";
00295    cout << "*  - Neither the name of Vintela, Inc nor the names of its\n";
00296    cout << "*    contributors may be used to endorse or promote products derived from this\n";
00297    cout << "*    software without specific prior written permission.\n";
00298    cout << "*\n";
00299    cout << "* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''\n";
00300    cout << "* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n";
00301    cout << "* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n";
00302    cout << "* ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc OR THE CONTRIBUTORS\n";
00303    cout << "* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\n";
00304    cout << "* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n";
00305    cout << "* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n";
00306    cout << "* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n";
00307    cout << "* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n";
00308    cout << "* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n";
00309    cout << "* POSSIBILITY OF SUCH DAMAGE.\n";
00310    cout << "*******************************************************************************/\n";
00311    cout << "\n// Do NOT modify this file.  It was generated by OW_GenCaseFoldingCompare.cpp\n";
00312    cout << "// If this file needs to be modified, change the generator and regenerate it.\n";
00313 }
00314 
00315 void outputTransitions(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00316 {
00317    for (int j = 0; j < state.transitions.size(); ++j)
00318    {
00319       if (state.transitions[j].toState != -1 && state.transitions[j].inputSelection == inputSelection)
00320       {
00321          cout <<  "\t\tcase 0x" << hex << j << ": goto state"  << state.transitions[j].toState << ";\n";
00322       }
00323    }
00324    if (outputDefault)
00325    {
00326       cout << "\t\tdefault: goto no_match;\n";
00327    }
00328    cout << "\t}\n";
00329 }
00330 
00331 void outputFirstState(const StateMachine::state_t& state)
00332 {
00333    cout << "\tswitch (*(str1++)){\n";
00334    // first state has to handle 0
00335    cout << "\t\tcase 0x0: goto zero;\n";
00336    outputTransitions(state, 1, true);
00337 }
00338 
00339 void outputSwitch(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00340 {
00341    cout << "\tswitch (*(str" << inputSelection << "++)){\n";
00342    outputTransitions(state, inputSelection, outputDefault);
00343 }
00344 
00345 void outputCode()
00346 {
00347    outputHeader();
00348 
00349    cout << "#include \"OW_config.h\"\n";
00350    cout << "#include \"OW_UTF8Utils.hpp\"\n";
00351    cout << "\nnamespace OpenWBEM\n{\n";
00352    cout << "namespace UTF8Utils\n{\n";
00353    cout << "\n/////////////////////////////////////////////////////////////////////////////\n";
00354 
00355    cout << "int compareToIgnoreCase(const char* cstr1, const char* cstr2)\n";
00356    cout << "{\n";
00357    cout << "\tconst unsigned char* str1 = reinterpret_cast<const unsigned char*>(cstr1);\n";
00358    cout << "\tconst unsigned char* str2 = reinterpret_cast<const unsigned char*>(cstr2);\n";
00359    cout << "\tconst unsigned char* str1marker = 0;\n";
00360    cout << "\tconst unsigned char* str2marker = 0;\n";
00361    cout << "\tgoto state0;\n";
00362    cout << "no_match:\n";
00363    cout << "\tif (str1marker) {\n";
00364    cout << "\t\tstr1 = str1marker; str1marker = 0;\n";
00365    cout << "\t\tstr2 = str2marker; str2marker = 0;\n";
00366    cout << "\t\tgoto state0;\n";
00367    cout << "\t}\n";
00368    cout << "\treturn *(str1-1) - *(str2-1);\n";
00369    cout << "zero:\n";
00370    cout << "\treturn 0 - *str2;\n";
00371 
00372    for (int i = 0; i < stateMachine.m_states.size(); ++i)
00373    {
00374 
00375       cout << "state" << i << ":\n";
00376       int c1 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(1));
00377       int c2 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(2));
00378 
00379       if (i == 0)
00380       {
00381          outputFirstState(stateMachine.m_states[0]);
00382       }
00383       else
00384       {
00385          // we're in an accept state with outgoing transitions, we need to save our position
00386          if ((c1 || c2) && stateMachine.m_states[i].acceptState) 
00387          {
00388             cout << "\tstr1marker = str1;\n";
00389             cout << "\tstr2marker = str2;\n";
00390          }
00391          if (c1)
00392          {
00393             outputSwitch(stateMachine.m_states[i], 1, c2 == 0);
00394          }
00395          // need to rewind str1 in the case of 2 switches and the first one doesn't match
00396          if (c1 && c2)
00397          {
00398             cout << "\t--str1;\n";
00399          }
00400          if (c2)
00401          {
00402             outputSwitch(stateMachine.m_states[i], 2, true); // true means ouput the default
00403          }
00404          if (c1 == 0 && c2 == 0)
00405          {
00406             if (stateMachine.m_states[i].acceptState)
00407             {
00408                cout << "\tgoto state0;\n";
00409             }
00410             else
00411             {
00412                cout << "\tgoto no_match;\n";
00413             }
00414          }
00415       }
00416 
00417 
00418    }
00419    cout << "}\n\n";
00420    cout << "} // end namespace UTF8Utils\n";
00421    cout << "} // end namespace OW_NAMESPACE\n\n";
00422 }
00423 
00424 bool findDuplicateStates(int& state1, int& state2)
00425 {
00426    for (int i = stateMachine.m_states.size() - 2; i > -1; --i)
00427    {
00428       for (int j = stateMachine.m_states.size() -1; j > i; --j)
00429       {
00430          if (stateMachine.m_states[i] == stateMachine.m_states[j])
00431          {
00432             state1 = i;
00433             state2 = j;
00434             DEBUG("found duplicate states: " << i << " and " << j << endl);
00435             return true;
00436          }
00437       }
00438    }
00439    return false;
00440 }
00441 
00442 void minimizeStateMachine()
00443 {
00444    // this is a horribly inefficient way of doing this, but it works, was 
00445    // simple to code, and it only needs to be run once.
00446    DEBUG("minimizing state machine\n");
00447    int state1 = StateMachine::invalid;
00448    int state2 = StateMachine::invalid;
00449    while (findDuplicateStates(state1, state2))
00450    {
00451       stateMachine.removeDuplicateState(state1, state2);
00452    }
00453 }
00454 
00455 void getEntriesFor(const String& key, set<String>& rval)
00456 {
00457    for (ci_t ci = caseFoldingEntries.lower_bound(key);
00458       ci->first == key;
00459       ++ci)
00460    {
00461       if (rval.find(ci->second) == rval.end())
00462       {
00463          rval.insert(ci->second);
00464          getEntriesFor(ci->second, rval);
00465       }
00466    }
00467 }
00468 
00469 bool haveEntry(const String& key, const String& val)
00470 {
00471    for (ci_t ci = caseFoldingEntries.lower_bound(key);
00472       ci->first == key;
00473       ++ci)
00474    {
00475       if (ci->second == val)
00476          return true;
00477    }
00478    return false;
00479 }
00480 
00481 void calculateTransitiveClosure()
00482 {
00483    DEBUG("calculateTransitiveClosure\n");
00484 start_over:
00485    for (ci_t ci = caseFoldingEntries.begin();
00486       ci != caseFoldingEntries.end();
00487       ++ci)
00488    {
00489       set<String> newEntries;
00490       getEntriesFor(ci->second, newEntries);
00491       bool addedAnEntry = false;
00492       String key = ci->first;  // make a copy since the iterator may be invalidated after an insert
00493       for (set<String>::const_iterator curEntry = newEntries.begin(); curEntry != newEntries.end(); ++curEntry)
00494       {
00495          if (!haveEntry(key, *curEntry))
00496          {
00497             caseFoldingEntries.insert(std::make_pair(key, *curEntry));
00498             addedAnEntry = true;
00499          }
00500       }
00501 
00502       if (addedAnEntry)
00503          goto start_over; // since the iterators may be invalidated.
00504    }
00505 }
00506 
00507 void buildStateMachine()
00508 {
00509    for (ci_t ci = caseFoldingEntries.begin();
00510       ci != caseFoldingEntries.end();
00511       ++ci)
00512    {
00513       buildTransitions(ci->first, ci->second);
00514    }
00515 }
00516 
00517 int main(int argc, char** argv)
00518 {
00519    if (argc != 2)
00520    {
00521       cerr << "must pass filename (to CaseFolding.txt)" << endl;
00522       return 1;
00523    }
00524 
00525    ifstream in(argv[1]);
00526    if (!in)
00527    {
00528       cerr << "could not open " << argv[1] << endl;
00529       return 1;
00530    }
00531 
00532    // add transitions for equal matches
00533    for (int i = 1; i < 256; ++i)
00534    {
00535       String s = String(char(i));
00536       caseFoldingEntries.insert(std::make_pair(s, s));
00537       //buildTransitions(s, s);
00538    }
00539 
00540    // read in a process the input file
00541    OStringStream ss;
00542    ss << in.rdbuf();
00543    String s = ss.toString();
00544    StringArray sa = s.tokenize("\n");
00545    for_each(sa.begin(), sa.end(), processLine());
00546 
00547    calculateTransitiveClosure();
00548    buildStateMachine();
00549 
00550    // disable duplicate states
00551    minimizeStateMachine();
00552 
00553    // now generate the code:
00554    outputCode();
00555 }
00556 

Generated on Thu Feb 9 08:47:59 2006 for openwbem by  doxygen 1.4.6