00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00035
00036
00037 #include "OW_String.hpp"
00038 #include "OW_Array.hpp"
00039 #include "OW_StringStream.hpp"
00040 #include "OW_UTF8Utils.hpp"
00041 #include <fstream>
00042 #include <cctype>
00043 #include <algorithm>
00044 #include <vector>
00045 #include <iostream>
00046 #include <cassert>
00047 #include <map>
00048 #include <set>
00049
00050 using namespace std;
00051 using namespace OpenWBEM;
00052
00053 #if 1
00054 #define DEBUG(x) cout << x
00055 #else
00056 #define DEBUG(x)
00057 #endif
00058
00059 class StateMachine
00060 {
00061 public:
00062 static const int start = 0;
00063 static const int invalid = -1;
00064
00065 StateMachine()
00066 {
00067
00068 m_states.resize(1);
00069 }
00070 int getTransition(int state, UInt8 input) const
00071 {
00072 assert(m_states.size() > state);
00073 return m_states[state].transitions[input].toState;
00074 }
00075
00076 int getStateStr(int state, UInt8 input) const
00077 {
00078 assert(m_states.size() > state);
00079 return m_states[state].transitions[input].inputSelection;
00080 }
00081
00082 int addState()
00083 {
00084 m_states.resize(m_states.size() + 1);
00085 int state = m_states.size() - 1;
00086 m_states[state].acceptState = false;
00087 return state;
00088 }
00089
00090 void setStateAccept(int state)
00091 {
00092 assert(m_states.size() > state);
00093 m_states[state].acceptState = true;
00094 }
00095
00096
00097
00098 void addTransition(int transitionsState, UInt8 input, int nextState, int inputSelection)
00099 {
00100 assert(m_states.size() > transitionsState);
00101 assert(m_states.size() > nextState);
00102 assert(inputSelection == 1 || inputSelection == 2);
00103 assert(m_states[transitionsState].transitions[input].toState == invalid);
00104 assert(m_states[transitionsState].transitions[input].inputSelection == -1);
00105 m_states[transitionsState].transitions[input].toState = nextState;
00106 m_states[transitionsState].transitions[input].inputSelection = inputSelection;
00107 }
00108
00109 void debug() const
00110 {
00111 DEBUG("# states = " << m_states.size() << '\n');
00112 }
00113
00114 void removeDuplicateState(int state1, int state2)
00115 {
00116 assert(state1 < state2);
00117 m_states.erase(m_states.begin() + state2);
00118 for (int i = 0; i < m_states.size(); ++i)
00119 {
00120 for (int j = 0; j < m_states[i].transitions.size(); ++j)
00121 {
00122 if (m_states[i].transitions[j].toState == state2)
00123 {
00124 m_states[i].transitions[j].toState = state1;
00125 }
00126 else if (m_states[i].transitions[j].toState > state2)
00127 {
00128
00129 --m_states[i].transitions[j].toState;
00130 }
00131 }
00132 }
00133 }
00134
00135
00136 struct transition_t
00137 {
00138 transition_t()
00139 : inputSelection(-1)
00140 , toState(invalid)
00141 {
00142 }
00143 int inputSelection;
00144 int toState;
00145
00146 friend bool operator==(const transition_t& x, const transition_t& y)
00147 {
00148 return (x.inputSelection == y.inputSelection) && (x.toState == y.toState);
00149 }
00150 };
00151
00152 struct state_t
00153 {
00154 state_t()
00155 : transitions()
00156 , acceptState(false)
00157 {
00158 transitions.resize(256);
00159 }
00160 vector<transition_t> transitions;
00161 bool acceptState;
00162
00163 friend bool operator==(const state_t& x, const state_t& y)
00164 {
00165 return (x.transitions == y.transitions) && (x.acceptState == y.acceptState);
00166 }
00167 };
00168 vector<state_t> m_states;
00169 };
00170
00171 StateMachine stateMachine;
00172 typedef std::multimap<String, String> mmap_t;
00173 typedef mmap_t::const_iterator ci_t;
00174 std::multimap<String, String> caseFoldingEntries;
00175
00176 int followOrAddTransition(int curTransition, UInt8 input, int aux)
00177 {
00178 int nextTransition = stateMachine.getTransition(curTransition, input);
00179 if (nextTransition == StateMachine::invalid)
00180 {
00181
00182 int newState = stateMachine.addState();
00183 DEBUG("added new state " << newState);
00184
00185 stateMachine.addTransition(curTransition, input, newState, aux);
00186 DEBUG(" and transition from " << curTransition << " to " << newState << " on input " << aux << " = \\x" << int(input) << endl);
00187 nextTransition = newState;
00188 }
00189 else
00190 {
00191 if (stateMachine.getStateStr(curTransition, input) == -1)
00192 {
00193 DEBUG("?? stateMachine.getStateStr(curTransition, input) == -1 ??" << endl);
00194
00195 }
00196 DEBUG("transition already exists. curTransition = " << curTransition << " aux = " << aux << " getStateStr = " << stateMachine.getStateStr(curTransition, input) << endl);
00197
00198 assert(stateMachine.getStateStr(curTransition, input) == aux);
00199 }
00200 return nextTransition;
00201 }
00202
00203 void printStrings(const String& str1, const String& str2)
00204 {
00205 DEBUG("buildTransitions: str1 = \"");
00206 for (int i = 0; i < str1.length(); ++i)
00207 {
00208 DEBUG( hex << "\\x" << (int)(unsigned char)str1[i]);
00209 }
00210 DEBUG("\" str2 = \"");
00211 for (int i = 0; i < str2.length(); ++i)
00212 {
00213 DEBUG( hex << "\\x" << (int)(unsigned char)str2[i]);
00214 }
00215 DEBUG( "\"\n");
00216 }
00217
00218
00219 void buildTransitions(const String& str1, const String& str2)
00220 {
00221 printStrings(str1, str2);
00222
00223
00224 int trans = StateMachine::start;
00225 int pos1 = 0;
00226 int pos2 = 0;
00227 while (pos1 < str1.length() || pos2 < str2.length())
00228 {
00229 if (str1[pos1])
00230 trans = followOrAddTransition(trans, str1[pos1++], 1);
00231
00232 if (str2[pos2])
00233 trans = followOrAddTransition(trans, str2[pos2++], 2);
00234 }
00235 DEBUG( "setStateAccept(" << trans << ")\n");
00236 stateMachine.setStateAccept(trans);
00237 }
00238
00239 struct processLine
00240 {
00241 void operator()(const String& s) const
00242 {
00243 if (s.empty() || !isxdigit(s[0]))
00244 return;
00245
00246 DEBUG("processLine: " << s << endl);
00247 StringArray a = s.tokenize(";");
00248 assert(a.size() >= 3);
00249 UInt32 c1 = a[0].toUInt32(16);
00250 StringArray a2 = a[2].tokenize(" ");
00251 Array<UInt32> c2chars(a2.size());
00252 for (size_t i = 0; i < a2.size(); ++i)
00253 {
00254 c2chars[i] = a2[i].toUInt32(16);
00255 }
00256 String str1 = UTF8Utils::UCS4toUTF8(c1);
00257 String str2;
00258 for (size_t i = 0; i < c2chars.size(); ++i)
00259 {
00260 str2 += UTF8Utils::UCS4toUTF8(c2chars[i]);
00261 }
00262
00263 caseFoldingEntries.insert(std::make_pair(str1, str2));
00264 caseFoldingEntries.insert(std::make_pair(str2, str1));
00265
00266
00267 }
00268 };
00269
00270 struct isForInput : public unary_function<StateMachine::transition_t, bool>
00271 {
00272 isForInput(int input) : m_input(input) {}
00273 int m_input;
00274 bool operator()(const StateMachine::transition_t& t)
00275 {
00276 return t.inputSelection == m_input;
00277 }
00278 };
00279
00280 void outputHeader()
00281 {
00282 cout << "/*******************************************************************************\n";
00283 cout << "* Copyright (C) 2003-2004 Vintela, Inc All rights reserved.\n";
00284 cout << "*\n";
00285 cout << "* Redistribution and use in source and binary forms, with or without\n";
00286 cout << "* modification, are permitted provided that the following conditions are met:\n";
00287 cout << "*\n";
00288 cout << "* - Redistributions of source code must retain the above copyright notice,\n";
00289 cout << "* this list of conditions and the following disclaimer.\n";
00290 cout << "*\n";
00291 cout << "* - Redistributions in binary form must reproduce the above copyright notice,\n";
00292 cout << "* this list of conditions and the following disclaimer in the documentation\n";
00293 cout << "* and/or other materials provided with the distribution.\n";
00294 cout << "*\n";
00295 cout << "* - Neither the name of Vintela, Inc nor the names of its\n";
00296 cout << "* contributors may be used to endorse or promote products derived from this\n";
00297 cout << "* software without specific prior written permission.\n";
00298 cout << "*\n";
00299 cout << "* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''\n";
00300 cout << "* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n";
00301 cout << "* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n";
00302 cout << "* ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc OR THE CONTRIBUTORS\n";
00303 cout << "* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\n";
00304 cout << "* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n";
00305 cout << "* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n";
00306 cout << "* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n";
00307 cout << "* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n";
00308 cout << "* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n";
00309 cout << "* POSSIBILITY OF SUCH DAMAGE.\n";
00310 cout << "*******************************************************************************/\n";
00311 cout << "\n// Do NOT modify this file. It was generated by OW_GenCaseFoldingCompare.cpp\n";
00312 cout << "// If this file needs to be modified, change the generator and regenerate it.\n";
00313 }
00314
00315 void outputTransitions(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00316 {
00317 for (int j = 0; j < state.transitions.size(); ++j)
00318 {
00319 if (state.transitions[j].toState != -1 && state.transitions[j].inputSelection == inputSelection)
00320 {
00321 cout << "\t\tcase 0x" << hex << j << ": goto state" << state.transitions[j].toState << ";\n";
00322 }
00323 }
00324 if (outputDefault)
00325 {
00326 cout << "\t\tdefault: goto no_match;\n";
00327 }
00328 cout << "\t}\n";
00329 }
00330
00331 void outputFirstState(const StateMachine::state_t& state)
00332 {
00333 cout << "\tswitch (*(str1++)){\n";
00334
00335 cout << "\t\tcase 0x0: goto zero;\n";
00336 outputTransitions(state, 1, true);
00337 }
00338
00339 void outputSwitch(const StateMachine::state_t& state, int inputSelection, bool outputDefault)
00340 {
00341 cout << "\tswitch (*(str" << inputSelection << "++)){\n";
00342 outputTransitions(state, inputSelection, outputDefault);
00343 }
00344
00345 void outputCode()
00346 {
00347 outputHeader();
00348
00349 cout << "#include \"OW_config.h\"\n";
00350 cout << "#include \"OW_UTF8Utils.hpp\"\n";
00351 cout << "\nnamespace OpenWBEM\n{\n";
00352 cout << "namespace UTF8Utils\n{\n";
00353 cout << "\n/////////////////////////////////////////////////////////////////////////////\n";
00354
00355 cout << "int compareToIgnoreCase(const char* cstr1, const char* cstr2)\n";
00356 cout << "{\n";
00357 cout << "\tconst unsigned char* str1 = reinterpret_cast<const unsigned char*>(cstr1);\n";
00358 cout << "\tconst unsigned char* str2 = reinterpret_cast<const unsigned char*>(cstr2);\n";
00359 cout << "\tconst unsigned char* str1marker = 0;\n";
00360 cout << "\tconst unsigned char* str2marker = 0;\n";
00361 cout << "\tgoto state0;\n";
00362 cout << "no_match:\n";
00363 cout << "\tif (str1marker) {\n";
00364 cout << "\t\tstr1 = str1marker; str1marker = 0;\n";
00365 cout << "\t\tstr2 = str2marker; str2marker = 0;\n";
00366 cout << "\t\tgoto state0;\n";
00367 cout << "\t}\n";
00368 cout << "\treturn *(str1-1) - *(str2-1);\n";
00369 cout << "zero:\n";
00370 cout << "\treturn 0 - *str2;\n";
00371
00372 for (int i = 0; i < stateMachine.m_states.size(); ++i)
00373 {
00374
00375 cout << "state" << i << ":\n";
00376 int c1 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(1));
00377 int c2 = count_if (stateMachine.m_states[i].transitions.begin(), stateMachine.m_states[i].transitions.end(), isForInput(2));
00378
00379 if (i == 0)
00380 {
00381 outputFirstState(stateMachine.m_states[0]);
00382 }
00383 else
00384 {
00385
00386 if ((c1 || c2) && stateMachine.m_states[i].acceptState)
00387 {
00388 cout << "\tstr1marker = str1;\n";
00389 cout << "\tstr2marker = str2;\n";
00390 }
00391 if (c1)
00392 {
00393 outputSwitch(stateMachine.m_states[i], 1, c2 == 0);
00394 }
00395
00396 if (c1 && c2)
00397 {
00398 cout << "\t--str1;\n";
00399 }
00400 if (c2)
00401 {
00402 outputSwitch(stateMachine.m_states[i], 2, true);
00403 }
00404 if (c1 == 0 && c2 == 0)
00405 {
00406 if (stateMachine.m_states[i].acceptState)
00407 {
00408 cout << "\tgoto state0;\n";
00409 }
00410 else
00411 {
00412 cout << "\tgoto no_match;\n";
00413 }
00414 }
00415 }
00416
00417
00418 }
00419 cout << "}\n\n";
00420 cout << "} // end namespace UTF8Utils\n";
00421 cout << "} // end namespace OW_NAMESPACE\n\n";
00422 }
00423
00424 bool findDuplicateStates(int& state1, int& state2)
00425 {
00426 for (int i = stateMachine.m_states.size() - 2; i > -1; --i)
00427 {
00428 for (int j = stateMachine.m_states.size() -1; j > i; --j)
00429 {
00430 if (stateMachine.m_states[i] == stateMachine.m_states[j])
00431 {
00432 state1 = i;
00433 state2 = j;
00434 DEBUG("found duplicate states: " << i << " and " << j << endl);
00435 return true;
00436 }
00437 }
00438 }
00439 return false;
00440 }
00441
00442 void minimizeStateMachine()
00443 {
00444
00445
00446 DEBUG("minimizing state machine\n");
00447 int state1 = StateMachine::invalid;
00448 int state2 = StateMachine::invalid;
00449 while (findDuplicateStates(state1, state2))
00450 {
00451 stateMachine.removeDuplicateState(state1, state2);
00452 }
00453 }
00454
00455 void getEntriesFor(const String& key, set<String>& rval)
00456 {
00457 for (ci_t ci = caseFoldingEntries.lower_bound(key);
00458 ci->first == key;
00459 ++ci)
00460 {
00461 if (rval.find(ci->second) == rval.end())
00462 {
00463 rval.insert(ci->second);
00464 getEntriesFor(ci->second, rval);
00465 }
00466 }
00467 }
00468
00469 bool haveEntry(const String& key, const String& val)
00470 {
00471 for (ci_t ci = caseFoldingEntries.lower_bound(key);
00472 ci->first == key;
00473 ++ci)
00474 {
00475 if (ci->second == val)
00476 return true;
00477 }
00478 return false;
00479 }
00480
00481 void calculateTransitiveClosure()
00482 {
00483 DEBUG("calculateTransitiveClosure\n");
00484 start_over:
00485 for (ci_t ci = caseFoldingEntries.begin();
00486 ci != caseFoldingEntries.end();
00487 ++ci)
00488 {
00489 set<String> newEntries;
00490 getEntriesFor(ci->second, newEntries);
00491 bool addedAnEntry = false;
00492 String key = ci->first;
00493 for (set<String>::const_iterator curEntry = newEntries.begin(); curEntry != newEntries.end(); ++curEntry)
00494 {
00495 if (!haveEntry(key, *curEntry))
00496 {
00497 caseFoldingEntries.insert(std::make_pair(key, *curEntry));
00498 addedAnEntry = true;
00499 }
00500 }
00501
00502 if (addedAnEntry)
00503 goto start_over;
00504 }
00505 }
00506
00507 void buildStateMachine()
00508 {
00509 for (ci_t ci = caseFoldingEntries.begin();
00510 ci != caseFoldingEntries.end();
00511 ++ci)
00512 {
00513 buildTransitions(ci->first, ci->second);
00514 }
00515 }
00516
00517 int main(int argc, char** argv)
00518 {
00519 if (argc != 2)
00520 {
00521 cerr << "must pass filename (to CaseFolding.txt)" << endl;
00522 return 1;
00523 }
00524
00525 ifstream in(argv[1]);
00526 if (!in)
00527 {
00528 cerr << "could not open " << argv[1] << endl;
00529 return 1;
00530 }
00531
00532
00533 for (int i = 1; i < 256; ++i)
00534 {
00535 String s = String(char(i));
00536 caseFoldingEntries.insert(std::make_pair(s, s));
00537
00538 }
00539
00540
00541 OStringStream ss;
00542 ss << in.rdbuf();
00543 String s = ss.toString();
00544 StringArray sa = s.tokenize("\n");
00545 for_each(sa.begin(), sa.end(), processLine());
00546
00547 calculateTransitiveClosure();
00548 buildStateMachine();
00549
00550
00551 minimizeStateMachine();
00552
00553
00554 outputCode();
00555 }
00556