openwbem: OW_GenCaseMappings.cpp Source File

00001 /*******************************************************************************
00002 * Copyright (C) 2003-2005 Vintela, Inc All rights reserved.
00003 *
00004 * Redistribution and use in source and binary forms, with or without
00005 * modification, are permitted provided that the following conditions are met:
00006 *
00007 *  - Redistributions of source code must retain the above copyright notice,
00008 *    this list of conditions and the following disclaimer.
00009 *
00010 *  - Redistributions in binary form must reproduce the above copyright notice,
00011 *    this list of conditions and the following disclaimer in the documentation
00012 *    and/or other materials provided with the distribution.
00013 *
00014 *  - Neither the name of Vintela, Inc nor the names of its
00015 *    contributors may be used to endorse or promote products derived from this
00016 *    software without specific prior written permission.
00017 *
00018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
00019 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 * ARE DISCLAIMED. IN NO EVENT SHALL Vintela, Inc OR THE CONTRIBUTORS
00022 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00028 * POSSIBILITY OF SUCH DAMAGE.
00029 *******************************************************************************/
00030 
00035 // The source of the Unicode data is: http://www.unicode.org/Public/UNIDATA/
00036 
00037 #include "OW_String.hpp"
00038 #include "OW_Array.hpp"
00039 #include "OW_StringStream.hpp"
00040 #include "OW_UTF8Utils.hpp"
00041 #include <fstream>
00042 #include <iostream>
00043 #include <map>
00044 
00045 using namespace std;
00046 using namespace OpenWBEM;
00047 
// Code point -> upper-case mapping, taken from field 12 of each UnicodeData.txt
// record. Filled by processLine, emitted by main as the upperMappings table.
map<UInt32, UInt32> upperMap;
// Code point -> lower-case mapping, taken from field 13 of each UnicodeData.txt
// record. Filled by processLine, emitted by main as the lowerMappings table.
map<UInt32, UInt32> lowerMap;
00050 
00051 struct processLine
00052 {
00053    void operator()(const String& s) const
00054    {
00055       if (s.empty() || !isxdigit(s[0]))
00056       {
00057          cout << "skipping line\n" << s << '\n';
00058          return;
00059       }
00060 
00061       StringArray a = s.tokenize(";", String::E_DISCARD_DELIMITERS, String::E_RETURN_EMPTY_TOKENS); // split up fields
00062       assert(a.size() >= 14);
00063       UInt32 c1 = a[0].toUInt32(16);
00064       if (a[13] != "")
00065       {
00066          lowerMap[c1] = a[13].toUInt32(16);
00067       }
00068       if (a[12] != "")
00069       {
00070          upperMap[c1] = a[12].toUInt32(16);
00071       }
00072    }
00073 };
00074 
00075 int utf8len(UInt32 ucs4char)
00076 {
00077    if (ucs4char < 0x80u)
00078    {
00079       return 1;
00080    }
00081    else if (ucs4char < 0x800u)
00082    {
00083       return 2;
00084    }
00085    else if (ucs4char < 0x10000u)
00086    {
00087       return 3;
00088    }
00089    else
00090    {
00091       return 4;
00092    }
00093 }
00094 
00095 
00096 int main(int argc, char** argv)
00097 {
00098    ifstream in("UnicodeData.txt");
00099    if (!in)
00100    {
00101       cerr << "could not open UnicodeData.txt" << endl;
00102       return 1;
00103    }
00104 
00105    // read in a process the input file
00106    OStringStream ss;
00107    ss << in.rdbuf();
00108    String s = ss.toString();
00109    StringArray sa = s.tokenize("\n");
00110    for_each(sa.begin(), sa.end(), processLine());
00111    cout << 
00112       "struct CaseMapping\n"
00113       "{\n"
00114       "\tUInt32 codePoint;\n"
00115       "\tUInt32 mapping;\n"
00116       "};\n";
00117    cout <<
00118       "const CaseMapping lowerMappings[] =\n"
00119       "{\n";
00120    for (map<UInt32, UInt32>::const_iterator i = lowerMap.begin(); i != lowerMap.end(); ++i)
00121    {
00122       cout << hex << "\t{0x" << i->first << ", 0x" << i->second << "},";
00123       if (utf8len(i->first) < utf8len(i->second))
00124       {
00125          // do this to see if there are any utf8 sequences that would grow when lower-casing them.
00126          cout << " // increasing utf8 length";
00127       }
00128       else if (utf8len(i->first) > utf8len(i->second))
00129       {
00130          // do this to see if there are any utf8 sequences that would grow when lower-casing them.
00131          cout << " // decreasing utf8 length";
00132       }
00133       cout << "\n";
00134    }
00135    cout << "};\n\n";
00136    cout << 
00137       "const CaseMapping upperMappings[] =\n"
00138       "{\n";
00139    for (map<UInt32, UInt32>::const_iterator i = upperMap.begin(); i != upperMap.end(); ++i)
00140    {
00141       cout << hex << "\t{0x" << i->first << ", 0x" << i->second << "},";
00142       if (utf8len(i->first) < utf8len(i->second))
00143       {
00144          // do this to see if there are any utf8 sequences that would grow when lower-casing them.
00145          cout << " // increasing utf8 length";
00146       }
00147       else if (utf8len(i->first) > utf8len(i->second))
00148       {
00149          // do this to see if there are any utf8 sequences that would grow when lower-casing them.
00150          cout << " // decreasing utf8 length";
00151       }
00152       cout << "\n";
00153    }
00154    cout << "};\n";
00155 }
00156