00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00035
00036
00037 #include "OW_String.hpp"
00038 #include "OW_Array.hpp"
00039 #include "OW_StringStream.hpp"
00040 #include "OW_UTF8Utils.hpp"
00041 #include <fstream>
00042 #include <iostream>
00043 #include <map>
00044
00045 using namespace std;
00046 using namespace OpenWBEM;
00047
00048 map<UInt32, UInt32> upperMap;
00049 map<UInt32, UInt32> lowerMap;
00050
00051 struct processLine
00052 {
00053 void operator()(const String& s) const
00054 {
00055 if (s.empty() || !isxdigit(s[0]))
00056 {
00057 cout << "skipping line\n" << s << '\n';
00058 return;
00059 }
00060
00061 StringArray a = s.tokenize(";", String::E_DISCARD_DELIMITERS, String::E_RETURN_EMPTY_TOKENS);
00062 assert(a.size() >= 14);
00063 UInt32 c1 = a[0].toUInt32(16);
00064 if (a[13] != "")
00065 {
00066 lowerMap[c1] = a[13].toUInt32(16);
00067 }
00068 if (a[12] != "")
00069 {
00070 upperMap[c1] = a[12].toUInt32(16);
00071 }
00072 }
00073 };
00074
/**
 * Returns how many bytes the UTF-8 encoding of a code point occupies.
 *
 * @param ucs4char The code point to measure.
 * @return 1 for values below 0x80, 2 below 0x800, 3 below 0x10000 and
 *         4 for everything else (no upper bound is checked).
 */
int utf8len(UInt32 ucs4char)
{
	// Test the widest encodings first so every check is a simple guard.
	if (ucs4char >= 0x10000u)
	{
		return 4;
	}
	if (ucs4char >= 0x800u)
	{
		return 3;
	}
	if (ucs4char >= 0x80u)
	{
		return 2;
	}
	return 1;
}
00094
00095
00096 int main(int argc, char** argv)
00097 {
00098 ifstream in("UnicodeData.txt");
00099 if (!in)
00100 {
00101 cerr << "could not open UnicodeData.txt" << endl;
00102 return 1;
00103 }
00104
00105
00106 OStringStream ss;
00107 ss << in.rdbuf();
00108 String s = ss.toString();
00109 StringArray sa = s.tokenize("\n");
00110 for_each(sa.begin(), sa.end(), processLine());
00111 cout <<
00112 "struct CaseMapping\n"
00113 "{\n"
00114 "\tUInt32 codePoint;\n"
00115 "\tUInt32 mapping;\n"
00116 "};\n";
00117 cout <<
00118 "const CaseMapping lowerMappings[] =\n"
00119 "{\n";
00120 for (map<UInt32, UInt32>::const_iterator i = lowerMap.begin(); i != lowerMap.end(); ++i)
00121 {
00122 cout << hex << "\t{0x" << i->first << ", 0x" << i->second << "},";
00123 if (utf8len(i->first) < utf8len(i->second))
00124 {
00125
00126 cout << " // increasing utf8 length";
00127 }
00128 else if (utf8len(i->first) > utf8len(i->second))
00129 {
00130
00131 cout << " // decreasing utf8 length";
00132 }
00133 cout << "\n";
00134 }
00135 cout << "};\n\n";
00136 cout <<
00137 "const CaseMapping upperMappings[] =\n"
00138 "{\n";
00139 for (map<UInt32, UInt32>::const_iterator i = upperMap.begin(); i != upperMap.end(); ++i)
00140 {
00141 cout << hex << "\t{0x" << i->first << ", 0x" << i->second << "},";
00142 if (utf8len(i->first) < utf8len(i->second))
00143 {
00144
00145 cout << " // increasing utf8 length";
00146 }
00147 else if (utf8len(i->first) > utf8len(i->second))
00148 {
00149
00150 cout << " // decreasing utf8 length";
00151 }
00152 cout << "\n";
00153 }
00154 cout << "};\n";
00155 }
00156