Modified UTF-8 與 UTF-32 相互轉換
自己的實現,經過一定的測試。
頭文件
1
/*
2
Convert Modified UTF-8 <==> UTF-32.
3
*/
4
5
6
/*
7
function : Convert Modified UTF-8 to UTF-32.
8
input : str_mutf8, a null terminated string in Modified UTF-8.
9
output : str_utf32, a null terminated string in UTF-32.
10
input : str_utf32_limit, the max length(character count)
11
of str_utf32 plus one(for 'null'), str_utf32 must have enough space
12
for str_utf32_limit characters.
13
return : -1 for errors;
14
else the length(character count) of str_utf32,
15
maybe larger than (str_utf32_limit-1) if the space
16
of str_utf32 isn't enough.
17
note : convert 0xc080 to U+0000 字符串未結束
18
convert 0x00 to U+0000 字符串結束
19
*/
20
int mutf8_to_utf32( const unsigned char *str_mutf8,
21
unsigned int *str_utf32, int str_utf32_limit );
22
23
/*
24
function : Convert UTF-32 to Modified UTF-8.
25
input : str_utf32, a null terminated string in UTF-32.
26
output : str_mutf8, a null terminated string in Modified UTF-8.
27
input : str_mutf8_limit, the max length(byte count)
28
of str_mutf8 plus one(for 'null'), str_mutf8 must have enough space
29
for str_mutf8_limit bytes.
30
return : -1 for errors;
31
else the length(byte count) of str_mutf8,
32
maybe larger than (str_mutf8_limit-1) if the space
33
of str_mutf8 isn't enough.
34
note : convert U+0000 to 0x00, not 0xc080 字符串結束
35
*/
36
int utf32_to_mutf8( const unsigned int *str_utf32,
37
unsigned char *str_mutf8, int str_mutf8_limit );
38
39
/*2
Convert Modified UTF-8 <==> UTF-32.3
*/4

5

6
/*7
function : Convert Modified UTF-8 to UTF-32.8
input : str_mutf8, a null terminated string in Modified UTF-8.9
output : str_utf32, a null terminated string in UTF-32.10
input : str_utf32_limit, the max length(character count) 11
of str_utf32 plus one(for 'null'), str_utf32 must have enough space 12
for str_utf32_limit characters.13
return : -1 for errors; 14
else the length(character count) of str_utf32, 15
maybe larger than (str_utf32_limit-1) if the space 16
of str_utf32 isn't enough.17
note : convert 0xc080 to U+0000 字符串未結束18
convert 0x00 to U+0000 字符串結束19
*/20
int mutf8_to_utf32( const unsigned char *str_mutf8, 21
unsigned int *str_utf32, int str_utf32_limit );22

23
/*24
function : Convert UTF-32 to Modified UTF-8.25
input : str_utf32, a null terminated string in UTF-32.26
output : str_mutf8, a null terminated string in Modified UTF-8.27
input : str_mutf8_limit, the max length(byte count) 28
of str_mutf8 plus one(for 'null'), str_mutf8 must have enough space 29
for str_mutf8_limit bytes.30
return : -1 for errors; 31
else the length(byte count) of str_mutf8, 32
maybe larger than (str_mutf8_limit-1) if the space 33
of str_mutf8 isn't enough.34
note : convert U+0000 to 0x00, not 0xc080 字符串結束35
*/36
int utf32_to_mutf8( const unsigned int *str_utf32, 37
unsigned char *str_mutf8, int str_mutf8_limit );38

39

C代碼
1
/*
2
Convert Modified UTF-8 <==> UTF-32.
3
*/
4
5
6
#include "cvt_mutf8_utf32.h"
7
#include <stdio.h>
8
9
10
/*
11
A U+0001 to U+007F
12
0+++ ++++ u &0x80 => 0x00
13
14
B U+0080 to U+07FF, and null character (U+0000)
15
110+ ++++ u &0xe0 => 0xc0
16
10++ ++++ v &0xc0 => 0x80
17
((u & 0x1f) << 6) + (v & 0x3f)
18
19
C U+0800 to U+FFFF
20
1110 ++++ u &0xf0 => 0xe0
21
10++ ++++ v &0xc0 => 0x80
22
10++ ++++ w &0xc0 => 0x80
23
((u & 0xf) << 12) + ((v & 0x3f) << 6) + (w & 0x3f)
24
25
D above U+FFFF (U+10000 to U+10FFFF)
26
1110 1101 u &0xff => 0xed
27
1010 ++++ v &0xf0 => 0xa0
28
10++ ++++ w &0xc0 => 0x80
29
1110 1101 x &0xff => 0xed
30
1011 ++++ y &0xf0 => 0xb0
31
10++ ++++ z &0xc0 => 0x80
32
0x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f)
33
*/
34
35
/*
 * Convert a null-terminated Modified UTF-8 string to UTF-32.
 *
 * str_mutf8       : input bytes; decoding stops at the first 0x00 byte.
 *                   The two-byte form 0xc0 0x80 decodes to U+0000 and does
 *                   NOT end the string.
 * str_utf32       : output buffer; may be NULL to only count characters.
 * str_utf32_limit : capacity of str_utf32 in elements, terminator included.
 *
 * Returns -1 when str_mutf8 is NULL, str_utf32_limit is negative, or the
 * input contains a malformed sequence (characters decoded before the bad
 * sequence may already have been stored, and no terminator is written).
 * Otherwise returns the number of decoded characters, not counting the
 * terminator.  The result may exceed (str_utf32_limit - 1); in that case
 * only the first (str_utf32_limit - 1) characters are kept, followed by a
 * terminating 0.  With str_utf32_limit == 0 nothing is written at all.
 */
int mutf8_to_utf32( const unsigned char *str_mutf8,
        unsigned int *str_utf32, int str_utf32_limit ) {
    int len32 = 0;

    if ( (NULL == str_mutf8) || (0 > str_utf32_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        unsigned int cod, u, v, w, x, y, z;

        u = *str_mutf8++;
        if ( 0 == u ) {
            break;
        }

        if ( 0x00 == (0x80 & u) ) {
            /* 0+++ ++++ : U+0001 to U+007F */
            cod = u;
        }
        else if ( 0xc0 == (0xe0 & u) ) {
            /* 110+ ++++  10++ ++++ : U+0080 to U+07FF, and U+0000 */
            v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            cod = ((u & 0x1f) << 6) | (v & 0x3f);
        }
        else if ( 0xe0 == (0xf0 & u) ) {
            /* 1110 ++++  10++ ++++  10++ ++++ */
            v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            w = *str_mutf8++;
            if ( 0x80 != (0xc0 & w) ) {
                return (-1);
            }

            if ( (0xed == u) && (0xa0 == (0xf0 & v)) ) {
                /*
                 * 0xed 0xa? 0x?? is the first half of a six-byte pair for a
                 * character above U+FFFF; the second half must be
                 * 0xed 0xb? 0x??.  A terminating 0x00 fails these checks,
                 * so the input is never read past its terminator.
                 */
                x = *str_mutf8++;
                if ( 0xed != x ) {
                    return (-1);
                }
                y = *str_mutf8++;
                if ( 0xb0 != (0xf0 & y) ) {
                    return (-1);
                }
                z = *str_mutf8++;
                if ( 0x80 != (0xc0 & z) ) {
                    return (-1);
                }
                cod = 0x10000 + ( ((v & 0x0f) << 16) |
                                  ((w & 0x3f) << 10) |
                                  ((y & 0x0f) << 6) |
                                  (z & 0x3f) );
            }
            else {
                /* plain three-byte form: U+0800 to U+FFFF */
                cod = ((u & 0x0f) << 12) | ((v & 0x3f) << 6) | (w & 0x3f);
            }
        }
        else {
            /* stray continuation byte, or a lead byte of 0xf0 and above */
            return (-1);
        }

        /* keep counting after the buffer is full so the caller learns the
           size that is really needed */
        if ( (NULL != str_utf32) && (len32 < str_utf32_limit) ) {
            str_utf32[ len32 ] = cod;
        }
        ++len32;
    }

    /* terminate only when there is room for at least one element; a limit
       of 0 must not touch str_utf32[-1] */
    if ( (NULL != str_utf32) && (0 < str_utf32_limit) ) {
        if ( len32 < str_utf32_limit ) {
            str_utf32[ len32 ] = 0;
        }
        else {
            str_utf32[ str_utf32_limit - 1 ] = 0;
        }
    }

    return len32;
}
129
130
/*
 * Convert a null-terminated UTF-32 string to Modified UTF-8.
 *
 * str_utf32       : input code points; encoding stops at the first 0.
 *                   (U+0000 therefore always ends the string and is never
 *                   emitted as 0xc0 0x80.)
 * str_mutf8       : output buffer; may be NULL to only count bytes.
 * str_mutf8_limit : capacity of str_mutf8 in bytes, terminator included.
 *
 * Returns -1 when str_utf32 is NULL, str_mutf8_limit is negative, or a code
 * point is above U+10FFFF (bytes for earlier code points may already have
 * been stored, and no terminator is written).  Otherwise returns the number
 * of bytes of the full encoding, not counting the terminator.  The result
 * may exceed (str_mutf8_limit - 1); in that case only the first
 * (str_mutf8_limit - 1) bytes are kept, followed by a terminating 0x00, and
 * the cut may fall in the middle of a multi-byte sequence.  With
 * str_mutf8_limit == 0 nothing is written at all.
 *
 * Code points in 0xD800..0xDFFF are not rejected; they are encoded with the
 * ordinary three-byte form.
 */
int utf32_to_mutf8( const unsigned int *str_utf32,
        unsigned char *str_mutf8, int str_mutf8_limit ) {
    int len8 = 0;

    if ( (NULL == str_utf32) || (0 > str_mutf8_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        unsigned char seq[6];   /* encoding of the current code point */
        int seq_len = 0;
        int i;
        unsigned int cod = *str_utf32++;

        if ( 0 == cod ) {
            break;
        }

        if ( 0x007f >= cod ) {
            seq[ seq_len++ ] = (unsigned char)cod;
        }
        else if ( 0x07ff >= cod ) {
            seq[ seq_len++ ] = (unsigned char)(0xc0 | ((cod >> 6) & 0x1f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else if ( 0xffff >= cod ) {
            seq[ seq_len++ ] = (unsigned char)(0xe0 | ((cod >> 12) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | ((cod >> 6) & 0x3f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else if ( 0x10ffff >= cod ) {
            /* above U+FFFF: two three-byte halves, each starting with 0xed,
               carrying the 20 bits of (cod - 0x10000) */
            cod -= 0x10000;
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xa0 | ((cod >> 16) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | ((cod >> 10) & 0x3f));
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xb0 | ((cod >> 6) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else {
            return (-1);
        }

        /* keep counting after the buffer is full so the caller learns the
           size that is really needed */
        for ( i = 0; i < seq_len; ++i ) {
            if ( (NULL != str_mutf8) && (len8 < str_mutf8_limit) ) {
                str_mutf8[ len8 ] = seq[ i ];
            }
            ++len8;
        }
    }

    /* terminate only when there is room for at least one byte; a limit of 0
       must not touch str_mutf8[-1] */
    if ( (NULL != str_mutf8) && (0 < str_mutf8_limit) ) {
        if ( len8 < str_mutf8_limit ) {
            str_mutf8[ len8 ] = 0;
        }
        else {
            str_mutf8[ str_mutf8_limit - 1 ] = 0;
        }
    }

    return len8;
}
196
197
/*2
Convert Modified UTF-8 <==> UTF-32.3
*/4

5

6
#include "cvt_mutf8_utf32.h"7
#include <stdio.h> 8

9

10
/*11
A U+0001 to U+007F12
0+++ ++++ u &0x80 => 0x0013

14
B U+0080 to U+07FF, and null character (U+0000)15
110+ ++++ u &0xe0 => 0xc016
10++ ++++ v &0xc0 => 0x8017
((u & 0x1f) << 6) + (v & 0x3f)18

19
C U+0800 to U+FFFF20
1110 ++++ u &0xf0 => 0xe021
10++ ++++ v &0xc0 => 0x8022
10++ ++++ w &0xc0 => 0x8023
((u & 0xf) << 12) + ((v & 0x3f) << 6) + (w & 0x3f)24

25
D above U+FFFF (U+10000 to U+10FFFF)26
1110 1101 u &0xff => 0xed27
1010 ++++ v &0xf0 => 0xa028
10++ ++++ w &0xc0 => 0x8029
1110 1101 x &0xff => 0xed30
1011 ++++ y &0xf0 => 0xb031
10++ ++++ z &0xc0 => 0x8032
0x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f) 33
*/34

35
/*
 * Convert a null-terminated Modified UTF-8 string to UTF-32.
 *
 * str_mutf8       : input bytes; decoding stops at the first 0x00 byte.
 *                   The two-byte form 0xc0 0x80 decodes to U+0000 and does
 *                   NOT end the string.
 * str_utf32       : output buffer; may be NULL to only count characters.
 * str_utf32_limit : capacity of str_utf32 in elements, terminator included.
 *
 * Returns -1 when str_mutf8 is NULL, str_utf32_limit is negative, or the
 * input contains a malformed sequence (characters decoded before the bad
 * sequence may already have been stored, and no terminator is written).
 * Otherwise returns the number of decoded characters, not counting the
 * terminator.  The result may exceed (str_utf32_limit - 1); in that case
 * only the first (str_utf32_limit - 1) characters are kept, followed by a
 * terminating 0.  With str_utf32_limit == 0 nothing is written at all.
 */
int mutf8_to_utf32( const unsigned char *str_mutf8,
        unsigned int *str_utf32, int str_utf32_limit ) {
    int len32 = 0;

    if ( (NULL == str_mutf8) || (0 > str_utf32_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        unsigned int cod, u, v, w, x, y, z;

        u = *str_mutf8++;
        if ( 0 == u ) {
            break;
        }

        if ( 0x00 == (0x80 & u) ) {
            /* 0+++ ++++ : U+0001 to U+007F */
            cod = u;
        }
        else if ( 0xc0 == (0xe0 & u) ) {
            /* 110+ ++++  10++ ++++ : U+0080 to U+07FF, and U+0000 */
            v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            cod = ((u & 0x1f) << 6) | (v & 0x3f);
        }
        else if ( 0xe0 == (0xf0 & u) ) {
            /* 1110 ++++  10++ ++++  10++ ++++ */
            v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            w = *str_mutf8++;
            if ( 0x80 != (0xc0 & w) ) {
                return (-1);
            }

            if ( (0xed == u) && (0xa0 == (0xf0 & v)) ) {
                /*
                 * 0xed 0xa? 0x?? is the first half of a six-byte pair for a
                 * character above U+FFFF; the second half must be
                 * 0xed 0xb? 0x??.  A terminating 0x00 fails these checks,
                 * so the input is never read past its terminator.
                 */
                x = *str_mutf8++;
                if ( 0xed != x ) {
                    return (-1);
                }
                y = *str_mutf8++;
                if ( 0xb0 != (0xf0 & y) ) {
                    return (-1);
                }
                z = *str_mutf8++;
                if ( 0x80 != (0xc0 & z) ) {
                    return (-1);
                }
                cod = 0x10000 + ( ((v & 0x0f) << 16) |
                                  ((w & 0x3f) << 10) |
                                  ((y & 0x0f) << 6) |
                                  (z & 0x3f) );
            }
            else {
                /* plain three-byte form: U+0800 to U+FFFF */
                cod = ((u & 0x0f) << 12) | ((v & 0x3f) << 6) | (w & 0x3f);
            }
        }
        else {
            /* stray continuation byte, or a lead byte of 0xf0 and above */
            return (-1);
        }

        /* keep counting after the buffer is full so the caller learns the
           size that is really needed */
        if ( (NULL != str_utf32) && (len32 < str_utf32_limit) ) {
            str_utf32[ len32 ] = cod;
        }
        ++len32;
    }

    /* terminate only when there is room for at least one element; a limit
       of 0 must not touch str_utf32[-1] */
    if ( (NULL != str_utf32) && (0 < str_utf32_limit) ) {
        if ( len32 < str_utf32_limit ) {
            str_utf32[ len32 ] = 0;
        }
        else {
            str_utf32[ str_utf32_limit - 1 ] = 0;
        }
    }

    return len32;
}

130
/*
 * Convert a null-terminated UTF-32 string to Modified UTF-8.
 *
 * str_utf32       : input code points; encoding stops at the first 0.
 *                   (U+0000 therefore always ends the string and is never
 *                   emitted as 0xc0 0x80.)
 * str_mutf8       : output buffer; may be NULL to only count bytes.
 * str_mutf8_limit : capacity of str_mutf8 in bytes, terminator included.
 *
 * Returns -1 when str_utf32 is NULL, str_mutf8_limit is negative, or a code
 * point is above U+10FFFF (bytes for earlier code points may already have
 * been stored, and no terminator is written).  Otherwise returns the number
 * of bytes of the full encoding, not counting the terminator.  The result
 * may exceed (str_mutf8_limit - 1); in that case only the first
 * (str_mutf8_limit - 1) bytes are kept, followed by a terminating 0x00, and
 * the cut may fall in the middle of a multi-byte sequence.  With
 * str_mutf8_limit == 0 nothing is written at all.
 *
 * Code points in 0xD800..0xDFFF are not rejected; they are encoded with the
 * ordinary three-byte form.
 */
int utf32_to_mutf8( const unsigned int *str_utf32,
        unsigned char *str_mutf8, int str_mutf8_limit ) {
    int len8 = 0;

    if ( (NULL == str_utf32) || (0 > str_mutf8_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        unsigned char seq[6];   /* encoding of the current code point */
        int seq_len = 0;
        int i;
        unsigned int cod = *str_utf32++;

        if ( 0 == cod ) {
            break;
        }

        if ( 0x007f >= cod ) {
            seq[ seq_len++ ] = (unsigned char)cod;
        }
        else if ( 0x07ff >= cod ) {
            seq[ seq_len++ ] = (unsigned char)(0xc0 | ((cod >> 6) & 0x1f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else if ( 0xffff >= cod ) {
            seq[ seq_len++ ] = (unsigned char)(0xe0 | ((cod >> 12) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | ((cod >> 6) & 0x3f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else if ( 0x10ffff >= cod ) {
            /* above U+FFFF: two three-byte halves, each starting with 0xed,
               carrying the 20 bits of (cod - 0x10000) */
            cod -= 0x10000;
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xa0 | ((cod >> 16) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | ((cod >> 10) & 0x3f));
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xb0 | ((cod >> 6) & 0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80 | (cod & 0x3f));
        }
        else {
            return (-1);
        }

        /* keep counting after the buffer is full so the caller learns the
           size that is really needed */
        for ( i = 0; i < seq_len; ++i ) {
            if ( (NULL != str_mutf8) && (len8 < str_mutf8_limit) ) {
                str_mutf8[ len8 ] = seq[ i ];
            }
            ++len8;
        }
    }

    /* terminate only when there is room for at least one byte; a limit of 0
       must not touch str_mutf8[-1] */
    if ( (NULL != str_mutf8) && (0 < str_mutf8_limit) ) {
        if ( len8 < str_mutf8_limit ) {
            str_mutf8[ len8 ] = 0;
        }
        else {
            str_mutf8[ str_mutf8_limit - 1 ] = 0;
        }
    }

    return len8;
}

197

posted on 2014-04-13 19:42 coreBugZJ 閱讀(1003) 評論(0) 編輯 收藏 引用 所屬分類: 技術視野



