coreBugZJ

此 blog 已棄。

Modified UTF-8 與 UTF-32 相互轉換

自己的實現，經過一定的測試。

頭文件

/*
2

Convert Modified UTF-8 <==> UTF-32.
3

*/
4

/*
    function : Convert Modified UTF-8 to UTF-32.

    input  : str_mutf8, a null terminated string in Modified UTF-8.
             Must not be NULL.
    output : str_utf32, a null terminated string in UTF-32.
             May be NULL, in which case nothing is written and only
             the length is computed.
    input  : str_utf32_limit, the max length (character count) of
             str_utf32 plus one (for the terminating null); str_utf32
             must have enough space for str_utf32_limit characters.
             Must not be negative.

    return : -1 for errors (NULL input, negative limit, malformed input);
             else the length (character count) of the converted string,
             not counting the terminating null. The value may be larger
             than (str_utf32_limit-1) if the space of str_utf32 isn't
             enough; the output is then truncated.

    note   : the byte pair 0xC0 0x80 is converted to U+0000 and does
             NOT end the string;
             the single byte 0x00 ends the string.
*/
int mutf8_to_utf32( const unsigned char *str_mutf8,
                    unsigned int *str_utf32, int str_utf32_limit );
22

/*
    function : Convert UTF-32 to Modified UTF-8.

    input  : str_utf32, a null terminated string in UTF-32.
             Must not be NULL.
    output : str_mutf8, a null terminated string in Modified UTF-8.
             May be NULL, in which case nothing is written and only
             the length is computed.
    input  : str_mutf8_limit, the max length (byte count) of str_mutf8
             plus one (for the terminating null); str_mutf8 must have
             enough space for str_mutf8_limit bytes.
             Must not be negative.

    return : -1 for errors (NULL input, negative limit, code point
             above U+10FFFF);
             else the length (byte count) of the converted string,
             not counting the terminating null. The value may be larger
             than (str_mutf8_limit-1) if the space of str_mutf8 isn't
             enough; the output is then truncated.

    note   : U+0000 ends the input string and is written as the single
             terminating byte 0x00, never as 0xC0 0x80.
*/
int utf32_to_mutf8( const unsigned int *str_utf32,
                    unsigned char *str_mutf8, int str_mutf8_limit );
38

C代碼

/*
2

Convert Modified UTF-8 <==> UTF-32.
3

*/
4

#include "cvt_mutf8_utf32.h"
7

#include <stdio.h>
8

/*
11

A U+0001 to U+007F
12

0+++ ++++ u &0x80 => 0x00
13

B U+0080 to U+07FF, and null character (U+0000)
15

110+ ++++ u &0xe0 => 0xc0
16

10++ ++++ v &0xc0 => 0x80
17

((u & 0x1f) << 6) + (v & 0x3f)
18

C U+0800 to U+FFFF
20

1110 ++++ u &0xf0 => 0xe0
21

10++ ++++ v &0xc0 => 0x80
22

10++ ++++ w &0xc0 => 0x80
23

((u & 0xf) << 12) + ((v & 0x3f) << 6) + (w & 0x3f)
24

D above U+FFFF (U+10000 to U+10FFFF)
26

1110 1101 u &0xff => 0xed
27

1010 ++++ v &0xf0 => 0xa0
28

10++ ++++ w &0xc0 => 0x80
29

1110 1101 x &0xff => 0xed
30

1011 ++++ y &0xf0 => 0xb0
31

10++ ++++ z &0xc0 => 0x80
32

0x10000+((v&0x0f)<<16)+((w&0x3f)<<10)+((y&0x0f)<<6)+(z&0x3f)
33

*/
34

/*
    Convert a null terminated Modified UTF-8 string to UTF-32.

    str_mutf8       : input bytes, terminated by a single 0x00 byte.
    str_utf32       : output buffer; may be NULL to only count characters.
    str_utf32_limit : capacity of str_utf32 in characters, including the
                      terminating 0. A limit of 0 means nothing is written.

    Returns -1 if str_mutf8 is NULL, the limit is negative, or the input
    is malformed. Otherwise returns the number of characters the complete
    conversion produces (terminator not counted), which exceeds
    (str_utf32_limit-1) when the output had to be truncated. When at least
    one character of space is available the output is always terminated.

    A 0x00 byte fails every continuation-byte check below, so a sequence
    cut short by the terminator is rejected without reading past it.
*/
int mutf8_to_utf32( const unsigned char *str_mutf8,
                    unsigned int *str_utf32, int str_utf32_limit ) {
    unsigned int cod, u, v, w, x, y, z;
    int len32 = 0;

    if ( (NULL == str_mutf8) || (0 > str_utf32_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        u = *str_mutf8++;

        if ( 0 == u ) {
            /* a plain 0x00 byte ends the input */
            break;
        }

        if ( 0x00 == (0x80 & u) ) {
            /* one byte : U+0001 to U+007F */
            cod = u;
        }
        else if ( 0xc0 == (0xe0 & u) ) {
            /* two bytes : U+0080 to U+07FF, and 0xC0 0x80 for U+0000 */
            v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            cod = ((u&0x1f)<<6) | (v&0x3f);
        }
        else if ( 0xe0 == (0xf0 & u) ) {
            v = *str_mutf8++;
            if ( 0x80 != (0xc0 & v) ) {
                return (-1);
            }
            w = *str_mutf8++;
            if ( 0x80 != (0xc0 & w) ) {
                return (-1);
            }

            if ( (0xed == u) && (0xa0 == (0xf0 & v)) ) {
                /* 0xED 0xAx 0x.. is a high surrogate: it must be followed
                   by a low surrogate 0xED 0xBx 0x.., and the six bytes
                   together encode one code point above U+FFFF */
                x = *str_mutf8++;
                if ( 0xed != x ) {
                    return (-1);
                }
                y = *str_mutf8++;
                if ( 0xb0 != (0xf0 & y) ) {
                    return (-1);
                }
                z = *str_mutf8++;
                if ( 0x80 != (0xc0 & z) ) {
                    return (-1);
                }
                cod = 0x10000 + (
                        ((v&0x0f)<<16) |
                        ((w&0x3f)<<10) |
                        ((y&0x0f)<<6)  |
                        (z&0x3f) );
            }
            else {
                /* three bytes : U+0800 to U+FFFF */
                cod = ((u&0x0f)<<12) | ((v&0x3f)<<6) | (w&0x3f);
            }
        }
        else {
            /* stray continuation byte or a lead byte 0xF0..0xFF, which
               Modified UTF-8 never uses */
            return (-1);
        }

        /* store while there is room, but keep counting past the limit so
           the caller learns the required size */
        if ( (NULL != str_utf32) && (len32 < str_utf32_limit) ) {
            str_utf32[ len32 ] = cod;
        }
        ++len32;
    }

    /* terminate the output; on truncation the last stored character is
       overwritten. With a limit of 0 there is no slot to write at all
       (indexing limit-1 would be out of bounds). */
    if ( (NULL != str_utf32) && (0 < str_utf32_limit) ) {
        if ( len32 < str_utf32_limit ) {
            str_utf32[ len32 ] = 0;
        }
        else {
            str_utf32[ str_utf32_limit-1 ] = 0;
        }
    }

    return len32;
}
129

130

/*
    Convert a null terminated UTF-32 string to Modified UTF-8.

    str_utf32       : input code points, terminated by U+0000.
    str_mutf8       : output buffer; may be NULL to only count bytes.
    str_mutf8_limit : capacity of str_mutf8 in bytes, including the
                      terminating 0x00. A limit of 0 means nothing is
                      written.

    Returns -1 if str_utf32 is NULL, the limit is negative, or a code
    point above U+10FFFF is found. Otherwise returns the number of bytes
    the complete conversion produces (terminator not counted), which
    exceeds (str_mutf8_limit-1) when the output had to be truncated.
    When at least one byte of space is available the output is always
    terminated. Truncation is byte-wise, so a truncated output may end
    in the middle of a multi-byte sequence.

    U+0000 terminates the input and is emitted as the single byte 0x00,
    never as 0xC0 0x80.
*/
int utf32_to_mutf8( const unsigned int *str_utf32,
                    unsigned char *str_mutf8, int str_mutf8_limit ) {
    unsigned char seq[ 6 ];   /* bytes of the current code point */
    unsigned int cod;
    int seq_len, i;
    int len8 = 0;

    if ( (NULL == str_utf32) || (0 > str_mutf8_limit) ) {
        return (-1);
    }

    for ( ; ; ) {
        cod = *str_utf32++;

        if ( 0 == cod ) {
            break;
        }

        seq_len = 0;
        if ( 0x007f >= cod ) {
            /* one byte : U+0001 to U+007F */
            seq[ seq_len++ ] = (unsigned char)cod;
        }
        else if ( 0x07ff >= cod ) {
            /* two bytes : U+0080 to U+07FF */
            seq[ seq_len++ ] = (unsigned char)(0xc0|((cod>>6)&0x1f));
            seq[ seq_len++ ] = (unsigned char)(0x80|(cod&0x3f));
        }
        else if ( 0xffff >= cod ) {
            /* three bytes : U+0800 to U+FFFF */
            seq[ seq_len++ ] = (unsigned char)(0xe0|((cod>>12)&0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80|((cod>>6)&0x3f));
            seq[ seq_len++ ] = (unsigned char)(0x80|(cod&0x3f));
        }
        else if ( 0x10ffff >= cod ) {
            /* six bytes : a surrogate pair, each half written as a
               three byte sequence starting with 0xED */
            cod -= 0x10000;
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xa0|((cod>>16)&0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80|((cod>>10)&0x3f));
            seq[ seq_len++ ] = 0xed;
            seq[ seq_len++ ] = (unsigned char)(0xb0|((cod>>6)&0x0f));
            seq[ seq_len++ ] = (unsigned char)(0x80|(cod&0x3f));
        }
        else {
            /* beyond the Unicode range */
            return (-1);
        }

        /* store while there is room, but keep counting past the limit so
           the caller learns the required size */
        for ( i = 0; i < seq_len; ++i ) {
            if ( (NULL != str_mutf8) && (len8 < str_mutf8_limit) ) {
                str_mutf8[ len8 ] = seq[ i ];
            }
            ++len8;
        }
    }

    /* terminate the output; on truncation the last stored byte is
       overwritten. With a limit of 0 there is no slot to write at all
       (indexing limit-1 would be out of bounds). */
    if ( (NULL != str_mutf8) && (0 < str_mutf8_limit) ) {
        if ( len8 < str_mutf8_limit ) {
            str_mutf8[ len8 ] = 0;
        }
        else {
            str_mutf8[ str_mutf8_limit-1 ] = 0;
        }
    }

    return len8;
}
196

197

posted on 2014-04-13 19:42 coreBugZJ 閱讀(1003) 評論(0) 編輯收藏引用所屬分類: 技術視野

只有注冊用戶登錄后才能發表評論。


相關文章: Modified UTF-8 與 UTF-32 相互轉換大端小端,ascii,unicode,utf8,utf16,utf32,gb2312,gbk,gb18030等字符編碼問題（轉）醫學諾貝爾之路（1923）：尋找胰島素（轉）面對面的辦公室——紀念艾倫?圖靈百年誕辰 1912.6.23-2012.6.23 （轉）超越邏輯（轉）軟件開發中的破窗效應（轉）多任務讓你走得更慢（轉）為什么開發者總是選昏招（轉）程序員也要養生（轉）禪修程序員十誡（轉）

網站導航: 博客園 IT新聞 BlogJava 博問 Chat2DB 管理

青青草原综合久久大伊人导航_色综合久久天天综合_日日噜噜夜夜狠狠久久丁香五月_热久久这里只有精品

coreBugZJ

My Links

Blog Stats

常用鏈接

留言簿(10)

隨筆分類(458)

隨筆檔案(268)

相冊

ACM

AI

LaTeX

安全

編程語言

好有道理

技術

開源

科學

數學

圖形圖像

文化

問題（練習＆有趣）

資源

最新隨筆

搜索

最新評論

閱讀排行榜

評論排行榜

Modified UTF-8 與 UTF-32 相互轉換