source: trunk/Include/Classes/System/Text/UTF8Encoding.ab@ 478

Last change on this file since 478 was 411, checked in by イグトランス (egtra), 17 years ago

UTF8Encoding(仮)の追加

File size: 6.6 KB
Line 
1/*!
2@file Classes/System/Text/UTF8Encoding.ab
3@brief UTF8Encodingクラスとそれに関係するクラスなどの宣言・定義
4*/
5
6Namespace System
7Namespace Text
8
9Namespace Detail
10
/*!
@brief Stateful UTF-16 to UTF-8 encoder used by UTF8Encoding.

A high surrogate that ends an input block is remembered in `buffer` and
combined with the first unit of the next block. Unpaired surrogates are
encoded as U+FFFD (provisional behaviour until EncoderFallback is wired in).
*/
Class UTF8Encoder
	Inherits Encoder
Protected
	/*!
	@brief Converts UTF-16 code units to UTF-8 bytes.
	@param[in] chars Input UTF-16 code units.
	@param[in] charCount Number of elements in chars.
	@param[out] bytes Output buffer.
	@param[in] byteCount Capacity of bytes.
	@param[in] flush When True, a trailing unpaired high surrogate is emitted
	           as U+FFFD instead of being kept for the next call.
	@param[out] bytesUsed Number of bytes written to bytes.
	@param[out] charsUsed Number of elements consumed from chars.
	@param[out] completed True when every input element has been consumed.
	@exception ArgumentException The output buffer is too small.
	*/
	Override Sub ConvertCore(chars As *WCHAR, charCount As Long, bytes As *Byte, byteCount As Long, flush As Boolean,
		ByRef bytesUsed As Long, ByRef charsUsed As Long, ByRef completed As Boolean)

		Dim i = 0 As Long, j = 0 As Long
		Dim cp As DWord

		bytesUsed = 0
		charsUsed = 0
		completed = False

		' Resolve a high surrogate carried over from the previous call.
		If buffer <> 0 Then
			If charCount > 0 Then
				If _System_IsLowSurrogate(chars[0]) Then
					cp = ((((buffer And &h3FF) As DWord) << 10) Or (chars[0] And &h3FF)) + &h10000
					i = 1
				Else
					' The pending surrogate has no partner; chars[0] is handled by the main loop.
					cp = &hFFFD
				End If
				j = PutCodePoint(cp, bytes, j, byteCount)
				If j < 0 Then Goto *BufferOver
				buffer = 0
			ElseIf flush Then
				' No more input will follow, so the pending surrogate stays unpaired.
				j = PutCodePoint(&hFFFD, bytes, j, byteCount)
				If j < 0 Then Goto *BufferOver
				buffer = 0
			End If
		End If

		While i < charCount
			cp = chars[i]
			If _System_IsHighSurrogate(chars[i]) Then
				If i + 1 < charCount Then
					If _System_IsLowSurrogate(chars[i + 1]) Then
						' Rebuild the Unicode code point from the UTF-16 pair.
						cp = ((((chars[i] And &h3FF) As DWord) << 10) Or (chars[i + 1] And &h3FF)) + &h10000
						i++ ' skip the low surrogate as well
					Else
						cp = &hFFFD
					End If
				ElseIf flush Then
					cp = &hFFFD
				Else
					' Last element of this block: keep it until the next call.
					buffer = chars[i]
					i++
					Goto *Finish
				End If
			ElseIf _System_IsLowSurrogate(chars[i]) Then
				' Low surrogate without a preceding high surrogate.
				cp = &hFFFD
			End If

			j = PutCodePoint(cp, bytes, j, byteCount)
			If j < 0 Then Goto *BufferOver
			i++
		Wend

	*Finish
		bytesUsed = j
		charsUsed = i
		completed = True
		Exit Sub

	*BufferOver
		' Output buffer too small.
		Throw New ArgumentException("Buffer is not enough.", "bytes")
	End Sub

Private
	/*!
	@brief Writes one code point as UTF-8 at bytes[pos].
	@param[in] cp Code point to encode (expected to be at most U+10FFFF and not a surrogate).
	@param[out] bytes Output buffer.
	@param[in] pos Index at which writing starts.
	@param[in] byteCount Capacity of bytes.
	@return Index just past the last byte written, or -1 when the sequence does not fit.
	*/
	Function PutCodePoint(cp As DWord, bytes As *Byte, pos As Long, byteCount As Long) As Long
		If cp < &h80 Then
			' 1-byte form
			If pos + 1 > byteCount Then
				Return -1
			End If
			bytes[pos] = cp As Byte
			Return pos + 1
		ElseIf cp < &h800 Then
			' 2-byte form
			If pos + 2 > byteCount Then
				Return -1
			End If
			bytes[pos] = ((cp >> 6) Or &hC0) As Byte
			bytes[pos + 1] = (cp And &h3F Or &h80) As Byte
			Return pos + 2
		ElseIf cp < &h10000 Then
			' 3-byte form
			If pos + 3 > byteCount Then
				Return -1
			End If
			bytes[pos] = ((cp >> 12) Or &hE0) As Byte
			bytes[pos + 1] = ((cp >> 6) And &h3F Or &h80) As Byte
			bytes[pos + 2] = (cp And &h3F Or &h80) As Byte
			Return pos + 3
		End If
		' 4-byte form
		If pos + 4 > byteCount Then
			Return -1
		End If
		bytes[pos] = ((cp >> 18) Or &hF0) As Byte
		bytes[pos + 1] = ((cp >> 12) And &h3F Or &h80) As Byte
		bytes[pos + 2] = ((cp >> 6) And &h3F Or &h80) As Byte
		bytes[pos + 3] = (cp And &h3F Or &h80) As Byte
		Return pos + 4
	End Function

	' High surrogate left over from the previous ConvertCore call; 0 when none is pending.
	buffer As WCHAR
End Class
87
/*!
@brief Stateful UTF-8 to UTF-16 decoder used by UTF8Encoding.

A multi-byte sequence may be split across calls; the partial code point is
kept in `buf`, with `state` counting the bytes read so far and `last` holding
the total length of the sequence. Malformed input is replaced with U+FFFD
(provisional behaviour until DecoderFallback is wired in).
*/
Class UTF8Decoder
	Inherits Decoder
Protected
	/*!
	@brief Converts UTF-8 bytes to UTF-16 code units.
	@param[in] bytes Input bytes.
	@param[in] byteCount Number of elements in bytes.
	@param[out] chars Output buffer.
	@param[in] charCount Capacity of chars.
	@param[in] flush When True, an unfinished sequence at the end of the input
	           is emitted as U+FFFD and the internal state is cleared.
	@param[out] bytesUsed Number of elements consumed from bytes.
	@param[out] charsUsed Number of elements written to chars.
	@param[out] completed True when every input byte has been consumed.
	@exception ArgumentException The output buffer is too small.
	*/
	Override Sub ConvertCore(bytes As *Byte, byteCount As Long, chars As *WCHAR, charCount As Long, flush As Boolean,
		ByRef bytesUsed As Long, ByRef charsUsed As Long, ByRef completed As Boolean)

		Dim i = 0 As Long, j = 0 As Long
		Dim b As Byte
		Dim cp As DWord

		bytesUsed = 0
		charsUsed = 0
		completed = False

		While i < byteCount
			b = bytes[i]
			If state = 0 Then
				If b < &h80 Then
					' Single-byte (ASCII) character
					If j >= charCount Then Goto *BufferOver
					chars[j] = b
					j++
				ElseIf b < &hC0 Or b >= &hF8 Then
					' Stray continuation byte, or a byte that can never start a sequence
					If j >= charCount Then Goto *BufferOver
					chars[j] = &hFFFD
					j++
				ElseIf b < &hE0 Then
					' Lead byte of a 2-byte sequence (110xxxxx)
					last = 2
					buf = b And &h1F
					state = 1
				ElseIf b < &hF0 Then
					' Lead byte of a 3-byte sequence (1110xxxx)
					last = 3
					buf = b And &h0F
					state = 1
				Else
					' Lead byte of a 4-byte sequence (11110xxx)
					last = 4
					buf = b And &h07
					state = 1
				End If
				i++
			ElseIf &h80 <= b And b < &hC0 Then
				' Continuation byte. The member state is only updated after any
				' capacity check has passed, so an exception leaves it untouched.
				cp = (buf << 6) Or (b And &h3F)
				If state + 1 < last Then
					buf = cp
					state++
				Else
					' Final byte of the sequence: validate and emit.
					If last = 4 And cp >= &h10000 And cp <= &h10FFFF Then
						If j + 2 > charCount Then Goto *BufferOver
						cp -= &h10000
						chars[j] = (&hD800 Or (cp >> 10)) As WCHAR
						chars[j + 1] = (&hDC00 Or (cp And &h3FF)) As WCHAR
						j += 2
					ElseIf (last = 2 And cp >= &h80) Or (last = 3 And cp >= &h800 And (cp < &hD800 Or cp > &hDFFF)) Then
						If j >= charCount Then Goto *BufferOver
						chars[j] = cp As WCHAR
						j++
					Else
						' Overlong form, encoded surrogate, or value above U+10FFFF
						If j >= charCount Then Goto *BufferOver
						chars[j] = &hFFFD
						j++
					End If
					state = 0
				End If
				i++
			Else
				' The sequence was cut short by a non-continuation byte.
				' Replace the broken sequence and decode this byte again as the
				' start of a new one (i is deliberately not advanced).
				If j >= charCount Then Goto *BufferOver
				chars[j] = &hFFFD
				j++
				state = 0
			End If
		Wend

		If state <> 0 Then
			If flush Then
				' Input ended in the middle of a sequence and no more will follow.
				If j >= charCount Then Goto *BufferOver
				chars[j] = &hFFFD
				j++
				state = 0
			End If
		End If

		bytesUsed = i
		charsUsed = j
		completed = True
		Exit Sub

	*BufferOver
		' Output buffer too small.
		Throw New ArgumentException("Buffer is not enough.", "chars")
	End Sub

Private
	' Code point bits accumulated so far for the sequence in progress.
	buf As DWord
	' Number of bytes of the current sequence already read; 0 between sequences.
	state As Long
	' Total length in bytes of the current sequence (2, 3 or 4).
	last As Long
End Class
170
171End Namespace 'Detail
172
173
/*!
@brief Encoding for UTF-8
@date 2007/12/21
@author Egtra
*/
Class UTF8Encoding
	Inherits Encoding
Public

	' Creates a new UTF8Encoding that shares this instance's fallback objects.
	Override Function Clone() As Object
		Dim copy = New UTF8Encoding
		copy.DecoderFallback = This.DecoderFallback
		copy.EncoderFallback = This.EncoderFallback
		Clone = copy
	End Function

	' Returns a fresh UTF-8 decoder.
	' The fallback is not handed over yet; the original kept
	' "GetDecoder.Fallback = DecoderFallback" commented out.
	Override Function GetDecoder() As Decoder
		Return New Detail.UTF8Decoder
	End Function

	' Returns a fresh UTF-8 encoder.
	' The fallback is not handed over yet; the original kept
	' "GetEncoder.Fallback = EncoderFallback" commented out.
	Override Function GetEncoder() As Encoder
		Return New Detail.UTF8Encoder
	End Function

	' Upper bound on the number of bytes produced for charCount UTF-16 units.
	' The maximum is reached when every unit becomes a 3-byte UTF-8 sequence.
	' A 4-byte UTF-8 sequence corresponds to a surrogate pair in UTF-16, so it
	' costs only 2 bytes per UTF-16 unit.
	Override Function GetMaxByteCount(charCount As Long) As Long
		GetMaxByteCount = charCount * 3
	End Function

	' Upper bound on the number of UTF-16 units produced for byteCount bytes.
	' The maximum is reached when every character is U+007F or below.
	Override Function GetMaxCharCount(byteCount As Long) As Long
		GetMaxCharCount = byteCount
	End Function
Protected
	' The four *Core overrides below have no body in this revision; nothing is
	' assigned to their return values.
	Override Function GetBytesCountCore(s As *WCHAR, n As Long) As Long
	End Function

	Override Function GetBytesCore(chars As *WCHAR, charCount As Long, bytes As *Byte, byteCount As Long) As Long
	End Function

	Override Function GetCharsCountCore(s As *Byte, n As Long) As Long
	End Function

	Override Function GetCharsCore(bytes As *Byte, byteCount As Long, chars As *WCHAR, charCount As Long) As Long
	End Function
Public
	' Returns a pointer to the shared byte order mark array.
	Override Function GetPreamble() As *Byte
		GetPreamble = bom
	End Function

	' Returns Len(bom) for the byte order mark array.
	Override Function GetPreambleLength() As Long
		GetPreambleLength = Len(bom)
	End Function

	Override Function IsAlwaysNormalized() As Boolean
		Return False
	End Function

	Override Function IsAlwaysNormalized(f As NormalizationForm) As Boolean
		Return False
	End Function

	Override Function BodyName() As String
		BodyName = "utf-8"
	End Function

	Override Function HeaderName() As String
		HeaderName = "utf-8"
	End Function

	Override Function EncodingName() As String
		EncodingName = "UTF-8"
	End Function

	Override Function WebName() As String
		WebName = "utf-8"
	End Function

	Override Function IsSingleByte() As Boolean
		IsSingleByte = False
	End Function
Private
	' UTF-8 byte order mark: EF BB BF.
	Static bom[2] = [&hEF, &hBB, &hBF] As Byte
End Class
263
264End Namespace 'Text
265End Namespace 'System
Note: See TracBrowser for help on using the repository browser.