source: trunk/ab5.0/ablib/src/Classes/System/Text/UTF8Encoding.ab@ 676

Last change on this file since 676 was 676, checked in by イグトランス (egtra), 15 years ago

#231に関連して、エンコーディング周りを見直し、Encoder/Decoderをストリーム用に特化。
UTF8Encodingをコンパイル可能にし、ビルドに含めるようにした。ただし、実装が不完全なためテストは不可。
(#231)

File size: 5.5 KB
Line 
1/*!
2@file Classes/System/Text/UTF8Encoding.ab
3@brief UTF8Encodingクラスとそれに関係するクラスなどの宣言・定義
4*/
5
6Namespace System
7Namespace Text
8
9Namespace Detail
10
/*!
@brief Stream-oriented encoder that converts UTF-16 code units to UTF-8 bytes.

A high surrogate seen at the end of one call is kept in the buffer field so
that a surrogate pair split across two calls can still be combined.
*/
Class UTF8Encoder
	Inherits Encoder
Protected
	/*!
	@brief Encodes size UTF-16 code units from src and writes the UTF-8 bytes to s.
	@param[in] src  UTF-16 code units to encode.
	@param[in] size Number of code units available in src.
	@param[in] s    Stream that receives the encoded bytes.
	@param[in] last NOTE(review): assumed to be True when this is the final chunk
	                of the input; confirm against the Encoder base class.
	*/
	Override Sub EncodeImpl(src As *WCHAR, size As SIZE_T, s As IO.Stream, last As Boolean)
		' Guard against empty input: ELM(size) is size - 1, which may wrap
		' around when size is an unsigned zero.
		If size > 0 Then
			Dim i As Long
			For i = 0 To ELM(size)
				If buffer <> 0 Then
					' A high surrogate is pending from the previous code unit.
					If _System_IsLowSurrogate(src[i]) Then
						' Rebuild the Unicode code point from the surrogate pair.
						Dim c = (((buffer And &h3FF) As DWord << 10) Or (src[i] And &h3FF)) + &h10000
						' 4-byte form
						s.WriteByte(((c >> 18) Or &hf0) As Byte)
						s.WriteByte(((c >> 12) And &h3F Or &h80) As Byte)
						s.WriteByte(((c >> 6) And &h3F Or &h80) As Byte)
						s.WriteByte((c And &h3F Or &h80) As Byte)
						buffer = 0
					Else
						' Unpaired high surrogate: replace it, then encode the
						' current unit normally instead of discarding it.
						writeReplacementChar(s)
						buffer = 0
						encodeUnit(src[i], s)
					End If
				Else
					encodeUnit(src[i], s)
				End If
			Next
		End If

		' A high surrogate still pending at the end of the final chunk can never
		' be completed, so emit U+FFFD for it.
		If last Then
			If buffer <> 0 Then
				writeReplacementChar(s)
				buffer = 0
			End If
		End If
	End Sub

Private
	' Encodes one code unit when no high surrogate is pending.
	' A high surrogate is stored in buffer to be combined with the next unit.
	Sub encodeUnit(c As WCHAR, s As IO.Stream)
		If c < &h80 Then
			' 1-byte form
			s.WriteByte(c As Byte)
		ElseIf c < &h800 Then
			' 2-byte form
			s.WriteByte(((c >> 6) Or &hC0) As Byte)
			s.WriteByte((c And &h3F Or &h80) As Byte)
		ElseIf _System_IsHighSurrogate(c) Then
			' Keep it until the matching low surrogate arrives.
			buffer = c
		ElseIf _System_IsLowSurrogate(c) Then
			' Low surrogate without a preceding high surrogate.
			writeReplacementChar(s)
		Else
			' 3-byte form
			s.WriteByte(((c >> 12) Or &hE0) As Byte)
			s.WriteByte(((c >> 6) And &h3F Or &h80) As Byte)
			s.WriteByte((c And &h3F Or &h80) As Byte)
		End If
	End Sub

	' Writes EF BF BD, the UTF-8 form of U+FFFD REPLACEMENT CHARACTER.
	Sub writeReplacementChar(s As IO.Stream)
		Dim rc[2] = [&hef, &hbf, &hbd] As Byte
		s.Write(rc, 0, Len(rc))
	End Sub

	' Pending high surrogate, or 0 when none is pending.
	buffer As WCHAR
End Class
60
/*!
@brief Stream-oriented decoder that converts UTF-8 bytes to UTF-16 code units.

Decoding state is kept in fields, so a multibyte sequence split across two
calls to DecodeImpl is still decoded correctly.
*/
Class UTF8Decoder
	Inherits Decoder
Protected
	/*!
	@brief Reads up to DefalultDecodingBufferSize bytes from s and appends the
	       decoded UTF-16 code units to dst.
	@param[out] dst List that receives the decoded code units.
	@param[in]  s   Stream to read UTF-8 bytes from.
	@retval True  The byte budget for this call was used up without ReadByte
	              returning -1.
	@retval False ReadByte returned -1.

	Ill-formed input (stray continuation bytes, overlong forms, encoded
	surrogates, code points above U+10FFFF, truncated sequences) is replaced
	with U+FFFD.
	*/
	Override Function DecodeImpl(dst As Collections.Generic.List<WCHAR>, s As IO.Stream) As Boolean
		Dim i As Long
		For i = 0 To DefalultDecodingBufferSize - 1 'ELM
			Dim b = s.ReadByte()
			If b = -1 Then
				' NOTE(review): -1 is treated as end of stream; a sequence that is
				' still incomplete here is replaced by U+FFFD.
				If state <> 0 Then
					dst.Add(&hFFFD As WCHAR)
					state = 0
				End If
				DecodeImpl = False
				Exit Function
			ElseIf state = 0 Then
				decodeStartByte(dst, b)
			ElseIf &h80 <= b And b < &hC0 Then
				' Continuation byte of a multibyte sequence
				buffer <<= 6
				buffer Or= b And &h3F
				state++
				If state = last Then ' Reached the final byte of the sequence
					If state = 2 And buffer >= &h80 Then
						dst.Add(buffer As WCHAR)
					ElseIf state = 3 And buffer >= &h800 And (buffer < &hD800 Or buffer >= &hE000) Then
						' 3-byte form: must not be overlong and must not encode a surrogate.
						dst.Add(buffer As WCHAR)
					ElseIf state = 4 And buffer >= &h10000 And buffer <= &h10FFFF Then
						' 4-byte form: split the code point into a surrogate pair.
						buffer -= &h10000
						dst.Add((&hD800 Or (buffer >> 10)) As WCHAR)
						dst.Add((&hDC00 Or (buffer And &h3FF)) As WCHAR)
					Else
						' Not the shortest form, an encoded surrogate, or a code
						' point beyond U+10FFFF.
						dst.Add(&hfffd As WCHAR)
					End If
					state = 0
				End If
			Else
				' A non-continuation byte interrupted a multibyte sequence:
				' replace the broken sequence, then handle this byte as the
				' start of a new character instead of discarding it.
				dst.Add(&hFFFD As WCHAR)
				state = 0
				decodeStartByte(dst, b)
			End If
		Next
		DecodeImpl = True
	End Function

Private
	' Handles a byte that appears where a new character is expected (state = 0).
	' ASCII is emitted directly; a lead byte initializes buffer, last and state.
	Sub decodeStartByte(dst As Collections.Generic.List<WCHAR>, b As Long)
		If b < &h80 Then
			' 1-byte form (ASCII)
			dst.Add(b As WCHAR)
		ElseIf b < &hC0 Then
			' Continuation byte without a preceding lead byte
			dst.Add(&hFFFD As WCHAR)
		ElseIf b < &hE0 Then
			' Lead byte of a 2-byte sequence (110xxxxx)
			last = 2
			buffer = b And &h1F
			state = 1
		ElseIf b < &hF0 Then
			' Lead byte of a 3-byte sequence (1110xxxx)
			last = 3
			buffer = b And &h0F
			state = 1
		ElseIf b < &hF8 Then
			' Lead byte of a 4-byte sequence (11110xxx)
			last = 4
			buffer = b And &h07
			state = 1
		Else
			' Current UTF-8 does not allow sequences longer than 4 bytes.
			dst.Add(&hFFFD As WCHAR)
		End If
	End Sub

	' Code point bits accumulated so far for the sequence being decoded.
	buffer As DWord
	' Number of bytes of the current sequence consumed so far; 0 between characters.
	state As Long
	' Total length in bytes of the sequence being decoded.
	last As Long
End Class
133
134End Namespace 'Detail
135
136
/*!
@brief Encoding for UTF-8
@date 2007/12/21
@author Egtra
*/
Class UTF8Encoding
	Inherits Encoding
Public
	' Creates a new UTF8Encoding instance.
	Override Function Clone() As Object
		Return New UTF8Encoding
	End Function

	' Creates a decoder that converts UTF-8 bytes to UTF-16.
	Override Function GetDecoder() As Decoder
		Return New Detail.UTF8Decoder
	End Function

	' Creates an encoder that converts UTF-16 to UTF-8 bytes.
	Override Function GetEncoder() As Encoder
		Return New Detail.UTF8Encoder
	End Function

	' Upper bound on the number of bytes produced for charCount UTF-16 code units.
	Override Function GetMaxByteCount(charCount As Long) As Long
		' The worst case is when every code unit becomes 3 bytes in UTF-8.
		'
		' A sequence that takes 4 bytes in UTF-8 is a surrogate pair in UTF-16,
		' which costs only 2 bytes per code unit, so it is not the maximum.
		Return charCount * 3
	End Function

	' Upper bound on the number of UTF-16 code units produced for byteCount bytes.
	Override Function GetMaxCharCount(byteCount As Long) As Long
		' The worst case is when every character is U+7F or below.
		Return byteCount
	End Function

	' Returns a pointer to the UTF-8 byte order mark.
	Override Function GetPreamble() As *Byte
		GetPreamble = bom
	End Function

	' Returns the size in bytes of the UTF-8 byte order mark.
	Override Function GetPreambleLength() As Long
		GetPreambleLength = Len(bom)
	End Function
/*
	Override Function BodyName() As String
		Return "utf-8"
	End Function

	Override Function HeaderName() As String
		Return "utf-8"
	End Function

	Override Function EncodingName() As String
		Return "UTF-8"
	End Function

	Override Function WebName() As String
		Return "utf-8"
	End Function

	Override Function IsSingleByte() As Boolean
		Return False
	End Function
*/
Protected
	' The four *Core overrides below have empty bodies: not implemented yet.
	Override Function GetBytesCountCore(src As *WCHAR, srcCount As Long) As Long
	End Function

	Override Function GetBytesCore(src As *WCHAR, srcCount As Long, dst As *Byte, dstCount As Long) As Long
	End Function

	Override Function GetCharsCountCore(src As *Byte, srcCount As Long) As Long
	End Function

	Override Function GetCharsCore(src As *Byte, srcCount As Long, dst As *WCHAR, dstCount As Long) As Long
	End Function

Private
	' UTF-8 byte order mark: EF BB BF
	Static bom[2] = [&hEF, &hBB, &hBF] As Byte
End Class
214
215End Namespace 'Text
216End Namespace 'System
Note: See TracBrowser for help on using the repository browser.