Changeset 676 for trunk/ab5.0/ablib/src/Classes/System/Text/UTF8Encoding.ab
- Timestamp: Jan 13, 2009, 2:01:38 AM (15 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/ab5.0/ablib/src/Classes/System/Text/UTF8Encoding.ab
r411 r676 12 12 Inherits Encoder 13 13 Protected 14 Override Sub ConvertCore(chars As *WCHAR, charCount As Long, bytes As *Byte, byteCount As Long, flush As Boolean, 15 ByRef bytesUsed As Long, ByRef charsUsed As Long, ByRef completed As Boolean) 16 17 Dim i As Long, j = 0 As Long 18 For i = 0 To ELM(charCount) 19 If chars[i] < &h80 Then 14 Override Sub EncodeImpl(src As *WCHAR, size As SIZE_T, s As IO.Stream, last As Boolean) 15 Dim i As Long 16 For i = 0 To ELM(size) 17 If buffer <> 0 Then 18 If _System_IsLowSurrogate(src[i]) Then 19 'UTF-16列からUnicodeコードポイントを復元 20 Dim c = (((buffer And &h3FF) As DWord << 10) Or (src[i] And &h3FF)) + &h10000 21 '4バイト変換 22 s.WriteByte(((c >> 18) Or &hf0) As Byte) 23 s.WriteByte(((c >> 12) And &h3F Or &h80) As Byte) 24 s.WriteByte(((c >> 6) And &h3F Or &h80) As Byte) 25 s.WriteByte((c And &h3F Or &h80) As Byte) 26 Else 27 writeReplacementChar(s) 28 End If 29 buffer = 0 30 ElseIf src[i] < &h80 Then 20 31 '1バイト変換 21 If j + 1 > byteCount Then 22 'バッファ不足 23 Goto *BufferOver 24 End If 25 bytes[j] = chars[i] As Byte 26 j++ 27 ElseIf chars[i] < &h800 Then 32 s.WriteByte(src[i] As Byte) 33 ElseIf src[i] < &h800 Then 28 34 '2バイト変換 29 If j + 2 > byteCount Then 30 Goto *BufferOver 31 End If 32 bytes[j] = ((chars[i] >> 6) Or &hC0) As Byte 33 j++ 34 bytes[j] = (chars[i] And &h3F Or &h80) As Byte 35 j++ 36 ElseIf _System_IsHighSurrogate(chars[i]) Then 37 If i + 1 >= charCount Then 38 'バッファに貯め込む 39 If flush = False Then 40 buffer = chars[i] 41 Exit Sub 42 End If 43 'ToDo: chars[i + 1]が範囲外になる場合が考慮されていない 44 ElseIf _System_IsLowSurrogate(chars[i + 1]) = False Then 45 'EncoderFallback 46 End If 47 If j + 4 > byteCount Then 48 Goto *BufferOver 49 End If 50 'UTF-16列からUnicodeコードポイントを復元 51 Dim c = (((chars[i] And &h3FF) As DWord << 10) Or (chars[i + 1] And &h3FF)) + &h10000 52 '4バイト変換 53 bytes[j] = ((c >> 18) Or &hf0) As Byte 54 j++ 55 bytes[j] = ((c >> 12) And &h3F Or &h80) As Byte 56 j++ 57 bytes[j] = ((c >> 6) And &h3F Or &h80) As Byte 58 j++ 59 
bytes[j] = (c And &h3F Or &h80) As Byte 60 j++ 61 i++ 62 ElseIf _System_IsLowSurrogate(chars[i]) Then 63 'EncoderFallback 35 s.WriteByte(((src[i] >> 6) Or &hC0) As Byte) 36 s.WriteByte((src[i] And &h3F Or &h80) As Byte) 37 ElseIf _System_IsHighSurrogate(src[i]) Then 38 'バッファに貯め込む 39 buffer = src[i] 40 ElseIf _System_IsLowSurrogate(src[i]) Then 41 writeReplacementChar(s) 64 42 Else 65 43 '3バイト変換 66 If j + 3 > byteCount Then 67 Goto *BufferOver 68 End If 69 bytes[j] = ((chars[i] >> 12) Or &hE0) As Byte 70 j++ 71 bytes[j] = ((chars[i] >> 6) And &h3F Or &h80) As Byte 72 j++ 73 bytes[j] = (chars[i] And &h3F Or &h80) As Byte 74 j++ 44 s.WriteByte(((src[i] >> 12) Or &hE0) As Byte) 45 s.WriteByte(((src[i] >> 6) And &h3F Or &h80) As Byte) 46 s.WriteByte((src[i] And &h3F Or &h80) As Byte) 75 47 End If 76 48 Next 77 78 Exit Sub79 *BufferOver80 'バッファ不足81 Throw New ArgumentException("Buffer is not enough.", "bytes")82 49 End Sub 83 50 84 51 Private 52 ' U+FFFD Replacement CharacterのUTF-8表現、EF BF BDを書き込む。 53 Sub writeReplacementChar(s As IO.Stream) 54 Dim rc[2] = [&hef, &hbf, &hbd] As Byte 55 s.Write(rc, 0, Len(rc)) 56 End Sub 57 85 58 buffer As WCHAR 86 59 End Class … … 89 62 Inherits Decoder 90 63 Protected 91 Override Sub ConvertCore(bytes As *Byte, byteCount As Long, chars As *WCHAR, charCount As Long, flush As Boolean, 92 ByRef bytesUsed As Long, ByRef charsUsed As Long, ByRef completed As Boolean) 93 Dim i As Long, j = 0 As Long 94 For i = 0 To ELM(byteCount) 95 If state = 0 Then 96 If bytes[i] <= &h80 Then 64 Override Function DecodeImpl(dst As Collections.Generic.List<WCHAR>, s As IO.Stream) As Boolean 65 Dim i As Long 66 For i = 0 To DefalultDecodingBufferSize - 1 'ELM 67 Dim b = s.ReadByte() 68 If b = -1 Then 69 DecodeImpl = False 70 Exit Function 71 ElseIf state = 0 Then 72 If b <= &h80 Then 97 73 '1バイト変換 98 If j = charCount Then Goto *BufferOver 99 chars[j] = bytes[i] 100 j++ 101 ElseIf bytes[i] < &hC0 Then 102 'マルチバイトの2バイト目以降 103 'DecoderFallback完成までの暫定 104 If j = 
charCount Then Goto *BufferOver 105 chars[j] = &hfffd 106 j++ 107 ElseIf bytes[i] < &hD0 Then 74 dst.Add(b As WCHAR) 75 ElseIf b < &hC0 Then 76 '先頭バイトがなく、いきなりマルチバイトの2バイト目以降 77 dst.Add(&hFFFD As WCHAR) 78 ElseIf b < &hD0 Then 108 79 '2バイト文字の始まり 109 80 last = 2 110 buf = bytes[i] And &h3f111 state++ 112 ElseIf b ytes[i]< &hF0 Then81 buffer = b And &h3F 82 state++ 83 ElseIf b < &hF0 Then 113 84 '3バイト文字の始まり 114 85 last = 3 115 buf = bytes[i] And &h1f116 state++ 117 Else 86 buffer = b And &h1F 87 state++ 88 ElseIf b < &hF8 Then 118 89 '4バイト文字の始まり 119 90 last = 4 120 buf = bytes[i] And &h0f 121 state++ 91 buffer = b And &h0F 92 state++ 93 Else 94 '現在のUTF-8は4バイトを超える表現を認めていない。 95 dst.Add(&hFFFD As WCHAR) 122 96 End If 123 97 Else 124 If &h80 <= b ytes[i] And bytes[i]< &hC0 Then98 If &h80 <= b And b < &hC0 Then 125 99 'マルチバイト文字の2バイト目以降 126 buf <<= 6127 buf Or= bytes[i]And &h3F100 buffer <<= 6 101 buffer Or= b And &h3F 128 102 state++ 129 103 If state = last Then '最終バイトに到達 130 If state = 2 And buf >= &h80 Then 131 chars[j] = buf As WCHAR 132 j++ 133 ElseIf state = 3 And buf >= &h800 And buf < &hD800 And &hE0000 >= buf Then 134 chars[j] = buf As WCHAR 135 j++ 136 ElseIf state = 4 And buf <= &h10ffff Then 137 buf -= &h10000 138 chars[j] = (&hD800 Or (buf >> 10)) As WCHAR 139 j++ 140 chars[j] = (&hDC00 Or (buf And &h3FF)) As WCHAR 141 j++ 104 If state = 2 And buffer >= &h80 Then 105 dst.Add(buffer As WCHAR) 106 ElseIf state = 3 And buffer >= &h800 And buffer < &hD800 And &hE0000 >= buffer Then 107 dst.Add(buffer As WCHAR) 108 ElseIf state = 4 And buffer <= &h10FFFF Then 109 buffer -= &h10000 110 dst.Add((&hD800 Or (buffer >> 10)) As WCHAR) 111 dst.Add((&hDC00 Or (buffer And &h3FF)) As WCHAR) 142 112 Else 143 'DecoderFallback 144 If j = charCount Then Goto *BufferOver 145 chars[j] = &hfffd 146 j++ 113 '最短形式でないもの、4バイト形式で10FFFFを超えるコードポイントのもの 114 dst.Add(&hfffd As WCHAR) 147 115 End If 148 116 state = 0 149 117 End If 150 118 Else 151 '3, 4バイト文字の先頭 152 'DecoderFallback 153 If j = 
charCount Then Goto *BufferOver 154 chars[j] = &hfffd 155 j++ 119 'マルチバイト文字の途中なのに、それ以外のバイトが現れた場合 120 dst.Add(&hFFFD As WCHAR) 121 state = 0 156 122 End If 157 123 End If 158 124 Next 159 Exit Sub 160 *BufferOver 161 'バッファ不足 162 Throw New ArgumentException("Buffer is not enough.", "bytes") 163 End Sub 125 DecodeImpl = True 126 End Function 164 127 165 128 Private 166 buf As DWord129 buffer As DWord 167 130 state As Long 168 131 last As Long … … 180 143 Inherits Encoding 181 144 Public 182 183 145 Override Function Clone() As Object 184 Dim c = New UTF8Encoding 185 c.DecoderFallback = This.DecoderFallback 186 c.EncoderFallback = This.EncoderFallback 187 Return c 146 Clone = New UTF8Encoding 188 147 End Function 189 148 190 149 Override Function GetDecoder() As Decoder 191 150 GetDecoder = New Detail.UTF8Decoder 192 ' GetDecoder.Fallback = DecoderFallback193 151 End Function 194 152 195 153 Override Function GetEncoder() As Encoder 196 154 GetEncoder = New Detail.UTF8Encoder 197 ' GetEncoder.Fallback = EncoderFallback198 155 End Function 199 156 200 157 Override Function GetMaxByteCount(charCount As Long) As Long 201 ReturncharCount * 3158 GetMaxByteCount = charCount * 3 202 159 '全てがUTF-8で3バイトになる文字の場合が最大。 203 160 204 ' UTF-8で4バイトになる列は、UTF-16だとサロゲートペアで表現するので、205 '1単位あたりでは2バイトしか食わないことにな る。161 'なお、UTF-8で4バイトになる列は、UTF-16だとサロゲートペアで表現するので、 162 '1単位あたりでは2バイトしか食わないことになり、最大ではない。 206 163 End Function 207 164 208 165 Override Function GetMaxCharCount(byteCount As Long) As Long 209 166 '全てU+7F以下の文字だけだった場合 210 Return byteCount 211 End Function 212 Protected 213 Override Function GetBytesCountCore(s As *WCHAR, n As Long) As Long 214 End Function 215 216 Override Function GetBytesCore(chars As *WCHAR, charCount As Long, bytes As *Byte, byteCount As Long) As Long 217 End Function 218 219 Override Function GetCharsCountCore(s As *Byte, n As Long) As Long 220 End Function 221 222 Override Function GetCharsCore(bytes As *Byte, byteCount As Long, chars As *WCHAR, charCount As Long) As 
Long 223 End Function 224 Public 167 GetMaxCharCount = byteCount 168 End Function 169 225 170 Override Function GetPreamble() As *Byte 226 171 Return bom … … 230 175 Return Len(bom) 231 176 End Function 232 233 Override Function IsAlwaysNormalized() As Boolean 234 IsAlwaysNormalized = False 235 End Function 236 237 Override Function IsAlwaysNormalized(f As NormalizationForm) As Boolean 238 IsAlwaysNormalized = False 239 End Function 240 177 /* 241 178 Override Function BodyName() As String 242 179 Return "utf-8" … … 258 195 Return False 259 196 End Function 197 */ 198 Protected 199 Override Function GetBytesCountCore(src As *WCHAR, srcCount As Long) As Long 200 End Function 201 202 Override Function GetBytesCore(src As *WCHAR, srcCount As Long, dst As *Byte, dstCount As Long) As Long 203 End Function 204 205 Override Function GetCharsCountCore(src As *Byte, srcCount As Long) As Long 206 End Function 207 208 Override Function GetCharsCore(src As *Byte, srcCount As Long, dst As *WCHAR, dstCount As Long) As Long 209 End Function 210 260 211 Private 261 212 Static bom[2] = [&hEF, &hBB, &hBF] As Byte
Note: See TracChangeset for help on using the changeset viewer.