/*! @file Classes/System/Text/UTF8Encoding.ab @brief UTF8Encodingクラスとそれに関係するクラスなどの宣言・定義 */ Namespace System Namespace Text Namespace Detail Class UTF8Encoder Inherits Encoder Protected Override Sub ConvertCore(chars As *WCHAR, charCount As Long, bytes As *Byte, byteCount As Long, flush As Boolean, ByRef bytesUsed As Long, ByRef charsUsed As Long, ByRef completed As Boolean) Dim i As Long, j = 0 As Long For i = 0 To ELM(charCount) If chars[i] < &h80 Then '1バイト変換 If j + 1 > byteCount Then 'バッファ不足 Goto *BufferOver End If bytes[j] = chars[i] As Byte j++ ElseIf chars[i] < &h800 Then '2バイト変換 If j + 2 > byteCount Then Goto *BufferOver End If bytes[j] = ((chars[i] >> 6) Or &hC0) As Byte j++ bytes[j] = (chars[i] And &h3F Or &h80) As Byte j++ ElseIf _System_IsHighSurrogate(chars[i]) Then If i + 1 >= charCount Then 'バッファに貯め込む If flush = False Then buffer = chars[i] Exit Sub End If 'ToDo: chars[i + 1]が範囲外になる場合が考慮されていない ElseIf _System_IsLowSurrogate(chars[i + 1]) = False Then 'EncoderFallback End If If j + 4 > byteCount Then Goto *BufferOver End If 'UTF-16列からUnicodeコードポイントを復元 Dim c = (((chars[i] And &h3FF) As DWord << 10) Or (chars[i + 1] And &h3FF)) + &h10000 '4バイト変換 bytes[j] = ((c >> 18) Or &hf0) As Byte j++ bytes[j] = ((c >> 12) And &h3F Or &h80) As Byte j++ bytes[j] = ((c >> 6) And &h3F Or &h80) As Byte j++ bytes[j] = (c And &h3F Or &h80) As Byte j++ i++ ElseIf _System_IsLowSurrogate(chars[i]) Then 'EncoderFallback Else '3バイト変換 If j + 3 > byteCount Then Goto *BufferOver End If bytes[j] = ((chars[i] >> 12) Or &hE0) As Byte j++ bytes[j] = ((chars[i] >> 6) And &h3F Or &h80) As Byte j++ bytes[j] = (chars[i] And &h3F Or &h80) As Byte j++ End If Next Exit Sub *BufferOver 'バッファ不足 Throw New ArgumentException("Buffer is not enough.", "bytes") End Sub Private buffer As WCHAR End Class Class UTF8Decoder Inherits Decoder Protected Override Sub ConvertCore(bytes As *Byte, byteCount As Long, chars As *WCHAR, charCount As Long, flush As Boolean, ByRef bytesUsed As Long, ByRef charsUsed As Long, ByRef completed As Boolean) Dim i As Long, j = 0 As Long For i = 0 To ELM(byteCount) If state = 0 Then If bytes[i] <= &h80 Then '1バイト変換 If j = charCount Then Goto *BufferOver chars[j] = bytes[i] j++ ElseIf bytes[i] < &hC0 Then 'マルチバイトの2バイト目以降 'DecoderFallback完成までの暫定 If j = charCount Then Goto *BufferOver chars[j] = &hfffd j++ ElseIf bytes[i] < &hD0 Then '2バイト文字の始まり last = 2 buf = bytes[i] And &h3f state++ ElseIf bytes[i] < &hF0 Then '3バイト文字の始まり last = 3 buf = bytes[i] And &h1f state++ Else '4バイト文字の始まり last = 4 buf = bytes[i] And &h0f state++ End If Else If &h80 <= bytes[i] And bytes[i] < &hC0 Then 'マルチバイト文字の2バイト目以降 buf <<= 6 buf Or= bytes[i] And &h3F state++ If state = last Then '最終バイトに到達 If state = 2 And buf >= &h80 Then chars[j] = buf As WCHAR j++ ElseIf state = 3 And buf >= &h800 And buf < &hD800 And &hE0000 >= buf Then chars[j] = buf As WCHAR j++ ElseIf state = 4 And buf <= &h10ffff Then buf -= &h10000 chars[j] = (&hD800 Or (buf >> 10)) As WCHAR j++ chars[j] = (&hDC00 Or (buf And &h3FF)) As WCHAR j++ Else 'DecoderFallback If j = charCount Then Goto *BufferOver chars[j] = &hfffd j++ End If state = 0 End If Else '3, 4バイト文字の先頭 'DecoderFallback If j = charCount Then Goto *BufferOver chars[j] = &hfffd j++ End If End If Next Exit Sub *BufferOver 'バッファ不足 Throw New ArgumentException("Buffer is not enough.", "bytes") End Sub Private buf As DWord state As Long last As Long End Class End Namespace 'Detail /*! @brief UTF-8用のEncoding @date 2007/12/21 @auther Egtra */ Class UTF8Encoding Inherits Encoding Public Override Function Clone() As Object Dim c = New UTF8Encoding c.DecoderFallback = This.DecoderFallback c.EncoderFallback = This.EncoderFallback Return c End Function Override Function GetDecoder() As Decoder GetDecoder = New Detail.UTF8Decoder ' GetDecoder.Fallback = DecoderFallback End Function Override Function GetEncoder() As Encoder GetEncoder = New Detail.UTF8Encoder ' GetEncoder.Fallback = EncoderFallback End Function Override Function GetMaxByteCount(charCount As Long) As Long Return charCount * 3 '全てがUTF-8で3バイトになる文字の場合が最大。 'UTF-8で4バイトになる列は、UTF-16だとサロゲートペアで表現するので、 '1単位あたりでは2バイトしか食わないことになる。 End Function Override Function GetMaxCharCount(byteCount As Long) As Long '全てU+7F以下の文字だけだった場合 Return byteCount End Function Protected Override Function GetBytesCountCore(s As *WCHAR, n As Long) As Long End Function Override Function GetBytesCore(chars As *WCHAR, charCount As Long, bytes As *Byte, byteCount As Long) As Long End Function Override Function GetCharsCountCore(s As *Byte, n As Long) As Long End Function Override Function GetCharsCore(bytes As *Byte, byteCount As Long, chars As *WCHAR, charCount As Long) As Long End Function Public Override Function GetPreamble() As *Byte Return bom End Function Override Function GetPreambleLength() As Long Return Len(bom) End Function Override Function IsAlwaysNormalized() As Boolean IsAlwaysNormalized = False End Function Override Function IsAlwaysNormalized(f As NormalizationForm) As Boolean IsAlwaysNormalized = False End Function Override Function BodyName() As String Return "utf-8" End Function Override Function HeaderName() As String Return "utf-8" End Function Override Function EncodingName() As String Return "UTF-8" End Function Override Function WebName() As String Return "utf-8" End Function Override Function IsSingleByte() As Boolean Return False End Function Private Static bom[2] = [&hEF, &hBB, &hBF] As Byte End Class End Namespace 'Text End Namespace 'System