/*! @file Classes/System/Text/UTF8Encoding.ab @brief UTF8Encodingクラスとそれに関係するクラスなどの宣言・定義 */ Namespace System Namespace Text Namespace Detail Class UTF8Encoder Inherits Encoder Protected Override Sub EncodeImpl(src As *WCHAR, size As SIZE_T, s As IO.Stream, last As Boolean) Dim i As Long For i = 0 To ELM(size) If buffer <> 0 Then If _System_IsLowSurrogate(src[i]) Then 'UTF-16列からUnicodeコードポイントを復元 Dim c = (((buffer And &h3FF) As DWord << 10) Or (src[i] And &h3FF)) + &h10000 '4バイト変換 s.WriteByte(((c >> 18) Or &hf0) As Byte) s.WriteByte(((c >> 12) And &h3F Or &h80) As Byte) s.WriteByte(((c >> 6) And &h3F Or &h80) As Byte) s.WriteByte((c And &h3F Or &h80) As Byte) Else writeReplacementChar(s) End If buffer = 0 ElseIf src[i] < &h80 Then '1バイト変換 s.WriteByte(src[i] As Byte) ElseIf src[i] < &h800 Then '2バイト変換 s.WriteByte(((src[i] >> 6) Or &hC0) As Byte) s.WriteByte((src[i] And &h3F Or &h80) As Byte) ElseIf _System_IsHighSurrogate(src[i]) Then 'バッファに貯め込む buffer = src[i] ElseIf _System_IsLowSurrogate(src[i]) Then writeReplacementChar(s) Else '3バイト変換 s.WriteByte(((src[i] >> 12) Or &hE0) As Byte) s.WriteByte(((src[i] >> 6) And &h3F Or &h80) As Byte) s.WriteByte((src[i] And &h3F Or &h80) As Byte) End If Next End Sub Private ' U+FFFD Replacement CharacterのUTF-8表現、EF BF BDを書き込む。 Sub writeReplacementChar(s As IO.Stream) Dim rc[2] = [&hef, &hbf, &hbd] As Byte s.Write(rc, 0, Len(rc)) End Sub buffer As WCHAR End Class Class UTF8Decoder Inherits Decoder Protected Override Function DecodeImpl(dst As Collections.Generic.List, s As IO.Stream) As Boolean Dim i As Long For i = 0 To DefalultDecodingBufferSize - 1 'ELM Dim b = s.ReadByte() If b = -1 Then DecodeImpl = False Exit Function ElseIf state = 0 Then If b <= &h80 Then '1バイト変換 dst.Add(b As WCHAR) ElseIf b < &hC0 Then '先頭バイトがなく、いきなりマルチバイトの2バイト目以降 dst.Add(&hFFFD As WCHAR) ElseIf b < &hD0 Then '2バイト文字の始まり last = 2 buffer = b And &h3F state++ ElseIf b < &hF0 Then '3バイト文字の始まり last = 3 buffer = b And &h1F state++ ElseIf b < &hF8 Then '4バイト文字の始まり last = 4 buffer = b And &h0F state++ Else '現在のUTF-8は4バイトを超える表現を認めていない。 dst.Add(&hFFFD As WCHAR) End If Else If &h80 <= b And b < &hC0 Then 'マルチバイト文字の2バイト目以降 buffer <<= 6 buffer Or= b And &h3F state++ If state = last Then '最終バイトに到達 If state = 2 And buffer >= &h80 Then dst.Add(buffer As WCHAR) ElseIf state = 3 And buffer >= &h800 And buffer < &hD800 And &hE0000 >= buffer Then dst.Add(buffer As WCHAR) ElseIf state = 4 And buffer <= &h10FFFF Then buffer -= &h10000 dst.Add((&hD800 Or (buffer >> 10)) As WCHAR) dst.Add((&hDC00 Or (buffer And &h3FF)) As WCHAR) Else '最短形式でないもの、または4バイト形式で10FFFFを超えるコードポイントのもの dst.Add(&hfffd As WCHAR) End If state = 0 End If Else 'マルチバイト文字の途中なのに、それ以外のバイトが現れた場合 dst.Add(&hFFFD As WCHAR) state = 0 End If End If Next DecodeImpl = True End Function Private buffer As DWord state As Long last As Long End Class End Namespace 'Detail /*! @brief UTF-8用のEncoding @date 2007/12/21 @auther Egtra */ Class UTF8Encoding Inherits Encoding Public Override Function Clone() As Object Clone = New UTF8Encoding End Function Override Function GetDecoder() As Decoder GetDecoder = New Detail.UTF8Decoder End Function Override Function GetEncoder() As Encoder GetEncoder = New Detail.UTF8Encoder End Function Override Function GetMaxByteCount(charCount As Long) As Long GetMaxByteCount = charCount * 3 '全てがUTF-8で3バイトになる文字の場合が最大。 'なお、UTF-8で4バイトになる列は、UTF-16だとサロゲートペアで表現するので、 '1単位あたりでは2バイトしか食わないことになり、最大ではない。 End Function Override Function GetMaxCharCount(byteCount As Long) As Long '全てU+7F以下の文字だけだった場合 GetMaxCharCount = byteCount End Function Override Function GetPreamble() As *Byte Return bom End Function Override Function GetPreambleLength() As Long Return Len(bom) End Function /* Override Function BodyName() As String Return "utf-8" End Function Override Function HeaderName() As String Return "utf-8" End Function Override Function EncodingName() As String Return "UTF-8" End Function Override Function WebName() As String Return "utf-8" End Function Override Function IsSingleByte() As Boolean Return False End Function */ Private Static bom[2] = [&hEF, &hBB, &hBF] As Byte End Class End Namespace 'Text End Namespace 'System