As far as I know, Excel use UTF-16 to represent string literals. I read from a console (Mac) / file (Windows), and in both cases the character encoding is messed up. I have to find a solution which works on both platforms, so ADO stream is not an option. I made some debugging and I see that actual bytes are:
Bytes | Displayed as | Should be | Correct byte 258,129 | Ă | Á | 193 258,356 | ĂŤ | Í | 205 313,176 | Ű | Ű | 219 313,144 | Ĺ | Ő | 213 258,347 | Ăś | Ü | 220 258,8211 | Ă– | Ö | 214 258,353 | Ăš | Ú | 218 258,8220 | Ă“ | Ó | 211 258,8240 | É | É | 201
(Comes from the good-old hungarian test-phrase, ÁRVÍZTŰRŐ TÜKÖRFÚRÓGÉP which contains all of our special characters). I am looking for an algorithm which results in the correct string both on Mac and Windows. Thanks!
None of the answers posted so far will correctly transcode an input string containing codepoints from the full Unicode range like for example "😀👩👩🦲👩👩👩👧👦🦲👩👧👦🦲👧👦UnicodeSupport𐀀est😀👩👩👩👧👦💁🏼♀️🧔🏻♂️👩❤️👨🏃🏻♀️".
That's why I wrote the following function, using only VBA inbuilt functions/statements that are available on both, Windows and MacOS.
This function works cross-platform and cross-app, and for the entire Unicode range.
codePoints > 65535
are also supported, even though VBAs inbuilt ChrW()
and AscW
don't support them, because the transcoding is done entirely "manually", including surrogate pairs. Performance should also be relatively good since the function works on a single byte-array buffer. If someone finds a bug or improvement, please let me know!
This code was improved as a result of this answer on CodeReview, many thanks to Cristian Buse for that!
'Function transcoding an UTF-8 encoded string to the VBA-native UTF-16-LE
'Author: Guido Witt-Dörring, https://stackoverflow.com/a/75787820/12287457
' https://github.com/guwidoe/VBA-StringTools
Public Function DecodeUTF8(ByRef utf8Str As String, _
Optional ByVal raiseErrors As Boolean = False) As String
Const methodName As String = "DecodeUTF8"
Dim i As Long
Dim numBytesOfCodePoint As Byte
Static numBytesOfCodePoints(0 To 255) As Byte
Static mask(2 To 4) As Long
Static minCp(2 To 4) As Long
If numBytesOfCodePoints(0) = 0 Then
For i = &H0& To &H7F&: numBytesOfCodePoints(i) = 1: Next i '0xxxxxxx
'110xxxxx - C0 and C1 are invalid (overlong encoding)
For i = &HC2& To &HDF&: numBytesOfCodePoints(i) = 2: Next i
For i = &HE0& To &HEF&: numBytesOfCodePoints(i) = 3: Next i '1110xxxx
'11110xxx - 11110100, 11110101+ (= &HF5+) outside of valid Unicode range
For i = &HF0& To &HF4&: numBytesOfCodePoints(i) = 4: Next i
For i = 2 To 4: mask(i) = (2 ^ (7 - i) - 1): Next i
minCp(2) = &H80&: minCp(3) = &H800&: minCp(4) = &H10000
End If
Dim codepoint As Long
Dim currByte As Byte
Dim utf8() As Byte: utf8 = utf8Str
Dim utf16() As Byte: ReDim utf16(0 To (UBound(utf8) - LBound(utf8) + 1) * 2)
Dim j As Long: j = 0
Dim k As Long
i = LBound(utf8)
Do While i <= UBound(utf8)
codepoint = utf8(i)
numBytesOfCodePoint = numBytesOfCodePoints(codepoint)
If numBytesOfCodePoint = 0 Then
If raiseErrors Then Err.Raise 5, methodName, "Invalid byte"
GoTo insertErrChar
ElseIf numBytesOfCodePoint = 1 Then
utf16(j) = codepoint
j = j + 2
ElseIf i + numBytesOfCodePoint - 1 > UBound(utf8) Then
If raiseErrors Then Err.Raise 5, methodName, _
"Incomplete UTF-8 codepoint at end of string."
GoTo insertErrChar
Else
codepoint = utf8(i) And mask(numBytesOfCodePoint)
For k = 1 To numBytesOfCodePoint - 1
currByte = utf8(i + k)
If (currByte And &HC0&) = &H80& Then
codepoint = (codepoint * &H40&) + (currByte And &H3F)
Else
If raiseErrors Then _
Err.Raise 5, methodName, "Invalid continuation byte"
GoTo insertErrChar
End If
Next k
'Convert the Unicode codepoint to UTF-16LE bytes
If codepoint < minCp(numBytesOfCodePoint) Then
If raiseErrors Then Err.Raise 5, methodName, "Overlong encoding"
GoTo insertErrChar
ElseIf codepoint < &HD800& Then
utf16(j) = codepoint And &HFF&
utf16(j + 1) = codepoint \ &H100&
j = j + 2
ElseIf codepoint < &HE000& Then
If raiseErrors Then Err.Raise 5, methodName, _
"Invalid Unicode codepoint.(Range reserved for surrogate pairs)"
GoTo insertErrChar
ElseIf codepoint < &H10000 Then
If codepoint = &HFEFF& Then GoTo nextCp '(BOM - will be ignored)
utf16(j) = codepoint And &HFF&
utf16(j + 1) = codepoint \ &H100&
j = j + 2
ElseIf codepoint < &H110000 Then 'Calculate surrogate pair
Dim m As Long: m = codepoint - &H10000
Dim loSurrogate As Long: loSurrogate = &HDC00& Or (m And &H3FF)
Dim hiSurrogate As Long: hiSurrogate = &HD800& Or (m \ &H400&)
utf16(j) = hiSurrogate And &HFF&
utf16(j + 1) = hiSurrogate \ &H100&
utf16(j + 2) = loSurrogate And &HFF&
utf16(j + 3) = loSurrogate \ &H100&
j = j + 4
Else
If raiseErrors Then Err.Raise 5, methodName, _
"Codepoint outside of valid Unicode range"
insertErrChar: utf16(j) = &HFD
utf16(j + 1) = &HFF
j = j + 2
If numBytesOfCodePoint = 0 Then numBytesOfCodePoint = 1
End If
End If
nextCp: i = i + numBytesOfCodePoint 'Move to the next UTF-8 codepoint
Loop
DecodeUTF8 = MidB$(utf16, 1, j)
End Function
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With