I have a function here to truncate a given string to a given byte length:
LENGTH_BY_PREFIX = [
(0xC0, 2), # first byte mask, total codepoint length
(0xE0, 3),
(0xF0, 4),
(0xF8, 5),
(0xFC, 6),
]
def codepoint_length(first_byte):
if first_byte < 128:
return 1 # ASCII
for mask, length in LENGTH_BY_PREFIX:
if first_byte & mask == mask:
return length
assert False, 'Invalid byte %r' % first_byte
def cut_string_to_bytes_length(unicode_text, byte_limit):
utf8_bytes = unicode_text.encode('UTF-8')
cut_index = 0
while cut_index < len(utf8_bytes):
step = codepoint_length(ord(utf8_bytes[cut_index]))
if cut_index + step > byte_limit:
# can't go a whole codepoint further, time to cut
return utf8_bytes[:cut_index]
else:
cut_index += step
# length limit is longer than our bytes strung, so no cutting
return utf8_bytes
This seemed to work fine until the question of Emoji was introduced:
string = u"\ud83d\ude14"
trunc = cut_string_to_bytes_length(string, 100)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "<console>", line 5, in cut_string_to_bytes_length
File "<console>", line 7, in codepoint_length
AssertionError: Invalid byte 152
Can anyone explain exactly what is going on here, and what a possible solution is?
Edit: I have another code snippet here that doesn't throw an exception, but has weird behavior sometimes:
import encodings
_incr_encoder = encodings.search_function('utf8').incrementalencoder()
def utf8_byte_truncate(text, max_bytes):
""" truncate utf-8 text string to no more than max_bytes long """
byte_len = 0
_incr_encoder.reset()
for index,ch in enumerate(text):
byte_len += len(_incr_encoder.encode(ch))
if byte_len > max_bytes:
break
else:
return text
return text[:index]
>>> string = u"\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14"
>>> print string
(prints a set of 5 Apple Emoji...)๐๐๐๐๐
>>> len(string)
10
>>> trunc = utf8_byte_truncate(string, 4)
>>> print trunc
???
>>> len(trunc)
1
So with this second example, I have a string of 10 bytes, truncate it to 4, but something weird happens, and the result is a string of size 1 byte.
The algorithm is wrong as @jwpat7 indicated. A simpler algorithm is the following, but note some perceived single characters (called graphemes) are made up of more than one Unicode code point such as ๐จโ๐ฉโ๐งโ๐ฆ. This doesn't attempt to maintain graphemes.
# NOTE: This is Python 2 to match OP's code
# s = u'\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14'
# Same as above
s = u'\U0001f614' * 5 # Unicode character U+1F614
def utf8_lead_byte(b):
'''A UTF-8 intermediate byte starts with the bits 10xxxxxx.'''
# (b & 0xC0) != 0x80 # Python 3 no need for ord()
return (ord(b) & 0xC0) != 0x80
def utf8_byte_truncate(text, max_bytes):
'''If text[max_bytes] is not a lead byte, back up until a lead byte is
found and truncate before that character.'''
utf8 = text.encode('utf8')
if len(utf8) <= max_bytes:
return utf8
i = max_bytes
while i > 0 and not utf8_lead_byte(utf8[i]):
i -= 1
return utf8[:i]
# test for various max_bytes:
for m in range(len(s.encode('utf8'))+1):
b = utf8_byte_truncate(s,m)
print m,len(b),b.decode('utf8')
###Output
0 0
1 0
2 0
3 0
4 4 ๐
5 4 ๐
6 4 ๐
7 4 ๐
8 8 ๐๐
9 8 ๐๐
10 8 ๐๐
11 8 ๐๐
12 12 ๐๐๐
13 12 ๐๐๐
14 12 ๐๐๐
15 12 ๐๐๐
16 16 ๐๐๐๐
17 16 ๐๐๐๐
18 16 ๐๐๐๐
19 16 ๐๐๐๐
20 20 ๐๐๐๐๐
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With