I am interacting with an api that accepts strings that are a maximum 5KB in size.
I want to take a string that may be more than 5KB and break it into chunks less than 5KB in size.
I then intend to pass each smaller-than-5kb-string
to the api endpoint, and perform further actions when all requests have finished, probably using something like:
await Promise.all([get_thing_from_api(string_1), get_thing_from_api(string_2), get_thing_from_api(string_3)])
I have read that a single character can take between 1 and 4 bytes when the string is encoded as UTF-8.
For this reason, to calculate string length in bytes we can use:
// in Node: byte length of the string when encoded as UTF-8 (Buffer.byteLength's default encoding)
Buffer.byteLength("here is some text");
// in browser JavaScript: a Blob stores string parts as UTF-8, so .size is the byte length
new Blob(["here is some text"]).size
Source:
https://stackoverflow.com/a/56026151
https://stackoverflow.com/a/52254083
My searches for "how to split strings into chunks of a certain size"
return results that relate to splitting a string into strings of a particular character length, not byte length, eg:
// Splits by character count (pieces of at most 2 characters), not by byte length.
var my_string = "1234 5 678905";
console.log(my_string.match(/.{1,2}/g));
// ["12", "34", " 5", " 6", "78", "90", "5"]
Source:
https://stackoverflow.com/a/7033662
https://stackoverflow.com/a/6259543
https://gist.github.com/hendriklammers/5231994
Question
Is there a way to split a string into strings of a particular byte length?
I could either:
but would prefer a more accurate solution.
I would be interested to know of both Node and plain JavaScript solutions, if they exist.
EDIT
This approach to calculating byteLength
might be helpful - by iterating over characters in a string, getting their character code and incrementing byteLength
accordingly:
/**
 * Returns the number of bytes a string occupies when encoded as UTF-8.
 *
 * Starts from the UTF-16 length (one byte per code unit) and adds the extra
 * bytes that each code unit needs. A valid surrogate pair is counted as
 * 4 bytes in total; an unpaired surrogate is counted as 3 bytes, which is the
 * size of the U+FFFD replacement character that UTF-8 encoders emit for it.
 *
 * @param {string} str - the string to measure
 * @returns {number} the UTF-8 byte length of str
 */
function byteLength(str) {
  let bytes = str.length;
  for (let i = str.length - 1; i >= 0; i--) {
    const code = str.charCodeAt(i);
    if (code > 0x7f && code <= 0x7ff) {
      bytes += 1;
    } else if (code > 0x7ff && code <= 0xffff) {
      bytes += 2;
    }
    // A trail surrogate already accounts for the whole 4-byte pair
    // (2 code units + 2 extra bytes), so skip its lead surrogate. Only skip
    // when the previous unit really is a lead surrogate; otherwise an
    // unrelated character would be dropped from the count.
    if (code >= 0xdc00 && code <= 0xdfff && i > 0) {
      const previous = str.charCodeAt(i - 1);
      if (previous >= 0xd800 && previous <= 0xdbff) {
        i--;
      }
    }
  }
  return bytes;
}
Source: https://stackoverflow.com/a/23329386
which led me to interesting experiments into the underlying data structures of Buffer:
// Experiment: inspect the bytes behind a string whose last character ('ф')
// is encoded as two bytes (d1 84), as the dump below shows.
var buf = Buffer.from('Hey! ф');
// <Buffer 48 65 79 21 20 d1 84>
buf.length // 7 (bytes, one more than the 6 characters of the string)
buf.toString().charCodeAt(0) // 72
buf.toString().charCodeAt(5) // 1092 ('ф' is one character in the decoded string)
buf.toString().charCodeAt(6) // NaN (the decoded string has no index 6)
buf[0] // 72
// Indexing the buffer yields the individual byte values.
for (let i = 0; i < buf.length; i++) {
console.log(buf[i]);
}
// 72 101 121 33 32 209 132 undefined
// NOTE(review): the trailing "undefined" is presumably the REPL echoing the loop's value, not a byte — confirm
buf.slice(0,5).toString() // 'Hey! '
buf.slice(0,6).toString() // 'Hey! �' (cut between the two bytes of 'ф', so it cannot be decoded)
buf.slice(0,7).toString() // 'Hey! ф'
but as @trincot pointed out in the comments, what is the correct way to handle multibyte characters? And how could I ensure chunks were split on spaces (so as not to 'break apart' a word?)
More info on Buffer: https://nodejs.org/api/buffer.html#buffer_buffer
EDIT
In case it helps anyone else understand the brilliant logic in the accepted answer, the snippet below is a heavily commented version I made so I could understand it better.
/**
 * Takes a string and returns an array of substrings of at most maxBytes
 * UTF-8 bytes each, splitting only on space characters.
 *
 * This is an overly commented, trace-logging version of the non-generator
 * version of the accepted answer, in case it helps anyone understand its logic.
 *
 * Both plain js and node variations are shown below - simply un/comment out your preference
 *
 * The space a split happens on is dropped from the output. A run of bytes
 * without any space that is longer than maxBytes is never broken apart, so in
 * that edge case a chunk exceeds maxBytes, eg chunk("123456789x 1", 9).
 *
 * @param {string} s - the string to be chunked
 * @param {number} maxBytes - the maximum size of a chunk, in bytes
 * @return {string[]} - the chunks, in their original order
 */
function chunk(s, maxBytes) {
  // for plain js
  const decoder = new TextDecoder("utf-8");
  let buf = new TextEncoder().encode(s);
  // for node
  // let buf = Buffer.from(s);
  const result = [];
  let counter = 0;
  while (buf.length) {
    console.log(`=============== BEG LOOP ${counter} ===============`);
    console.log("result is now:");
    console.log(result);
    console.log("buf is now:");
    // for plain js
    console.log(decoder.decode(buf));
    // for node
    // console.log(buf.toString());
    let i;
    if (buf.length <= maxBytes) {
      /* everything that is left fits into one chunk, so take it all
      instead of splitting it needlessly at its last space */
      i = buf.length;
    } else {
      /* get index of the last space character in the first chunk,
      searching backwards from index maxBytes: cutting at index i keeps
      bytes 0..i-1, ie i bytes, so i may be at most maxBytes */
      i = buf.lastIndexOf(32, maxBytes);
    }
    console.log(`i is: ${i}`);
    /* if no space is found in the first chunk,
    get index of the first space character after it,
    searching forwards from maxBytes - in edge cases where characters
    between spaces exceeds maxBytes, eg chunk("123456789x 1", 9),
    the chunk will exceed maxBytes */
    if (i < 0) i = buf.indexOf(32, maxBytes);
    console.log(`at first condition, i is: ${i}`);
    /* if there's no space at all, take the whole string,
    again an edge case like chunk("123456789x", 9) will exceed maxBytes */
    if (i < 0) i = buf.length;
    console.log(`at second condition, i is: ${i}`);
    // this is a safe cut-off point; never half-way a multi-byte character,
    // because i is either the index of a space (every byte of a multi-byte
    // UTF-8 sequence is above 127, so byte 32 is always a real space)
    // or the end of the buffer
    console.log(`pushing buf.slice from 0 to ${i} into result array`);
    // for plain js
    result.push(decoder.decode(buf.slice(0, i)));
    // for node
    // result.push(buf.slice(0, i).toString());
    console.log(`buf.slicing with value: ${i + 1}`);
    // continue after index i: when i is a space this skips it,
    // when i is the end of the buffer the result is empty and the loop ends
    buf = buf.slice(i + 1); // skip space (if any)
    console.log(`=============== END LOOP ${counter} ===============`);
    counter++;
  }
  return result;
}
console.log(chunk("Hey there! € 100 to pay", 12));
Solution: To split a byte string into a list of lines—each line being a byte string itself—use the bytes.split(delimiter) method with the bytes newline character b'\n' as the delimiter.
Python's split() method splits a string into chunks and accepts an optional argument called the separator. A separator can be any character or symbol. If no separator is given, the string is split on whitespace by default.
The string split() method breaks a given string around matches of the given regular expression. After splitting against the given regular expression, this method returns a string array.
Using Buffer
seems indeed the right direction. Given that:
Buffer
prototype has indexOf
and lastIndexOf
methods, and... you can proceed as follows:
/**
 * Splits a string into chunks of at most maxBytes UTF-8 bytes each,
 * cutting only at space characters (Node version, based on Buffer).
 *
 * The space a split happens on is dropped from the output. A run of bytes
 * without any space that is longer than maxBytes is never broken apart, so
 * in that edge case a chunk exceeds maxBytes.
 *
 * @param {string} s - the string to split
 * @param {number} maxBytes - the maximum size of a chunk, in bytes
 * @returns {string[]} the chunks, in their original order
 */
function chunk(s, maxBytes) {
  let buf = Buffer.from(s);
  const result = [];
  while (buf.length) {
    // If the remainder fits, take it whole rather than splitting at its last
    // space. Otherwise search backwards from index maxBytes: cutting at
    // index i keeps i bytes, so i may be at most maxBytes.
    let i = buf.length <= maxBytes ? buf.length : buf.lastIndexOf(32, maxBytes);
    // If no space found, try forward search
    if (i < 0) i = buf.indexOf(32, maxBytes);
    // If there's no space at all, take the whole string
    if (i < 0) i = buf.length;
    // This is a safe cut-off point; never half-way a multi-byte character,
    // because i is the index of a space or the end of the buffer
    result.push(buf.subarray(0, i).toString());
    buf = buf.subarray(i + 1); // Skip space (if any)
  }
  return result;
}
console.log(chunk("Hey there! € 100 to pay", 12));
// -> [ 'Hey there!', '€ 100 to', 'pay' ]
You can consider extending this to also look for TAB, LF, or CR as split-characters. If so, and your input text can have CRLF sequences, you would need to detect those as well to avoid getting orphaned CR or LF characters in the chunks.
You can turn the above function into a generator, so that you control when you want to start the processing for getting the next chunk:
/**
 * Lazily splits a string into chunks of at most maxBytes UTF-8 bytes each,
 * cutting only at space characters (Node version, based on Buffer).
 *
 * The space a split happens on is dropped from the output. A run of bytes
 * without any space that is longer than maxBytes is never broken apart, so
 * in that edge case a chunk exceeds maxBytes.
 *
 * @param {string} s - the string to split
 * @param {number} maxBytes - the maximum size of a chunk, in bytes
 * @yields {string} the next chunk, in original order
 */
function* chunk(s, maxBytes) {
  let buf = Buffer.from(s);
  while (buf.length) {
    // If the remainder fits, take it whole rather than splitting at its last
    // space. Otherwise search backwards from index maxBytes: cutting at
    // index i keeps i bytes, so i may be at most maxBytes.
    let i = buf.length <= maxBytes ? buf.length : buf.lastIndexOf(32, maxBytes);
    // If no space found, try forward search
    if (i < 0) i = buf.indexOf(32, maxBytes);
    // If there's no space at all, take all
    if (i < 0) i = buf.length;
    // This is a safe cut-off point; never half-way a multi-byte character,
    // because i is the index of a space or the end of the buffer
    yield buf.subarray(0, i).toString();
    buf = buf.subarray(i + 1); // Skip space (if any)
  }
}
for (const s of chunk("Hey there! € 100 to pay", 12)) console.log(s);
Buffer
is specific to Node. Browsers however implement TextEncoder
and TextDecoder
, which leads to similar code:
/**
 * Lazily splits a string into chunks of at most maxBytes UTF-8 bytes each,
 * cutting only at space characters (browser version, based on
 * TextEncoder / TextDecoder).
 *
 * The space a split happens on is dropped from the output. A run of bytes
 * without any space that is longer than maxBytes is never broken apart, so
 * in that edge case a chunk exceeds maxBytes.
 *
 * @param {string} s - the string to split
 * @param {number} maxBytes - the maximum size of a chunk, in bytes
 * @yields {string} the next chunk, in original order
 */
function* chunk(s, maxBytes) {
  const decoder = new TextDecoder("utf-8");
  let buf = new TextEncoder().encode(s);
  while (buf.length) {
    // If the remainder fits, take it whole rather than splitting at its last
    // space. Otherwise search backwards from index maxBytes: cutting at
    // index i keeps i bytes, so i may be at most maxBytes.
    let i = buf.length <= maxBytes ? buf.length : buf.lastIndexOf(32, maxBytes);
    // If no space found, try forward search
    if (i < 0) i = buf.indexOf(32, maxBytes);
    // If there's no space at all, take all
    if (i < 0) i = buf.length;
    // This is a safe cut-off point; never half-way a multi-byte character,
    // because i is the index of a space or the end of the buffer.
    // subarray creates views, so the bytes are not copied on every iteration.
    yield decoder.decode(buf.subarray(0, i));
    buf = buf.subarray(i + 1); // Skip space (if any)
  }
}
for (const s of chunk("Hey there! € 100 to pay", 12)) console.log(s);
If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With