
Make a utf-8 string shorter with a utf-32 encoding in Javascript?

I'm trying to find a way to compress/decompress a string in JavaScript. By compress I mean to make the string look shorter (fewer characters). That's my goal.

Here's an example of how things should work:

// The string that I want to make shorter
// It will only contain [a-zA-Z0-9] chars and some punctuation like ()[]{}.,;'"!
var string = "I like bananas !";

// The compressed string, maybe something like "䐓㐛꯱字",
// which is shorter than the original
var shortString = compress(string);  

// The original string, "I like bananas !"
var originalString = decompress(shortString);

Here's my first idea (maybe there's a better way to get to my goal, and if so I'm interested in it).

I know that my original string will be in UTF-8. So I'm thinking of using UTF-32 for the encoding, which should divide the length of the string by 4.

But I don't know how to write these two functions that build new strings with a different encoding. Here's the code I have so far, which doesn't work...

function compress(string) {
    string = unescape(encodeURIComponent(string));
    var newString = '';

    for (var i = 0; i < string.length; i++) {
        var char = string.charCodeAt(i);
        newString += parseInt(char, 8).toString(32);
    }

    return newString;
}
Asked Oct 30 '17 by Thomas


1 Answer

Since you're using a set of fewer than 100 characters and JavaScript strings are encoded in UTF-16 (which means you have 65,536 possible characters), what you can do is concatenate the character codes so as to have one "compressed" character per two original characters. This lets you compress strings to half their length.

Like this for example:

document.getElementById('compressBtn').addEventListener('click', function() {
  var stringToCompress = document.getElementById('tocompress').value;
  var compressedString = compress(stringToCompress);
  var decompressedString = decompress(compressedString);

  if (stringToCompress === decompressedString) {
    document.getElementById('display').innerHTML = stringToCompress + ", length of " + stringToCompress.length  + " characters compressed to " + compressedString + ", length of " + compressedString.length + " characters back to " + decompressedString;
  } else {
    document.getElementById('display').innerHTML = "This string cannot be compressed"
  }

})


function compress(string) {
  string = unescape(encodeURIComponent(string));
  var newString = '',
    char, nextChar, combinedCharCode;

  for (var i = 0; i < string.length; i += 2) {
    char = string.charCodeAt(i);

    if ((i + 1) < string.length) {

      // Subtract 31 so the second character's code becomes a two-digit number at most;
      // a three-digit second part could push the combined code past 65535. The
      // limitation: anything with a char code under 32 would break this scheme,
      // but no such characters are in the basic character set used here.
      nextChar = string.charCodeAt(i + 1) - 31;

      // Pad the second code to two digits; a single-digit code would make
      // decompression ambiguous
      combinedCharCode = char + "" + nextChar.toLocaleString('en', {
        minimumIntegerDigits: 2
      });

      // Convert the concatenated code string back to a number, then to a character
      newString += String.fromCharCode(parseInt(combinedCharCode, 10));

    } else {

      // Odd-length input leaves a trailing character; keep it as-is
      newString += string.charAt(i);
    }
  }
  return newString;
}

function decompress(string) {

  var newString = '',
    char, codeStr, firstCharCode, lastCharCode;

  for (var i = 0; i < string.length; i++) {
    char = string.charCodeAt(i);
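    // Packed codes are always above this threshold (at least 3201 for the allowed
    // alphabet), while an untouched trailing character stays below it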
    if (char > 132) {
      codeStr = char.toString(10);

      // The leading digits of the compressed code are the first character's code
      firstCharCode = parseInt(codeStr.substring(0, codeStr.length - 2), 10);

      // For the second one, add back the 31 that compress() subtracted
      lastCharCode = parseInt(codeStr.substring(codeStr.length - 2, codeStr.length), 10) + 31;

      // You put back the 2 characters you had originally
      newString += String.fromCharCode(firstCharCode) + String.fromCharCode(lastCharCode);
    } else {
      newString += string.charAt(i);
    }
  }
  return newString;
}

var stringToCompress = 'I like bananas!';
var compressedString = compress(stringToCompress);
var decompressedString = decompress(compressedString);

document.getElementById('display').innerHTML = stringToCompress + ", length of " + stringToCompress.length  + " characters compressed to " + compressedString + ", length of " + compressedString.length + " characters back to " + decompressedString;
body {
  padding: 10px;
}

#tocompress {
  width: 200px;
}
<input id="tocompress" placeholder="enter string to compress" />
<button id="compressBtn">
  Compress input
</button>
<div id="display">

</div>

Regarding the possible use of UTF-32 to compress further: I might be wrong on this, but from my understanding it's not feasible. Here's why:

The approach above basically concatenates two 1-byte values into one 2-byte value. This is possible because JavaScript strings are encoded with 2 bytes (16 bits) per character (note that, from what I understand, the engine may decide to store them differently internally, which could make this compression pointless from a purely memory-space point of view; that being said, in the end, one character counts as 16 bits of string length). A cleaner way to do the compression above would in fact be to use binary numbers instead of decimal, which makes much more sense. Like this for example:

document.getElementById('compressBtn').addEventListener('click', function() {
  var stringToCompress = document.getElementById('tocompress').value;
  var compressedString = compress(stringToCompress);
  var decompressedString = decompress(compressedString);

  if (stringToCompress === decompressedString) {
    document.getElementById('display').innerHTML = stringToCompress + ", length of " + stringToCompress.length  + " characters compressed to " + compressedString + ", length of " + compressedString.length + " characters back to " + decompressedString;
  } else {
    document.getElementById('display').innerHTML = "This string cannot be compressed"
  }

})


function compress(string) {
  string = unescape(encodeURIComponent(string));
  var newString = '',
    char, nextChar, combinedCharCode;

  for (var i = 0; i < string.length; i += 2) {
  
    // Convert to a binary string instead of keeping the decimal value
    char = string.charCodeAt(i).toString(2);

    if ((i + 1) < string.length) {

     
      nextChar = string.charCodeAt(i + 1).toString(2);
     

      // you still need padding, see this answer https://stackoverflow.com/questions/27641812/way-to-add-leading-zeroes-to-binary-string-in-javascript
      combinedCharCode = "0000000".substr(char.length) + char + "" + "0000000".substr(nextChar.length) + nextChar;

      // Parse the concatenated binary string back into a number, then convert it to a character
      newString += String.fromCharCode(parseInt(combinedCharCode, 2));

    } else {

      // Odd-length input leaves a trailing character; keep it as-is
      newString += string.charAt(i);
    }
  }
  return newString;
}

function decompress(string) {

  var newString = '',
    char, codeStr, firstCharCode, lastCharCode;

  for (var i = 0; i < string.length; i++) {
    char = string.charCodeAt(i);
    if (char > 132) {
      codeStr = char.toString(2);

      // Everything before the last 7 bits is the first character's code
      firstCharCode = parseInt(codeStr.substring(0, codeStr.length - 7), 2);

      // The last 7 bits are the second character's code
      lastCharCode = parseInt(codeStr.substring(codeStr.length - 7, codeStr.length), 2);

      // You put back the 2 characters you had originally
      newString += String.fromCharCode(firstCharCode) + String.fromCharCode(lastCharCode);
    } else {
      newString += string.charAt(i);
    }
  }
  return newString;
}

var stringToCompress = 'I like bananas!';
var compressedString = compress(stringToCompress);
var decompressedString = decompress(compressedString);

document.getElementById('display').innerHTML = stringToCompress + ", length of " + stringToCompress.length  + " characters compressed to " + compressedString + ", length of " + compressedString.length + " characters back to " + decompressedString;
<input id="tocompress" placeholder="enter string to compress" />
<button id="compressBtn">
  Compress input
</button>
<div id="display">

</div>

So why not push the logic further and use UTF-32, which is 4 bytes, meaning four 1-byte characters per code point? One problem is that JavaScript strings use 2-byte code units. It's true that you can use pairs of 16-bit code units (surrogate pairs) to represent characters above U+FFFF. Like this:

document.getElementById('test').innerHTML = "\uD834\uDD1E";
<div id="test"></div>

But if you test the length of the resulting string, you'll see that it's 2, even though there's only one visible "character". So from a JavaScript perspective, you're not reducing the actual string length.
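
For instance, here's a minimal sketch (just a console check, using the newer codePointAt and spread features) showing that .length counts UTF-16 code units rather than code points:

// U+1D11E (musical symbol G clef) is a single code point stored as a surrogate pair
var s = "\uD834\uDD1E";

console.log(s.length);          // 2  -> .length counts UTF-16 code units
console.log(s.codePointAt(0));  // 119070 (0x1D11E), the one underlying code point
console.log([...s].length);     // 1  -> spreading a string iterates by code point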

The other thing is that UTF-32 has in fact 2^21 characters. See here: https://en.wikipedia.org/wiki/UTF-32

It is a protocol to encode Unicode code points that uses exactly 32 bits per Unicode code point (but a number of leading bits must be zero as there are fewer than 2^21 Unicode code points)

So you don't really have 4 usable bytes; in fact you don't even have 3, which is what you'd need to pack three 1-byte characters. So UTF-32 doesn't seem to be a way to compress even more, and since JavaScript has native 2-byte strings, packing two characters per code unit seems to me to be the most efficient, at least with this approach.
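
To put rough numbers on that reasoning, here's a back-of-the-envelope sketch, assuming the alphabet from the question fits in 7 bits per character:

// [a-zA-Z0-9] plus the listed punctuation all have char codes below 128
var bitsPerInputChar = 7;
// what one unit of .length represents in a JavaScript string
var bitsPerUtf16Unit = 16;

// Two 7-bit codes (14 bits) fit in a single 16-bit code unit:
console.log(2 * bitsPerInputChar <= bitsPerUtf16Unit); // true

// Three 7-bit codes (21 bits) do not, and any code point above U+FFFF is stored
// as a surrogate pair, i.e. two units of .length, so nothing would be gained:
console.log(3 * bitsPerInputChar <= bitsPerUtf16Unit); // false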

Answered Oct 15 '22 by Julien Grégoire