Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Reading Emojis through a pipe in C

Tags:

c

utf-8

pipe

emoji

I have a pipe with an endless amount of strings being written to it. These strings are a mix of ASCII and Emojis. The problem I am having is I am reading them like this

// 100-byte buffer; at most 99 bytes are read so one byte is always left for
// the terminator.
char msg[100];
// read() hands back raw bytes with no regard for character boundaries, so a
// multi-byte UTF-8 sequence can be split between two calls.
// NOTE(review): read() returns -1 on error, which would make the next line
// write to msg[-1] - confirm the caller checks for that.
int length = read(fd,&msg,99);
// Terminate after the bytes actually read so msg can be used as a C string.
msg[length] =0;

But sometimes the emoji, which I'm guessing is multibyte, gets cut in half, and then when I print to the screen I get the diamond question mark (the unknown UTF-8 symbol).

If anyone knows how to prevent this please fill me in; I've been searching for a while now.

like image 641
JRowan Avatar asked Jan 02 '20 23:01

JRowan


3 Answers

If you're reading chunks of bytes, and want to output chunks of UTF-8, you'll have to do at least some minimal UTF-8 decoding yourself. The simplest condition to check for is look at each byte (let's call it b) and see if it is a continuation byte:

// Keep only the top two bits of b (mask 0xC0) and compare them with 0x80:
// the test is true exactly when b has the bit pattern 10xxxxxx.
bool is_cont = (0x80 == (0xC0 & b));

Any byte that is not a continuation starts a sequence, which continues until the next non-continuation byte. You'll need a 4-byte buffer to hold the chunks.

like image 169
Lee Daniel Crocker Avatar answered Nov 17 '22 18:11

Lee Daniel Crocker


The hint provided by Lee Daniel Crocker is a good way to check whether a particular byte is a UTF-8 continuation byte.

Along with this, you need some more logic. When you find a partial UTF-8 sequence at the end of your stream, you need to look back in the stream (here, the buffer) to locate the start position of this partial sequence.

Once you find the start position of this partial UTF-8 code sequence, store the partial sequence, remove it from your buffer, and process the buffer. Prepend the partial sequence to the buffer of the next read cycle. This allows you to recombine a UTF-8 code sequence that was split across two read() operations.

Below is sample code for testing and validation.

App.c

// gcc -Wall app.c

#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* Run flag: starts at 1 and is cleared by signal_handler(). */
volatile sig_atomic_t g_process_run = 1;

/*
 * Signal handler: clears the run flag. The signal number itself is not
 * inspected.
 */
void signal_handler(int signal)
{
  (void)signal;
  g_process_run = 0;
}

/*
 * Child side of the demo: streams the contents of "a.txt" into the write end
 * of the pipe in pseudo-randomly sized chunks, one chunk per second, wrapping
 * back to the start of the file when the next chunk would run past its end.
 * The chunk sizes ignore character boundaries, so multi-byte UTF-8 sequences
 * in the file can be split across writes.
 *
 * pipe: descriptor pair from pipe(); pipe[0] is closed immediately, pipe[1]
 *       is written to and closed before returning.
 *
 * Returns 0 once g_process_run has been cleared, or -1 if the file cannot be
 * opened, stat'ed or mapped, is empty, or a write fails while still running.
 */
int child_process(int *pipe) {
  static const int chars_to_send[] = {95, 97, 99, 100, 101, 103, 104, 105,
                                      95, 97, 99, 100, 101, 103, 104, 105};
  enum { CHUNK_SIZES = sizeof chars_to_send / sizeof chars_to_send[0] };

  int rc = -1;
  int fd = -1;
  char *addr = MAP_FAILED;
  off_t file_size = 0;
  off_t start_address = 0;
  struct stat sb;

  close(pipe[0]); // close read pipe
  srand(1234);    // fixed seed: the chunk sequence is reproducible

  fd = open("a.txt", O_RDONLY);
  if (fd == -1) {
    printf("Child: can't open file\n");
    goto out;
  }
  if (fstat(fd, &sb) == -1) {
    printf("Child: can't get file stat\n");
    goto out;
  }
  file_size = sb.st_size;
  if (file_size <= 0) {
    // mmap() rejects a zero length and there would be nothing to send.
    printf("Child: file is empty\n");
    goto out;
  }
  addr = mmap(NULL, (size_t)file_size, PROT_READ, MAP_PRIVATE, fd, 0);
  if (addr == MAP_FAILED) {
    printf("Child:mmap failed\n");
    goto out;
  }

  rc = 0;
  while (g_process_run != 0) {
    // Dividing by RAND_MAX + 1 (not RAND_MAX) keeps the index in
    // [0, CHUNK_SIZES) even when rand() returns RAND_MAX itself.
    int index = (int)(((long long)rand() * CHUNK_SIZES) /
                      ((long long)RAND_MAX + 1));
    off_t len = chars_to_send[index];
    if (len > file_size) {
      len = file_size; // never read past the end of a short file
    }
    if (start_address + len > file_size) {
      start_address = 0;
    }
    ssize_t written = write(pipe[1], &addr[start_address], (size_t)len);
    if (written < 0) {
      // A failure after the run flag was cleared is just the shutdown
      // signal interrupting the call; anything else is a real error.
      if (g_process_run != 0) {
        printf("Child: write failed\n");
        rc = -1;
      }
      break;
    }
    start_address += written; // a short write resumes where it stopped
    sleep(1);
  }

out:
  if (addr != MAP_FAILED) {
    munmap(addr, (size_t)file_size);
  }
  if (fd != -1) {
    close(fd);
  }
  close(pipe[1]);
  if (rc == 0) {
    printf("child process exiting\n");
  }
  return rc;
}
/*
 * Returns how many bytes at the end of buf[0..len) belong to a UTF-8
 * sequence whose remaining bytes have not arrived yet (0 to 3).
 *
 * Walks back over at most three continuation bytes (10xxxxxx) to the nearest
 * lead byte and compares the length announced by that lead byte with the
 * number of bytes actually present. Malformed tails (no lead byte within
 * four bytes, or an invalid lead byte) are reported as 0 so they are passed
 * through instead of being held back forever.
 */
static int utf8_incomplete_tail(const char *buf, int len) {
  for (int back = 1; back <= 4 && back <= len; ++back) {
    unsigned char c = (unsigned char)buf[len - back];
    if ((c & 0xC0) == 0x80) {
      continue; // continuation byte: keep looking for the lead byte
    }
    int need = 1; // ASCII or invalid lead byte: complete on its own
    if ((c & 0xE0) == 0xC0) {
      need = 2;
    } else if ((c & 0xF0) == 0xE0) {
      need = 3;
    } else if ((c & 0xF8) == 0xF0) {
      need = 4;
    }
    return need > back ? back : 0;
  }
  return 0;
}

/*
 * Parent side of the demo: reads from the pipe and prints each chunk, cut so
 * that it never ends in the middle of a UTF-8 sequence. Bytes of a sequence
 * that is still incomplete at the end of a read are kept at the start of the
 * buffer and completed by the next read.
 *
 * pipe: descriptor pair from pipe(); pipe[1] is closed immediately, pipe[0]
 *       is read until end of file, a read error, or g_process_run is cleared.
 *
 * Waits for a child process before returning. Always returns 0.
 */
int parent_process(int *pipe) {
  enum { BUFF_SIZE = 99 };
  char buff[BUFF_SIZE];
  int pending = 0; // bytes of an unfinished sequence kept at buff[0..pending)

  close(pipe[1]); // close write pipe
  while (g_process_run != 0) {
    ssize_t got = read(pipe[0], &buff[pending], (size_t)(BUFF_SIZE - pending));
    if (got <= 0) {
      // 0 means the writer closed the pipe; negative is an error (or the
      // shutdown signal interrupting the call). Either way there is nothing
      // more to read.
      if (got < 0 && g_process_run != 0) {
        printf("Parent: read failed\n");
      }
      break;
    }

    int len = pending + (int)got;
    pending = utf8_incomplete_tail(buff, len);
    int complete = len - pending;

    // The precision limits the output to the complete part, so the buffer
    // needs no terminator and the held-back bytes stay untouched.
    printf("Parent:%.*s\n", complete, buff);

    if (pending > 0) { // move the partial sequence to the start of the
                       // buffer so the next read appends to it
      printf("will resume with %d partial bytes\n", pending);
      for (int i = 0; i < pending; ++i) {
        buff[i] = buff[complete + i];
      }
    }
  }
  close(pipe[0]);
  wait(NULL);
  printf("parent process exiting\n");
  return 0;
}
/*
 * Registers signal_handler for SIGINT.
 * Returns 0 on success, -1 when signal() reports SIG_ERR.
 */
int init_signal() {
  return (signal(SIGINT, signal_handler) == SIG_ERR) ? -1 : 0;
}

/*
 * Sets up the SIGINT handler and a pipe, then forks: the child runs
 * child_process() and the parent runs parent_process() on the same pipe.
 * Returns -1 if any setup step fails, otherwise the result of the role that
 * was run.
 */
int main(int argc, char **argv) {
  int pipefd[2];
  pid_t pid;

  if (init_signal() != 0) {
    return -1;
  }
  if (pipe(pipefd) == -1) {
    printf("can't create pipe\n");
    return -1;
  }

  pid = fork();
  switch (pid) {
  case -1:
    printf("Can't fork process\n");
    return -1;
  case 0: // child process
    return child_process(pipefd);
  default: // parent process
    return parent_process(pipefd);
  }
}

a.txt

12abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️3

You can find this code and test file here.

like image 2
Manthan Tilva Avatar answered Nov 17 '22 17:11

Manthan Tilva


I would go with something like that:

#include <stdio.h>
#include <unistd.h>

// Maximum number of bytes handled per pass: the tail carried over from the
// previous read plus the freshly read data.
#define BUFFER_LENGTH   53

/*
 * Default output callback: prints message followed by a carriage return
 * ('\r', 0x0d).
 */
void print_function(char* message) {
        printf("%s\r", message);
}

/*
 * Reads from the descriptor `pipe` until end of input and hands the data to
 * print_func as NUL-terminated chunks that do not end in the middle of a
 * UTF-8 character.
 *
 * Every byte that is not a continuation byte (10xxxxxx) starts a character,
 * so everything before the last such byte in the buffer is complete and is
 * emitted; the remainder is carried over and prepended to the next read.
 * print_func may be called with an empty string.
 *
 * At end of input, or when read() fails (reported through perror), the
 * carried-over bytes are emitted as they are and the function returns.
 */
void read_pipe(int pipe, void (*print_func)(char*))
{
        // A character is one lead byte plus at most three continuation
        // bytes; a longer tail can only come from malformed input.
        enum { MAX_CHAR_BYTES = 4 };

        // One spare byte so the final flush can always be NUL-terminated.
        char message[1 + BUFFER_LENGTH];
        char to_print[1 + BUFFER_LENGTH];
        int pending = 0;        // bytes carried over at message[0..pending)

        for (;;)
        {
                int bytes_read = (int)read(pipe, message + pending, BUFFER_LENGTH - pending);
                if (bytes_read <= 0)
                {
                        if (bytes_read < 0)
                        {
                                perror("read_pipe: read");
                        }

                        // print remaining bytes
                        message[pending] = '\0';
                        print_func(message);
                        break;
                }

                // add bytes remained from previous run
                int total = pending + bytes_read;

                // position of the last byte that starts a character
                int boundary = 0;
                for (int i = 0; i != total; ++i)
                {
                        if (0x80 != (0xc0 & message[i]))
                        {
                                boundary = i;
                        }
                }

                // Malformed input: do not let an over-long tail pile up until
                // it fills the buffer (the next read would then ask for zero
                // bytes and be mistaken for end of input). Pass it through.
                if (total - boundary > MAX_CHAR_BYTES)
                {
                        boundary = total;
                }

                // copy complete characters to to_print and print them
                for (int i = 0; i != boundary; ++i)
                {
                        to_print[i] = message[i];
                }
                to_print[boundary] = '\0';
                print_func(to_print);

                // move tail to the beginning of the input buffer
                pending = total - boundary;
                for (int i = 0; i != pending; ++i)
                {
                        message[i] = message[boundary + i];
                }
        }
}

// Entry point: feeds standard input through read_pipe, using print_function
// as the output callback.
int main()
{
        void (*printer)(char*) = print_function;

        read_pipe(STDIN_FILENO, printer);
        return 0;
}

Here read_pipe keeps reading from the passed pipe descriptor until end of input and prints the data using the passed print_func function.

The idea is to read a buffer from the pipe and then copy only complete characters to the print buffer (condition courtesy of Lee Daniel Crocker), on the assumption that the input is a valid UTF-8 sequence. If the buffer ends with the tail of an incomplete UTF-8 character, that tail is used as the beginning of the next portion of data. We loop this way until the end of the pipe.

For simplicity I use stdin as a pipe descriptor. To run and test:

gcc -Wall main.c -o run && perl -e 'print "\xf0\x9f\x98\xab"x1000;' > test.txt && ./run < test.txt > output.txt

P.S. Another approach would be to get character length as described here: UTF-8 Continuation bytes:

#include <stdio.h>
#include <unistd.h>

// Maximum number of bytes handled per pass: the tail carried over from the
// previous read plus the freshly read data.
#define BUFFER_LENGTH   53

/*
 * Default output callback: prints message followed by a newline
 * ('\n', 0x0a).
 */
void print_function(char* message) {
        printf("%s\n", message);
}

/*
 * Number of bytes in the UTF-8 sequence announced by `lead`: 2, 3 or 4 for
 * the lead-byte patterns 110xxxxx, 1110xxxx and 11110xxx, and 1 for
 * everything else (ASCII, stray continuation bytes, invalid lead bytes), so
 * that unexpected bytes are passed through one at a time.
 */
static int utf8_sequence_length(unsigned char lead)
{
        if (0xc0 == (0xe0 & lead))
        {
                return 2;
        }
        if (0xe0 == (0xf0 & lead))
        {
                return 3;
        }
        if (0xf0 == (0xf8 & lead))
        {
                return 4;
        }
        return 1;
}

/*
 * Reads from the descriptor `pipe` until end of input and hands the data to
 * print_func as NUL-terminated chunks that do not end in the middle of a
 * UTF-8 character.
 *
 * The buffer is walked character by character using the length encoded in
 * each lead byte. A character whose announced length runs past the data
 * received so far is not emitted; its bytes are carried over and prepended
 * to the next read. print_func may be called with an empty string.
 *
 * At end of input, or when read() fails (reported through perror), the
 * carried-over bytes are emitted as they are and the function returns.
 */
void read_pipe(int pipe, void (*print_func)(char*))
{
        char message[BUFFER_LENGTH];
        char to_print[1 + BUFFER_LENGTH];
        int pending = 0;        // bytes carried over at message[0..pending), at most 3

        for (;;)
        {
                int bytes_read = (int)read(pipe, message + pending, BUFFER_LENGTH - pending);
                if (bytes_read <= 0)
                {
                        if (bytes_read < 0)
                        {
                                perror("read_pipe: read");
                        }

                        // print remaining bytes
                        message[pending] = '\0';
                        print_func(message);
                        break;
                }

                // add bytes remained from previous run
                int total = pending + bytes_read;

                // advance over every character that is completely present
                int char_p = 0;
                while (char_p != total)
                {
                        int length = utf8_sequence_length((unsigned char)message[char_p]);
                        if (char_p + length > total)
                        {
                                break;
                        }
                        char_p += length;
                }

                // copy complete characters to to_print and print them
                for (int i = 0; i != char_p; ++i)
                {
                        to_print[i] = message[i];
                }
                to_print[char_p] = '\0';
                print_func(to_print);

                // move tail to the beginning of the input buffer
                pending = total - char_p;
                for (int i = 0; i != pending; ++i)
                {
                        message[i] = message[char_p + i];
                }
        }
}

// Entry point: feeds standard input through read_pipe, using print_function
// as the output callback.
int main()
{
        void (*printer)(char*) = print_function;

        read_pipe(STDIN_FILENO, printer);
        return 0;
}
like image 1
Anton Gorev Avatar answered Nov 17 '22 16:11

Anton Gorev