I have a pipe to which an endless stream of strings is being written. These strings are a mix of ASCII and emoji. The problem is that I am reading them like this:
char msg[100];
int length = read(fd,&msg,99);
msg[length] =0;
But sometimes an emoji, which I'm guessing is multibyte, gets cut in half between two reads, and then when I print to the screen I get the diamond question mark (the replacement symbol shown for invalid UTF-8).
If anyone knows how to prevent this please fill me in; I've been searching for a while now.
If you're reading chunks of bytes, and want to output chunks of UTF-8, you'll have to do at least some minimal UTF-8 decoding yourself. The simplest condition to check for is to look at each byte (let's call it b) and see whether it is a continuation byte:
// Masking with 0xC0 keeps the top two bits of b; a UTF-8 continuation byte
// has the bit pattern 10xxxxxx, so those two bits must equal 0x80.
bool is_cont = (0x80 == (0xC0 & b));
Any byte that is not a continuation starts a sequence, which continues until the next non-continuation byte. You'll need a 4-byte buffer to hold the chunks.
The hint provided by lee-daniel-crocker is a good way to check whether a particular byte is a continuation byte of a multibyte UTF-8 sequence.
Along with this you need some more logic. When you find a partial UTF-8 sequence at the end of your stream, you need to look back in the stream (here, the buffer) to locate the start position of this partial sequence.
Once you find the start position of this partial UTF-8 sequence, store the partial sequence, remove it from your buffer, and process the buffer. Prepend the partial sequence to the buffer of the next read cycle. This allows you to recombine a UTF-8 sequence that was split across two read() operations.
Below is sample code for testing and validation.
App.c
// gcc -Wall app.c
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
// Run flag: starts at 1 and is set to 0 by signal_handler.
volatile sig_atomic_t g_process_run = 1;

// Signal handler: clears the run flag. The signal number itself is not used.
void signal_handler(int signal)
{
    (void)signal;
    g_process_run = 0;
}
// Child side of the test harness: maps "a.txt" and writes it to the write end
// of the pipe in chunks whose sizes are picked pseudo-randomly from a fixed
// table, one chunk per second, until g_process_run is cleared. The chunk
// sizes ignore character boundaries, so multibyte sequences get split.
//
// pipe: descriptor pair from pipe(); pipe[0] is closed, pipe[1] is written to
//       and closed before returning.
// Returns 0 after a normal shutdown, -1 if the file cannot be opened, stat'ed
// or mapped, or if a write fails while the process is still meant to run.
int child_process(int *pipe) {
    static const int chars_to_send[] = {95, 97, 99, 100, 101, 103, 104, 105,
                                        95, 97, 99, 100, 101, 103, 104, 105};
    // int chars_to_send[] = {6, 7, 8, 9,12,14,15,16};
    const long long choices = sizeof chars_to_send / sizeof chars_to_send[0];
    int rc = -1;
    int fd = -1;
    char *addr = MAP_FAILED;
    size_t file_size = 0;
    size_t start_address = 0;
    struct stat sb;

    close(pipe[0]); // close read pipe
    srand(1234);

    fd = open("a.txt", O_RDONLY);
    if (fd == -1) {
        printf("Child: can't open file\n");
        goto out;
    }
    if (fstat(fd, &sb) == -1) {
        printf("Child: can't get file stat\n");
        goto out;
    }
    file_size = (size_t)sb.st_size;
    addr = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (addr == MAP_FAILED) {
        printf("Child:mmap failed");
        goto out;
    }

    rc = 0;
    while (g_process_run != 0) {
        // Scale rand() into [0, choices). Dividing by RAND_MAX + 1 (not
        // RAND_MAX) keeps the index in range even when rand() == RAND_MAX,
        // and long long avoids overflow where long is 32 bits wide.
        long long index = (long long)rand() * choices / ((long long)RAND_MAX + 1);
        size_t len = (size_t)chars_to_send[index];

        if (len > file_size) {
            len = file_size; // never read past the end of the mapping
        }
        if (start_address + len > file_size) {
            start_address = 0; // wrap around to the start of the file
        }

        ssize_t written = write(pipe[1], &addr[start_address], len);
        if (written == -1) {
            // A failure after the run flag was cleared is part of shutdown;
            // anything else is a real error.
            if (g_process_run != 0) {
                printf("Child: write failed\n");
                rc = -1;
            }
            break;
        }
        start_address += (size_t)written; // advance by what was really sent
        sleep(1);
    }
    printf("child process exiting\n");

out:
    // Single exit path so the mapping and descriptors are always released.
    if (addr != MAP_FAILED) {
        munmap(addr, file_size);
    }
    if (fd != -1) {
        close(fd);
    }
    close(pipe[1]);
    return rc;
}
// Returns how many bytes at the end of buf[0..len) belong to a UTF-8 sequence
// whose lead byte is present but whose continuation bytes have not all
// arrived yet (0 to 3). Returns 0 when the buffer ends on a complete
// character or when the tail is not a recognisable sequence start, so invalid
// input is passed through rather than held back forever.
static size_t utf8_incomplete_tail(const char *buf, size_t len) {
    size_t max_back = len < 4 ? len : 4;

    for (size_t back = 1; back <= max_back; ++back) {
        unsigned char b = (unsigned char)buf[len - back];
        size_t need;

        if ((b & 0xC0) == 0x80) {
            continue; // continuation byte: keep looking for the lead byte
        }
        if (b < 0x80) {
            need = 1; // ASCII
        } else if ((b & 0xE0) == 0xC0) {
            need = 2;
        } else if ((b & 0xF0) == 0xE0) {
            need = 3;
        } else if ((b & 0xF8) == 0xF0) {
            need = 4;
        } else {
            return 0; // not a valid lead byte
        }
        return need > back ? back : 0;
    }
    return 0; // no lead byte within the last 4 bytes
}

// Parent side of the test harness: reads chunks from the read end of the pipe
// and prints only whole UTF-8 characters. Bytes of a sequence that was split
// by the chunk boundary are kept at the front of the buffer and completed by
// the next read.
//
// pipe: descriptor pair from pipe(); pipe[1] is closed, pipe[0] is read from.
// Runs until g_process_run is cleared, the writer closes the pipe, or read()
// fails; then waits for the child. Always returns 0.
int parent_process(int *pipe) {
    enum { BUFF_SIZE = 99 };
    char buff[BUFF_SIZE + 1];
    size_t pending = 0; // bytes of a split sequence carried over from the last read

    close(pipe[1]); // close write pipe

    while (g_process_run != 0) {
        // New data lands right after the carried-over bytes.
        ssize_t got = read(pipe[0], &buff[pending], BUFF_SIZE - pending);
        if (got <= 0) {
            // 0 means the writer closed the pipe; -1 is an error or an
            // interruption. Either way there is nothing more to read.
            if (got < 0 && g_process_run != 0) {
                perror("Parent: read");
            }
            break;
        }

        size_t len = pending + (size_t)got;
        pending = utf8_incomplete_tail(buff, len);
        size_t complete = len - pending;

        // Precision-limited %s prints just the complete part without having
        // to overwrite the first pending byte with a terminator.
        printf("Parent:%.*s\n", (int)complete, buff);

        if (pending > 0) {
            // Move the partial sequence to the start of the buffer so the
            // next read appends the missing bytes to it.
            printf("will resume with %d partial bytes\n", (int)pending);
            memmove(buff, &buff[complete], pending);
        }
    }

    close(pipe[0]);
    wait(NULL);
    printf("parent process exiting\n");
    return 0;
}
// Installs signal_handler for SIGINT.
// Returns 0 on success, -1 if signal() reports SIG_ERR.
int init_signal() {
    const int failed = (signal(SIGINT, signal_handler) == SIG_ERR);
    return failed ? -1 : 0;
}
// Entry point: installs the SIGINT handler, creates the pipe and forks.
// The child runs child_process, the parent runs parent_process; each one's
// return value becomes the exit status. Returns -1 if setup fails.
int main(int argc, char **argv) {
    int pipefd[2];
    pid_t pid;

    if (init_signal() != 0) {
        return -1;
    }
    if (pipe(pipefd) == -1) {
        printf("can't create pipe\n");
        return -1;
    }

    pid = fork();
    switch (pid) {
    case -1:
        printf("Can't fork process\n");
        return -1;
    case 0: // child process
        return child_process(pipefd);
    default: // parent process
        return parent_process(pipefd);
    }
}
a.txt
12abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️312abc😫️a23😀️s345🤑️24ee🙃️dai😕️iodqs😥️dqk😓️pdoo9😛️93wd🤑️qd3👤️2om🍕️de9🤐️3
You can find this code and test file here.
I would go with something like this:
#include <stdio.h>
#include <unistd.h>
#define BUFFER_LENGTH 53
// Prints message to stdout followed by a carriage return ('\r', 0x0d).
void print_function(char* message)
{
    printf("%s\r", message);
}
// Reads from the descriptor `pipe` until end of input or a read error and
// hands the data to print_func as NUL-terminated strings that never end in
// the middle of a UTF-8 character.
//
// Only continuation bytes (10xxxxxx) are recognised, so the end of the last
// character in a chunk cannot be known: everything from the last start byte
// onwards is held back and emitted together with the next chunk, or on its
// own once the input ends. The input is assumed to be valid UTF-8.
//
// pipe:       descriptor to read from.
// print_func: called once per chunk with the complete characters, and once
//             more at the end with whatever was still held back.
void read_pipe(int pipe, void (*print_func)(char*))
{
    char message[BUFFER_LENGTH];
    char to_print[1 + BUFFER_LENGTH];
    int pending = 0; // bytes held back from the previous chunk

    for (;;)
    {
        int bytes_read = (int)read(pipe, message + pending, BUFFER_LENGTH - pending);
        if (bytes_read <= 0)
        {
            // 0 is end of input, negative is an error; a negative count must
            // never be used as a length below.
            if (bytes_read < 0)
            {
                perror("read");
            }
            // print remaining bytes
            message[pending] = '\0';
            print_func(message);
            break;
        }
        // add bytes remained from previous run
        int total = pending + bytes_read;

        // index of the last byte that starts a character (0 if there is none)
        int last_start = 0;
        for (int i = 0; i != total; ++i)
        {
            if (0x80 != (0xc0 & message[i]))
            {
                last_start = i;
            }
        }
        // A full buffer with no later start byte cannot be valid UTF-8 and
        // would leave no room for the next read; emit it as it is so the
        // loop keeps making progress.
        if (0 == last_start && BUFFER_LENGTH == total)
        {
            last_start = total;
        }

        // copy complete characters to buffer to_print and print them
        for (int i = 0; i != last_start; ++i)
        {
            to_print[i] = message[i];
        }
        to_print[last_start] = '\0';
        print_func(to_print);

        // move tail to the beginning of the input buffer
        pending = 0;
        for (int i = last_start; i != total; ++i)
        {
            message[pending++] = message[i];
        }
    }
}
// Entry point: standard input plays the role of the pipe.
int main()
{
    const int input_fd = STDIN_FILENO;

    read_pipe(input_fd, print_function);
    return 0;
}
Here read_pipe keeps reading from the passed pipe descriptor until the input ends, and prints the data using the passed print_func function.
The idea is to read a buffer from the pipe and then copy only complete characters to the print buffer (condition courtesy of Lee Daniel Crocker), on the assumption that the input is a valid UTF-8 sequence. If the buffer ends with the tail of an incomplete UTF-8 character, that tail is used as the beginning of the next portion of data. So we loop until the end of the pipe.
For simplicity I use stdin
as a pipe descriptor. To run and test:
gcc -Wall main.c -o run && perl -e 'print "\xf0\x9f\x98\xab"x1000;' > test.txt && ./run < test.txt > output.txt
P.S. Another approach would be to get the character length from its first byte, as described here: UTF-8 Continuation bytes:
#include <stdio.h>
#include <unistd.h>
#define BUFFER_LENGTH 53
// Prints message to stdout followed by a newline ('\n').
void print_function(char* message)
{
    printf("%s\n", message);
}
// Returns the number of bytes in the UTF-8 sequence that starts with `lead`,
// taken from the count of leading 1 bits (2 to 4). ASCII bytes, stray
// continuation bytes and invalid lead bytes (five or more leading 1 bits)
// count as a single byte so they are passed through instead of held back.
static int utf8_sequence_length(unsigned char lead)
{
    int length = 0;

    if (0xc0 != (0xc0 & lead))
    {
        return 1;
    }
    while (0 != (0x80 & lead))
    {
        lead = (unsigned char)(lead << 1);
        ++length;
    }
    return length > 4 ? 1 : length;
}

// Reads from the descriptor `pipe` until end of input or a read error and
// hands the data to print_func as NUL-terminated strings that never end in
// the middle of a UTF-8 character. The length of each character is taken from
// its lead byte, so only a character that is really incomplete is held back
// and joined with the next chunk.
//
// pipe:       descriptor to read from.
// print_func: called once per chunk with the complete characters, and once
//             more at the end with whatever was still held back.
void read_pipe(int pipe, void (*print_func)(char*))
{
    char message[BUFFER_LENGTH];
    char to_print[1 + BUFFER_LENGTH];
    int pending = 0; // bytes of an incomplete character from the previous chunk

    for (;;)
    {
        int bytes_read = (int)read(pipe, message + pending, BUFFER_LENGTH - pending);
        if (bytes_read <= 0)
        {
            // 0 is end of input, negative is an error; a negative count must
            // never be used as a length below.
            if (bytes_read < 0)
            {
                perror("read");
            }
            // print remaining bytes
            message[pending] = '\0';
            print_func(message);
            break;
        }
        // add bytes remained from previous run
        int total = pending + bytes_read;

        // copy complete characters to buffer to_print
        int consumed = 0;
        while (consumed != total)
        {
            int length = utf8_sequence_length((unsigned char)message[consumed]);
            if (consumed + length > total)
            {
                break; // the rest of this character has not arrived yet
            }
            for (int i = 0; i != length; ++i)
            {
                to_print[consumed] = message[consumed];
                ++consumed;
            }
        }
        // finish buffer with complete characters and print it
        to_print[consumed] = '\0';
        print_func(to_print);

        // move tail to the beginning of the input buffer
        pending = 0;
        for (; consumed != total; ++consumed)
        {
            message[pending++] = message[consumed];
        }
    }
}
// Entry point: feeds standard input through read_pipe, printing each chunk
// with print_function.
int main()
{
    void (*printer)(char*) = print_function;

    read_pipe(STDIN_FILENO, printer);
    return 0;
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With