Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to Read/Write UTF8 text files in C?

Tags:

c

linux

gcc

utf-8

I am trying to read UTF-8 text from a text file and then print some of it to another file. I am using Linux and the gcc compiler. This is the code I am using:

#include <stdio.h>
#include <stdlib.h>

/*
 * Copies in.txt to out.txt one byte at a time, echoing each byte to stdout.
 * fgetc() returns a single byte per call, so a multi-byte UTF-8 character
 * arrives as several separate values of `character`.
 * Note: neither fopen() result is checked before the streams are used.
 */
int main(){
    FILE *fin;
    FILE *fout;
    int character; // int rather than char so EOF can be told apart from byte values
    fin=fopen("in.txt", "r");
    fout=fopen("out.txt","w");
    while((character=fgetc(fin))!=EOF){
        putchar(character); // It displays the right character (UTF8) in the terminal
        // The format string is "%c " -- every byte written to the file is followed by a space.
        fprintf(fout,"%c ",character); // It displays weird characters in the file
    }
    fclose(fin);
    fclose(fout);
    printf("\nFile has been created...\n");
    return 0;
}

So far it only works for English (ASCII) characters.

like image 664
user2768374 Avatar asked Feb 12 '14 19:02

user2768374


2 Answers

Instead of

fprintf(fout,"%c ",character);

use

fprintf(fout,"%c",character);

The second fprintf() call has no space after %c; that extra space is what caused out.txt to display weird characters. The reason is that fgetc() retrieves a single byte (the same size as an ASCII character), not a whole UTF-8 character, so writing a space after every byte splits each multi-byte sequence apart. Because UTF-8 is ASCII-compatible, single-byte English characters are still written to the file just fine.

putchar(character) outputs the bytes sequentially, without an extra space between them, so the original UTF-8 sequences remain intact on the terminal. To see what I'm talking about, try

// Demonstration only: stdout now gets a space after every byte, just like the "%c " written to fout.
while((character=fgetc(fin))!=EOF){
    putchar(character);
    printf(" "); // This mimics what you are doing when you write to out.txt
    fprintf(fout,"%c ",character);
}

If you want to write UTF-8 characters with the space between them to out.txt, you would need to handle the variable length encoding of a UTF-8 character.

#include <stdio.h>
#include <stdlib.h>

/* Return the length in bytes of the UTF-8 sequence that starts with `val`.
 *
 * The high bits of the first (lead) byte encode the sequence length:
 *   0xxxxxxx -> 1 byte  (ASCII)
 *   110xxxxx -> 2 bytes
 *   1110xxxx -> 3 bytes
 *   11110xxx -> 4 bytes
 *
 * Bytes that cannot start a sequence -- continuation bytes (10xxxxxx) and
 * 0xF8..0xFF -- yield 1, so a caller that hits a stray or invalid byte
 * advances past just that byte instead of swallowing the bytes after it.
 *
 * This classifies by bit pattern only; it does not validate the sequence
 * (e.g. overlong lead bytes 0xC0/0xC1 still report 2).
 */
int numberOfBytesInChar(unsigned char val) {
    if (val < 0x80) {           /* 0xxxxxxx: ASCII */
        return 1;
    } else if (val < 0xC0) {    /* 10xxxxxx: continuation byte, not a lead */
        return 1;
    } else if (val < 0xE0) {    /* 110xxxxx */
        return 2;
    } else if (val < 0xF0) {    /* 1110xxxx */
        return 3;
    } else if (val < 0xF8) {    /* 11110xxx */
        return 4;
    } else {                    /* 0xF8..0xFF never appear in UTF-8 */
        return 1;
    }
}

/*
 * Copy in.txt to out.txt, echoing to stdout, and insert one space after
 * every UTF-8 encoded character (not after every byte).
 *
 * Returns 0 on success, EXIT_FAILURE if a file cannot be opened or the
 * output file cannot be flushed on close.
 */
int main(void) {
    FILE *fin = fopen("in.txt", "r");
    if (fin == NULL) {
        perror("in.txt");
        return EXIT_FAILURE;
    }

    FILE *fout = fopen("out.txt", "w");
    if (fout == NULL) {
        perror("out.txt");
        fclose(fin);
        return EXIT_FAILURE;
    }

    int character;
    while ((character = fgetc(fin)) != EOF) {
        /* The sequence length is determined by the lead byte alone, so it
         * is computed once here and never re-derived from the trailing
         * bytes read below. */
        int remaining = numberOfBytesInChar((unsigned char)character) - 1;

        putchar(character);
        fputc(character, fout);

        while (remaining > 0) {
            int next = fgetc(fin);
            if (next == EOF) {
                break;              /* truncated sequence at end of file */
            }
            if ((next & 0xC0) != 0x80) {
                /* Not a continuation byte: the sequence was cut short.
                 * Push it back so it is processed as the next lead byte. */
                ungetc(next, fin);
                break;
            }
            putchar(next);
            fputc(next, fout);
            remaining--;
        }

        /* Separator goes after the whole character. */
        putchar(' ');
        fputc(' ', fout);
    }

    fclose(fin);
    /* fclose() flushes buffered output; a failure here means data was lost. */
    if (fclose(fout) != 0) {
        perror("out.txt");
        return EXIT_FAILURE;
    }
    printf("\nFile has been created...\n");
    return 0;
}
like image 52
Josh Durham Avatar answered Oct 03 '22 18:10

Josh Durham


This code worked for me:

/* fgetwc example */
#include <stdio.h>
#include <wchar.h>
#include <stdlib.h>
#include <locale.h>
/*
 * Skeleton for processing in.txt one wide character at a time.
 *
 * fgetwc() decodes the input according to the LC_CTYPE locale, so a UTF-8
 * locale must be installed successfully before reading. out.txt is opened
 * for writing; the loop body is where each decoded character is handled.
 *
 * Returns 0 on success, EXIT_FAILURE if the locale is unavailable, a file
 * cannot be opened, or reading stops because of an error rather than EOF.
 */
int main (void)
{
  /* Without this locale fgetwc() cannot decode multi-byte UTF-8 input. */
  if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
    fprintf(stderr, "setlocale: locale en_US.UTF-8 is not available\n");
    return EXIT_FAILURE;
  }

  FILE * fin = fopen ("in.txt","r");
  if (fin == NULL) {
    perror("in.txt");
    return EXIT_FAILURE;
  }

  FILE * fout = fopen("out.txt","w");
  if (fout == NULL) {
    perror("out.txt");
    fclose(fin);
    return EXIT_FAILURE;
  }

  wint_t wc;
  while((wc=fgetwc(fin))!=WEOF){
        // work with: "wc"
  }

  /* WEOF is returned both at end of file and on a read/decoding error;
   * ferror() tells the two apart. */
  int status = 0;
  if (ferror(fin)) {
    perror("in.txt");
    status = EXIT_FAILURE;
  }

  fclose(fin);
  if (fclose(fout) != 0) {
    perror("out.txt");
    status = EXIT_FAILURE;
  }
  if (status != 0) {
    return status;
  }

  printf("File has been created...\n");
  return 0;
}
like image 32
user2768374 Avatar answered Oct 03 '22 19:10

user2768374