Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to read unicode (utf-8) / binary file line by line

Hi programmers,

I want read line by line a Unicode (UTF-8) text file created by Notepad, i don't want display the Unicode string in the screen, i want just read and compare the strings!.

This code read ANSI file line by line, and compare the strings

What i want

Read test_ansi.txt line by line

if the line = "b" print "YES!"

else print "NO!"

read_ansi_line_by_line.c

#include <stdio.h>

int main()
{
    char *inname = "test_ansi.txt";
    FILE *infile;
    char line_buffer[BUFSIZ]; /* BUFSIZ is defined if you include stdio.h */
    char line_number;

    infile = fopen(inname, "r");
    if (!infile) {
        printf("\nfile '%s' not found\n", inname);
        return 0;
    }
    printf("\n%s\n\n", inname);

    line_number = 0;
    while (fgets(line_buffer, sizeof(line_buffer), infile)) {
        ++line_number;
        /* note that the newline is in the buffer */
        if (strcmp("b\n", line_buffer) == 0 ){
            printf("%d: YES!\n", line_number);
        }else{
            printf("%d: NO!\n", line_number,line_buffer);
        }
    }
    printf("\n\nTotal: %d\n", line_number);
    return 0;
}

test_ansi.txt

a
b
c

Compiling

gcc -o read_ansi_line_by_line read_ansi_line_by_line.c

Output

test_ansi.txt

1: NO!
2: YES!
3: NO!


Total: 3

Now i need read Unicode (UTF-8) file created by Notepad, after more than 6 months i don't found any good code/library in C can read file coded in UTF-8!, i don't know exactly why but i think the standard C don't support Unicode!

Reading Unicode binary file its OK!, but the probleme is the binary file most be already created in binary mode!, that mean if we want read a Unicode (UTF-8) file created by Notepad we need to translate it from UTF-8 file to BINARY file!

This code write Unicode string to a binary file, NOTE the C file is coded in UTF-8 and compiled by GCC

What i want

Write the Unicode char "ب" to test_bin.dat

create_bin.c

#define UNICODE
#ifdef UNICODE
#define _UNICODE
#else
#define _MBCS
#endif

#include <stdio.h>
#include <wchar.h>

int main()
{
     /*Data to be stored in file*/
     wchar_t line_buffer[BUFSIZ]=L"ب";
     /*Opening file for writing in binary mode*/
     FILE *infile=fopen("test_bin.dat","wb");
     /*Writing data to file*/
     fwrite(line_buffer, 1, 13, infile);
     /*Closing File*/
     fclose(infile);

    return 0;
}

Compiling

gcc -o create_bin create_bin.c

Output

create test_bin.dat

Now i want read the binary file line by line and compare!

What i want

Read test_bin.dat line by line if the line = "ب" print "YES!" else print "NO!"

read_bin_line_by_line.c

#define UNICODE
#ifdef UNICODE
#define _UNICODE
#else
#define _MBCS
#endif

#include <stdio.h>
#include <wchar.h>

int main()
{
    wchar_t *inname = L"test_bin.dat";
    FILE *infile;
    wchar_t line_buffer[BUFSIZ]; /* BUFSIZ is defined if you include stdio.h */

    infile = _wfopen(inname,L"rb");
    if (!infile) {
        wprintf(L"\nfile '%s' not found\n", inname);
        return 0;
    }
    wprintf(L"\n%s\n\n", inname);

    /*Reading data from file into temporary buffer*/
    while (fread(line_buffer,1,13,infile)) {
        /* note that the newline is in the buffer */
        if ( wcscmp ( L"ب" , line_buffer ) == 0 ){
             wprintf(L"YES!\n");
        }else{
             wprintf(L"NO!\n", line_buffer);
        }
    }
    /*Closing File*/
    fclose(infile);
    return 0;
}

Output

test_bin.dat

YES!

THE PROBLEM

This method is VERY LONG! and NOT POWERFUL (i m beginner in software engineering)

Please any one know how to read Unicode file ? (i know its not easy!) Please any one know how to convert Unicode file to Binary file ? (simple method) Please any one know how to read Unicode file in binary mode ? (i m not sure)

Thank You.

like image 645
Freeseif Avatar asked Jan 21 '10 22:01

Freeseif


1 Answers

I found a solution to my problem, and I would like to share the solution to any one interested in reading UTF-8 file in C99.

void ReadUTF8(FILE* fp)
{
    unsigned char iobuf[255] = {0};
    while( fgets((char*)iobuf, sizeof(iobuf), fp) )
    {
            size_t len = strlen((char *)iobuf);
            if(len > 1 &&  iobuf[len-1] == '\n')
                iobuf[len-1] = 0;
            len = strlen((char *)iobuf);
            printf("(%d) \"%s\"  ", len, iobuf);
            if( iobuf[0] == '\n' )
                printf("Yes\n");
            else
                printf("No\n");
    }
}

void ReadUTF16BE(FILE* fp)
{
}

void ReadUTF16LE(FILE* fp)
{
}

int main()
{
    FILE* fp = fopen("test_utf8.txt", "r");
    if( fp != NULL)
    {
        // see http://en.wikipedia.org/wiki/Byte-order_mark for explaination of the BOM
        // encoding
        unsigned char b[3] = {0};
        fread(b,1,2, fp);
        if( b[0] == 0xEF && b[1] == 0xBB)
        {
            fread(b,1,1,fp); // 0xBF
            ReadUTF8(fp);
        }
        else if( b[0] == 0xFE && b[1] == 0xFF)
        {
            ReadUTF16BE(fp);
        }
        else if( b[0] == 0 && b[1] == 0)
        {
            fread(b,1,2,fp); 
            if( b[0] == 0xFE && b[1] == 0xFF)
                ReadUTF16LE(fp);
        }
        else
        {
            // we don't know what kind of file it is, so assume its standard
            // ascii with no BOM encoding
            rewind(fp);
            ReadUTF8(fp);
        }
    }        

    fclose(fp);
}
like image 156
Freeseif Avatar answered Sep 29 '22 01:09

Freeseif