Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Opening a Unicode file in pure C

I am trying to open a .txt file that is wholly Chinese. Can I use normal fopen/fclose procedures to it even though the stream would be 100% Unicode or are there any exlusive tools for handling wide characters? I'd be grateful for precise answers, I am a beginner programmer. I am using Linux with standard gcc.

I will attach my code, it compiles with no error but upon execution I get segmentation fault. I don't know what is wrong with it. The point of this programme is to copy each string of Chinese signs in which a specific sign from a given set is to be found and to write it in a separate file.

#include<stdio.h>
#include<stdlib.h>
#include<wchar.h>
#include <locale.h>
#define PLIK_IN in /*filenames*/
#define PLIK_OUT out
#define LKON 49 /*specifying the length of a string on the left from a desired sign*/
#define PKON 50 /*...and on the right*/
int wczytaj_pliki(FILE*, FILE*); /*open file*/
void krocz_po_pliku(FILE*, FILE*); /*search through file*/
int slownik(wchar_t); /*compare signs*/
void zapisz_pliki(FILE*, FILE*); /*write to file*/

void main(void)
{
    FILE *bin,*bout;
    setlocale(LC_CTYPE, "");

    wczytaj_pliki(bin, bout);
    krocz_po_pliku(bin, bout);
    zapisz_pliki(bin, bout);
}/*main*/

int slownik(wchar_t znak) /*compare characters*/
{
    wchar_t gznak1 = L'股', gznak2 = L'利', gznak3 = L'红';
    if ( ( znak == gznak1) || (znak == gznak2) || (znak == gznak3) ) return 1;
    return 0;
}/*slownik*/

void krocz_po_pliku(FILE* bin, FILE* bout) /*search through file*/
{
    wchar_t wch;
    wchar_t* kontekst;
    int i = 0, j, step = LKON, counter = 0, token = 0;

    while ( (wch = getwchar() ) != EOF )
    {
        if (!token) /*comparing consecutive signs*/
    {
        if ( slownik(wch) == 1 )
        {
            counter++;
            fprintf(bout,"###Wystapienie %d.\n\n", counter);
            if ( i<step ) step = i;
            fseek(bin,-step,1);
            j=0, token = 1;
        }/*if*/
        else i++;
    }/*if*/
    else /*writing consecutive signs within context*/
    {
        if ( j < LKON + PKON)
        {
            putwc(wch, bout);
            j++;
        }/*if*/
        else
        {
            fprintf(bout,"###\n\n");
            fflush(bout);
            token = 0;
        }/*else*/
    }/*else*/
    }/*while*/
        printf("Znalazlem %d wystapien\n", counter);
}/*krocz_po_pliku*/

int wczytaj_pliki(FILE* bin, FILE* bout)
{
    bin=fopen("PLIK_IN","r");
    bout=fopen("PLIK_OUT","w");
    rewind(bin);
    if(bin==NULL || bout==NULL)
{
    printf("Blad plikow\n");
    exit(0);
}/*if*/
    return 1;
}/*wczytaj pliki*/

void zapisz_pliki(FILE* bin, FILE* bout)
{
fclose(bin);
fclose(bout);
}
like image 222
yauser Avatar asked Oct 09 '22 08:10

yauser


1 Answers

Yes, fopen can open a file that contains any data, including Unicode data, as long as you can represent the filename in a char*. (On some platforms, namely Windows, files may have names that cannot be represented in a char*).

You will want to open the file in binary mode to prevent any new line substitution that may be done (unless the Unicode encoding is UTF-8 and then it doesn't matter), because the substitution will be done in terms of chars. Also, if the code units are more than one byte you will need to make sure you're reading them with the correct endianness.

Note that wchar_t isn't necessarily Unicode an may not be the right type for whatever Unicode encoding is being used by your files. And if your program supports multiple Unicode encodings do not use BOMs to guess which encoding a file uses.

like image 54
bames53 Avatar answered Oct 13 '22 10:10

bames53