Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why doesn't mbtowc count character set as expected?

Tags:

c

linux

I want to count the characters (in various charsets) in a file, and I'm using the function 'mbtowc' to detect the characters. I cannot figure out why the chars value differs so much from the expected result. Here is my example:

/* Raw byte buffer; each read() below fills at most BUFFER_SIZE bytes of it. */
char buf[BUFFER_SIZE + MB_LEN_MAX];

int fd = open ("chinese_test", O_RDONLY);

/* Running totals over the whole file. */
unsigned int bytes, chars;

int bytes_read;

bytes = chars = 0;

/* Process the file one chunk of up to BUFFER_SIZE bytes at a time. */
while((bytes_read = read(fd, buf, BUFFER_SIZE)) > 0) {
    wchar_t wc_buf[BUFFER_SIZE], *wcp;
    char *p;
    int n = 0;

    bytes += bytes_read;

    p = buf;
    wcp = wc_buf;

    /*
     * Decode one multibyte character per iteration. The loop ends as soon
     * as mbtowc() returns 0 (NUL byte) or -1 (invalid/incomplete sequence).
     * NOTE(review): the condition never consults bytes_read, so nothing
     * stops the scan at the end of the data read() delivered, and one
     * failed decode ends counting for the rest of this chunk.
     */
    while((n = mbtowc(wcp, p, MB_LEN_MAX)) > 0) {
        p += n;
        wcp++;

        chars++;
    }

}

/* NOTE(review): chars and bytes are unsigned int but printed with %d. */
printf("chars: %d\tbytes: %d\n", chars, bytes);

I tested the function on a text containing some GB2312 characters, but the chars and bytes values differ far more than they should.

My test returns -> chars: 4638 | bytes: 17473 but 'wc' linux command returns: chars: 16770 | bytes: 17473

Why this difference? What did I do wrong?


Now I have this code, but there is still some difference in the result.

/* Raw byte buffer; each read() below fills at most BUFFER_SIZE bytes of it. */
char buf[BUFFER_SIZE * MB_LEN_MAX];

/* NOTE(review): `filled` is initialised here but not used in this snippet. */
int fd = open ("test_chinese", O_RDONLY), filled = 0;

/* Running totals over the whole file. */
unsigned int bytes, chars;

int bytes_read;

bytes = chars = 0;

/* Process the file one chunk of up to BUFFER_SIZE bytes at a time. */
while((bytes_read = read(fd, buf, BUFFER_SIZE)) > 0) {
    wchar_t wc_buf[BUFFER_SIZE], *wcp;
    char *p;
    int n = 0;

    bytes += bytes_read;

    p = buf;
    /* wcp is set up but the decode below passes NULL, so wc_buf stays unused. */
    wcp = wc_buf;



    /* Walk the chunk until every byte read has been accounted for. */
    while(bytes_read > 0) {
        /*
         * Only the length of the next character is needed, hence NULL.
         * NOTE(review): the limit is always MB_LEN_MAX, so near the end of
         * the chunk mbtowc() may examine bytes beyond those just read.
         */
        n = mbtowc(NULL, p, MB_LEN_MAX);

        /* On 0 (NUL) or -1 (undecodable) skip one byte without counting it. */
        if (n <= 0) {
            p++;
            bytes_read--;
            continue;
        }
        p += n;

        bytes_read -= n;

        chars++;
    }

}

/* NOTE(review): chars and bytes are unsigned int but printed with %d. */
printf("\n\nchars: %d\tbytes: %d\n", chars, bytes);
like image 769
Figus Avatar asked Feb 10 '12 01:02

Figus


1 Answers

The problem is a combination of your BUFFER_SIZE, the file size of chinese_test and the byte alignment of wchar_t. As proof, try drastically increasing BUFFER_SIZE- you should start getting the answer you want.

What is happening is that your program works for the first block of text that it receives. But think about what happens in your code if a character is split between the first and second blocks as follows:

  | First Block                 | Second Block      |
  | [wchar_t] [wchar_t] ... [wchar_t] [wchar_t] ... |
  | [1,2,3,4] [1,2,3,4] ... [1,2,3,4] [1,2,3,4] ... |

Your code will begin the second block on the 3rd byte in the first character, and that will not be recognized as a valid character. Since mbtowc will return -1 when it does not find a valid character, your loop will immediately end and will count zero characters for that entire block. The same will apply for the following blocks.

EDIT:
Another issue I noticed is that you need to set the locale in order for mbtowc to work correctly. Taking all of these issues into account, I wrote the following which returns the same character count as wc for me:

#include <stdlib.h>
#include <stdio.h>
#include <locale.h>

/* Number of bytes requested from the input per fread() call in get_counts(). */
int BUFFER_SIZE = 1024;
/* File that main() opens when no path is given on the command line. */
const char *DEFAULT_F_IN = "chinese_test";

/* Running totals accumulated while scanning a file. */
struct counts {
    int bytes;  /* bytes that belong to successfully decoded characters */
    int chars;  /* multibyte characters decoded so far */
};

/*
 * Count the multibyte characters stored in buf[0 .. buf_size).
 *
 * Decoding uses the current LC_CTYPE locale. For every character decoded,
 * c->chars is incremented and c->bytes grows by the character's length.
 *
 * Parameters:
 *   c        - totals to update (not reset here).
 *   buf      - bytes to scan; need not be NUL-terminated.
 *   buf_size - number of valid bytes in buf.
 *
 * Returns the number of trailing bytes that were NOT consumed: 0 when the
 * whole block decoded cleanly, otherwise the distance from the first
 * undecodable (invalid or truncated) sequence to the end of the block.
 */
int count_block(struct counts *c, char *buf, int buf_size)
{
    int offset = 0;
    while (offset < buf_size) {
        /* Never let mbtowc() look past the valid data in the block. */
        size_t avail = (size_t)(buf_size - offset);
        size_t limit = avail < MB_CUR_MAX ? avail : MB_CUR_MAX;

        int n = mbtowc(NULL, buf + offset, limit);
        if (n < 0) {
            /* A failed decode may leave the conversion state dirty; reset
             * it so the caller can retry from these bytes later. */
            (void)mbtowc(NULL, NULL, 0);
            break;
        }
        if (n == 0) {
            /* mbtowc() reports 0 for a NUL byte: a valid 1-byte character. */
            n = 1;
        }

        offset += n;
        c->bytes += n;
        c->chars++;
    }

    return buf_size - offset;
}

/*
 * Count the bytes and multibyte characters in the stream fd.
 *
 * The stream is read in BUFFER_SIZE-byte blocks. When a block ends in the
 * middle of a character, the stream is rewound by the undecoded tail so the
 * next block starts on a character boundary; fd must therefore be seekable.
 *
 * Parameters:
 *   c  - receives the totals; both fields are reset to 0 first.
 *   fd - input stream, positioned where counting should start.
 *
 * Prints a message with perror() and calls exit(1) when the data cannot be
 * decoded in the current locale or the stream cannot be rewound.
 */
void get_counts(struct counts *c, FILE *fd)
{
    char buf[BUFFER_SIZE];
    c->bytes = 0;
    c->chars = 0;

    int bytes_read;
    while((bytes_read = fread(buf, sizeof(*buf), BUFFER_SIZE, fd)) > 0) {
        int remaining = count_block(c, buf, bytes_read);
        if (remaining == 0) {
            continue;
        }

        /*
         * Give up when the leftover cannot be a character split across two
         * blocks: it is too long, or nothing at all was decoded from this
         * block. The latter would otherwise rewind and re-read the same
         * bytes forever (e.g. a truncated character at end of file).
         */
        if (remaining < 0 || remaining == bytes_read ||
            (size_t)remaining >= MB_CUR_MAX) {
            perror("Error");
            exit(1);
        }

        /* Re-read the partial character at the start of the next block. */
        if (fseek(fd, -(long)remaining, SEEK_CUR) != 0) {
            perror("Error");
            exit(1);
        }
    }
}

/*
 * Entry point: print the character and byte counts of the file named by
 * argv[1], or of DEFAULT_F_IN when no argument is given.
 *
 * Returns 0 on success and EXIT_FAILURE when the file cannot be opened.
 */
int main(int argc, char *argv[]) {
    const char *path = (argc > 1) ? argv[1] : DEFAULT_F_IN;

    /* Adopt the user's locale so mbtowc() decodes the file's encoding. */
    setlocale(LC_ALL, "");

    FILE *fd = fopen(path, "rb");
    if (fd == NULL) {
        /* Without this check a missing file hands a NULL stream to fread(). */
        perror(path);
        return EXIT_FAILURE;
    }

    struct counts c;
    get_counts(&c, fd);
    fclose(fd);

    printf("chars: %d\tbytes: %d\n", c.chars, c.bytes);

    return 0;
}
like image 141
Swiss Avatar answered Oct 20 '22 12:10

Swiss