Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to printf accented characters in ANSI C (like á é í ó ú)

I tried to printf with some accented characters such as á é í ó ú:

printf("my name is Seán\n");

The text editor in the DEVC++ IDE displays them fine, i.e. the source code looks fine. I guess I need some library other than stdio.h and maybe some variant of the normal printf.

I'm using IDE Bloodshed DEVC running on Windows XP.

like image 723
sean Avatar asked Sep 29 '12 11:09

sean


1 Answer

Perhaps the best approach is to use Unicode.

Here's how...

First, manually set your console font to "Consolas", "Lucida Console", or any other TrueType Unicode font you can choose ("Raster fonts" may not work: those aren't Unicode fonts, although they may include the characters you're interested in).

Next, set the console code page to 65001 (UTF-8) with SetConsoleOutputCP(CP_UTF8).

Then convert your text to UTF-8 (if it's not yet in UTF-8) using WideCharToMultiByte(CP_UTF8, ...).

Finally, call WriteConsoleA() to output the UTF-8 text.

Here's a little function that does all these things for you; it's an "improved" variant of wprintf():

int _wprintf(const wchar_t* format, ...)
{
  int r;
  static int utf8ModeSet = 0;
  static wchar_t* bufWchar = NULL;
  static size_t bufWcharCount = 256;
  static char* bufMchar = NULL;
  static size_t bufMcharCount = 256;
  va_list vl;
  int mcharCount = 0;

  if (utf8ModeSet == 0)
  {
    if (!SetConsoleOutputCP(CP_UTF8))
    {
      DWORD err = GetLastError();
      fprintf(stderr, "SetConsoleOutputCP(CP_UTF8) failed with error 0x%X\n", err);
      utf8ModeSet = -1;
    }
    else
    {
      utf8ModeSet = 1;
    }
  }

  if (utf8ModeSet != 1)
  {
    va_start(vl, format);
    r = vwprintf(format, vl);
    va_end(vl);
    return r;
  }

  if (bufWchar == NULL)
  {
    if ((bufWchar = malloc(bufWcharCount * sizeof(wchar_t))) == NULL)
    {
      return -1;
    }
  }

  for (;;)
  {
    va_start(vl, format);
    r = vswprintf(bufWchar, bufWcharCount, format, vl);
    va_end(vl);

    if (r < 0)
    {
      break;
    }

    if (r + 2 <= bufWcharCount)
    {
      break;
    }

    free(bufWchar);
    if ((bufWchar = malloc(bufWcharCount * sizeof(wchar_t) * 2)) == NULL)
    {
      return -1;
    }
    bufWcharCount *= 2;
  }

  if (r > 0)
  {
    if (bufMchar == NULL)
    {
      if ((bufMchar = malloc(bufMcharCount)) == NULL)
      {
        return -1;
      }
    }

    for (;;)
    {
      mcharCount = WideCharToMultiByte(CP_UTF8,
                                       0,
                                       bufWchar,
                                       -1,
                                       bufMchar,
                                       bufMcharCount,
                                       NULL,
                                       NULL);
      if (mcharCount > 0)
      {
        break;
      }

      if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
      {
        return -1;
      }

      free(bufMchar);
      if ((bufMchar = malloc(bufMcharCount * 2)) == NULL)
      {
        return -1;
      }
      bufMcharCount *= 2;
    }
  }

  if (mcharCount > 1)
  {
    DWORD numberOfCharsWritten, consoleMode;

    if (GetConsoleMode(GetStdHandle(STD_OUTPUT_HANDLE), &consoleMode))
    {
      fflush(stdout);
      if (!WriteConsoleA(GetStdHandle(STD_OUTPUT_HANDLE),
                         bufMchar,
                         mcharCount - 1,
                         &numberOfCharsWritten,
                         NULL))
      {
        return -1;
      }
    }
    else
    {
      if (fputs(bufMchar, stdout) == EOF)
      {
        return -1;
      }
    }
  }

  return r;
}

The following code tests this function:

/* Code points U+00A0..U+00FF, printed as three lines of 32 characters:
   U+00A0..U+00BF, U+00C0..U+00DF and U+00E0..U+00FF. */
_wprintf(L"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7"
         L"\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
         L"\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7"
         L"\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
         L"\n"
         L"\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7"
         L"\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"
         L"\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7"
         L"\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF"
         L"\n"
         L"\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7"
         L"\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"
         L"\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7"
         L"\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF"
         L"\n");

/* Greek letters, printed as two lines: code points U+0391..U+03B0,
   then U+03B1..U+03CE. */
_wprintf(L"\x391\x392\x393\x394\x395\x396\x397"
         L"\x398\x399\x39A\x39B\x39C\x39D\x39E\x39F"
         L"\x3A0\x3A1\x3A2\x3A3\x3A4\x3A5\x3A6\x3A7"
         L"\x3A8\x3A9\x3AA\x3AB\x3AC\x3AD\x3AE\x3AF\x3B0"
         L"\n"
         L"\x3B1\x3B2\x3B3\x3B4\x3B5\x3B6\x3B7"
         L"\x3B8\x3B9\x3BA\x3BB\x3BC\x3BD\x3BE\x3BF"
         L"\x3C0\x3C1\x3C2\x3C3\x3C4\x3C5\x3C6\x3C7"
         L"\x3C8\x3C9\x3CA\x3CB\x3CC\x3CD\x3CE"
         L"\n");

/* Cyrillic letters, printed as two lines: U+0410..U+042F with U+0401
   inserted after U+0415, then U+0430..U+044F with U+0451 inserted
   after U+0435. */
_wprintf(L"\x410\x411\x412\x413\x414\x415\x401\x416\x417"
         L"\x418\x419\x41A\x41B\x41C\x41D\x41E\x41F"
         L"\x420\x421\x422\x423\x424\x425\x426\x427"
         L"\x428\x429\x42A\x42B\x42C\x42D\x42E\x42F"
         L"\n"
         L"\x430\x431\x432\x433\x434\x435\x451\x436\x437"
         L"\x438\x439\x43A\x43B\x43C\x43D\x43E\x43F"
         L"\x440\x441\x442\x443\x444\x445\x446\x447"
         L"\x448\x449\x44A\x44B\x44C\x44D\x44E\x44F"
         L"\n");

And should result in the following text in the console:

 ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿
ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡ΢ΣΤΥΦΧΨΩΪΫάέήίΰ
αβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ
АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ
абвгдеёжзийклмнопрстуфхцчшщъыьэюя

I do not know the encoding in which your IDE stores non-ASCII characters in .c/.cpp files, and I do not know what your compiler does when it encounters non-ASCII characters. This part you should figure out yourself.

As long as you supply to _wprintf() properly encoded UTF-16 text or call WriteConsoleA() with properly encoded UTF-8 text, things should work.

P.S. Some gory details about console fonts can be found here.

like image 120
Alexey Frunze Avatar answered Oct 22 '22 10:10

Alexey Frunze