Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Detect if execution character set letters are contiguous

In the code, a switch is used to convert letters to contiguous values. Optimizing compilers in general won't do the job as well as the simple contiguous digit condition right before the switch. How can I detect which execution character set used and/or conclude that the letters are contiguous to replace it with simple conditionals ?

static long digit_value(char c)
{
    if (c >= '0' && c <= '9')
        return (c-'0');

    switch(c) {
    case 'a':
    case 'A':
        return 10;
    case 'b':
    case 'B':
        return 11;
    case 'c':
    case 'C':
        return 12;
    case 'd':
    case 'D':
        return 13;
    case 'e':
    case 'E':
        return 14;
    case 'f':
    case 'F':
        return 15;
    case 'g':
    case 'G':
        return 16;
    case 'h':
    case 'H':
        return 17;
    case 'i':
    case 'I':
        return 18;
    case 'j':
    case 'J':
        return 19;
    case 'k':
    case 'K':
        return 20;
    case 'l':
    case 'L':
        return 21;
    case 'm':
    case 'M':
        return 22;
    case 'n':
    case 'N':
        return 23;
    case 'o':
    case 'O':
        return 24;
    case 'p':
    case 'P':
        return 25;
    case 'q':
    case 'Q':
        return 26;
    case 'r':
    case 'R':
        return 27;
    case 's':
    case 'S':
        return 28;
    case 't':
    case 'T':
        return 29;
    case 'u':
    case 'U':
        return 30;
    case 'v':
    case 'V':
        return 31;
    case 'w':
    case 'W':
        return 32;
    case 'x':
    case 'X':
        return 33;
    case 'y':
    case 'Y':
        return 34;
    case 'z':
    case 'Z':
        return 35;
    default:
        break;
    }

    return -1;
}
like image 969
hdante Avatar asked Jul 20 '20 21:07

hdante


2 Answers

How can I detect which execution character set used and/or conclude that the letters are contiguous?

At compile time, simply test them all. ('a-z' left out for simplicity)

static_assert(
  'A' == ('B' - 1) &&
  'B' == ('C' - 1) && 'C' == ('D' - 1) && 'D' == ('E' - 1) && 'E' == ('F' - 1) && 'F' == ('G' - 1) && 'G' == ('H' - 1) && 'H' == ('I' - 1) && 'I' == ('J' - 1) && 'J' == ('K' - 1) && 'K' == ('L' - 1) && 'L' == ('M' - 1) && 'M' == ('N' - 1) && 'N' == ('O' - 1) && 'O' == ('P' - 1) && 'P' == ('Q' - 1) && 'Q' == ('R' - 1) && 'R' == ('S' - 1) && 'S' == ('T' - 1) && 'T' == ('U' - 1) && 'U' == ('V' - 1) && 'V' == ('W' - 1) && 'W' == ('X' - 1) && 'X' == ('Y' - 1) &&
  'Y' == ('Z' - 1), "Dinosaur: not continuous A-Z");

static int digit_value(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
  return -1;
}

Other dinosaur tests.


Or use the slow, but highly portable:

static int digit_value(char c) {
  static const char *base36 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  const char *p = strchr(base36, (unsigned char) c);
  if (p && *p) {
    return (int) (p - base36);
  }
  return -1;
}

Or perhaps a big #if?

#if ('A' == ('B' - 1) && 'B' == ('C' - 1) && 'C' == ('D' - 1) && 'D' == ('E' - 1) && 'E' == ('F' - 1) && 'F' == ('G' - 1) && 'G' == ('H' - 1) && 'H' == ('I' - 1) && 'I' == ('J' - 1) && 'J' == ('K' - 1) && 'K' == ('L' - 1) && 'L' == ('M' - 1) && 'M' == ('N' - 1) && 'N' == ('O' - 1) && 'O' == ('P' - 1) && 'P' == ('Q' - 1) && 'Q' == ('R' - 1) && 'R' == ('S' - 1) && 'S' == ('T' - 1) && 'T' == ('U' - 1) && 'U' == ('V' - 1) && 'V' == ('W' - 1) && 'W' == ('X' - 1) && 'X' == ('Y' - 1) && 'Y' == ('Z' - 1))

static int digit_value(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
  return -1;
}

#else

static int digit_value(char c) {
  static const char *base36 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  const char *p = strchr(base36, (unsigned char) c);
  if (p && *p) {
    return (int) (p - base36);
  }
  return -1;
}

#endif

Or .... if UCHAR_MAX not too big and concern about speed, make a lookup table and skip the sequential concerns.

#include <limits.h>
int digit_value(char c) {
  unsigned char val[UCHAR_MAX] = {['0'] = 1, ['1'] = 2, ['2'] = 3, ['3'] = 4,
      ['4'] = 5, ['5'] = 6, ['6'] = 7, ['7'] = 8, ['9'] = 10, 
      ['A'] = 11, ['B'] = 12, ['C'] = 13, ['D'] = 14, ['E'] = 15, ...
      ['a'] = 11, ['b'] = 12, ['c'] = 13, ['d'] = 14, ['e'] = 15, ...
  };
  return val[(unsigned char) c] - 1;
}
like image 200
chux - Reinstate Monica Avatar answered Oct 28 '22 05:10

chux - Reinstate Monica


You can write an appropriate test for conditional compilation, as @chux answered. However, if you need to support character sets with non-contiguous letters, then you need an implementation that supports that, and a table lookup would be a lot more efficient than what you present in the question. So much more so that you could consider using it for all cases instead of maintaining two separate implementations. For example:

static long digit_value(char c) {
    static const long dvs[UCHAR_MAX + 1] = {
      [0] = 1, [1] = 2, [2] = 3, [3] = 4, [5] = 6, [7] = 8, [8] = 9, [9] = 10,
      ['A'] = 11, ['B'] = 12, ['C'] = 13, ['D'] = 14, ['E'] = 15, ['F'] = 16, ['G'] = 17,
      ['H'] = 18, ['I'] = 19, ['J'] = 20, ['K'] = 21, ['L'] = 22, ['M'] = 23, ['N'] = 24,
      ['O'] = 25, ['P'] = 26, ['Q'] = 27, ['R'] = 28, ['S'] = 29, ['T'] = 30, ['U'] = 31,
      ['V'] = 32, ['W'] = 33, ['X'] = 34, ['Y'] = 35, ['Z'] = 36,
      ['a'] = 11, ['b'] = 12, ['c'] = 13, ['d'] = 14, ['e'] = 15, ['f'] = 16, ['g'] = 17,
      ['h'] = 18, ['i'] = 19, ['j'] = 20, ['k'] = 21, ['l'] = 22, ['m'] = 23, ['n'] = 24,
      ['o'] = 25, ['p'] = 26, ['q'] = 27, ['r'] = 28, ['s'] = 29, ['t'] = 30, ['u'] = 31,
      ['v'] = 32, ['w'] = 33, ['x'] = 34, ['y'] = 35, ['z'] = 36
    };

    return dvs[(unsigned char) c] - 1;
}

Note that the characters in the basic execution character set, which include all the decimal digits and upper- and lower-case Latin letters, are guaranteed to have non-negative integer values. That somewhat simplifies the initializer for the table. Note also that elements that are not explicitly initialized will be default-initialized to zero, which eventually gets converted to a -1 return value by subtracting 1 from the tabulated value.

like image 44
John Bollinger Avatar answered Oct 28 '22 05:10

John Bollinger