I'm a CUDA beginner. I have a pixel buffer of unsigned chars in global memory that can and is updated by any and all threads. To avoid weirdness in the pixel values, therefore, I want to perform an atomicExch when a thread attempts to update one. But the programming guide says that this function only works on 32- or 64-bit words, whereas I just want to atomically exchange one 8-bit byte. Is there a way to do this?
Thanks.
I just ran into this problem recently. In theory, atomic operations / optimistic retries are supposed to be faster than locks/mutexes, so the "hack" solutions that use atomic operations on other data types seem better to me than using critical sections.
Here are some implementations based on the threads for how to implement atomicMin for char and atomicAdd for short.
I've tested all of these, and my tests seem to show that they work fine so far.
Version 1 of atomicAdd for char
__device__ static inline char atomicAdd(char* address, char val) {
// offset, in bytes, of the char* address within the 32-bit address of the space that overlaps it
size_t long_address_modulo = (size_t) address & 3;
// the 32-bit address that overlaps the same memory
auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
// A 0x3210 selector in __byte_perm will simply select all four bytes in the first argument in the same order.
// The "4" signifies the position where the first byte of the second argument will end up in the output.
unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
// for selecting bytes within a 32-bit chunk that correspond to the char* address (relative to base_address)
unsigned int selector = selectors[long_address_modulo];
unsigned int long_old, long_assumed, long_val, replacement;
long_old = *base_address;
do {
long_assumed = long_old;
// replace bits in long_old that pertain to the char address with those from val
long_val = __byte_perm(long_old, 0, long_address_modulo) + val;
replacement = __byte_perm(long_old, long_val, selector);
long_old = atomicCAS(base_address, long_assumed, replacement);
} while (long_old != long_assumed);
return __byte_perm(long_old, 0, long_address_modulo);
}
atomicCAS for char
__device__ static inline char atomicCAS(char* address, char expected, char desired) {
size_t long_address_modulo = (size_t) address & 3;
auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
unsigned int sel = selectors[long_address_modulo];
unsigned int long_old, long_assumed, long_val, replacement;
char old;
long_val = (unsigned int) desired;
long_old = *base_address;
do {
long_assumed = long_old;
replacement = __byte_perm(long_old, long_val, sel);
long_old = atomicCAS(base_address, long_assumed, replacement);
old = (char) ((long_old >> (long_address_modulo * 8)) & 0x000000ff);
} while (expected == old && long_assumed != long_old);
return old;
}
Version 2 of atomicAdd for char (uses bit shifts instead of __byte_perm and has to handle overflow as a result)
__device__ static inline char atomicAdd2(char* address, char val) {
size_t long_address_modulo = (size_t) address & 3;
auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
unsigned int long_val = (unsigned int) val << (8 * long_address_modulo);
unsigned int long_old = atomicAdd(base_address, long_val);
if (long_address_modulo == 3) {
// the first 8 bits of long_val represent the char value,
// hence the first 8 bits of long_old represent its previous value.
return (char) (long_old >> 24);
} else {
// bits that represent the char value within long_val
unsigned int mask = 0x000000ff << (8 * long_address_modulo);
unsigned int masked_old = long_old & mask;
// isolate the bits that represent the char value within long_old, add the long_val to that,
// then re-isolate by excluding bits that represent the char value
unsigned int overflow = (masked_old + long_val) & ~mask;
if (overflow) {
atomicSub(base_address, overflow);
}
return (char) (masked_old >> 8 * long_address_modulo);
}
}
For atomicMin, please check this thread.
You might implement a critical section using a mutex variable. So something like
get_the_lock
exch_data
release
http://forums.nvidia.com/index.php?showtopic=185809
Implementing a critical section in CUDA
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With