I know from a tutorial that unaligned loading and storing can look like this:
// Load a 16-byte vector from a possibly unaligned location in memory.
//
// vec_ld ignores the low four bits of the effective address, so the 16
// requested bytes are assembled from the two aligned blocks that contain
// them, using the shift-left permute pattern produced by vec_lvsl.
//
// The second load uses byte offset 15 instead of 16. For a misaligned src
// this still addresses the following aligned block; for an already aligned
// src it re-reads the same block rather than touching the 16 bytes beyond
// the data (which may lie outside the caller's buffer).
//
// NOTE(review): assumes classic big-endian AltiVec semantics for vec_lvsl.
__vector unsigned char LoadUnaligned(const unsigned char * src )
{
    __vector unsigned char permuteVector = vec_lvsl(0, src);
    __vector unsigned char low = vec_ld(0, src);
    __vector unsigned char high = vec_ld(15, src);
    return vec_perm(low, high, permuteVector);
}
// Store a vector to a possibly unaligned location in memory.
//
// The 16 bytes of v are merged into the two aligned blocks that straddle
// dst, preserving the bytes of those blocks that lie outside the
// destination range.
//
// v   - the vector to store.
// dst - destination address; it does not have to be 16-byte aligned.
//
// NOTE(review): assumes classic big-endian AltiVec semantics for vec_lvsr.
void StoreUnaligned(__vector unsigned char v, __vector unsigned char * dst)
{
    // Load the surrounding area. Offset 15 (not 16) selects the following
    // aligned block when dst is misaligned, and the same block when dst is
    // already aligned, so nothing past the 16 destination bytes is touched.
    __vector unsigned char low = vec_ld(0, dst);
    __vector unsigned char high = vec_ld(15, dst);

    // Prepare the constants that we need.
    __vector unsigned char permuteVector = vec_lvsr(0, (unsigned char *) dst);
    __vector unsigned char ox00 = vec_splat_u8(0);
    __vector unsigned char oxFF = (__vector unsigned char) vec_splat_s8(-1);

    // Make a mask for which parts of the vectors to swap out: 0x00 for the
    // leading bytes that sit before dst inside the low block, 0xFF for the rest.
    __vector unsigned char mask = vec_perm(ox00, oxFF, permuteVector);

    // Right rotate our input data so that its bytes line up with memory.
    v = vec_perm(v, v, permuteVector);

    // Insert our data into the low and high vectors. vec_sel takes bits from
    // its second operand where the mask is set, so the low block keeps its
    // leading bytes and receives data in the remainder, and the high block
    // receives data in its leading bytes and keeps the remainder.
    low = vec_sel(low, v, mask);
    high = vec_sel(v, high, mask);

    // Store the two aligned result vectors. The high part goes first: when
    // dst is aligned both stores hit the same block, and the low part (which
    // then holds all of v) must be the one written last.
    vec_st(high, 15, dst);
    vec_st(low, 0, dst);
}
It looks terrible: that is a great amount of work just to store one vector, and it comes with a corresponding performance loss.
void SomeFuncA(const unsigned char * src, size_t size, unsigned char * dst)
{
for(size_t i = 0; i < size; i += 16)
{
__vector unsigned char a = vec_ld(0, src + i);
//simple work
vec_st(a, 0, dst + i);
}
}
void SomeFuncU(const unsigned char * src, size_t size, unsigned char * dst)
{
for(size_t i = 0; i < size; i += 16)
{
__vector unsigned char a = LoadUnaligned(src + i);
//simple work
StoreUnaligned(dst + i, a);
}
}
The second function runs 3-4 times slower than the first. Since I can't control the alignment of the input and output memory, I have to implement both versions. How can I minimize the performance loss in the unaligned case?
First of all, I want to mention that if you store AltiVec vectors to unaligned memory many times in a row, you don't need to preserve the previous memory contents in the middle of the array, only at the beginning and at the end. There is a useful function and class in the Simd Library that implement this functionality:
// AltiVec vector of sixteen unsigned bytes.
typedef __vector uint8_t v128_u8;
// Vector with every byte equal to 0x00.
const v128_u8 K8_00 = vec_splat_u8(0);
// Vector with every byte equal to 0xFF. vec_splat_u8 only accepts a 5-bit
// signed literal (-16..15) that is sign-extended into each byte, so the
// all-ones pattern has to be written as -1 rather than 0xFF.
const v128_u8 K8_FF = vec_splat_u8(-1);
// Loads one 16-byte vector from p. The template argument states whether p is
// known to be 16-byte aligned.
template <bool align> inline v128_u8 Load(const uint8_t * p);

// Unaligned load: combines the two aligned blocks that contain the requested
// bytes. The second block is addressed with offset 15 rather than 16, so an
// already aligned p re-reads its own block instead of the 16 bytes after it.
template <> inline v128_u8 Load<false>(const uint8_t * p)
{
    v128_u8 lo = vec_ld(0, p);
    v128_u8 hi = vec_ld(15, p);
    return vec_perm(lo, hi, vec_lvsl(0, p));
}

// Aligned load: a single vec_ld.
template <> inline v128_u8 Load<true>(const uint8_t * p)
{
    return vec_ld(0, p);
}
template <bool align> struct Storer;
template <> struct Storer<true>
{
template <class T> Storer(T * ptr)
:_ptr((uint8_t*)ptr)
{
}
template <class T> inline void First(T value)
{
vec_st((v128_u8)value, 0, _ptr);
}
template <class T> inline void Next(T value)
{
_ptr += 16;
vec_st((v128_u8)value, 0, _ptr);
}
inline void Flush()
{
}
private:
uint8_t * _ptr;
};
// Unaligned specialization of Storer: writes a run of consecutive 16-byte
// vectors to a destination that need not be 16-byte aligned.
// Expected call sequence (as used by SomeFunc): First() once, Next() for each
// following vector, then Flush() once. Existing memory is read back and merged
// only in First() and Flush(); Next() stores without loading.
template <> struct Storer<false>
{
// Remembers the destination and precomputes the shift-right permute pattern
// for its address, plus a byte mask obtained by permuting all-0x00 and
// all-0xFF vectors with that same pattern.
template <class T> inline Storer(T * ptr)
:_ptr((uint8_t*)ptr)
{
_perm = vec_lvsr(0, _ptr);
_mask = vec_perm(K8_00, K8_FF, _perm);
}
// Writes the head: loads the current contents at _ptr (background), rotates
// the new value with _perm (foreground) and stores the blend in which
// foreground replaces background wherever _mask is set. The value is kept in
// _last because part of it is still to be written by the next store.
template <class T> inline void First(T value)
{
_last = (v128_u8)value;
v128_u8 background = vec_ld(0, _ptr);
v128_u8 foreground = vec_perm(_last, _last, _perm);
vec_st(vec_sel(background, foreground, _mask), 0, _ptr);
}
// Writes a middle block: advances by 16 bytes and stores the permute of the
// previous value and the new value, so no load from memory is needed.
// _last must be updated only after the store, which still reads the old one.
template <class T> inline void Next(T value)
{
_ptr += 16;
vec_st(vec_perm(_last, (v128_u8)value, _perm), 0, _ptr);
_last = (v128_u8)value;
}
// Writes the tail: loads the contents 16 bytes past _ptr (background),
// rotates the last value (foreground) and stores the blend in which
// background replaces foreground wherever _mask is set - the opposite
// selection from First().
inline void Flush()
{
v128_u8 background = vec_ld(16, _ptr);
v128_u8 foreground = vec_perm(_last, _last, _perm);
vec_st(vec_sel(foreground, background, _mask), 16, _ptr);
}
private:
// Current write position (advanced by 16 in Next()).
uint8_t * _ptr;
// Permute pattern from vec_lvsr for the destination address.
v128_u8 _perm;
// Byte mask derived from _perm; used by vec_sel in First() and Flush().
v128_u8 _mask;
// Most recent value passed to First()/Next().
v128_u8 _last;
};
It is used like this:
template<bool align> void SomeFunc(const unsigned char * src, size_t size, unsigned char * dst)
{
Storer<align> _dst(dst);
__vector unsigned char a = Load<align>(src);
//simple work
_dst.First(a);// save first block
for(size_t i = 16; i < size; i += 16)
{
__vector unsigned char a = Load<align>(src + i);
//simple work
_dst.Next(a);// save body
}
_dst.Flush(); // save tail
}
The performance loss will be 30-40% compared to the aligned version. That is unpleasant, of course, but tolerable.
An additional advantage is less code: all functions (aligned and unaligned) share the same implementation.
If you like our work, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With