Я пытаюсь реализовать функцию копирования памяти, которая использует инструкции SSE:
typedef unsigned char byte;
typedef unsigned int uint;
__forceinline static void SIMD_Copy(void* __restrict destination, void* __restrict source, const uint count)
{
//assert(count > 16)
#ifdef _M_IX86
const uint register_count = 8;
const uint step = register_count * 16;
const uint loop = count / (step);
#else
const uint register_count = 16;
const uint step = register_count * 16;
const uint loop = count / (step);
#endif
//assert(loop);
byte* from = static_cast<byte*>(source);
byte* to = static_cast<byte*>(destination);
uint debug_test = 0;
register uint counter = 0;
do
{
debug_test += step;
from += step;
to += step;
_mm_prefetch((const char*)(from), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 16), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 32), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 48), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 64), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 80), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 96), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 112), _MM_HINT_T0);
#ifdef _M_AMD64
_mm_prefetch((const char*)(from + 128), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 144), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 160), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 176), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 192), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 208), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 224), _MM_HINT_T0);
_mm_prefetch((const char*)(from + 240), _MM_HINT_T0);
#endif
_mm_store_si128((__m128i*)(to), _mm_load_si128((const __m128i*)(from)));
_mm_store_si128((__m128i*)(to + 16), _mm_load_si128((const __m128i*)(from + 16)));
_mm_store_si128((__m128i*)(to + 32), _mm_load_si128((const __m128i*)(from + 32)));
_mm_store_si128((__m128i*)(to + 48), _mm_load_si128((const __m128i*)(from + 48)));
_mm_store_si128((__m128i*)(to + 64), _mm_load_si128((const __m128i*)(from + 64)));
_mm_store_si128((__m128i*)(to + 80), _mm_load_si128((const __m128i*)(from + 80)));
_mm_store_si128((__m128i*)(to + 96), _mm_load_si128((const __m128i*)(from + 96)));
_mm_store_si128((__m128i*)(to + 112), _mm_load_si128((const __m128i*)(from + 112)));
#ifdef _M_AMD64
_mm_store_si128((__m128i*)(to + 128), _mm_load_si128((const __m128i*)(from + 128)));
_mm_store_si128((__m128i*)(to + 144), _mm_load_si128((const __m128i*)(from + 144)));
_mm_store_si128((__m128i*)(to + 160), _mm_load_si128((const __m128i*)(from + 160)));
_mm_store_si128((__m128i*)(to + 176), _mm_load_si128((const __m128i*)(from + 176)));
_mm_store_si128((__m128i*)(to + 192), _mm_load_si128((const __m128i*)(from + 192)));
_mm_store_si128((__m128i*)(to + 208), _mm_load_si128((const __m128i*)(from + 208)));
_mm_store_si128((__m128i*)(to + 224), _mm_load_si128((const __m128i*)(from + 224)));
_mm_store_si128((__m128i*)(to + 240), _mm_load_si128((const __m128i*)(from + 240)));
#endif
counter++;
}
while(counter < loop);
}
Вот как я его запускаю:
byte* arr1 = (byte*)_aligned_malloc(100 * 256, 16);
byte* arr2 = (byte*)_aligned_malloc(100 * 256, 16);
SIMD_Copy(arr2, arr1, 100 * 256);
_aligned_free(arr1);
_aligned_free(arr2);
Массивы имеют размер, кратный 256, так как он копирует минимум 256 байтов на x64, поэтому я упрощаю. Как только он достигает первого вызова _aligned_free, я получаю:
Unhandled exception at 0x77775C0C (ntdll.dll) in MyProgram.exe: 0xC0000374: A heap has been corrupted (parameters: 0x777A6478).
Когда я нажимаю продолжить, он переходит к:
Exception thrown at 0x776DEE01 (ntdll.dll) in MyProgram.exe: 0xC0000005: Access violation reading location 0x00000000.
Добавление:
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF | _CRTDBG_CHECK_ALWAYS_DF | _CRTDBG_CHECK_CRT_DF | _CRTDBG_DELAY_FREE_MEM_DF | _CRTDBG_CHECK_EVERY_16_DF);
в начало main, похоже, не помогает. Есть ли другой способ узнать, что происходит?