diff --git a/haversine_02/src/repetition_testing/main.cpp b/haversine_02/src/repetition_testing/main.cpp
index 2eb5798..4e16837 100644
--- a/haversine_02/src/repetition_testing/main.cpp
+++ b/haversine_02/src/repetition_testing/main.cpp
@@ -9,6 +9,9 @@
 
 #define ARR_LEN(ARR) sizeof(ARR) / sizeof(*ARR)
 
+u64 *g_cache_output = NULL;
+u64 g_size = 1024 * 1024 * 1024 / 128 * sizeof(u64);
+
 extern "C" void mov_all_bytes_asm(char *buffer, u64 size);
 extern "C" void nop_all_bytes_asm(u64 size);
 extern "C" void nop_1x3_all_bytes_asm(u64 size);
@@ -47,6 +50,7 @@ extern "C" void read_32x2_simd_offset(char *buffer, u64 size);
 extern "C" void read_32x2_simd_no_offset(char *buffer, u64 size);
 extern "C" void read_16x4_simd(char *buffer, u64 size);
 extern "C" void read_32x4_simd(char *buffer, u64 size);
+extern "C" void cache_test(char *buffer, u64 size, u64 mask);
 
 void test_fread(reptester *tester, alloc_type type);
 void test_read(reptester *tester, alloc_type type);
@@ -89,6 +93,20 @@ void test_read_32x2_simd_offset(reptester *tester, alloc_type type);
 void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type);
 void test_read_16x4_simd(reptester *tester, alloc_type type);
 void test_read_32x4_simd(reptester *tester, alloc_type type);
+void test_cache_test_16k(reptester *tester, alloc_type type);
+void test_cache_test_32k(reptester *tester, alloc_type type);
+void test_cache_test_64k(reptester *tester, alloc_type type);
+void test_cache_test_128k(reptester *tester, alloc_type type);
+void test_cache_test_512k(reptester *tester, alloc_type type);
+void test_cache_test_1m(reptester *tester, alloc_type type);
+void test_cache_test_2m(reptester *tester, alloc_type type);
+void test_cache_test_4m(reptester *tester, alloc_type type);
+void test_cache_test_8m(reptester *tester, alloc_type type);
+void test_cache_test_16m(reptester *tester, alloc_type type);
+void test_cache_test_32m(reptester *tester, alloc_type type);
+void test_cache_test_64m(reptester *tester, alloc_type type);
+void test_cache_test_512m(reptester *tester, alloc_type type);
+void test_cache_test_full(reptester *tester, alloc_type type);
 u64 get_file_length(FILE *fp);
 
 int main(int argc, char *argv[]) {
@@ -108,6 +126,8 @@ int main(int argc, char *argv[]) {
     break;
   }
 
+  g_cache_output = (u64 *)calloc(1, g_size);
+
   // clang-format off
   reptester tester = {
       {filename, NULL, 0, 0}, // params
@@ -118,20 +138,20 @@ int main(int argc, char *argv[]) {
       read_cpu_timer(),       // test_start_time
       1,                      // current_run
-        {
-            UINT64_MAX, // min_time
-            0,          // max_time
-            0,          // avg_time
-            0,          // total_time
-        },
-        {
-            0, // min_faults
-            0, // max_faults
-            0, // avg_faults
-            0, // total_bytes
-            0, // total_faults
-        },
-        {}, // results
+      {
+          UINT64_MAX, // min_time
+          0,          // max_time
+          0,          // avg_time
+          0,          // total_time
+      },
+      {
+          0, // min_faults
+          0, // max_faults
+          0, // avg_faults
+          0, // total_bytes
+          0, // total_faults
+      },
+      {}, // results
   };
   // clang-format on
@@ -182,15 +202,36 @@ int main(int argc, char *argv[]) {
       // {{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2},
       // {{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2},
       // {{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2},
-      {{"READ 4x2_simd", "READ 4x2_simd WITH MALLOC"}, test_read_4x2_simd},
-      {{"READ 8x2_simd", "READ 8x2_simd WITH MALLOC"}, test_read_8x2_simd},
-      {{"READ 16x2_simd", "READ 16x2_simd WITH MALLOC"}, test_read_16x2_simd},
-      {{"READ 32x2_simd_offset", "READ 32x2_simd_offset WITH MALLOC"},
-       test_read_32x2_simd_offset},
-      {{"READ 32x2_simd_no_offset", "READ 32x2_simd_no_offset WITH MALLOC"},
-       test_read_32x2_simd_no_offset},
-      {{"READ 16x4_simd", "READ 16x4_simd WITH MALLOC"}, test_read_16x4_simd},
-      {{"READ 32x4_simd", "READ 32x4_simd WITH MALLOC"}, test_read_32x4_simd},
+      // {{"READ 4x2_simd", "READ 4x2_simd WITH MALLOC"}, test_read_4x2_simd},
+      // {{"READ 8x2_simd", "READ 8x2_simd WITH MALLOC"}, test_read_8x2_simd},
+      // {{"READ 16x2_simd", "READ 16x2_simd WITH MALLOC"},
+      // test_read_16x2_simd},
+      // {{"READ 32x2_simd_offset", "READ 32x2_simd_offset WITH MALLOC"},
+      // test_read_32x2_simd_offset},
+      // {{"READ 32x2_simd_no_offset", "READ 32x2_simd_no_offset WITH MALLOC"},
+      // test_read_32x2_simd_no_offset},
+      // {{"READ 16x4_simd", "READ 16x4_simd WITH MALLOC"},
+      // test_read_16x4_simd},
+      // {{"READ 32x4_simd", "READ 32x4_simd WITH MALLOC"},
+      // test_read_32x4_simd},
+      {{"CACHE TEST 16K", "CACHE TEST 16K WITH MALLOC"}, test_cache_test_16k},
+      {{"CACHE TEST 32K", "CACHE TEST 32K WITH MALLOC"}, test_cache_test_32k},
+      {{"CACHE TEST 64K", "CACHE TEST 64K WITH MALLOC"}, test_cache_test_64k},
+      {{"CACHE TEST 128K", "CACHE TEST 128K WITH MALLOC"},
+       test_cache_test_128k},
+      {{"CACHE TEST 512K", "CACHE TEST 512K WITH MALLOC"},
+       test_cache_test_512k},
+      {{"CACHE TEST 1M", "CACHE TEST 1M WITH MALLOC"}, test_cache_test_1m},
+      {{"CACHE TEST 2M", "CACHE TEST 2M WITH MALLOC"}, test_cache_test_2m},
+      {{"CACHE TEST 4M", "CACHE TEST 4M WITH MALLOC"}, test_cache_test_4m},
+      {{"CACHE TEST 8M", "CACHE TEST 8M WITH MALLOC"}, test_cache_test_8m},
+      {{"CACHE TEST 16M", "CACHE TEST 16M WITH MALLOC"}, test_cache_test_16m},
+      {{"CACHE TEST 32M", "CACHE TEST 32M WITH MALLOC"}, test_cache_test_32m},
+      {{"CACHE TEST 64M", "CACHE TEST 64M WITH MALLOC"}, test_cache_test_64m},
+      {{"CACHE TEST 512M", "CACHE TEST 512M WITH MALLOC"},
+       test_cache_test_512m},
+      {{"CACHE TEST FULL", "CACHE TEST FULL WITH MALLOC"},
+       test_cache_test_full},
   };
 
   tester.params.read_size = get_file_length(fp);
@@ -586,7 +627,6 @@ void test_align63_loop(reptester *tester, alloc_type type) {
 }
 
 void test_align75_loop(reptester *tester, alloc_type type) {
-
   u64 start = read_cpu_timer();
   u64 fault_count_start = page_fault_count();
 
@@ -612,7 +652,6 @@ void test_align75_loop(reptester *tester, alloc_type type) {
 }
 
 void test_align90_loop(reptester *tester, alloc_type type) {
-
   u64 start = read_cpu_timer();
   u64 fault_count_start = page_fault_count();
 
@@ -1112,7 +1151,6 @@ void test_read_4x2_simd(reptester *tester, alloc_type type) {
 }
 
 void test_read_8x2_simd(reptester *tester, alloc_type type) {
-
   u64 start = read_cpu_timer();
   u64 fault_count_start = page_fault_count();
 
@@ -1138,7 +1176,6 @@ void test_read_8x2_simd(reptester *tester, alloc_type type) {
 }
 
 void test_read_16x2_simd(reptester *tester, alloc_type type) {
-
   u64 start = read_cpu_timer();
   u64 fault_count_start = page_fault_count();
 
@@ -1264,6 +1301,356 @@ void test_read_32x4_simd(reptester *tester, alloc_type type) {
   handle_free(tester, type);
 }
 
+void test_cache_test_16k(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x3fff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_32k(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x7fff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_64k(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0xffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_128k(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x1ffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_512k(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x7ffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_1m(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0xfffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_2m(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x1fffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_4m(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x3fffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_8m(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x7fffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_16m(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0xffffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_32m(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x1ffffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_64m(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x3ffffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_512m(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0x1fffffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+void test_cache_test_full(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test(tester->params.buffer, total_size, 0xffffffffffffffff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
 u64 get_file_length(FILE *fp) {
   if (!fp) {
     return 0;
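Each of the new test_cache_test_* wrappers in main.cpp differs only in the mask it passes to cache_test: the mask is the tested region size minus one (0x3fff confines reads to 16 KiB, 0xffffff to 16 MiB, and so on), which only wraps cleanly when the region size is a power of two. A minimal sketch of that relationship, using a hypothetical region_size_to_mask helper that is not part of this patch:

#include <cassert>
#include <cstdint>

// Hypothetical helper (not in the patch): maps a power-of-two region size to
// the offset mask that cache_test expects, e.g. 16 * 1024 -> 0x3fff.
static uint64_t region_size_to_mask(uint64_t region_size) {
  assert(region_size != 0 && (region_size & (region_size - 1)) == 0);
  return region_size - 1;
}

// Example use: cache_test(buffer, total_size, region_size_to_mask(16 * 1024));
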
diff --git a/haversine_02/src/repetition_testing/reptest_functions.asm b/haversine_02/src/repetition_testing/reptest_functions.asm
index 59fd2fe..e7e5bdc 100644
--- a/haversine_02/src/repetition_testing/reptest_functions.asm
+++ b/haversine_02/src/repetition_testing/reptest_functions.asm
@@ -36,459 +36,476 @@ global read_32x2_simd_offset
 global read_32x2_simd_no_offset
 global read_16x4_simd
 global read_32x4_simd
+global cache_test
 
 ; Expects 3 inputs (pointer, read_count, mask)
 mov_all_bytes_asm:
-  xor rax, rax
+    xor rax, rax
 
-  .loop:
-  mov BYTE [rdi + rax * 1], al
-  inc rax
-  cmp rsi, rax
-  jne .loop
+    .loop:
+    mov BYTE [rdi + rax * 1], al
+    inc rax
+    cmp rsi, rax
+    jne .loop
 
-  ret
+    ret
 
 nop_all_bytes_asm:
-  xor rax, rax
+    xor rax, rax
 
-  .loop:
-  db 0x0f, 0x1f, 0x00
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    .loop:
+    db 0x0f, 0x1f, 0x00
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 nop_1x3_all_bytes_asm:
-  xor rax, rax
+    xor rax, rax
 
-  .loop:
-  nop
-  nop
-  nop
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    .loop:
+    nop
+    nop
+    nop
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 nop_1x9_all_bytes_asm:
-  xor rax, rax
+    xor rax, rax
 
-  .loop:
-  nop
-  nop
-  nop
-  nop
-  nop
-  nop
-  nop
-  nop
-  nop
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    .loop:
+    nop
+    nop
+    nop
+    nop
+    nop
+    nop
+    nop
+    nop
+    nop
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 inc_all_bytes_asm:
-  xor rax, rax
+    xor rax, rax
 
-  .loop:
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    .loop:
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 dec_all_bytes_asm:
-  .loop:
-  dec rdi
-  jnz .loop
+    .loop:
+    dec rdi
+    jnz .loop
 
-  ret
+    ret
 
 align64_loop:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  .loop:
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    align 64
+    .loop:
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 align1_loop:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  nop
-  .loop:
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    align 64
+    nop
+    .loop:
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 align15_loop:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  %rep 15
-  nop
-  %endrep
-  .loop:
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    align 64
+    %rep 15
+    nop
+    %endrep
+    .loop:
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 align31_loop:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  %rep 31
-  nop
-  %endrep
-  .loop:
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    align 64
+    %rep 31
+    nop
+    %endrep
+    .loop:
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 align63_loop:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  %rep 63
-  nop
-  %endrep
-  .loop:
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    align 64
+    %rep 63
+    nop
+    %endrep
+    .loop:
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 align75_loop:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  %rep 75
-  nop
-  %endrep
-  .loop:
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    align 64
+    %rep 75
+    nop
+    %endrep
+    .loop:
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 align90_loop:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  %rep 90
-  nop
-  %endrep
-  .loop:
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    align 64
+    %rep 90
+    nop
+    %endrep
+    .loop:
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 align112_loop:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  %rep 112
-  nop
-  %endrep
-  .loop:
-  inc rax
-  cmp rdi, rax
-  jne .loop
+    align 64
+    %rep 112
+    nop
+    %endrep
+    .loop:
+    inc rax
+    cmp rdi, rax
+    jne .loop
 
-  ret
+    ret
 
 rat_add:
-  mov rax, rdi
+    mov rax, rdi
 
-  .loop:
-  add rcx, 1
-  add rcx, 1
-  dec rax
-  jnz .loop
-  ret
+    .loop:
+    add rcx, 1
+    add rcx, 1
+    dec rax
+    jnz .loop
+    ret
 
 rat_mov_add:
-  mov rax, rdi
+    mov rax, rdi
 
-  .loop:
-  mov rcx, rax
-  add rcx, 1
-  mov rcx, rax
-  add rcx, 1
-  dec rax
-  jnz .loop
-  ret
+    .loop:
+    mov rcx, rax
+    add rcx, 1
+    mov rcx, rax
+    add rcx, 1
+    dec rax
+    jnz .loop
+    ret
 
 read_1:
-  align 64
-  .loop:
-  mov rax, [rdi]
-  sub rsi, 1
-  jnle .loop
+    align 64
+    .loop:
+    mov rax, [rdi]
+    sub rsi, 1
+    jnle .loop
 
-  ret
+    ret
 
 read_2:
-  align 64
-  .loop:
-  %rep 2
-  mov rax, [rdi]
-  %endrep
-  sub rsi, 2
-  jnle .loop
+    align 64
+    .loop:
+    %rep 2
+    mov rax, [rdi]
+    %endrep
+    sub rsi, 2
+    jnle .loop
 
-  ret
+    ret
 
 read_3:
-  align 64
-  .loop:
-  %rep 3
-  mov rax, [rdi]
-  %endrep
-  sub rsi, 3
-  jnle .loop
+    align 64
+    .loop:
+    %rep 3
+    mov rax, [rdi]
+    %endrep
+    sub rsi, 3
+    jnle .loop
 
-  ret
+    ret
 
 read_4:
-  align 64
-  .loop:
-  %rep 4
-  mov rax, [rdi]
-  %endrep
-  sub rsi, 4
-  jnle .loop
+    align 64
+    .loop:
+    %rep 4
+    mov rax, [rdi]
+    %endrep
+    sub rsi, 4
+    jnle .loop
 
-  ret
+    ret
 
 read_8:
-  align 64
-  .loop:
-  %rep 8
-  mov rax, [rdi]
-  %endrep
-  sub rsi, 8
-  jnle .loop
+    align 64
+    .loop:
+    %rep 8
+    mov rax, [rdi]
+    %endrep
+    sub rsi, 8
+    jnle .loop
 
-  ret
+    ret
 
 write_1:
-  align 64
-  .loop:
-  mov QWORD [rdi], 0
-  sub rsi, 1
-  jnle .loop
+    align 64
+    .loop:
+    mov QWORD [rdi], 0
+    sub rsi, 1
+    jnle .loop
 
-  ret
+    ret
 
 write_2:
-  align 64
-  .loop:
-  %rep 2
-  mov QWORD [rdi], 0
-  %endrep
-  sub rsi, 2
-  jnle .loop
+    align 64
+    .loop:
+    %rep 2
+    mov QWORD [rdi], 0
+    %endrep
+    sub rsi, 2
+    jnle .loop
 
-  ret
+    ret
 
 write_3:
-  align 64
-  .loop:
-  %rep 3
-  mov QWORD [rdi], 0
-  %endrep
-  sub rsi, 3
-  jnle .loop
+    align 64
+    .loop:
+    %rep 3
+    mov QWORD [rdi], 0
+    %endrep
+    sub rsi, 3
+    jnle .loop
 
-  ret
+    ret
 
 write_4:
-  align 64
-  .loop:
-  %rep 4
-  mov QWORD [rdi], 0
-  %endrep
-  sub rsi, 4
-  jnle .loop
+    align 64
+    .loop:
+    %rep 4
+    mov QWORD [rdi], 0
+    %endrep
+    sub rsi, 4
+    jnle .loop
 
-  ret
+    ret
 
 write_8:
-  align 64
-  .loop:
-  %rep 8
-  mov QWORD [rdi], 0
-  %endrep
-  sub rsi, 8
-  jnle .loop
+    align 64
+    .loop:
+    %rep 8
+    mov QWORD [rdi], 0
+    %endrep
+    sub rsi, 8
+    jnle .loop
 
-  ret
+    ret
 
 read_1x2_low:
-  align 64
-  .loop:
-  %rep 2
-  mov al, [rdi]
-  %endrep
-  sub rsi, 2
-  jnle .loop
+    align 64
+    .loop:
+    %rep 2
+    mov al, [rdi]
+    %endrep
+    sub rsi, 2
+    jnle .loop
 
-  ret
+    ret
 
 read_1x2_high:
-  align 64
-  .loop:
-  %rep 2
-  mov ah, [rdi]
-  %endrep
-  sub rsi, 2
-  jnle .loop
+    align 64
+    .loop:
+    %rep 2
+    mov ah, [rdi]
+    %endrep
+    sub rsi, 2
+    jnle .loop
 
-  ret
+    ret
 
 read_2x2:
-  align 64
-  .loop:
-  %rep 2
-  mov ax, [rdi]
-  %endrep
-  sub rsi, 2
-  jnle .loop
+    align 64
+    .loop:
+    %rep 2
+    mov ax, [rdi]
+    %endrep
+    sub rsi, 2
+    jnle .loop
 
-  ret
+    ret
 
 read_4x2:
-  align 64
-  .loop:
-  %rep 2
-  mov eax, [rdi]
-  %endrep
-  sub rsi, 2
-  jnle .loop
+    align 64
+    .loop:
+    %rep 2
+    mov eax, [rdi]
+    %endrep
+    sub rsi, 2
+    jnle .loop
 
-  ret
+    ret
 
 read_8x2:
-  align 64
-  .loop:
-  %rep 2
-  mov rax, [rdi]
-  %endrep
-  sub rsi, 2
-  jnle .loop
+    align 64
+    .loop:
+    %rep 2
+    mov rax, [rdi]
+    %endrep
+    sub rsi, 2
+    jnle .loop
 
-  ret
+    ret
 
 read_4x2_simd:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  .loop:
-  mov r8d, [rdi]
-  mov r8d, [rdi + 4]
-  add rax, 8
-  cmp rax, rsi
-  jb .loop
+    align 64
+    .loop:
+    mov r8d, [rdi]
+    mov r8d, [rdi + 4]
+    add rax, 8
+    cmp rax, rsi
+    jb .loop
 
-  ret
+    ret
 
 read_8x2_simd:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  .loop:
-  mov r8, [rdi]
-  mov r8, [rdi + 8]
-  add rax, 16
-  cmp rax, rsi
-  jb .loop
+    align 64
+    .loop:
+    mov r8, [rdi]
+    mov r8, [rdi + 8]
+    add rax, 16
+    cmp rax, rsi
+    jb .loop
 
-  ret
+    ret
 
 read_16x2_simd:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  .loop:
-  vmovdqu xmm0, [rdi]
-  vmovdqu xmm0, [rdi + 16]
-  add rax, 32
-  cmp rax, rsi
-  jb .loop
+    align 64
+    .loop:
+    vmovdqu xmm0, [rdi]
+    vmovdqu xmm0, [rdi + 16]
+    add rax, 32
+    cmp rax, rsi
+    jb .loop
 
-  ret
+    ret
 
 read_32x2_simd_offset:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  .loop:
-  vmovdqu ymm0, [rdi]
-  vmovdqu ymm0, [rdi + 32]
-  add rax, 64
-  cmp rax, rsi
-  jb .loop
+    align 64
+    .loop:
+    vmovdqu ymm0, [rdi]
+    vmovdqu ymm0, [rdi + 32]
+    add rax, 64
+    cmp rax, rsi
+    jb .loop
 
-  ret
+    ret
 
 read_32x2_simd_no_offset:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  .loop:
-  vmovdqu ymm0, [rdi]
-  vmovdqu ymm0, [rdi]
-  add rax, 64
-  cmp rax, rsi
-  jb .loop
+    align 64
+    .loop:
+    vmovdqu ymm0, [rdi]
+    vmovdqu ymm0, [rdi]
+    add rax, 64
+    cmp rax, rsi
+    jb .loop
 
-  ret
+    ret
 
 read_16x4_simd:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  .loop:
-  %rep 2
-  vmovdqu xmm0, [rdi]
-  vmovdqu xmm0, [rdi + 16]
-  %endrep
-  add rax, 64
-  cmp rax, rsi
-  jb .loop
+    align 64
+    .loop:
+    %rep 2
+    vmovdqu xmm0, [rdi]
+    vmovdqu xmm0, [rdi + 16]
+    %endrep
+    add rax, 64
+    cmp rax, rsi
+    jb .loop
 
-  ret
+    ret
 
 read_32x4_simd:
-  xor rax, rax
+    xor rax, rax
 
-  align 64
-  .loop:
-  %rep 2
-  vmovdqu ymm0, [rdi]
-  vmovdqu ymm0, [rdi]
-  %endrep
-  add rax, 128
-  cmp rax, rsi
-  jb .loop
+    align 64
+    .loop:
+    %rep 2
+    vmovdqu ymm0, [rdi]
+    vmovdqu ymm0, [rdi]
+    %endrep
+    add rax, 128
+    cmp rax, rsi
+    jb .loop
 
-  ret
+    ret
+
+cache_test:
+    xor r10, r10                ; Zero loop counter
+    mov rbx, rdi                ; Save original pointer
+    .loop:
+    add rdi, r10                ; Advance the pointer
+    add r10, 128                ; Increment loop counter
+    and r10, rdx                ; Mask offset
+    vmovdqu ymm0, [rdi + 0]
+    vmovdqu ymm1, [rdi + 32]
+    vmovdqu ymm2, [rdi + 64]
+    vmovdqu ymm3, [rdi + 96]
+    mov rdi, rbx                ; Restore original pointer
+    sub rsi, 128                ; Decrement count
+    ja .loop
+    ret
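For reference, the new cache_test routine reads 128 bytes per iteration from buffer plus an offset that advances by 128 and is wrapped with the mask, so only the first (mask + 1) bytes of the buffer are ever touched while the total number of bytes read stays the same for every region size. A rough C++ equivalent of the loop, as a sketch only (the real routine uses four 32-byte AVX loads and keeps everything in registers; the sink array and memcpy here are stand-ins):

#include <cstdint>
#include <cstring>

// Sketch of what cache_test does, not the shipped routine: read 128-byte
// chunks from `buffer`, wrapping the read offset with `mask` so the working
// set is limited to (mask + 1) bytes.
static void cache_test_reference(const char *buffer, uint64_t size, uint64_t mask) {
  unsigned char sink[128];
  uint64_t offset = 0;
  while (size >= 128) {
    std::memcpy(sink, buffer + offset, 128); // stand-in for the four ymm loads
    offset = (offset + 128) & mask;          // wrap within the tested region
    size -= 128;
  }
  (void)sink; // the reads exist only to generate memory traffic
}
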