Add read-width probes to the repetition tester: 4- and 8-byte general-purpose
loads, 16-byte XMM loads and 32-byte YMM loads, each issued two or four times
per loop iteration. The previous READ entries in main's test table are
commented out and the new probes are listed in their place.

Review notes:
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of this patch, guided by the hunk line counts. Confirm indent widths
    and the position of blank context lines against the tree before applying.
  * The three YMM probes now execute vzeroupper before ret.
  * Comment lines were added to the new functions and the stray blank line
    after the opening brace of three test drivers was dropped; hunk counts
    were updated to match. The post-image hashes on the index lines predate
    these edits.

diff --git a/haversine_02/src/repetition_testing/main.cpp b/haversine_02/src/repetition_testing/main.cpp
index 363b6b2..2eb5798 100644
--- a/haversine_02/src/repetition_testing/main.cpp
+++ b/haversine_02/src/repetition_testing/main.cpp
@@ -40,6 +40,13 @@ extern "C" void read_1x2_high(char *buffer, u64 size);
 extern "C" void read_2x2(char *buffer, u64 size);
 extern "C" void read_4x2(char *buffer, u64 size);
 extern "C" void read_8x2(char *buffer, u64 size);
+extern "C" void read_4x2_simd(char *buffer, u64 size);
+extern "C" void read_8x2_simd(char *buffer, u64 size);
+extern "C" void read_16x2_simd(char *buffer, u64 size);
+extern "C" void read_32x2_simd_offset(char *buffer, u64 size);
+extern "C" void read_32x2_simd_no_offset(char *buffer, u64 size);
+extern "C" void read_16x4_simd(char *buffer, u64 size);
+extern "C" void read_32x4_simd(char *buffer, u64 size);
 
 void test_fread(reptester *tester, alloc_type type);
 void test_read(reptester *tester, alloc_type type);
@@ -75,6 +82,13 @@ void test_read_1x2_high(reptester *tester, alloc_type type);
 void test_read_2x2(reptester *tester, alloc_type type);
 void test_read_4x2(reptester *tester, alloc_type type);
 void test_read_8x2(reptester *tester, alloc_type type);
+void test_read_4x2_simd(reptester *tester, alloc_type type);
+void test_read_8x2_simd(reptester *tester, alloc_type type);
+void test_read_16x2_simd(reptester *tester, alloc_type type);
+void test_read_32x2_simd_offset(reptester *tester, alloc_type type);
+void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type);
+void test_read_16x4_simd(reptester *tester, alloc_type type);
+void test_read_32x4_simd(reptester *tester, alloc_type type);
 u64 get_file_length(FILE *fp);
 
 int main(int argc, char *argv[]) {
@@ -163,11 +177,20 @@ int main(int argc, char *argv[]) {
       // {{"WRITE 3", "WRITE 3 WITH MALLOC"}, test_write_3},
       // {{"WRITE 4", "WRITE 4 WITH MALLOC"}, test_write_4},
       // {{"WRITE 8", "WRITE 8 WITH MALLOC"}, test_write_8},
-      {{"READ 1x2 LOW", "READ 1x2 LOW WITH MALLOC"}, test_read_1x2_low},
-      {{"READ 1x2 HIGH", "READ 1x2 HIGH WITH MALLOC"}, test_read_1x2_high},
-      {{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2},
-      {{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2},
-      {{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2},
+      // {{"READ 1x2 LOW", "READ 1x2 LOW WITH MALLOC"}, test_read_1x2_low},
+      // {{"READ 1x2 HIGH", "READ 1x2 HIGH WITH MALLOC"}, test_read_1x2_high},
+      // {{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2},
+      // {{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2},
+      // {{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2},
+      {{"READ 4x2_simd", "READ 4x2_simd WITH MALLOC"}, test_read_4x2_simd},
+      {{"READ 8x2_simd", "READ 8x2_simd WITH MALLOC"}, test_read_8x2_simd},
+      {{"READ 16x2_simd", "READ 16x2_simd WITH MALLOC"}, test_read_16x2_simd},
+      {{"READ 32x2_simd_offset", "READ 32x2_simd_offset WITH MALLOC"},
+       test_read_32x2_simd_offset},
+      {{"READ 32x2_simd_no_offset", "READ 32x2_simd_no_offset WITH MALLOC"},
+       test_read_32x2_simd_no_offset},
+      {{"READ 16x4_simd", "READ 16x4_simd WITH MALLOC"}, test_read_16x4_simd},
+      {{"READ 32x4_simd", "READ 32x4_simd WITH MALLOC"}, test_read_32x4_simd},
   };
 
   tester.params.read_size = get_file_length(fp);
@@ -1063,6 +1086,188 @@ void test_read_8x2(reptester *tester, alloc_type type) {
   handle_free(tester, type);
 }
 
+// Times read_4x2_simd; handle_alloc is inside the timed span.
+void test_read_4x2_simd(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  read_4x2_simd(tester->params.buffer, total_size);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+// Times read_8x2_simd; handle_alloc is inside the timed span.
+void test_read_8x2_simd(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  read_8x2_simd(tester->params.buffer, total_size);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+// Times read_16x2_simd; handle_alloc is inside the timed span.
+void test_read_16x2_simd(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  read_16x2_simd(tester->params.buffer, total_size);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+// Times read_32x2_simd_offset; handle_alloc is inside the timed span.
+void test_read_32x2_simd_offset(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  read_32x2_simd_offset(tester->params.buffer, total_size);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+// Times read_32x2_simd_no_offset; handle_alloc is inside the timed span.
+void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  read_32x2_simd_no_offset(tester->params.buffer, total_size);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+// Times read_16x4_simd; handle_alloc is inside the timed span.
+void test_read_16x4_simd(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  read_16x4_simd(tester->params.buffer, total_size);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
+// Times read_32x4_simd; handle_alloc is inside the timed span.
+void test_read_32x4_simd(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  read_32x4_simd(tester->params.buffer, total_size);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
 u64 get_file_length(FILE *fp) {
   if (!fp) {
     return 0;
diff --git a/haversine_02/src/repetition_testing/reptest_functions.asm b/haversine_02/src/repetition_testing/reptest_functions.asm
index 1a950e9..59fd2fe 100644
--- a/haversine_02/src/repetition_testing/reptest_functions.asm
+++ b/haversine_02/src/repetition_testing/reptest_functions.asm
@@ -29,6 +29,13 @@ global read_1x2_high
 global read_2x2
 global read_4x2
 global read_8x2
+global read_4x2_simd
+global read_8x2_simd
+global read_16x2_simd
+global read_32x2_simd_offset
+global read_32x2_simd_no_offset
+global read_16x4_simd
+global read_32x4_simd
 
 mov_all_bytes_asm:
     xor rax, rax
@@ -390,3 +397,118 @@ read_8x2:
     jnle .loop
 
     ret
+
+; Read-width probes. Arguments arrive as rdi = buffer, rsi = size in bytes
+; (System V AMD64 registers for the extern "C" (char *buffer, u64 size) form).
+; rdi is never advanced, so every iteration reloads the same bytes at the
+; start of the buffer; rax only tallies the bytes nominally read. The count
+; is tested after the loads, so the body executes once even when rsi is 0.
+; Two 4-byte loads per iteration through r8d (GPR loads, despite the name).
+read_4x2_simd:
+    xor rax, rax
+
+    align 64
+    .loop:
+    mov r8d, [rdi]
+    mov r8d, [rdi + 4]
+    add rax, 8
+    cmp rax, rsi
+    jb .loop
+
+    ret
+
+; Two 8-byte loads per iteration through r8 (GPR loads, despite the name).
+read_8x2_simd:
+    xor rax, rax
+
+    align 64
+    .loop:
+    mov r8, [rdi]
+    mov r8, [rdi + 8]
+    add rax, 16
+    cmp rax, rsi
+    jb .loop
+
+    ret
+
+; Two 16-byte XMM loads per iteration, at [rdi] and [rdi + 16].
+read_16x2_simd:
+    xor rax, rax
+
+    align 64
+    .loop:
+    vmovdqu xmm0, [rdi]
+    vmovdqu xmm0, [rdi + 16]
+    add rax, 32
+    cmp rax, rsi
+    jb .loop
+
+    ret
+
+; Two 32-byte YMM loads per iteration, at [rdi] and [rdi + 32].
+read_32x2_simd_offset:
+    xor rax, rax
+
+    align 64
+    .loop:
+    vmovdqu ymm0, [rdi]
+    vmovdqu ymm0, [rdi + 32]
+    add rax, 64
+    cmp rax, rsi
+    jb .loop
+
+    ; Clear the dirty upper YMM state so later non-VEX SSE code is not penalized.
+    vzeroupper
+    ret
+
+; Two 32-byte YMM loads per iteration, both at [rdi].
+read_32x2_simd_no_offset:
+    xor rax, rax
+
+    align 64
+    .loop:
+    vmovdqu ymm0, [rdi]
+    vmovdqu ymm0, [rdi]
+    add rax, 64
+    cmp rax, rsi
+    jb .loop
+
+    ; Clear the dirty upper YMM state so later non-VEX SSE code is not penalized.
+    vzeroupper
+    ret
+
+; Four 16-byte XMM loads per iteration: the [rdi], [rdi + 16] pair, twice.
+read_16x4_simd:
+    xor rax, rax
+
+    align 64
+    .loop:
+    %rep 2
+    vmovdqu xmm0, [rdi]
+    vmovdqu xmm0, [rdi + 16]
+    %endrep
+    add rax, 64
+    cmp rax, rsi
+    jb .loop
+
+    ret
+
+; Four 32-byte YMM loads per iteration, all at [rdi].
+; NOTE(review): read_16x4_simd repeats an offset pair, this repeats the
+; no-offset pair -- confirm the asymmetry is intended.
+read_32x4_simd:
+    xor rax, rax
+
+    align 64
+    .loop:
+    %rep 2
+    vmovdqu ymm0, [rdi]
+    vmovdqu ymm0, [rdi]
+    %endrep
+    add rax, 128
+    cmp rax, rsi
+    jb .loop
+
+    ; Clear the dirty upper YMM state so later non-VEX SSE code is not penalized.
+    vzeroupper
+    ret