Add an unaligned variant of the cache-size read test (16K and 32K masks) and
move the repetition tester's buffers from malloc/free to mmap/munmap.

Review fix: cache_test_unaligned used rbx to hold the base pointer without
saving and restoring it. The routine takes its arguments in rdi/rsi/rdx
(System V AMD64), where rbx is callee-saved, so a C++ caller keeping a live
value in rbx across the call would see it overwritten. The base pointer is
now held in r11, a caller-saved scratch register; the loop body is otherwise
unchanged.

NOTE(review): the mmap() results are passed straight to memset() without
being compared against MAP_FAILED, and handle_alloc/handle_free test the
pointer against NULL only. Confirm the intended failure policy.

NOTE(review): the header names on the context #include lines were missing
from the input and are left as found; leading whitespace is reconstructed.

diff --git a/haversine_02/src/repetition_testing/main.cc b/haversine_02/src/repetition_testing/main.cc
index b80c339..3e6b450 100644
--- a/haversine_02/src/repetition_testing/main.cc
+++ b/haversine_02/src/repetition_testing/main.cc
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <sys/mman.h>
 #include
 
 #define ARR_LEN(ARR) sizeof(ARR) / sizeof(*ARR)
@@ -51,6 +52,7 @@ extern "C" void read_32x2_simd_no_offset(char *buffer, u64 size);
 extern "C" void read_16x4_simd(char *buffer, u64 size);
 extern "C" void read_32x4_simd(char *buffer, u64 size);
 extern "C" void cache_test(char *buffer, u64 size, u64 mask);
+extern "C" void cache_test_unaligned(char *buffer, u64 size, u64 mask);
 
 void test_fread(reptester *tester, alloc_type type);
 void test_read(reptester *tester, alloc_type type);
@@ -94,7 +96,9 @@ void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type);
 void test_read_16x4_simd(reptester *tester, alloc_type type);
 void test_read_32x4_simd(reptester *tester, alloc_type type);
 void test_cache_test_16k(reptester *tester, alloc_type type);
+void test_cache_test_16k_unaligned(reptester *tester, alloc_type type);
 void test_cache_test_32k(reptester *tester, alloc_type type);
+void test_cache_test_32k_unaligned(reptester *tester, alloc_type type);
 void test_cache_test_64k(reptester *tester, alloc_type type);
 void test_cache_test_128k(reptester *tester, alloc_type type);
 void test_cache_test_256k(reptester *tester, alloc_type type);
@@ -216,30 +220,39 @@ int main(int argc, char *argv[]) {
       // {{"READ 32x4_simd", "READ 32x4_simd WITH MALLOC"},
       // test_read_32x4_simd},
       {{"CACHE TEST 16K", "CACHE TEST 16K WITH MALLOC"}, test_cache_test_16k},
-      {{"CACHE TEST 32K", "CACHE TEST 32K WITH MALLOC"}, test_cache_test_32k},
-      {{"CACHE TEST 64K", "CACHE TEST 64K WITH MALLOC"}, test_cache_test_64k},
-      {{"CACHE TEST 128K", "CACHE TEST 128K WITH MALLOC"},
-       test_cache_test_128k},
-      {{"CACHE TEST 256K", "CACHE TEST 256K WITH MALLOC"},
-       test_cache_test_256k},
-      {{"CACHE TEST 512K", "CACHE TEST 512K WITH MALLOC"},
-       test_cache_test_512k},
-      {{"CACHE TEST 1M", "CACHE TEST 1M WITH MALLOC"}, test_cache_test_1m},
-      {{"CACHE TEST 2M", "CACHE TEST 2M WITH MALLOC"}, test_cache_test_2m},
-      {{"CACHE TEST 4M", "CACHE TEST 4M WITH MALLOC"}, test_cache_test_4m},
-      {{"CACHE TEST 8M", "CACHE TEST 8M WITH MALLOC"}, test_cache_test_8m},
-      {{"CACHE TEST 16M", "CACHE TEST 16M WITH MALLOC"}, test_cache_test_16m},
-      {{"CACHE TEST 32M", "CACHE TEST 32M WITH MALLOC"}, test_cache_test_32m},
-      {{"CACHE TEST 64M", "CACHE TEST 64M WITH MALLOC"}, test_cache_test_64m},
-      {{"CACHE TEST 512M", "CACHE TEST 512M WITH MALLOC"},
-       test_cache_test_512m},
-      {{"CACHE TEST FULL", "CACHE TEST FULL WITH MALLOC"},
-       test_cache_test_full},
+      {{"CACHE TEST 16K UNALIGNED", "CACHE TEST 16K UNALIGNED WITH MALLOC"},
+       test_cache_test_16k_unaligned},
+      // {{"CACHE TEST 32K", "CACHE TEST 32K WITH MALLOC"},
+      // test_cache_test_32k},
+      // {{"CACHE TEST 64K", "CACHE TEST 64K WITH MALLOC"},
+      // test_cache_test_64k},
+      // {{"CACHE TEST 128K", "CACHE TEST 128K WITH MALLOC"},
+      //  test_cache_test_128k},
+      // {{"CACHE TEST 256K", "CACHE TEST 256K WITH MALLOC"},
+      //  test_cache_test_256k},
+      // {{"CACHE TEST 512K", "CACHE TEST 512K WITH MALLOC"},
+      //  test_cache_test_512k},
+      // {{"CACHE TEST 1M", "CACHE TEST 1M WITH MALLOC"}, test_cache_test_1m},
+      // {{"CACHE TEST 2M", "CACHE TEST 2M WITH MALLOC"}, test_cache_test_2m},
+      // {{"CACHE TEST 4M", "CACHE TEST 4M WITH MALLOC"}, test_cache_test_4m},
+      // {{"CACHE TEST 8M", "CACHE TEST 8M WITH MALLOC"}, test_cache_test_8m},
+      // {{"CACHE TEST 16M", "CACHE TEST 16M WITH MALLOC"},
+      // test_cache_test_16m},
+      // {{"CACHE TEST 32M", "CACHE TEST 32M WITH MALLOC"},
+      // test_cache_test_32m},
+      // {{"CACHE TEST 64M", "CACHE TEST 64M WITH MALLOC"},
+      // test_cache_test_64m},
+      // {{"CACHE TEST 512M", "CACHE TEST 512M WITH MALLOC"},
+      //  test_cache_test_512m},
+      // {{"CACHE TEST FULL", "CACHE TEST FULL WITH MALLOC"},
+      //  test_cache_test_full},
   };
 
   tester.params.read_size = get_file_length(fp);
   tester.params.read_count = 1;
-  tester.params.buffer = (char *)malloc(tester.params.read_size + 1);
+  tester.params.buffer =
+      (char *)mmap(NULL, tester.params.read_size + 1, PROT_READ | PROT_WRITE,
+                   MAP_ANON | MAP_SHARED | MAP_NORESERVE, -1, 0);
   memset(tester.params.buffer, 0, tester.params.read_size + 1);
 
   for (u64 i = 0; i < tester.params.read_size; ++i) {
@@ -256,7 +269,7 @@ int main(int argc, char *argv[]) {
 
   fclose(fp);
 
-  free(tester.params.buffer);
+  munmap(tester.params.buffer, tester.params.read_size + 1);
 
   return 0;
 }
@@ -1333,6 +1346,31 @@ void test_cache_test_16k(reptester *tester, alloc_type type) {
   handle_free(tester, type);
 }
 
+void test_cache_test_16k_unaligned(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test_unaligned(tester->params.buffer, total_size, 0x3fff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
 void test_cache_test_32k(reptester *tester, alloc_type type) {
   u64 start = read_cpu_timer();
   u64 fault_count_start = page_fault_count();
@@ -1358,6 +1396,31 @@ void test_cache_test_32k(reptester *tester, alloc_type type) {
   handle_free(tester, type);
 }
 
+void test_cache_test_32k_unaligned(reptester *tester, alloc_type type) {
+  u64 start = read_cpu_timer();
+  u64 fault_count_start = page_fault_count();
+
+  handle_alloc(tester, type);
+
+  u64 total_size = tester->params.read_size * tester->params.read_count;
+
+  cache_test_unaligned(tester->params.buffer, total_size, 0x7fff);
+
+  u64 fault_count_end = page_fault_count();
+  u64 end = read_cpu_timer();
+
+  u64 read_time = end - start;
+  u64 page_faults = fault_count_end - fault_count_start;
+
+  tester->results = {
+      total_size,
+      read_time,
+      page_faults,
+  };
+
+  handle_free(tester, type);
+}
+
 void test_cache_test_64k(reptester *tester, alloc_type type) {
   u64 start = read_cpu_timer();
   u64 fault_count_start = page_fault_count();
diff --git a/haversine_02/src/repetition_testing/reptest_functions.asm b/haversine_02/src/repetition_testing/reptest_functions.asm
index e7e5bdc..65d548e 100644
--- a/haversine_02/src/repetition_testing/reptest_functions.asm
+++ b/haversine_02/src/repetition_testing/reptest_functions.asm
@@ -37,6 +37,7 @@ global read_32x2_simd_no_offset
 global read_16x4_simd
 global read_32x4_simd
 global cache_test ; Expects 3 inputs (pointer, read_count, mask)
+global cache_test_unaligned ; Expects 3 inputs (pointer, size in bytes, mask)
 
 mov_all_bytes_asm:
     xor rax, rax
@@ -509,3 +510,20 @@ cache_test:
     sub rsi, 128 ; Decrement count
     ja .loop
     ret
+
+cache_test_unaligned:
+    xor r10, r10 ; Zero loop counter
+    add rdi, 5 ; Unalign pointer
+    mov r11, rdi ; Save base pointer in a caller-saved register (rbx must be preserved)
+  .loop:
+    add rdi, r10 ; Advance the pointer
+    add r10, 128 ; Increment loop counter
+    and r10, rdx ; Mask offset
+    vmovdqu ymm0, [rdi + 0]
+    vmovdqu ymm1, [rdi + 32]
+    vmovdqu ymm2, [rdi + 64]
+    vmovdqu ymm3, [rdi + 96]
+    mov rdi, r11 ; Restore base pointer
+    sub rsi, 128 ; Decrement count
+    ja .loop
+    ret
diff --git a/haversine_02/src/repetition_testing/reptester.cc b/haversine_02/src/repetition_testing/reptester.cc
index 68f9136..9efe735 100644
--- a/haversine_02/src/repetition_testing/reptester.cc
+++ b/haversine_02/src/repetition_testing/reptester.cc
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include <sys/mman.h>
 #include
 #include
 
@@ -10,7 +11,9 @@ void handle_alloc(reptester *tester, alloc_type type) {
   switch (type) {
   case ALLOC_TYPE_WITH_MALLOC:
     if (!(tester->params.buffer)) {
-      tester->params.buffer = (char *)malloc(tester->params.read_size + 1);
+      tester->params.buffer = (char *)mmap(
+          NULL, tester->params.read_size + 1, PROT_READ | PROT_WRITE,
+          MAP_ANON | MAP_SHARED | MAP_NORESERVE, -1, 0);
       memset(tester->params.buffer, 0, tester->params.read_size + 1);
     }
 
@@ -24,7 +27,7 @@ void handle_free(reptester *tester, alloc_type type) {
   switch (type) {
   case ALLOC_TYPE_WITH_MALLOC:
     if (tester->params.buffer) {
-      free(tester->params.buffer);
+      munmap(tester->params.buffer, tester->params.read_size + 1);
       tester->params.buffer = NULL;
     }
 
@@ -58,7 +61,9 @@ void run_func_test(reptester *tester, reptest_func func, const char *func_name,
 
   if (type == ALLOC_TYPE_WITH_MALLOC) {
     buffer = tester->params.buffer;
-    tester->params.buffer = (char *)malloc(tester->params.read_size + 1);
+    tester->params.buffer =
+        (char *)mmap(NULL, tester->params.read_size + 1, PROT_READ | PROT_WRITE,
+                     MAP_ANON | MAP_SHARED | MAP_NORESERVE, -1, 0);
     memset(tester->params.buffer, 0, tester->params.read_size + 1);
   }
 
@@ -100,7 +105,7 @@ void run_func_test(reptester *tester, reptest_func func, const char *func_name,
   }
 
   if (type == ALLOC_TYPE_WITH_MALLOC) {
-    free(tester->params.buffer);
+    munmap(tester->params.buffer, tester->params.read_size + 1);
     tester->params.buffer = buffer;
   }
 