Compare commits

...

2 Commits

Author SHA1 Message Date
ba31dd9f8c Add cache testing functions 2024-05-06 22:55:44 +01:00
f355ab2d25 Update .gitignore 2024-05-06 22:55:32 +01:00
3 changed files with 754 additions and 349 deletions

View File

@ -4,6 +4,7 @@
compile_commands.json compile_commands.json
count_and_distances count_and_distances
pairs.json pairs.json
cache_test
main main
genhavr genhavr
prochavr prochavr

View File

@ -9,6 +9,9 @@
#define ARR_LEN(ARR) sizeof(ARR) / sizeof(*ARR) #define ARR_LEN(ARR) sizeof(ARR) / sizeof(*ARR)
u64 *g_cache_output = NULL;
u64 g_size = 1024 * 1024 * 1024 / 128 * sizeof(u64);
extern "C" void mov_all_bytes_asm(char *buffer, u64 size); extern "C" void mov_all_bytes_asm(char *buffer, u64 size);
extern "C" void nop_all_bytes_asm(u64 size); extern "C" void nop_all_bytes_asm(u64 size);
extern "C" void nop_1x3_all_bytes_asm(u64 size); extern "C" void nop_1x3_all_bytes_asm(u64 size);
@ -47,6 +50,7 @@ extern "C" void read_32x2_simd_offset(char *buffer, u64 size);
extern "C" void read_32x2_simd_no_offset(char *buffer, u64 size); extern "C" void read_32x2_simd_no_offset(char *buffer, u64 size);
extern "C" void read_16x4_simd(char *buffer, u64 size); extern "C" void read_16x4_simd(char *buffer, u64 size);
extern "C" void read_32x4_simd(char *buffer, u64 size); extern "C" void read_32x4_simd(char *buffer, u64 size);
extern "C" void cache_test(char *buffer, u64 size, u64 mask);
void test_fread(reptester *tester, alloc_type type); void test_fread(reptester *tester, alloc_type type);
void test_read(reptester *tester, alloc_type type); void test_read(reptester *tester, alloc_type type);
@ -89,6 +93,20 @@ void test_read_32x2_simd_offset(reptester *tester, alloc_type type);
void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type); void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type);
void test_read_16x4_simd(reptester *tester, alloc_type type); void test_read_16x4_simd(reptester *tester, alloc_type type);
void test_read_32x4_simd(reptester *tester, alloc_type type); void test_read_32x4_simd(reptester *tester, alloc_type type);
void test_cache_test_16k(reptester *tester, alloc_type type);
void test_cache_test_32k(reptester *tester, alloc_type type);
void test_cache_test_64k(reptester *tester, alloc_type type);
void test_cache_test_128k(reptester *tester, alloc_type type);
void test_cache_test_512k(reptester *tester, alloc_type type);
void test_cache_test_1m(reptester *tester, alloc_type type);
void test_cache_test_2m(reptester *tester, alloc_type type);
void test_cache_test_4m(reptester *tester, alloc_type type);
void test_cache_test_8m(reptester *tester, alloc_type type);
void test_cache_test_16m(reptester *tester, alloc_type type);
void test_cache_test_32m(reptester *tester, alloc_type type);
void test_cache_test_64m(reptester *tester, alloc_type type);
void test_cache_test_512m(reptester *tester, alloc_type type);
void test_cache_test_full(reptester *tester, alloc_type type);
u64 get_file_length(FILE *fp); u64 get_file_length(FILE *fp);
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
@ -108,6 +126,8 @@ int main(int argc, char *argv[]) {
break; break;
} }
g_cache_output = (u64 *)calloc(1, g_size);
// clang-format off // clang-format off
reptester tester = { reptester tester = {
{filename, NULL, 0, 0}, // params {filename, NULL, 0, 0}, // params
@ -118,20 +138,20 @@ int main(int argc, char *argv[]) {
read_cpu_timer(), // test_start_time read_cpu_timer(), // test_start_time
1, // current_run 1, // current_run
{ {
UINT64_MAX, // min_time UINT64_MAX, // min_time
0, // max_time 0, // max_time
0, // avg_time 0, // avg_time
0, // total_time 0, // total_time
}, },
{ {
0, // min_faults 0, // min_faults
0, // max_faults 0, // max_faults
0, // avg_faults 0, // avg_faults
0, // total_bytes 0, // total_bytes
0, // total_faults 0, // total_faults
}, },
{}, // results {}, // results
}; };
// clang-format on // clang-format on
@ -182,15 +202,36 @@ int main(int argc, char *argv[]) {
// {{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2}, // {{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2},
// {{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2}, // {{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2},
// {{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2}, // {{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2},
{{"READ 4x2_simd", "READ 4x2_simd WITH MALLOC"}, test_read_4x2_simd}, // {{"READ 4x2_simd", "READ 4x2_simd WITH MALLOC"}, test_read_4x2_simd},
{{"READ 8x2_simd", "READ 8x2_simd WITH MALLOC"}, test_read_8x2_simd}, // {{"READ 8x2_simd", "READ 8x2_simd WITH MALLOC"}, test_read_8x2_simd},
{{"READ 16x2_simd", "READ 16x2_simd WITH MALLOC"}, test_read_16x2_simd}, // {{"READ 16x2_simd", "READ 16x2_simd WITH MALLOC"},
{{"READ 32x2_simd_offset", "READ 32x2_simd_offset WITH MALLOC"}, // test_read_16x2_simd},
test_read_32x2_simd_offset}, // {{"READ 32x2_simd_offset", "READ 32x2_simd_offset WITH MALLOC"},
{{"READ 32x2_simd_no_offset", "READ 32x2_simd_no_offset WITH MALLOC"}, // test_read_32x2_simd_offset},
test_read_32x2_simd_no_offset}, // {{"READ 32x2_simd_no_offset", "READ 32x2_simd_no_offset WITH MALLOC"},
{{"READ 16x4_simd", "READ 16x4_simd WITH MALLOC"}, test_read_16x4_simd}, // test_read_32x2_simd_no_offset},
{{"READ 32x4_simd", "READ 32x4_simd WITH MALLOC"}, test_read_32x4_simd}, // {{"READ 16x4_simd", "READ 16x4_simd WITH MALLOC"},
// test_read_16x4_simd},
// {{"READ 32x4_simd", "READ 32x4_simd WITH MALLOC"},
// test_read_32x4_simd},
{{"CACHE TEST 16K", "CACHE TEST 16K WITH MALLOC"}, test_cache_test_16k},
{{"CACHE TEST 32K", "CACHE TEST 32K WITH MALLOC"}, test_cache_test_32k},
{{"CACHE TEST 64K", "CACHE TEST 64K WITH MALLOC"}, test_cache_test_64k},
{{"CACHE TEST 128K", "CACHE TEST 128K WITH MALLOC"},
test_cache_test_128k},
{{"CACHE TEST 512K", "CACHE TEST 512K WITH MALLOC"},
test_cache_test_512k},
{{"CACHE TEST 1M", "CACHE TEST 1M WITH MALLOC"}, test_cache_test_1m},
{{"CACHE TEST 2M", "CACHE TEST 2M WITH MALLOC"}, test_cache_test_2m},
{{"CACHE TEST 4M", "CACHE TEST 4M WITH MALLOC"}, test_cache_test_4m},
{{"CACHE TEST 8M", "CACHE TEST 8M WITH MALLOC"}, test_cache_test_8m},
{{"CACHE TEST 16M", "CACHE TEST 16M WITH MALLOC"}, test_cache_test_16m},
{{"CACHE TEST 32M", "CACHE TEST 32M WITH MALLOC"}, test_cache_test_32m},
{{"CACHE TEST 64M", "CACHE TEST 64M WITH MALLOC"}, test_cache_test_64m},
{{"CACHE TEST 512M", "CACHE TEST 512M WITH MALLOC"},
test_cache_test_512m},
{{"CACHE TEST FULL", "CACHE TEST FULL WITH MALLOC"},
test_cache_test_full},
}; };
tester.params.read_size = get_file_length(fp); tester.params.read_size = get_file_length(fp);
@ -586,7 +627,6 @@ void test_align63_loop(reptester *tester, alloc_type type) {
} }
void test_align75_loop(reptester *tester, alloc_type type) { void test_align75_loop(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer(); u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count(); u64 fault_count_start = page_fault_count();
@ -612,7 +652,6 @@ void test_align75_loop(reptester *tester, alloc_type type) {
} }
void test_align90_loop(reptester *tester, alloc_type type) { void test_align90_loop(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer(); u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count(); u64 fault_count_start = page_fault_count();
@ -1112,7 +1151,6 @@ void test_read_4x2_simd(reptester *tester, alloc_type type) {
} }
void test_read_8x2_simd(reptester *tester, alloc_type type) { void test_read_8x2_simd(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer(); u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count(); u64 fault_count_start = page_fault_count();
@ -1138,7 +1176,6 @@ void test_read_8x2_simd(reptester *tester, alloc_type type) {
} }
void test_read_16x2_simd(reptester *tester, alloc_type type) { void test_read_16x2_simd(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer(); u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count(); u64 fault_count_start = page_fault_count();
@ -1264,6 +1301,356 @@ void test_read_32x4_simd(reptester *tester, alloc_type type) {
handle_free(tester, type); handle_free(tester, type);
} }
void test_cache_test_16k(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x3fff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_32k(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x7fff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_64k(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0xffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_128k(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x1ffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_512k(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x7ffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_1m(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0xfffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_2m(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x1fffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_4m(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x3fffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_8m(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x7fffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_16m(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0xffffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_32m(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x1ffffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_64m(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x3ffffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_512m(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0x1fffffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
void test_cache_test_full(reptester *tester, alloc_type type) {
u64 start = read_cpu_timer();
u64 fault_count_start = page_fault_count();
handle_alloc(tester, type);
u64 total_size = tester->params.read_size * tester->params.read_count;
cache_test(tester->params.buffer, total_size, 0xffffffffffffffff);
u64 fault_count_end = page_fault_count();
u64 end = read_cpu_timer();
u64 read_time = end - start;
u64 page_faults = fault_count_end - fault_count_start;
tester->results = {
total_size,
read_time,
page_faults,
};
handle_free(tester, type);
}
u64 get_file_length(FILE *fp) { u64 get_file_length(FILE *fp) {
if (!fp) { if (!fp) {
return 0; return 0;

View File

@ -36,459 +36,476 @@ global read_32x2_simd_offset
global read_32x2_simd_no_offset global read_32x2_simd_no_offset
global read_16x4_simd global read_16x4_simd
global read_32x4_simd global read_32x4_simd
global cache_test ; Expects 3 inputs (pointer, read_count, mask)
mov_all_bytes_asm: mov_all_bytes_asm:
xor rax, rax xor rax, rax
.loop: .loop:
mov BYTE [rdi + rax * 1], al mov BYTE [rdi + rax * 1], al
inc rax inc rax
cmp rsi, rax cmp rsi, rax
jne .loop jne .loop
ret ret
nop_all_bytes_asm: nop_all_bytes_asm:
xor rax, rax xor rax, rax
.loop: .loop:
db 0x0f, 0x1f, 0x00 db 0x0f, 0x1f, 0x00
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
nop_1x3_all_bytes_asm: nop_1x3_all_bytes_asm:
xor rax, rax xor rax, rax
.loop: .loop:
nop nop
nop nop
nop nop
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
nop_1x9_all_bytes_asm: nop_1x9_all_bytes_asm:
xor rax, rax xor rax, rax
.loop: .loop:
nop nop
nop nop
nop nop
nop nop
nop nop
nop nop
nop nop
nop nop
nop nop
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
inc_all_bytes_asm: inc_all_bytes_asm:
xor rax, rax xor rax, rax
.loop: .loop:
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
dec_all_bytes_asm: dec_all_bytes_asm:
.loop: .loop:
dec rdi dec rdi
jnz .loop jnz .loop
ret ret
align64_loop: align64_loop:
xor rax, rax xor rax, rax
align 64 align 64
.loop: .loop:
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
align1_loop: align1_loop:
xor rax, rax xor rax, rax
align 64 align 64
nop nop
.loop: .loop:
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
align15_loop: align15_loop:
xor rax, rax xor rax, rax
align 64 align 64
%rep 15 %rep 15
nop nop
%endrep %endrep
.loop: .loop:
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
align31_loop: align31_loop:
xor rax, rax xor rax, rax
align 64 align 64
%rep 31 %rep 31
nop nop
%endrep %endrep
.loop: .loop:
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
align63_loop: align63_loop:
xor rax, rax xor rax, rax
align 64 align 64
%rep 63 %rep 63
nop nop
%endrep %endrep
.loop: .loop:
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
align75_loop: align75_loop:
xor rax, rax xor rax, rax
align 64 align 64
%rep 75 %rep 75
nop nop
%endrep %endrep
.loop: .loop:
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
align90_loop: align90_loop:
xor rax, rax xor rax, rax
align 64 align 64
%rep 90 %rep 90
nop nop
%endrep %endrep
.loop: .loop:
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
align112_loop: align112_loop:
xor rax, rax xor rax, rax
align 64 align 64
%rep 112 %rep 112
nop nop
%endrep %endrep
.loop: .loop:
inc rax inc rax
cmp rdi, rax cmp rdi, rax
jne .loop jne .loop
ret ret
rat_add: rat_add:
mov rax, rdi mov rax, rdi
.loop: .loop:
add rcx, 1 add rcx, 1
add rcx, 1 add rcx, 1
dec rax dec rax
jnz .loop jnz .loop
ret ret
rat_mov_add: rat_mov_add:
mov rax, rdi mov rax, rdi
.loop: .loop:
mov rcx, rax mov rcx, rax
add rcx, 1 add rcx, 1
mov rcx, rax mov rcx, rax
add rcx, 1 add rcx, 1
dec rax dec rax
jnz .loop jnz .loop
ret ret
read_1: read_1:
align 64 align 64
.loop: .loop:
mov rax, [rdi] mov rax, [rdi]
sub rsi, 1 sub rsi, 1
jnle .loop jnle .loop
ret ret
read_2: read_2:
align 64 align 64
.loop: .loop:
%rep 2 %rep 2
mov rax, [rdi] mov rax, [rdi]
%endrep %endrep
sub rsi, 2 sub rsi, 2
jnle .loop jnle .loop
ret ret
read_3: read_3:
align 64 align 64
.loop: .loop:
%rep 3 %rep 3
mov rax, [rdi] mov rax, [rdi]
%endrep %endrep
sub rsi, 3 sub rsi, 3
jnle .loop jnle .loop
ret ret
read_4: read_4:
align 64 align 64
.loop: .loop:
%rep 4 %rep 4
mov rax, [rdi] mov rax, [rdi]
%endrep %endrep
sub rsi, 4 sub rsi, 4
jnle .loop jnle .loop
ret ret
read_8: read_8:
align 64 align 64
.loop: .loop:
%rep 8 %rep 8
mov rax, [rdi] mov rax, [rdi]
%endrep %endrep
sub rsi, 8 sub rsi, 8
jnle .loop jnle .loop
ret ret
write_1: write_1:
align 64 align 64
.loop: .loop:
mov QWORD [rdi], 0 mov QWORD [rdi], 0
sub rsi, 1 sub rsi, 1
jnle .loop jnle .loop
ret ret
write_2: write_2:
align 64 align 64
.loop: .loop:
%rep 2 %rep 2
mov QWORD [rdi], 0 mov QWORD [rdi], 0
%endrep %endrep
sub rsi, 2 sub rsi, 2
jnle .loop jnle .loop
ret ret
write_3: write_3:
align 64 align 64
.loop: .loop:
%rep 3 %rep 3
mov QWORD [rdi], 0 mov QWORD [rdi], 0
%endrep %endrep
sub rsi, 3 sub rsi, 3
jnle .loop jnle .loop
ret ret
write_4: write_4:
align 64 align 64
.loop: .loop:
%rep 4 %rep 4
mov QWORD [rdi], 0 mov QWORD [rdi], 0
%endrep %endrep
sub rsi, 4 sub rsi, 4
jnle .loop jnle .loop
ret ret
write_8: write_8:
align 64 align 64
.loop: .loop:
%rep 8 %rep 8
mov QWORD [rdi], 0 mov QWORD [rdi], 0
%endrep %endrep
sub rsi, 8 sub rsi, 8
jnle .loop jnle .loop
ret ret
read_1x2_low: read_1x2_low:
align 64 align 64
.loop: .loop:
%rep 2 %rep 2
mov al, [rdi] mov al, [rdi]
%endrep %endrep
sub rsi, 2 sub rsi, 2
jnle .loop jnle .loop
ret ret
read_1x2_high: read_1x2_high:
align 64 align 64
.loop: .loop:
%rep 2 %rep 2
mov ah, [rdi] mov ah, [rdi]
%endrep %endrep
sub rsi, 2 sub rsi, 2
jnle .loop jnle .loop
ret ret
read_2x2: read_2x2:
align 64 align 64
.loop: .loop:
%rep 2 %rep 2
mov ax, [rdi] mov ax, [rdi]
%endrep %endrep
sub rsi, 2 sub rsi, 2
jnle .loop jnle .loop
ret ret
read_4x2: read_4x2:
align 64 align 64
.loop: .loop:
%rep 2 %rep 2
mov eax, [rdi] mov eax, [rdi]
%endrep %endrep
sub rsi, 2 sub rsi, 2
jnle .loop jnle .loop
ret ret
read_8x2: read_8x2:
align 64 align 64
.loop: .loop:
%rep 2 %rep 2
mov rax, [rdi] mov rax, [rdi]
%endrep %endrep
sub rsi, 2 sub rsi, 2
jnle .loop jnle .loop
ret ret
read_4x2_simd: read_4x2_simd:
xor rax, rax xor rax, rax
align 64 align 64
.loop: .loop:
mov r8d, [rdi] mov r8d, [rdi]
mov r8d, [rdi + 4] mov r8d, [rdi + 4]
add rax, 8 add rax, 8
cmp rax, rsi cmp rax, rsi
jb .loop jb .loop
ret ret
read_8x2_simd: read_8x2_simd:
xor rax, rax xor rax, rax
align 64 align 64
.loop: .loop:
mov r8, [rdi] mov r8, [rdi]
mov r8, [rdi + 8] mov r8, [rdi + 8]
add rax, 16 add rax, 16
cmp rax, rsi cmp rax, rsi
jb .loop jb .loop
ret ret
read_16x2_simd: read_16x2_simd:
xor rax, rax xor rax, rax
align 64 align 64
.loop: .loop:
vmovdqu xmm0, [rdi] vmovdqu xmm0, [rdi]
vmovdqu xmm0, [rdi + 16] vmovdqu xmm0, [rdi + 16]
add rax, 32 add rax, 32
cmp rax, rsi cmp rax, rsi
jb .loop jb .loop
ret ret
read_32x2_simd_offset: read_32x2_simd_offset:
xor rax, rax xor rax, rax
align 64 align 64
.loop: .loop:
vmovdqu ymm0, [rdi] vmovdqu ymm0, [rdi]
vmovdqu ymm0, [rdi + 32] vmovdqu ymm0, [rdi + 32]
add rax, 64 add rax, 64
cmp rax, rsi cmp rax, rsi
jb .loop jb .loop
ret ret
read_32x2_simd_no_offset: read_32x2_simd_no_offset:
xor rax, rax xor rax, rax
align 64 align 64
.loop: .loop:
vmovdqu ymm0, [rdi] vmovdqu ymm0, [rdi]
vmovdqu ymm0, [rdi] vmovdqu ymm0, [rdi]
add rax, 64 add rax, 64
cmp rax, rsi cmp rax, rsi
jb .loop jb .loop
ret ret
read_16x4_simd: read_16x4_simd:
xor rax, rax xor rax, rax
align 64 align 64
.loop: .loop:
%rep 2 %rep 2
vmovdqu xmm0, [rdi] vmovdqu xmm0, [rdi]
vmovdqu xmm0, [rdi + 16] vmovdqu xmm0, [rdi + 16]
%endrep %endrep
add rax, 64 add rax, 64
cmp rax, rsi cmp rax, rsi
jb .loop jb .loop
ret ret
read_32x4_simd: read_32x4_simd:
xor rax, rax xor rax, rax
align 64 align 64
.loop: .loop:
%rep 2 %rep 2
vmovdqu ymm0, [rdi] vmovdqu ymm0, [rdi]
vmovdqu ymm0, [rdi] vmovdqu ymm0, [rdi]
%endrep %endrep
add rax, 128 add rax, 128
cmp rax, rsi cmp rax, rsi
jb .loop jb .loop
ret ret
; cache_test(buffer=rdi, size=rsi, mask=rdx)
; Reads `size` bytes from `buffer` in 128-byte chunks, wrapping the read
; offset with `mask` so all accesses stay inside a (mask + 1)-byte window.
;
; BUGFIX: the original version used rbx to hold the base pointer, but rbx
; is callee-saved under the System V AMD64 ABI and was never pushed/popped,
; so the caller's rbx was silently clobbered.  Use caller-saved scratch
; registers (r11 for the base, rax for the effective address) instead.
cache_test:
    xor r10, r10              ; r10 = masked read offset, starts at 0
    mov r11, rdi              ; keep base pointer in a caller-saved register
.loop:
    lea rax, [r11 + r10]      ; effective address = base + masked offset
    add r10, 128              ; advance offset by one 128-byte chunk
    and r10, rdx              ; wrap offset into the working-set window
    vmovdqu ymm0, [rax + 0]   ; read 128 bytes as 4 x 32-byte loads
    vmovdqu ymm1, [rax + 32]
    vmovdqu ymm2, [rax + 64]
    vmovdqu ymm3, [rax + 96]
    sub rsi, 128              ; count down remaining bytes
    ja .loop                  ; unsigned: continue while bytes remain
    ret