SIMD homework

This commit is contained in:
Abdelrahman Said 2024-04-13 21:31:00 +01:00
parent 5c97a99839
commit 46ee06406f
2 changed files with 308 additions and 5 deletions

View File

@ -40,6 +40,13 @@ extern "C" void read_1x2_high(char *buffer, u64 size);
extern "C" void read_2x2(char *buffer, u64 size); extern "C" void read_2x2(char *buffer, u64 size);
extern "C" void read_4x2(char *buffer, u64 size); extern "C" void read_4x2(char *buffer, u64 size);
extern "C" void read_8x2(char *buffer, u64 size); extern "C" void read_8x2(char *buffer, u64 size);
// Wider-load read routines exported by the assembly file (declared `global`
// there). Each takes the buffer to read from and a byte count, and returns
// nothing to C++ callers.
extern "C" void read_4x2_simd(char *buffer, u64 size);
extern "C" void read_8x2_simd(char *buffer, u64 size);
extern "C" void read_16x2_simd(char *buffer, u64 size);
extern "C" void read_32x2_simd_offset(char *buffer, u64 size);
extern "C" void read_32x2_simd_no_offset(char *buffer, u64 size);
extern "C" void read_16x4_simd(char *buffer, u64 size);
extern "C" void read_32x4_simd(char *buffer, u64 size);
void test_fread(reptester *tester, alloc_type type); void test_fread(reptester *tester, alloc_type type);
void test_read(reptester *tester, alloc_type type); void test_read(reptester *tester, alloc_type type);
@ -75,6 +82,13 @@ void test_read_1x2_high(reptester *tester, alloc_type type);
void test_read_2x2(reptester *tester, alloc_type type); void test_read_2x2(reptester *tester, alloc_type type);
void test_read_4x2(reptester *tester, alloc_type type); void test_read_4x2(reptester *tester, alloc_type type);
void test_read_8x2(reptester *tester, alloc_type type); void test_read_8x2(reptester *tester, alloc_type type);
// Repetition-tester drivers, one per assembly read routine of the same name
// (without the `test_` prefix). Each records its measurements in
// tester->results.
void test_read_4x2_simd(reptester *tester, alloc_type type);
void test_read_8x2_simd(reptester *tester, alloc_type type);
void test_read_16x2_simd(reptester *tester, alloc_type type);
void test_read_32x2_simd_offset(reptester *tester, alloc_type type);
void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type);
void test_read_16x4_simd(reptester *tester, alloc_type type);
void test_read_32x4_simd(reptester *tester, alloc_type type);
u64 get_file_length(FILE *fp); u64 get_file_length(FILE *fp);
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
@ -163,11 +177,20 @@ int main(int argc, char *argv[]) {
// {{"WRITE 3", "WRITE 3 WITH MALLOC"}, test_write_3}, // {{"WRITE 3", "WRITE 3 WITH MALLOC"}, test_write_3},
// {{"WRITE 4", "WRITE 4 WITH MALLOC"}, test_write_4}, // {{"WRITE 4", "WRITE 4 WITH MALLOC"}, test_write_4},
// {{"WRITE 8", "WRITE 8 WITH MALLOC"}, test_write_8}, // {{"WRITE 8", "WRITE 8 WITH MALLOC"}, test_write_8},
{{"READ 1x2 LOW", "READ 1x2 LOW WITH MALLOC"}, test_read_1x2_low}, // {{"READ 1x2 LOW", "READ 1x2 LOW WITH MALLOC"}, test_read_1x2_low},
{{"READ 1x2 HIGH", "READ 1x2 HIGH WITH MALLOC"}, test_read_1x2_high}, // {{"READ 1x2 HIGH", "READ 1x2 HIGH WITH MALLOC"}, test_read_1x2_high},
{{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2}, // {{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2},
{{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2}, // {{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2},
{{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2}, // {{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2},
{{"READ 4x2_simd", "READ 4x2_simd WITH MALLOC"}, test_read_4x2_simd},
{{"READ 8x2_simd", "READ 8x2_simd WITH MALLOC"}, test_read_8x2_simd},
{{"READ 16x2_simd", "READ 16x2_simd WITH MALLOC"}, test_read_16x2_simd},
{{"READ 32x2_simd_offset", "READ 32x2_simd_offset WITH MALLOC"},
test_read_32x2_simd_offset},
{{"READ 32x2_simd_no_offset", "READ 32x2_simd_no_offset WITH MALLOC"},
test_read_32x2_simd_no_offset},
{{"READ 16x4_simd", "READ 16x4_simd WITH MALLOC"}, test_read_16x4_simd},
{{"READ 32x4_simd", "READ 32x4_simd WITH MALLOC"}, test_read_32x4_simd},
}; };
tester.params.read_size = get_file_length(fp); tester.params.read_size = get_file_length(fp);
@ -1063,6 +1086,184 @@ void test_read_8x2(reptester *tester, alloc_type type) {
handle_free(tester, type); handle_free(tester, type);
} }
// Runs read_4x2_simd once over read_size * read_count bytes and stores
// { byte count, timer delta, page-fault delta } in tester->results.
// Both samples are taken around handle_alloc() as well as the read itself,
// so the allocation is part of what gets measured.
void test_read_4x2_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_4x2_simd(tester->params.buffer, byte_count);

  // Sample in the reverse order of the opening pair: faults first, timer last.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
// Runs read_8x2_simd once over read_size * read_count bytes and stores
// { byte count, timer delta, page-fault delta } in tester->results.
// Both samples are taken around handle_alloc() as well as the read itself,
// so the allocation is part of what gets measured.
void test_read_8x2_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_8x2_simd(tester->params.buffer, byte_count);

  // Sample in the reverse order of the opening pair: faults first, timer last.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
// Runs read_16x2_simd once over read_size * read_count bytes and stores
// { byte count, timer delta, page-fault delta } in tester->results.
// Both samples are taken around handle_alloc() as well as the read itself,
// so the allocation is part of what gets measured.
void test_read_16x2_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_16x2_simd(tester->params.buffer, byte_count);

  // Sample in the reverse order of the opening pair: faults first, timer last.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
// Runs read_32x2_simd_offset once over read_size * read_count bytes and
// stores { byte count, timer delta, page-fault delta } in tester->results.
// Both samples are taken around handle_alloc() as well as the read itself,
// so the allocation is part of what gets measured.
void test_read_32x2_simd_offset(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_32x2_simd_offset(tester->params.buffer, byte_count);

  // Sample in the reverse order of the opening pair: faults first, timer last.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
// Runs read_32x2_simd_no_offset once over read_size * read_count bytes and
// stores { byte count, timer delta, page-fault delta } in tester->results.
// Both samples are taken around handle_alloc() as well as the read itself,
// so the allocation is part of what gets measured.
void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_32x2_simd_no_offset(tester->params.buffer, byte_count);

  // Sample in the reverse order of the opening pair: faults first, timer last.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
// Runs read_16x4_simd once over read_size * read_count bytes and stores
// { byte count, timer delta, page-fault delta } in tester->results.
// Both samples are taken around handle_alloc() as well as the read itself,
// so the allocation is part of what gets measured.
void test_read_16x4_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_16x4_simd(tester->params.buffer, byte_count);

  // Sample in the reverse order of the opening pair: faults first, timer last.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
// Runs read_32x4_simd once over read_size * read_count bytes and stores
// { byte count, timer delta, page-fault delta } in tester->results.
// Both samples are taken around handle_alloc() as well as the read itself,
// so the allocation is part of what gets measured.
void test_read_32x4_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_32x4_simd(tester->params.buffer, byte_count);

  // Sample in the reverse order of the opening pair: faults first, timer last.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
u64 get_file_length(FILE *fp) { u64 get_file_length(FILE *fp) {
if (!fp) { if (!fp) {
return 0; return 0;

View File

@ -29,6 +29,13 @@ global read_1x2_high
global read_2x2 global read_2x2
global read_4x2 global read_4x2
global read_8x2 global read_8x2
; Export the wider-load read routines so they can be linked against from
; outside this file. Naming is read_<bytes per load>x<loads per iteration>.
global read_4x2_simd
global read_8x2_simd
global read_16x2_simd
global read_32x2_simd_offset
global read_32x2_simd_no_offset
global read_16x4_simd
global read_32x4_simd
mov_all_bytes_asm: mov_all_bytes_asm:
xor rax, rax xor rax, rax
@ -390,3 +397,98 @@ read_8x2:
jnle .loop jnle .loop
ret ret
; read_4x2_simd: issues two 4-byte loads per loop iteration.
;   rdi = address to load from, rsi = byte count (first and second integer
;         arguments under the System V AMD64 convention)
;   rax = running byte counter; r8 is overwritten.
; Despite the _simd suffix, the loads here are scalar 32-bit moves.
; rdi is never advanced, so every iteration re-reads the same 8 bytes.
; The loop condition is checked at the bottom, so the body runs at least once
; even when rsi is 0.
read_4x2_simd:
xor rax, rax
; Place the loop entry on a 64-byte boundary.
align 64
.loop:
mov r8d, [rdi]
mov r8d, [rdi + 4]
; Account for the 2 * 4 bytes just loaded.
add rax, 8
cmp rax, rsi
; Unsigned compare: keep looping while rax < rsi.
jb .loop
ret
; read_8x2_simd: issues two 8-byte loads per loop iteration.
;   rdi = address to load from, rsi = byte count (first and second integer
;         arguments under the System V AMD64 convention)
;   rax = running byte counter; r8 is overwritten.
; Despite the _simd suffix, the loads here are scalar 64-bit moves.
; rdi is never advanced, so every iteration re-reads the same 16 bytes.
; The loop condition is checked at the bottom, so the body runs at least once
; even when rsi is 0.
read_8x2_simd:
xor rax, rax
; Place the loop entry on a 64-byte boundary.
align 64
.loop:
mov r8, [rdi]
mov r8, [rdi + 8]
; Account for the 2 * 8 bytes just loaded.
add rax, 16
cmp rax, rsi
; Unsigned compare: keep looping while rax < rsi.
jb .loop
ret
; read_16x2_simd: issues two 16-byte unaligned vector loads per loop iteration.
;   rdi = address to load from, rsi = byte count (first and second integer
;         arguments under the System V AMD64 convention)
;   rax = running byte counter; xmm0 is overwritten.
; rdi is never advanced, so every iteration re-reads the same 32 bytes.
; The loop condition is checked at the bottom, so the body runs at least once
; even when rsi is 0.
read_16x2_simd:
xor rax, rax
; Place the loop entry on a 64-byte boundary.
align 64
.loop:
vmovdqu xmm0, [rdi]
vmovdqu xmm0, [rdi + 16]
; Account for the 2 * 16 bytes just loaded.
add rax, 32
cmp rax, rsi
; Unsigned compare: keep looping while rax < rsi.
jb .loop
ret
; read_32x2_simd_offset: issues two 32-byte unaligned vector loads per loop
; iteration, the second one 32 bytes past the first.
;   rdi = address to load from, rsi = byte count (first and second integer
;         arguments under the System V AMD64 convention)
;   rax = running byte counter; ymm0 is overwritten.
; rdi is never advanced, so every iteration re-reads the same 64 bytes.
; The loop condition is checked at the bottom, so the body runs at least once
; even when rsi is 0.
read_32x2_simd_offset:
    xor rax, rax
    ; Place the loop entry on a 64-byte boundary.
    align 64
.loop:
    vmovdqu ymm0, [rdi]
    vmovdqu ymm0, [rdi + 32]
    ; Account for the 2 * 32 bytes just loaded.
    add rax, 64
    cmp rax, rsi
    ; Unsigned compare: keep looping while rax < rsi.
    jb .loop
    ; Outside the measured loop: clear the upper halves of the YMM registers
    ; so that any legacy-SSE code executed after the return does not pay an
    ; AVX/SSE transition penalty. Vector registers are caller-saved, so this
    ; is safe for the caller.
    vzeroupper
    ret
; read_32x2_simd_no_offset: issues two 32-byte unaligned vector loads per loop
; iteration, both from the same address.
;   rdi = address to load from, rsi = byte count (first and second integer
;         arguments under the System V AMD64 convention)
;   rax = running byte counter; ymm0 is overwritten.
; rdi is never advanced and both loads use [rdi], so only the first 32 bytes
; are ever touched; the counter still advances by 64 per iteration.
; The loop condition is checked at the bottom, so the body runs at least once
; even when rsi is 0.
read_32x2_simd_no_offset:
    xor rax, rax
    ; Place the loop entry on a 64-byte boundary.
    align 64
.loop:
    vmovdqu ymm0, [rdi]
    vmovdqu ymm0, [rdi]
    ; Account for the 2 * 32 bytes just loaded.
    add rax, 64
    cmp rax, rsi
    ; Unsigned compare: keep looping while rax < rsi.
    jb .loop
    ; Outside the measured loop: clear the upper halves of the YMM registers
    ; so that any legacy-SSE code executed after the return does not pay an
    ; AVX/SSE transition penalty. Vector registers are caller-saved, so this
    ; is safe for the caller.
    vzeroupper
    ret
; read_16x4_simd: issues four 16-byte unaligned vector loads per loop iteration.
;   rdi = address to load from, rsi = byte count (first and second integer
;         arguments under the System V AMD64 convention)
;   rax = running byte counter; xmm0 is overwritten.
; rdi is never advanced; the four loads alternate between [rdi] and
; [rdi + 16], so only the first 32 bytes are ever touched while the counter
; advances by 64 per iteration.
; The loop condition is checked at the bottom, so the body runs at least once
; even when rsi is 0.
read_16x4_simd:
xor rax, rax
; Place the loop entry on a 64-byte boundary.
align 64
.loop:
; Emit the load pair twice at assembly time (4 loads in the loop body).
%rep 2
vmovdqu xmm0, [rdi]
vmovdqu xmm0, [rdi + 16]
%endrep
; Account for the 4 * 16 bytes just loaded.
add rax, 64
cmp rax, rsi
; Unsigned compare: keep looping while rax < rsi.
jb .loop
ret
; read_32x4_simd: issues four 32-byte unaligned vector loads per loop
; iteration, all from the same address.
;   rdi = address to load from, rsi = byte count (first and second integer
;         arguments under the System V AMD64 convention)
;   rax = running byte counter; ymm0 is overwritten.
; rdi is never advanced and every load uses [rdi] (unlike read_16x4_simd,
; which alternates between two offsets), so only the first 32 bytes are ever
; touched while the counter advances by 128 per iteration.
; The loop condition is checked at the bottom, so the body runs at least once
; even when rsi is 0.
read_32x4_simd:
    xor rax, rax
    ; Place the loop entry on a 64-byte boundary.
    align 64
.loop:
    ; Emit the load pair twice at assembly time (4 loads in the loop body).
%rep 2
    vmovdqu ymm0, [rdi]
    vmovdqu ymm0, [rdi]
%endrep
    ; Account for the 4 * 32 bytes just loaded.
    add rax, 128
    cmp rax, rsi
    ; Unsigned compare: keep looping while rax < rsi.
    jb .loop
    ; Outside the measured loop: clear the upper halves of the YMM registers
    ; so that any legacy-SSE code executed after the return does not pay an
    ; AVX/SSE transition penalty. Vector registers are caller-saved, so this
    ; is safe for the caller.
    vzeroupper
    ret