SIMD homework
This commit is contained in:
parent
5c97a99839
commit
46ee06406f
@ -40,6 +40,13 @@ extern "C" void read_1x2_high(char *buffer, u64 size);
|
||||
extern "C" void read_2x2(char *buffer, u64 size);
|
||||
extern "C" void read_4x2(char *buffer, u64 size);
|
||||
extern "C" void read_8x2(char *buffer, u64 size);
|
||||
// Read kernels added for the SIMD homework. They are implemented in the
// assembly file (exported there with `global`) and declared with C linkage so
// the unmangled labels resolve at link time. Each one is called with the
// buffer pointer and a size; the assembly uses the size as the byte total at
// which its loop stops.
extern "C" void read_4x2_simd(char *buffer, u64 size);
|
||||
extern "C" void read_8x2_simd(char *buffer, u64 size);
|
||||
extern "C" void read_16x2_simd(char *buffer, u64 size);
|
||||
extern "C" void read_32x2_simd_offset(char *buffer, u64 size);
|
||||
extern "C" void read_32x2_simd_no_offset(char *buffer, u64 size);
|
||||
extern "C" void read_16x4_simd(char *buffer, u64 size);
|
||||
extern "C" void read_32x4_simd(char *buffer, u64 size);
|
||||
|
||||
void test_fread(reptester *tester, alloc_type type);
|
||||
void test_read(reptester *tester, alloc_type type);
|
||||
@ -75,6 +82,13 @@ void test_read_1x2_high(reptester *tester, alloc_type type);
|
||||
void test_read_2x2(reptester *tester, alloc_type type);
|
||||
void test_read_4x2(reptester *tester, alloc_type type);
|
||||
void test_read_8x2(reptester *tester, alloc_type type);
|
||||
// Harness entry points for the SIMD read kernels, one per kernel. Each is
// defined further down in this file and is referenced from the test table in
// main().
void test_read_4x2_simd(reptester *tester, alloc_type type);
|
||||
void test_read_8x2_simd(reptester *tester, alloc_type type);
|
||||
void test_read_16x2_simd(reptester *tester, alloc_type type);
|
||||
void test_read_32x2_simd_offset(reptester *tester, alloc_type type);
|
||||
void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type);
|
||||
void test_read_16x4_simd(reptester *tester, alloc_type type);
|
||||
void test_read_32x4_simd(reptester *tester, alloc_type type);
|
||||
u64 get_file_length(FILE *fp);
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
@ -163,11 +177,20 @@ int main(int argc, char *argv[]) {
|
||||
// {{"WRITE 3", "WRITE 3 WITH MALLOC"}, test_write_3},
|
||||
// {{"WRITE 4", "WRITE 4 WITH MALLOC"}, test_write_4},
|
||||
// {{"WRITE 8", "WRITE 8 WITH MALLOC"}, test_write_8},
|
||||
{{"READ 1x2 LOW", "READ 1x2 LOW WITH MALLOC"}, test_read_1x2_low},
|
||||
{{"READ 1x2 HIGH", "READ 1x2 HIGH WITH MALLOC"}, test_read_1x2_high},
|
||||
{{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2},
|
||||
{{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2},
|
||||
{{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2},
|
||||
// {{"READ 1x2 LOW", "READ 1x2 LOW WITH MALLOC"}, test_read_1x2_low},
|
||||
// {{"READ 1x2 HIGH", "READ 1x2 HIGH WITH MALLOC"}, test_read_1x2_high},
|
||||
// {{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2},
|
||||
// {{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2},
|
||||
// {{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2},
|
||||
{{"READ 4x2_simd", "READ 4x2_simd WITH MALLOC"}, test_read_4x2_simd},
|
||||
{{"READ 8x2_simd", "READ 8x2_simd WITH MALLOC"}, test_read_8x2_simd},
|
||||
{{"READ 16x2_simd", "READ 16x2_simd WITH MALLOC"}, test_read_16x2_simd},
|
||||
{{"READ 32x2_simd_offset", "READ 32x2_simd_offset WITH MALLOC"},
|
||||
test_read_32x2_simd_offset},
|
||||
{{"READ 32x2_simd_no_offset", "READ 32x2_simd_no_offset WITH MALLOC"},
|
||||
test_read_32x2_simd_no_offset},
|
||||
{{"READ 16x4_simd", "READ 16x4_simd WITH MALLOC"}, test_read_16x4_simd},
|
||||
{{"READ 32x4_simd", "READ 32x4_simd WITH MALLOC"}, test_read_32x4_simd},
|
||||
};
|
||||
|
||||
tester.params.read_size = get_file_length(fp);
|
||||
@ -1063,6 +1086,184 @@ void test_read_8x2(reptester *tester, alloc_type type) {
|
||||
handle_free(tester, type);
|
||||
}
|
||||
|
||||
void test_read_4x2_simd(reptester *tester, alloc_type type) {
|
||||
u64 start = read_cpu_timer();
|
||||
u64 fault_count_start = page_fault_count();
|
||||
|
||||
handle_alloc(tester, type);
|
||||
|
||||
u64 total_size = tester->params.read_size * tester->params.read_count;
|
||||
|
||||
read_4x2_simd(tester->params.buffer, total_size);
|
||||
|
||||
u64 fault_count_end = page_fault_count();
|
||||
u64 end = read_cpu_timer();
|
||||
|
||||
u64 read_time = end - start;
|
||||
u64 page_faults = fault_count_end - fault_count_start;
|
||||
|
||||
tester->results = {
|
||||
total_size,
|
||||
read_time,
|
||||
page_faults,
|
||||
};
|
||||
|
||||
handle_free(tester, type);
|
||||
}
|
||||
|
||||
void test_read_8x2_simd(reptester *tester, alloc_type type) {
|
||||
|
||||
u64 start = read_cpu_timer();
|
||||
u64 fault_count_start = page_fault_count();
|
||||
|
||||
handle_alloc(tester, type);
|
||||
|
||||
u64 total_size = tester->params.read_size * tester->params.read_count;
|
||||
|
||||
read_8x2_simd(tester->params.buffer, total_size);
|
||||
|
||||
u64 fault_count_end = page_fault_count();
|
||||
u64 end = read_cpu_timer();
|
||||
|
||||
u64 read_time = end - start;
|
||||
u64 page_faults = fault_count_end - fault_count_start;
|
||||
|
||||
tester->results = {
|
||||
total_size,
|
||||
read_time,
|
||||
page_faults,
|
||||
};
|
||||
|
||||
handle_free(tester, type);
|
||||
}
|
||||
|
||||
void test_read_16x2_simd(reptester *tester, alloc_type type) {
|
||||
|
||||
u64 start = read_cpu_timer();
|
||||
u64 fault_count_start = page_fault_count();
|
||||
|
||||
handle_alloc(tester, type);
|
||||
|
||||
u64 total_size = tester->params.read_size * tester->params.read_count;
|
||||
|
||||
read_16x2_simd(tester->params.buffer, total_size);
|
||||
|
||||
u64 fault_count_end = page_fault_count();
|
||||
u64 end = read_cpu_timer();
|
||||
|
||||
u64 read_time = end - start;
|
||||
u64 page_faults = fault_count_end - fault_count_start;
|
||||
|
||||
tester->results = {
|
||||
total_size,
|
||||
read_time,
|
||||
page_faults,
|
||||
};
|
||||
|
||||
handle_free(tester, type);
|
||||
}
|
||||
|
||||
void test_read_32x2_simd_offset(reptester *tester, alloc_type type) {
|
||||
u64 start = read_cpu_timer();
|
||||
u64 fault_count_start = page_fault_count();
|
||||
|
||||
handle_alloc(tester, type);
|
||||
|
||||
u64 total_size = tester->params.read_size * tester->params.read_count;
|
||||
|
||||
read_32x2_simd_offset(tester->params.buffer, total_size);
|
||||
|
||||
u64 fault_count_end = page_fault_count();
|
||||
u64 end = read_cpu_timer();
|
||||
|
||||
u64 read_time = end - start;
|
||||
u64 page_faults = fault_count_end - fault_count_start;
|
||||
|
||||
tester->results = {
|
||||
total_size,
|
||||
read_time,
|
||||
page_faults,
|
||||
};
|
||||
|
||||
handle_free(tester, type);
|
||||
}
|
||||
|
||||
void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type) {
|
||||
u64 start = read_cpu_timer();
|
||||
u64 fault_count_start = page_fault_count();
|
||||
|
||||
handle_alloc(tester, type);
|
||||
|
||||
u64 total_size = tester->params.read_size * tester->params.read_count;
|
||||
|
||||
read_32x2_simd_no_offset(tester->params.buffer, total_size);
|
||||
|
||||
u64 fault_count_end = page_fault_count();
|
||||
u64 end = read_cpu_timer();
|
||||
|
||||
u64 read_time = end - start;
|
||||
u64 page_faults = fault_count_end - fault_count_start;
|
||||
|
||||
tester->results = {
|
||||
total_size,
|
||||
read_time,
|
||||
page_faults,
|
||||
};
|
||||
|
||||
handle_free(tester, type);
|
||||
}
|
||||
|
||||
void test_read_16x4_simd(reptester *tester, alloc_type type) {
|
||||
|
||||
u64 start = read_cpu_timer();
|
||||
u64 fault_count_start = page_fault_count();
|
||||
|
||||
handle_alloc(tester, type);
|
||||
|
||||
u64 total_size = tester->params.read_size * tester->params.read_count;
|
||||
|
||||
read_16x4_simd(tester->params.buffer, total_size);
|
||||
|
||||
u64 fault_count_end = page_fault_count();
|
||||
u64 end = read_cpu_timer();
|
||||
|
||||
u64 read_time = end - start;
|
||||
u64 page_faults = fault_count_end - fault_count_start;
|
||||
|
||||
tester->results = {
|
||||
total_size,
|
||||
read_time,
|
||||
page_faults,
|
||||
};
|
||||
|
||||
handle_free(tester, type);
|
||||
}
|
||||
|
||||
void test_read_32x4_simd(reptester *tester, alloc_type type) {
|
||||
u64 start = read_cpu_timer();
|
||||
u64 fault_count_start = page_fault_count();
|
||||
|
||||
handle_alloc(tester, type);
|
||||
|
||||
u64 total_size = tester->params.read_size * tester->params.read_count;
|
||||
|
||||
read_32x4_simd(tester->params.buffer, total_size);
|
||||
|
||||
u64 fault_count_end = page_fault_count();
|
||||
u64 end = read_cpu_timer();
|
||||
|
||||
u64 read_time = end - start;
|
||||
u64 page_faults = fault_count_end - fault_count_start;
|
||||
|
||||
tester->results = {
|
||||
total_size,
|
||||
read_time,
|
||||
page_faults,
|
||||
};
|
||||
|
||||
handle_free(tester, type);
|
||||
}
|
||||
|
||||
u64 get_file_length(FILE *fp) {
|
||||
if (!fp) {
|
||||
return 0;
|
||||
|
@ -29,6 +29,13 @@ global read_1x2_high
|
||||
global read_2x2
|
||||
global read_4x2
|
||||
global read_8x2
|
||||
; Export the SIMD-homework read kernels defined later in this file so the
; C++ harness, which declares them `extern "C"`, can link against them.
global read_4x2_simd
|
||||
global read_8x2_simd
|
||||
global read_16x2_simd
|
||||
global read_32x2_simd_offset
|
||||
global read_32x2_simd_no_offset
|
||||
global read_16x4_simd
|
||||
global read_32x4_simd
|
||||
|
||||
mov_all_bytes_asm:
|
||||
xor rax, rax
|
||||
@ -390,3 +397,98 @@ read_8x2:
|
||||
jnle .loop
|
||||
|
||||
ret
|
||||
|
||||
; read_4x2_simd: two 4-byte loads per iteration.
;   rdi = address to read from (the `buffer` argument of the C prototype)
;   rsi = byte total at which the loop stops (the `size` argument)
; rax counts bytes in steps of 8. rdi is never advanced, so the same 8 bytes
; are re-read on every iteration. The exit test sits at the bottom, so the
; loads execute at least once even when rsi is 0.
; NOTE(review): despite the _simd suffix this routine uses the general-purpose
; register r8d, not a vector register.
read_4x2_simd:
|
||||
xor rax, rax
|
||||
|
||||
; Put the loop head on a 64-byte boundary.
align 64
|
||||
.loop:
|
||||
mov r8d, [rdi]
|
||||
mov r8d, [rdi + 4]
|
||||
add rax, 8
|
||||
cmp rax, rsi
|
||||
jb .loop
|
||||
|
||||
ret
|
||||
|
||||
; read_8x2_simd: two 8-byte loads per iteration.
;   rdi = address to read from (the `buffer` argument of the C prototype)
;   rsi = byte total at which the loop stops (the `size` argument)
; rax counts bytes in steps of 16. rdi is never advanced, so the same 16 bytes
; are re-read on every iteration. The exit test sits at the bottom, so the
; loads execute at least once even when rsi is 0.
; NOTE(review): despite the _simd suffix this routine uses the general-purpose
; register r8, not a vector register.
read_8x2_simd:
|
||||
xor rax, rax
|
||||
|
||||
; Put the loop head on a 64-byte boundary.
align 64
|
||||
.loop:
|
||||
mov r8, [rdi]
|
||||
mov r8, [rdi + 8]
|
||||
add rax, 16
|
||||
cmp rax, rsi
|
||||
jb .loop
|
||||
|
||||
ret
|
||||
|
||||
; read_16x2_simd: two 16-byte unaligned vector loads per iteration.
;   rdi = address to read from (the `buffer` argument of the C prototype)
;   rsi = byte total at which the loop stops (the `size` argument)
; rax counts bytes in steps of 32. rdi is never advanced, so the same 32 bytes
; are re-read on every iteration. Both loads target xmm0; the loaded data is
; discarded. The exit test sits at the bottom, so the loads execute at least
; once even when rsi is 0.
read_16x2_simd:
|
||||
xor rax, rax
|
||||
|
||||
; Put the loop head on a 64-byte boundary.
align 64
|
||||
.loop:
|
||||
vmovdqu xmm0, [rdi]
|
||||
vmovdqu xmm0, [rdi + 16]
|
||||
add rax, 32
|
||||
cmp rax, rsi
|
||||
jb .loop
|
||||
|
||||
ret
|
||||
|
||||
; read_32x2_simd_offset: two 32-byte unaligned vector loads per iteration, the
; second one 32 bytes past the first.
;   rdi = address to read from (the `buffer` argument of the C prototype)
;   rsi = byte total at which the loop stops (the `size` argument)
; rax counts bytes in steps of 64. rdi is never advanced, so the same 64 bytes
; are re-read on every iteration. Both loads target ymm0; the loaded data is
; discarded. The exit test sits at the bottom, so the loads execute at least
; once even when rsi is 0.
read_32x2_simd_offset:
    xor rax, rax

; Put the loop head on a 64-byte boundary.
align 64
.loop:
    vmovdqu ymm0, [rdi]
    vmovdqu ymm0, [rdi + 32]
    add rax, 64
    cmp rax, rsi
    jb .loop

    ; The 256-bit loads leave the upper half of ymm0 in use. Clear the upper
    ; vector state before returning to compiled code so that any legacy-SSE
    ; instructions executed there do not pay the AVX/SSE transition penalty.
    ; This sits outside the loop, so the measured instruction stream is
    ; unchanged.
    vzeroupper
    ret
|
||||
|
||||
; read_32x2_simd_no_offset: two 32-byte unaligned vector loads per iteration,
; both from the very same address (contrast with read_32x2_simd_offset, whose
; second load is at +32).
;   rdi = address to read from (the `buffer` argument of the C prototype)
;   rsi = byte total at which the loop stops (the `size` argument)
; rax still counts 64 bytes per iteration even though only 32 distinct bytes
; are touched. rdi is never advanced. Both loads target ymm0; the loaded data
; is discarded. The exit test sits at the bottom, so the loads execute at
; least once even when rsi is 0.
read_32x2_simd_no_offset:
    xor rax, rax

; Put the loop head on a 64-byte boundary.
align 64
.loop:
    vmovdqu ymm0, [rdi]
    vmovdqu ymm0, [rdi]
    add rax, 64
    cmp rax, rsi
    jb .loop

    ; The 256-bit loads leave the upper half of ymm0 in use. Clear the upper
    ; vector state before returning to compiled code so that any legacy-SSE
    ; instructions executed there do not pay the AVX/SSE transition penalty.
    ; This sits outside the loop, so the measured instruction stream is
    ; unchanged.
    vzeroupper
    ret
|
||||
|
||||
; read_16x4_simd: four 16-byte unaligned vector loads per iteration.
;   rdi = address to read from (the `buffer` argument of the C prototype)
;   rsi = byte total at which the loop stops (the `size` argument)
; The %rep block emits the load pair twice, so one iteration is
; [rdi], [rdi+16], [rdi], [rdi+16]. rax counts 64 bytes per iteration although
; only 32 distinct bytes are touched. rdi is never advanced. All loads target
; xmm0; the loaded data is discarded. The exit test sits at the bottom, so the
; loads execute at least once even when rsi is 0.
read_16x4_simd:
|
||||
xor rax, rax
|
||||
|
||||
; Put the loop head on a 64-byte boundary.
align 64
|
||||
.loop:
|
||||
; Assemble-time repetition: the two loads below are emitted twice.
%rep 2
|
||||
vmovdqu xmm0, [rdi]
|
||||
vmovdqu xmm0, [rdi + 16]
|
||||
%endrep
|
||||
add rax, 64
|
||||
cmp rax, rsi
|
||||
jb .loop
|
||||
|
||||
ret
|
||||
|
||||
; read_32x4_simd: four 32-byte unaligned vector loads per iteration.
;   rdi = address to read from (the `buffer` argument of the C prototype)
;   rsi = byte total at which the loop stops (the `size` argument)
; The %rep block emits the load pair twice, so one iteration performs four
; loads, all from [rdi]. rax counts 128 bytes per iteration although only 32
; distinct bytes are touched. rdi is never advanced. All loads target ymm0;
; the loaded data is discarded. The exit test sits at the bottom, so the loads
; execute at least once even when rsi is 0.
; NOTE(review): read_16x4_simd alternates [rdi] / [rdi + 16] while this routine
; uses no offset at all - confirm that the difference is intended.
read_32x4_simd:
    xor rax, rax

; Put the loop head on a 64-byte boundary.
align 64
.loop:
    ; Assemble-time repetition: the two loads below are emitted twice.
%rep 2
    vmovdqu ymm0, [rdi]
    vmovdqu ymm0, [rdi]
%endrep
    add rax, 128
    cmp rax, rsi
    jb .loop

    ; The 256-bit loads leave the upper half of ymm0 in use. Clear the upper
    ; vector state before returning to compiled code so that any legacy-SSE
    ; instructions executed there do not pay the AVX/SSE transition penalty.
    ; This sits outside the loop, so the measured instruction stream is
    ; unchanged.
    vzeroupper
    ret
|
||||
|
Loading…
Reference in New Issue
Block a user