SIMD homework
This commit is contained in:
		@@ -40,6 +40,13 @@ extern "C" void read_1x2_high(char *buffer, u64 size);
 | 
			
		||||
// Read routines with C linkage (unmangled names); their bodies are not in
// this translation unit. Each takes a buffer pointer and a byte count.
extern "C" {
void read_2x2(char *buffer, u64 size);
void read_4x2(char *buffer, u64 size);
void read_8x2(char *buffer, u64 size);
void read_4x2_simd(char *buffer, u64 size);
void read_8x2_simd(char *buffer, u64 size);
void read_16x2_simd(char *buffer, u64 size);
void read_32x2_simd_offset(char *buffer, u64 size);
void read_32x2_simd_no_offset(char *buffer, u64 size);
void read_16x4_simd(char *buffer, u64 size);
void read_32x4_simd(char *buffer, u64 size);
}
 | 
			
		||||
 | 
			
		||||
void test_fread(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read(reptester *tester, alloc_type type);
 | 
			
		||||
@@ -75,6 +82,13 @@ void test_read_1x2_high(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_2x2(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_4x2(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_8x2(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_4x2_simd(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_8x2_simd(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_16x2_simd(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_32x2_simd_offset(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_16x4_simd(reptester *tester, alloc_type type);
 | 
			
		||||
void test_read_32x4_simd(reptester *tester, alloc_type type);
 | 
			
		||||
u64 get_file_length(FILE *fp);
 | 
			
		||||
 | 
			
		||||
int main(int argc, char *argv[]) {
 | 
			
		||||
@@ -163,11 +177,20 @@ int main(int argc, char *argv[]) {
 | 
			
		||||
      // {{"WRITE 3", "WRITE 3 WITH MALLOC"}, test_write_3},
 | 
			
		||||
      // {{"WRITE 4", "WRITE 4 WITH MALLOC"}, test_write_4},
 | 
			
		||||
      // {{"WRITE 8", "WRITE 8 WITH MALLOC"}, test_write_8},
 | 
			
		||||
      {{"READ 1x2 LOW", "READ 1x2 LOW WITH MALLOC"}, test_read_1x2_low},
 | 
			
		||||
      {{"READ 1x2 HIGH", "READ 1x2 HIGH WITH MALLOC"}, test_read_1x2_high},
 | 
			
		||||
      {{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2},
 | 
			
		||||
      {{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2},
 | 
			
		||||
      {{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2},
 | 
			
		||||
      // {{"READ 1x2 LOW", "READ 1x2 LOW WITH MALLOC"}, test_read_1x2_low},
 | 
			
		||||
      // {{"READ 1x2 HIGH", "READ 1x2 HIGH WITH MALLOC"}, test_read_1x2_high},
 | 
			
		||||
      // {{"READ 2x2", "READ 2x2 WITH MALLOC"}, test_read_2x2},
 | 
			
		||||
      // {{"READ 4x2", "READ 4x2 WITH MALLOC"}, test_read_4x2},
 | 
			
		||||
      // {{"READ 8x2", "READ 8x2 WITH MALLOC"}, test_read_8x2},
 | 
			
		||||
      {{"READ 4x2_simd", "READ 4x2_simd WITH MALLOC"}, test_read_4x2_simd},
 | 
			
		||||
      {{"READ 8x2_simd", "READ 8x2_simd WITH MALLOC"}, test_read_8x2_simd},
 | 
			
		||||
      {{"READ 16x2_simd", "READ 16x2_simd WITH MALLOC"}, test_read_16x2_simd},
 | 
			
		||||
      {{"READ 32x2_simd_offset", "READ 32x2_simd_offset WITH MALLOC"},
 | 
			
		||||
       test_read_32x2_simd_offset},
 | 
			
		||||
      {{"READ 32x2_simd_no_offset", "READ 32x2_simd_no_offset WITH MALLOC"},
 | 
			
		||||
       test_read_32x2_simd_no_offset},
 | 
			
		||||
      {{"READ 16x4_simd", "READ 16x4_simd WITH MALLOC"}, test_read_16x4_simd},
 | 
			
		||||
      {{"READ 32x4_simd", "READ 32x4_simd WITH MALLOC"}, test_read_32x4_simd},
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  tester.params.read_size = get_file_length(fp);
 | 
			
		||||
@@ -1063,6 +1086,184 @@ void test_read_8x2(reptester *tester, alloc_type type) {
 | 
			
		||||
  handle_free(tester, type);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Repetition-tester case for read_4x2_simd.
//
// The measured window opens before handle_alloc() and closes after the read
// routine returns, so whatever handle_alloc() does for `type` is included in
// both the timer delta and the page-fault delta. The byte count passed to the
// routine is params.read_size * params.read_count. Results are stored as
// {byte count, read_cpu_timer() delta, page_fault_count() delta}, in that
// order, before handle_free() is called.
void test_read_4x2_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_4x2_simd(tester->params.buffer, byte_count);

  // Sample the fault counter first so the timer brackets everything.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
 | 
			
		||||
 | 
			
		||||
// Repetition-tester case for read_8x2_simd.
//
// Timer and page-fault samples are taken before handle_alloc() and again
// after the read routine returns, so the allocation step is part of the
// measurement. The routine is asked to process
// params.read_size * params.read_count bytes. Results are stored as
// {byte count, read_cpu_timer() delta, page_fault_count() delta}.
void test_read_8x2_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_8x2_simd(tester->params.buffer, byte_count);

  // Fault counter is sampled before the closing timer read.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
 | 
			
		||||
 | 
			
		||||
// Repetition-tester case for read_16x2_simd.
//
// Brackets handle_alloc() plus one call to read_16x2_simd with a timer sample
// and a page-fault sample on each side. The byte count passed to the routine
// is params.read_size * params.read_count. Results are stored as
// {byte count, read_cpu_timer() delta, page_fault_count() delta}.
void test_read_16x2_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_16x2_simd(tester->params.buffer, byte_count);

  // Closing samples: faults first, timer last.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
 | 
			
		||||
 | 
			
		||||
// Repetition-tester case for read_32x2_simd_offset.
//
// The timer/page-fault window starts before handle_alloc() and ends after the
// read routine returns, so the allocation step is measured too. The routine
// receives params.buffer and params.read_size * params.read_count as its byte
// count. Results are stored as
// {byte count, read_cpu_timer() delta, page_fault_count() delta}.
void test_read_32x2_simd_offset(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_32x2_simd_offset(tester->params.buffer, byte_count);

  // Closing samples mirror the opening ones in reverse order.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
 | 
			
		||||
 | 
			
		||||
// Repetition-tester case for read_32x2_simd_no_offset.
//
// Samples the timer and the page-fault counter, runs handle_alloc() and one
// call to the read routine, then samples both again. The byte count passed to
// the routine is params.read_size * params.read_count. Results are stored as
// {byte count, read_cpu_timer() delta, page_fault_count() delta}.
void test_read_32x2_simd_no_offset(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_32x2_simd_no_offset(tester->params.buffer, byte_count);

  // Faults are sampled before the final timer read.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
 | 
			
		||||
 | 
			
		||||
// Repetition-tester case for read_16x4_simd.
//
// The measured window covers handle_alloc() and a single call to
// read_16x4_simd over params.read_size * params.read_count bytes. Results are
// stored as {byte count, read_cpu_timer() delta, page_fault_count() delta},
// after which handle_free() is called.
void test_read_16x4_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_16x4_simd(tester->params.buffer, byte_count);

  // Closing samples: faults first, timer last.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
 | 
			
		||||
 | 
			
		||||
// Repetition-tester case for read_32x4_simd.
//
// Opens the timer/page-fault window, runs handle_alloc(), calls
// read_32x4_simd with params.read_size * params.read_count as the byte count,
// then closes the window. Results are stored as
// {byte count, read_cpu_timer() delta, page_fault_count() delta}.
void test_read_32x4_simd(reptester *tester, alloc_type type) {
  const u64 timer_begin = read_cpu_timer();
  const u64 faults_begin = page_fault_count();

  handle_alloc(tester, type);

  const u64 byte_count = tester->params.read_size * tester->params.read_count;
  read_32x4_simd(tester->params.buffer, byte_count);

  // Fault counter is sampled before the closing timer read.
  const u64 faults_end = page_fault_count();
  const u64 timer_end = read_cpu_timer();

  tester->results = {
      byte_count,
      timer_end - timer_begin,
      faults_end - faults_begin,
  };

  handle_free(tester, type);
}
 | 
			
		||||
 | 
			
		||||
u64 get_file_length(FILE *fp) {
 | 
			
		||||
  if (!fp) {
 | 
			
		||||
    return 0;
 | 
			
		||||
 
 | 
			
		||||
@@ -29,6 +29,13 @@ global read_1x2_high
 | 
			
		||||
global read_2x2
 | 
			
		||||
global read_4x2
 | 
			
		||||
global read_8x2
 | 
			
		||||
global read_4x2_simd
 | 
			
		||||
global read_8x2_simd
 | 
			
		||||
global read_16x2_simd
 | 
			
		||||
global read_32x2_simd_offset
 | 
			
		||||
global read_32x2_simd_no_offset
 | 
			
		||||
global read_16x4_simd
 | 
			
		||||
global read_32x4_simd
 | 
			
		||||
 | 
			
		||||
mov_all_bytes_asm:
 | 
			
		||||
	xor rax, rax
 | 
			
		||||
@@ -390,3 +397,98 @@ read_8x2:
 | 
			
		||||
		jnle .loop
 | 
			
		||||
 | 
			
		||||
	ret
 | 
			
		||||
 | 
			
		||||
; read_4x2_simd
; Each loop pass issues two 4-byte loads, from [rdi] and [rdi + 4], into r8d,
; then adds 8 to the byte counter in rax and repeats while rax < rsi (unsigned).
; rdi is never advanced, so the same 8 bytes are re-read on every pass.
; Despite the _simd suffix, only the general-purpose register r8d is loaded.
; The exit test sits at the bottom, so the body runs at least once.
; NOTE(review): presumably rdi = buffer and rsi = size per the System V AMD64
; calling convention - confirm against the build target.
read_4x2_simd:
	xor eax, eax			; rax = 0 (a 32-bit write zero-extends)

	align 64
	.next:
		mov r8d, [rdi]
		mov r8d, [rdi + 4]
		add rax, 8
		cmp rax, rsi
		jb .next

	ret
 | 
			
		||||
 | 
			
		||||
; read_8x2_simd
; Each loop pass issues two 8-byte loads, from [rdi] and [rdi + 8], into r8,
; then adds 16 to the byte counter in rax and repeats while rax < rsi
; (unsigned). rdi is never advanced, so the same 16 bytes are re-read on every
; pass. Despite the _simd suffix, only the general-purpose register r8 is
; loaded. The exit test sits at the bottom, so the body runs at least once.
; NOTE(review): presumably rdi = buffer and rsi = size per the System V AMD64
; calling convention - confirm against the build target.
read_8x2_simd:
	xor eax, eax			; rax = 0 (a 32-bit write zero-extends)

	align 64
	.next:
		mov r8, [rdi]
		mov r8, [rdi + 8]
		add rax, 16
		cmp rax, rsi
		jb .next

	ret
 | 
			
		||||
 | 
			
		||||
; read_16x2_simd
; Each loop pass issues two unaligned 16-byte loads, from [rdi] and
; [rdi + 16], into xmm0, then adds 32 to the byte counter in rax and repeats
; while rax < rsi (unsigned). rdi is never advanced, so the same 32 bytes are
; re-read on every pass. The exit test sits at the bottom, so the body runs at
; least once.
; NOTE(review): presumably rdi = buffer and rsi = size per the System V AMD64
; calling convention - confirm against the build target.
read_16x2_simd:
	xor eax, eax			; rax = 0 (a 32-bit write zero-extends)

	align 64
	.next:
		vmovdqu xmm0, [rdi]
		vmovdqu xmm0, [rdi + 16]
		add rax, 32
		cmp rax, rsi
		jb .next

	ret
 | 
			
		||||
 | 
			
		||||
; read_32x2_simd_offset
; Each loop pass issues two unaligned 32-byte loads, from [rdi] and
; [rdi + 32], into ymm0, then adds 64 to the byte counter in rax and repeats
; while rax < rsi (unsigned). rdi is never advanced, so the same 64 bytes are
; re-read on every pass. The exit test sits at the bottom, so the body runs at
; least once.
; NOTE(review): presumably rdi = buffer and rsi = size per the System V AMD64
; calling convention - confirm against the build target.
read_32x2_simd_offset:
	xor rax, rax

	align 64
	.loop:
		vmovdqu ymm0, [rdi]
		vmovdqu ymm0, [rdi + 32]
		add rax, 64
		cmp rax, rsi
		jb .loop

	; The loop leaves the upper half of ymm0 dirty. Clear the upper YMM state
	; before returning so later non-VEX SSE code in the caller does not pay
	; AVX/SSE transition penalties. This sits outside the measured loop.
	vzeroupper
	ret
 | 
			
		||||
 | 
			
		||||
; read_32x2_simd_no_offset
; Each loop pass issues two unaligned 32-byte loads into ymm0, both from the
; same address [rdi], then adds 64 to the byte counter in rax and repeats
; while rax < rsi (unsigned). rdi is never advanced, so only the 32 bytes at
; [rdi] are touched even though 64 bytes are counted per pass. The exit test
; sits at the bottom, so the body runs at least once.
; NOTE(review): presumably rdi = buffer and rsi = size per the System V AMD64
; calling convention - confirm against the build target.
read_32x2_simd_no_offset:
	xor rax, rax

	align 64
	.loop:
		vmovdqu ymm0, [rdi]
		vmovdqu ymm0, [rdi]
		add rax, 64
		cmp rax, rsi
		jb .loop

	; The loop leaves the upper half of ymm0 dirty. Clear the upper YMM state
	; before returning so later non-VEX SSE code in the caller does not pay
	; AVX/SSE transition penalties. This sits outside the measured loop.
	vzeroupper
	ret
 | 
			
		||||
 | 
			
		||||
; read_16x4_simd
; Each loop pass issues four unaligned 16-byte loads into xmm0, alternating
; between [rdi] and [rdi + 16], then adds 64 to the byte counter in rax and
; repeats while rax < rsi (unsigned). rdi is never advanced, so only the 32
; bytes at [rdi, rdi + 32) are touched even though 64 bytes are counted per
; pass. The exit test sits at the bottom, so the body runs at least once.
; NOTE(review): presumably rdi = buffer and rsi = size per the System V AMD64
; calling convention - confirm against the build target.
read_16x4_simd:
	xor eax, eax			; rax = 0 (a 32-bit write zero-extends)

	align 64
	.next:
		; Two identical load pairs, written out instead of using %rep 2.
		vmovdqu xmm0, [rdi]
		vmovdqu xmm0, [rdi + 16]
		vmovdqu xmm0, [rdi]
		vmovdqu xmm0, [rdi + 16]
		add rax, 64
		cmp rax, rsi
		jb .next

	ret
 | 
			
		||||
 | 
			
		||||
; read_32x4_simd
; Each loop pass issues four unaligned 32-byte loads into ymm0 (two loads,
; repeated twice via %rep), all from the same address [rdi], then adds 128 to
; the byte counter in rax and repeats while rax < rsi (unsigned). rdi is never
; advanced, so only the 32 bytes at [rdi] are touched even though 128 bytes
; are counted per pass. The exit test sits at the bottom, so the body runs at
; least once.
; NOTE(review): presumably rdi = buffer and rsi = size per the System V AMD64
; calling convention - confirm against the build target.
read_32x4_simd:
	xor rax, rax

	align 64
	.loop:
		%rep 2
		vmovdqu ymm0, [rdi]
		vmovdqu ymm0, [rdi]
		%endrep
		add rax, 128
		cmp rax, rsi
		jb .loop

	; The loop leaves the upper half of ymm0 dirty. Clear the upper YMM state
	; before returning so later non-VEX SSE code in the caller does not pay
	; AVX/SSE transition penalties. This sits outside the measured loop.
	vzeroupper
	ret
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user