Browse Source

improve code generation for `intrinsics.unaligned_load/store` on `#simd` types

The default implementation calls memcpy on an `alloca` constant, which
seems to heavily confuse the optimizer and produces suboptimal code
overall.

Introducing this specialization simplifies the intermediate
representation produced, resulting in more efficient code.
Andrea Piseri 2 years ago
parent
commit
af63eff8d7
1 changed file with 19 additions and 5 deletions
  1. 19 5
      src/llvm_backend_proc.cpp

+ 19 - 5
src/llvm_backend_proc.cpp

@@ -2363,9 +2363,15 @@ gb_internal lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValu
 		{
 		{
 			lbValue dst = lb_build_expr(p, ce->args[0]);
 			lbValue dst = lb_build_expr(p, ce->args[0]);
 			lbValue src = lb_build_expr(p, ce->args[1]);
 			lbValue src = lb_build_expr(p, ce->args[1]);
-			src = lb_address_from_load_or_generate_local(p, src);
 			Type *t = type_deref(dst.type);
 			Type *t = type_deref(dst.type);
-			lb_mem_copy_non_overlapping(p, dst, src, lb_const_int(p->module, t_int, type_size_of(t)), false);
+
+			if (is_type_simd_vector(t)) {
+				LLVMValueRef store = LLVMBuildStore(p->builder, src.value, dst.value);
+				LLVMSetAlignment(store, 1);
+			} else {
+				src = lb_address_from_load_or_generate_local(p, src);
+				lb_mem_copy_non_overlapping(p, dst, src, lb_const_int(p->module, t_int, type_size_of(t)), false);
+			}
 			return {};
 			return {};
 		}
 		}
 
 
@@ -2373,9 +2379,17 @@ gb_internal lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValu
 		{
 		{
 			lbValue src = lb_build_expr(p, ce->args[0]);
 			lbValue src = lb_build_expr(p, ce->args[0]);
 			Type *t = type_deref(src.type);
 			Type *t = type_deref(src.type);
-			lbAddr dst = lb_add_local_generated(p, t, false);
-			lb_mem_copy_non_overlapping(p, dst.addr, src, lb_const_int(p->module, t_int, type_size_of(t)), false);
-			return lb_addr_load(p, dst);
+			if (is_type_simd_vector(t)) {
+				lbValue res = {};
+				res.type = t;
+				res.value = LLVMBuildLoad2(p->builder, lb_type(p->module, t), src.value, "");
+				LLVMSetAlignment(res.value, 1);
+				return res;
+			} else {
+				lbAddr dst = lb_add_local_generated(p, t, false);
+				lb_mem_copy_non_overlapping(p, dst.addr, src, lb_const_int(p->module, t_int, type_size_of(t)), false);
+				return lb_addr_load(p, dst);
+			}
 		}
 		}
 
 
 	case BuiltinProc_atomic_add:
 	case BuiltinProc_atomic_add: