浏览代码

Merge pull request #5108 from Barinzaya/core-simd-indices-redadd-redmul

Alternate `reduce_add`/`reduce_mul` intrinsics
gingerBill 4 月之前
父节点
当前提交
0cf5b5984d
共有 5 个文件被更改,包括 274 次插入和 2 次删除
  1. 4 0
      base/intrinsics/intrinsics.odin
  2. 192 2
      core/simd/simd.odin
  3. 4 0
      src/check_builtin.cpp
  4. 8 0
      src/checker_builtin_procs.hpp
  5. 66 0
      src/llvm_backend_proc.cpp

+ 4 - 0
base/intrinsics/intrinsics.odin

@@ -274,8 +274,12 @@ simd_lanes_ge :: proc(a, b: #simd[N]T) -> #simd[N]Integer ---
 simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
 simd_extract :: proc(a: #simd[N]T, idx: uint) -> T ---
 simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---
 simd_replace :: proc(a: #simd[N]T, idx: uint, elem: T) -> #simd[N]T ---
 
 
+simd_reduce_add_bisect  :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
+simd_reduce_mul_bisect  :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_add_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_mul_ordered :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
+simd_reduce_add_pairs   :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
+simd_reduce_mul_pairs   :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_min         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_min         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_max         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_max         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_and         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---
 simd_reduce_and         :: proc(a: #simd[N]T) -> T where type_is_integer(T) || type_is_float(T)---

+ 192 - 2
core/simd/simd.odin

@@ -1759,7 +1759,103 @@ Returns:
 replace :: intrinsics.simd_replace
 replace :: intrinsics.simd_replace
 
 
 /*
 /*
-Reduce a vector to a scalar by adding up all the lanes.
+Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+bisecting the vector into two parts, where the first contains lanes [0, N/2)
+and the second contains lanes [N/2, N), and adding the two halves element-wise
+to produce N/2 values. This is repeated until only a single element remains.
+This order may be faster to compute than the ordered sum for floats, as it can
+often be better parallelized.
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] += a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [+]<-- | ---'      |
+	        |    [+]<--------'
+	        |     |
+	        `>[+]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_add_bisect :: intrinsics.simd_reduce_add_bisect
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes together in a bisecting fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+bisecting the vector into two parts, where the first contains indices [0, N/2)
+and the second contains indices [N/2, N), and multiplying the two halves
+together element-wise to produce N/2 values. This is repeated until only a
+single element remains. This order may be faster to compute than the ordered
+product for floats, as it can often be better parallelized.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] *= a[i+n]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	     +-----------------------+
+	     | v0  | v1  | v2  | v3  |
+	     +-----------------------+
+	        |     |     |     |
+	       [x]<-- | ---'      |
+	        |    [x]<--------'
+	        |     |
+	        `>[x]<'
+	           |
+	           v
+	        +-----+
+	result: | y0  |
+	        +-----+
+*/
+reduce_mul_bisect :: intrinsics.simd_reduce_mul_bisect
+
+/*
+Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
 
 
 This procedure returns a scalar that is the ordered sum of all lanes. The
 This procedure returns a scalar that is the ordered sum of all lanes. The
 ordered sum may be important for accounting for precision errors in
 ordered sum may be important for accounting for precision errors in
@@ -1782,7 +1878,7 @@ Result:
 reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
 reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
 
 
 /*
 /*
-Reduce a vector to a scalar by multiplying all the lanes.
+Reduce a vector to a scalar by multiplying all the lanes in an ordered fashion.
 
 
 This procedure returns a scalar that is the ordered product of all lanes.
 This procedure returns a scalar that is the ordered product of all lanes.
 The ordered product may be important for accounting for precision errors in
 The ordered product may be important for accounting for precision errors in
@@ -1804,6 +1900,100 @@ Result:
 */
 */
 reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
 reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
 
 
+/*
+Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the sum of all lanes, calculated by
+adding each even-indexed element with the following odd-indexed element to
+produce N/2 values. This is repeated until only a single element remains. This
+order is supported by hardware instructions for some types/architectures (e.g.
+i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
+
+The order of the sum may be important for accounting for precision errors in
+floating-point computation, as floating-point addition is not associative, that
+is `(a+b)+c` may not be equal to `a+(b+c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Sum of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] + a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[+]<'     `>[+]<'
+	         |           |
+	         `--->[+]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_add_pairs :: intrinsics.simd_reduce_add_pairs
+
+/*
+Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
+
+This procedure returns a scalar that is the product of all lanes, calculated by
+multiplying each even-indexed element with the following odd-indexed element to
+produce N/2 values. This is repeated until only a single element remains. This
+pairwise order mirrors `reduce_add_pairs`, though hardware support for pairwise
+multiplication is less common than for pairwise addition.
+
+The order of the product may be important for accounting for precision errors
+in floating-point computation, as floating-point multiplication is not
+associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
+
+Inputs:
+- `v`: The vector to reduce.
+
+Result:
+- Product of all lanes, as a scalar.
+
+**Operation**:
+
+	for n > 1 {
+		n = n / 2
+		for i in 0 ..< n {
+			a[i] = a[2*i+0] * a[2*i+1]
+		}
+	}
+	res := a[0]
+
+Graphical representation of the operation for N=4:
+
+	   +-----------------------+
+	v: | v0  | v1  | v2  | v3  |
+	   +-----------------------+
+	      |     |     |     |
+	      `>[x]<'     `>[x]<'
+	         |           |
+	         `--->[x]<--'
+	               |
+	               v
+	            +-----+
+	    result: | y0  |
+	            +-----+
+*/
+reduce_mul_pairs :: intrinsics.simd_reduce_mul_pairs
+
 /*
 /*
 Reduce a vector to a scalar by finding the minimum value between all of the lanes.
 Reduce a vector to a scalar by finding the minimum value between all of the lanes.
 
 

+ 4 - 0
src/check_builtin.cpp

@@ -853,8 +853,12 @@ gb_internal bool check_builtin_simd_operation(CheckerContext *c, Operand *operan
 		}
 		}
 		break;
 		break;
 
 
+	case BuiltinProc_simd_reduce_add_bisect:
+	case BuiltinProc_simd_reduce_mul_bisect:
 	case BuiltinProc_simd_reduce_add_ordered:
 	case BuiltinProc_simd_reduce_add_ordered:
 	case BuiltinProc_simd_reduce_mul_ordered:
 	case BuiltinProc_simd_reduce_mul_ordered:
+	case BuiltinProc_simd_reduce_add_pairs:
+	case BuiltinProc_simd_reduce_mul_pairs:
 	case BuiltinProc_simd_reduce_min:
 	case BuiltinProc_simd_reduce_min:
 	case BuiltinProc_simd_reduce_max:
 	case BuiltinProc_simd_reduce_max:
 		{
 		{

+ 8 - 0
src/checker_builtin_procs.hpp

@@ -170,8 +170,12 @@ BuiltinProc__simd_begin,
 	BuiltinProc_simd_extract,
 	BuiltinProc_simd_extract,
 	BuiltinProc_simd_replace,
 	BuiltinProc_simd_replace,
 
 
+	BuiltinProc_simd_reduce_add_bisect,
+	BuiltinProc_simd_reduce_mul_bisect,
 	BuiltinProc_simd_reduce_add_ordered,
 	BuiltinProc_simd_reduce_add_ordered,
 	BuiltinProc_simd_reduce_mul_ordered,
 	BuiltinProc_simd_reduce_mul_ordered,
+	BuiltinProc_simd_reduce_add_pairs,
+	BuiltinProc_simd_reduce_mul_pairs,
 	BuiltinProc_simd_reduce_min,
 	BuiltinProc_simd_reduce_min,
 	BuiltinProc_simd_reduce_max,
 	BuiltinProc_simd_reduce_max,
 	BuiltinProc_simd_reduce_and,
 	BuiltinProc_simd_reduce_and,
@@ -518,8 +522,12 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = {
 	{STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_extract"), 2, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_replace"), 3, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 
 
+	{STR_LIT("simd_reduce_add_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_mul_bisect"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_add_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_mul_ordered"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_add_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
+	{STR_LIT("simd_reduce_mul_pairs"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_min"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_min"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_max"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_max"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_and"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},
 	{STR_LIT("simd_reduce_and"),         1, false, Expr_Expr, BuiltinProcPkg_intrinsics},

+ 66 - 0
src/llvm_backend_proc.cpp

@@ -1495,6 +1495,38 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
 		res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, "");
 		res.value = LLVMBuildInsertElement(p->builder, arg0.value, arg2.value, arg1.value, "");
 		return res;
 		return res;
 
 
+	case BuiltinProc_simd_reduce_add_bisect:
+	case BuiltinProc_simd_reduce_mul_bisect:
+		{
+			GB_ASSERT(arg0.type->kind == Type_SimdVector);
+			i64 num_elems = arg0.type->SimdVector.count;
+
+			LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
+			for (i64 i = 0; i < num_elems; i++) {
+				indices[i] = lb_const_int(m, t_uint, cast(u64)i).value;
+			}
+
+			switch (builtin_id) {
+			case BuiltinProc_simd_reduce_add_bisect: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
+			case BuiltinProc_simd_reduce_mul_bisect: op_code = is_float ? LLVMFMul : LLVMMul; break;
+			}
+
+			LLVMValueRef remaining = arg0.value;
+			i64 num_remaining = num_elems;
+
+			while (num_remaining > 1) {
+				num_remaining /= 2;
+				LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
+				LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
+				LLVMValueRef right_indices = LLVMConstVector(&indices[num_remaining], cast(unsigned)num_remaining);
+				LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
+				remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
+			}
+
+			res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
+			return res;
+		}
+
 	case BuiltinProc_simd_reduce_add_ordered:
 	case BuiltinProc_simd_reduce_add_ordered:
 	case BuiltinProc_simd_reduce_mul_ordered:
 	case BuiltinProc_simd_reduce_mul_ordered:
 		{
 		{
@@ -1527,6 +1559,40 @@ gb_internal lbValue lb_build_builtin_simd_proc(lbProcedure *p, Ast *expr, TypeAn
 			res.value = lb_call_intrinsic(p, name, args, cast(unsigned)args_count, types, gb_count_of(types));
 			res.value = lb_call_intrinsic(p, name, args, cast(unsigned)args_count, types, gb_count_of(types));
 			return res;
 			return res;
 		}
 		}
+
+	case BuiltinProc_simd_reduce_add_pairs:
+	case BuiltinProc_simd_reduce_mul_pairs:
+		{
+			GB_ASSERT(arg0.type->kind == Type_SimdVector);
+			i64 num_elems = arg0.type->SimdVector.count;
+
+			LLVMValueRef *indices = gb_alloc_array(temporary_allocator(), LLVMValueRef, num_elems);
+			for (i64 i = 0; i < num_elems/2; i++) {
+				indices[i] = lb_const_int(m, t_uint, cast(u64)(2*i)).value;
+				indices[i+num_elems/2] = lb_const_int(m, t_uint, cast(u64)(2*i+1)).value;
+			}
+
+			switch (builtin_id) {
+			case BuiltinProc_simd_reduce_add_pairs: op_code = is_float ? LLVMFAdd : LLVMAdd; break;
+			case BuiltinProc_simd_reduce_mul_pairs: op_code = is_float ? LLVMFMul : LLVMMul; break;
+			}
+
+			LLVMValueRef remaining = arg0.value;
+			i64 num_remaining = num_elems;
+
+			while (num_remaining > 1) {
+				num_remaining /= 2;
+				LLVMValueRef left_indices = LLVMConstVector(&indices[0], cast(unsigned)num_remaining);
+				LLVMValueRef left_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, left_indices, "");
+				LLVMValueRef right_indices = LLVMConstVector(&indices[num_elems/2], cast(unsigned)num_remaining);
+				LLVMValueRef right_value = LLVMBuildShuffleVector(p->builder, remaining, remaining, right_indices, "");
+				remaining = LLVMBuildBinOp(p->builder, op_code, left_value, right_value, "");
+			}
+
+			res.value = LLVMBuildExtractElement(p->builder, remaining, indices[0], "");
+			return res;
+		}
+
 	case BuiltinProc_simd_reduce_min:
 	case BuiltinProc_simd_reduce_min:
 	case BuiltinProc_simd_reduce_max:
 	case BuiltinProc_simd_reduce_max:
 	case BuiltinProc_simd_reduce_and:
 	case BuiltinProc_simd_reduce_and: