|
@@ -684,12 +684,6 @@ gb_internal lbValue lb_emit_matrix_flatten(lbProcedure *p, lbValue m, Type *type
|
|
|
Type *mt = base_type(m.type);
|
|
|
GB_ASSERT(mt->kind == Type_Matrix);
|
|
|
|
|
|
- // TODO(bill): Determine why this fails on Windows sometimes
|
|
|
- if (false && lb_is_matrix_simdable(mt)) {
|
|
|
- LLVMValueRef vector = lb_matrix_to_trimmed_vector(p, m);
|
|
|
- return lb_matrix_cast_vector_to_type(p, vector, type);
|
|
|
- }
|
|
|
-
|
|
|
lbAddr res = lb_add_local_generated(p, type, true);
|
|
|
|
|
|
i64 row_count = mt->Matrix.row_count;
|
|
@@ -763,6 +757,7 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,
|
|
|
GB_ASSERT(is_type_matrix(yt));
|
|
|
GB_ASSERT(xt->Matrix.column_count == yt->Matrix.row_count);
|
|
|
GB_ASSERT(are_types_identical(xt->Matrix.elem, yt->Matrix.elem));
|
|
|
+ GB_ASSERT(xt->Matrix.is_row_major == yt->Matrix.is_row_major);
|
|
|
|
|
|
Type *elem = xt->Matrix.elem;
|
|
|
|
|
@@ -770,7 +765,7 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,
|
|
|
unsigned inner = cast(unsigned)xt->Matrix.column_count;
|
|
|
unsigned outer_columns = cast(unsigned)yt->Matrix.column_count;
|
|
|
|
|
|
- if (lb_is_matrix_simdable(xt)) {
|
|
|
+ if (!xt->Matrix.is_row_major && lb_is_matrix_simdable(xt)) {
|
|
|
unsigned x_stride = cast(unsigned)matrix_type_stride_in_elems(xt);
|
|
|
unsigned y_stride = cast(unsigned)matrix_type_stride_in_elems(yt);
|
|
|
|
|
@@ -812,7 +807,7 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,
|
|
|
return lb_addr_load(p, res);
|
|
|
}
|
|
|
|
|
|
- {
|
|
|
+ if (!xt->Matrix.is_row_major) {
|
|
|
lbAddr res = lb_add_local_generated(p, type, true);
|
|
|
|
|
|
auto inners = slice_make<lbValue[2]>(permanent_allocator(), inner);
|
|
@@ -835,6 +830,30 @@ gb_internal lbValue lb_emit_matrix_mul(lbProcedure *p, lbValue lhs, lbValue rhs,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ return lb_addr_load(p, res);
|
|
|
+ } else {
|
|
|
+ lbAddr res = lb_add_local_generated(p, type, true);
|
|
|
+
|
|
|
+ auto inners = slice_make<lbValue[2]>(permanent_allocator(), inner);
|
|
|
+
|
|
|
+ for (unsigned i = 0; i < outer_rows; i++) {
|
|
|
+ for (unsigned j = 0; j < outer_columns; j++) {
|
|
|
+ lbValue dst = lb_emit_matrix_epi(p, res.addr, i, j);
|
|
|
+ for (unsigned k = 0; k < inner; k++) {
|
|
|
+ inners[k][0] = lb_emit_matrix_ev(p, lhs, i, k);
|
|
|
+ inners[k][1] = lb_emit_matrix_ev(p, rhs, k, j);
|
|
|
+ }
|
|
|
+
|
|
|
+ lbValue sum = lb_const_nil(p->module, elem);
|
|
|
+ for (unsigned k = 0; k < inner; k++) {
|
|
|
+ lbValue a = inners[k][0];
|
|
|
+ lbValue b = inners[k][1];
|
|
|
+ sum = lb_emit_mul_add(p, a, b, sum, elem);
|
|
|
+ }
|
|
|
+ lb_emit_store(p, dst, sum);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
return lb_addr_load(p, res);
|
|
|
}
|
|
|
}
|
|
@@ -855,7 +874,7 @@ gb_internal lbValue lb_emit_matrix_mul_vector(lbProcedure *p, lbValue lhs, lbVal
|
|
|
|
|
|
Type *elem = mt->Matrix.elem;
|
|
|
|
|
|
- if (lb_is_matrix_simdable(mt)) {
|
|
|
+ if (!mt->Matrix.is_row_major && lb_is_matrix_simdable(mt)) {
|
|
|
unsigned stride = cast(unsigned)matrix_type_stride_in_elems(mt);
|
|
|
|
|
|
unsigned row_count = cast(unsigned)mt->Matrix.row_count;
|
|
@@ -924,7 +943,7 @@ gb_internal lbValue lb_emit_vector_mul_matrix(lbProcedure *p, lbValue lhs, lbVal
|
|
|
|
|
|
Type *elem = mt->Matrix.elem;
|
|
|
|
|
|
- if (lb_is_matrix_simdable(mt)) {
|
|
|
+ if (!mt->Matrix.is_row_major && lb_is_matrix_simdable(mt)) {
|
|
|
unsigned stride = cast(unsigned)matrix_type_stride_in_elems(mt);
|
|
|
|
|
|
unsigned row_count = cast(unsigned)mt->Matrix.row_count;
|