Bladeren bron

FFI: Optimize ffi.copy() and ffi.fill().

Mike Pall 13 jaren geleden
bovenliggende
commit
a7d1dbacb1
1 gewijzigde bestanden met toevoegingen van 261 en 26 verwijderingen
  1. 261 26
      src/lj_crecord.c

+ 261 - 26
src/lj_crecord.c

@@ -91,25 +91,7 @@ static CTypeID argv2ctype(jit_State *J, TRef tr, cTValue *o)
   }
 }
 
-/* -- Convert C type to C type -------------------------------------------- */
-
-/*
-** This code mirrors the code in lj_cconv.c. It performs the same steps
-** for the trace recorder that lj_cconv.c does for the interpreter.
-**
-** One major difference is that we can get away with much fewer checks
-** here. E.g. checks for casts, constness or correct types can often be
-** omitted, even if they might fail. The interpreter subsequently throws
-** an error, which aborts the trace.
-**
-** All operations are specialized to their C types, so the on-trace
-** outcome must be the same as the outcome in the interpreter. If the
-** interpreter doesn't throw an error, then the trace is correct, too.
-** Care must be taken not to generate invalid (temporary) IR or to
-** trigger asserts.
-*/
-
-/* Convert CType to IRType. */
+/* Convert CType to IRType (if possible). */
 static IRType crec_ct2irt(CTState *cts, CType *ct)
 {
   if (ctype_isenum(ct->info)) ct = ctype_child(cts, ct);
@@ -135,6 +117,253 @@ static IRType crec_ct2irt(CTState *cts, CType *ct)
   return IRT_CDATA;
 }
 
+/* -- Optimized memory fill and copy -------------------------------------- */
+
+/*
+** Maximum length and unroll of inlined copy/fill.
+** *_MAXUNROLL bounds the number of entries in a load/store list and
+** *_MAXLEN is the largest constant byte length that is still inlined.
+** Anything beyond these limits is handled by a memcpy()/memset() call.
+*/
+#define CREC_COPY_MAXUNROLL		16
+#define CREC_COPY_MAXLEN		128
+
+#define CREC_FILL_MAXUNROLL		16
+#if LJ_TARGET_UNALIGNED
+#define CREC_FILL_MAXLEN		(CTSIZE_PTR * CREC_FILL_MAXUNROLL)  /* Fills start with pointer-sized stores here. */
+#else
+#define CREC_FILL_MAXLEN		CREC_FILL_MAXUNROLL  /* Worst case: one byte store per entry. */
+#endif
+
+/*
+** Number of windowed registers used for optimized memory copy.
+** The copy emitter issues loads until this many register slots are in
+** use and only then emits the matching stores.
+*/
+#if LJ_TARGET_X86
+#define CREC_COPY_REGWIN		2
+#elif LJ_TARGET_PPC || LJ_TARGET_MIPS
+#define CREC_COPY_REGWIN		8
+#else
+#define CREC_COPY_REGWIN		4
+#endif
+
+/*
+** List of memory offsets for copy/fill.
+** Each entry describes a single load and/or store of an unrolled copy or
+** fill. 'ofs' and 'tp' are set by the list generators. 'trofs' and
+** 'trval' are only written and read back by crec_copy_emit().
+*/
+typedef struct CRecMemList {
+  CTSize ofs;		/* Offset in bytes. */
+  IRType tp;		/* Type of load/store. */
+  TRef trofs;		/* TRef of interned offset. */
+  TRef trval;		/* TRef of load value. */
+} CRecMemList;
+
+/*
+** Build the copy list for a field-by-field struct copy.
+** Walks the sibling chain of 'ct' and adds one entry per named field
+** (two for a complex field, one per half). Returns the number of entries
+** or 0 if the struct cannot be handled this way: a field without a simple
+** IR type, a bitfield or sub-structure, or more entries than
+** CREC_COPY_MAXUNROLL.
+*/
+static MSize crec_copy_struct(CRecMemList *ml, CTState *cts, CType *ct)
+{
+  MSize n = 0;
+  CTypeID id, next = 0;
+  for (id = ct->sib; id; id = next) {
+    CType *member = ctype_get(cts, id);
+    CType *ftype;
+    IRType irt;
+    next = member->sib;
+    if (!ctype_isfield(member->info)) {
+      if (ctype_isconstval(member->info)) continue;  /* Skip constants. */
+      return 0;  /* NYI: bitfields and sub-structures. */
+    }
+    if (!gcref(member->name)) continue;  /* Ignore unnamed fields. */
+    ftype = ctype_rawchild(cts, member);  /* Field type. */
+    irt = crec_ct2irt(cts, ftype);
+    if (irt == IRT_CDATA) return 0;  /* NYI: aggregates. */
+    if (n >= CREC_COPY_MAXUNROLL) return 0;  /* List is full. */
+    ml[n].ofs = member->size;  /* Used as the byte offset of the field. */
+    ml[n].tp = irt;
+    n++;
+    if (ctype_iscomplex(ftype->info)) {  /* Add the second half, too. */
+      if (n >= CREC_COPY_MAXUNROLL) return 0;
+      ml[n].ofs = member->size + (ftype->size >> 1);
+      ml[n].tp = irt;
+      n++;
+    }
+  }
+  return n;
+}
+
+/*
+** Build the list for an unrolled copy of 'len' bytes.
+** Starts with 'step'-sized accesses and then halves the step size (and
+** steps the access type down with it) to cover the remaining tail.
+** Passing IRT_CDATA as 'tp' derives the access type from 'step'.
+** Returns the number of entries or 0 if more than CREC_COPY_MAXUNROLL
+** entries would be needed.
+*/
+static MSize crec_copy_unroll(CRecMemList *ml, CTSize len, CTSize step,
+			      IRType tp)
+{
+  CTSize pos = 0;
+  MSize n = 0;
+  if (tp == IRT_CDATA) tp = IRT_U8 + 2*lj_fls(step);
+  for (;;) {
+    for (; pos + step <= len; pos += step) {
+      if (n >= CREC_COPY_MAXUNROLL) return 0;  /* Too many accesses. */
+      ml[n].ofs = pos;
+      ml[n].tp = tp;
+      n++;
+    }
+    if (pos >= len) break;  /* Everything is covered. */
+    step >>= 1;  /* Continue with the next smaller access size. */
+    tp -= 2;
+  }
+  return n;
+}
+
+/*
+** Emit copy list with windowed loads/stores.
+** LJ_TARGET_UNALIGNED: may emit unaligned loads/stores (not marked as such).
+**
+** Loads are emitted in groups. The stores for a group are only emitted
+** once CREC_COPY_REGWIN register slots are in use or the list ends.
+** A number load counts as two slots with LJ_SOFTFP.
+**
+** J      Trace recorder state.
+** ml     Copy list; 'trofs' and 'trval' of each entry are filled in here.
+** mlp    Number of valid entries in 'ml'.
+** trdst  Destination base pointer.
+** trsrc  Source base pointer.
+*/
+static void crec_copy_emit(jit_State *J, CRecMemList *ml, MSize mlp,
+			   TRef trdst, TRef trsrc)
+{
+  MSize i, j, rwin = 0;
+  for (i = 0, j = 0; i < mlp; ) {
+    TRef trofs = lj_ir_kintp(J, ml[i].ofs);
+    TRef trsptr = emitir(IRT(IR_ADD, IRT_PTR), trsrc, trofs);
+    ml[i].trval = emitir(IRT(IR_XLOAD, ml[i].tp), trsptr, 0);
+    ml[i].trofs = trofs;  /* Reused for the destination address below. */
+    /*
+    ** Account for the entry that was just loaded. This must happen before
+    ** advancing the index: ml[i] after the increment is the next entry,
+    ** which is unset (or out of bounds) at the end of the list.
+    */
+    rwin += (LJ_SOFTFP && ml[i].tp == IRT_NUM) ? 2 : 1;
+    i++;
+    if (rwin >= CREC_COPY_REGWIN || i >= mlp) {  /* Flush buffered stores. */
+      rwin = 0;
+      for ( ; j < i; j++) {
+	TRef trdptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, ml[j].trofs);
+	emitir(IRT(IR_XSTORE, ml[j].tp), trdptr, ml[j].trval);
+      }
+    }
+  }
+}
+
+/*
+** Optimized memory copy.
+** Inlines the copy as unrolled loads/stores if 'trlen' is a constant of
+** at most CREC_COPY_MAXLEN bytes and a copy list can be built for it.
+** Otherwise a call to memcpy() followed by a barrier is emitted.
+** 'ct' is the array or struct type to copy, or NULL for a raw copy
+** without type information. Nothing is emitted for a constant length 0.
+*/
+static void crec_copy(jit_State *J, TRef trdst, TRef trsrc, TRef trlen,
+		      CType *ct)
+{
+  if (tref_isk(trlen)) {  /* Length must be constant. */
+    CRecMemList ml[CREC_COPY_MAXUNROLL];
+    MSize mlp = 0;
+    CTSize step = 1, len = (CTSize)IR(tref_ref(trlen))->i;
+    IRType tp = IRT_CDATA;
+    int needxbar = 0;
+    if (len == 0) return;  /* Shortcut. */
+    if (len > CREC_COPY_MAXLEN) goto fallback;  /* Too long to inline. */
+    if (ct) {
+      CTState *cts = ctype_ctsG(J2G(J));
+      lua_assert(ctype_isarray(ct->info) || ctype_isstruct(ct->info));
+      if (ctype_isarray(ct->info)) {  /* Array: copy element-wise. */
+	CType *cct = ctype_rawchild(cts, ct);
+	tp = crec_ct2irt(cts, cct);
+	if (tp == IRT_CDATA) goto rawcopy;  /* No simple IR type for elements. */
+	step = lj_ir_type_size[tp];
+	lua_assert((len & (step-1)) == 0);  /* Whole elements only. */
+      } else if ((ct->info & CTF_UNION)) {  /* Union: raw copy, known alignment. */
+	step = (1u << ctype_align(ct->info));
+	goto rawcopy;
+      } else {  /* Struct: copy field-wise. */
+	mlp = crec_copy_struct(ml, cts, ct);
+	goto emitcopy;
+      }
+    } else {
+    rawcopy:
+      needxbar = 1;  /* Raw copies are followed by a barrier. */
+      if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR)
+	step = CTSIZE_PTR;  /* Use pointer-sized accesses. */
+    }
+    mlp = crec_copy_unroll(ml, len, step, tp);
+  emitcopy:
+    if (mlp) {  /* An empty list means: use the fallback below. */
+      crec_copy_emit(J, ml, mlp, trdst, trsrc);
+      if (needxbar)
+	emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
+      return;
+    }
+  }
+fallback:
+  /* Call memcpy. Always needs a barrier to disable alias analysis. */
+  lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen);
+  emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
+}
+
+/*
+** Generate unrolled fill list, from highest to lowest step size/alignment.
+** Starts with 'step'-sized stores and halves the step size for the
+** remaining tail of the 'len' bytes to fill.
+** Returns the number of entries written to 'ml' or 0 if the fill would
+** need more than CREC_FILL_MAXUNROLL stores. The limit has to be the FILL
+** one, because it is what sizes the list allocated by crec_fill().
+*/
+static MSize crec_fill_unroll(CRecMemList *ml, CTSize len, CTSize step)
+{
+  CTSize ofs = 0;
+  MSize mlp = 0;
+  IRType tp = IRT_U8 + 2*lj_fls(step);  /* Store type derived from 'step'. */
+  do {
+    while (ofs + step <= len) {
+      if (mlp >= CREC_FILL_MAXUNROLL) return 0;  /* List is full. */
+      ml[mlp].ofs = ofs;
+      ml[mlp].tp = tp;
+      mlp++;
+      ofs += step;
+    }
+    step >>= 1;  /* Continue with the next smaller store size. */
+    tp -= 2;
+  } while (ofs < len);
+  return mlp;
+}
+
+/*
+** Emit the stores for a fill list: one address computation plus one
+** store of 'trfill' per list entry, in list order.
+** LJ_TARGET_UNALIGNED: may emit unaligned stores (not marked as such).
+*/
+static void crec_fill_emit(jit_State *J, CRecMemList *ml, MSize mlp,
+			   TRef trdst, TRef trfill)
+{
+  CRecMemList *ent = ml;
+  CRecMemList *last = ml + mlp;
+  while (ent != last) {
+    TRef kofs = lj_ir_kintp(J, ent->ofs);
+    TRef dptr = emitir(IRT(IR_ADD, IRT_PTR), trdst, kofs);
+    emitir(IRT(IR_XSTORE, ent->tp), dptr, trfill);
+    ent++;
+  }
+}
+
+/*
+** Optimized memory fill.
+** Inlines the fill as unrolled stores if 'trlen' is a constant of at most
+** CREC_FILL_MAXLEN bytes and a fill list can be built for it. Otherwise a
+** call to memset() is emitted. 'step' is the initial store size in bytes
+** (the caller passes the destination alignment). Both paths end with a
+** barrier; nothing at all is emitted for a constant length 0.
+*/
+static void crec_fill(jit_State *J, TRef trdst, TRef trlen, TRef trfill,
+		      CTSize step)
+{
+  if (tref_isk(trlen)) {  /* Length must be constant. */
+    CRecMemList ml[CREC_FILL_MAXUNROLL];
+    MSize mlp;
+    CTSize len = (CTSize)IR(tref_ref(trlen))->i;
+    if (len == 0) return;  /* Shortcut. */
+    if (len > CREC_FILL_MAXLEN) goto fallback;  /* Too long to inline. */
+    if (LJ_TARGET_UNALIGNED || step >= CTSIZE_PTR)
+      step = CTSIZE_PTR;  /* Use pointer-sized stores. */
+    mlp = crec_fill_unroll(ml, len, step);
+    if (!mlp) goto fallback;  /* Too many stores needed. */
+    if (tref_isk(trfill) || ml[0].tp != IRT_U8)
+      trfill = emitconv(trfill, IRT_INT, IRT_U8, 0);  /* NOTE(review): presumably reduces the fill value to one byte -- confirm against emitconv(). */
+    if (ml[0].tp != IRT_U8) {  /* Scatter U8 to U16/U32/U64. */
+      if (CTSIZE_PTR == 8 && ml[0].tp == IRT_U64) {
+	if (tref_isk(trfill))  /* Pointless on x64 with zero-extended regs. */
+	  trfill = emitconv(trfill, IRT_U64, IRT_U32, 0);
+	trfill = emitir(IRT(IR_MUL, IRT_U64), trfill,
+			lj_ir_kint64(J, U64x(01010101,01010101)));  /* Multiplying by 0x01..01 repeats the byte. */
+      } else {
+	trfill = emitir(IRTI(IR_MUL), trfill,
+		   lj_ir_kint(J, ml[0].tp == IRT_U16 ? 0x0101 : 0x01010101));
+      }
+    }
+    crec_fill_emit(J, ml, mlp, trdst, trfill);
+  } else {
+fallback:
+    /* Call memset. Always needs a barrier to disable alias analysis. */
+    lj_ir_call(J, IRCALL_memset, trdst, trfill, trlen);  /* Note: arg order! */
+  }
+  emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
+}
+
+/* -- Convert C type to C type -------------------------------------------- */
+
+/*
+** This code mirrors the code in lj_cconv.c. It performs the same steps
+** for the trace recorder that lj_cconv.c does for the interpreter.
+**
+** One major difference is that we can get away with much fewer checks
+** here. E.g. checks for casts, constness or correct types can often be
+** omitted, even if they might fail. The interpreter subsequently throws
+** an error, which aborts the trace.
+**
+** All operations are specialized to their C types, so the on-trace
+** outcome must be the same as the outcome in the interpreter. If the
+** interpreter doesn't throw an error, then the trace is correct, too.
+** Care must be taken not to generate invalid (temporary) IR or to
+** trigger asserts.
+*/
+
 /* Determine whether a passed number or cdata number is non-zero. */
 static int crec_isnonzero(CType *s, void *p)
 {
@@ -1298,26 +1527,32 @@ void LJ_FASTCALL recff_ffi_copy(jit_State *J, RecordFFData *rd)
       trlen = emitir(IRTI(IR_FLOAD), J->base[1], IRFL_STR_LEN);
       trlen = emitir(IRTI(IR_ADD), trlen, lj_ir_kint(J, 1));
     }
-    lj_ir_call(J, IRCALL_memcpy, trdst, trsrc, trlen);
-    emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
     rd->nres = 0;
+    crec_copy(J, trdst, trsrc, trlen, NULL);
   }  /* else: interpreter will throw. */
 }
 
 void LJ_FASTCALL recff_ffi_fill(jit_State *J, RecordFFData *rd)
 {
   CTState *cts = ctype_ctsG(J2G(J));
-  TRef tr = J->base[0], trlen = J->base[1], trfill = J->base[2];
-  if (tr && trlen) {
-    tr = crec_ct_tv(J, ctype_get(cts, CTID_P_VOID), 0, tr, &rd->argv[0]);
+  TRef trdst = J->base[0], trlen = J->base[1], trfill = J->base[2];
+  if (trdst && trlen) {
+    CTSize step = 1;  /* Byte-sized stores unless a cdata type says otherwise. */
+    if (tviscdata(&rd->argv[0])) {  /* Get alignment of original destination. */
+      CTSize sz;
+      CType *ct = ctype_raw(cts, cdataV(&rd->argv[0])->ctypeid);
+      if (ctype_isptr(ct->info))  /* For a pointer use the type it points to. */
+	ct = ctype_rawchild(cts, ct);
+      step = (1u<<ctype_align(lj_ctype_info(cts, ctype_typeid(cts, ct), &sz)));
+    }
+    trdst = crec_ct_tv(J, ctype_get(cts, CTID_P_VOID), 0, trdst, &rd->argv[0]);
     trlen = crec_toint(J, cts, trlen, &rd->argv[1]);
     if (trfill)
       trfill = crec_toint(J, cts, trfill, &rd->argv[2]);
     else
       trfill = lj_ir_kint(J, 0);
-    lj_ir_call(J, IRCALL_memset, tr, trfill, trlen);
-    emitir(IRT(IR_XBAR, IRT_NIL), 0, 0);
     rd->nres = 0;
+    crec_fill(J, trdst, trlen, trfill, step);  /* Inlined stores or a memset() call. */
   }  /* else: interpreter will throw. */
 }