/*
 * Copyright (c) 2008-2010 Stefan Krah. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


#ifndef TYPEARITH_H
#define TYPEARITH_H


#include "mpdecimal.h"


/*****************************************************************************/
/*                      Native arithmetic on basic types                     */
/*****************************************************************************/


/** ------------------------------------------------------------
 **           Double width multiplication and division
 ** ------------------------------------------------------------
 */

#if defined(CONFIG_64)
#if defined(ANSI)
#if defined(HAVE_UINT128_T)
static inline void
_mpd_mul_words(mpd_uint_t *hi, mpd_uint_t *lo, mpd_uint_t a, mpd_uint_t b)
{
	__uint128_t hl;

	hl = (__uint128_t)a * b;

	*hi = hl >> 64;
	*lo = (mpd_uint_t)hl;
}

static inline void
_mpd_div_words(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t hi, mpd_uint_t lo,
               mpd_uint_t d)
{
	__uint128_t hl;

	hl = ((__uint128_t)hi<<64) + lo;
	*q = (mpd_uint_t)(hl / d); /* quotient is known to fit */
	*r = (mpd_uint_t)(hl - (__uint128_t)(*q) * d);
}
#else
static inline void
_mpd_mul_words(mpd_uint_t *hi, mpd_uint_t *lo, mpd_uint_t a, mpd_uint_t b)
{
	uint32_t w[4], carry;
	uint32_t ah, al, bh, bl;
	uint64_t hl;

	ah = (uint32_t)(a>>32); al = (uint32_t)a;
	bh = (uint32_t)(b>>32); bl = (uint32_t)b;

	hl = (uint64_t)al * bl;
	w[0] = (uint32_t)hl;
	carry = (uint32_t)(hl>>32);

	hl = (uint64_t)ah * bl + carry;
	w[1] = (uint32_t)hl;
	w[2] = (uint32_t)(hl>>32);

	hl = (uint64_t)al * bh + w[1];
	w[1] = (uint32_t)hl;
	carry = (uint32_t)(hl>>32);

	hl = ((uint64_t)ah * bh + w[2]) + carry;
	w[2] = (uint32_t)hl;
	w[3] = (uint32_t)(hl>>32);

	*hi = ((uint64_t)w[3]<<32) + w[2];
	*lo = ((uint64_t)w[1]<<32) + w[0];
}

/*
 * By Henry S. Warren: http://www.hackersdelight.org/HDcode/divlu.c.txt
 * http://www.hackersdelight.org/permissions.htm:
 * "You are free to use, copy, and distribute any of the code on this web
 *  site, whether modified by you or not. You need not give attribution."
 *
 * Slightly modified, comments are mine.
 */
static inline int
nlz(uint64_t x)
{
	int n;

	if (x == 0) return(64);

	n = 0;
	if (x <= 0x00000000FFFFFFFF) {n = n +32; x = x <<32;}
	if (x <= 0x0000FFFFFFFFFFFF) {n = n +16; x = x <<16;}
	if (x <= 0x00FFFFFFFFFFFFFF) {n = n + 8; x = x << 8;}
	if (x <= 0x0FFFFFFFFFFFFFFF) {n = n + 4; x = x << 4;}
	if (x <= 0x3FFFFFFFFFFFFFFF) {n = n + 2; x = x << 2;}
	if (x <= 0x7FFFFFFFFFFFFFFF) {n = n + 1;}

	return n;
}

static inline void
_mpd_div_words(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t u1, mpd_uint_t u0,
               mpd_uint_t v)
{
	const mpd_uint_t b = 4294967296;
	mpd_uint_t un1, un0,
	           vn1, vn0,
	           q1, q0,
	           un32, un21, un10,
	           rhat, t;
	int s;

	assert(u1 < v);

	s = nlz(v);
	v = v << s;
	vn1 = v >> 32;
	vn0 = v & 0xFFFFFFFF;

	t = (s == 0) ? 0 : u0 >> (64 - s);
	un32 = (u1 << s) | t;
	un10 = u0 << s;

	un1 = un10 >> 32;
	un0 = un10 & 0xFFFFFFFF;

	q1 = un32 / vn1;
	rhat = un32 - q1*vn1;
again1:
	if (q1 >= b || q1*vn0 > b*rhat + un1) {
		q1 = q1 - 1;
		rhat = rhat + vn1;
		if (rhat < b) goto again1;
	}

	/*
	 *  Before again1 we had:
	 *      (1) q1*vn1   + rhat         = un32
	 *      (2) q1*vn1*b + rhat*b + un1 = un32*b + un1
	 *
	 *  The statements inside the if-clause do not change the value
	 *  of the left-hand side of (2), and the loop is only exited
	 *  if q1*vn0 <= rhat*b + un1, so:
	 *
	 *      (3) q1*vn1*b + q1*vn0 <= un32*b + un1
	 *      (4)              q1*v <= un32*b + un1
	 *      (5)                 0 <= un32*b + un1 - q1*v
	 *
	 *  By (5) we are certain that the possible add-back step from
	 *  Knuth's algorithm D is never required.
	 *
	 *  Since the final quotient is less than 2**64, the following
	 *  must be true:
	 *
	 *      (6) un32*b + un1 - q1*v <= UINT64_MAX
	 *
	 *  This means that in the following line, the high words
	 *  of un32*b and q1*v can be discarded without any effect
	 *  on the result.
	 */
	un21 = un32*b + un1 - q1*v;

	q0 = un21 / vn1;
	rhat = un21 - q0*vn1;
again2:
	if (q0 >= b || q0*vn0 > b*rhat + un0) {
		q0 = q0 - 1;
		rhat = rhat + vn1;
		if (rhat < b) goto again2;
	}

	*q = q1*b + q0;
	*r = (un21*b + un0 - q0*v) >> s;
}
#endif

/* END ANSI */
#elif defined(ASM)
static inline void
_mpd_mul_words(mpd_uint_t *hi, mpd_uint_t *lo, mpd_uint_t a, mpd_uint_t b)
{
	mpd_uint_t h, l;

	asm (	"mulq %3\n\t"
		: "=d" (h), "=a" (l)
		: "%a" (a), "rm" (b)
		: "cc"
	);

	*hi = h;
	*lo = l;
}

static inline void
_mpd_div_words(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t hi, mpd_uint_t lo,
               mpd_uint_t d)
{
	mpd_uint_t qq, rr;

	asm (	"divq %4\n\t"
		: "=a" (qq), "=d" (rr)
		: "a" (lo), "d" (hi), "rm" (d)
		: "cc"
	);

	*q = qq;
	*r = rr;
}
/* END GCC ASM */
#elif defined(MASM)
#include <intrin.h>
#pragma intrinsic(_umul128)

static inline void
_mpd_mul_words(mpd_uint_t *hi, mpd_uint_t *lo, mpd_uint_t a, mpd_uint_t b)
{
	*lo = _umul128(a, b, hi);
}

void _mpd_div_words(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t hi, mpd_uint_t lo,
                    mpd_uint_t d);

/* END MASM (_MSC_VER) */
#else
  #error "need platform specific 128 bit multiplication and division"
#endif

static inline void
_mpd_divmod_pow10(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t v, mpd_uint_t exp)
{
  assert(exp <= 19);

  if (exp <= 9) {
    if (exp <= 4) {
      switch (exp) {
      case 0: *q = v; *r = 0; break;
      case 1: *q = v / 10ULL; *r = v - *q * 10ULL; break;
      case 2: *q = v / 100ULL; *r = v - *q * 100ULL; break;
      case 3: *q = v / 1000ULL; *r = v - *q * 1000ULL; break;
      case 4: *q = v / 10000ULL; *r = v - *q * 10000ULL; break;
      }
    }
    else {
      switch (exp) {
      case 5: *q = v / 100000ULL; *r = v - *q * 100000ULL; break;
      case 6: *q = v / 1000000ULL; *r = v - *q * 1000000ULL; break;
      case 7: *q = v / 10000000ULL; *r = v - *q * 10000000ULL; break;
      case 8: *q = v / 100000000ULL; *r = v - *q * 100000000ULL; break;
      case 9: *q = v / 1000000000ULL; *r = v - *q * 1000000000ULL; break;
      }
    }
  }
  else {
    if (exp <= 14) {
      switch (exp) {
      case 10: *q = v / 10000000000ULL; *r = v - *q * 10000000000ULL; break;
      case 11: *q = v / 100000000000ULL; *r = v - *q * 100000000000ULL; break;
      case 12: *q = v / 1000000000000ULL; *r = v - *q * 1000000000000ULL; break;
      case 13: *q = v / 10000000000000ULL; *r = v - *q * 10000000000000ULL; break;
      case 14: *q = v / 100000000000000ULL; *r = v - *q * 100000000000000ULL; break;
      }
    }
    else {
      switch (exp) {
      case 15: *q = v / 1000000000000000ULL; *r = v - *q * 1000000000000000ULL; break;
      case 16: *q = v / 10000000000000000ULL; *r = v - *q * 10000000000000000ULL; break;
      case 17: *q = v / 100000000000000000ULL; *r = v - *q * 100000000000000000ULL; break;
      case 18: *q = v / 1000000000000000000ULL; *r = v - *q * 1000000000000000000ULL; break;
      case 19: *q = v / 10000000000000000000ULL; *r = v - *q * 10000000000000000000ULL; break; /* GCOV_NOT_REACHED */
      }
    }
  }
}

/* END CONFIG_64 */
#elif defined(CONFIG_32)
#if defined(ANSI)
#if !defined(LEGACY_COMPILER)
static inline void
_mpd_mul_words(mpd_uint_t *hi, mpd_uint_t *lo, mpd_uint_t a, mpd_uint_t b)
{
	mpd_uuint_t hl;

	hl = (mpd_uuint_t)a * b;

	*hi = hl >> 32;
	*lo = (mpd_uint_t)hl;
}

static inline void
_mpd_div_words(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t hi, mpd_uint_t lo,
               mpd_uint_t d)
{
	mpd_uuint_t hl;

	hl = ((mpd_uuint_t)hi<<32) + lo;
	*q = (mpd_uint_t)(hl / d); /* quotient is known to fit */
	*r = (mpd_uint_t)(hl - (mpd_uuint_t)(*q) * d);
}
/* END ANSI + uint64_t */
#else
static inline void
_mpd_mul_words(mpd_uint_t *hi, mpd_uint_t *lo, mpd_uint_t a, mpd_uint_t b)
{
	uint16_t w[4], carry;
	uint16_t ah, al, bh, bl;
	uint32_t hl;

	ah = (uint16_t)(a>>16); al = (uint16_t)a;
	bh = (uint16_t)(b>>16); bl = (uint16_t)b;

	hl = (uint32_t)al * bl;
	w[0] = (uint16_t)hl;
	carry = (uint16_t)(hl>>16);

	hl = (uint32_t)ah * bl + carry;
	w[1] = (uint16_t)hl;
	w[2] = (uint16_t)(hl>>16);

	hl = (uint32_t)al * bh + w[1];
	w[1] = (uint16_t)hl;
	carry = (uint16_t)(hl>>16);

	hl = ((uint32_t)ah * bh + w[2]) + carry;
	w[2] = (uint16_t)hl;
	w[3] = (uint16_t)(hl>>16);

	*hi = ((uint32_t)w[3]<<16) + w[2];
	*lo = ((uint32_t)w[1]<<16) + w[0];
}

/*
 * By Henry S. Warren: http://www.hackersdelight.org/HDcode/divlu.c.txt
 * http://www.hackersdelight.org/permissions.htm:
 * "You are free to use, copy, and distribute any of the code on this web
 *  site, whether modified by you or not. You need not give attribution."
 *
 * Slightly modified, comments are mine.
 */
static inline int
nlz(uint32_t x)
{
	int n;

	if (x == 0) return(32);

	n = 0;
	if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
	if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
	if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
	if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
	if (x <= 0x7FFFFFFF) {n = n + 1;}

	return n;
}

static inline void
_mpd_div_words(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t u1, mpd_uint_t u0,
               mpd_uint_t v)
{
	const mpd_uint_t b = 65536;
	mpd_uint_t un1, un0,
	           vn1, vn0,
	           q1, q0,
	           un32, un21, un10,
	           rhat, t;
	int s;

	assert(u1 < v);

	s = nlz(v);
	v = v << s;
	vn1 = v >> 16;
	vn0 = v & 0xFFFF;

	t = (s == 0) ? 0 : u0 >> (32 - s);
	un32 = (u1 << s) | t;
	un10 = u0 << s;

	un1 = un10 >> 16;
	un0 = un10 & 0xFFFF;

	q1 = un32 / vn1;
	rhat = un32 - q1*vn1;
again1:
	if (q1 >= b || q1*vn0 > b*rhat + un1) {
		q1 = q1 - 1;
		rhat = rhat + vn1;
		if (rhat < b) goto again1;
	}

	/*
	 *  Before again1 we had:
	 *      (1) q1*vn1   + rhat         = un32
	 *      (2) q1*vn1*b + rhat*b + un1 = un32*b + un1
	 *
	 *  The statements inside the if-clause do not change the value
	 *  of the left-hand side of (2), and the loop is only exited
	 *  if q1*vn0 <= rhat*b + un1, so:
	 *
	 *      (3) q1*vn1*b + q1*vn0 <= un32*b + un1
	 *      (4)              q1*v <= un32*b + un1
	 *      (5)                 0 <= un32*b + un1 - q1*v
	 *
	 *  By (5) we are certain that the possible add-back step from
	 *  Knuth's algorithm D is never required.
	 *
	 *  Since the final quotient is less than 2**32, the following
	 *  must be true:
	 *
	 *      (6) un32*b + un1 - q1*v <= UINT32_MAX
	 *
	 *  This means that in the following line, the high words
	 *  of un32*b and q1*v can be discarded without any effect
	 *  on the result.
	 */
	un21 = un32*b + un1 - q1*v;

	q0 = un21 / vn1;
	rhat = un21 - q0*vn1;
again2:
	if (q0 >= b || q0*vn0 > b*rhat + un0) {
		q0 = q0 - 1;
		rhat = rhat + vn1;
		if (rhat < b) goto again2;
	}

	*q = q1*b + q0;
	*r = (un21*b + un0 - q0*v) >> s;
}
#endif /* END ANSI + LEGACY_COMPILER */

/* END ANSI */
#elif defined(ASM)
static inline void
_mpd_mul_words(mpd_uint_t *hi, mpd_uint_t *lo, mpd_uint_t a, mpd_uint_t b)
{
	mpd_uint_t h, l;

	asm (	"mull %3\n\t"
		: "=d" (h), "=a" (l)
		: "%a" (a), "rm" (b)
		: "cc"
	);

	*hi = h;
	*lo = l;
}

static inline void
_mpd_div_words(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t hi, mpd_uint_t lo,
               mpd_uint_t d)
{
	mpd_uint_t qq, rr;

	asm (	"divl %4\n\t"
		: "=a" (qq), "=d" (rr)
		: "a" (lo), "d" (hi), "rm" (d)
		: "cc"
	);

	*q = qq;
	*r = rr;
}
/* END GCC ASM */
#elif defined(MASM)
static inline void __cdecl
_mpd_mul_words(mpd_uint_t *hi, mpd_uint_t *lo, mpd_uint_t a, mpd_uint_t b)
{
	mpd_uint_t h, l;

	__asm {
		mov eax, a
		mul b
		mov h, edx
		mov l, eax
	}

	*hi = h;
	*lo = l;
}

static inline void __cdecl
_mpd_div_words(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t hi, mpd_uint_t lo,
               mpd_uint_t d)
{
	mpd_uint_t qq, rr;

	__asm {
		mov eax, lo
		mov edx, hi
		div d
		mov qq, eax
		mov rr, edx
	}

	*q = qq;
	*r = rr;
}
/* END MASM (_MSC_VER) */
#else
  #error "need platform specific 64 bit multiplication and division"
#endif

static inline void
_mpd_divmod_pow10(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t v, mpd_uint_t exp)
{
  assert(exp <= 9);

  if (exp <= 4) {
    switch (exp) {
    case 0: *q = v; *r = 0; break;
    case 1: *q = v / 10UL; *r = v - *q * 10UL; break;
    case 2: *q = v / 100UL; *r = v - *q * 100UL; break;
    case 3: *q = v / 1000UL; *r = v - *q * 1000UL; break;
    case 4: *q = v / 10000UL; *r = v - *q * 10000UL; break;
    }
  }
  else {
    switch (exp) {
    case 5: *q = v / 100000UL; *r = v - *q * 100000UL; break;
    case 6: *q = v / 1000000UL; *r = v - *q * 1000000UL; break;
    case 7: *q = v / 10000000UL; *r = v - *q * 10000000UL; break;
    case 8: *q = v / 100000000UL; *r = v - *q * 100000000UL; break;
    case 9: *q = v / 1000000000UL; *r = v - *q * 1000000000UL; break; /* GCOV_NOT_REACHED */
    }
  }
}
/* END CONFIG_32 */

/* NO CONFIG */
#else
  #error "define CONFIG_64 or CONFIG_32"
#endif /* CONFIG */


static inline void
_mpd_div_word(mpd_uint_t *q, mpd_uint_t *r, mpd_uint_t v, mpd_uint_t d)
{
	*q = v / d;
	*r = v - *q * d;
}

static inline void
_mpd_idiv_word(mpd_ssize_t *q, mpd_ssize_t *r, mpd_ssize_t v, mpd_ssize_t d)
{
	*q = v / d;
	*r = v - *q * d;
}


/** ------------------------------------------------------------
 **              Arithmetic with overflow checking
 ** ------------------------------------------------------------
 */
static inline mpd_size_t
add_size_t(mpd_size_t a, mpd_size_t b)
{
	if (a > MPD_SIZE_MAX - b) {
		mpd_err_fatal("add_size_t(): overflow: check the context"); /* GCOV_NOT_REACHED */
	}
	return a + b;
}

static inline mpd_size_t
sub_size_t(mpd_size_t a, mpd_size_t b)
{
	if (b > a) {
		mpd_err_fatal("sub_size_t(): overflow: check the context"); /* GCOV_NOT_REACHED */
	}
	return a - b;
}

#if MPD_SIZE_MAX != MPD_UINT_MAX
  #error "adapt mul_size_t() and mulmod_size_t()"
#endif

static inline mpd_size_t
mul_size_t(mpd_size_t a, mpd_size_t b)
{
	mpd_uint_t hi, lo;

	_mpd_mul_words(&hi, &lo, (mpd_uint_t)a, (mpd_uint_t)b);
	if (hi) {
		mpd_err_fatal("mul_size_t(): overflow: check the context"); /* GCOV_NOT_REACHED */
	}
	return lo;
}

static inline mpd_ssize_t
mod_mpd_ssize_t(mpd_ssize_t a, mpd_ssize_t m)
{
	mpd_ssize_t r = a % m;
	return (r < 0) ? r + m : r;
}

static inline mpd_size_t
mulmod_size_t(mpd_size_t a, mpd_size_t b, mpd_size_t m)
{
	mpd_uint_t hi, lo;
	mpd_uint_t q, r;

	_mpd_mul_words(&hi, &lo, (mpd_uint_t)a, (mpd_uint_t)b);
	_mpd_div_words(&q, &r, hi, lo, (mpd_uint_t)m);

	return r;
}


#endif /* TYPEARITH_H */