|
|
@@ -1,6 +1,6 @@
|
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
|
// ----------------------------------------------------------------------------
|
|
|
-// Copyright 2011-2023 Arm Limited
|
|
|
+// Copyright 2011-2024 Arm Limited
|
|
|
//
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
|
|
// use this file except in compliance with the License. You may obtain a copy
|
|
|
@@ -29,6 +29,7 @@
|
|
|
#include <cstdio>
|
|
|
#endif
|
|
|
#include <cstdlib>
|
|
|
+#include <limits>
|
|
|
|
|
|
#include "astcenc.h"
|
|
|
#include "astcenc_mathlib.h"
|
|
|
@@ -325,10 +326,10 @@ struct partition_info
|
|
|
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS];
|
|
|
|
|
|
/** @brief The partition of each texel in the block. */
|
|
|
- uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
|
|
|
|
|
|
/** @brief The list of texels in each partition. */
|
|
|
- uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
|
|
|
};
|
|
|
|
|
|
/**
|
|
|
@@ -366,40 +367,40 @@ struct decimation_info
|
|
|
* @brief The number of weights that contribute to each texel.
|
|
|
* Value is between 1 and 4.
|
|
|
*/
|
|
|
- uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
|
|
|
|
|
|
/**
|
|
|
* @brief The weight index of the N weights that are interpolated for each texel.
|
|
|
* Stored transposed to improve vectorization.
|
|
|
*/
|
|
|
- uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS];
|
|
|
|
|
|
/**
|
|
|
* @brief The bilinear contribution of the N weights that are interpolated for each texel.
|
|
|
* Value is between 0 and 16, stored transposed to improve vectorization.
|
|
|
*/
|
|
|
- uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS];
|
|
|
|
|
|
/**
|
|
|
* @brief The bilinear contribution of the N weights that are interpolated for each texel.
|
|
|
* Value is between 0 and 1, stored transposed to improve vectorization.
|
|
|
*/
|
|
|
- alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS];
|
|
|
|
|
|
/** @brief The number of texels that each stored weight contributes to. */
|
|
|
- uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
|
|
|
+ ASTCENC_ALIGNAS uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
|
|
|
|
|
|
/**
|
|
|
* @brief The list of texels that use a specific weight index.
|
|
|
* Stored transposed to improve vectorization.
|
|
|
*/
|
|
|
- uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
|
|
|
+ ASTCENC_ALIGNAS uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
|
|
|
|
|
|
/**
|
|
|
* @brief The bilinear contribution to the N texels that use each weight.
|
|
|
* Value is between 0 and 1, stored transposed to improve vectorization.
|
|
|
*/
|
|
|
- alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
|
|
|
+ ASTCENC_ALIGNAS float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
|
|
|
|
|
|
/**
|
|
|
* @brief The bilinear contribution to the Nth texel that uses each weight.
|
|
|
@@ -579,7 +580,7 @@ struct block_size_descriptor
|
|
|
decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES];
|
|
|
|
|
|
/** @brief The active decimation tables, stored in low indices. */
|
|
|
- alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
|
|
|
+ ASTCENC_ALIGNAS decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
|
|
|
|
|
|
/** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */
|
|
|
uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES];
|
|
|
@@ -731,7 +732,11 @@ struct block_size_descriptor
|
|
|
*
|
|
|
* The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy
|
|
|
* vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR
|
|
|
- * data is stored as direct UNORM data, HDR data is stored as LNS data.
|
|
|
+ * data is stored as direct UNORM data, HDR data is stored as LNS data. They are allocated SIMD
|
|
|
+ * elements over-size to allow vectorized stores of unaligned and partial SIMD lanes (e.g. in a
|
|
|
+ * 6x6x6 block the final row write will read elements 210-217 (vec8) or 214-217 (vec4), which is
|
|
|
+ * two elements above the last real data element). The overspill values are never written to memory,
|
|
|
+ * and would be benign, but the padding avoids hitting undefined behavior.
|
|
|
*
|
|
|
* The @c rgb_lns and @c alpha_lns fields that assigned a per-texel use of HDR are only used during
|
|
|
* decompression. The current compressor will always use HDR endpoint formats when in HDR mode.
|
|
|
@@ -739,16 +744,16 @@ struct block_size_descriptor
|
|
|
struct image_block
|
|
|
{
|
|
|
/** @brief The input (compress) or output (decompress) data for the red color component. */
|
|
|
- alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS float data_r[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
|
|
|
|
|
|
/** @brief The input (compress) or output (decompress) data for the green color component. */
|
|
|
- alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS float data_g[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
|
|
|
|
|
|
/** @brief The input (compress) or output (decompress) data for the blue color component. */
|
|
|
- alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS float data_b[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
|
|
|
|
|
|
/** @brief The input (compress) or output (decompress) data for the alpha color component. */
|
|
|
- alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS + ASTCENC_SIMD_WIDTH - 1];
|
|
|
|
|
|
/** @brief The number of texels in the block. */
|
|
|
uint8_t texel_count;
|
|
|
@@ -771,6 +776,9 @@ struct image_block
|
|
|
/** @brief Is this grayscale block where R == G == B for all texels? */
|
|
|
bool grayscale;
|
|
|
|
|
|
+ /** @brief Is the eventual decode using decode_unorm8 rounding? */
|
|
|
+ bool decode_unorm8;
|
|
|
+
|
|
|
/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
|
|
|
uint8_t rgb_lns[BLOCK_MAX_TEXELS];
|
|
|
|
|
|
@@ -897,10 +905,10 @@ struct endpoints_and_weights
|
|
|
endpoints ep;
|
|
|
|
|
|
/** @brief The ideal weight for each texel; may be undecimated or decimated. */
|
|
|
- alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS float weights[BLOCK_MAX_TEXELS];
|
|
|
|
|
|
/** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */
|
|
|
- alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS];
|
|
|
+ ASTCENC_ALIGNAS float weight_error_scale[BLOCK_MAX_TEXELS];
|
|
|
};
|
|
|
|
|
|
/**
|
|
|
@@ -930,7 +938,7 @@ struct encoding_choice_errors
|
|
|
/**
|
|
|
* @brief Preallocated working buffers, allocated per thread during context creation.
|
|
|
*/
|
|
|
-struct alignas(ASTCENC_VECALIGN) compression_working_buffers
|
|
|
+struct ASTCENC_ALIGNAS compression_working_buffers
|
|
|
{
|
|
|
/** @brief Ideal endpoints and weights for plane 1. */
|
|
|
endpoints_and_weights ei1;
|
|
|
@@ -946,17 +954,17 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
|
|
|
*
|
|
|
* For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
|
|
|
*/
|
|
|
- alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
|
|
|
+ ASTCENC_ALIGNAS float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
|
|
|
|
|
|
/**
|
|
|
* @brief Decimated quantized weight values in the unquantized 0-64 range.
|
|
|
*
|
|
|
* For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
|
|
|
*/
|
|
|
- uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
|
|
|
+ ASTCENC_ALIGNAS uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
|
|
|
|
|
|
/** @brief Error of the best encoding combination for each block mode. */
|
|
|
- alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
|
|
|
+ ASTCENC_ALIGNAS float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
|
|
|
|
|
|
/** @brief The best color quant for each block mode. */
|
|
|
uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
|
|
|
@@ -1107,7 +1115,7 @@ struct symbolic_compressed_block
|
|
|
*
|
|
|
* If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
|
|
|
*/
|
|
|
- uint8_t weights[BLOCK_MAX_WEIGHTS];
|
|
|
+ ASTCENC_ALIGNAS uint8_t weights[BLOCK_MAX_WEIGHTS];
|
|
|
|
|
|
/**
|
|
|
* @brief Get the weight quantization used by this block mode.
|
|
|
@@ -1563,6 +1571,33 @@ unsigned int find_best_partition_candidates(
|
|
|
Functionality for managing images and image related data.
|
|
|
============================================================================ */
|
|
|
|
|
|
+/**
|
|
|
+ * @brief Get a vector mask indicating lanes decompressing into a UNORM8 value.
|
|
|
+ *
|
|
|
+ * @param decode_mode The color profile for LDR_SRGB settings.
|
|
|
+ * @param blk The image block for output image bitness settings.
|
|
|
+ *
|
|
|
+ * @return The component mask vector.
|
|
|
+ */
|
|
|
+static inline vmask4 get_u8_component_mask(
|
|
|
+ astcenc_profile decode_mode,
|
|
|
+ const image_block& blk
|
|
|
+) {
|
|
|
+ vmask4 u8_mask(false);
|
|
|
+ // Decode mode writing to a unorm8 output value
|
|
|
+ if (blk.decode_unorm8)
|
|
|
+ {
|
|
|
+ u8_mask = vmask4(true);
|
|
|
+ }
|
|
|
+ // SRGB writing to a unorm8 RGB value
|
|
|
+ else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
|
|
+ {
|
|
|
+ u8_mask = vmask4(true, true, true, false);
|
|
|
+ }
|
|
|
+
|
|
|
+ return u8_mask;
|
|
|
+}
|
|
|
+
|
|
|
/**
|
|
|
* @brief Setup computation of regional averages in an image.
|
|
|
*
|
|
|
@@ -1816,7 +1851,7 @@ uint8_t pack_color_endpoints(
|
|
|
*
|
|
|
* Endpoints must be unscrambled and converted into the 0-255 range before calling this functions.
|
|
|
*
|
|
|
- * @param decode_mode The decode mode (LDR, HDR).
|
|
|
+ * @param decode_mode The decode mode (LDR, HDR, etc).
|
|
|
* @param format The color endpoint mode used.
|
|
|
* @param input The raw array of encoded input integers. The length of this array
|
|
|
* depends on @c format; it can be safely assumed to be large enough.
|
|
|
@@ -2142,10 +2177,11 @@ Platform-specific functions.
|
|
|
/**
|
|
|
* @brief Allocate an aligned memory buffer.
|
|
|
*
|
|
|
- * Allocated memory must be freed by aligned_free;
|
|
|
+ * Allocated memory must be freed by aligned_free.
|
|
|
*
|
|
|
* @param size The desired buffer size.
|
|
|
- * @param align The desired buffer alignment; must be 2^N.
|
|
|
+ * @param align The desired buffer alignment; must be 2^N, may be increased
|
|
|
+ * by the implementation to a minimum allowable alignment.
|
|
|
*
|
|
|
* @return The memory buffer pointer or nullptr on allocation failure.
|
|
|
*/
|
|
|
@@ -2155,10 +2191,14 @@ T* aligned_malloc(size_t size, size_t align)
|
|
|
void* ptr;
|
|
|
int error = 0;
|
|
|
|
|
|
+ // Don't allow this to under-align a type
|
|
|
+ size_t min_align = astc::max(alignof(T), sizeof(void*));
|
|
|
+ size_t real_align = astc::max(min_align, align);
|
|
|
+
|
|
|
#if defined(_WIN32)
|
|
|
- ptr = _aligned_malloc(size, align);
|
|
|
+ ptr = _aligned_malloc(size, real_align);
|
|
|
#else
|
|
|
- error = posix_memalign(&ptr, align, size);
|
|
|
+ error = posix_memalign(&ptr, real_align, size);
|
|
|
#endif
|
|
|
|
|
|
if (error || (!ptr))
|