Add ktx

2026-06-14 19:09:18 +01:00
parent 14bd1a9271
commit 13fa90a0e9
3958 changed files with 999286 additions and 4 deletions
@@ -0,0 +1,65 @@
+#  SPDX-License-Identifier: Apache-2.0
+#  ----------------------------------------------------------------------------
+#  Copyright 2021 Arm Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"); you may not
+#  use this file except in compliance with the License. You may obtain a copy
+#  of the License at:
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#  License for the specific language governing permissions and limitations
+#  under the License.
+#  ----------------------------------------------------------------------------
+
+# Minimum CMake version and the modules this example relies on
+cmake_minimum_required(VERSION 3.15)
+include(ExternalProject)
+
+project(astcencoder_example VERSION 1.1.0)
+
+# Import astcenc as an external project, then query the project directories
+# needed to compile and link against it.
+#
+# The default build is a native build which supports the highest level of SIMD
+# exposed by the compiler when using default compiler flags. To force something
+# specific add a single SIMD enable to the CMAKE_CACHE_ARGS option, and change
+# the link library in target_link_libraries() to match:
+#
+#  *  Add "-DASTCENC_ISA_SSE2:String=ON" and link against "astcenc-sse2-static"
+#  *  Add "-DASTCENC_ISA_SSE41:String=ON" and link against "astcenc-sse4.1-static"
+#  *  Add "-DASTCENC_ISA_AVX2:String=ON" and link against "astcenc-avx2-static"
+#  *  Add "-DASTCENC_ISA_NEON:String=ON" and link against "astcenc-neon-static"
+ExternalProject_Add(astcencoder
+    GIT_REPOSITORY https://github.com/ARM-software/astc-encoder
+    GIT_TAG main
+    CMAKE_CACHE_ARGS
+        -DASTCENC_CLI:STRING=OFF
+        -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
+    INSTALL_COMMAND "")
+
+# Defines the SOURCE_DIR and BINARY_DIR variables for the external project
+ExternalProject_Get_Property(astcencoder SOURCE_DIR BINARY_DIR)
+
+# Build the example executable, with astcencoder as a dependency
+add_executable(astcenc_example astc_api_example.cpp)
+add_dependencies(astcenc_example astcencoder)
+
+# Put the astcencoder Source dir on the include path
+target_include_directories(astcenc_example PRIVATE ${SOURCE_DIR}/Source)
+
+# Put the astcencoder Binary dir on the library path and link the library
+target_link_directories(astcenc_example PRIVATE ${BINARY_DIR}/Source)
+target_link_libraries(astcenc_example PRIVATE astcenc-native-static)
@@ -0,0 +1,55 @@
+# Library usage example
+
+This is a minimal example of using the astcenc codec as a library in another
+project. This sample shows:
+
+  * How to include astcenc as an external project CMake dependency.
+  * How to use the API to compress and decompress an image.
+
+For the sake of simplicity the example application uses fixed compression
+settings, reading an uncompressed LDR image, compressing using 6x6 blocks at
+medium quality, and then decompressing and writing the decompressed image back
+to disk as a PNG file.
+
+## Building
+
+:warning: For the sake of simplicity the example CMake project uses the CMake
+`ExternalProject` mechanism to import the astcenc project from GitHub. This is
+trivial to integrate, but requires network access during the build to pull the
+astcenc project.
+
+Most users will want to store a copy of astcenc in a project sub-directory,
+e.g. by using git submodules, and then use `add_subdirectory()` to include the
+project in their build. This allows the user to directly use the astcenc core
+library as a link requirement via `target_link_libraries()`, without the
+additional plumbing that `ExternalProject` requires.
+
+### Linux and macOS
+
+From the `./Utils/Example` directory.
+
+```
+mkdir build
+cd build
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release ..
+make -j8
+```
+
+### Windows
+
+From the `./Utils/Example` directory, in a Visual Studio command prompt.
+
+```
+mkdir build
+cd build
+cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release ..
+nmake
+```
+
+## Running
+
+From the build directory above.
+
+```
+astcenc_example <input.png> <output.png>
+```
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+// This is a minimal example of using the astcenc library.
+//
+// This sample shows how to include the astcenc library in your CMake project
+// as an external dependency, and how to compress and decompress images using
+// the C library API.
+//
+// For the sake of clarity the command line exposed by the sample is
+// minimalistic, and the compression uses a fixed set of options, but the code
+// is commented to indicate where extension would be possible. Error handling
+// points are detected and logged, but resources are not cleaned up on error
+// paths to keep the sample control path simple, so resources will leak on
+// error.
+
+#include <stdio.h>
+
+#include "astcenc.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "ThirdParty/stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "ThirdParty/stb_image_write.h"
+
+/**
+ * @brief Compress an image to ASTC, decompress it again, and store the result.
+ *
+ * Expects exactly two arguments: the path of the image to load, and the path
+ * of the PNG file to write. All compressor settings are hard-coded below.
+ *
+ * Resources are deliberately not released on the early error returns, to keep
+ * the sample control path simple.
+ *
+ * @param argc   The number of command line arguments, including the program name.
+ * @param argv   The command line arguments.
+ *
+ * @return 0 on success, 1 on any error.
+ */
+int main(int argc, char **argv)
+{
+	// Parse command line
+	if (argc != 3)
+	{
+		printf("Usage:\n"
+		       "   %s <source> <dest>\n\n"
+		       "   <source> : Uncompressed LDR source image.\n"
+		       "   <dest>   : Uncompressed LDR destination image (png).\n"
+		       , argv[0]);
+		return 1;
+	}
+
+	// ------------------------------------------------------------------------
+	// For the purposes of this sample we hard-code the compressor settings
+	static const unsigned int thread_count = 1;
+	static const unsigned int block_x = 6;
+	static const unsigned int block_y = 6;
+	static const unsigned int block_z = 1;
+	static const astcenc_profile profile = ASTCENC_PRF_LDR;
+	static const float quality = ASTCENC_PRE_MEDIUM;
+	static const astcenc_swizzle swizzle {
+		ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
+	};
+
+	// ------------------------------------------------------------------------
+	// Load input image, forcing 4 components
+	int image_x, image_y, image_c;
+	uint8_t *image_data = (uint8_t*)stbi_load(argv[1], &image_x, &image_y, &image_c, 4);
+	if (!image_data)
+	{
+		printf("Failed to load image \"%s\"\n", argv[1]);
+		return 1;
+	}
+
+	// Compute the number of ASTC blocks in each dimension, rounding up so that
+	// partial blocks at the image edges are included
+	unsigned int block_count_x = (image_x + block_x - 1) / block_x;
+	unsigned int block_count_y = (image_y + block_y - 1) / block_y;
+
+	// ------------------------------------------------------------------------
+	// Initialize the default configuration for the block size and quality
+	astcenc_config config;
+	astcenc_error status;
+	status = astcenc_config_init(profile, block_x, block_y, block_z, quality, 0, &config);
+	if (status != ASTCENC_SUCCESS)
+	{
+		printf("ERROR: Codec config init failed: %s\n", astcenc_get_error_string(status));
+		return 1;
+	}
+
+	// ... power users can customize any config settings after calling
+	// config_init() and before calling context alloc().
+
+	// ------------------------------------------------------------------------
+	// Create a context based on the configuration
+	astcenc_context* context;
+	status = astcenc_context_alloc(&config, thread_count, &context);
+	if (status != ASTCENC_SUCCESS)
+	{
+		printf("ERROR: Codec context alloc failed: %s\n", astcenc_get_error_string(status));
+		return 1;
+	}
+
+	// ------------------------------------------------------------------------
+	// Compress the image
+	astcenc_image image;
+	image.dim_x = image_x;
+	image.dim_y = image_y;
+	image.dim_z = 1;
+	image.data_type = ASTCENC_TYPE_U8;
+	// The image stores an array of slice pointers; this image has one slice
+	uint8_t* slices = image_data;
+	image.data = reinterpret_cast<void**>(&slices);
+
+	// Space needed for 16 bytes of output per compressed block; widen before
+	// multiplying so the size is not computed in unsigned int
+	size_t comp_len = static_cast<size_t>(block_count_x) * block_count_y * 16;
+	uint8_t* comp_data = new uint8_t[comp_len];
+
+	status = astcenc_compress_image(context, &image, &swizzle, comp_data, comp_len, 0);
+	if (status != ASTCENC_SUCCESS)
+	{
+		printf("ERROR: Codec compress failed: %s\n", astcenc_get_error_string(status));
+		return 1;
+	}
+
+	// ... the comp_data array contains the raw compressed data you would pass
+	// to the graphics API, or pack into a wrapper format such as a KTX file.
+
+	// If using multithreaded compression to sequentially compress multiple
+	// images you should reuse the same context, calling the function
+	// astcenc_compress_reset() between each image in the series.
+
+	// ------------------------------------------------------------------------
+	// Decompress the image
+	// Note we just reuse the image structure to store the output here ...
+	status = astcenc_decompress_image(context, comp_data, comp_len, &image, &swizzle, 0);
+	if (status != ASTCENC_SUCCESS)
+	{
+		printf("ERROR: Codec decompress failed: %s\n", astcenc_get_error_string(status));
+		return 1;
+	}
+
+	// If using multithreaded decompression to sequentially decompress multiple
+	// images you should reuse the same context, calling the function
+	// astcenc_decompress_reset() between each image in the series.
+
+	// ------------------------------------------------------------------------
+	// Store the result back to disk; the writer returns 0 on failure
+	int write_ok = stbi_write_png(argv[2], image_x, image_y, 4, image_data, 4 * image_x);
+	if (!write_ok)
+	{
+		printf("ERROR: Failed to write image \"%s\"\n", argv[2]);
+	}
+
+	// ------------------------------------------------------------------------
+	// Cleanup library resources
+	stbi_image_free(image_data);
+	astcenc_context_free(context);
+	delete[] comp_data;
+
+	return write_ok ? 0 : 1;
+}
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+// This is a utility tool to test blend modes.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "astcenc_mathlib.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "ThirdParty/stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "ThirdParty/stb_image_write.h"
+
+/**
+ * @brief Convert an sRGB gamma-encoded value into a linear value.
+ *
+ * @param a   The sRGB encoded value.
+ *
+ * @return The linearized value.
+ */
+static float srgb_to_linear(
+	float a
+) {
+	// Small values sit on the linear segment, everything else on the power curve
+	const bool on_power_curve = !(a <= 0.04045f);
+
+	return on_power_curve ? powf((a + 0.055f) * (1.0f / 1.055f), 2.4f)
+	                      : a * (1.0f / 12.92f);
+}
+
+/**
+ * @brief Convert a linear value into an sRGB gamma-encoded value.
+ *
+ * @param a   The linear value.
+ *
+ * @return The gamma encoded value.
+ */
+static float linear_to_srgb(
+	float a
+) {
+	// Small values sit on the linear segment, everything else on the power curve
+	const bool on_power_curve = !(a <= 0.0031308f);
+
+	return on_power_curve ? 1.055f * powf(a, 1.0f / 2.4f) - 0.055f
+	                      : a * 12.92f;
+}
+
+/**
+ * @brief Blend one source color over a constant destination color and store it.
+ *
+ * If @c use_linear is false the source color is linearized from sRGB before
+ * blending, and the blended result is sRGB encoded again before it is stored.
+ * The destination color is used exactly as given. The stored alpha is always
+ * fully opaque.
+ *
+ * @param src              The source color and alpha (RGBA), as floats.
+ * @param dst              The destination color (RGB), as floats.
+ * @param use_linear       True if no sRGB conversion should be applied.
+ * @param use_post_blend   True for post-multiply blending, false for pre-multiply.
+ * @param[out] pixel_out   The four bytes (RGBA) that receive the result.
+ */
+static void blend_and_store(
+	const float src[4],
+	const float dst[3],
+	bool use_linear,
+	bool use_post_blend,
+	uint8_t* pixel_out
+) {
+	float a_src = src[3];
+
+	// Post-multiply blending scales the source color by the source alpha;
+	// pre-multiply blending uses the source color unscaled
+	float src_scale = use_post_blend ? a_src : 1.0f;
+
+	for (int c = 0; c < 3; c++)
+	{
+		float c_src = src[c];
+		if (use_linear == false)
+		{
+			c_src = srgb_to_linear(c_src);
+		}
+
+		float c_out = (dst[c] * (1.0f - a_src)) + (c_src * src_scale);
+
+		// Clamp color to at most 1.0f; all inputs are non-negative
+		c_out = astc::min(c_out, 1.0f);
+
+		if (use_linear == false)
+		{
+			c_out = linear_to_srgb(c_out);
+		}
+
+		pixel_out[c] = static_cast<uint8_t>(c_out * 255.0f);
+	}
+
+	pixel_out[3] = 255;
+}
+
+/**
+ * @brief Blend an image over a constant background color and store the result.
+ *
+ * Command line: <source> <dest> <format> <blend_mode> <filter>, where format
+ * is "linear" or "srgb", blend_mode is "post" or "pre", and filter is "on" or
+ * "off". With the filter on, each output pixel is the average of a 2x2 block
+ * of input pixels, so the output is one pixel smaller in each dimension.
+ *
+ * @param argc   The number of command line arguments, including the program name.
+ * @param argv   The command line arguments.
+ *
+ * @return 0 on success, 1 if the output image could not be written. Other
+ *         errors terminate the process with exit code 1.
+ */
+int main(int argc, char **argv)
+{
+	// Parse command line
+	if (argc != 6)
+	{
+		printf("Usage: astc_blend_test <source> <dest> <format> <blend_mode> <filter>\n");
+		exit(1);
+	}
+
+	const char* src_file = argv[1];
+	const char* dst_file = argv[2];
+
+	bool use_linear = false;
+	if (!strcmp(argv[3], "linear"))
+	{
+		use_linear = true;
+	}
+	else if (!strcmp(argv[3], "srgb"))
+	{
+		use_linear = false;
+	}
+	else
+	{
+		printf("<format> must be either 'linear' or 'srgb'\n");
+		exit(1);
+	}
+
+	bool use_post_blend = false;
+	if (!strcmp(argv[4], "post"))
+	{
+		use_post_blend = true;
+	}
+	else if (!strcmp(argv[4], "pre"))
+	{
+		use_post_blend = false;
+	}
+	else
+	{
+		printf("<blend_mode> must be either 'post' or 'pre'\n");
+		exit(1);
+	}
+
+	bool use_filter = false;
+	if (!strcmp(argv[5], "on"))
+	{
+		use_filter = true;
+	}
+	else if (!strcmp(argv[5], "off"))
+	{
+		use_filter = false;
+	}
+	else
+	{
+		printf("<filter> must be either 'on' or 'off'\n");
+		exit(1);
+	}
+
+	// Load the input image, forcing 4 components
+	int dim_x;
+	int dim_y;
+	uint8_t* data_in = stbi_load(src_file, &dim_x, &dim_y, nullptr, 4);
+	if (!data_in)
+	{
+		printf("ERROR: Failed to load input image.\n");
+		exit(1);
+	}
+
+	// The filter reads a 2x2 block per output pixel, so it needs at least a
+	// 2x2 input and produces an output one pixel smaller in each dimension
+	if (use_filter && (dim_x < 2 || dim_y < 2))
+	{
+		printf("ERROR: Filtering requires an input image of at least 2x2 pixels.\n");
+		exit(1);
+	}
+
+	const int out_dim_x = use_filter ? dim_x - 1 : dim_x;
+	const int out_dim_y = use_filter ? dim_y - 1 : dim_y;
+
+	// Allocate the output image; widen before multiplying to avoid int overflow
+	uint8_t* data_out = (uint8_t*)malloc(static_cast<size_t>(4) * out_dim_y * out_dim_x);
+	if (!data_out)
+	{
+		printf("ERROR: Failed to allocate output image.\n");
+		exit(1);
+	}
+
+	// The constant destination color; each filter mode uses its own color
+	static const float dst_unfiltered[3] { 0.53f, 0.53f, 0.53f };
+	static const float dst_filtered[3] { 0.8f, 1.0f, 0.8f };
+	const float* dst = use_filter ? dst_filtered : dst_unfiltered;
+
+	// For each pixel blending and filtering
+	for (int y = 0; y < out_dim_y; y++)
+	{
+		const uint8_t* row_in = data_in + (4 * dim_x * y);
+		uint8_t* row_out = data_out + (4 * out_dim_x * y);
+
+		for (int x = 0; x < out_dim_x; x++)
+		{
+			const uint8_t* pixel_in_00 = row_in + 4 * x;
+			uint8_t* pixel_out = row_out + 4 * x;
+
+			float src[4];
+			if (!use_filter)
+			{
+				for (int c = 0; c < 4; c++)
+				{
+					src[c] = static_cast<float>(pixel_in_00[c]) / 255.0f;
+				}
+			}
+			else
+			{
+				// Bilinear filter with a half-pixel offset, which is the
+				// average of the pixel and its right, lower, and lower-right
+				// neighbors
+				const uint8_t* pixel_in_01 = pixel_in_00 + 4;
+				const uint8_t* pixel_in_10 = pixel_in_00 + 4 * dim_x;
+				const uint8_t* pixel_in_11 = pixel_in_10 + 4;
+
+				for (int c = 0; c < 4; c++)
+				{
+					src[c] = static_cast<float>(pixel_in_00[c] + pixel_in_01[c] + pixel_in_10[c] + pixel_in_11[c]) / (255.0f * 4.0f);
+				}
+			}
+
+			blend_and_store(src, dst, use_linear, use_post_blend, pixel_out);
+		}
+	}
+
+	// Write out the result; the writer returns 0 on failure
+	int write_ok = stbi_write_png(dst_file, out_dim_x, out_dim_y, 4, data_out, 4 * out_dim_x);
+	if (!write_ok)
+	{
+		printf("ERROR: Failed to write output image.\n");
+	}
+
+	free(data_out);
+	stbi_image_free(data_in);
+
+	return write_ok ? 0 : 1;
+}
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+// This is a utility tool to generate quant tables
+#include <algorithm>
+#include <array>
+#include <bitset>
+#include <cstdio>
+#include <set>
+
+/**
+ * @brief The quantization methods used by ASTC.
+ *
+ * Each enumerator is named after the number of quantization levels it has.
+ *
+ * Note, the values here are used directly in the encoding in the format, so
+ * they must not be rearranged.
+ */
+enum quant_method
+{
+	QUANT_2   =  0,
+	QUANT_3   =  1,
+	QUANT_4   =  2,
+	QUANT_5   =  3,
+	QUANT_6   =  4,
+	QUANT_8   =  5,
+	QUANT_10  =  6,
+	QUANT_12  =  7,
+	QUANT_16  =  8,
+	QUANT_20  =  9,
+	QUANT_24  = 10,
+	QUANT_32  = 11,
+	QUANT_40  = 12,
+	QUANT_48  = 13,
+	QUANT_64  = 14,
+	QUANT_80  = 15,
+	QUANT_96  = 16,
+	QUANT_128 = 17,
+	QUANT_160 = 18,
+	QUANT_192 = 19,
+	QUANT_256 = 20
+};
+
+/**
+ * @brief Get the number of quantization levels for a quantization method.
+ *
+ * @param method   The quantization method.
+ *
+ * @return The number of levels, or 0 if @c method is not a known enumerator.
+ */
+static inline unsigned int get_quant_level(quant_method method)
+{
+	// Level counts indexed by enumerator value, from QUANT_2 to QUANT_256
+	static const unsigned int level_counts[] {
+		  2,   3,   4,   5,   6,   8,  10,
+		 12,  16,  20,  24,  32,  40,  48,
+		 64,  80,  96, 128, 160, 192, 256
+	};
+
+	const int table_size = static_cast<int>(sizeof(level_counts) / sizeof(level_counts[0]));
+	const int index = static_cast<int>(method);
+
+	// Not expected to trigger - the enum is fully described by the table
+	if (index < 0 || index >= table_size)
+	{
+		return 0;
+	}
+
+	return level_counts[index];
+}
+
+/**
+ * @brief The table generation parameters for one quantization method.
+ *
+ * The fields are used by generate_unpacked_quant() as follows:
+ *
+ *   - quant:  the quantization method being described.
+ *   - bits:   the number of plain bits in each encoded value.
+ *   - trits:  non-zero if each value also has a trit (three states).
+ *   - quints: non-zero if each value also has a quint (five states); this is
+ *             only checked when trits is zero.
+ *   - C:      the multiplier applied to the trit or quint state.
+ *   - masks:  masks[n] is added to the unpacked value when bit n is set.
+ */
+struct quant_config {
+	quant_method quant;
+	unsigned int bits;
+	unsigned int trits;
+	unsigned int quints;
+	unsigned int C;
+	unsigned int masks[6];
+};
+
+/**
+ * @brief The configurations for the 17 quantization methods QUANT_6 to QUANT_256.
+ *
+ * Each entry lists, in order: the quantization method, the bits / trits /
+ * quints counts, the C multiplier, and the six per-bit masks. Entries that use
+ * only bits set C to zero and leave all of the masks zeroed.
+ */
+const std::array<quant_config, 17> quant_configs {{
+	{
+		QUANT_6,
+		1, 1, 0,
+		204,
+		{
+			0b000000000,
+			0b000000000,
+			0b000000000,
+			0b000000000,
+			0b000000000,
+			0b000000000
+		}
+	}, {
+		QUANT_8,
+		3, 0, 0,
+		0,
+		{ 0 }
+	}, {
+		QUANT_10,
+		1, 0, 1,
+		113,
+		{
+			0b000000000,
+			0b000000000,
+			0b000000000,
+			0b000000000,
+			0b000000000,
+			0b000000000
+		}
+	}, {
+		QUANT_12,
+		2, 1, 0,
+		93,
+		{
+			0b000000000,
+			0b100010110,
+			0b000000000,
+			0b000000000,
+			0b000000000,
+			0b000000000
+		}
+	}, {
+		QUANT_16,
+		4, 0, 0,
+		0,
+		{ 0 }
+	}, {
+		QUANT_20,
+		2, 0, 1,
+		54,
+		{
+			0b000000000,
+			0b100001100,
+			0b000000000,
+			0b000000000,
+			0b000000000,
+			0b000000000
+		}
+	}, {
+		QUANT_24,
+		3, 1, 0,
+		44,
+		{
+			0b000000000,
+			0b010000101,
+			0b100001010,
+			0b000000000,
+			0b000000000,
+			0b000000000
+		}
+	}, {
+		QUANT_32,
+		5, 0, 0,
+		0,
+		{ 0 }
+	},
+	{
+		QUANT_40,
+		3, 0, 1,
+		26,
+		{
+			0b000000000,
+			0b010000010,
+			0b100000101,
+			0b000000000,
+			0b000000000,
+			0b000000000
+		}
+	}, {
+		QUANT_48,
+		4, 1, 0,
+		22,
+		{
+			0b000000000,
+			0b001000001,
+			0b010000010,
+			0b100000100,
+			0b000000000,
+			0b000000000
+		}
+	}, {
+		QUANT_64,
+		6, 0, 0,
+		0,
+		{ 0 }
+	}, {
+		QUANT_80,
+		4, 0, 1,
+		13,
+		{
+			0b000000000,
+			0b001000000,
+			0b010000001,
+			0b100000010,
+			0b000000000,
+			0b000000000
+		}
+	}, {
+		QUANT_96,
+		5, 1, 0,
+		11,
+		{
+			0b000000000,
+			0b000100000,
+			0b001000000,
+			0b010000001,
+			0b100000010,
+			0b000000000
+		}
+	}, {
+		QUANT_128,
+		7, 0, 0,
+		0,
+		{ 0 }
+	}, {
+		QUANT_160,
+		5, 0, 1,
+		6,
+		{
+			0b000000000,
+			0b000100000,
+			0b001000000,
+			0b010000000,
+			0b100000001,
+			0b000000000
+		}
+	}, {
+		QUANT_192,
+		6, 1, 0,
+		5,
+		{
+			0b000000000,
+			0b000010000,
+			0b000100000,
+			0b001000000,
+			0b010000000,
+			0b100000001
+		}
+	}, {
+		QUANT_256,
+		8, 0, 0,
+		0,
+		{ 0 }
+	}
+}};
+
+/**
+ * @brief Generate the set of unpacked values that a quantization method can encode.
+ *
+ * For methods with a trit or a quint, every combination of trit/quint state D
+ * and bit pattern is turned into a value built from D * C plus the masks of
+ * the set bits. For methods with only bits, the bit pattern is replicated
+ * until it fills 8 bits.
+ *
+ * @param      config   The configuration of the quantization method.
+ * @param[out] set      The set that the generated values are inserted into.
+ */
+void generate_unpacked_quant(
+	const quant_config& config,
+	std::set<unsigned int>& set
+) {
+	const unsigned int max_bits = 1u << config.bits;
+
+	// Value has N bits and either 1 trit or 1 quint
+	if (config.trits || config.quints)
+	{
+		// A trit has 3 states and a quint has 5; trits are checked first
+		const unsigned int states = config.trits ? 3 : 5;
+
+		for (unsigned int D = 0; D < states; D++)
+		{
+			for (unsigned int bits = 0; bits < max_bits; bits++)
+			{
+				// A is all ones if the lowest bit is set, else all zeros
+				unsigned int A = (bits & 1) * 0b111111111;
+
+				// B accumulates the mask of every bit that is set
+				unsigned int B = 0;
+				unsigned int bit = bits;
+				for (const auto& mask_n: config.masks)
+				{
+					unsigned int bit_n = bit & 1;
+					bit >>= 1;
+					B += bit_n * mask_n;
+				}
+
+				unsigned int T = D * config.C + B;
+				T = T ^ A;
+				T = (A & 0x80) | (T >> 2);
+				set.insert(T);
+			}
+		}
+
+		return;
+	}
+
+	// Value has N bits
+	const int bit_count = static_cast<int>(config.bits);
+	for (unsigned int bits = 0; bits < max_bits; bits++)
+	{
+		// Place the pattern in the top bits, then keep replicating it below
+		// until all 8 bits are filled
+		unsigned int T = bits << (8 - bit_count);
+		int bits_remaining = 8 - bit_count;
+
+		while (bits_remaining > 0)
+		{
+			int shift = bits_remaining - bit_count;
+			bits_remaining -= bit_count;
+			if (shift > 0)
+			{
+				T |= bits << shift;
+			}
+			else
+			{
+				// The last copy only partly fits, so keep its top bits
+				T |= bits >> -shift;
+			}
+		}
+		set.insert(T);
+	}
+}
+
+/**
+ * @brief Print the table mapping each 8-bit value to its nearest unpacked values.
+ *
+ * For each value from 0 to 255 this prints the lowest and the highest value in
+ * the set which are at the smallest distance from it. The two are the same
+ * unless two set members are equally close. Output is 16 pairs per line.
+ *
+ * @param config   The configuration of the quantization method (not used).
+ * @param set      The set of values the quantization method can encode.
+ */
+void generate_unquant_to_unpacked_quant(
+	const quant_config& config,
+	const std::set<unsigned int>& set
+) {
+	for (unsigned int i = 0; i < 256; i++)
+	{
+		unsigned int best_dist = 256;
+		unsigned int nearest_lo = 256;
+		unsigned int nearest_hi = 0;
+
+		for (unsigned int candidate: set)
+		{
+			unsigned int dist = (candidate > i) ? (candidate - i) : (i - candidate);
+
+			// Further away than the best so far, so not interesting
+			if (dist > best_dist)
+			{
+				continue;
+			}
+
+			if (dist < best_dist)
+			{
+				// A strictly closer value replaces both bounds
+				best_dist = dist;
+				nearest_lo = candidate;
+				nearest_hi = candidate;
+			}
+			else
+			{
+				// An equally close value can only widen the bounds
+				nearest_lo = std::min(nearest_lo, candidate);
+				nearest_hi = std::max(nearest_hi, candidate);
+			}
+		}
+
+		const char* indent = ((i % 16) == 0) ? "\t\t" : "";
+		const char* separator = (i != 255) ? ", " : "";
+		const char* line_end = ((i % 16) == 15) ? "\n" : "";
+
+		printf("%s%3u, %3u%s%s", indent, nearest_lo, nearest_hi, separator, line_end);
+	}
+}
+
+/**
+ * @brief Print the color unquantization lookup tables as a C++ array definition.
+ *
+ * One 512 entry table is printed to stdout for each entry in quant_configs.
+ *
+ * @return Always 0.
+ */
+int main(void)
+{
+	// Size the outer dimension from the table so the two cannot drift apart
+	printf("const uint8_t color_unquant_to_uquant_tables[%zu][512] {\n", quant_configs.size());
+	for (const auto& config: quant_configs)
+	{
+		std::set<unsigned int> set;
+
+		printf("\t{ // QUANT_%u\n", get_quant_level(config.quant));
+		generate_unpacked_quant(config, set);
+		generate_unquant_to_unpacked_quant(config, set);
+		printf("\t},\n");
+	}
+	printf("};\n");
+	return 0;
+}
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+// This is a utility tool to encode HDR into RGBM, or decode RGBM into HDR.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "astcenc_mathlib.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "ThirdParty/stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "ThirdParty/stb_image_write.h"
+
+#define MODE_ENCODE 0
+#define MODE_DECODE 1
+
+/**
+ * @brief Encode an HDR image into an RGBM LDR image, or decode RGBM back to HDR.
+ *
+ * Command line: [-ch|-dh] <M> <low_clamp> <source> <dest>, where -ch encodes,
+ * -dh decodes, M is the RGBM multiplier, and low_clamp is the smallest value
+ * (on a 0 to 255 scale) that the encoder stores in the M channel.
+ *
+ * @param argc   The number of command line arguments, including the program name.
+ * @param argv   The command line arguments.
+ *
+ * @return 0 on success, 1 if the output image could not be written. Other
+ *         errors terminate the process with exit code 1.
+ */
+int main(int argc, char **argv)
+{
+	// Parse command line
+	if (argc != 6)
+	{
+		printf("Usage: astc_rgbm_codec [-ch|-dh] <M> <low_clamp> <source> <dest>\n");
+		exit(1);
+	}
+
+	int opmode;
+	if (strcmp(argv[1], "-ch") == 0)
+	{
+		opmode = MODE_ENCODE;
+	}
+	else if (strcmp(argv[1], "-dh") == 0)
+	{
+		opmode = MODE_DECODE;
+	}
+	else
+	{
+		printf("ERROR: Bad operation mode\n");
+		exit(1);
+	}
+
+	float rgbm_multiplier = static_cast<float>(atof(argv[2]));
+	float low_clamp = static_cast<float>(atof(argv[3]));
+
+	// Every pixel is divided by the multiplier when encoding, so zero or a
+	// negative value cannot give a usable result
+	if (!(rgbm_multiplier > 0.0f))
+	{
+		printf("ERROR: <M> must be greater than zero\n");
+		exit(1);
+	}
+
+	const char* src_file = argv[4];
+	const char* dst_file = argv[5];
+
+	int write_ok;
+
+	// Convert an HDR input file into an RGBM encoded LDR file
+	if (opmode == MODE_ENCODE)
+	{
+		// Load the input image, forcing 4 components
+		int dim_x;
+		int dim_y;
+		float* data_in = stbi_loadf(src_file, &dim_x, &dim_y, nullptr, 4);
+		if (!data_in)
+		{
+			printf("ERROR: Failed to load input image.\n");
+			exit(1);
+		}
+
+		// Allocate the output image; widen before multiplying to avoid int
+		// overflow
+		uint8_t* data_out = (uint8_t*)malloc(static_cast<size_t>(4) * dim_y * dim_x);
+		if (!data_out)
+		{
+			printf("ERROR: Failed to allocate output image.\n");
+			exit(1);
+		}
+
+		// For each pixel apply RGBM encoding
+		for (int y = 0; y < dim_y; y++)
+		{
+			const float* row_in = data_in + (4 * dim_x * y);
+			uint8_t* row_out = data_out + (4 * dim_x * y);
+
+			for (int x = 0; x < dim_x; x++)
+			{
+				const float* pixel_in = row_in + 4 * x;
+				uint8_t* pixel_out = row_out + 4 * x;
+
+				float r_in = pixel_in[0] / rgbm_multiplier;
+				float g_in = pixel_in[1] / rgbm_multiplier;
+				float b_in = pixel_in[2] / rgbm_multiplier;
+
+				float max_rgb = astc::max(r_in, g_in, b_in);
+
+				// Ensure we always round up to next largest M
+				float m_scale = astc::min(1.0f, ceil(max_rgb * 255.0f) / 255.0f);
+
+				// But keep well above zero to avoid clamps in the compressor
+				m_scale = astc::max(m_scale, low_clamp / 255.0f);
+
+				float r_scale = astc::min(1.0f, r_in / m_scale);
+				float g_scale = astc::min(1.0f, g_in / m_scale);
+				float b_scale = astc::min(1.0f, b_in / m_scale);
+
+				pixel_out[0] = (uint8_t)(r_scale * 255.0f);
+				pixel_out[1] = (uint8_t)(g_scale * 255.0f);
+				pixel_out[2] = (uint8_t)(b_scale * 255.0f);
+				pixel_out[3] = (uint8_t)(m_scale * 255.0f);
+			}
+		}
+
+		// Write out the result; the writer returns 0 on failure
+		write_ok = stbi_write_png(dst_file, dim_x, dim_y, 4, data_out, 4 * dim_x);
+
+		free(data_out);
+		stbi_image_free(data_in);
+	}
+	// Convert an RGBM encoded LDR file into an HDR file
+	else
+	{
+		// Load the input image, forcing 4 components
+		int dim_x;
+		int dim_y;
+		uint8_t* data_in = stbi_load(src_file, &dim_x, &dim_y, nullptr, 4);
+		if (!data_in)
+		{
+			printf("ERROR: Failed to load input image.\n");
+			exit(1);
+		}
+
+		// Allocate the output image; widen before multiplying to avoid int
+		// overflow
+		float* data_out = (float*)malloc(static_cast<size_t>(4) * dim_y * dim_x * sizeof(float));
+		if (!data_out)
+		{
+			printf("ERROR: Failed to allocate output image.\n");
+			exit(1);
+		}
+
+		// For each pixel apply RGBM decoding
+		for (int y = 0; y < dim_y; y++)
+		{
+			const uint8_t* row_in = data_in + (4 * dim_x * y);
+			float* row_out = data_out + (4 * dim_x * y);
+
+			for (int x = 0; x < dim_x; x++)
+			{
+				const uint8_t* pixel_in = row_in + 4 * x;
+				float* pixel_out = row_out + 4 * x;
+
+				float r_scale = ((float)pixel_in[0]) / 255.0f;
+				float g_scale = ((float)pixel_in[1]) / 255.0f;
+				float b_scale = ((float)pixel_in[2]) / 255.0f;
+
+				float m_scale = ((float)pixel_in[3]) / 255.0f;
+
+				pixel_out[0] = r_scale * (m_scale * rgbm_multiplier);
+				pixel_out[1] = g_scale * (m_scale * rgbm_multiplier);
+				pixel_out[2] = b_scale * (m_scale * rgbm_multiplier);
+				pixel_out[3] = 1.0f;
+			}
+		}
+
+		// Write out the result; the writer returns 0 on failure
+		write_ok = stbi_write_hdr(dst_file, dim_x, dim_y, 4, data_out);
+
+		free(data_out);
+		stbi_image_free(data_in);
+	}
+
+	if (!write_ok)
+	{
+		printf("ERROR: Failed to write output image.\n");
+		return 1;
+	}
+
+	return 0;
+}
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+// Overview
+// ========
+//
+// This is a utility tool to automatically generate single tile test vectors
+// out of a larger test image. This tool takes three input images:
+//
+//    - the uncompressed reference,
+//    - the known-good compressed reference,
+//    - a new compressed image.
+//
+// The two compressed images are compared block-by-block, and if any block
+// differences are found the worst block is extracted from the uncompressed
+// reference and written back to disk as a single tile output image.
+//
+// Limitations
+// ===========
+//
+// This tool only currently supports 2D LDR images.
+//
+// Build
+// =====
+//
+// g++ astc_test_autoextract.cpp -I../Source -o astc_test_autoextract
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "ThirdParty/stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "ThirdParty/stb_image_write.h"
+
+/**
+ * @brief Compute the flat array index of one component in a packed 2D image.
+ *
+ * The image is treated as row-major with interleaved components.
+ *
+ * @param x_pix   Number of pixels per image row.
+ * @param y_idx   Row of the pixel.
+ * @param x_idx   Column of the pixel.
+ * @param chans   Number of components stored per pixel.
+ * @param p_idx   Index of the component within the pixel.
+ *
+ * @return The array index of the requested component.
+ */
+int pix(int x_pix, int y_idx, int x_idx, int chans, int p_idx)
+{
+	int pixel_index = (y_idx * x_pix) + x_idx;
+	return (pixel_index * chans) + p_idx;
+}
+
+/**
+ * @brief Extract the worst-differing block of an LDR image as a test tile.
+ *
+ * Loads the reference, known-good, and new ("bad") images as 4 component
+ * 8-bit data, accumulates the sum of squared component differences between
+ * the good and bad images for every block, and writes the block with the
+ * largest error, taken from the reference image, to the output PNG.
+ *
+ * @param argc   Argument count; at least 6 entries are required.
+ * @param argv   Arguments: <blocksize> <ref> <good> <bad> <out>.
+ *
+ * @return 0 on success (including when no differences exist), 1 on error.
+ */
+int main(int argc, char **argv)
+{
+	// Parse command line
+	if (argc < 6)
+	{
+		printf("Usage: astc_test_autoextract <blocksize> <ref> <good> <bad> <out>\n");
+		return 1;
+	}
+
+	// Reject non-positive sizes; they would divide by zero when mapping
+	// pixels to blocks
+	int blockdim_x = 0;
+	int blockdim_y = 0;
+	if (sscanf(argv[1], "%dx%d", &blockdim_x, &blockdim_y) < 2 ||
+		blockdim_x <= 0 || blockdim_y <= 0)
+	{
+		printf("blocksize must be of form WxH; e.g. 8x4\n");
+		return 1;
+	}
+
+	// Load the original reference image
+	int ref_dim_x, ref_dim_y;
+	uint8_t* data_ref = (uint8_t*)stbi_load(argv[2], &ref_dim_x, &ref_dim_y, nullptr, 4);
+	if (!data_ref)
+	{
+		printf("Failed to load reference image.\n");
+		return 1;
+	}
+
+	// Load the good test image
+	int good_dim_x, good_dim_y;
+	uint8_t* data_good = (uint8_t*)stbi_load(argv[3], &good_dim_x, &good_dim_y, nullptr, 4);
+	if (!data_good)
+	{
+		printf("Failed to load good test image.\n");
+		stbi_image_free(data_ref);
+		return 1;
+	}
+
+	// Load the bad test image
+	int bad_dim_x, bad_dim_y;
+	uint8_t* data_bad = (uint8_t*)stbi_load(argv[4], &bad_dim_x, &bad_dim_y, nullptr, 4);
+	if (!data_bad)
+	{
+		printf("Failed to load bad test image.\n");
+		stbi_image_free(data_ref);
+		stbi_image_free(data_good);
+		return 1;
+	}
+
+	if (ref_dim_x != good_dim_x || ref_dim_x != bad_dim_x ||
+		ref_dim_y != good_dim_y || ref_dim_y != bad_dim_y)
+	{
+		printf("Failed as images are different resolutions.\n");
+		stbi_image_free(data_ref);
+		stbi_image_free(data_good);
+		stbi_image_free(data_bad);
+		return 1;
+	}
+
+	// Number of blocks needed to cover the image, rounding up partial blocks
+	int x_blocks = (ref_dim_x + blockdim_x - 1) / blockdim_x;
+	int y_blocks = (ref_dim_y + blockdim_y - 1) / blockdim_y;
+
+	// Zero-initialized per-block error accumulators; 64-bit so that large
+	// block sizes cannot overflow the sum of squared differences
+	long long* errorsums = (long long*)calloc((size_t)x_blocks * (size_t)y_blocks, sizeof(long long));
+	if (!errorsums)
+	{
+		printf("Failed to allocate block error storage.\n");
+		stbi_image_free(data_ref);
+		stbi_image_free(data_good);
+		stbi_image_free(data_bad);
+		return 1;
+	}
+
+	// Diff the two test images to find blocks that differ
+	for (int y = 0; y < ref_dim_y; y++)
+	{
+		int y_block = y / blockdim_y;
+
+		for (int x = 0; x < ref_dim_x; x++)
+		{
+			int x_block = x / blockdim_x;
+
+			const uint8_t* px_good = data_good + pix(ref_dim_x, y, x, 4, 0);
+			const uint8_t* px_bad = data_bad + pix(ref_dim_x, y, x, 4, 0);
+
+			// Sum of squared differences over the R, G, B, and A components
+			int diff = 0;
+			for (int c = 0; c < 4; c++)
+			{
+				int delta = (int)px_good[c] - (int)px_bad[c];
+				diff += delta * delta;
+			}
+
+			errorsums[pix(x_blocks, y_block, x_block, 1, 0)] += diff;
+		}
+	}
+
+	// Find the block with the largest accumulated error
+	long long worst_error = 0;
+	int worst_x_block = 0;
+	int worst_y_block = 0;
+	for (int y = 0; y < y_blocks; y++)
+	{
+		for (int x = 0; x < x_blocks; x++)
+		{
+			long long error = errorsums[pix(x_blocks, y, x, 1, 0)];
+			if (error > worst_error)
+			{
+				worst_error = error;
+				worst_x_block = x;
+				worst_y_block = y;
+			}
+		}
+	}
+
+	int status = 0;
+	if (worst_error == 0)
+	{
+		printf("No block errors found\n");
+	}
+	else
+	{
+		int start_y = worst_y_block * blockdim_y;
+		int start_x = worst_x_block * blockdim_x;
+
+		int end_y = (worst_y_block + 1) * blockdim_y;
+		int end_x = (worst_x_block + 1) * blockdim_x;
+
+		// Clip blocks on the right and bottom edges to the image extent
+		if (end_x > ref_dim_x)
+		{
+			end_x = ref_dim_x;
+		}
+
+		if (end_y > ref_dim_y)
+		{
+			end_y = ref_dim_y;
+		}
+
+		int outblk_x = end_x - start_x;
+		int outblk_y = end_y - start_y;
+
+		printf("Block errors found at ~(%d, %d) px\n", start_x, start_y);
+
+		// Write out the worst bad block (from original reference); the block
+		// is written in place from the reference by passing the full image
+		// row pitch as the stride
+		uint8_t* data_out = &(data_ref[pix(ref_dim_x, start_y, start_x, 4, 0)]);
+		if (!stbi_write_png(argv[5], outblk_x, outblk_y, 4, data_out, 4 * ref_dim_x))
+		{
+			printf("Failed to write output image.\n");
+			status = 1;
+		}
+	}
+
+	free(errorsums);
+	stbi_image_free(data_ref);
+	stbi_image_free(data_good);
+	stbi_image_free(data_bad);
+	return status;
+}
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+// Overview
+// ========
+//
+// This is a utility tool to automatically generate single tile test vectors
+// out of a larger test image. This tool takes three input images:
+//
+//    - the uncompressed reference,
+//    - the known-good compressed reference,
+//    - a new compressed image.
+//
+// The two compressed images are compared block-by-block, and if any block
+// differences are found the worst block is extracted from the uncompressed
+// reference and written back to disk as a single tile output image.
+//
+// Limitations
+// ===========
+//
+// This tool only currently supports 2D HDR images.
+//
+// Build
+// =====
+//
+// g++ astc_test_autoextract_hdr.cpp -I../Source -o astc_test_autoextract_hdr
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "ThirdParty/stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "ThirdParty/stb_image_write.h"
+
+/**
+ * @brief Compute the flat array index of one component in a packed 2D image.
+ *
+ * The image is treated as row-major with interleaved components.
+ *
+ * @param x_pix   Number of pixels per image row.
+ * @param y_idx   Row of the pixel.
+ * @param x_idx   Column of the pixel.
+ * @param chans   Number of components stored per pixel.
+ * @param p_idx   Index of the component within the pixel.
+ *
+ * @return The array index of the requested component.
+ */
+int pix(int x_pix, int y_idx, int x_idx, int chans, int p_idx)
+{
+	int pixel_index = (y_idx * x_pix) + x_idx;
+	return (pixel_index * chans) + p_idx;
+}
+
+/**
+ * @brief Extract the worst-differing block of an HDR image as a test tile.
+ *
+ * Loads the reference, known-good, and new ("bad") images as 4 component
+ * float data, accumulates the sum of squared component differences between
+ * the good and bad images for every block, and writes the block with the
+ * largest error, copied from the reference image, to the output HDR file.
+ *
+ * @param argc   Argument count; at least 6 entries are required.
+ * @param argv   Arguments: <blocksize> <ref> <good> <bad> <out>.
+ *
+ * @return 0 on success (including when no differences exist), 1 on error.
+ */
+int main(int argc, char **argv)
+{
+	// Parse command line
+	if (argc < 6)
+	{
+		printf("Usage: astc_test_autoextract_hdr <blocksize> <ref> <good> <bad> <out>\n");
+		return 1;
+	}
+
+	// Reject non-positive sizes; they would divide by zero when mapping
+	// pixels to blocks
+	int blockdim_x = 0;
+	int blockdim_y = 0;
+	if (sscanf(argv[1], "%dx%d", &blockdim_x, &blockdim_y) < 2 ||
+		blockdim_x <= 0 || blockdim_y <= 0)
+	{
+		printf("blocksize must be of form WxH; e.g. 8x4\n");
+		return 1;
+	}
+
+	// Load the original reference image
+	int ref_dim_x, ref_dim_y;
+	float* data_ref = (float*)stbi_loadf(argv[2], &ref_dim_x, &ref_dim_y, nullptr, 4);
+	if (!data_ref)
+	{
+		printf("Failed to load reference image.\n");
+		return 1;
+	}
+
+	// Load the good test image
+	int good_dim_x, good_dim_y;
+	float* data_good = (float*)stbi_loadf(argv[3], &good_dim_x, &good_dim_y, nullptr, 4);
+	if (!data_good)
+	{
+		printf("Failed to load good test image.\n");
+		stbi_image_free(data_ref);
+		return 1;
+	}
+
+	// Load the bad test image
+	int bad_dim_x, bad_dim_y;
+	float* data_bad = (float*)stbi_loadf(argv[4], &bad_dim_x, &bad_dim_y, nullptr, 4);
+	if (!data_bad)
+	{
+		printf("Failed to load bad test image.\n");
+		stbi_image_free(data_ref);
+		stbi_image_free(data_good);
+		return 1;
+	}
+
+	if (ref_dim_x != good_dim_x || ref_dim_x != bad_dim_x ||
+		ref_dim_y != good_dim_y || ref_dim_y != bad_dim_y)
+	{
+		printf("Failed as images are different resolutions.\n");
+		stbi_image_free(data_ref);
+		stbi_image_free(data_good);
+		stbi_image_free(data_bad);
+		return 1;
+	}
+
+	// Number of blocks needed to cover the image, rounding up partial blocks
+	int x_blocks = (ref_dim_x + blockdim_x - 1) / blockdim_x;
+	int y_blocks = (ref_dim_y + blockdim_y - 1) / blockdim_y;
+
+	// Per-block error accumulators
+	size_t block_count = (size_t)x_blocks * (size_t)y_blocks;
+	float* errorsums = (float*)malloc(block_count * sizeof(float));
+	if (!errorsums)
+	{
+		printf("Failed to allocate block error storage.\n");
+		stbi_image_free(data_ref);
+		stbi_image_free(data_good);
+		stbi_image_free(data_bad);
+		return 1;
+	}
+
+	for (size_t i = 0; i < block_count; i++)
+	{
+		errorsums[i] = 0.0f;
+	}
+
+	// Diff the two test images to find blocks that differ
+	for (int y = 0; y < ref_dim_y; y++)
+	{
+		int y_block = y / blockdim_y;
+
+		for (int x = 0; x < ref_dim_x; x++)
+		{
+			int x_block = x / blockdim_x;
+
+			const float* px_good = data_good + pix(ref_dim_x, y, x, 4, 0);
+			const float* px_bad = data_bad + pix(ref_dim_x, y, x, 4, 0);
+
+			// Sum of squared differences over the R, G, B, and A components.
+			// No abs() is applied: the squares are already non-negative, and
+			// an integer abs() overload would truncate the float values.
+			float diff = 0.0f;
+			for (int c = 0; c < 4; c++)
+			{
+				float delta = px_good[c] - px_bad[c];
+				diff += delta * delta;
+			}
+
+			errorsums[pix(x_blocks, y_block, x_block, 1, 0)] += diff;
+		}
+	}
+
+	// Find the block with the largest accumulated error
+	float worst_error = 0.0f;
+	int worst_x_block = 0;
+	int worst_y_block = 0;
+	for (int y = 0; y < y_blocks; y++)
+	{
+		for (int x = 0; x < x_blocks; x++)
+		{
+			float error = errorsums[pix(x_blocks, y, x, 1, 0)];
+			if (error > worst_error)
+			{
+				worst_error = error;
+				worst_x_block = x;
+				worst_y_block = y;
+			}
+		}
+	}
+
+	int status = 0;
+	if (worst_error == 0.0f)
+	{
+		printf("No block errors found\n");
+	}
+	else
+	{
+		int start_y = worst_y_block * blockdim_y;
+		int start_x = worst_x_block * blockdim_x;
+
+		int end_y = (worst_y_block + 1) * blockdim_y;
+		int end_x = (worst_x_block + 1) * blockdim_x;
+
+		// Clip blocks on the right and bottom edges to the image extent
+		if (end_x > ref_dim_x)
+		{
+			end_x = ref_dim_x;
+		}
+
+		if (end_y > ref_dim_y)
+		{
+			end_y = ref_dim_y;
+		}
+
+		int outblk_x = end_x - start_x;
+		int outblk_y = end_y - start_y;
+
+		printf("Block errors found at ~(%d, %d) px\n", start_x, start_y);
+
+		// Copy the tile into a tightly packed buffer sized for the clipped
+		// block, as the HDR writer is not given a row stride
+		float* data_out = (float*)malloc((size_t)outblk_x * (size_t)outblk_y * 4 * sizeof(float));
+		if (!data_out)
+		{
+			printf("Failed to allocate output image.\n");
+			status = 1;
+		}
+		else
+		{
+			for (int y = 0; y < outblk_y; y++)
+			{
+				for (int x = 0; x < outblk_x; x++)
+				{
+					for (int c = 0; c < 4; c++)
+					{
+						data_out[pix(outblk_x, y, x, 4, c)] =
+							data_ref[pix(ref_dim_x, start_y + y, start_x + x, 4, c)];
+					}
+				}
+			}
+
+			// Write out the worst bad block (from original reference)
+			if (!stbi_write_hdr(argv[5], outblk_x, outblk_y, 4, data_out))
+			{
+				printf("Failed to write output image.\n");
+				status = 1;
+			}
+
+			free(data_out);
+		}
+	}
+
+	free(errorsums);
+	stbi_image_free(data_ref);
+	stbi_image_free(data_good);
+	stbi_image_free(data_bad);
+	return status;
+}
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+// astcenc doesn't use the top 8 integer bits directly for sRGB RGB components
+// or when using the decode_unorm8 decode mode. An alternative is used which
+// allows a common code path to be used. This test program shows that the two
+// produce equivalent output once rounded to a decode_unorm8 output.
+
+// Compile with e.g. clang++ astcenc_u8_test_bench.cpp -o astcenc_u8_test_bench -mavx2 -mf16c
+
+#define ASTCENC_AVX 2
+#define ASTCENC_F16C 1
+#define ASTCENC_SSE 41
+
+#include "../Source/astcenc_mathlib.cpp"
+#include "../Source/astcenc_color_unquantize.cpp"
+#include "../Source/astcenc_decompress_symbolic.cpp"
+
+/**
+ * @brief Exhaustively compare the two integer lerp styles for unorm8 decode.
+ *
+ * Iterates over every 8-bit endpoint pair and every weight in [0, 64], and
+ * asserts that interpolation with and without the decode_u8 mask gives the
+ * same top 8 bits, and that the decoded texel rounds to that same value.
+ *
+ * @return Always 0; mismatches are reported through assert().
+ */
+int main()
+{
+    printf("Decode mode test bench\n");
+
+    // Runs the checks for one endpoint pair and weight. The RGB lanes use the
+    // rgb endpoint values and the A lane uses the alpha endpoint values.
+    auto validate = [](int rgb0, int rgb1, int alpha0, int alpha1, int weight)
+    {
+        vint4 weights(weight);
+        vint4 endpoint0(rgb0, rgb0, rgb0, alpha0);
+        vint4 endpoint1(rgb1, rgb1, rgb1, alpha1);
+
+        // Lerp both styles: with and without decode_u8 handling
+        vmask4 mask_u8(true, true, true, true);
+        vmask4 mask_plain(false, false, false, false);
+        vint4 color_u8 = lerp_color_int(mask_u8, endpoint0, endpoint1, weights);
+        vint4 color_plain = lerp_color_int(mask_plain, endpoint0, endpoint1, weights);
+
+        // Validate top 8 integer bits match in both cases
+        //  - Shows that astcenc-style U8 doesn't differ from Khronos-style U8
+        vint4 top_u8 = lsr<8>(color_u8);
+        vint4 top_plain = lsr<8>(color_plain);
+        assert(top_u8.lane<0>() == top_plain.lane<0>());
+        assert(top_u8.lane<3>() == top_plain.lane<3>());
+
+        // Validate that astcenc output matches the top 8 integer bits
+        vfloat4 decoded = decode_texel(color_u8, vmask4(false));
+        vint4 decoded_u8 = float_to_int_rtn(decoded * 255.0f);
+        assert(decoded_u8.lane<0>() == top_u8.lane<0>());
+    };
+
+    for (int ep0 = 0; ep0 < 256; ep0++)
+    {
+        // Expand 8 bit to 16 bit, in both the linear and sRGB forms
+        int ep0_linear = ep0 * 257;
+        int ep0_srgb = (ep0 << 8) | 0x80;
+
+        for (int ep1 = 0; ep1 < 256; ep1++)
+        {
+            int ep1_linear = ep1 * 257;
+            int ep1_srgb = (ep1 << 8) | 0x80;
+
+            for (int wt1 = 0; wt1 < 65; wt1++)
+            {
+                // Validate linear data with decode_unorm8 mode
+                validate(ep0_linear, ep1_linear, ep0_linear, ep1_linear, wt1);
+
+                // Validate sRGB data with decode_unorm8 mode; sRGB RGB and
+                // linear A
+                validate(ep0_srgb, ep1_srgb, ep0_linear, ep1_linear, wt1);
+            }
+        }
+    }
+
+    return 0;
+}