Add ktx

2026-06-14 19:09:18 +01:00
parent 14bd1a9271
commit 13fa90a0e9
3958 changed files with 999286 additions and 4 deletions
@@ -0,0 +1,7 @@
+# Copyright 2024 The Khronos Group Inc.
+# SPDX-License-Identifier: Apache-2.0
+---
+# Disable clang-format in this directory
+DisableFormat: true
+SortIncludes: false
+...
@@ -0,0 +1,12 @@
+<!-- Copyright 2025 Mark Callow -->
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+
+SDL_gesture.h
+-------------
+
+The Gesture API was removed from SDL3. As a migration path they provided an equivalent single-header library `SDL_gesture.h` that can be dropped into an SDL3-based project.
+
+They do not make formal releases of this code; they say "just grab the latest and drop it into your project!"
+
+This file comes from the fork https://github.com/MarkCallow/SDL_gesture.git, whose upstream is
+https://github.com/libsdl-org/SDL_gesture. The fork adds robustness fixes that prevent spurious GESTURE\_MULTIGESTURE events from being produced.
@@ -0,0 +1,966 @@
+/*
+  Simple DirectMedia Layer
+  Copyright (C) 1997-2022 Sam Lantinga <slouken@libsdl.org>
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+/* Touch gestures were removed from SDL3, so this is the SDL2 implementation copied in here, and tweaked a little. */
+
+#ifndef INCL_SDL_GESTURE_H
+#define INCL_SDL_GESTURE_H
+
+#if !defined(SDL_MAJOR_VERSION)
+#error Please include SDL.h before including this header.
+#elif SDL_MAJOR_VERSION < 2
+#error This header requires SDL2 or later.
+#elif SDL_MAJOR_VERSION == 2
+/* building against SDL2? Just use the built-in SDL2 implementation. */
+#define Gesture_Init() (0)
+#define Gesture_Quit()
+#define Gesture_ID SDL_GestureID
+#define Gesture_LoadDollarTemplates SDL_LoadDollarTemplates
+#define Gesture_RecordGesture SDL_RecordGesture
+#define Gesture_SaveAllDollarTemplates SDL_SaveAllDollarTemplates
+#define Gesture_SaveDollarTemplate SDL_SaveDollarTemplate
+#define GESTURE_DOLLARGESTURE SDL_DOLLARGESTURE
+#define GESTURE_DOLLARRECORD SDL_DOLLARRECORD
+#define GESTURE_MULTIGESTURE SDL_MULTIGESTURE
+#define Gesture_MultiGestureEvent SDL_MultiGestureEvent
+#define Gesture_DollarGestureEvent SDL_DollarGestureEvent
+#else
+
+#include <cmath>
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef Sint64 Gesture_ID;
+
+/* events... */
+
+/* generally you shouldn't hardcode event type numbers--and doubly so in
+   the reserved range!--but these match SDL2 and SDL3 promises to preserve
+   these values to help sdl2-compat. */
+#define GESTURE_DOLLARGESTURE 0x800
+#define GESTURE_DOLLARRECORD 0x801
+#define GESTURE_MULTIGESTURE 0x802
+
+/* Payload of a GESTURE_MULTIGESTURE event. GestureSendMulti() fills one of
+   these in and hands it to SDL_PushEvent() cast to an SDL_Event. */
+typedef struct Gesture_MultiGestureEvent
+{
+    Uint32 type;          /* GESTURE_MULTIGESTURE */
+    Uint32 reserved;
+    Uint64 timestamp;     /* the sender stores 0 here */
+    SDL_TouchID touchID;  /* touch device the gesture came from */
+    float dTheta;         /* rotation computed with SDL_atan2f() for this motion step */
+    float dDist;          /* change in the finger-to-centroid distance for this motion step */
+    float x;              /* centroid of the touch, x */
+    float y;              /* centroid of the touch, y */
+    Uint16 numFingers;    /* number of fingers down on the device */
+    Uint16 padding;
+} Gesture_MultiGestureEvent;
+
+/* Payload shared by GESTURE_DOLLARGESTURE (a recognised gesture) and
+   GESTURE_DOLLARRECORD (a newly recorded template) events. */
+typedef struct Gesture_DollarGestureEvent
+{
+    Uint32 type;          /* GESTURE_DOLLARGESTURE or GESTURE_DOLLARRECORD */
+    Uint32 reserved;
+    Uint64 timestamp;     /* the senders store 0 here */
+    SDL_TouchID touchID;  /* touch device the gesture came from */
+    Gesture_ID gestureId; /* hash of the matched/recorded template; -1 is sent when recording failed */
+    Uint32 numFingers;    /* GESTURE_DOLLARGESTURE: fingers still down plus the one just lifted */
+    float error;          /* GESTURE_DOLLARGESTURE: difference between the drawn path and the best template */
+    float x;              /* GESTURE_DOLLARGESTURE: centroid of the touch, x */
+    float y;              /* GESTURE_DOLLARGESTURE: centroid of the touch, y */
+} Gesture_DollarGestureEvent;
+
+
+/* Function prototypes */
+
+/**
+ * Call this once, AFTER SDL_Init, to set up the Gesture API.
+ *
+ * Any state left over from a previous initialisation is discarded first
+ * (Gesture_Quit() is called internally); then an SDL event watch is installed
+ * through which touch events are observed.
+ *
+ * \returns 0 on success, -1 on error. Call SDL_GetError() for specifics.
+ *
+ * \sa Gesture_Quit
+ */
+extern int SDLCALL Gesture_Init(void);
+
+/**
+ * Call this once, BEFORE SDL_Quit, to clean up the Gesture API.
+ *
+ * Removes the event watch installed by Gesture_Init(), releases the table of
+ * known touch devices and clears the "record on all devices" state.
+ *
+ * \sa Gesture_Init
+ */
+extern void SDLCALL Gesture_Quit(void);
+
+/**
+ * Begin recording a gesture on a specified touch device or all touch devices.
+ *
+ * Before anything else, every device reported by SDL_GetTouchDevices() that
+ * is not yet known to the Gesture API is registered.
+ *
+ * If the parameter `touchID` is -1 (i.e., all devices), this function will
+ * always return 1, regardless of whether there actually are any devices.
+ *
+ * NOTE(review): SDL_TouchID is described elsewhere in this file as unsigned
+ * (Uint64) in SDL3, so "-1" presumably means `(SDL_TouchID)-1` - confirm.
+ *
+ * \param touchID the touch device id, or -1 for all touch devices
+ * \returns 1 on success or 0 if the specified device could not be found.
+ */
+extern int SDLCALL Gesture_RecordGesture(SDL_TouchID touchID);
+
+/**
+ * Save all currently loaded Dollar Gesture templates.
+ *
+ * The templates of every known touch device are written one after another.
+ * Only the template paths are stored; the gesture ids (hashes) are recomputed
+ * when the templates are loaded again.
+ *
+ * \param dst a SDL_IOStream to save to
+ * \returns the number of saved templates on success or 0 on failure; call
+ *          SDL_GetError() for more information.
+ *
+ * \since This function is available since SDL 2.0.0.
+ *
+ * \sa Gesture_LoadDollarTemplates
+ * \sa Gesture_SaveDollarTemplate
+ */
+extern int SDLCALL Gesture_SaveAllDollarTemplates(SDL_IOStream *dst);
+
+/**
+ * Save a currently loaded Dollar Gesture template.
+ *
+ * The first template whose id equals `gestureId`, searching every known touch
+ * device, is the one that is written.
+ *
+ * \param gestureId a gesture id
+ * \param dst a SDL_IOStream to save to
+ * \returns 1 on success or 0 on failure; call SDL_GetError() for more
+ *          information.
+ *
+ * \since This function is available since SDL 2.0.0.
+ *
+ * \sa Gesture_LoadDollarTemplates
+ * \sa Gesture_SaveAllDollarTemplates
+ */
+extern int SDLCALL Gesture_SaveDollarTemplate(Gesture_ID gestureId, SDL_IOStream *dst);
+
+/**
+ * Load Dollar Gesture templates from an SDL_IOStream.
+ *
+ * Templates are read until the stream runs out of complete templates.
+ *
+ * \param touchID a touch id
+ * \param src a SDL_IOStream to load from
+ * \returns the number of loaded templates on success or a negative error code
+ *          (or 0) on failure; call SDL_GetError() for more information.
+ *
+ * \since This function is available since SDL 2.0.0.
+ *
+ * \sa Gesture_SaveAllDollarTemplates
+ * \sa Gesture_SaveDollarTemplate
+ */
+extern int SDLCALL Gesture_LoadDollarTemplates(SDL_TouchID touchID, SDL_IOStream *src);
+
+/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#if defined(SDL_GESTURE_IMPLEMENTATION)
+
+/* Capacity of the raw point buffer in GestureDollarPath. */
+#define GESTURE_MAX_DOLLAR_PATH_SIZE 1024
+/* Number of points in a normalised path / stored template. */
+#define GESTURE_DOLLARNPOINTS 64
+/* Size the bounding box of a path is scaled to during normalisation. */
+#define GESTURE_DOLLARSIZE 256
+/* Weight used to place the probe angles in GestureBestDollarDifference(). */
+#define GESTURE_PHI        0.618033989
+
+/* Raw path traced while fingers are down on a touch device. */
+typedef struct
+{
+    float length;   /* running sum of the distances between consecutive points */
+    int numPoints;  /* number of valid entries in p[] */
+    SDL_FPoint p[GESTURE_MAX_DOLLAR_PATH_SIZE];
+} GestureDollarPath;
+
+/* A stored gesture: a normalised path plus the hash used as its gesture id. */
+typedef struct
+{
+    SDL_FPoint path[GESTURE_DOLLARNPOINTS];
+    Sint64 hash;  /* GestureHashDollar(path); reported to the application as the Gesture_ID */
+} GestureDollarTemplate;
+
+/* Per touch-device gesture state. */
+typedef struct
+{
+    SDL_TouchID touchID;                    /* device this entry belongs to */
+    SDL_FPoint centroid;                    /* running centroid of the fingers that are down */
+    GestureDollarPath dollarPath;           /* path traced since the last finger-down */
+    int numDownFingers;                     /* fingers currently down on the device */
+    int numDollarTemplates;                 /* number of entries in dollarTemplate */
+    GestureDollarTemplate *dollarTemplate;  /* heap array grown with SDL_realloc() */
+    bool recording;                         /* true while the next finger-up should store a template */
+} GestureTouch;
+
+/* Table of known touch devices; grown by GestureAddTouch(), freed by Gesture_Quit(). */
+static GestureTouch *GestureTouches = NULL;
+/* Number of valid entries in GestureTouches. */
+static int GestureNumTouches = 0;
+/* Set by Gesture_RecordGesture() when a recording was requested on all devices. */
+static bool GestureRecordAll = false;
+
+/* Defined further down; called from the event watch for every event SDL reports. */
+static void GestureProcessEvent(const SDL_Event *event);
+
+/* SDL event-watch callback: forwards every event to the gesture recogniser.
+   `userdata` is unused (the watch is registered with NULL). The parameter is
+   named because an unnamed parameter in a function definition is not valid C
+   before C23. Always returns true. */
+static bool SDLCALL GestureEventWatch(void *userdata, SDL_Event *event)
+{
+    (void)userdata;
+    GestureProcessEvent(event);
+    return true;
+}
+
+/* Set up the Gesture API: drop any previous state, then install the event
+   watch that feeds touch events to the recogniser.
+   Returns 0 on success, or -1 if the event watch could not be installed
+   (SDL_GetError() then describes the failure). */
+int Gesture_Init(void)
+{
+    Gesture_Quit();
+    if (!SDL_AddEventWatch(GestureEventWatch, NULL)) {
+        return -1;
+    }
+    return 0;
+}
+
+
+/* Append a zero-initialised entry for `touchID` to the touch table.
+   Returns the new entry, or NULL (after calling SDL_OutOfMemory()) when the
+   table could not be enlarged; the existing table is untouched in that case. */
+static GestureTouch *GestureAddTouch(const SDL_TouchID touchID)
+{
+    GestureTouch *added;
+    GestureTouch *table = (GestureTouch *)SDL_realloc(GestureTouches, (GestureNumTouches + 1) * sizeof(GestureTouch));
+
+    if (!table) {
+        SDL_OutOfMemory();
+        return NULL;
+    }
+
+    GestureTouches = table;
+    added = &table[GestureNumTouches];
+    SDL_zero(*added);
+    added->touchID = touchID;
+    GestureNumTouches++;
+    return added;
+}
+
+#if 0
+/* Forget the touch device `touchID`: free its templates and move the last
+   table entry into the vacated slot.
+   Returns 0 on success, -1 if the device is not in the table. */
+static int GestureDelTouch(const SDL_TouchID touchID)
+{
+    int slot = 0;
+
+    while (slot < GestureNumTouches && GestureTouches[slot].touchID != touchID) {
+        slot++;
+    }
+    if (slot == GestureNumTouches) {
+        return -1;  /* not found */
+    }
+
+    SDL_free(GestureTouches[slot].dollarTemplate);
+    SDL_zero(GestureTouches[slot]);
+
+    GestureNumTouches--;
+    if (slot != GestureNumTouches) {
+        SDL_copyp(&GestureTouches[slot], &GestureTouches[GestureNumTouches]);
+    }
+    return 0;
+}
+#endif
+
+/* Look up the table entry for `touchID`.
+   Returns the first matching entry, or NULL if the device is unknown. */
+static GestureTouch *GestureGetTouch(const SDL_TouchID touchID)
+{
+    GestureTouch *found = NULL;
+    int idx;
+
+    for (idx = 0; found == NULL && idx < GestureNumTouches; idx++) {
+        if (GestureTouches[idx].touchID == touchID) {
+            found = &GestureTouches[idx];
+        }
+    }
+    return found;
+}
+
+/* Start recording a gesture template.
+
+   `touchID` is either a specific touch device or -1 (that is, (SDL_TouchID)-1)
+   for all devices. The previous test, `touchID != 0`, treated every real
+   device id as "all devices" and left the single-device branch reachable only
+   for id 0, which contradicted the documented contract.
+
+   Returns 1 on success, or 0 if the requested device is unknown. Always
+   returns 1 when all devices were requested. */
+int Gesture_RecordGesture(SDL_TouchID touchID)
+{
+    SDL_TouchID *devices;
+    int i;
+
+    devices = SDL_GetTouchDevices(NULL);
+    if (devices) {
+        /* make sure we know about all the devices SDL3 knows about, since we aren't connected as tightly as we were in SDL2. */
+        for (i = 0; devices[i]; i++) {
+            if (!GestureGetTouch(devices[i])) {
+                /* Best effort: a device that cannot be registered is simply not recorded on. */
+                GestureAddTouch(devices[i]);
+            }
+        }
+        SDL_free(devices);
+    }
+
+    if (touchID == (SDL_TouchID)-1) {
+        GestureRecordAll = true;
+        for (i = 0; i < GestureNumTouches; i++) {
+            GestureTouches[i].recording = true;
+        }
+    } else {
+        GestureTouch *touch = GestureGetTouch(touchID);
+        if (!touch) {
+            return 0;  /* bogus touchid */
+        }
+        touch->recording = true;
+    }
+
+    return 1;
+}
+
+/* Tear down the Gesture API: remove the event watch and release all state.
+   Each device owns a heap-allocated template array, so those are freed
+   before the table itself; previously they were leaked. Safe to call when
+   nothing has been initialised. */
+void Gesture_Quit(void)
+{
+    int i;
+
+    SDL_RemoveEventWatch(GestureEventWatch, NULL);
+    for (i = 0; i < GestureNumTouches; i++) {
+        SDL_free(GestureTouches[i].dollarTemplate);
+    }
+    SDL_free(GestureTouches);
+    GestureTouches = NULL;
+    GestureNumTouches = 0;
+    GestureRecordAll = false;
+}
+
+/* Hash the GESTURE_DOLLARNPOINTS points of a normalised path (djb2-style:
+   hash = hash * 33 + value) using the integer part of each coordinate.
+
+   Normalised coordinates are centred on the origin, so many are negative.
+   Converting a negative float straight to an unsigned type is undefined
+   behaviour, and the result differs between CPU architectures. The value is
+   therefore truncated to a signed 64-bit integer first and then wrapped to
+   unsigned, which is well defined for any finite coordinate in Sint64 range. */
+static unsigned long GestureHashDollar(SDL_FPoint *points)
+{
+    unsigned long hash = 5381;
+    int i;
+    for (i = 0; i < GESTURE_DOLLARNPOINTS; i++) {
+        hash = ((hash << 5) + hash) + (unsigned long)(Sint64)points[i].x;
+        hash = ((hash << 5) + hash) + (unsigned long)(Sint64)points[i].y;
+    }
+    return hash;
+}
+
+/* Write the path of one template to `dst` as little-endian floats.
+   The hash is not stored; it is recomputed when the template is loaded.
+   Returns 1 on success, 0 if `dst` is NULL or the write came up short. */
+static int GestureSaveTemplate(GestureDollarTemplate *templ, SDL_IOStream *dst)
+{
+    const size_t pathBytes = sizeof(templ->path[0]) * GESTURE_DOLLARNPOINTS;
+    const SDL_FPoint *out;
+#if SDL_BYTEORDER != SDL_LIL_ENDIAN
+    GestureDollarTemplate swapped;
+    int k;
+#endif
+
+    if (!dst) {
+        return 0;
+    }
+
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+    /* Already in file byte order: write straight from the template. */
+    out = templ->path;
+#else
+    /* Byte-swap a private copy so the caller's template is left alone. */
+    swapped = *templ;
+    for (k = 0; k < GESTURE_DOLLARNPOINTS; k++) {
+        swapped.path[k].x = SDL_SwapFloatLE(swapped.path[k].x);
+        swapped.path[k].y = SDL_SwapFloatLE(swapped.path[k].y);
+    }
+    out = swapped.path;
+#endif
+
+    if (SDL_WriteIO(dst, out, pathBytes) != pathBytes) {
+        return 0;
+    }
+    return 1;
+}
+
+/* Write every template of every known touch device to `dst`.
+   Returns the number of templates that were written successfully. */
+SDL_DECLSPEC int SDLCALL
+Gesture_SaveAllDollarTemplates(SDL_IOStream *dst)
+{
+    int written = 0;
+    int touchIdx = 0;
+
+    while (touchIdx < GestureNumTouches) {
+        GestureTouch *touch = &GestureTouches[touchIdx++];
+        int templIdx;
+        for (templIdx = 0; templIdx < touch->numDollarTemplates; templIdx++) {
+            written += GestureSaveTemplate(&touch->dollarTemplate[templIdx], dst);
+        }
+    }
+    return written;
+}
+
+/* Write the first template whose hash equals `gestureId` to `dst`.
+   Returns the result of GestureSaveTemplate() for that template; if no
+   template matches, the SDL error is set and the value of SDL_SetError() is
+   returned. */
+SDL_DECLSPEC int SDLCALL
+Gesture_SaveDollarTemplate(Gesture_ID gestureId, SDL_IOStream *dst)
+{
+    GestureDollarTemplate *match = NULL;
+    int t, k;
+
+    for (t = 0; match == NULL && t < GestureNumTouches; t++) {
+        GestureTouch *touch = &GestureTouches[t];
+        for (k = 0; k < touch->numDollarTemplates; k++) {
+            if (touch->dollarTemplate[k].hash == gestureId) {
+                match = &touch->dollarTemplate[k];
+                break;
+            }
+        }
+    }
+
+    if (match == NULL) {
+        return SDL_SetError("Unknown gestureId");
+    }
+    return GestureSaveTemplate(match, dst);
+}
+
+/* Append `path` (an already normalised set of GESTURE_DOLLARNPOINTS points)
+   to the templates of `inTouch`.
+
+   Returns the index of the new template, or -1 on failure. The failure path
+   used to be `return SDL_OutOfMemory();`, but that call yields false (0) in
+   SDL3, which callers would mistake for a valid index 0. */
+static int GestureAddDollar_one(GestureTouch *inTouch, SDL_FPoint *path)
+{
+    GestureDollarTemplate *dollarTemplate;
+    GestureDollarTemplate *templ;
+    int index;
+
+    index = inTouch->numDollarTemplates;
+    dollarTemplate = (GestureDollarTemplate *)SDL_realloc(inTouch->dollarTemplate, (index + 1) * sizeof(GestureDollarTemplate));
+    if (dollarTemplate == NULL) {
+        SDL_OutOfMemory();
+        return -1;
+    }
+    inTouch->dollarTemplate = dollarTemplate;
+
+    templ = &inTouch->dollarTemplate[index];
+    SDL_memcpy(templ->path, path, GESTURE_DOLLARNPOINTS * sizeof(SDL_FPoint));
+    templ->hash = GestureHashDollar(templ->path);
+    inTouch->numDollarTemplates++;
+
+    return index;
+}
+
+/* Add the normalised `path` as a template to `inTouch`, or to every known
+   touch device when `inTouch` is NULL.
+
+   Returns the index of the added template (for the all-devices case, the
+   index on the last device), or -1 on failure. The "no devices" error used to
+   be `return SDL_SetError(...)`, which is false (0) in SDL3 and therefore
+   looked like a valid index to callers. */
+static int GestureAddDollar(GestureTouch *inTouch, SDL_FPoint *path)
+{
+    int index = -1;
+    int i = 0;
+    if (inTouch == NULL) {
+        if (GestureNumTouches == 0) {
+            SDL_SetError("no gesture touch devices registered");
+            return -1;
+        }
+        for (i = 0; i < GestureNumTouches; i++) {
+            inTouch = &GestureTouches[i];
+            index = GestureAddDollar_one(inTouch, path);
+            if (index < 0) {
+                return -1;
+            }
+        }
+        /* Use the index of the last one added. */
+        return index;
+    }
+    return GestureAddDollar_one(inTouch, path);
+}
+
+/* Read templates from `src` until no complete template is left and add them
+   either to the device `touchID` or, for the special ids, to every known
+   device.
+
+   A template is counted as loaded only if at least one device actually
+   stored it. Previously the all-devices branch counted every template read,
+   even when no device existed or every insertion failed.
+
+   Returns the number of templates loaded. On failure the SDL error is set and
+   0 is returned (`src` is NULL, the device is unknown, or nothing could be
+   read). */
+SDL_DECLSPEC int SDLCALL
+Gesture_LoadDollarTemplates(SDL_TouchID touchID, SDL_IOStream *src)
+{
+    int i, loaded = 0;
+    int templatesRead = 0;
+    GestureTouch *touch = NULL;
+    bool singleDevice;
+
+    if (src == NULL) {
+        return 0;
+    }
+    /* In SDL2 this test was `touchID >= 0` leading to warnings from gcc
+       because SDL_TouchId is now Uint64. In SDL2 it was Sint64. The
+       documentation does not say what < 0 means here but the only defined
+       negative touchID was SDL_MOUSE_TOUCHID (-1). In SDL3 SDL_PEN_TOUCHID (-2)
+       has been added hence this test. Given the lack of documentation
+       it is impossible to say if this updated test is correct. */
+    singleDevice = (touchID < SDL_PEN_TOUCHID);
+    if (singleDevice) {
+        touch = GestureGetTouch(touchID);
+        if (touch == NULL) {
+            return SDL_SetError("given touch id not found");
+        }
+    }
+
+    while (1) {
+        GestureDollarTemplate templ;
+        const size_t bytes = sizeof(templ.path[0]) * GESTURE_DOLLARNPOINTS;
+
+        if (SDL_ReadIO(src, templ.path, bytes) < bytes) {
+            if (templatesRead == 0) {
+                return SDL_SetError("could not read any dollar gesture from rwops");
+            }
+            break;
+        }
+        templatesRead++;
+
+#if SDL_BYTEORDER != SDL_LIL_ENDIAN
+        for (i = 0; i < GESTURE_DOLLARNPOINTS; i++) {
+            SDL_FPoint *p = &templ.path[i];
+            p->x = SDL_SwapFloatLE(p->x);
+            p->y = SDL_SwapFloatLE(p->y);
+        }
+#endif
+
+        /* Same single-device test as before the read loop. */
+        if (singleDevice) {
+            if (GestureAddDollar(touch, templ.path) >= 0) {
+                loaded++;
+            }
+        } else {
+            bool stored = false;
+            for (i = 0; i < GestureNumTouches; i++) {
+                if (GestureAddDollar(&GestureTouches[i], templ.path) >= 0) {
+                    stored = true;
+                }
+            }
+            if (stored) {
+                loaded++;
+            }
+        }
+    }
+
+    return loaded;
+}
+
+/* Mean distance between `points`, rotated by `ang` radians about the origin,
+   and the corresponding points of `templ`. Both arrays hold
+   GESTURE_DOLLARNPOINTS entries. */
+static float GestureDollarDifference(SDL_FPoint *points, SDL_FPoint *templ, float ang)
+{
+    const float cosAng = SDL_cosf(ang);
+    const float sinAng = SDL_sinf(ang);
+    float total = 0;
+    int k;
+
+    for (k = 0; k < GESTURE_DOLLARNPOINTS; k++) {
+        const float rx = points[k].x * cosAng - points[k].y * sinAng;
+        const float ry = points[k].x * sinAng + points[k].y * cosAng;
+        total += SDL_sqrtf((rx - templ[k].x) * (rx - templ[k].x) + (ry - templ[k].y) * (ry - templ[k].y));
+    }
+    return total / GESTURE_DOLLARNPOINTS;
+}
+
+/* Search the rotation angle in [-pi/4, pi/4] that minimises
+   GestureDollarDifference(points, templ, angle) and return that minimum.
+   The interval is narrowed step by step, probing two interior angles placed
+   with GESTURE_PHI, until it is no wider than pi/90. */
+static float GestureBestDollarDifference(SDL_FPoint *points, SDL_FPoint *templ)
+{
+    /*------------BEGIN DOLLAR BLACKBOX------------------
+      -TRANSLATED DIRECTLY FROM PSEUDO-CODE AVAILABLE AT-
+      -"http://depts.washington.edu/aimgroup/proj/dollar/"
+    */
+    double ta = -SDL_PI_D / 4;   /* lower bound of the search interval */
+    double tb = SDL_PI_D / 4;    /* upper bound of the search interval */
+    double dt = SDL_PI_D / 90;   /* stop once the interval is this narrow */
+    float x1 = (float)(GESTURE_PHI * ta + (1 - GESTURE_PHI) * tb);
+    float f1 = GestureDollarDifference(points, templ, x1);
+    float x2 = (float)((1 - GESTURE_PHI) * ta + GESTURE_PHI * tb);
+    float f2 = GestureDollarDifference(points, templ, x2);
+    while (SDL_fabs(ta - tb) > dt) {
+        if (f1 < f2) {
+            /* Minimum lies in [ta, x2]: reuse x1 as the new upper probe. */
+            tb = x2;
+            x2 = x1;
+            f2 = f1;
+            x1 = (float)(GESTURE_PHI * ta + (1 - GESTURE_PHI) * tb);
+            f1 = GestureDollarDifference(points, templ, x1);
+        } else {
+            /* Minimum lies in [x1, tb]: reuse x2 as the new lower probe. */
+            ta = x1;
+            x1 = x2;
+            f1 = f2;
+            x2 = (float)((1 - GESTURE_PHI) * ta + GESTURE_PHI * tb);
+            f2 = GestureDollarDifference(points, templ, x2);
+        }
+    }
+    /*
+      if (f1 <= f2)
+          printf("Min angle (x1): %f\n",x1);
+      else if (f1 >  f2)
+          printf("Min angle (x2): %f\n",x2);
+    */
+    return SDL_min(f1, f2);
+}
+
+/* Turn a raw path into a normalised one in `points`, which must have room for
+   GESTURE_DOLLARNPOINTS entries.
+
+   `path` contains raw points, plus (possibly) the calculated length. Steps:
+     1. resample the path at equal intervals along its length,
+     2. rotate about the centroid so the first point lies to its left,
+     3. scale the bounding box to GESTURE_DOLLARSIZE and move the centroid to
+        the origin.
+
+   Returns GESTURE_DOLLARNPOINTS on success, or 0 if the path is too short to
+   resample (an SDL error is set only when `is_recording` is true). On failure
+   `points` holds no usable data.
+
+   Two hardening changes over the previous version:
+   the resample loop can never write past the end of `points`, and
+   a bounding box with zero width or height yields 0 on that axis instead of
+   dividing by zero. */
+static int GestureDollarNormalize(const GestureDollarPath *path, SDL_FPoint *points, bool is_recording)
+{
+    int i;
+    float interval;
+    float dist;
+    int numPoints = 0;
+    SDL_FPoint centroid;
+    float xmin, xmax, ymin, ymax;
+    float ang;
+    float cosAng, sinAng;
+    float w, h;
+    float length = path->length;
+
+    /* Calculate length if it hasn't already been done */
+    if (length <= 0) {
+        for (i = 1; i < path->numPoints; i++) {
+            const float dx = path->p[i].x - path->p[i - 1].x;
+            const float dy = path->p[i].y - path->p[i - 1].y;
+            length += SDL_sqrtf(dx * dx + dy * dy);
+        }
+    }
+
+    /* Resample */
+    interval = length / (GESTURE_DOLLARNPOINTS - 1);
+    dist = interval;
+
+    centroid.x = 0;
+    centroid.y = 0;
+
+    for (i = 1; i < path->numPoints; i++) {
+        const float d = SDL_sqrtf((path->p[i - 1].x - path->p[i].x) * (path->p[i - 1].x - path->p[i].x) + (path->p[i - 1].y - path->p[i].y) * (path->p[i - 1].y - path->p[i].y));
+        /* The second condition keeps writes inside the caller's buffer even if
+           rounding or an inconsistent stored length would ask for more points. */
+        while (dist + d > interval && numPoints < GESTURE_DOLLARNPOINTS) {
+            points[numPoints].x = path->p[i - 1].x +
+                                  ((interval - dist) / d) * (path->p[i].x - path->p[i - 1].x);
+            points[numPoints].y = path->p[i - 1].y +
+                                  ((interval - dist) / d) * (path->p[i].y - path->p[i - 1].y);
+            centroid.x += points[numPoints].x;
+            centroid.y += points[numPoints].y;
+            numPoints++;
+
+            dist -= interval;
+        }
+        dist += d;
+    }
+    if (numPoints < GESTURE_DOLLARNPOINTS - 1) {
+        if (is_recording) {
+            SDL_SetError("ERROR: NumPoints = %i", numPoints);
+        }
+        return 0;
+    }
+    /* copy the last point */
+    points[GESTURE_DOLLARNPOINTS - 1] = path->p[path->numPoints - 1];
+    numPoints = GESTURE_DOLLARNPOINTS;
+
+    centroid.x /= numPoints;
+    centroid.y /= numPoints;
+
+    /* Rotate Points so point 0 is left of centroid and solve for the bounding box */
+    xmin = centroid.x;
+    xmax = centroid.x;
+    ymin = centroid.y;
+    ymax = centroid.y;
+
+    ang = SDL_atan2f(centroid.y - points[0].y, centroid.x - points[0].x);
+    cosAng = SDL_cosf(ang);
+    sinAng = SDL_sinf(ang);
+
+    for (i = 0; i < numPoints; i++) {
+        const float px = points[i].x;
+        const float py = points[i].y;
+        points[i].x = (px - centroid.x) * cosAng - (py - centroid.y) * sinAng + centroid.x;
+        points[i].y = (px - centroid.x) * sinAng + (py - centroid.y) * cosAng + centroid.y;
+
+        if (points[i].x < xmin) {
+            xmin = points[i].x;
+        }
+        if (points[i].x > xmax) {
+            xmax = points[i].x;
+        }
+        if (points[i].y < ymin) {
+            ymin = points[i].y;
+        }
+        if (points[i].y > ymax) {
+            ymax = points[i].y;
+        }
+    }
+
+    /* Scale points to GESTURE_DOLLARSIZE, and translate to the origin */
+    w = xmax - xmin;
+    h = ymax - ymin;
+
+    for (i = 0; i < numPoints; i++) {
+        /* A degenerate (zero-extent) axis collapses to 0 rather than inf/NaN. */
+        points[i].x = (w > 0) ? (points[i].x - centroid.x) * GESTURE_DOLLARSIZE / w : 0.0f;
+        points[i].y = (h > 0) ? (points[i].y - centroid.y) * GESTURE_DOLLARSIZE / h : 0.0f;
+    }
+    return numPoints;
+}
+
+/* Compare the raw `path` with every template of `touch`.
+
+   On return `*bestTempl` is the index of the closest template, or -1 if
+   there is none. The return value is the difference to that template (10000
+   when nothing matched).
+
+   If the path cannot be normalised (for example a tap with no movement),
+   nothing is matched. Previously the templates were compared against an
+   all-zero path in that case, which produced spurious recognitions. */
+static float GestureDollarRecognize(const GestureDollarPath *path, int *bestTempl, GestureTouch *touch)
+{
+    SDL_FPoint points[GESTURE_DOLLARNPOINTS];
+    int i;
+    float bestDiff = 10000;
+
+    *bestTempl = -1;
+
+    SDL_memset(points, 0, sizeof(points));
+
+    if (GestureDollarNormalize(path, points, false) == 0) {
+        return bestDiff;
+    }
+
+    for (i = 0; i < touch->numDollarTemplates; i++) {
+        const float diff = GestureBestDollarDifference(points, touch->dollarTemplate[i].path);
+        if (diff < bestDiff) {
+            bestDiff = diff;
+            *bestTempl = i;
+        }
+    }
+    return bestDiff;
+}
+
+/* Push a GESTURE_MULTIGESTURE event for `touch`, if that event type is enabled.
+
+   SDL_PushEvent() copies a whole SDL_Event, so the payload is first copied
+   into a zeroed SDL_Event. Passing a pointer to the smaller gesture struct
+   made SDL read past the end of it, and left `reserved`/`padding`
+   uninitialised. The timestamp is left at 0. */
+static void GestureSendMulti(GestureTouch *touch, float dTheta, float dDist)
+{
+    if (SDL_EventEnabled(GESTURE_MULTIGESTURE)) {
+        SDL_Event event;
+        Gesture_MultiGestureEvent mgesture;
+
+        SDL_COMPILE_TIME_ASSERT(GestureMultiFitsInEvent, sizeof(Gesture_MultiGestureEvent) <= sizeof(SDL_Event));
+
+        SDL_zero(event);
+        SDL_zero(mgesture);
+        mgesture.type = GESTURE_MULTIGESTURE;
+        mgesture.timestamp = 0;
+        mgesture.touchID = touch->touchID;
+        mgesture.x = touch->centroid.x;
+        mgesture.y = touch->centroid.y;
+        mgesture.dTheta = dTheta;
+        mgesture.dDist = dDist;
+        mgesture.numFingers = (Uint16)touch->numDownFingers;
+        SDL_memcpy(&event, &mgesture, sizeof(mgesture));
+        SDL_PushEvent(&event);
+    }
+}
+
+/* Push a GESTURE_DOLLARGESTURE event reporting that `touch` drew the template
+   `gestureId` with the given `error`, if that event type is enabled.
+
+   The payload is copied into a zeroed SDL_Event before pushing, because
+   SDL_PushEvent() copies a whole SDL_Event and must not be handed a pointer
+   to the smaller gesture struct. */
+static void GestureSendDollar(GestureTouch *touch, Gesture_ID gestureId, float error)
+{
+    if (SDL_EventEnabled(GESTURE_DOLLARGESTURE)) {
+        SDL_Event event;
+        Gesture_DollarGestureEvent dgesture;
+
+        SDL_COMPILE_TIME_ASSERT(GestureDollarFitsInEvent, sizeof(Gesture_DollarGestureEvent) <= sizeof(SDL_Event));
+
+        SDL_zero(event);
+        SDL_zero(dgesture);
+        dgesture.type = GESTURE_DOLLARGESTURE;
+        dgesture.timestamp = 0;
+        dgesture.touchID = touch->touchID;
+        dgesture.x = touch->centroid.x;
+        dgesture.y = touch->centroid.y;
+        dgesture.gestureId = gestureId;
+        dgesture.error = error;
+        /* A finger came up to trigger this event. */
+        dgesture.numFingers = (Uint32)(touch->numDownFingers + 1);
+        SDL_memcpy(&event, &dgesture, sizeof(dgesture));
+        SDL_PushEvent(&event);
+    }
+}
+
+/* Push a GESTURE_DOLLARRECORD event announcing that a template with id
+   `gestureId` was recorded on `touch` (callers pass -1 when recording
+   failed), if that event type is enabled.
+
+   Only type, touchID and gestureId carry information; every other field is
+   now zero instead of uninitialised stack data. The payload is copied into a
+   full, zeroed SDL_Event because SDL_PushEvent() copies a whole SDL_Event. */
+static void GestureSendDollarRecord(GestureTouch *touch, Gesture_ID gestureId)
+{
+    if (SDL_EventEnabled(GESTURE_DOLLARRECORD)) {
+        SDL_Event event;
+        Gesture_DollarGestureEvent dgesture;
+
+        SDL_COMPILE_TIME_ASSERT(GestureRecordFitsInEvent, sizeof(Gesture_DollarGestureEvent) <= sizeof(SDL_Event));
+
+        SDL_zero(event);
+        SDL_zero(dgesture);
+        dgesture.type = GESTURE_DOLLARRECORD;
+        dgesture.timestamp = 0;
+        dgesture.touchID = touch->touchID;
+        dgesture.gestureId = gestureId;
+        SDL_memcpy(&event, &dgesture, sizeof(dgesture));
+        SDL_PushEvent(&event);
+    }
+}
+
+/* Debug switches. Define either one to a non-zero value before including this
+   header to have GestureProcessEvent() SDL_Log() the finger up/down events
+   or the finger motion events it sees. Both default to off. */
+#if !defined(GESTURE_LOG_UP_DOWN_EVENTS)
+  #define GESTURE_LOG_UP_DOWN_EVENTS 0
+#endif
+#if !defined(GESTURE_LOG_MOTION_EVENTS)
+  #define GESTURE_LOG_MOTION_EVENTS 0
+#endif
+
+/* Heart of the recogniser: called from the event watch for every SDL event.
+   Only finger down/motion/up events are handled; everything else is ignored.
+
+   FINGER_DOWN   recomputes the centroid of the fingers on the device and
+                 starts a new dollar path.
+   FINGER_MOTION extends the dollar path with the centroid, moves the centroid
+                 and, with more than one finger down, sends a
+                 GESTURE_MULTIGESTURE event.
+   FINGER_UP     either stores the traced path as a template (when recording)
+                 or tries to recognise it, then removes the finger from the
+                 centroid.
+
+   Fixes over the previous version:
+   - the array returned by SDL_GetTouchFingers() is freed on every exit path,
+     including the macOS early returns, and is not dereferenced when NULL;
+   - a motion event seen before any finger-down no longer reads the path
+     entry before the start of the buffer;
+   - no division by a zero finger count;
+   - a path that fails to normalise is not stored as a template;
+   - the id reported in GESTURE_DOLLARRECORD is hashed from the recorded path
+     instead of indexing this device's templates with another device's index;
+   - the record-on-all-devices flag is cleared once that recording ends;
+   - dTheta is 0 rather than NaN when the finger sits exactly on the centroid;
+   - SDL_assert() replaces assert(), which this header never declared. */
+static void GestureProcessEvent(const SDL_Event *event)
+{
+    float x, y;
+    int index;
+    int i;
+    float pathDx, pathDy;
+    SDL_FPoint lastP;
+    SDL_FPoint lastCentroid;
+    float lDist;
+    float Dist;
+    float dtheta;
+    float dDist;
+
+    if (event->type == SDL_EVENT_FINGER_MOTION || event->type == SDL_EVENT_FINGER_DOWN || event->type == SDL_EVENT_FINGER_UP) {
+        GestureTouch *inTouch = GestureGetTouch(event->tfinger.touchID);
+        if (inTouch == NULL) {  /* we maybe didn't see this one before. */
+            inTouch = GestureAddTouch(event->tfinger.touchID);
+            if (!inTouch) {
+                return;  /* oh well. */
+            }
+        }
+        int numDownFingersReported = 0;
+        SDL_Finger** fingers = SDL_GetTouchFingers(event->tfinger.touchID, &numDownFingersReported);
+        if (fingers == NULL) {
+            /* Query failed: behave as if no finger list is available. */
+            numDownFingersReported = 0;
+        }
+
+        x = event->tfinger.x;
+        y = event->tfinger.y;
+
+        /* Finger Up */
+        if (event->type == SDL_EVENT_FINGER_UP) {
+#if GESTURE_LOG_UP_DOWN_EVENTS
+            SDL_Log("GPE: Finger: %#" SDL_PRIx64 " UP. Device: %#" SDL_PRIx64 ", fingers: %i, x: %f, y: %f, press: %f",
+                    event->tfinger.fingerID, event->tfinger.touchID, numDownFingersReported,
+                    event->tfinger.x, event->tfinger.y, event->tfinger.pressure);
+#endif
+            SDL_FPoint path[GESTURE_DOLLARNPOINTS];
+
+#if SDL_PLATFORM_MACOS
+            /* Workaround issue https://github.com/libsdl-org/SDL/issues/13428,
+               Extra SDL_EVENT_FINGER_{UP,DOWN} with mouse button press, by
+               ignoring events with fingerID of SDL_BUTTON_LEFT.
+
+               N.B. If SDL_HINT_MOUSE_TOUCH_EVENTS is set to 0 no touch
+               events are received from the trackpad. */
+            if (event->tfinger.fingerID == SDL_BUTTON_LEFT) {
+                SDL_free(fingers);
+                return;
+            }
+#endif
+            /* Using the number of fingers returned by SDL_GetTouchFingers
+               is much more robust than counting finger up and down events.
+               With counting it is easy for the counted number to be higher
+               than the actual number. Unfortunately it has not been possible
+               to identify a sequence of actions that reliably reproduces
+               this but asserts have shown it happens often. Perhaps
+               sometimes a single UP or DOWN event is received for multiple
+               fingers.
+
+               Using the reported number is independent of how many events
+               are actually received. But, and this is a big one, in the
+               case of FINGER_UP SDL_GetTouchFingers reports the number of
+               fingers down *before* the up event.
+
+               N.B. In the case of a left button press on macOS,
+               SDL_GetTouchFingers reports 1 for the event that is not
+               ignored.
+               */
+            SDL_assert(numDownFingersReported > 0);
+            /* Clamp so a failed or empty finger query cannot produce a negative count. */
+            inTouch->numDownFingers = (numDownFingersReported > 0) ? numDownFingersReported - 1 : 0;
+#if (GESTURE_LOG_UP_DOWN_EVENTS)
+            SDL_Log("GPE FINGER_UP, numDownFingers now = %i", inTouch->numDownFingers);
+#endif
+
+            if (inTouch->recording) {
+                const bool recordAll = GestureRecordAll;
+
+                index = -1;
+                inTouch->recording = false;
+                if (recordAll) {
+                    /* This finger-up ends the recording on every device. */
+                    for (i = 0; i < GestureNumTouches; i++) {
+                        GestureTouches[i].recording = false;
+                    }
+                    GestureRecordAll = false;
+                }
+
+                /* Only store the template if the traced path could be normalised. */
+                if (GestureDollarNormalize(&inTouch->dollarPath, path, true) > 0) {
+                    index = GestureAddDollar(recordAll ? NULL : inTouch, path);
+                }
+
+                if (index >= 0) {
+                    /* The template id is the hash of its path. Hash the path
+                       directly: `index` may belong to another device. */
+                    GestureSendDollarRecord(inTouch, (Gesture_ID)GestureHashDollar(path));
+                } else {
+                    GestureSendDollarRecord(inTouch, -1);
+                }
+            } else {
+                int bestTempl = -1;
+                const float error = GestureDollarRecognize(&inTouch->dollarPath, &bestTempl, inTouch);
+                if (bestTempl >= 0) {
+                    /* Send Event */
+                    const Gesture_ID gestureId = inTouch->dollarTemplate[bestTempl].hash;
+                    GestureSendDollar(inTouch, gestureId, error);
+                }
+            }
+
+            /* Remove the lifted finger from the running centroid. */
+            if (inTouch->numDownFingers > 0) {
+                inTouch->centroid.x = (inTouch->centroid.x * (inTouch->numDownFingers + 1) - x) / inTouch->numDownFingers;
+                inTouch->centroid.y = (inTouch->centroid.y * (inTouch->numDownFingers + 1) - y) / inTouch->numDownFingers;
+            } else {
+                inTouch->centroid.x = inTouch->centroid.y = 0.0f;
+            }
+        } else if (event->type == SDL_EVENT_FINGER_MOTION) {
+            /* There is one FINGER_MOTION event per down finger. x,y gives
+               the position of the finger whose id is in the event. */
+            const float dx = event->tfinger.dx;
+            const float dy = event->tfinger.dy;
+            GestureDollarPath *path = &inTouch->dollarPath;
+
+#if GESTURE_LOG_MOTION_EVENTS
+           SDL_Log("GPE: Finger: %#" SDL_PRIx64 " MOTION: device: %#" SDL_PRIx64 ", timestamp = %"
+                    SDL_PRIu64 ", fingers: %i, x: %f, y: %f, press: %f, numDownFingers: %i",
+                    event->tfinger.fingerID, event->tfinger.touchID, event->tfinger.timestamp,
+                    numDownFingersReported, event->tfinger.x, event->tfinger.y, event->tfinger.pressure,
+                    inTouch->numDownFingers);
+#endif
+            SDL_assert(numDownFingersReported > 0);
+#if SDL_PLATFORM_MACOS
+            /* Workaround issue https://github.com/libsdl-org/SDL/issues/13428.
+               See the comment in the FINGER_UP branch for more details. */
+            if (event->tfinger.fingerID == SDL_BUTTON_LEFT) {
+                SDL_free(fingers);
+                return;
+            }
+            /* SDL_GetTouchFingers reports 2 fingers down in the motion event
+               for the other finger during button press. Fix up the number of
+               fingers. */
+            {
+                const int reportedNumFingers = numDownFingersReported;
+                for (i = 0; i < reportedNumFingers; i++) {
+                    if (fingers[i]->id == SDL_BUTTON_LEFT) {
+                        numDownFingersReported--;
+                        break;
+                    }
+                }
+            }
+#endif
+            /* The reported finger count is used for the reasons given in the
+               FINGER_UP branch. One case where the count reliably
+               differs from reported is on iOS. When touching, dragging and
+               releasing 2 fingers, iOS sends a BUTTON_DOWN and BUTTON_UP
+               for one of the fingers. When the finger corresponding to the
+               button is raised, it sends the BUTTON_UP followed by the
+               FINGER_UP but FINGER_MOTION events can come before the
+               FINGER_UP and those events have only one finger down. */
+            inTouch->numDownFingers = numDownFingersReported;
+            if (path->numPoints < GESTURE_MAX_DOLLAR_PATH_SIZE) {
+                path->p[path->numPoints].x = inTouch->centroid.x;
+                path->p[path->numPoints].y = inTouch->centroid.y;
+                /* With no previous point (motion seen before any finger-down)
+                   there is no segment to measure yet. */
+                if (path->numPoints > 0) {
+                    pathDx = (path->p[path->numPoints].x - path->p[path->numPoints - 1].x);
+                    pathDy = (path->p[path->numPoints].y - path->p[path->numPoints - 1].y);
+                    path->length += (float)SDL_sqrt(pathDx * pathDx + pathDy * pathDy);
+                }
+                path->numPoints++;
+            }
+
+            lastP.x = x - dx;
+            lastP.y = y - dy;
+            lastCentroid = inTouch->centroid;
+
+            if (inTouch->numDownFingers > 0) {
+                inTouch->centroid.x += dx / inTouch->numDownFingers;
+                inTouch->centroid.y += dy / inTouch->numDownFingers;
+            }
+            if (inTouch->numDownFingers > 1) {
+                SDL_FPoint lv; /* Vector from centroid to last x,y position */
+                SDL_FPoint v;  /* Vector from centroid to current x,y position */
+                lv.x = lastP.x - lastCentroid.x;
+                lv.y = lastP.y - lastCentroid.y;
+                lDist = SDL_sqrtf(lv.x * lv.x + lv.y * lv.y);
+                v.x = x - inTouch->centroid.x;
+                v.y = y - inTouch->centroid.y;
+                Dist = SDL_sqrtf(v.x * v.x + v.y * v.y);
+                /* SDL_cosf(dTheta) = (v . lv)/(|v| * |lv|) */
+
+                if (lDist == 0) {
+                    /* To avoid impossible values */
+                    dDist = 0;
+                    dtheta = 0;
+                } else if (Dist == 0) {
+                    /* The finger sits exactly on the centroid: no direction, so no angle. */
+                    dDist = (Dist - lDist);
+                    dtheta = 0;
+                } else {
+                    /* Normalize Vectors to simplify angle calculation */
+                    lv.x /= lDist;
+                    lv.y /= lDist;
+                    v.x /= Dist;
+                    v.y /= Dist;
+                    dtheta = SDL_atan2f(lv.x * v.y - lv.y * v.x, lv.x * v.x + lv.y * v.y);
+                    dDist = (Dist - lDist);
+                }
+
+                GestureSendMulti(inTouch, dtheta, dDist);
+            }
+        } else if (event->type == SDL_EVENT_FINGER_DOWN) {
+#if (GESTURE_LOG_UP_DOWN_EVENTS)
+            SDL_Log("GPE: Finger: %#" SDL_PRIx64 " DOWN. Device: %#" SDL_PRIx64 ", fingers: %i, x: %f, y: %f, press: %f",
+                    event->tfinger.fingerID, event->tfinger.touchID, numDownFingersReported,
+                    event->tfinger.x, event->tfinger.y, event->tfinger.pressure);
+#endif
+#if SDL_PLATFORM_MACOS
+            /* See the macOS workaround comment in the FINGER_UP branch. */
+            if (event->tfinger.fingerID == SDL_BUTTON_LEFT) {
+                SDL_free(fingers);
+                return;
+            }
+#endif
+            /* Using the number of fingers returned by SDL_GetTouchFingers
+               is much more robust than counting finger up and down events.
+               With counting it is easy for the counted number to be higher
+               than the actual number. Unfortunately it has not been possible
+               to identify a sequence of actions that reliably reproduces
+               this. Using the reported number is independent of how many
+               events are actually received. */
+            inTouch->numDownFingers = numDownFingersReported;
+            if (numDownFingersReported > 0) {
+                inTouch->centroid.x = inTouch->centroid.y = 0.0f;
+                for (i = 0; i < numDownFingersReported; i++) {
+                    inTouch->centroid.x += fingers[i]->x;
+                    inTouch->centroid.y += fingers[i]->y;
+                }
+                inTouch->centroid.x /= numDownFingersReported;
+                inTouch->centroid.y /= numDownFingersReported;
+            } else {
+                /* No finger list available: fall back to the position in the event. */
+                inTouch->centroid.x = x;
+                inTouch->centroid.y = y;
+            }
+
+            inTouch->dollarPath.length = 0;
+            inTouch->dollarPath.p[0].x = x;
+            inTouch->dollarPath.p[0].y = y;
+            inTouch->dollarPath.numPoints = 1;
+        }
+        SDL_free(fingers);
+    }
+}
+
+#endif  /* defined(SDL_GESTURE_IMPLEMENTATION) */
+#endif  /* SDL version > 2 */
+#endif /* INCL_SDL_GESTURE_H */
+
+/* vi: set sts=4 ts=4 sw=4 expandtab: */
@@ -0,0 +1,21 @@
+# Text type files use auto line endings
+* text=auto
+
+# Explicitly declare text file types for this repo
+*.c text
+*.cpp text
+*.h text
+*.md text
+Jenkinsfile text
+
+# VS solutions always use Windows line endings
+*.sln text eol=crlf
+*.vcxproj text eol=crlf
+
+# Bash scripts always use Unix (LF) line endings
+*.sh text eol=lf
+
+# Denote all files that are truly binary and should not be modified.
+*.png binary
+*.hdr binary
+*.exe binary
@@ -0,0 +1,385 @@
+name: post-weekly-release
+run-name: Build, test, generate signed artifacts and optionally prepare release
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    tags:
+      - '*'
+  schedule:
+    - cron: '17 2 * * 1'
+
+jobs:
+
+  coverity:  # Static analysis job: configure, build under cov-build, analyze, then upload defects
+    if: ${{ (!startsWith(github.event.ref, 'refs/tags/')) && (github.repository_owner == 'Arm-software') }}  # skipped for tag refs and for repos not owned by Arm-software
+    name: Run Coverity static analysis
+    runs-on: [self-hosted-ubuntu-latest-x64]  # self-hosted runner; the steps below add /usr/local/cov-analysis/bin to PATH
+    steps:
+      - name: Clean workspace
+        uses: AutoModality/action-clean@v1
+
+      - name: Git checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: 'true'
+
+      - name: Coverity preparation  # generates Makefiles in build_cov and writes cc/c++ compiler templates to coverity.conf
+        run: |
+          export PATH=$PATH:/usr/local/cov-analysis/bin
+          mkdir build_cov
+          cd build_cov
+          cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_AVX2=ON ..
+          cov-configure --config ${GITHUB_WORKSPACE}/coverity.conf --template --compiler cc --comptype gcc
+          cov-configure --config ${GITHUB_WORKSPACE}/coverity.conf --template --compiler c++ --comptype g++
+
+      - name: Coverity build  # runs `make install` under cov-build, capturing into ${GITHUB_WORKSPACE}/intermediate
+        run: |
+          export PATH=$PATH:/usr/local/cov-analysis/bin
+          cd build_cov
+          cov-build --config ${GITHUB_WORKSPACE}/coverity.conf --dir ${GITHUB_WORKSPACE}/intermediate make install
+
+      - name: Coverity analyze  # analyzes the capture directory produced by the build step
+        run: |
+          export PATH=$PATH:/usr/local/cov-analysis/bin
+          cd build_cov
+          cov-analyze --dir ${GITHUB_WORKSPACE}/intermediate
+
+      - name: Coverity upload  # commits defects to the astcenc-master stream
+        env:
+          COVERITY_KEY: ${{ secrets.COVERITY_KEY }}  # written to coverity.key and passed via --auth-key-file
+        run: |
+          export PATH=$PATH:/usr/local/cov-analysis/bin
+          echo "${COVERITY_KEY}" > coverity.key
+          chmod 400 coverity.key
+          cd build_cov
+          cov-commit-defects \
+            --dir ${GITHUB_WORKSPACE}/intermediate \
+            --stream astcenc-master \
+            --url https://coverity.cambridge.arm.com \
+            --auth-key-file ../coverity.key \
+            --strip-path ${GITHUB_WORKSPACE}
+
+  build-ubuntu-arm64:  # Linux arm64 job: build and package with clang++, upload the zips, run the NEON system tests
+    name: Ubuntu arm64
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Git checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: 'true'
+
+      - name: Update apt packages
+        run: sudo apt-get update
+
+      - name: Install ImageMagick
+        run: sudo apt-get install imagemagick
+
+      - name: Build release  # configures NEON, SVE_128 and SVE_256 variants, then installs and packages
+        run: |
+          export CXX=clang++
+          mkdir build_rel
+          cd build_rel
+          cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_NEON=ON -DASTCENC_ISA_SVE_128=ON -DASTCENC_ISA_SVE_256=ON -DASTCENC_PACKAGE=arm64 ..
+          make install package -j4
+
+      - name: Upload binaries
+        uses: actions/upload-artifact@v4
+        with:
+          name: astcenc-linux-arm64  # artifact name; downloaded again by the prepare-release job
+          path: |
+            build_rel/*.zip
+            build_rel/*.zip.sha256
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Get Python modules
+        run: |
+          python -m pip install --upgrade pip
+          pip install numpy Pillow
+
+      - name: Run system tests  # only the neon encoder is exercised, although SVE variants are built above
+        # Disable SVE testing for now
+        run: |
+          python ./Test/astc_test_functional.py --encoder neon
+          python ./Test/astc_test_image.py --encoder neon --test-set Small
+
+  build-ubuntu-x64:  # Linux x64 job: build and package with clang++, upload the zips, run the x86 system tests
+    name: Ubuntu x64
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Git checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: 'true'
+
+      - name: Build release  # configures AVX2, SSE4.1 and SSE2 variants, then installs and packages
+        run: |
+          export CXX=clang++
+          mkdir build_rel
+          cd build_rel
+          cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON -DASTCENC_PACKAGE=x64 ..
+          make install package -j4
+
+      - name: Upload binaries
+        uses: actions/upload-artifact@v4
+        with:
+          name: astcenc-linux-x86_64  # artifact name; downloaded again by the prepare-release job
+          path: |
+            build_rel/*.zip
+            build_rel/*.zip.sha256
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Get Python modules
+        run: |
+          python -m pip install --upgrade pip
+          pip install numpy Pillow
+
+      - name: Run system tests  # functional tests plus the Small image set across all x86 encoders
+        run: |
+          python ./Test/astc_test_functional.py
+          python ./Test/astc_test_image.py --encoder all-x86 --test-set Small
+
+  build-macos-universal:  # macOS job: build and package the universal variant, upload the zips, run the image tests
+    name: macOS universal
+    runs-on: macos-14
+    steps:
+      - name: Git checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: 'true'
+
+      - name: Build release  # no ISA options are passed here; only the package name is set to "universal"
+        run: |
+          mkdir build_rel
+          cd build_rel
+          cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_PACKAGE=universal ..
+          make install package -j4
+
+      - name: Upload binaries
+        uses: actions/upload-artifact@v4
+        with:
+          name: astcenc-macos-universal  # artifact name; downloaded by sign-binaries and deleted there after signing
+          path: |
+            build_rel/*.zip
+            build_rel/*.zip.sha256
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Get Python modules
+        run: |
+          python -m pip install --upgrade pip
+          pip install numpy Pillow
+
+      - name: Run system tests  # Small image set against the universal encoder
+        run: |
+          python ./Test/astc_test_image.py --test-set Small --encoder universal
+
+  build-windows-multi:  # Windows job: ClangCL builds for x64 and arm64, one combined artifact upload, then image tests
+    name: Windows multi
+    runs-on: windows-2022
+    steps:
+      - name: Git checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: 'true'
+
+      - name: Setup Visual Studio x86_64
+        uses: ilammy/msvc-dev-cmd@v1  # no `arch` input here, unlike the arm64 setup step below
+
+      - name: Build release x64  # builds the solution, then the PACKAGE and INSTALL projects, in build_rel
+        run: |
+          mkdir build_rel
+          cd build_rel
+          cmake -G "Visual Studio 17 2022" -T ClangCL -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON -DASTCENC_PACKAGE=x64 ..
+          msbuild astcencoder.sln -property:Configuration=Release
+          msbuild PACKAGE.vcxproj -property:Configuration=Release
+          msbuild INSTALL.vcxproj -property:Configuration=Release
+        shell: cmd
+
+      - name: Setup Visual Studio arm64
+        uses: ilammy/msvc-dev-cmd@v1
+        with:
+          arch: x86_arm64  # developer environment used by the arm64 build step that follows
+
+      - name: Build release arm64  # same sequence as the x64 build, in build_rel_arm64 with -A ARM64 and NEON
+        run: |
+          mkdir build_rel_arm64
+          cd build_rel_arm64
+          cmake -G "Visual Studio 17 2022" -A ARM64 -T ClangCL -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_NEON=ON -DASTCENC_PACKAGE=arm64 ..
+          msbuild astcencoder.sln -property:Configuration=Release
+          msbuild PACKAGE.vcxproj -property:Configuration=Release
+          msbuild INSTALL.vcxproj -property:Configuration=Release
+        shell: cmd
+
+      - name: Upload binaries
+        uses: actions/upload-artifact@v4
+        with:
+          name: astcenc-windows-multi-cl  # artifact name; downloaded by sign-binaries and deleted there after signing
+          path: |
+            build_rel/*.zip
+            build_rel/*.zip.sha256
+            build_rel_arm64/*.zip
+            build_rel_arm64/*.zip.sha256
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'  # NOTE(review): the other build jobs in this workflow pin 3.11 - confirm this is intentional
+
+      - name: Get Python modules
+        run: |
+          python -m pip install --upgrade pip
+          pip install numpy Pillow
+        shell: cmd
+
+      - name: Run system tests  # Small image set; no --encoder argument is passed
+        run: |
+          python ./Test/astc_test_image.py --test-set Small
+        shell: cmd
+
+  sign-binaries:  # Signing job: fetch the macOS and Windows zips, sign them, re-upload as one artifact, delete the unsigned ones
+    if: github.repository_owner == 'Arm-software'  # only runs when the repository owner is Arm-software
+    name: Sign Windows and macOS
+    runs-on: [self-hosted-ubuntu-latest-x64]
+    needs: [build-macos-universal, build-windows-multi]  # the two jobs that upload the artifacts downloaded below
+    steps:
+      - name: Clean workspace
+        uses: AutoModality/action-clean@v1
+
+      - name: Checkout signing code  # later steps run wrapper scripts from the resulting signing/ directory
+        env:
+          SIGNING_REPO_URL: ${{ secrets.SIGNING_REPO_URL }}
+        run: |
+          git clone --depth 1 ${SIGNING_REPO_URL}
+
+      - name: Install code sign v2 client  # installs code-signer-client into the ./cs virtualenv from the Artifactory PyPI index
+        env:
+          ARTIFACTORY_USER: ${{ secrets.ARTIFACTORY_USER }}
+          ARTIFACTORY_APIKEY: ${{ secrets.ARTIFACTORY_APIKEY }}
+          ARTIFACTORY_FQDN: ${{ secrets.ARTIFACTORY_FQDN }}
+        run: |
+          python3.11 -m venv cs
+          . ./cs/bin/activate
+          pip install -i https://${ARTIFACTORY_USER}:${ARTIFACTORY_APIKEY}@${ARTIFACTORY_FQDN}/artifactory/api/pypi/dsgcore.pypi/simple code-signer-client
+
+      - name: Download macOS binaries
+        uses: actions/download-artifact@v4
+        with:
+          name: astcenc-macos-universal
+          path: mac  # download directory; re-uploaded as mac/* below
+
+      - name: Download Windows binaries
+        uses: actions/download-artifact@v4
+        with:
+          name: astcenc-windows-multi-cl
+          path: windows  # download directory; re-uploaded as windows/* below
+
+      - name: Sign macOS binaries  # runs the macOS wrapper script over every zip in mac/
+        env:
+          CODESIGNER_USER: ${{ secrets.CODESIGNER_USER }}
+        run: |
+          . ./cs/bin/activate
+          cd mac
+          python3 ${GITHUB_WORKSPACE}/signing/macos-client-wrapper.py ${CODESIGNER_USER} *.zip
+
+      - name: Sign Windows binaries  # moves files up out of the per-build subdirectories, then signs each zip
+        env:
+          ARTIFACTORY_APIKEY: ${{ secrets.ARTIFACTORY_APIKEY }}
+        run: |
+          . ./cs/bin/activate
+          cd windows
+          for FILENAME in */*; do mv ${FILENAME} .; done
+          for ZIPFILE in *.zip; do python3 ../signing/windows-client-wrapper.py -b ${GITHUB_RUN_NUMBER} -t ${ARTIFACTORY_APIKEY} ${ZIPFILE}; done
+
+      - name: Upload signed binaries
+        uses: actions/upload-artifact@v4
+        with:
+          name: signed-binaries  # artifact name; downloaded by the prepare-release job
+          path: |
+            windows/*
+            mac/*
+
+      - name: Tidy intermediate artifacts  # deletes the two build artifacts that were downloaded above
+        uses: geekyeggo/delete-artifact@v5
+        with:
+          name: |
+            astcenc-windows-multi-cl
+            astcenc-macos-universal
+
+  prepare-release:  # Release job: gather all binaries, merge checksums, render the release body, create a draft release
+    if: ${{ (startsWith(github.event.ref, 'refs/tags/')) && (github.repository_owner == 'Arm-software') }}  # tag refs only, and only when the repository owner is Arm-software
+    name: Prepare release
+    runs-on: ubuntu-22.04
+    needs: [sign-binaries, build-ubuntu-x64, build-ubuntu-arm64]  # every job whose artifact is downloaded below must finish first
+    steps:
+      - name: Git checkout
+        uses: actions/checkout@v4
+
+      - name: Download signed binaries
+        uses: actions/download-artifact@v4
+        with:
+          name: signed-binaries
+          path: prepare-release
+
+      - name: Download Linux x86_64 binaries
+        uses: actions/download-artifact@v4
+        with:
+          name: astcenc-linux-x86_64
+          path: prepare-release
+
+      - name: Download Linux arm64 binaries
+        uses: actions/download-artifact@v4
+        with:
+          name: astcenc-linux-arm64  # uploaded by build-ubuntu-arm64, hence its entry in `needs`
+          path: prepare-release
+
+      - name: Flatten file structure  # moves files out of the windows/ and mac/ subdirectories, then removes them
+        run: |
+          cd prepare-release
+          for FILENAME in */*; do mv ${FILENAME} .; done
+          rmdir windows
+          rmdir mac
+
+      - name: Create checksum file  # concatenates the per-zip .sha256 files into release-sha256.txt
+        run: |
+          cd prepare-release
+          cat *.sha256 > release-sha256.txt
+          rm *.sha256
+
+      - name: Create release body  # fills STATUS_DATE, RELEASE_VERSION and SHA_CHECKSUMS into the template via envsubst
+        run: |
+          export STATUS_DATE=$(date "+%B %Y")
+          GITHUB_REF=${{ github.ref }} ; export RELEASE_VERSION=${GITHUB_REF##*/}
+          export SHA_CHECKSUMS=$(cat prepare-release/release-sha256.txt)
+          envsubst < .github/workflows/release_body_template.md > prepare-release/release_body.md
+
+      - name: Create release
+        id: create_release  # referenced by the next step through steps.create_release.outputs.id
+        uses: comnoco/create-release-action@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ github.ref }}
+          release_name: ${{ github.ref }}
+          body_path: prepare-release/release_body.md
+          draft: true
+
+      - name: Attach artifacts
+        uses: AButler/upload-release-assets@v3.0
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          release-id: ${{ steps.create_release.outputs.id }}
+          files: "prepare-release/astcenc-*-*-*.zip;prepare-release/release-sha256.txt"
@@ -0,0 +1,13 @@
+**Status:** ${STATUS_DATE}
+
+The ${RELEASE_VERSION} release is a minor/major maintenance release.
+
+* **General:**
+  * **Bug fix:** Text here
+  * **Feature:** Text here
+
+## Binary release sha256 checksums
+
+```
+${SHA_CHECKSUMS}
+```
@@ -0,0 +1,47 @@
+# Editor and engineering scratch files
+.cache
+.vs
+.vscode
+.DS_Store
+*.log
+*.diff
+*.user
+*.o
+*.a
+__pycache__
+Scratch
+Proto
+
+# Precompiled reference binaries for comparison tests
+bin
+lib
+Binaries
+
+# Build artifacts
+astcenc
+build*
+
+# General build artifacts
+Test/DocOut
+
+# Test images we download from other sources
+Test/Images/Kodak*/**/*.png
+Test/Images/Scratch*
+
+# Test output
+TestOutput
+/*.xlsx
+/*.jpg
+/*.json
+/*.log
+/*.txt
+/*.hdr
+/*.png
+/*.exr
+/*.astc
+astc_reference-main*
+Docs/Profiling.md
+Source/astcenccli_version.h
+
+# Do not ignore workflows
+!.github/workflows/
@@ -0,0 +1,12 @@
+; DO NOT EDIT (unless you know what you are doing)
+;
+; This subdirectory is a git "subrepo", and this file is maintained by the
+; git-subrepo command. See https://github.com/git-commands/git-subrepo#readme
+;
+[subrepo]
+	remote = https://github.com/ARM-software/astc-encoder.git
+	branch = 5.3.0
+	commit = 30aabb3f42406df45a910d8496f9bee17eeba9bb
+	parent = f9c73388a58de9b83f260f11008b043d8f7c0954
+	method = merge
+	cmdver = 0.4.9
@@ -0,0 +1,532 @@
+[MASTER]
+
+# Control the amount of potential inferred values when inferring a single
+# object. This can help the performance when dealing with large functions or
+# complex, nested conditions.
+limit-inference-results=100
+
+# List of plugins (as comma separated values of python module names) to load,
+# usually to register additional checkers.
+load-plugins=pylint.extensions.docparams
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode=yes
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+# Ignore specific directories we don't author ourselves
+ignore=Test/DocSource
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
+confidence=
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=print-statement,
+        parameter-unpacking,
+        unpacking-in-except,
+        old-raise-syntax,
+        backtick,
+        long-suffix,
+        old-ne-operator,
+        old-octal-literal,
+        import-star-module-level,
+        non-ascii-bytes-literal,
+        raw-checker-failed,
+        bad-inline-option,
+        locally-disabled,
+        file-ignored,
+        suppressed-message,
+        useless-suppression,
+        deprecated-pragma,
+        use-symbolic-message-instead,
+        apply-builtin,
+        basestring-builtin,
+        buffer-builtin,
+        cmp-builtin,
+        coerce-builtin,
+        execfile-builtin,
+        file-builtin,
+        long-builtin,
+        raw_input-builtin,
+        reduce-builtin,
+        standarderror-builtin,
+        unicode-builtin,
+        xrange-builtin,
+        coerce-method,
+        delslice-method,
+        getslice-method,
+        setslice-method,
+        no-absolute-import,
+        old-division,
+        dict-iter-method,
+        dict-view-method,
+        next-method-called,
+        metaclass-assignment,
+        indexing-exception,
+        raising-string,
+        reload-builtin,
+        oct-method,
+        hex-method,
+        nonzero-method,
+        cmp-method,
+        input-builtin,
+        round-builtin,
+        intern-builtin,
+        unichr-builtin,
+        map-builtin-not-iterating,
+        zip-builtin-not-iterating,
+        range-builtin-not-iterating,
+        filter-builtin-not-iterating,
+        using-cmp-argument,
+        eq-without-hash,
+        div-method,
+        idiv-method,
+        rdiv-method,
+        exception-message-attribute,
+        invalid-str-codec,
+        sys-max-int,
+        bad-python3-import,
+        deprecated-string-function,
+        deprecated-str-translate-call,
+        deprecated-itertools-function,
+        deprecated-types-field,
+        next-method-defined,
+        dict-items-not-iterating,
+        dict-keys-not-iterating,
+        dict-values-not-iterating,
+        deprecated-operator-function,
+        deprecated-urllib-function,
+        xreadlines-attribute,
+        deprecated-sys-function,
+        exception-escape,
+        comprehension-escape
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifiers separated by comma (,) or put this option
+# multiple times (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=c-extension-no-member
+
+
+[REPORTS]
+
+# Python expression which should return a score less than or equal to 10. You
+# have access to the variables 'error', 'warning', 'refactor', and 'convention'
+# which contain the number of messages in each category, as well as 'statement'
+# which is the total number of statements analyzed. This score is used by the
+# global evaluation report (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+# Set the output format. Available formats are text, parseable, colorized, json
+# and msvs (visual studio). You can also give a reporter class, e.g.
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Tells whether to display a full report or only the messages.
+reports=no
+
+# Activate the evaluation score.
+score=yes
+
+
+[REFACTORING]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+# Complete names of functions that never return.
+never-returning-functions=sys.exit
+
+
+[BASIC]
+
+# Naming style matching correct argument names.
+argument-naming-style=camelCase
+
+# Regular expression matching correct argument names. Overrides argument-
+# naming-style.
+#argument-rgx=
+
+# Naming style matching correct attribute names.
+attr-naming-style=camelCase
+
+# Regular expression matching correct attribute names. Overrides attr-naming-
+# style.
+#attr-rgx=
+
+# Bad variable names which should always be refused, separated by a comma.
+bad-names=foo,
+          bar,
+          baz,
+          toto,
+          tutu,
+          tata
+
+# Naming style matching correct class attribute names.
+class-attribute-naming-style=any
+
+# Regular expression matching correct class attribute names. Overrides class-
+# attribute-naming-style.
+#class-attribute-rgx=
+
+# Naming style matching correct class names.
+class-naming-style=PascalCase
+
+# Regular expression matching correct class names. Overrides class-naming-
+# style.
+#class-rgx=
+
+# Naming style matching correct constant names.
+const-naming-style=UPPER_CASE
+
+# Regular expression matching correct constant names. Overrides const-naming-
+# style.
+#const-rgx=
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+# Naming style matching correct function names.
+function-naming-style=snake_case
+
+# Regular expression matching correct function names. Overrides function-
+# naming-style.
+#function-rgx=
+
+# Good variable names which should always be accepted, separated by a comma.
+good-names=i,j,k,x,y,z,w,r,g,b,a,ex,Run,_
+
+# Include a hint for the correct naming format with invalid-name.
+include-naming-hint=no
+
+# Naming style matching correct inline iteration names.
+inlinevar-naming-style=any
+
+# Regular expression matching correct inline iteration names. Overrides
+# inlinevar-naming-style.
+#inlinevar-rgx=
+
+# Naming style matching correct method names.
+method-naming-style=snake_case
+
+# Regular expression matching correct method names. Overrides method-naming-
+# style.
+#method-rgx=
+
+# Naming style matching correct module names.
+module-naming-style=snake_case
+
+# Regular expression matching correct module names. Overrides module-naming-
+# style.
+#module-rgx=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=^_
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+# These decorators are taken in consideration only for invalid-name.
+property-classes=abc.abstractproperty
+
+# Naming style matching correct variable names.
+variable-naming-style=camelCase
+
+[FORMAT]
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
+# tab).
+indent-string='    '
+
+# Maximum number of characters on a single line.
+max-line-length=79
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+single-line-class-stmt=no
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+
+[LOGGING]
+
+# Format style used to check logging format string. `old` means using %
+# formatting, `new` is for `{}` formatting, and `fstr` is for f-strings.
+logging-format-style=old
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules=logging
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,XXX,TODO
+
+
+[SIMILARITIES]
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+# Minimum number of lines of a similarity.
+min-similarity-lines=4
+
+
+[SPELLING]
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains the private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to the private dictionary (see the
+# --spelling-private-dict-file option) instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[STRING]
+
+# This flag controls whether the implicit-str-concat-in-sequence should
+# generate a warning on implicit string concatenation in sequences defined over
+# several lines.
+check-str-concat-over-line-jumps=no
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=signal
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+# List of decorators that change the signature of a decorated function.
+signature-mutators=
+
+
+[VARIABLES]
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+
+# Tells whether unused global variables should be treated as a violation.
+allow-global-unused-variables=yes
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore.
+ignored-argument-names=_.*|^ignored_|^unused_
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
+
+
+[CLASSES]
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+                      __new__,
+                      setUp,
+                      __post_init__
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+                  _fields,
+                  _replace,
+                  _source,
+                  _make
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=cls
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method.
+max-args=7
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=16
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr=5
+
+# Maximum number of branches for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=16
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=0
+
+
+[IMPORTS]
+
+# List of modules that can be imported at any level, not just the top level
+# one.
+allow-any-import-level=
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+# Deprecated modules which should not be used, separated by a comma.
+deprecated-modules=optparse,tkinter.tix
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled).
+ext-import-graph=
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled).
+import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled).
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+# Couples of modules and preferred modules, separated by a comma.
+preferred-modules=
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "BaseException, Exception".
+overgeneral-exceptions=BaseException,
+                       Exception
@@ -0,0 +1,315 @@
+# Building ASTC Encoder
+
+This page provides instructions for building `astcenc` from the sources in
+this repository.
+
+Builds must use CMake 3.15 or higher as the build system generator. The
+examples on this page show how to use it to generate build systems for NMake
+(Windows) and Make (Linux and macOS), but CMake supports other build system
+backends.
+
+## Windows
+
+Builds for Windows are tested with CMake 3.17, and Visual Studio 2019 or newer.
+
+### Configuring the build
+
+To use CMake you must first configure the build. Create a build directory in
+the root of the `astcenc` checkout, and then run `cmake` inside that directory
+to generate the build system.
+
+```shell
+# Create a build directory
+mkdir build
+cd build
+
+# Configure your build of choice, for example:
+
+# x86-64 using a Visual Studio solution
+cmake -G "Visual Studio 16 2019" -T ClangCL -DCMAKE_INSTALL_PREFIX=..\ ^
+    -DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
+
+# x86-64 using NMake
+cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=..\ ^
+    -DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
+```
+
+A single CMake configure can build multiple binaries for a single target CPU
+architecture, for example building x64 for both SSE2 and AVX2. Each binary name
+will include the build variant as a postfix. It is possible to build any set of
+the supported SIMD variants by enabling only the ones you require.
+
+Using the Visual Studio Clang-CL LLVM toolchain (`-T ClangCL`) is optional but
+produces significantly faster binaries than the default toolchain. The C++ LLVM
+toolchain component must be installed via the Visual Studio installer.
+
+### Building
+
+Once you have configured the build you can use NMake to compile the project
+from your build dir, and install to your target install directory.
+
+```shell
+# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
+cd build
+nmake install
+```
+
+## macOS and Linux using Make
+
+Builds for macOS and Linux are tested with CMake 3.17, and clang++ 9.0 or
+newer.
+
+> Compiling using g++ is supported, but clang++ builds are faster by ~15%.
+
+### Configuring the build
+
+To use CMake you must first configure the build. Create a build directory
+in the root of the astcenc checkout, and then run `cmake` inside that directory
+to generate the build system.
+
+```shell
+# Select your compiler (clang++ recommended, but g++ works)
+export CXX=clang++
+
+# Create a build directory
+mkdir build
+cd build
+
+# Configure your build of choice, for example:
+
+# Arm aarch64
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
+    -DASTCENC_ISA_NEON=ON ..
+
+# x86-64
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
+    -DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
+
+# macOS universal binary build
+cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ ..
+```
+
+A single CMake configure can build multiple binaries for a single target CPU
+architecture, for example building x64 for both SSE2 and AVX2. Each binary name
+will include the build variant as a postfix. It is possible to build any set of
+the supported SIMD variants by enabling only the ones you require.
+
+For macOS, we additionally support the ability to build a universal binary.
+This build includes SSE4.1 (`x86_64`), AVX2 (`x86_64h`), and NEON (`arm64`)
+build slices in a single output binary. The OS will select the correct variant
+to run for the machine being used. This is the default build target for a macOS
+build, but single-target binaries can still be built by setting
+`-DASTCENC_UNIVERSAL_BINARY=OFF` and then manually selecting the specific ISA
+variants that are required.
+
+### Building
+
+Once you have configured the build you can use Make to compile the project from
+your build dir, and install to your target install directory.
+
+```shell
+# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
+# for executable binaries and `${CMAKE_INSTALL_PREFIX}/lib/` for libraries
+cd build
+make install -j16
+```
+
+## macOS using XCode
+
+Builds for macOS are tested with CMake 3.17, and XCode 14.0 or
+newer.
+
+### Configuring the build
+
+To use CMake you must first configure the build. Create a build directory
+in the root of the astcenc checkout, and then run `cmake` inside that directory
+to generate the build system.
+
+```shell
+# Create a build directory
+mkdir build
+cd build
+
+# Configure a universal build
+cmake -G Xcode -DCMAKE_INSTALL_PREFIX=../ ..
+```
+
+### Building
+
+Once you have configured the build you can use CMake to compile the project
+from your build dir, and install to your target install directory.
+
+```shell
+cmake --build . --config Release
+
+# Optionally install the binaries to the installation directory
+cmake --install . --config Release
+```
+
+## Advanced build options
+
+For codec developers and power users there are a number of useful features in
+the build system.
+
+### Build Types
+
+We support and test the following `CMAKE_BUILD_TYPE` options.
+
+| Value            | Description                                              |
+| ---------------- | -------------------------------------------------------- |
+| Release          | Optimized release build                                  |
+| RelWithDebInfo   | Optimized release build with debug info                  |
+| Debug            | Unoptimized debug build with debug info                  |
+
+Note that optimized release builds are compiled with link-time optimization,
+which can make profiling more challenging.
+
+### Shared Libraries
+
+We support building the core library as a shared object by setting the CMake
+option `-DASTCENC_SHAREDLIB=ON` at configure time. For macOS build targets the
+shared library supports the same universal build configuration as the command
+line utility.
+
+Note that the command line tool is always statically linked; the shared objects
+are an extra build output that are not currently used by the command line tool.
+
+### Constrained block size builds
+
+All normal builds will support all ASTC block sizes, including the worst case
+6x6x6 3D block size (216 texels per block). Compressor memory footprint and
+performance can be improved by limiting the block sizes supported in the build
+by adding `-DASTCENC_BLOCK_MAX_TEXELS=<texel_count>` to the CMake command line
+when configuring. Legal block sizes that are unavailable in a restricted build
+will return the error `ASTCENC_ERR_NOT_IMPLEMENTED` during context creation.
+
+### Non-invariant builds
+
+All normal builds are designed to be invariant, so any build from the same git
+revision will produce bit-identical results for all compilers and CPU
+architectures. To achieve this we sacrifice some performance, so if this is
+not required you can specify `-DASTCENC_INVARIANCE=OFF` to enable additional
+optimizations. This has most benefit for AVX2 builds where we are able to
+enable use of the FMA instruction set extensions.
+
+### No intrinsics builds
+
+All normal builds will use SIMD accelerated code paths using intrinsics, as all
+supported target architectures (x86 and arm64) guarantee SIMD availability. For
+development purposes it is possible to build an intrinsic-free build which uses
+no explicit SIMD acceleration (the compiler may still auto-vectorize).
+
+To enable this binary variant add `-DASTCENC_ISA_NONE=ON` to the CMake command
+line when configuring. It is NOT recommended to use this for production; it is
+significantly slower than the vectorized SIMD builds.
+
+### No x86 gather instruction builds
+
+On many x86 microarchitectures the native AVX gather instructions are slower
+than simply performing manual scalar loads and combining the results. Gathers
+are enabled by default, but can be disabled by setting the CMake option
+`-DASTCENC_X86_GATHERS=OFF` on the command line when configuring.
+
+Note that we have seen mixed results when compiling the scalar fallback path,
+so we would recommend testing which option works best for the compiler and
+microarchitecture pairing that you are targeting.
+
+### Test builds
+
+We support building unit tests. These use the `googletest` framework, which is
+pulled in through a git submodule. On first use, you must fetch the submodule
+dependency:
+
+```shell
+git submodule init
+git submodule update
+```
+
+To build unit tests add `-DASTCENC_UNITTEST=ON` to the CMake command line when
+configuring.
+
+To run unit tests use the CMake `ctest` utility from your build directory after
+you have built the tests.
+
+```shell
+cd build
+ctest --verbose
+```
+
+### Sanitizer builds
+
+We support building with sanitizers on Linux and macOS when using Clang.
+
+To build binaries with ASAN checking enabled add `-DASTCENC_ASAN=ON` to the
+CMake command line when configuring.
+
+To build binaries with UBSAN checking enabled add `-DASTCENC_UBSAN=ON` to the
+CMake command line when configuring.
+
+### Android builds
+
+Builds of the command line utility for Android are not officially supported, but can be a useful
+development build for testing on e.g. different Arm CPU microarchitectures.
+
+The build script below shows one possible route to building the command line tool for Android. Once
+built the application can be pushed to e.g. `/data/local/tmp` and executed from an Android shell
+terminal over `adb`.
+
+```shell
+ANDROID_ABI=arm64-v8a
+ANDROID_NDK=/work/tools/android/ndk/22.1.7171670
+
+BUILD_TYPE=RelWithDebInfo
+
+BUILD_DIR=build
+
+mkdir -p ${BUILD_DIR}
+cd ${BUILD_DIR}
+
+cmake \
+    -DCMAKE_INSTALL_PREFIX=./ \
+    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+    -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
+    -DANDROID_ABI=${ANDROID_ABI} \
+    -DANDROID_ARM_NEON=ON \
+    -DANDROID_PLATFORM=android-21 \
+    -DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=clang \
+    -DANDROID_TOOLCHAIN=clang \
+    -DANDROID_STL=c++_static \
+    -DARCH=aarch64 \
+    -DASTCENC_ISA_NEON=ON \
+    ..
+
+make -j16
+```
+
+## Packaging a release bundle
+
+We support building a release bundle of all enabled binary configurations in
+the current CMake configuration using the `package` build target.
+
+Configure CMake with:
+
+* `-DASTCENC_PACKAGE=<arch>` to set the package architecture/variant name used
+to name the package archive (not set by default).
+
+```shell
+# Run a build and package build outputs in `./astcenc-<ver>-<os>-<arch>.<fmt>`
+cd build
+make package -j16
+```
+
+Windows packages will use the `.zip` format, other packages will use the
+`.tar.gz` format.
+
+## Integrating as a library into another project
+
+The core codec of `astcenc` is built as a library, and so can be easily
+integrated into other projects using CMake. An example of the CMake integration
+and the codec API usage can be found in the `./Utils/Example` directory in the
+repository. See the [Example Readme](../Utils/Example/README.md) for more
+details.
+
+- - -
+
+_Copyright © 2019-2024, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,328 @@
+# 2.x series change log
+
+This page summarizes the major functional and performance changes in each
+release of the 2.x series.
+
+All performance data on this page is measured on an Intel Core i5-9600K
+clocked at 4.2 GHz, running astcenc using 6 threads.
+
+<!-- ---------------------------------------------------------------------- -->
+## 2.5
+
+**Status:** Released, March 2021
+
+The 2.5 release is the last major release in the 2.x series. After this release
+a `2.x` branch will provide stable long-term support, and the `main` branch
+will switch to focusing on more radical changes for the 3.x series.
+
+Reminder for users of the library interface - the API is not designed to be
+stable across versions, and this release is not compatible with earlier 2.x
+releases. Please update and rebuild your client-side code using the updated
+`astcenc.h` header.
+
+* **General:**
+  * **Feature:** The `ISA_INVARIANCE` build option is no longer supported, as
+    there is no longer any performance benefit from the variant paths. All
+    builds are now using the equivalent of the `ISA_INVARIANCE=ON` setting, and
+    all builds (except Armv7) are now believed to be invariant across operating
+    systems, compilers, CPU architectures, and SIMD instruction sets.
+  * **Feature:** Armv8 32-bit builds with NEON are now supported, with
+    out-of-the-box support for Arm Linux soft-float and hard-float ABIs. There
+    are no pre-built binaries for these targets; support is included for
+    library users targeting older 32-bit Android and iOS devices.
+  * **Feature:** A compressor mode for encoding HDR textures that have been
+    encoded into LDR RGBM wrapper format is now supported. Note that this
+    encoding has some strong recommendations for how the RGBM encoding is
+    implemented to avoid block artifacts in the compressed image.
+* **Core API:**
+  * **API Change:** The core API has been changed to be a pure C API, making it
+    easier to wrap the codec in a stable shared library ABI. Some entry points
+    that used to accept references now expect pointers.
+  * **API Change:** The decompression functionality in the core API has been
+    changed to allow use of multiple threads. The design pattern matches the
+    compression functionality, requiring the caller to create the threads,
+    synchronize them between images, and to call the new
+    `astcenc_decompress_reset()` function between images.
+  * **API Feature:** Defines to support exporting public API entry point
+    symbols from a shared object are provided, but not exposed off-the-shelf by
+    the CMake provided by the project.
+  * **API Feature:** New `astcenc_get_block_info()` function added to the core
+    API to allow users to perform high level analysis of compressed data. This
+    API is not implemented in decompressor-only builds.
+  * **API Feature:** Codec configuration structure has been extended to expose
+    the new RGBM compression mode. See the API header for details.
+
+<!-- ---------------------------------------------------------------------- -->
+## 2.4
+
+**Status:** Released, February 2021
+
+The 2.4 release is the fifth release in the 2.x series. It is primarily a bug
+fix release for HDR image handling, which impacts all earlier 2.x series
+releases.
+
+* **General:**
+  * **Feature:** When using the `-a` option, or the equivalent config option
+    for the API, any 2D blocks that are entirely zero alpha after the alpha
+    filter radius is taken into account are replaced by transparent black
+    constant color blocks. This is an RDO-like technique to improve compression
+    ratios of any additional application packaging compression that is applied.
+* **Command Line:**
+  * **Bug fix:** The command line wrapper now correctly loads HDR images that
+    have a non-square aspect ratio.
+
+<!-- ---------------------------------------------------------------------- -->
+## 2.3
+
+**Status:** Released, January 2021
+
+The 2.3 release is the fourth release in the 2.x series. It includes a number
+of performance improvements and new features.
+
+Reminder for users of the library interface - the API is not designed to be
+stable across versions, and this release is not compatible with 2.2. Please
+recompile your client-side code using the updated `astcenc.h` header.
+
+* **General:**
+  * **Feature:** Decompressor-only builds of the codec are supported again.
+    While this is primarily a feature for library users who want to shrink
+    binary size, a variant command line tool `astcdec` can be built by
+    specifying `DECOMPRESSOR=ON` on the CMake configure command line.
+  * **Feature:** Diagnostic builds of the codec can now be built. These builds
+    generate a JSON file containing a trace of the compressor execution.
+    Diagnostic builds are only suitable for codec development; they are slower
+    and JSON generation cannot be disabled. Build by setting `DIAGNOSTICS=ON`
+    on the CMake configure command line.
+  * **Feature:** Code compatibility improved with older versions of GCC,
+    earliest compiler now tested is GCC 7.5 (was GCC 9.3).
+  * **Feature:** Code compatibility improved with newer versions of LLVM,
+    latest compiler now tested is Clang 12.0 (was Clang 9.0).
+  * **Feature:** Code compatibility improved with the Visual Studio 2019 LLVM
+    toolset (`clang-cl`). Using the LLVM toolset gives 25% performance
+    improvements and is recommended.
+* **Command Line:**
+  * **Feature:** Quality level now accepts either a preset (`-fast`, etc) or a
+    float value between 0 and 100, allowing more control over the compression
+    quality vs performance trade-off. The presets are not evenly spaced in the
+    float range; they have been spaced to give the best distribution of points
+    between the fast and thorough presets.
+    * `-fastest`: 0.0
+    * `-fast`: 10.0
+    * `-medium`: 60.0
+    * `-thorough`: 98.0
+    * `-exhaustive`: 100.0
+* **Core API:**
+  * **API Change:** Quality level preset enum replaced with a float value
+    between 0 (`-fastest`) and 100 (`-exhaustive`). See above for more info.
+
+### Performance
+
+This release includes a number of optimizations to improve performance.
+
+* New compressor algorithm for handling encoding candidates and refinement.
+* Vectorized implementation of `compute_error_of_weight_set()`.
+* Unrolled implementation of `encode_ise()`.
+* Many other small improvements!
+
+The most significant change is the change to the compressor path, which now
+uses an adaptive approach to candidate trials and block refinement.
+
+In earlier releases the quality level will determine the number of encoding
+candidates and the number of iterative refinement passes that are used for each
+major encoding trial. This is a fixed behavior; it will always try the full N
+candidates and M refinement iterations specified by the quality level for each
+encoding trial.
+
+The new approach implements two optimizations for this:
+
+* Compression will complete when a block candidate hits the specified target
+  quality, after its M refinement iterations have been applied. Later block
+  candidates are simply abandoned.
+* Block candidates will predict how much refinement can improve them, and
+  abandon refinement if they are unlikely to improve upon the best known
+  encoding already in-hand.
+
+This pair of optimizations provides significant performance improvement to the
+high quality modes which use the most block candidates and refinement
+iterations. A minor loss of image quality is expected, as the blocks we no
+longer test or refine may have been better coding choices.
+
+**Absolute performance vs 2.2 release:**
+
+![Absolute scores 2.3 vs 2.2](./ChangeLogImg/absolute-2.2-to-2.3.png)
+
+**Relative performance vs 2.2 release:**
+
+![Relative scores 2.3 vs 2.2](./ChangeLogImg/relative-2.2-to-2.3.png)
+
+<!-- ---------------------------------------------------------------------- -->
+## 2.2
+
+**Status:** Released, January 2021
+
+The 2.2 release is the third release in the 2.x series. It includes a number
+of performance improvements and new features.
+
+Reminder for users of the library interface - the API is not designed to be
+stable across versions, and this release is not compatible with 2.1. Please
+recompile your client-side code using the updated `astcenc.h` header.
+
+* **General:**
+  * **Feature:** New Arm aarch64 NEON accelerated vector library support.
+  * **Improvement:** New CMake build system for all platforms.
+  * **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more
+    accurately reflects the feature set used.
+* **Binary releases:**
+  * **Improvement:** Linux binaries changed to use Clang 9.0, which gives
+    up to 15% performance improvement.
+  * **Improvement:** Windows binaries are now code signed.
+  * **Improvement:** macOS binaries for Apple silicon platforms now provided.
+  * **Improvement:** macOS binaries are now code signed and notarized.
+* **Command Line:**
+  * **Feature:** New image preprocess `-pp-normalize` option added. This forces
+    normal vectors to be unit length, which is useful when compressing source
+    textures that use normal length to encode an NDF, which is incompatible
+    with ASTC's two channel encoding.
+  * **Feature:** New image preprocess `-pp-premultiply` option added. This
+    scales RGB values by the alpha value. This can be useful to minimize
+    cross-channel color bleed caused by GPU post-multiply filtering/blending.
+  * **Improvement:** Command line tool cleanly traps and reports errors for
+    corrupt input images rather than relying on standard library `assert()`
+    calls in release builds.
+* **Core API:**
+  * **API Change:** Images using region-based metrics no longer need to include
+    padding; all input images should be tightly packed and `dim_pad` is removed
+    from the `astcenc_image` structure. This makes it easier to directly use
+    images loaded from other libraries.
+  * **API Change:** Image `data` is no longer a 3D array accessed using
+    `data[z][y][x]` indexing, it's an array of 2D slices. This makes it easier
+    to directly use images loaded from other libraries.
+  * **API Change:** New `ASTCENC_FLG_SELF_DECOMPRESS_ONLY` flag added to the
+    codec config. Using this flag enables additional optimizations that
+    aggressively exploit implementation- and configuration-specific behavior
+    to gain performance. When using this flag the codec can only reliably
+    decompress images that were compressed in the same context session. Images
+    produced via other means may fail to decompress correctly, even if they are
+    otherwise valid ASTC files.
+
+### Performance
+
+There is one major set of optimizations in this release, related to the new
+`ASTCENC_FLG_SELF_DECOMPRESS_ONLY` mode. These allow the compressor to only
+create data tables it knows that it is going to use, based on its current set
+of heuristics, rather than needing the full set the format allows.
+
+The first benefit of these changes is a reduced context creation time, which
+can be reduced by up to 250ms on our test machine. This is a significant
+percentage of the command line utility runtime for a small image when using a
+quick search preset. Compressing the whole Kodak test suite using the command
+line utility and the `-fastest` preset is ~30% faster with this release, which
+is mostly due to faster startup.
+
+The reduction in the data table size in this mode also improves the core codec
+speed. Our test sets show an average of 12% improvement in the codec for
+`-fastest` mode, and an average of 3% for `-medium` mode.
+
+Key for performance charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Absolute performance vs 2.1 release:**
+
+![Absolute scores 2.2 vs 2.1](./ChangeLogImg/absolute-2.1-to-2.2.png)
+
+**Relative performance vs 2.1 release:**
+
+![Relative scores 2.2 vs 2.1](./ChangeLogImg/relative-2.1-to-2.2.png)
+
+
+<!-- ---------------------------------------------------------------------- -->
+## 2.1
+
+**Status:** Released, November 2020
+
+The 2.1 release is the second release in the 2.x series. It includes a number
+of performance optimizations and new features.
+
+Reminder for users of the library interface - the API is not designed to be
+stable across versions, and this release is not compatible with 2.0. Please
+recompile your client-side code using the updated `astcenc.h` header.
+
+### Features:
+
+* **Command line:**
+  * **Bug fix:** The meaning of the `-tH\cH\dH` and `-th\ch\dh` compression
+    modes was inverted. They now match the documentation; use `-*H` for HDR
+    RGBA, and `-*h` for HDR RGB with LDR alpha.
+  * **Feature:** A new `-fastest` quality preset is now available. This is
+    designed for fast "roughing out" of new content, and sacrifices significant
+    image quality compared to `-fast`. We do not recommend its use for
+    production builds.
+  * **Feature:** A new `-candidatelimit` compression tuning option is now
+    available. This is a power-user control to determine how many candidates
+    are returned for each block mode encoding trial. This feature is used
+    automatically by the search presets; see `-help` for details.
+  * **Improvement:** The compression test modes (`-tl\ts\th\tH`) now emit a
+    MTex/s performance metric, in addition to coding time.
+* **Core API:**
+  * **Feature:** A new quality preset `ASTCENC_PRE_FASTEST` is available. See
+    `-fastest` above for details.
+  * **Feature:** A new tuning option `tune_candidate_limit` is available in
+    the config structure. See `-candidatelimit` above for details.
+  * **Feature:** Image input/output can now use `ASTCENC_TYPE_F32` data types.
+* **Stability:**
+  * **Feature:** The SSE2, SSE4.2, and AVX2 variants now produce identical
+    compressed output when run on the same CPU when compiled with the
+    preprocessor define `ASTCENC_ISA_INVARIANCE=1`. For Make builds this can
+    be set on the command line by setting `ISA_INV=1`. ISA invariance is off
+    by default; it reduces performance by 1-3%.
+
+### Performance
+
+Key for performance charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Absolute performance vs 2.0 release:**
+
+![Absolute scores 2.1 vs 2.0](./ChangeLogImg/absolute-2.0-to-2.1.png)
+
+**Relative performance vs 2.0 release:**
+
+![Relative scores 2.1 vs 2.0](./ChangeLogImg/relative-2.0-to-2.1.png)
+
+
+<!-- ---------------------------------------------------------------------- -->
+## 2.0
+
+**Status:** Released, August 2020
+
+The 2.0 release is the first release in the 2.x series. It includes a number of
+major changes over the earlier 1.7 series, and is not command-line compatible.
+
+### Features:
+
+* The core codec can be built as a library, exposed via a new codec API.
+* The core codec supports accelerated SIMD paths for SSE2, SSE4.2, and AVX2.
+* The command line syntax has a clearer mapping to Khronos feature profiles.
+
+### Performance:
+
+Key for performance charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Absolute performance vs 1.7 release:**
+
+![Absolute scores 2.0 vs 1.7](./ChangeLogImg/absolute-1.7-to-2.0.png)
+
+**Relative performance vs 1.7 release:**
+
+![Relative scores 2.0 vs 1.7](./ChangeLogImg/relative-1.7-to-2.0.png)
+
+- - -
+
+_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,308 @@
+# 3.x series change log
+
+This page summarizes the major functional and performance changes in each
+release of the 3.x series.
+
+All performance data on this page is measured on an Intel Core i5-9600K
+clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
+
+<!-- ---------------------------------------------------------------------- -->
+## 3.7
+
+**Status:** April 2022
+
+The 3.7 release contains another round of performance optimizations, including
+significant improvements to the command line front-end (faster PNG loader) and
+the arm64 build of the codec (faster NEON implementation).
+
+* **General:**
+  * **Feature:** The command line tool PNG loader has been switched to use
+    the Wuffs library, which is robust and significantly faster than the
+    current stb_image implementation.
+  * **Feature:** Support for non-invariant builds returns. Opt-in to slightly
+    faster, but not bit-exact, builds by setting `-DNO_INVARIANCE=ON` for the
+    CMake configuration. This improves performance by around 2%.
+  * **Optimization:** Changed SIMD `select()` so that it matches the default
+    NEON behavior (bitwise select), rather than the default x86-64 behavior
+    (lane select on MSB). Specialization `select_msb()` added for the one case
+    we want to select on a sign-bit, where NEON needs a different
+    implementation. This provides a significant (>25%) performance uplift on
+    NEON implementations.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 3.6 release:**
+
+![Relative scores 3.7 vs 3.6](./ChangeLogImg/relative-3.6-to-3.7.png)
+
+<!-- ---------------------------------------------------------------------- -->
+## 3.6
+
+**Status:** April 2022
+
+The 3.6 release contains another round of performance optimizations.
+
+There are no interface changes in this release, but in general the API is not
+designed to be binary compatible across versions. We always recommend
+rebuilding your client-side code using the updated `astcenc.h` header.
+
+* **General:**
+  * **Feature:** Data tables are now optimized for contexts without the
+    `SELF_DECOMPRESS_ONLY` flag set. The flag therefore no longer improves
+    compression performance, but still reduces context creation time and
+    context data table memory footprint.
+  * **Feature:** Image quality for 4x4 `-fastest` configuration has been
+    improved.
+  * **Optimization:** Decimation modes are reliably excluded from processing
+    when they are only partially selected in the compressor configuration (e.g.
+    if used for single plane, but not dual plane modes). This is a significant
+    performance optimization for all quality levels.
+  * **Optimization:** Fast-path block load function variant added for 2D LDR
+    images with no swizzle. This is a moderate performance optimization for the
+    fast and fastest quality levels.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 3.5 release:**
+
+![Relative scores 3.6 vs 3.5](./ChangeLogImg/relative-3.5-to-3.6.png)
+
+<!-- ---------------------------------------------------------------------- -->
+## 3.5
+
+**Status:** March 2022
+
+The 3.5 release contains another round of performance optimizations.
+
+There are no interface changes in this release, but in general the API is not
+designed to be binary compatible across versions. We always recommend
+rebuilding your client-side code using the updated `astcenc.h` header.
+
+* **General:**
+  * **Feature:** Compressor configurations using `SELF_DECOMPRESS_ONLY` mode
+    store compacted partition tables, which significantly improves both
+    context create time and runtime performance.
+  * **Feature:** Bilinear infill for decimated weight grids supports a new
+    variant for half-decimated grids which are only decimated in one axis.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 3.4 release:**
+
+![Relative scores 3.5 vs 3.4](./ChangeLogImg/relative-3.4-to-3.5.png)
+
+
+<!-- ---------------------------------------------------------------------- -->
+## 3.4
+
+**Status:** February 2022
+
+The 3.4 release introduces another round of optimizations, removing a number
+of power-user configuration options to simplify the core compressor data path.
+
+Reminder for users of the library interface - the API is not designed to be
+binary compatible across versions, and this release is not compatible with
+earlier releases. Please update and rebuild your client-side code using the
+updated `astcenc.h` header.
+
+* **General:**
+  * **Feature:** Many memory allocations have been moved off the stack into
+    dynamically allocated working memory. This significantly reduces the peak
+    stack usage, allowing the compressor to run in systems with 128KB stack
+    limits.
+  * **Feature:** Builds now support `-DBLOCK_MAX_TEXELS=<count>` to allow a
+    compressor to support a subset of block sizes. This can reduce binary size
+    and runtime memory footprint, and improve performance.
+  * **Feature:** The `-v` and `-va` options to set a per-texel error weight
+    function are no longer supported.
+  * **Feature:** The `-b` option to set a per-texel error weight boost for
+    block border texels is no longer supported.
+  * **Feature:** The `-a` option to set a per-texel error weight based on texel
+    alpha value is no longer supported as an error weighting tool, but is still
+    supported for providing sprite-sheet RDO.
+  * **Feature:** The `-mask` option to set an error metric for mask map
+    textures is still supported, but is currently a no-op in the compressor.
+  * **Feature:** The `-perceptual` option to set a perceptual error metric is
+    still supported, but is currently a no-op in the compressor for mask map
+    and normal map textures.
+  * **Bug-fix:** Corrected decompression of error blocks in some cases, so now
+    returning the expected error color (magenta for LDR, NaN for HDR). Note
+    that astcenc determines the error color to use based on the output image
+    data type not the decoder profile.
+* **Binary releases:**
+  * **Improvement:** Windows binaries changed to use ClangCL 12.0, which gives
+    up to 10% performance improvement.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 3.3 release:**
+
+![Relative scores 3.4 vs 3.3](./ChangeLogImg/relative-3.3-to-3.4.png)
+
+
+<!-- ---------------------------------------------------------------------- -->
+## 3.3
+
+**Status:** November 2021
+
+The 3.3 release improves image quality for normal maps, and two component
+textures. Normal maps are expected to compress 25% slower than the 3.2
+release, although it should be noted that they are still faster to compress
+in 3.3 than when using the 2.5 series. This release also fixes one reported
+stability issue.
+
+* **General:**
+  * **Feature:** Normal map image quality has been improved.
+  * **Feature:** Two component image quality has been improved, provided
+    that unused components are correctly zero-weighted using e.g. `-cw` on the
+    command line.
+  * **Bug-fix:** Improved stability when trying to compress complex blocks that
+    could not beat even the starting quality threshold. These will now always
+    compress in to constant color blocks.
+
+<!-- ---------------------------------------------------------------------- -->
+## 3.2
+
+**Status:** August 2021
+
+The 3.2 release is a bugfix release; no significant image quality or
+performance differences are expected.
+
+* **General:**
+  * **Bug-fix:** Improved stability when new contexts were created while other
+    contexts were compressing or decompressing an image.
+  * **Bug-fix:** Improved stability when decompressing blocks with invalid
+    block encodings.
+
+<!-- ---------------------------------------------------------------------- -->
+## 3.1
+
+**Status:** July 2021
+
+The 3.1 release gives another performance boost, typically between 5 and 20%
+faster than the 3.0 release, as well as further incremental improvements to
+image quality. A number of build system improvements make astcenc easier and
+faster to integrate into other projects as a library, including support for
+building universal binaries on macOS. Full change list is shown below.
+
+Reminder for users of the library interface - the API is not designed to be
+binary compatible across versions, and this release is not compatible with
+earlier releases. Please update and rebuild your client-side code using the
+updated `astcenc.h` header.
+
+* **General:**
+  * **Feature:** RGB color data now supports `-perceptual` operation. The
+    current implementation is simple, weighting color channel errors by their
+    contribution to perceived luminance. This mimics the behavior of the human
+    visual system, which is most sensitive to green, then red, then blue.
+  * **Feature:** Codec supports a new low weight search mode, which is a
+    simpler weight assignment for encodings with a low number of weights in the
+    weight grid. The weight threshold can be overridden using the new
+    `-lowweightmodelimit` command line option.
+  * **Feature:** All platform builds now support building a native binary.
+    Native binaries automatically select the SIMD level based on the default
+    configuration of the compiler in use. Native binaries built on one machine
+    may use different SIMD options than native binaries built on another.
+  * **Feature:** macOS platform builds now support building universal binaries
+    containing both `x86_64` and `arm64` target support.
+  * **Feature:** Building the command line can be disabled when using as a
+    library in another project. Set `-DCLI=OFF` during the CMake configure
+    step.
+  * **Feature:** A standalone minimal example of the core codec API usage has
+    been added in the `./Utils/Example/` directory.
+* **Core API:**
+  * **Feature:** Config flag `ASTCENC_FLG_USE_PERCEPTUAL` works for color data.
+  * **Feature:** Config option `tune_low_weight_count_limit` added.
+  * **Feature:** New heuristic added which prunes dual weight plane searches if
+    they are unlikely to help. This heuristic is not user controllable.
+  * **Feature:** Image quality has been improved. In general we see significant
+    improvements (up to 0.2dB) for high bitrate encodings (4x4, 5x4), and a
+    smaller improvement (up to 0.1dB) for lower bitrate encodings.
+  * **Bug fix:** Arm "none" SIMD builds could be invariant with other builds.
+    This fix has also been back-ported to the 2.x LTS branch.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 3.0 release:**
+
+![Relative scores 3.1 vs 3.0](./ChangeLogImg/relative-3.0-to-3.1.png)
+
+<!-- ---------------------------------------------------------------------- -->
+## 3.0
+
+**Status:** June 2021
+
+The 3.0 release is the first in a series of updates to the compressor that are
+making more radical changes than we felt we could make with the 2.x series.
+The primary goals of the 3.x series are to keep the image quality ~static or
+better compared to the 2.5 release, but continue to improve performance.
+
+Reminder for users of the library interface - the API is not designed to be
+binary compatible across versions, and this release is not compatible with
+earlier releases. Please update and rebuild your client-side code using the
+updated `astcenc.h` header.
+
+* **General:**
+  * **Feature:** The code has been significantly cleaned up, with improved
+    comments, API documentation, function naming, and variable naming.
+* **Core API:**
+  * **API Change:** The core APIs for `astcenc_compress_image()` and for
+    `astcenc_decompress_image()` now accept swizzle structures by `const`
+    pointer, instead of pass-by-value.
+  * **API Change:** Calling the `astcenc_compress_reset()` and the
+    `astcenc_decompress_reset()` functions between images is no longer required
+    if the context was created for use by a single thread.
+  * **Feature:** New heuristics have been added for controlling when to search
+    beyond 2 partitions and 1 plane, and when to search beyond 3 partitions and
+    1 plane. The previous `tune_partition_early_out_limit` config option has
+    been removed, and replaced with two new options
+    `tune_2_partition_early_out_limit_factor` and
+    `tune_3_partition_early_out_limit_factor`. See command line help for more
+    detailed documentation.
+  * **Feature:** New heuristics have been added for controlling when to use
+    dual weight planes. The previous `tune_two_plane_early_out_limit` has been
+    renamed to `tune_2_plane_early_out_limit_correlation`. See command line help
+    for more detailed documentation.
+  * **Feature:** Support for using dual weight planes has been restricted to
+    single partition blocks; it rarely helps blocks with 2 or more partitions
+    and takes considerable compression search time.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 2.5 release:**
+
+![Relative scores 3.0 vs 2.5](./ChangeLogImg/relative-2.5-to-3.0.png)
+
+- - -
+
+_Copyright © 2021-2022, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,416 @@
+# 4.x series change log
+
+This page summarizes the major functional and performance changes in each
+release of the 4.x series.
+
+All performance data on this page is measured on an Intel Core i5-9600K
+clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.8.0
+
+**Status:** May 2024
+
+The 4.8.0 release is a minor maintenance release.
+
+* **General:**
+  * **Bug fix:** Native builds on macOS will now correctly build for arm64 when
+    run outside of Rosetta on an Apple silicon device.
+  * **Bug fix:** Multiple small improvements to remove use of undefined
+    language behavior, to improve support for deployment using Emscripten.
+  * **Feature:** Builds using Clang can now build with undefined behavior
+    sanitizer by setting `-DASTCENC_UBSAN=ON` on the CMake configure line.
+  * **Feature:** Updated to Wuffs library 0.3.4, which ignores tRNS alpha
+    chunks for type 4 (LA) and 6 (RGBA) PNGs, to improve compatibility with
+    libpng.
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.7.0
+
+**Status:** January 2024
+
+The 4.7.0 release is a major maintenance release, fixing rounding behavior in
+the decompressor to match the Khronos specification. This fix includes the
+addition of explicit support for optimizing for `decode_unorm8` rounding.
+
+Reminder - the codec library API is not designed to be binary compatible across
+versions. We always recommend rebuilding your client-side code using the
+updated `astcenc.h` header.
+
+* **General:**
+  * **Bug fix:** sRGB LDR decompression now uses the correct endpoint expansion
+    method to create the 16-bit RGB endpoint colors, and removes the previous
+    correction code from the interpolation function. This bug could result in
+    LSB bit flips relative to the standard specification.
+  * **Bug fix:** Decompressing to an 8-bit per component output image now
+    matches the `decode_unorm8` extension rounding rules. This bug could result
+    in LSB bit flips relative to the standard specification.
+  * **Bug fix:** Code now avoids using `alignas()` in the reference C
+    implementation, as the default `alignas(16)` is narrower than the
+    native minimum alignment requirement on some CPUs.
+  * **Feature:** Library configuration supports a new flag,
+    `ASTCENC_FLG_USE_DECODE_UNORM8`. This flag indicates that the image will be
+    used with the `decode_unorm8` decode mode. When set during compression
+    this allows the compressor to use the correct rounding when determining the
+    best encoding.
+  * **Feature:** Command line tool supports a new option, `-decode_unorm8`.
+    This option indicates that the image will be used with the `decode_unorm8`
+    decode mode. This option will automatically be set for decompression
+    (`-d*`) and trial (`-t*`) tool operation if the decompressed output image
+    is stored to an 8-bit per component file format. This option must be set
+    manually for compression (`-c*`) tool operation, as the desired decode mode
+    cannot be reliably determined.
+  * **Feature:** Library configuration supports a new optional progress
+    reporting callback to be specified. This is called during compression to
+    allow interactive tooling use cases to display incremental progress. The
+    command line tool uses this feature to show compression progress unless
+    `-silent` is used.
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.6.1
+
+**Status:** November 2023
+
+The 4.6.1 release is a minor maintenance release to fix a scaling bug on
+large core count Windows systems.
+
+* **General:**
+  * **Optimization:** Windows builds of the `astcenc` command line tool can now
+    use more than 64 cores on large core count systems. This change doubled
+    command line performance for `-exhaustive` compression when testing on a
+    96 core/192 thread system.
+  * **Feature:** Windows Arm64 native builds of the `astcenc` command line tool
+    are now included in the prebuilt release binaries.
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.6.0
+
+**Status:** November 2023
+
+The 4.6.0 release retunes the compressor heuristics to give improvements to
+performance for trivial losses to image quality. It also includes some minor
+bug fixes and code quality improvements.
+
+Reminder - the codec library API is not designed to be binary compatible across
+versions. We always recommend rebuilding your client-side code using the updated
+`astcenc.h` header.
+
+* **General:**
+  * **Bug-fix:** Fixed context allocation for contexts allocated with the
+    `ASTCENC_FLG_DECOMPRESS_ONLY` flag.
+  * **Bug-fix:** Reduced use of `reinterpret_cast` in the core codec to
+    avoid strict aliasing violations.
+  * **Optimization:** `-medium` search quality no longer tests 4 partition
+     encodings for block sizes between 25 and 83 texels (inclusive). This
+     improves performance for a tiny drop in image quality.
+  * **Optimization:** `-thorough` and higher search qualities no longer test the
+     mode0 first search for block sizes between 25 and 83 texels (inclusive).
+     This improves performance for a tiny drop in image quality.
+  * **Optimization:** `TUNE_MAX_PARTITIONING_CANDIDATES` reduced from 32 to 8
+     to reduce the size of stack allocated data structures. This causes a tiny
+     drop in image quality for the `-verythorough` and `-exhaustive` presets.
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.5.0
+
+**Status:** June 2023
+
+The 4.5.0 release is a maintenance release with small image quality
+improvements, and a number of build system quality of life improvements.
+
+* **General:**
+  * **Bug-fix:** Improved handling of compiler arguments in CMake, including
+    consistent use of MSVC-style command line arguments for ClangCL.
+  * **Bug-fix:** Invariant Clang builds now use `-ffp-model=precise` with
+    `-ffp-contract=off` which is needed to restore invariance due to recent
+    changes in compiler defaults.
+  * **Change:** macOS binary releases are now distributed as a single universal
+    binary for all platforms.
+  * **Change:** Windows binary releases are now compiled with VS2022.
+  * **Change:** Invariant MSVC builds for VS2022 now use `/fp:precise` instead
+    of `/fp:strict`, which is now possible because precise no longer implies
+    contraction. This should improve performance for MSVC builds.
+  * **Change:** Non-invariant Clang builds now use `-ffp-model=precise` with
+    `-ffp-contract=on`. This should improve performance on older Clang
+    versions which defaulted to no contraction.
+  * **Change:** Non-invariant MSVC builds for VS2022 now use `/fp:precise`
+    with `/fp:contract`. This should improve performance for MSVC builds.
+  * **Change:** CMake config variables now use an `ASTCENC_` prefix to add a
+    namespace and group options when the library is used in a larger project.
+  * **Change:** CMake config `ASTCENC_UNIVERSAL_BUILD` for building macOS
+    universal binaries has been improved to include the `x86_64h` slice for
+    AVX2 builds. Universal builds are now on by default for macOS, and always
+    include NEON (arm64), SSE4.1 (x86_64), and AVX2 (x86_64h) variants.
+  * **Change:** CMake config `ASTCENC_NO_INVARIANCE` has been inverted to
+    remove the negated option, and is now `ASTCENC_INVARIANCE` with a default
+    of `ON`. Disabling this option can substantially improve performance, but
+    images can differ across platforms and compilers.
+  * **Optimization:** Color quantization and packing for LDR RGB and RGBA has
+    been vectorized to improve performance.
+  * **Change:** Color quantization for LDR RGB and RGBA endpoints will now try
+    multiple quantization packing methods, and pick the one with the lowest
+    endpoint encoding error. This gives a minor image quality improvement, for
+    no significant performance impact when combined with the vectorization
+    optimizations.
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.4.0
+
+**Status:** March 2023
+
+The 4.4.0 release is a minor release with image quality improvements, a small
+performance boost, and a few new quality-of-life features.
+
+* **General:**
+  * **Change:** Core library no longer checks availability of required
+    instruction set extensions, such as SSE4.1 or AVX2. Checking compatibility
+    is now the responsibility of the caller. See `astcenccli_entry.cpp` for
+    an example of code performing this check.
+  * **Change:** Core library can be built as a shared object by setting the
+    `-DSHAREDLIB=ON` CMake option, resulting in e.g. `libastcenc-avx2-shared.so`.
+    Note that the command line tool is always statically linked.
+  * **Change:** Decompressed 3D images will now write one output file per
+    slice, if the target format is a 2D image format.
+  * **Change:** Command line errors print to stderr instead of stdout.
+  * **Change:** Color encoding uses new quantization tables, that now factor
+    in floating-point rounding if a distance tie is found when using the
+    integer quant256 value. This improves image quality for 4x4 and 5x5 block
+    sizes.
+  * **Optimization:** Partition selection uses a simplified line calculation
+    with a faster approximation. This improves performance for all block sizes.
+  * **Bug-fix:** Fixed missing symbol error in decompressor-only builds.
+  * **Bug-fix:** Fixed infinity handling in debug trace JSON files.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 4.3 release:**
+
+![Relative scores 4.4 vs 4.3](./ChangeLogImg/relative-4.3-to-4.4.png)
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.3.1
+
+**Status:** January 2023
+
+The 4.3.1 release is a minor maintenance release. No performance or image
+quality changes are expected.
+
+* **General:**
+  * **Bug-fix:** Fixed typo in `-2/3/4partitioncandidatelimit` CLI options.
+  * **Bug-fix:** Fixed handling for `-3/4partitionindexlimit` CLI options.
+  * **Bug-fix:** Updated to `stb_image.h` v2.28, which includes multiple fixes
+    and improvements for image loading.
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.3.0
+
+**Status:** January 2023
+
+The 4.3.0 release is an optimization release. There are minor performance
+and image quality improvements in this release.
+
+Reminder - the codec library API is not designed to be binary compatible across
+versions. We always recommend rebuilding your client-side code using the updated
+`astcenc.h` header.
+
+* **General:**
+  * **Bug-fix:** Use lower case `windows.h` include for MinGW compatibility.
+  * **Change:** The `-mask` command line option, `ASTCENC_FLG_MAP_MASK` in the
+    library API, has been removed.
+  * **Optimization:** Always skip blue-contraction for `QUANT_256` encodings.
+    This gives a small image quality improvement for the 4x4 block size.
+  * **Optimization:** Always skip RGBO vector calculation for LDR encodings.
+  * **Optimization:** Defer color packing and scrambling to physical layer.
+  * **Optimization:** Remove folded `decimation_info` lookup tables. This
+    significantly reduces compressor memory footprint and improves context
+    creation time. Impact increases with the active block size.
+  * **Optimization:** Increased trial and refinement pruning by using stricter
+    target errors when determining whether to skip iterations.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 4.2 release:**
+
+![Relative scores 4.3 vs 4.2](./ChangeLogImg/relative-4.2-to-4.3.png)
+
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.2.0
+
+**Status:** November 2022
+
+The 4.2.0 release is an optimization release. There are significant performance
+improvements, minor image quality improvements, and library interface changes in
+this release.
+
+Reminder - the codec library API is not designed to be binary compatible across
+versions. We always recommend rebuilding your client-side code using the updated
+`astcenc.h` header.
+
+* **General:**
+  * **Bug-fix:** Compression for RGB and RGBA base+offset encodings no
+    longer generate endpoints with the incorrect blue-contract behavior.
+  * **Bug-fix:** Lowest channel correlation calculation now correctly ignores
+    constant color channels for the purposes of filtering 2 plane encodings.
+    On average this improves both performance and image quality.
+  * **Bug-fix:** ISA compatibility now checked in `config_init()` as well as
+    in `context_alloc()`.
+  * **Change:** Removed the low-weight count optimization, as more recent
+    changes had significantly reduced its performance benefit. Option removed
+    from both command line and configuration structure.
+  * **Feature:** The `-exhaustive` mode now runs full trials on more
+    partitioning candidates and block candidates. This improves image quality
+    by 0.1 to 0.25 dB, but slows down compression by 3x. The `-verythorough`
+    and `-thorough` modes also test more candidates.
+  * **Feature:** A new preset, `-verythorough`, has been introduced to provide
+    a standard performance point between `-thorough` and the re-tuned
+    `-exhaustive` mode. This new mode is faster and higher quality than the
+    `-exhaustive` preset in the 4.1 release.
+  * **Feature:** The compressor can now independently vary the number of
+    partitionings considered for error estimation for 2/3/4 partitions. This
+    allows heuristics to put more effort into 2 partitions, and less in to
+    3/4 partitions.
+  * **Feature:** The compressor can now run trials on a variable number of
+    candidate partitionings, allowing high quality modes to explore more of the
+    search space at the expense of slower compression. The number of trials is
+    independently configurable for 2/3/4 partition cases.
+  * **Optimization:** Introduce early-out threshold for 2/3/4 partition
+    searches based on the results after 1 of 2 trials. This significantly
+    improves performance for `-medium` and `-thorough` searches, for a minor
+    loss in image quality.
+  * **Optimization:** Reduce early-out threshold for 3/4 partition searches
+    based on 2/3 partition results. This significantly improves performance,
+    especially for `-thorough` searches, for a minor loss in image quality.
+  * **Optimization:** Use direct vector compare to create a SIMD mask instead
+    of a scalar compare that is broadcast to a vector mask.
+  * **Optimization:** Remove obsolete partition validity masks from the
+    partition selection algorithm.
+  * **Optimization:** Removed obsolete channel scaling from partition
+    `avgs_and_dirs()` calculation.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 4.0 and 4.1 release:**
+
+![Relative scores 4.2 vs 4.0](./ChangeLogImg/relative-4.0-to-4.2.png)
+
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.1.0
+
+**Status:** August 2022
+
+The 4.1.0 release is a maintenance release. There is no performance or image
+quality change in this release.
+
+* **General:**
+  * **Change:** Command line decompressor no longer uses the legacy
+    `GL_LUMINANCE` or `GL_LUMINANCE_ALPHA` format enums when writing KTX
+    output files. Luminance textures now use the `GL_RED` format and
+    luminance_alpha textures now use the `GL_RG` format.
+  * **Change:** Command line tool gains a new `-dimage` option to generate
+    diagnostic images showing aspects of the compression encoding. The output
+    file name with its extension stripped is used as the stem of the diagnostic
+    image file names.
+  * **Bug-fix:** Library decompressor builds for SSE no longer use masked store
+    `maskmovdqu` instructions, as they can generate faults on masked lanes.
+  * **Bug-fix:** Command line decompressor now correctly uses sized type enums
+    for the internal format when writing output KTX files.
+  * **Bug-fix:** Command line compressor now correctly loads 16 and 32-bit per
+    component input KTX files.
+  * **Bug-fix:** Fixed GCC9 compiler warnings on Arm aarch64.
+
+<!-- ---------------------------------------------------------------------- -->
+## 4.0.0
+
+**Status:** July 2022
+
+The 4.0.0 release introduces some major performance enhancements, and a number
+of larger changes to the heuristics used in the codec to find a more effective
+cost:quality trade off.
+
+* **General:**
+  * **Change:** The `-array` option for specifying the number of image planes
+    for ASTC 3D volumetric block compression has been renamed to `-zdim`.
+  * **Change:** The build root package directory is now `bin` instead of
+    `astcenc`, allowing the CMake install step to write binaries into
+    `/usr/local/bin` if the user wishes to do so.
+  * **Feature:** A new `-ssw` option for specifying the shader sampling swizzle
+    has been added as a convenience alternative to the `-cw` option. This is
+    needed to correct error weighting during compression if not all components
+    are read in the shader. For example, to extract and compress two components
+    from an RGBA input image, weighting the two components equally when
+    sampling through .ra in the shader, use `-esw ggga -ssw ra`. In this
+    example `-ssw ra` is equivalent to the alternative `-cw 1 0 0 1` encoding.
+  * **Feature:** The `-a` alpha weighting option has been re-enabled in the
+    backend, and now again applies alpha scaling to the RGB error metrics when
+    encoding. This is based on the maximum alpha in each block, not the
+    individual texel alpha values used in the earlier implementation.
+  * **Feature:** The command line tool now has `-repeats <count>` for testing,
+    which will iterate around compression and decompression `count` times.
+    Reported performance metrics also now separate compression and
+    decompression scores.
+  * **Feature:** The core codec is now warning clean up to /W4 for both MSVC
+    `cl.exe` and `clangcl.exe` compilers.
+  * **Feature:** The core codec now supports arm64 for both MSVC `cl.exe` and
+    `clangcl.exe` compilers.
+  * **Feature:** `NO_INVARIANCE` builds will enable the `-ffp-contract=fast`
+    option for all targets when using Clang or GCC. In addition AVX2 targets
+    will also set the `-mfma` option. This reduces image quality by up to 0.2dB
+    (normally much less), but improves performance by up to 5-20%.
+  * **Optimization:** Angular endpoint min/max weight selection is restricted
+    to weight `QUANT_11` or lower. Higher quantization levels assume default
+    0-1 range, which is less accurate but much faster.
+  * **Optimization:** Maximum weight quantization for later trials is selected
+    based on the weight quantization of the best encoding from the 1 plane 1
+    partition trial. This significantly reduces the search space for the later
+    trials with more planes or partitions.
+  * **Optimization:** Small data tables now use in-register SIMD permutes
+    rather than gathers (AVX2) or unrolled scalar lookups (SSE/NEON). This can
+    be a significant optimization for paths that are load unit limited.
+  * **Optimization:** Decompressed image block writes in the decompressor now
+    use a vectorized approach to writing each row of texels in the block,
+    including the ability to exploit masked stores if the target supports them.
+  * **Optimization:** Weight scrambling has been moved into the physical layer;
+    the rest of the codec now uses linear order weights.
+  * **Optimization:** Weight packing has been moved into the physical layer;
+    the rest of the codec now uses unpacked weights in the 0-64 range.
+  * **Optimization:** Consistently vectorize the creation of unquantized weight
+    grids when they are needed.
+  * **Optimization:** Remove redundant per-decimation mode copies of endpoint
+    and weight structures, which were really read-only duplicates.
+  * **Optimization:** Early-out the same endpoint mode color calculation if it
+    cannot be applied.
+  * **Optimization:** Numerous type size reductions applied to arrays to reduce
+    both context working buffer size usage and stack usage.
+
+### Performance:
+
+Key for charts:
+
+* Color = block size (see legend).
+* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
+
+**Relative performance vs 3.7 release:**
+
+![Relative scores 4.0 vs 3.7](./ChangeLogImg/relative-3.7-to-4.0.png)
+
+
+- - -
+
+_Copyright © 2022-2024, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,105 @@
+# 5.x series change log
+
+This page summarizes the major functional and performance changes in each
+release of the 5.x series.
+
+All performance data on this page is measured on an Intel Core i5-9600K
+clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
+
+<!-- ---------------------------------------------------------------------- -->
+## 5.3.0
+
+**Status:** March 2025
+
+The 5.3.0 release is a minor maintenance release.
+
+* **General:**
+  * **Feature:** Reference C builds (`ASTCENC_ISA_NONE`) now support compiling
+    for big-endian CPUs. Compile with `-DASTCENC_BIG_ENDIAN=ON` when compiling
+    for a big-endian target; it is not auto-detected.
+  * **Improvement:** Builds using GCC now specify `-flto=auto` to allow
+    parallel link steps, and remove the log warnings about not setting a CPU
+    count parameter value.
+  * **Bug fix:** Builds using MSVC `cl.exe` that do not specify an explicit
+    ISA using the preprocessor configuration defines will now correctly
+    default to the SSE2 backend on x86-64 and the NEON backend on Arm64. Previously they would have defaulted to the reference C implementation,
+    which is around 3.25 times slower.
+
+
+<!-- ---------------------------------------------------------------------- -->
+## 5.2.0
+
+**Status:** February 2025
+
+The 5.2.0 release is a minor maintenance release.
+
+This release includes changes to the public interface in the `astcenc.h`
+header.  We always recommend rebuilding your client-side code using the
+header from the same release to avoid compatibility issues.
+
+* **General:**
+  * **Change:** Changed sRGB alpha channel endpoint expansion to match the
+    revised Khronos Data Format Specification (v1.4.0), which reverts an
+    unintended specification change. Compared to previous releases, this change
+    can cause LSB bit differences in the alpha channel of compressed images.
+  * **Feature:** Arm64 builds for Linux added to the GitHub Actions builds, and
+    Arm64 binaries for NEON, 128-bit SVE, and 256-bit SVE added to release
+    builds.
+  * **Feature:** Added a new codec API, `astcenc_compress_cancel()`, which can
+    be used to cancel an in-flight compression. This is designed to help make
+    it easier to integrate the codec into an interactive user interface that
+    can respond to user events with low latency.
+  * **Bug fix:** Removed incorrect `static` variable qualifier, which could
+    result in an incorrect `tune_mse_overshoot` heuristic threshold being used
+    if a user ran multiple concurrent compressions with different settings.
+
+<!-- ---------------------------------------------------------------------- -->
+## 5.1.0
+
+**Status:** November 2024
+
+The 5.1.0 release is an optimization release, giving moderate performance
+improvements on all platforms. There are no image quality differences.
+
+* **General:**
+  * **Feature:** Added a new CMake build option to control use of native
+    gathers, as they can be slower than scalar loads on some common x86
+    microarchitectures. Build with `-DASTCENC_X86_GATHERS=OFF` to disable use
+    of native gathers in AVX2 builds.
+  * **Optimization:** Added new `gather()` abstraction for gathers using byte
+    indices, allowing implementations without gather hardware to skip the
+    byte-to-int index conversion.
+  * **Optimization:** Optimized `compute_lowest_and_highest_weight()` to
+    pre-compute min/max outside of the main loop.
+  * **Optimization:** Added improved intrinsics sequence for SSE and AVX2
+    integer `hmin()` and `hmax()`.
+  * **Optimization:** Added improved intrinsics sequence for `vint4(uint8_t*)`
+    on systems implementing Arm SVE.
+
+<!-- ---------------------------------------------------------------------- -->
+## 5.0.0
+
+**Status:** November 2024
+
+The 5.0.0 release is the first stable release in the 5.x series. The main new
+feature is support for the Arm Scalable Vector Extensions (SVE) SIMD instruction
+set.
+
+* **General:**
+  * **Bug fix:** Fixed incorrect return type in "None" vector library
+    reference implementation.
+  * **Bug fix:** Fixed sincos table index under/overflow.
+  * **Feature:** Changed `ASTCENC_ISA_NATIVE` builds to use `-march=native` and
+    `-mcpu=native`.
+  * **Feature:** Added backend for Arm SVE fixed-width 256-bit builds. These
+    can only run on hardware implementing 256-bit SVE.
+  * **Feature:** Added backend for Arm SVE 128-bit builds. These are portable
+    builds and can run on hardware implementing any SVE vector length, but the
+    explicit SVE use is augmented NEON and will only use the bottom 128-bits of
+    each SVE vector.
+  * **Feature:** Optimized NEON mask `any()` and `all()` functions.
+  * **Feature:** Migrated build and test to GitHub Actions pipelines.
+
+- - -
+
+_Copyright © 2022-2025, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,235 @@
+# Effective ASTC Encoding
+
+Most texture compression schemes encode a single color format at a single
+bitrate, so there are relatively few configuration options available to content
+creators beyond selecting which compressed format to use.
+
+ASTC on the other hand is an extremely flexible container format which can
+compress multiple color formats at multiple bit rates. Inevitably this
+flexibility gives rise to questions about how to best use ASTC to encode a
+specific color format, or what the equivalent settings are to get a close
+match to another compression format.
+
+This page aims to give some guidelines, but note that they are only guidelines
+and are not exhaustive so please deviate from them as needed.
+
+## Traditional format reference
+
+The most commonly used non-ASTC compressed formats, their color format, and
+their compressed bitrate are shown in the table below.
+
+| Name     | Color Format | Bits/Pixel | Notes            |
+| -------- | ------------ | ---------- | ---------------- |
+| BC1      | RGB+A        | 4          | RGB565 + 1-bit A |
+| BC3      | RGB+A        | 8          | BC1 RGB + BC4 A  |
+| BC3nm    | G+R          | 8          | BC1 G   + BC4 R  |
+| BC4      | R            | 4          | L8               |
+| BC5      | R+G          | 8          | BC4 R + BC4 G    |
+| BC6H     | RGB (HDR)    | 8          |                  |
+| BC7      | RGB / RGBA   | 8          |                  |
+| EAC_R11  | R            | 4          | R11              |
+| EAC_RG11 | RG           | 8          | RG11             |
+| ETC1     | RGB          | 4          | RGB565           |
+| ETC2     | RGB+A        | 4          | RGB565 + 1-bit A |
+| ETC2+EAC | RGB+A        | 8          | RGB565 + EAC A   |
+| PVRTC    | RGBA         | 2 or 4     |                  |
+
+**Note:** BC2 (RGB+A) is not included in the table because it's rarely used in
+practice due to poor quality alpha encoding; BC3 is nearly always used instead.
+
+**Note:** Color representations shown with a `+` symbol indicate non-correlated
+compression groups; e.g. an `RGB + A` format compresses `RGB` and `A`
+independently and does not assume the two signals are correlated. This can be
+a strength (it improves quality when compressing non-correlated signals), but
+also a weakness (it reduces quality when compressing correlated signals).
+
+# ASTC Format Mapping
+
+The main question which arises with the mapping of another format on to ASTC
+is how to handle cases where the input isn't a 4 component RGBA input. ASTC is
+a container format which always decompresses in to a 4 component RGBA result.
+However, the internal compressed representation is very flexible and can store
+1-4 components as needed on a per-block basis.
+
+To get the best quality for a given bitrate, or the lowest bitrate for a given
+quality, it is important that as few components as possible are stored in the
+internal representation to avoid wasting coding space.
+
+Specific optimizations in the ASTC coding scheme exist for:
+
+* Encoding the RGB components as a single luminance component, so only a single
+  value needs to be stored in the coding instead of three.
+* Encoding the A component as a constant 1.0 value, so the coding doesn't
+  actually need to store a per-pixel alpha value at all.
+
+... so mapping your inputs given to the compressor to hit these paths is
+really important if you want to get the best output quality for your chosen
+bitrate.
+
+## Encoding 1-4 component data
+
+The table below shows the recommended component usage for data with different
+numbers of color components present in the data.
+
+The coding swizzle should be applied when compressing an image. This can be
+handled by the compressor when reading an uncompressed input image by
+specifying the swizzle using the `-esw` command line option.
+
+The sampling swizzle is what you should use in your shader programs to read
+the data from the compressed texture, assuming no additional API-level
+component swizzling is specified by the application.
+
+| Input components |  ASTC Endpoint | Coding Swizzle | Sampling Swizzle   |
+| -------------- |  ------------- | -------------- | ------------------ |
+| 1              |  L + 1         | `rrr1`         | `.g` <sup>1</sup>  |
+| 2              |  L + A         | `rrrg`         | `.ga` <sup>1</sup> |
+| 3              |  RGB + 1       | `rgb1`         | `.rgb`             |
+| 4              |  RGB + A       | `rgba`         | `.rgba`            |
+
+**1:** Sampling from `g` is preferred to sampling from `r` because it allows a
+single shader to be compatible with ASTC, BC1, or ETC formats. BC1 and ETC1
+store color endpoints as RGB565 data, so the `g` component will have higher
+precision. For ASTC it doesn't actually make any difference; the same single
+component luminance will be returned for all three of the `.rgb` components.
+
+## Equivalence with other formats
+
+Based on these component encoding requirements we can now derive the ASTC
+coding equivalents for most of the other texture compression formats in common
+use today.
+
+| Format   | ASTC Coding Swizzle | ASTC Sampling Swizzle | Notes            |
+| -------- | ------------------- | --------------------- | ---------------- |
+| BC1      | `rgba` <sup>1</sup> | `.rgba`               |                  |
+| BC3      | `rgba`              | `.rgba`               |                  |
+| BC3nm    | `gggr`              | `.ag`                 |                  |
+| BC4      | `rrr1`              | `.r`                  |                  |
+| BC5      | `rrrg`              | `.ra` <sup>2</sup>    |                  |
+| BC6H     | `rgb1`              | `.rgb` <sup>3</sup>   | HDR profile only |
+| BC7      | `rgba`              | `.rgba`               |                  |
+| EAC_R11  | `rrr1`              | `.r`                  |                  |
+| EAC_RG11 | `rrrg`              | `.ra` <sup>2</sup>    |                  |
+| ETC1     | `rgb1`              | `.rgb`                |                  |
+| ETC2     | `rgba` <sup>1</sup> | `.rgba`               |                  |
+| ETC2+EAC | `rgba`              | `.rgba`               |                  |
+| PVRTC    | `rgba`              | `.rgba`               |                  |
+
+**1:** ASTC has no equivalent of the 1-bit punch-through alpha encoding
+supported by BC1 or ETC2; if alpha is present it will be a full alpha
+component.
+
+**2:** ASTC relies on using the L+A color endpoint type for coding efficiency
+for two component data. It therefore has no direct equivalent of a two-plane
+format sampled through the `.rg` components such as BC5 or EAC_RG11. This can
+be emulated by setting texture component swizzles in the runtime API - e.g. via
+`glTexParameteri()` for OpenGL ES - although it has been noted that API
+controlled swizzles are not available in WebGL.
+
+**3:** ASTC can only store unsigned values, and has no equivalent of the BC6
+signed endpoint mode.
+
+# Other Considerations
+
+This section outlines some of the other things to consider when encoding
+textures using ASTC.
+
+## Decode mode extensions
+
+ASTC is specified to decompress into a 16-bit per component RGBA output by
+default, with the exception of the sRGB format which uses an 8-bit value for the
+RGB components.
+
+Decompressing in to a 16-bit per component output format often gives higher
+precision than many use cases require, especially for LDR textures which
+originally came from an 8-bit per component source image. Most implementations
+of ASTC support the decode mode extensions, which allow an application to
+opt-in to a lower precision decompressed format (RGBA8 for LDR, RGB9E5 for
+HDR). Using these extensions can improve GPU texture cache efficiency, and even
+improve texture filtering throughput, for use cases that do not need the higher
+precision.
+
+The ASTC format uses different data rounding rules when the decode mode
+extensions are used. To ensure that the compressor chooses the best encodings
+for the RGBA8 rounding rules, you can specify `-decode_unorm8` when compressing
+textures that will be decompressed into the RGBA8 intermediate. This gives a
+small image quality boost.
+
+**Note:** This mode is automatically enabled if you use the `astcenc`
+decompressor to write an 8-bit per component output image.
+
+## Encoding non-correlated components
+
+Most other texture compression formats have a static component assignment in
+terms of the expected data correlation. For example, ETC2+EAC assumes that RGB
+are always correlated and that alpha is non-correlated. ASTC can automatically
+encode data as either fully correlated across all 4 components, or with any one
+component assigned to a separate non-correlated partition to the other three.
+
+The non-correlated component can be changed on a block-by-block basis, so the
+compressor can dynamically adjust the coding based on the data present in the
+image. This means that there is no need for non-correlated data to be stored
+in a specific component in the input image.
+
+It is however worth noting that the alpha component is treated differently to
+the RGB color components in some circumstances:
+
+* When coding for sRGB the alpha component will always be stored in linear
+  space.
+* When coding for HDR the alpha component can optionally be kept as LDR data.
+
+## Encoding normal maps
+
+The best way to store normal maps using ASTC is similar to the scheme used by
+BC5; store the X and Y components of a unit-length normal. The Z component of
+the normal can be reconstructed in shader code based on the knowledge that the
+vector is unit length.
+
+To encode this we need to store only two input components in the compressed
+data, and therefore use the `rrrg` coding swizzle to align the data with the
+ASTC luminance+alpha endpoint. We can sample this in shader code using the
+`.ga` sampling swizzle, and reconstruct the Z value with:
+
+    vec3 nml;
+    nml.xy = texture(...).ga;                // Load normals (range 0 to 1)
+    nml.xy = nml.xy * 2.0 - 1.0;             // Unpack normals (range -1 to +1)
+    nml.z = sqrt(1.0 - dot(nml.xy, nml.xy)); // Compute Z, given unit length
+
+The encoding swizzle and appropriate component weighting is enabled by using
+the `-normal` command line option. If you wish to use a different pair of
+components you can specify a custom swizzle after setting the `-normal`
+parameter. For example, to match BC5n component ordering use
+`-normal -esw gggr` for compression and `-normal -dsw arz1` for decompression.
+
+## Encoding sRGB data
+
+The ASTC LDR profile can compress sRGB encoded color, which is a more
+efficient use of bits than storing linear encoded color because the gamma
+corrected value distribution more closely matches human perception of
+luminance.
+
+For color data it is nearly always a perceptual quality win to use sRGB input
+source textures that are then compressed using the ASTC sRGB compression mode
+(compress using the `-cs` command line option rather than the `-cl` command
+line option). Note that sRGB gamma correction is only applied to the RGB
+components during decode; the alpha component is always treated as linear
+encoded data.
+
+*Important:* The uncompressed input texture provided on the command line must
+be stored in the sRGB color space for `-cs` to function correctly.
+
+## Encoding HDR data
+
+HDR data can be encoded just like LDR data, but with some caveats around
+handling the alpha component.
+
+For many use cases the alpha component is an actual alpha opacity component and
+is therefore used for storing an LDR value between 0 and 1. For these cases use
+the `-ch` compressor option which will treat the RGB components as HDR, but the
+A component as LDR.
+
+For other use cases the alpha component is simply a fourth data component which
+is also storing an HDR value. For these cases use the `-cH` compressor option
+which will treat all components as HDR data.
+
+- - -
+
+_Copyright © 2019-2024, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,71 @@
+# The .astc File Format
+
+The default file format for compressed textures generated by `astcenc`, as well
+as from many other ASTC compressors, is the `.astc` format. This is a very
+simple format consisting of a small header followed immediately by the binary
+payload for a single image surface.
+
+Header
+======
+
+The header is a fixed 16 byte structure, defined as storing only bytes to avoid
+any endianness issues or padding overhead.
+
+```
+struct astc_header
+{
+    uint8_t magic[4];
+    uint8_t block_x;
+    uint8_t block_y;
+    uint8_t block_z;
+    uint8_t dim_x[3];
+    uint8_t dim_y[3];
+    uint8_t dim_z[3];
+};
+```
+
+Magic number
+------------
+
+The 4 byte magic number at the start of the file acts as a format identifier.
+
+```
+    magic[0] = 0x13;
+    magic[1] = 0xAB;
+    magic[2] = 0xA1;
+    magic[3] = 0x5C;
+```
+
+Block size
+----------
+
+The `block_*` fields store the ASTC block dimensions in texels. For 2D images
+the Z dimension must be set to 1.
+
+Image dimensions
+----------------
+
+The `dim_*` fields store the image dimensions in texels.  For 2D images the
+Z dimension must be set to 1.
+
+Note that the image is not required to be an exact multiple of the compressed
+block size; the compressed data may include padding that is discarded during
+decompression.
+
+Each dimension is a 24 bit unsigned value that is reconstructed from the stored
+byte values as:
+
+```
+decoded_dim = dim[0] + (dim[1] << 8) + (dim[2] << 16);
+```
+
+Binary payload
+==============
+
+The binary payload is a byte stream that immediately follows the header. It
+contains 16 bytes per compressed block. The number of compressed blocks is
+determined from the header information.
+
+- - -
+
+_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,488 @@
+# ASTC Format Overview
+
+Adaptive Scalable Texture Compression (ASTC) is an advanced lossy texture
+compression technology developed by Arm and AMD. It has been adopted as an
+official Khronos extension to the OpenGL and OpenGL ES APIs, and as a standard
+optional feature for the Vulkan API.
+
+ASTC offers a number of advantages over earlier texture compression formats:
+
+* **Format flexibility:** ASTC supports compressing between 1 and 4 channels of
+  data, including support for one non-correlated channel such as RGB+A
+  (correlated RGB, non-correlated alpha).
+* **Bit rate flexibility:** ASTC supports compressing images with a fine
+  grained choice of bit rates between 0.89 and 8 bits per texel (bpt). The bit
+  rate choice is independent of the color format choice.
+* **Advanced format support:** ASTC supports compressing images in either low
+  dynamic range (LDR), LDR sRGB, or high dynamic range (HDR) color spaces, as
+  well as support for compressing 3D volumetric textures.
+* **Improved image quality:** Despite the high degree of format flexibility,
+  ASTC manages to beat nearly all legacy texture compression formats -- such as
+  ETC2, PVRTC, and the BC formats -- on image quality at equivalent bit
+  rates.
+
+This article explores the ASTC format, and how it manages to generate the
+flexibility and quality improvements that it achieves.
+
+
+Why ASTC?
+=========
+
+Before the creation of ASTC, the format and bit rate coverage of the available
+formats was very sparse:
+
+![Legacy texture compression formats and bit rates](./FormatOverviewImg/coverage-legacy.svg)
+
+In reality the situation is even worse than this diagram shows, as many of
+these formats are proprietary or simply not available on some operating
+systems, so any single platform will have very limited compression choices.
+
+For developers this situation makes developing content which is portable across
+multiple platforms a tricky proposition. It's almost certain that differently
+compressed assets will be needed for different platforms. Each asset pack would
+likely then need to use different levels of compression, and may even have to
+fall back to no compression for some assets on some platforms, which leaves
+either some image quality or some memory bandwidth efficiency untapped.
+
+It was clear a better way was needed, so the Khronos Group asked members to
+submit proposals for a new compression algorithm to be adopted in the same
+manner that the earlier ETC algorithm was adopted for OpenGL ES. ASTC was the
+result of this, and has been adopted as an official algorithm for OpenGL,
+OpenGL ES, and Vulkan.
+
+
+Format overview
+===============
+
+Given the fragmentation issues with the existing compression formats, it should
+be no surprise that the high level design objectives for ASTC were to have
+something which could be used across the whole range of art assets found in
+modern content, and which allows artists to have more control over the quality
+to bit rate tradeoff.
+
+There are quite a few technical components which make up the ASTC format, so
+before we dive into detail it will be useful to give an overview of how ASTC
+works at a higher level.
+
+
+Block compression
+-----------------
+
+Compression formats for real-time graphics need the ability to quickly and
+efficiently make random samples into a texture. This places two technical
+requirements on any compression format:
+
+* It must be possible to compute the address of data in memory given only a
+  sample coordinate.
+* It must be possible to decompress random samples without decompressing too
+  much surrounding data.
+
+The standard solution for this used by all contemporary real-time formats,
+including ASTC, is to divide the image into fixed-size blocks of texels, each
+of which is compressed into a fixed number of output bits. This feature makes
+it possible to access texels quickly, in any order, and with a well-bounded
+decompression cost.
+
+The 2D block footprints in ASTC range from 4x4 texels up to 12x12 texels, which
+all compress into 128-bit output blocks. By dividing 128 bits by the number of
+texels in the footprint, we derive the format bit rates which range from 8 bpt
+(`128/(4*4)`) down to 0.89 bpt (`128/(12*12)`).
+
+
+Color encoding
+--------------
+
+ASTC uses gradients to assign the color values of each texel. Each compressed
+block stores the end-point colors for a gradient, and an interpolation weight
+for each texel which defines the texel's location along that gradient. During
+decompression the color value for each texel is generated by interpolating
+between the two end-point colors, based on the per-texel weight.
+
+![One partition gradient storage](./FormatOverviewImg/gradient-1p.svg)
+
+In many cases a block will contain a complex distribution of colors, for
+example a red ball sitting on green grass. In these scenarios a single color
+gradient will not be able to accurately represent all of the texels' values. To
+support this ASTC allows a block to define up to four distinct color gradients,
+known as partitions, and can assign each texel to a single partition. For our
+example we require two partitions, one for our ball texels and one for our
+grass texels.
+
+![Two partition gradient storage](./FormatOverviewImg/gradient-2p.svg)
+
+Now that you know the high level operation of the format, we can dive into more
+detail.
+
+
+Integer encoding
+================
+
+Initially the idea of fractional bits per texel sounds implausible, or even
+impossible, because we're so used to storing numbers as a whole number of bits.
+However, it's not quite as strange as it sounds. ASTC uses an encoding
+technique called Bounded Integer Sequence Encoding (BISE), which makes heavy
+use of storing numbers with a fractional number of bits to pack information
+more efficiently.
+
+
+Storing alphabets
+-----------------
+
+Even though color and weight values per texel are notionally floating-point
+values, we have far too few bits available to directly store the actual values,
+so they must be quantized during compression to reduce the storage size. For
+example, if we have a floating-point weight for each texel in the range 0.0 to
+1.0 we could choose to quantize it to five values - 0.0, 0.25, 0.5, 0.75, and
+1.0 - which we can then represent in storage using the integer values 0 to 4.
+
+In the general case we need to be able to efficiently store characters of an
+alphabet containing N symbols if we choose to quantize to N levels. An N symbol
+alphabet contains `log2(N)` bits of information per character. If we have an
+alphabet of 5 possible symbols then each character contains ~2.32 bits of
+information, but simple binary storage would require us to round up to 3 bits.
+This wastes 22.6% of our storage capacity. The chart below shows the percentage
+of our bit-space wasted when using simple binary encoding to store an arbitrary
+N symbol alphabet:
+
+![Binary encoding efficiency](./FormatOverviewImg/binary.png)
+
+... which shows for most alphabet sizes we waste a lot of our storage capacity
+when using an integer number of bits per character. Efficiency is of critical
+importance to a compression format, so this is something we needed to be able
+to improve.
+
+**Note:** We could have chosen to round-up the quantization level to the next
+power of two, and at least use the bits we're spending. However, this forces
+the encoder to spend bits which could be used elsewhere for a bigger benefit,
+so it will reduce image quality and is a sub-optimal solution.
+
+
+Quints
+------
+
+Instead of rounding up a 5 symbol alphabet - called a "quint" in BISE - to
+three bits, we could choose to instead pack three quint characters together.
+Three characters in a 5-symbol alphabet have 5<sup>3</sup> (125) combinations,
+and contain 6.97 bits of information. We can store this in 7 bits and have a
+storage waste of only 0.5%.
+
+
+Trits
+-----
+
+We can similarly construct a 3-symbol alphabet - called a "trit" in BISE - and
+pack trit characters in groups of five. Each character group has 3<sup>5</sup>
+(243) combinations, and contains 7.92 bits of information. We can store this in
+8 bits and have a storage waste of only 1%.
+
+
+BISE
+----
+
+The BISE encoding used by ASTC allows storage of character sequences using
+arbitrary alphabets of up to 256 symbols, encoding each alphabet size in the
+most space-efficient choice of bits, trits, and quints.
+
+* Alphabets with up to (2<sup>n</sup> - 1) symbols can be encoded using n bits
+  per character.
+* Alphabets with up to (3 * 2<sup>n</sup> - 1) symbols can be encoded using n
+  bits (m) and a trit (t) per character, and reconstructed using the equation
+  (t * 2<sup>n</sup> + m).
+* Alphabets with up to (5 * 2<sup>n</sup> - 1) symbols can be encoded using n
+  bits (m) and a quint (q) per character, and reconstructed using the equation
+  (q * 2<sup>n</sup> + m).
+
+When the number of characters in a sequence is not a multiple of three or five
+we need to avoid wasting storage at the end of the sequence, so we add another
+constraint on the encoding. If the last few values in the sequence to encode
+are zero, the last few bits in the encoded bit string must also be zero.
+Ideally, the number of non-zero bits should be easily calculated and not depend
+on the magnitudes of the previous encoded values. This is a little tricky to
+arrange during compression, but it is possible. This means that we do not need
+to store any padding after the end of the bit sequence, as we can safely assume
+that they are zero bits.
+
+With this constraint in place - and by some smart packing of the bits, trits,
+and quints - BISE encodes a string of S characters in an N symbol alphabet
+using a fixed number of bits:
+
+* S values up to (2<sup>n</sup> - 1) use (nS) bits.
+* S values up to (3 * 2<sup>n</sup> - 1) use (nS + ceil(8S / 5)) bits.
+* S values up to (5 * 2<sup>n</sup> - 1) use (nS + ceil(7S / 3)) bits.
+
+... and the compressor will choose the one of these which produces the smallest
+storage for the alphabet size being stored; some will use binary, some will use
+bits and a trit, and some will use bits and a quint. If we compare the storage
+efficiency of BISE against simple binary for the range of possible alphabet
+sizes we might want to encode we can see that it is much more efficient.
+
+![BISE encoding efficiency](./FormatOverviewImg/bise.png)
+
+
+Block sizes
+===========
+
+ASTC always compresses blocks of texels into 128-bit outputs, but allows the
+developer to select from a range of block sizes to enable a fine-grained
+tradeoff between image quality and size.
+
+| Block footprint | Bits/texel |     | Block footprint | Bits/texel |
+| --------------- | ---------- | --- | --------------- | ---------- |
+|             4x4 |       8.00 |     |            10x5 |       2.56 |
+|             5x4 |       6.40 |     |            10x6 |       2.13 |
+|             5x5 |       5.12 |     |             8x8 |       2.00 |
+|             6x5 |       4.27 |     |            10x8 |       1.60 |
+|             6x6 |       3.56 |     |           10x10 |       1.28 |
+|             8x5 |       3.20 |     |           12x10 |       1.07 |
+|             8x6 |       2.67 |     |           12x12 |       0.89 |
+
+
+
+Color endpoints
+===============
+
+The color data for a block is encoded as a gradient between two color
+endpoints, with each texel selecting a position along that gradient which is
+then interpolated during decompression. ASTC supports 16 color endpoint
+encoding schemes, known as "endpoint modes". Options for endpoint modes
+include:
+
+* Varying the number of color channels: e.g. luminance, luminance + alpha, rgb,
+  and rgba.
+* Varying the encoding method: e.g. direct, base+offset, base+scale,
+  quantization level.
+* Varying the data range: e.g. low dynamic range, or high dynamic range
+
+The endpoint modes, and the endpoint color BISE quantization level, can be
+chosen on a per-block basis.
+
+
+Color partitions
+================
+
+Colors within a block are often complex, and cannot be accurately captured by a
+single color gradient, as discussed earlier with our example of a red ball
+lying on green grass. ASTC allows up to four color gradients - known as
+"partitions" - to be assigned to a single block. Each texel is then assigned to
+a single partition for the purposes of decompression.
+
+Rather than directly storing the partition assignment for each texel, which
+would need a lot of decompressor hardware to store it for all block sizes, we
+generate it procedurally. Each block only needs to store the partition index -
+which is the seed for the procedural generator - and the per texel assignment
+can then be generated on-the-fly during decompression. The image below shows
+the generated texel assignments for two (top), three (middle), and four
+(bottom) partitions for the 8x8 block size.
+
+![ASTC partition table](./FormatOverviewImg/hash.png)
+
+The number of partitions and the partition index can be chosen on a per-block
+basis, and a different color endpoint mode can be chosen per partition.
+
+**Note:** ASTC uses a 10-bit seed to drive the partition assignments. The hash
+used will introduce horizontal bias in a third of the partitions, vertical bias
+in a third, and no bias in the rest. As they are procedurally generated not all
+of the partitions are useful, in particular with the smaller block sizes.
+
+* Many partitions are duplicates.
+* Many partitions are degenerate (an N partition hash results in at least one
+  partition assignment that contains no texels).
+
+
+Texel weights
+=============
+
+Each texel requires a weight, which defines the relative contribution of each
+color endpoint when interpolating the color gradient.
+
+For smaller block sizes we can choose to store the weight directly, with one
+weight per texel, but for the larger block sizes we simply do not have enough
+bits of storage to do this. To work around this ASTC allows the weight grid to
+be stored at a lower resolution than the texel grid. The per-texel weights are
+interpolated from the stored weight grid during decompression using a bilinear
+interpolation.
+
+The number of texel weights, and the weight value BISE quantization level, can
+be chosen on a per-block basis.
+
+
+Dual-plane weights
+------------------
+
+Using a single weight for all color channels works well when there is good
+correlation across the channels, but this is not always the case. Common
+examples where we would expect to get low correlation at least some of the time
+are textures storing RGBA data - alpha masks are not usually closely
+correlated with the color value - or normal data - the X and Y normal values
+often change independently.
+
+ASTC allows a dual-plane mode, which uses two separate weight grids for each
+texel. A single channel can be assigned to a second plane of weights, while
+the other three use the first plane of weights.
+
+The use of dual-plane mode can be chosen on a per-block basis, but its use
+prevents the use of four color partitions as we do not have enough bits to
+concurrently store both an extra plane of weights and an extra set of color
+endpoints.
+
+
+End results
+===========
+
+So, if we pull all of this together what do we end up with?
+
+
+Adaptive
+--------
+
+The first word in the name of ASTC is "adaptive", and it should now hopefully
+be clear why. Each block always compresses into 128 bits of storage, but the
+developer can choose from a wide range of texel block sizes and the compressor
+gets a huge amount of latitude to determine how those 128 bits are used.
+
+The compressor can trade off the number of bits assigned to colors (number of
+partitions, endpoint mode, and stored quantization level) and weights (number
+of weights per block, use of dual-plane, and stored quantization level) on a
+per-block basis to get the best image quality possible.
+
+![ASTC compressed parrot at various bit rates](./FormatOverviewImg/astc-quality.png)
+
+
+Format support
+--------------
+
+The compression scheme used by ASTC effectively compresses arbitrary sequences
+of floating point numbers, with a flexible number of channels, across any of
+the supported block sizes. There is no real notion of "color format" in the
+format itself at all, beyond the color endpoint mode selection, although a
+sensible compressor will want to use some format-specific heuristics to drive
+an efficient state-space search.
+
+The orthogonal encoding design allows ASTC to provide almost complete coverage
+of our desirable format matrix from earlier, across a wide range of bit rates:
+
+![ASTC 2D formats and bit rates](./FormatOverviewImg/coverage-astc.svg)
+
+The only significant omission is the absence of a dedicated two channel
+encoding for HDR textures. We simply ran out of entries in the space we had for
+encoding color endpoint modes, and this one didn't make the cut.
+
+The flexibility allowed by ASTC ticks the requirement that almost any asset can
+be compressed to some degree, at an appropriate bitrate for its quality needs.
+This is a powerful enabler for a compression format, because it puts control in
+the hands of content creators and not arbitrary format restrictions.
+
+
+Image quality
+-------------
+
+The normal expectation would be that this level of format flexibility would
+come at a cost of image quality; it has to cost something, right? Luckily this
+isn't true. The high packing efficiency allowed by BISE encoding, and the
+ability to dynamically choose where to spend encoding space on a per-block
+basis, means that an ASTC compressor is not forced to spend bits on things that
+don't help image quality.
+
+This gives some significant improvements in image quality compared to the older
+texture formats, even though ASTC also handles a much wider range of options.
+
+* ASTC at 2 bpt outperforms PVRTC at 2 bpt by ~2.0dB.
+* ASTC at 3.56 bpt outperforms PVRTC and BC1 at 4 bpt by ~1.5dB, and ETC2 by
+  ~0.7dB, despite a 10% bit rate disadvantage.
+* ASTC at 8 bpt for LDR formats is comparable in quality to BC7 at 8 bpt.
+* ASTC at 8 bpt for HDR formats is comparable in quality to BC6H at 8 bpt.
+
+Differences as small as 0.25dB are visible to the human eye, and remember that
+dB uses a logarithmic scale, so these are significant image quality
+improvements.
+
+
+3D compression
+--------------
+
+One of the nice bonus features of ASTC is that the techniques which underpin
+the format generalize to compressing volumetric texture data without needing
+very much additional decompression hardware.
+
+ASTC is therefore also able to optionally support compression of 3D textures,
+which is a unique feature not found in any earlier format, at the following
+bit rates:
+
+| Block footprint | Bits/texel |     | Block footprint | Bits/texel |
+| --------------- | ---------- | --- | --------------- | ---------- |
+|           3x3x3 |       4.74 |     |           5x5x4 |       1.28 |
+|           4x3x3 |       3.56 |     |           5x5x5 |       1.02 |
+|           4x4x3 |       2.67 |     |           6x5x5 |       0.85 |
+|           4x4x4 |       2.00 |     |           6x6x5 |       0.71 |
+|           5x4x4 |       1.60 |     |           6x6x6 |       0.59 |
+
+
+Availability
+============
+
+The ASTC functionality is specified as a set of feature profiles, allowing
+GPU hardware manufacturers to select which parts of the standard they
+implement. There are four commonly seen profiles:
+
+* "LDR":
+    * 2D blocks.
+    * LDR and sRGB color space.
+    * [KHR_texture_compression_astc_ldr][astc_ldr]: KHR OpenGL ES extension.
+* "LDR + Sliced 3D":
+    * 2D blocks and sliced 3D blocks.
+    * LDR and sRGB color space.
+    * [KHR_texture_compression_astc_sliced_3d][astc_3d]: KHR OpenGL ES extension.
+* "HDR":
+    * 2D and sliced 3D blocks.
+    * LDR, sRGB, and HDR color spaces.
+    * [KHR_texture_compression_astc_hdr][astc_ldr]: KHR OpenGL ES extension.
+* "Full":
+    * 2D, sliced 3D, and volumetric 3D blocks.
+    * LDR, sRGB, and HDR color spaces.
+    * [OES_texture_compression_astc][astc_full]: OES OpenGL ES extension.
+
+The LDR profile is mandatory in OpenGL ES 3.2 and a standardized optional
+feature for Vulkan, and therefore widely supported on contemporary mobile
+devices. The 2D HDR profile is not mandatory, but is widely supported.
+
+3D texturing
+------------
+
+The APIs expose 3D textures in two flavors.
+
+The sliced 3D texture support builds a 3D texture from an array of 2D image
+slices that have each been individually compressed using 2D ASTC compression.
+This is required for the HDR profile, so is also widely supported.
+
+The volumetric 3D texture support uses the native 3D block sizes provided by
+ASTC to implement true volumetric compression. This enables a wider choice of
+low bitrate options than the 2D blocks, which is particularly important for 3D
+textures of any non-trivial size. Volumetric formats are not widely supported,
+but are supported on all of the Arm Mali GPUs that support ASTC.
+
+ASTC decode mode
+----------------
+
+ASTC is specified to decompress texels into fp16 intermediate values, except
+for sRGB which always decompresses into 8-bit UNORM intermediates. For many use
+cases this gives more dynamic range and precision than required. This can cause
+a reduction in both texture cache efficiency and texture filtering performance
+due to the larger decompressed data size.
+
+A pair of extensions exist, and are widely supported on recent mobile GPUs,
+which allow applications to reduce the intermediate precision to either UNORM8
+(recommended for LDR textures) or RGB9e5 (recommended for HDR textures).
+
+* [EXT_texture_compression_astc_decode_mode][astc_decode]: Allow UNORM8
+  intermediates
+* [EXT_texture_compression_astc_decode_mode_rgb9e5][astc_decode]: Allow RGB9e5
+  intermediates
+
+[astc_ldr]: https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_texture_compression_astc_hdr.txt
+[astc_3d]: https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_texture_compression_astc_sliced_3d.txt
+[astc_full]: https://www.khronos.org/registry/OpenGL/extensions/OES/OES_texture_compression_astc.txt
+[astc_decode]: https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_texture_compression_astc_decode_mode.txt
+
+- - -
+
+_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,79 @@
+# Terminology for the ASTC Encoder
+
+Like most software, the `astcenc` code base has a set of naming conventions
+for variables which are used to ensure both accuracy and reasonable brevity.
+
+:construction: These conventions are being used for new patches, so new code
+will conform to this, but older code is still being cleaned up to follow
+these conventions.
+
+## Counts
+
+For counts of things prefer `<x>_count` rather than `<x>s`. For example:
+
+* `plane_count`
+* `weight_count`
+* `texel_count`
+
+Where possible aim for descriptive loop variables, as these are more literate
+than simple `i` or `j` variables. For example:
+
+* `plane_index`
+* `weight_index`
+* `texel_index`
+
+## Ideal, Unpacked Quantized, vs Packed Quantized
+
+Variables that are quantized, such as endpoint colors and weights, have
+multiple states depending on how they are being used.
+
+**Ideal values** represent arbitrary numeric values that can take any value.
+These are often used during compression to work out the best value before
+any quantization is applied. For example, integer weights in the 0-64 range can
+take any of the 65 values available.
+
+**Quant uvalues** represent the unpacked numeric value after any quantization
+rounding has been applied. These are often used during compression to work out
+the error for the quantized value compared to the ideal value. For example,
+`QUANT_3` weights in the 0-64 range can only take one of `[0, 32, 64]`.
+
+**Quant pvalues** represent the packed numeric value in the quantized alphabet.
+This is what ends up encoded in the ASTC data, although note that the encoded
+ordering is scrambled to simplify hardware. For example, `QUANT_3` weights
+originally in the 0-64 range can only take one of `[0, 1, 2]`.
+
+For example:
+
+* `weights_ideal_value`
+* `weights_quant_uvalue`
+* `weights_quant_pvalue`
+
+## Full vs Decimated interpolation weights
+
+Weight grids have multiple states depending on how they are being used.
+
+**full_weights** represent per texel weight grids, storing one weight per texel.
+
+**decimated_weights** represent reduced weight grids, which can store fewer
+weights and which are bilinear interpolated to generate the full weight grid.
+
+Full weights have no variable prefix, but decimated weights are stored with
+a `dec_` prefix.
+
+* `dec_weights_ideal_value`
+* `dec_weights_quant_uvalue`
+* `dec_weights_quant_pvalue`
+
+## Weight vs Significance
+
+The original encoder used "weight" for multiple purposes - texel significance
+(weight the error), color channel significance (weight the error), and endpoint
+interpolation weights. This gets very confusing in functions using all three!
+
+We are slowly refactoring the code to only use "weight" to mean the endpoint
+interpolation weights. The error weighting factors used for other purposes are
+being updated to use the term "significance".
+
+- - -
+
+_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,120 @@
+# Testing astcenc
+
+The repository contains a small suite of tests which can be used to sanity
+check source code changes to the compressor. It must be noted that this test
+suite is relatively limited in scope and does not cover every feature or
+bitrate of the standard.
+
+# Required software
+
+Running the tests requires Python 3.7 to be installed on the host machine, and
+an `astcenc-avx2` release build to have been previously compiled and installed
+into a directory called `astcenc` in the root of the git checkout. This
+can be achieved by configuring the CMake build using the install prefix
+`-DCMAKE_INSTALL_PREFIX=../` and then running a build with the `install` build
+target.
+
+# Running C++ unit tests
+
+We support a small (but growing) number of C++ unit tests, which are written
+using the `googletest` framework and integrated in the CMake "CTest" test
+framework.
+
+To build unit tests pull the `googletest` git submodule and add
+`-DASTCENC_UNITTEST=ON` to the CMake command line when configuring.
+
+To run unit tests use the CMake `ctest` utility from your build directory after
+you have built the tests.
+
+```shell
+cd build
+ctest --verbose
+```
+
+# Running command line tests
+
+To run the command line tests, which aim to get coverage of the command line
+options and core codec stability without testing the compression quality
+itself, run the command line:
+
+    python3 -m unittest discover -s Test -p astc_test*.py -v
+
+# Running image tests
+
+To run the image test suite run the following command from the root directory
+of the repository:
+
+    python3 ./Test/astc_test_image.py
+
+This will run through a series of image compression tests, comparing the image
+PSNR against a set of reference results from the last stable baseline. The test
+will fail if any reduction in PSNR above a set threshold is detected. Note that
+performance information is reported, but regressions will not flag a failure.
+
+For debug purposes, all decompressed test output images and result CSV files
+are stored in the `TestOutput` directory, using the same test set structure as
+the `Test/Images` folder.
+
+## Test selection
+
+The runner supports a number of options to filter down what is run, enabling
+developers to focus local testing on the parts of the code they are working on.
+
+* `--encoder` selects which encoder to run. By default the `avx2` encoder is
+  selected. Note that some out-of-tree reference encoders (older encoders, and
+  some third-party encoders) are supported for comparison purposes. These will
+  not work without the binaries being manually provided; they are not
+  distributed here.
+* `--test-set` selects which image set to run. By default the `Small` image
+  test set is selected, which aims to provide basic coverage of many different
+  color formats and color profiles.
+* `--block-size` selects which block size to run. By default a range of
+  block sizes (2D and 3D) are used.
+* `--color-profile` selects which color profiles from the standard should be
+  used (LDR, LDR sRGB, or HDR) to select images. By default all are selected.
+* `--color-format` selects which color formats should be used (L, XY, RGB,
+  RGBA) to select images. By default all are selected.
+
+## Performance tests
+
+To provide less noisy performance results the test suite supports compressing
+each image multiple times and returning the best measured performance. To
+enable this mode use the following options:
+
+* `--repeats <M>` : Run M test compression passes which are timed.
+
+**Note:**  The reference CSV contains performance results measured on an Intel
+Core i5 9600K running at 4.3GHz, running each test 5 times.
+
+## Updating reference data
+
+The reference PSNR and performance scores are stored in CSVs committed to the
+repository. This data is created by running the tests using the last stable
+release on a standard test machine we use for performance testing builds.
+
+It can be useful for developers to rebuild the reference results for their
+local machine, in particular for measuring performance improvements. To build
+new reference CSVs, download the current reference `astcenc` binary (1.7) from
+GitHub for your host OS and place it in to the `./Binaries/1.7/` directory.
+Once this is done, run the command:
+
+    python3 ./Test/astc_test_image.py --encoder 1.7 --test-set all --repeats 5
+
+... to regenerate the reference CSV files.
+
+**WARNING:** This can take some hours to complete, and it is best done when the
+test suite gets exclusive use of the machine to avoid other processing slowing
+down the compression and disturbing the performance data. It is recommended to
+shut down or disable any background applications that are running.
+
+## Valgrind memcheck
+
+It is always worth running the Valgrind memcheck tool to validate that we have
+not introduced any obvious memory errors. Build a release build with symbol
+information with `-DCMAKE_BUILD_TYPE=RelWithDebInfo` and then run:
+
+    valgrind --tool=memcheck --track-origins=yes <command>
+
+- - -
+
+_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
@@ -0,0 +1,250 @@
+# About
+
+The Arm® Adaptive Scalable Texture Compression (ASTC) Encoder, `astcenc`, is
+a command-line tool for compressing and decompressing images using the ASTC
+texture compression standard.
+
+## The ASTC format
+
+The ASTC compressed data format, developed by Arm® and AMD, has been adopted as
+an official extension to the OpenGL®, OpenGL ES, and Vulkan® graphics APIs. It
+provides a major step forward in terms of both the image quality at a given
+bitrate, and the format and bitrate flexibility available to content creators.
+This allows more assets to use compression, often at a reduced bitrate compared
+to other formats, reducing memory storage and bandwidth requirements.
+
+Read the [ASTC Format Overview][1] for a quick introduction to the format, or
+read the full [Khronos Data Format Specification][2] for all the details.
+
+## License
+
+This project is licensed under the Apache 2.0 license. By downloading any
+component from this repository you acknowledge that you accept terms specified
+in the [LICENSE.txt](LICENSE.txt) file.
+
+# Encoder feature support
+
+The encoder supports compression of low dynamic range (BMP, JPEG, PNG, TGA) and
+high dynamic range (EXR, HDR) images, as well as a subset of image data wrapped
+in the DDS and KTX container formats, into ASTC or KTX format output images.
+
+The decoder supports decompression of ASTC or KTX format input images into low
+dynamic range (BMP, PNG, TGA), high dynamic range (EXR, HDR), or DDS and KTX
+wrapped output images.
+
+The encoder allows control over the compression time/quality tradeoff with
+`exhaustive`, `verythorough`, `thorough`, `medium`, `fast`, and `fastest`
+encoding quality presets.
+
+The encoder allows compression time and quality analysis by reporting the
+compression time, and the Peak Signal-to-Noise Ratio (PSNR) between the input
+image and the compressed output.
+
+## ASTC format support
+
+The `astcenc` compressor supports generation of images for all three profiles
+allowed by the ASTC specification:
+
+* 2D Low Dynamic Range (LDR profile)
+* 2D LDR and High Dynamic Range (HDR profile)
+* 2D and 3D, LDR and HDR (Full profile)
+
+It also supports all of the ASTC block sizes and compression modes, allowing
+content creators to use the full spectrum of quality-to-bitrate options ranging
+from 0.89 bits/pixel up to 8 bits/pixel.
+
+# Prebuilt binaries
+
+Release build binaries for the `astcenc` stable releases are provided in the
+[GitHub Releases page][3].
+
+* Change log: [5.x series](./Docs/ChangeLog-5x.md)
+
+Binaries are provided for 64-bit builds on Windows, macOS, and Linux.
+
+## Windows and Linux
+
+For Windows and Linux the builds of the astcenc are provided as multiple
+binaries, each tuned for a specific SIMD instruction set.
+
+For x86-64 we provide, in order of increasing performance:
+
+* `astcenc-sse2` - uses SSE2
+* `astcenc-sse4.1` - uses SSE4.1 and POPCNT
+* `astcenc-avx2` - uses AVX2, SSE4.2, POPCNT, and F16C
+
+The x86-64 SSE2 builds will work on all x86-64 machines, but it is the slowest
+of the three. The other two require extended CPU instruction set support which
+is not universally available, but each step gains ~15% more performance.
+
+For Arm we provide, in order of increasing performance:
+
+* `astcenc-sve_256` - uses 256-bit SVE
+* `astcenc-sve_128` - uses 128-bit SVE
+* `astcenc-neon` - uses NEON
+
+Note: The Arm Scalable Vector Extensions (SVE) allow CPUs to have a variable
+vector length. The astcenc implementation is not written in a length-agnostic
+style and requires the binary to match the vector length on the host CPU.
+
+## macOS
+
+For macOS devices we provide a single universal binary `astcenc`, which allows
+the OS to automatically use the correct binary variant for the current host
+machine. Support is provided for three architecture slices:
+
+* `x86_64` - uses the `astcenc-sse4.1` build defined above.
+* `x86_64h` - uses the `astcenc-avx2` build defined above.
+* `arm64` - uses the `astcenc-neon` build defined above.
+
+## Repository branches
+
+The `main` branch is an active development branch for the compressor. It aims
+to be a stable branch for the latest major release series, but as it is used
+for ongoing development expect it to have some volatility. We recommend using
+the latest stable release tag for production development.
+
+The `4.x` branch is a stable branch for the older 4.x release series. It is no
+longer under active development, but is a supported branch that continues to
+get back-ported bug fixes.
+
+The `1.x`, `2.x`, and `3.x` branches are stable branches for older releases.
+They are no longer under active development or getting bug fixes.
+
+Any other branches you might find are development branches for new features or
+optimizations, so might be interesting to play with but should be considered
+transient and unstable.
+
+
+# Getting started
+
+Open a terminal, change to the appropriate directory for your system, and run
+the astcenc encoder program, like this on Linux or macOS:
+
+    ./astcenc
+
+... or like this on Windows:
+
+    astcenc
+
+Invoking `astcenc -help` gives an extensive help message, including usage
+instructions and details of all available command line options. A summary of
+the main encoder options is shown below.
+
+## Compressing an image
+
+Compress an image using the `-cl` \ `-cs` \ `-ch` \ `-cH` modes. For example:
+
+    astcenc -cl example.png example.astc 6x6 -medium
+
+This compresses `example.png` using the LDR color profile and a 6x6 block
+footprint (3.56 bits/pixel). The `-medium` quality preset gives a reasonable
+image quality for a relatively fast compression speed, so is a good starting
+point for compression. The output is stored to a linear color space compressed
+image, `example.astc`.
+
+The modes available are:
+
+* `-cl` : use the linear LDR color profile.
+* `-cs` : use the sRGB LDR color profile.
+* `-ch` : use the HDR color profile, tuned for HDR RGB and LDR A.
+* `-cH` : use the HDR color profile, tuned for HDR RGBA.
+
+If you intend to use the resulting image with the decode mode extensions to
+limit the decompressed precision to UNORM8, it is recommended that you also
+specify the `-decode_unorm8` flag. This will ensure that the compressor uses
+the correct rounding rules when choosing encodings.
+
+## Decompressing an image
+
+Decompress an image using the `-dl` \ `-ds` \ `-dh` \ `-dH` modes. For example:
+
+    astcenc -dh example.astc example.tga
+
+This decompresses `example.astc` using the full HDR feature profile, storing
+the decompressed output to `example.tga`.
+
+The modes available mirror the options used for compression, but use a `d`
+prefix. Note that for decompression there is no difference between the two HDR
+modes, they are both provided simply to maintain symmetry across operations.
+
+## Measuring image quality
+
+Review the compression quality using the `-tl` \ `-ts` \ `-th` \ `-tH` modes.
+For example:
+
+    astcenc -tl example.png example.tga 5x5 -thorough
+
+This is equivalent to using the LDR color profile and a 5x5 block size
+to compress the image, using the `-thorough` quality preset, and then
+immediately decompressing the image and saving the result. This can be used
+to enable a visual inspection of the compressed image quality. In addition
+this mode also prints out some image quality metrics to the console.
+
+The modes available mirror the options used for compression, but use a `t`
+prefix.
+
+## Experimenting
+
+Efficient real-time graphics benefits from minimizing compressed texture size,
+as it reduces memory footprint, reduces memory bandwidth, saves energy, and can
+improve texture cache efficiency. However, like any lossy compression format
+there will come a point where the compressed image quality is unacceptable
+because there are simply not enough bits to represent the output with the
+precision needed. We recommend experimenting with the block footprint to find
+the optimum balance between size and quality, as the finely adjustable
+compression ratio is one of the major strengths of the ASTC format.
+
+The compression speed can be controlled from `-fastest`, through `-fast`,
+`-medium` and `-thorough`, up to `-exhaustive`. In general, the more time the
+encoder has to spend looking for good encodings the better the results, but it
+does result in increasingly small improvements for the amount of time required.
+
+There are many other command line options for tuning the encoder parameters
+which can be used to fine tune the compression algorithm. See the command line
+help message for more details.
+
+# Documentation
+
+The [ASTC Format Overview](./Docs/FormatOverview.md) page provides a high level
+introduction to the ASTC texture format, how it encodes data, and why it is
+both flexible and efficient.
+
+The [Effective ASTC Encoding](./Docs/Encoding.md) page looks at some of the
+guidelines that should be followed when compressing data using `astcenc`.
+It covers:
+
+* How to efficiently encode data with fewer than 4 channels.
+* How to efficiently encode normal maps, sRGB data, and HDR data.
+* Coding equivalents to other compression formats.
+
+The [ASTC Developer Guide][5] document (external link) provides a more detailed
+guide for developers using the `astcenc` compressor.
+
+The [.astc File Format](./Docs/FileFormat.md) page provides a light-weight
+specification for the `.astc` file format and how to read or write it.
+
+The [Building ASTC Encoder](./Docs/Building.md) page provides instructions on
+how to build `astcenc` from the sources in this repository.
+
+The [Testing ASTC Encoder](./Docs/Testing.md) page provides instructions on
+how to test any modifications to the source code in this repository.
+
+# Support
+
+If you have issues with the `astcenc` encoder, or questions about the ASTC
+texture format itself, please raise them in the GitHub issue tracker.
+
+If you have any questions about Arm GPUs, application development for Arm GPUs,
+or general mobile graphics development or technology please submit them on the
+[Arm Community graphics forums][4].
+
+- - -
+
+_Copyright © 2013-2025, Arm Limited and contributors. All rights reserved._
+
+[1]: ./Docs/FormatOverview.md
+[2]: https://www.khronos.org/registry/DataFormat/specs/1.4/dataformat.1.4.html#ASTC
+[3]: https://github.com/ARM-software/astc-encoder/releases
+[4]: https://community.arm.com/support-forums/f/graphics-gaming-and-vr-forum/
+[5]: https://developer.arm.com/documentation/102162/latest/?lang=en
@@ -0,0 +1,126 @@
+#  SPDX-License-Identifier: Apache-2.0
+#  ----------------------------------------------------------------------------
+#  Copyright 2020-2025 Arm Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"); you may not
+#  use this file except in compliance with the License. You may obtain a copy
+#  of the License at:
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#  License for the specific language governing permissions and limitations
+#  under the License.
+#  ----------------------------------------------------------------------------
+
+# Overwrite the LTO flags to force fat LTO; worth 3-4% performance
+# See https://gitlab.kitware.com/cmake/cmake/-/issues/16808
+#
+# The override is only applied when the command line tool (ASTCENC_CLI) is
+# being built. Compiler IDs containing "Clang" get "-flto", and compiler IDs
+# containing "GNU" get "-flto=auto".
+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND ${ASTCENC_CLI})
+    set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto")
+endif()
+
+if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND ${ASTCENC_CLI})
+    set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto=auto")
+endif()
+
+# Select the codec tag that is embedded in the target names used below
+# (astc${ASTCENC_CODEC}-<isa>): "dec" when ASTCENC_DECOMPRESSOR is enabled,
+# otherwise "enc".
+if(${ASTCENC_DECOMPRESSOR})
+    set(ASTCENC_CODEC dec)
+else()
+    set(ASTCENC_CODEC enc)
+endif()
+
+# Parallel lists, matched by index: ASTCENC_ARTIFACTS holds the ISA name of
+# each build variant and ASTCENC_CONFIGS holds the corresponding ASTCENC_ISA_*
+# enable value.
+# NOTE(review): this relies on every ASTCENC_ISA_* variable expanding to a
+# non-empty value, otherwise the two lists fall out of step - confirm that the
+# including CMakeLists always defines them.
+set(ASTCENC_ARTIFACTS native none sve_256 sve_128 neon avx2 sse4.1 sse2)
+set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_SVE_128} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
+# foreach(... RANGE <stop>) includes <stop>, so convert the length to the
+# index of the last element
+list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
+math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
+
+# Include cmake_core.cmake once for each enabled ISA variant, with
+# ASTCENC_ISA_SIMD set to the name of the variant being processed
+foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
+    list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
+    list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
+    if(${ASTCENC_CONFIG})
+        set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
+
+        # Select the macOS architecture to build for this variant
+        if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
+           # Not supported on macOS
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
+           # Not supported on macOS
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
+           set(CMAKE_OSX_ARCHITECTURES arm64)
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
+           set(CMAKE_OSX_ARCHITECTURES x86_64)
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
+           set(CMAKE_OSX_ARCHITECTURES x86_64)
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
+           set(CMAKE_OSX_ARCHITECTURES x86_64h)
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "none")
+           # Using "none" uses implicit architecture
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "native")
+           # Using "native" uses implicit architecture
+        else()
+            message(FATAL_ERROR "'${ASTCENC_ISA_SIMD}' is unknown ISA")
+        endif()
+
+        include(cmake_core.cmake)
+    endif()
+endforeach()
+
+# Universal command line binary: use lipo to merge the sse4.1, avx2, and neon
+# executables into one file, as the x86_64, x86_64h, and arm64 slices
+# respectively. The merged binary is written alongside the sse4.1 executable.
+if(${ASTCENC_CLI} AND ${ASTCENC_UNIVERSAL_BUILD})
+    add_custom_target(
+        astc${ASTCENC_CODEC}
+            ALL
+            COMMAND
+                lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC} -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon>
+            VERBATIM)
+
+    # The three per-ISA executables must be built before they can be merged
+    add_dependencies(
+        astc${ASTCENC_CODEC}
+            astc${ASTCENC_CODEC}-sse4.1
+            astc${ASTCENC_CODEC}-avx2
+            astc${ASTCENC_CODEC}-neon)
+
+    # Install the merged binary into bin
+    install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC}
+            DESTINATION bin)
+endif()
+
+# Universal shared library: use lipo to merge the sse4.1, avx2, and neon shared
+# libraries into one dylib, as the x86_64, x86_64h, and arm64 slices
+# respectively. The merged dylib is written alongside the sse4.1 library.
+if(${ASTCENC_SHAREDLIB} AND ${ASTCENC_UNIVERSAL_BUILD})
+    add_custom_target(
+        astc${ASTCENC_CODEC}-shared
+            ALL
+            COMMAND
+                lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1-shared> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2-shared> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon-shared>
+            VERBATIM)
+
+    # The three per-ISA libraries must be built before they can be merged
+    add_dependencies(
+        astc${ASTCENC_CODEC}-shared
+            astc${ASTCENC_CODEC}-sse4.1-shared
+            astc${ASTCENC_CODEC}-avx2-shared
+            astc${ASTCENC_CODEC}-neon-shared)
+
+    # Install the merged dylib into lib
+    install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib
+            DESTINATION lib)
+endif()
+
+# - - - - - - - - - - - - - - - - - -
+# Unit testing
+# Optional unit test build: adds the GoogleTest sources and the UnitTest
+# directory, and enables CTest
+if(${ASTCENC_UNITTEST})
+    # Force INSTALL_GTEST off and select the macOS architectures before the
+    # GoogleTest subdirectory is processed
+    set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
+    set(CMAKE_OSX_ARCHITECTURES x86_64;arm64)
+    add_subdirectory(GoogleTest)
+
+    # Workaround GoogleTest CRT selection issue
+    # See https://github.com/google/googletest/issues/4067
+    #
+    # Both GoogleTest targets are set to "MultiThreadedDebug" for Debug
+    # configurations and "MultiThreaded" for all other configurations
+    set_property(
+        TARGET gtest
+            PROPERTY
+                MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+
+    set_property(
+        TARGET gtest_main
+            PROPERTY
+                MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+
+    enable_testing()
+    add_subdirectory(UnitTest)
+endif()
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Fuzz target for physical_to_symbolic().
+ *
+ * This function is the first entrypoint for decompressing a 16 byte block of
+ * input ASTC data from disk. The 16 bytes can contain arbitrary data; they
+ * are read from an external source, but the block size used must be a valid
+ * ASTC block footprint.
+ */
+
+
+#include "astcenc_internal.h"
+
+#include <fuzzer/FuzzedDataProvider.h>
+#include <array>
+#include <vector>
+
+/** @brief The x, y, and z block dimensions passed to init_block_size_descriptor(). */
+struct BlockSizes
+{
+	int x;
+	int y;
+	int z;
+};
+
+/** @brief The block sizes the fuzzer can select between. */
+std::array<BlockSizes, 3> testSz {{
+	{ 4,  4, 1}, // Highest bitrate
+	{12, 12, 1}, // Largest 2D block
+	{6,  6,  6}  // Largest 3D block
+}};
+
+/** @brief One block size descriptor per @c testSz entry, filled in by bsd_initializer(). */
+std::array<block_size_descriptor, 3> testBSD;
+
+/**
+ * @brief Utility function to create all of the block size descriptors needed.
+ *
+ * This is triggered once via a static initializer.
+ *
+ * Triggering once is important so that we only create a single BSD per block
+ * size we need, rather than one per fuzzer iteration (it's expensive). This
+ * improves fuzzer throughput by ~ 1000x!
+ *
+ * Triggering via a static initializer, rather than a lazy init in the fuzzer
+ * function, is important because it means that the BSD is allocated before
+ * fuzzing starts. This means that leaksanitizer will ignore the fact that we
+ * "leak" the dynamic allocations inside the BSD (we never call term()).
+ *
+ * @return Always @c true, so that the call can be used to initialize a static.
+ */
+bool bsd_initializer()
+{
+	// Index with size_t to match testSz.size() and avoid a signed/unsigned compare
+	for (size_t i = 0; i < testSz.size(); i++)
+	{
+		init_block_size_descriptor(
+		    testSz[i].x,
+		    testSz[i].y,
+		    testSz[i].z,
+		    false,
+		    4,
+		    1.0f,
+		    testBSD[i]);
+	}
+
+	return true;
+}
+
+/**
+ * @brief Fuzzer entry point; decodes one physical block per invocation.
+ *
+ * @param data   The fuzzer-provided input bytes.
+ * @param size   The length of @c data in bytes.
+ *
+ * @return Always zero.
+ */
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
+{
+	// Minimum input: 4 bytes to select the block size plus a 16 byte payload
+	constexpr size_t selector_len = 4;
+	constexpr size_t payload_len = 16;
+
+	// Build the block size descriptors the first time we are called
+	static bool bsd_ready = bsd_initializer();
+
+	// Ignore inputs that are too short to be useful
+	if (size < selector_len + payload_len)
+	{
+		return 0;
+	}
+
+	FuzzedDataProvider provider(data, size);
+
+	// Choose which block size descriptor to decode with
+	int bsd_index = provider.ConsumeIntegralInRange<int>(0, testSz.size() - 1);
+
+	// Copy the payload bytes into a fixed-size physical block
+	std::vector<uint8_t> payload = provider.ConsumeBytes<uint8_t>(payload_len);
+	uint8_t pcb[payload_len];
+	std::memcpy(pcb, payload.data(), payload_len);
+
+	// Run the function under test
+	symbolic_compressed_block scb;
+	physical_to_symbolic(testBSD[bsd_index], pcb, scb);
+
+	return 0;
+}
@@ -0,0 +1,51 @@
+#  SPDX-License-Identifier: Apache-2.0
+#  ----------------------------------------------------------------------------
+#  Copyright 2020-2024 Arm Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"); you may not
+#  use this file except in compliance with the License. You may obtain a copy
+#  of the License at:
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#  License for the specific language governing permissions and limitations
+#  under the License.
+#  ----------------------------------------------------------------------------
+
+# ASTCENC_ARTIFACTS and ASTCENC_CONFIGS are parallel lists: entry N of
+# ASTCENC_CONFIGS is the enable flag for the ISA named by entry N of
+# ASTCENC_ARTIFACTS.
+# NOTE(review): an ASTCENC_ISA_* variable that is unset or empty contributes no
+# list element, which would shift the later flags out of step with the names -
+# confirm every one of these options is always defined by the parent scope.
+set(ASTCENC_ARTIFACTS native none sve_256 sve_128 neon avx2 sse4.1 sse2)
+set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_SVE_128} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
+# foreach(RANGE) includes its stop value, so convert the length to a last index
+list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
+math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
+
+foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
+    list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
+    list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
+    if(${ASTCENC_CONFIG})
+        set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
+
+        # Select the macOS target architecture that matches this ISA
+        if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
+           # Not supported on macOS
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
+           # Not supported on macOS
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
+           set(CMAKE_OSX_ARCHITECTURES arm64)
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
+           set(CMAKE_OSX_ARCHITECTURES x86_64)
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
+           set(CMAKE_OSX_ARCHITECTURES x86_64)
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
+           set(CMAKE_OSX_ARCHITECTURES x86_64h)
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "none")
+           # Using "none" uses implicit architecture
+        elseif(${ASTCENC_ISA_SIMD} MATCHES "native")
+           # Using "native" uses implicit architecture
+        else()
+            message(FATAL_ERROR "'${ASTCENC_ISA_SIMD}' is unknown ISA")
+        endif()
+
+        # Run the core build rules once for each enabled ISA
+        include(cmake_core.cmake)
+    endif()
+endforeach()
@@ -0,0 +1,198 @@
+#  SPDX-License-Identifier: Apache-2.0
+#  ----------------------------------------------------------------------------
+#  Copyright 2020-2025 Arm Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"); you may not
+#  use this file except in compliance with the License. You may obtain a copy
+#  of the License at:
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#  License for the specific language governing permissions and limitations
+#  under the License.
+#  ----------------------------------------------------------------------------
+
+# NOTE(review): presumably defines the is_* compiler-detection generator
+# expressions used by the compile options in this file - confirm
+include(../cmake_compiler.cmake)
+
+# One test binary is built per ISA, named after the ISA
+set(ASTCENC_TEST test-unit-${ASTCENC_ISA_SIMD})
+
+add_executable(${ASTCENC_TEST})
+
+set_property(TARGET ${ASTCENC_TEST}
+    PROPERTY
+        CXX_STANDARD 17)
+
+# Enable LTO under the conditions where the codec library will use LTO.
+# The library link will fail if the settings don't match
+if(${ASTCENC_CLI})
+    set_property(TARGET ${ASTCENC_TEST}
+        PROPERTY
+            INTERPROCEDURAL_OPTIMIZATION_RELEASE True)
+endif()
+
+# Use a static runtime on MSVC builds (ignored on non-MSVC compilers)
+set_property(TARGET ${ASTCENC_TEST}
+    PROPERTY
+        MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
+
+
+target_sources(${ASTCENC_TEST}
+    PRIVATE
+        test_simd.cpp
+        test_softfloat.cpp
+        test_decode.cpp)
+
+target_include_directories(${ASTCENC_TEST}
+    PRIVATE
+        ${gtest_SOURCE_DIR}/include)
+
+# Link against the astcenc-<isa>-static codec target for the ISA under test
+target_link_libraries(${ASTCENC_TEST}
+    PRIVATE
+        astcenc-${ASTCENC_ISA_SIMD}-static)
+
+# Compiler warning and language options for the test binary
+target_compile_options(${ASTCENC_TEST}
+    PRIVATE
+        # Use pthreads on Linux/macOS
+        $<$<PLATFORM_ID:Linux,Darwin>:-pthread>
+
+        # MSVC compiler defines
+        $<${is_msvc_fe}:/EHsc>
+        $<$<AND:$<BOOL:${ASTCENC_WERROR}>,${is_msvc_fe}>:/WX>
+        $<${is_msvccl}:/wd4324>
+
+        # G++ and Clang++ compiler defines
+        $<${is_gnu_fe}:-Wall>
+        $<${is_gnu_fe}:-Wextra>
+        $<${is_gnu_fe}:-Wpedantic>
+        $<$<AND:$<BOOL:${ASTCENC_WERROR}>,${is_gnu_fe}>:-Werror>
+        $<${is_gnu_fe}:-Wshadow>
+        # NOTE(review): switched off again by -Wno-double-promotion below
+        $<${is_gnu_fe}:-Wdouble-promotion>
+        $<${is_clang}:-Wdocumentation>
+
+        # Hide noise thrown up by Clang 10 and clang-cl
+        $<${is_gnu_fe}:-Wno-unknown-warning-option>
+        $<${is_gnu_fe}:-Wno-c++98-compat-pedantic>
+        $<${is_gnu_fe}:-Wno-c++98-c++11-compat-pedantic>
+        $<${is_gnu_fe}:-Wno-float-equal>
+        $<${is_gnu_fe}:-Wno-overriding-option>
+        $<${is_gnu_fe}:-Wno-unsafe-buffer-usage>
+        $<${is_clang}:-Wno-switch-default>
+
+        # Ignore things that the googletest build triggers
+        # NOTE(review): -Wno-unknown-warning-option repeats the entry above
+        $<${is_gnu_fe}:-Wno-unknown-warning-option>
+        $<${is_gnu_fe}:-Wno-double-promotion>
+        $<${is_gnu_fe}:-Wno-undef>
+        $<${is_gnu_fe}:-Wno-reserved-identifier>
+        $<${is_gnu_fe}:-Wno-global-constructors>)
+
+# Set up configuration for SIMD ISA builds
+#
+# Each branch sets all six ASTCENC_* ISA defines explicitly and, where needed,
+# adds the compiler flags that enable the instruction set. No branch handles
+# "native", so that build gets none of these defines from this file.
+if(${ASTCENC_ISA_SIMD} MATCHES "none")
+    target_compile_definitions(${ASTCENC_TEST}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SVE=0
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0
+            ASTCENC_F16C=0)
+
+    # Big-endian support is only wired up for the no-SIMD build
+    if(${ASTCENC_BIG_ENDIAN})
+        target_compile_definitions(${ASTCENC_TEST}
+            PRIVATE
+                ASTCENC_BIG_ENDIAN=1)
+    endif()
+
+elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
+    target_compile_definitions(${ASTCENC_TEST}
+        PRIVATE
+            ASTCENC_NEON=1
+            ASTCENC_SVE=0
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0
+            ASTCENC_F16C=0)
+
+elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
+    # NOTE(review): ASTCENC_SVE looks like the vector width in 32-bit lanes
+    # (8 for 256-bit, 4 for 128-bit) - confirm against the codec sources
+    target_compile_definitions(${ASTCENC_TEST}
+        PRIVATE
+            ASTCENC_NEON=1
+            ASTCENC_SVE=8
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0
+            ASTCENC_F16C=0)
+
+    # Enable SVE
+    target_compile_options(${ASTCENC_TEST}
+        PRIVATE
+            -march=armv8-a+sve -msve-vector-bits=256)
+
+elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
+    target_compile_definitions(${ASTCENC_TEST}
+        PRIVATE
+            ASTCENC_NEON=1
+            ASTCENC_SVE=4
+            ASTCENC_SSE=0
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0
+            ASTCENC_F16C=0)
+
+    # Enable SVE
+    target_compile_options(${ASTCENC_TEST}
+        PRIVATE
+            -march=armv8-a+sve)
+
+elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
+    target_compile_definitions(${ASTCENC_TEST}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SVE=0
+            ASTCENC_SSE=20
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=0
+            ASTCENC_F16C=0)
+
+    # NOTE(review): this branch keys on GNU_LIKE while the sse4.1 and avx2
+    # branches key on "not MSVC" - confirm the difference is intended
+    target_compile_options(${ASTCENC_TEST}
+        PRIVATE
+        $<$<CXX_COMPILER_ID:${GNU_LIKE}>:-msse2>)
+
+elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
+    target_compile_definitions(${ASTCENC_TEST}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SVE=0
+            ASTCENC_SSE=41
+            ASTCENC_AVX=0
+            ASTCENC_POPCNT=1
+            ASTCENC_F16C=0)
+
+    target_compile_options(${ASTCENC_TEST}
+        PRIVATE
+            $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-msse4.1 -mpopcnt>)
+
+elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
+    target_compile_definitions(${ASTCENC_TEST}
+        PRIVATE
+            ASTCENC_NEON=0
+            ASTCENC_SVE=0
+            ASTCENC_SSE=41
+            ASTCENC_AVX=2
+            ASTCENC_POPCNT=1
+            ASTCENC_F16C=1)
+
+    target_compile_options(${ASTCENC_TEST}
+        PRIVATE
+            $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mavx2 -mpopcnt -mf16c>
+            $<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
+
+endif()
+
+# gtest_main supplies the main() entry point for the test binary
+target_link_libraries(${ASTCENC_TEST}
+    PRIVATE
+        gtest_main)
+
+# Register the binary with CTest under the same name as the target
+add_test(NAME ${ASTCENC_TEST}
+         COMMAND ${ASTCENC_TEST})
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Unit tests for the image decompression functionality.
+ */
+
+#include <limits>
+
+#include "gtest/gtest.h"
+
+#include "../astcenc.h"
+
+namespace astcenc
+{
+
+/**
+ * @brief Test harness for exploring issue #447.
+ *
+ * Decompresses one hard-coded 16 byte block with a 12x12 LDR context into a
+ * 12x12 buffer of 8-bit RGBA texels, and checks that each codec API call
+ * reports success. The decoded texel values are not checked.
+ */
+TEST(decode, decode12x12)
+{
+	astcenc_error status;
+	astcenc_config config;
+	astcenc_context* context = nullptr;
+
+	static const astcenc_swizzle swizzle {
+		ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
+	};
+
+	uint8_t data[16] {
+#if 0
+		0x84,0x00,0x38,0xC8,0x00,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0xB3,0x4D,0x78
+#else
+		0x29,0x00,0x1A,0x97,0x01,0x00,0x00,0x00,
+		0x00,0x00,0x00,0x00,0x00,0xCF,0x97,0x86
+#endif
+	};
+
+	uint8_t output[12*12*4];
+
+	// Setup failures are fatal; nothing below is meaningful without a context
+	status = astcenc_config_init(ASTCENC_PRF_LDR, 12, 12, 1, ASTCENC_PRE_MEDIUM, 0, &config);
+	ASSERT_EQ(status, ASTCENC_SUCCESS);
+
+	status = astcenc_context_alloc(&config, 1, &context);
+	ASSERT_EQ(status, ASTCENC_SUCCESS);
+
+	// The image data is an array of slice pointers; this image has one slice
+	astcenc_image image;
+	image.dim_x = 12;
+	image.dim_y = 12;
+	image.dim_z = 1;
+	image.data_type = ASTCENC_TYPE_U8;
+	uint8_t* slices = output;
+	image.data = reinterpret_cast<void**>(&slices);
+
+	status = astcenc_decompress_image(context, data, 16, &image, &swizzle, 0);
+	EXPECT_EQ(status, ASTCENC_SUCCESS);
+#if 0
+	for (int y = 0; y < 12; y++)
+	{
+		for (int x = 0; x < 12; x++)
+		{
+			uint8_t* pixel = output + (12 * 4 * y) + (4 * x);
+			printf("[%2dx%2d] = %03d, %03d, %03d, %03d\n", x, y, pixel[0], pixel[1], pixel[2], pixel[3]);
+		}
+	}
+#endif
+
+	// Release the codec context allocated for this test
+	astcenc_context_free(context);
+}
+
+}
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Unit tests for the software half-float library.
+ */
+
+#include "gtest/gtest.h"
+
+#include "../astcenc_internal.h"
+
+namespace astcenc
+{
+
+// The software half-float path is only tested for builds without hardware
+// FP16 conversion support (no F16C and no NEON)
+#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
+
+/** @brief Test normal numbers. */
+TEST(softfloat, FP16NormalNumbers)
+{
+	// Exponent 15 is the bias, so this encodes 1.0 plus one mantissa LSB
+	float result = sf16_to_float((15 << 10) + 1);
+	EXPECT_NEAR(result,  1.00098f, 0.00005f);
+}
+
+/** @brief Test denormal numbers. */
+TEST(softfloat, FP16DenormalNumbers)
+{
+	// The smallest FP16 denormal is 2^-24. The tolerance must be scaled to
+	// the expected magnitude; an absolute tolerance of 0.00005 is far larger
+	// than the value under test and would accept a result of zero.
+	float result = sf16_to_float((0 << 10) + 1);
+	EXPECT_NEAR(result, 5.96046e-08f, 0.00005e-08f);
+}
+
+/** @brief Test zero. */
+TEST(softfloat, FP16Zero)
+{
+	float result = sf16_to_float(0x0000);
+	EXPECT_EQ(result, 0.0f);
+}
+
+/** @brief Test infinity. */
+TEST(softfloat, FP16Infinity)
+{
+	// All exponent bits set with a zero mantissa
+	float result = sf16_to_float((31 << 10) + 0);
+	EXPECT_TRUE(std::isinf(result));
+}
+
+/** @brief Test NaN. */
+TEST(softfloat, FP16NaN)
+{
+	// All exponent bits set with a non-zero mantissa
+	float result = sf16_to_float(0xFFFF);
+	EXPECT_TRUE(std::isnan(result));
+}
+
+#endif
+
+}
@@ -0,0 +1,874 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2025 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief The core astcenc codec library interface.
+ *
+ * This interface is the entry point to the core astcenc codec. It aims to be easy to use for
+ * non-experts, but also to allow experts to have fine control over the compressor heuristics if
+ * needed. The core codec only handles compression and decompression, transferring all inputs and
+ * outputs via memory buffers. To catch obvious input/output buffer sizing issues, which can cause
+ * security and stability problems, all transfer buffers are explicitly sized.
+ *
+ * While the aim is that we keep this interface mostly stable, it should be viewed as a mutable
+ * interface tied to a specific source version. We are not trying to maintain backwards
+ * compatibility across codec versions.
+ *
+ * The API state management is based around an explicit context object, which is the context for all
+ * allocated memory resources needed to compress and decompress a single image. A context can be
+ * used to sequentially compress multiple images using the same configuration, allowing setup
+ * overheads to be amortized over multiple images, which is particularly important when images are
+ * small.
+ *
+ * Multi-threading can be used two ways.
+ *
+ *     * An application wishing to process multiple images in parallel can allocate multiple
+ *       contexts and assign each context to a thread.
+ *     * An application wishing to process a single image using multiple threads can configure
+ *       contexts for multi-threaded use, and invoke astcenc_compress/decompress() once per thread
+ *       for faster processing. The caller is responsible for creating the worker threads, and
+ *       synchronizing between images.
+ *
+ * Extended instruction set support
+ * ================================
+ *
+ * This library supports use of extended instruction sets, such as SSE4.1 and AVX2. These are
+ * enabled at compile time when building the library. There is no runtime checking in the core
+ * library that the instruction sets used are actually available. Checking compatibility is the
+ * responsibility of the calling code.
+ *
+ * Threading
+ * =========
+ *
+ * In pseudo-code, the usage for manual user threading looks like this:
+ *
+ *     // Configure the compressor run
+ *     astcenc_config my_config;
+ *     astcenc_config_init(..., &my_config);
+ *
+ *     // Power users can tweak <my_config> settings here ...
+ *
+ *     // Allocate working state given config and thread_count
+ *     astcenc_context* my_context;
+ *     astcenc_context_alloc(&my_config, thread_count, &my_context);
+ *
+ *     // Compress each image using these config settings
+ *     foreach image:
+ *         // For each thread in the thread pool
+ *         for i in range(0, thread_count):
+ *             astcenc_compress_image(my_context, &my_input, my_output, i);
+ *
+ *         astcenc_compress_reset(my_context);
+ *
+ *     // Clean up
+ *     astcenc_context_free(my_context);
+ *
+ * Images
+ * ======
+ *
+ * The codec supports compressing single images, which can be either 2D images or volumetric 3D
+ * images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
+ * texture arrays, or sliced 3D textures.
+ *
+ * Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
+ * half-float, or 32-bit float, as indicated by the data_type field.
+ *
+ * Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
+ *
+ * Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
+ * within an image slice is always tightly packed without padding. Addressing looks like this:
+ *
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4    ]   // Red
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1]   // Green
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 2]   // Blue
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 3]   // Alpha
+ *
+ * Common compressor usage
+ * =======================
+ *
+ * One of the most important things for coding image quality is to align the input data component
+ * count with the ASTC color endpoint mode. This avoids wasting bits encoding components you don't
+ * actually need in the endpoint colors.
+ *
+ *         | Input data   | Encoding swizzle | Sampling swizzle |
+ *         | ------------ | ---------------- | ---------------- |
+ *         | 1 component  | RRR1             | .[rgb]           |
+ *         | 2 components | RRRG             | .[rgb]a          |
+ *         | 3 components | RGB1             | .rgb             |
+ *         | 4 components | RGBA             | .rgba            |
+ *
+ * The 1 and 2 component modes recommend sampling from "g" to recover the luminance value as this
+ * provides best compatibility with other texture formats where the green component may be stored at
+ * higher precision than the others, such as RGB565. For ASTC any of the RGB components can be used;
+ * the luminance endpoint component will be returned for all three.
+ *
+ * When using the normal map compression mode ASTC will store normals as a two component X+Y map.
+ * Input images must contain unit-length normalized vectors and should be passed in using a two component
+ * swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers prefer
+ * to use GGGR for compatibility with BC5n which will work just as well. The Z component can be
+ * recovered programmatically in shader code, using knowledge that the vector is unit length and
+ * that Z must be positive for a tangent-space normal map.
+ *
+ * Decompress-only usage
+ * =====================
+ *
+ * For some use cases it is useful to have a cut-down context and/or library which supports
+ * decompression but not compression.
+ *
+ * A context can be made decompress-only using the ASTCENC_FLG_DECOMPRESS_ONLY flag when the context
+ * is allocated. These contexts have lower dynamic memory footprint than a full context.
+ *
+ * The entire library can be made decompress-only by building the files with the define
+ * ASTCENC_DECOMPRESS_ONLY set. In this build the context will be smaller, and the library will
+ * exclude the functionality which is only needed for compression. This reduces the binary size by
+ * ~180KB. For these builds contexts must be created with the ASTCENC_FLG_DECOMPRESS_ONLY flag.
+ *
+ * Note that context structures returned by a library built as decompress-only are incompatible with
+ * a library built with compression included, and vice versa, as they have different sizes and
+ * memory layout.
+ *
+ * Self-decompress-only usage
+ * ==========================
+ *
+ * ASTC is a complex format with a large search space. The parts of this search space that are
+ * searched are determined by heuristics that are, in part, tied to the quality level used when
+ * creating the context.
+ *
+ * A normal context is capable of decompressing any ASTC texture, including those generated by other
+ * compressors with unknown heuristics. This is the most flexible implementation, but forces the
+ * data tables used by the codec to include entries that are not needed during compression. This
+ * can slow down context creation by a significant amount, especially for the faster compression
+ * modes where few data table entries are actually used. To optimize this use case the context can
+ * be created with the ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will
+ * only be asked to decompress images that it compressed itself, allowing the data tables to
+ * exclude entries that are not needed by the current compression configuration. This reduces the
+ * size of the context data tables in memory and improves context creation performance. Note that,
+ * as of the 3.6 release, this flag no longer affects compression performance.
+ *
+ * Using this flag while attempting to decompress a valid image which was created by another
+ * compressor, or even another astcenc compressor version or configuration, may result in blocks
+ * returning as solid magenta or NaN value error blocks.
+ */
+
+#ifndef ASTCENC_INCLUDED
+#define ASTCENC_INCLUDED
+
+#include <cstddef>
+#include <cstdint>
+
+/*
+ * ASTCENC_PUBLIC is a declaration prefix controlling symbol linkage and visibility.
+ *
+ * When ASTCENC_DYNAMIC_LIBRARY is defined it gives the declaration C linkage and exports it from
+ * the library, using dllexport on MSVC and default visibility elsewhere. For all other builds it
+ * expands to nothing.
+ */
+#if defined(ASTCENC_DYNAMIC_LIBRARY)
+	#if defined(_MSC_VER)
+		#define ASTCENC_PUBLIC extern "C" __declspec(dllexport)
+	#else
+		#define ASTCENC_PUBLIC extern "C" __attribute__ ((visibility ("default")))
+	#endif
+#else
+	#define ASTCENC_PUBLIC
+#endif
+
+/* ============================================================================
+    Data declarations
+============================================================================ */
+
+/**
+ * @brief An opaque structure; see astcenc_internal.h for definition.
+ */
+struct astcenc_context;
+
+/**
+ * @brief A codec API error code.
+ */
+enum astcenc_error {
+	/** @brief The call was successful. */
+	ASTCENC_SUCCESS = 0,
+	/** @brief The call failed due to low memory, or undersized I/O buffers. */
+	ASTCENC_ERR_OUT_OF_MEM,
+	/** @brief The call failed due to the build using fast math. */
+	ASTCENC_ERR_BAD_CPU_FLOAT,
+	/** @brief The call failed due to an out-of-spec parameter. */
+	ASTCENC_ERR_BAD_PARAM,
+	/** @brief The call failed due to an out-of-spec block size. */
+	ASTCENC_ERR_BAD_BLOCK_SIZE,
+	/** @brief The call failed due to an out-of-spec color profile. */
+	ASTCENC_ERR_BAD_PROFILE,
+	/** @brief The call failed due to an out-of-spec quality value. */
+	ASTCENC_ERR_BAD_QUALITY,
+	/** @brief The call failed due to an out-of-spec component swizzle. */
+	ASTCENC_ERR_BAD_SWIZZLE,
+	/** @brief The call failed due to an out-of-spec flag set. */
+	ASTCENC_ERR_BAD_FLAGS,
+	/** @brief The call failed due to the context not supporting the operation. */
+	ASTCENC_ERR_BAD_CONTEXT,
+	/** @brief The call failed due to unimplemented functionality. */
+	ASTCENC_ERR_NOT_IMPLEMENTED,
+	/** @brief The call failed due to an out-of-spec decode mode flag set. */
+	ASTCENC_ERR_BAD_DECODE_MODE,
+#if defined(ASTCENC_DIAGNOSTICS)
+	/** @brief The call failed due to an issue with diagnostic tracing. */
+	ASTCENC_ERR_DTRACE_FAILURE,
+#endif
+};
+
+/**
+ * @brief A codec color profile.
+ */
+enum astcenc_profile {
+	/** @brief The LDR sRGB color profile. */
+	ASTCENC_PRF_LDR_SRGB = 0,
+	/** @brief The LDR linear color profile. */
+	ASTCENC_PRF_LDR,
+	/** @brief The HDR RGB with LDR alpha color profile. */
+	ASTCENC_PRF_HDR_RGB_LDR_A,
+	/** @brief The HDR RGBA color profile. */
+	ASTCENC_PRF_HDR
+};
+
+/** @brief The fastest, lowest quality, search preset. */
+static const float ASTCENC_PRE_FASTEST = 0.0f;
+
+/** @brief The fast search preset. */
+static const float ASTCENC_PRE_FAST = 10.0f;
+
+/** @brief The medium quality search preset. */
+static const float ASTCENC_PRE_MEDIUM = 60.0f;
+
+/** @brief The thorough quality search preset. */
+static const float ASTCENC_PRE_THOROUGH = 98.0f;
+
+/** @brief The very thorough quality search preset. */
+static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
+
+/** @brief The exhaustive, highest quality, search preset. */
+static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
+
+/**
+ * @brief A codec component swizzle selector.
+ */
+enum astcenc_swz
+{
+	/** @brief Select the red component. */
+	ASTCENC_SWZ_R = 0,
+	/** @brief Select the green component. */
+	ASTCENC_SWZ_G = 1,
+	/** @brief Select the blue component. */
+	ASTCENC_SWZ_B = 2,
+	/** @brief Select the alpha component. */
+	ASTCENC_SWZ_A = 3,
+	/** @brief Use a constant zero component. */
+	ASTCENC_SWZ_0 = 4,
+	/** @brief Use a constant one component. */
+	ASTCENC_SWZ_1 = 5,
+	/** @brief Use a reconstructed normal vector Z component. */
+	ASTCENC_SWZ_Z = 6
+};
+
+/**
+ * @brief A texel component swizzle.
+ */
+struct astcenc_swizzle
+{
+	/** @brief The red component selector. */
+	astcenc_swz r;
+	/** @brief The green component selector. */
+	astcenc_swz g;
+	/** @brief The blue component selector. */
+	astcenc_swz b;
+	/** @brief The alpha component selector. */
+	astcenc_swz a;
+};
+
+/**
+ * @brief A texel component data format.
+ */
+enum astcenc_type
+{
+	/** @brief Unorm 8-bit data per component. */
+	ASTCENC_TYPE_U8 = 0,
+	/** @brief 16-bit float per component. */
+	ASTCENC_TYPE_F16 = 1,
+	/** @brief 32-bit float per component. */
+	ASTCENC_TYPE_F32 = 2
+};
+
+/**
+ * @brief Function pointer type for compression progress reporting callback.
+ */
+extern "C" typedef void (*astcenc_progress_callback)(float);
+
+/**
+ * @brief Enable normal map compression.
+ *
+ * Input data will be treated as a two component normal map, storing X and Y, and the codec will
+ * optimize for angular error rather than simple linear PSNR. In this mode the input swizzle should
+ * be e.g. rrrg (the default ordering for ASTC normals on the command line) or gggr (the ordering
+ * used by BC5n).
+ */
+static const unsigned int ASTCENC_FLG_MAP_NORMAL          = 1 << 0;
+
+/**
+ * @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
+ *
+ * The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
+ * flag during compression will allow the compressor to use the correct rounding when selecting
+ * encodings. This will improve the compressed image quality if your application is using the
+ * decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
+ *
+ * Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
+ * this setting.
+ */
+static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8        = 1 << 1;
+
+/**
+ * @brief Enable alpha weighting.
+ *
+ * The input alpha value is used for transparency, so errors in the RGB components are weighted by
+ * the transparency level. This allows the codec to more accurately encode the alpha value in areas
+ * where the color value is less significant.
+ */
+static const unsigned int ASTCENC_FLG_USE_ALPHA_WEIGHT     = 1 << 2;
+
+/**
+ * @brief Enable perceptual error metrics.
+ *
+ * This mode enables perceptual compression mode, which will optimize for perceptual error rather
+ * than best PSNR. Only some input modes support perceptual error metrics.
+ */
+static const unsigned int ASTCENC_FLG_USE_PERCEPTUAL       = 1 << 3;
+
+/**
+ * @brief Create a decompression-only context.
+ *
+ * This mode disables support for compression. This enables context allocation to skip some
+ * transient buffer allocation, resulting in lower memory usage.
+ */
+static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY      = 1 << 4;
+
+/**
+ * @brief Create a self-decompression context.
+ *
+ * This mode configures the compressor so that it is only guaranteed to be able to decompress images
+ * that were actually created using the current context. This is the common case for compression use
+ * cases, and setting this flag enables additional optimizations, but does mean that the context
+ * cannot reliably decompress arbitrary ASTC images.
+ */
+static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
+
+/**
+ * @brief Enable RGBM map compression.
+ *
+ * Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
+ * format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
+ * compression function; this flag is only used to control the use of RGBM-specific heuristics and
+ * error metrics.
+ *
+ * IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
+ * M values can round to zero due to quantization and result in black or white pixels. It is highly
+ * recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
+ * 16 or 32). Applying this threshold reduces the number of very dark colors that can be
+ * represented, but is still higher precision than 8-bit LDR.
+ *
+ * When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
+ * factor used during reconstruction. This defaults to 5 when in RGBM mode.
+ *
+ * It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
+ * scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
+ * matching the default scale factor.
+ */
+static const unsigned int ASTCENC_FLG_MAP_RGBM             = 1 << 6;
+
+/**
+ * @brief The bit mask of all valid flags (the bitwise OR of every flag listed below).
+ */
+static const unsigned int ASTCENC_ALL_FLAGS =
+                              ASTCENC_FLG_MAP_NORMAL |
+                              ASTCENC_FLG_MAP_RGBM |
+                              ASTCENC_FLG_USE_ALPHA_WEIGHT |
+                              ASTCENC_FLG_USE_PERCEPTUAL |
+                              ASTCENC_FLG_USE_DECODE_UNORM8 |
+                              ASTCENC_FLG_DECOMPRESS_ONLY |
+                              ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
+
+/**
+ * @brief The config structure.
+ *
+ * This structure will initially be populated by a call to astcenc_config_init, but power users may
+ * modify it before calling astcenc_context_alloc. See astcenccli_toplevel_help.cpp for full user
+ * documentation of the power-user settings.
+ *
+ * Note for any settings which are associated with a specific color component, the value in the
+ * config applies to the component that exists after any compression data swizzle is applied.
+ */
+struct astcenc_config
+{
+	/** @brief The color profile. */
+	astcenc_profile profile;
+
+	/** @brief The set of enabled @c ASTCENC_FLG_* flag bits. */
+	unsigned int flags;
+
+	/** @brief The ASTC block size X dimension. */
+	unsigned int block_x;
+
+	/** @brief The ASTC block size Y dimension. */
+	unsigned int block_y;
+
+	/** @brief The ASTC block size Z dimension. */
+	unsigned int block_z;
+
+	/** @brief The red component weight scale for error weighting (-cw). */
+	float cw_r_weight;
+
+	/** @brief The green component weight scale for error weighting (-cw). */
+	float cw_g_weight;
+
+	/** @brief The blue component weight scale for error weighting (-cw). */
+	float cw_b_weight;
+
+	/** @brief The alpha component weight scale for error weighting (-cw). */
+	float cw_a_weight;
+
+	/**
+	 * @brief The radius for any alpha-weight scaling (-a).
+	 *
+	 * It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
+	 * will be sampled using linear texture filtering to minimize color bleed out of transparent
+	 * texels that are adjacent to non-transparent texels.
+	 */
+	unsigned int a_scale_radius;
+
+	/** @brief The RGBM scale factor for the shared multiplier (-rgbm). */
+	float rgbm_m_scale;
+
+	/**
+	 * @brief The maximum number of partitions searched (-partitioncountlimit).
+	 *
+	 * Valid values are between 1 and 4.
+	 */
+	unsigned int tune_partition_count_limit;
+
+	/**
+	 * @brief The maximum number of 2-partition indices searched (-2partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_2partition_index_limit;
+
+	/**
+	 * @brief The maximum number of 3-partition indices searched (-3partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_3partition_index_limit;
+
+	/**
+	 * @brief The maximum number of 4-partition indices searched (-4partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_4partition_index_limit;
+
+	/**
+	 * @brief The maximum centile for block modes searched (-blockmodelimit).
+	 *
+	 * Valid values are between 1 and 100.
+	 */
+	unsigned int tune_block_mode_limit;
+
+	/**
+	 * @brief The maximum iterative refinements applied (-refinementlimit).
+	 *
+	 * Valid values are between 1 and N; there is no technical upper limit
+	 * but little benefit is expected after N=4.
+	 */
+	unsigned int tune_refinement_limit;
+
+	/**
+	 * @brief The number of trial candidates per mode search (-candidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES.
+	 */
+	unsigned int tune_candidate_limit;
+
+	/**
+	 * @brief The number of trial 2-partition partitionings per search (-2partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_2partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial 3-partition partitionings per search (-3partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_3partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial 4-partition partitionings per search (-4partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_4partitioning_candidate_limit;
+
+	/**
+	 * @brief The dB threshold for stopping block search (-dblimit).
+	 *
+	 * This option is ineffective for HDR textures.
+	 */
+	float tune_db_limit;
+
+	/**
+	 * @brief The amount of MSE overshoot needed to early-out trials.
+	 *
+	 * The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
+	 * the high probability block modes. This can short-cut compression for simple blocks.
+	 *
+	 * The second early-out is for refinement trials, where we can exit refinement once quality is
+	 * reached.
+	 */
+	float tune_mse_overshoot;
+
+	/**
+	 * @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
+	 *
+	 * This option is further scaled for normal maps, so it skips less often.
+	 */
+	float tune_2partition_early_out_limit_factor;
+
+	/**
+	 * @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
+	 *
+	 * This option is further scaled for normal maps, so it skips less often.
+	 */
+	float tune_3partition_early_out_limit_factor;
+
+	/**
+	 * @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
+	 *
+	 * This option is ineffective for normal maps.
+	 */
+	float tune_2plane_early_out_limit_correlation;
+
+	/**
+	 * @brief The config enable for the mode0 fast-path search.
+	 *
+	 * If this is set to TUNE_MIN_TEXELS_MODE0 or higher then the early-out fast mode0
+	 * search is enabled. This option is ineffective for 3D block sizes.
+	 */
+	float tune_search_mode0_enable;
+
+	/**
+	 * @brief The progress callback, can be @c nullptr.
+	 *
+	 * If this is specified the codec will periodically report progress for
+	 * compression as a percentage between 0 and 100. The callback is called from one
+	 * of the compressor threads, so doing significant work in the callback will
+	 * reduce compression performance.
+	 */
+	astcenc_progress_callback progress_callback;
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	/**
+	 * @brief The path to save the diagnostic trace data to.
+	 *
+	 * This option is not part of the public API, and requires special builds
+	 * of the library.
+	 */
+	const char* trace_file_path;
+#endif
+};
+
+/**
+ * @brief An uncompressed 2D or 3D image.
+ *
+ * 3D images are passed in as an array of 2D slices. Each slice has identical
+ * size and color format.
+ */
+struct astcenc_image
+{
+	/** @brief The X dimension of the image, in texels. */
+	unsigned int dim_x;
+
+	/** @brief The Y dimension of the image, in texels. */
+	unsigned int dim_y;
+
+	/** @brief The Z dimension of the image, in texels. */
+	unsigned int dim_z;
+
+	/** @brief The data type per component. */
+	astcenc_type data_type;
+
+	/** @brief The array of 2D slices, of length @c dim_z. */
+	void** data;
+};
+
+/**
+ * @brief A block encoding metadata query result.
+ *
+ * If the block is an error block or a constant color block all fields other than
+ * the profile, block dimensions, and error/constant indicator will be zero.
+ */
+struct astcenc_block_info
+{
+	/** @brief The block encoding color profile. */
+	astcenc_profile profile;
+
+	/** @brief The number of texels in the X dimension. */
+	unsigned int block_x;
+
+	/** @brief The number of texels in the Y dimension. */
+	unsigned int block_y;
+
+	/** @brief The number of texels in the Z dimension. */
+	unsigned int block_z;
+
+	/** @brief The number of texels in the block. */
+	unsigned int texel_count;
+
+	/** @brief True if this block is an error block. */
+	bool is_error_block;
+
+	/** @brief True if this block is a constant color block. */
+	bool is_constant_block;
+
+	/** @brief True if this block is an HDR block. */
+	bool is_hdr_block;
+
+	/** @brief True if this block uses two weight planes. */
+	bool is_dual_plane_block;
+
+	/** @brief The number of partitions if not constant color. */
+	unsigned int partition_count;
+
+	/** @brief The partition index if 2 - 4 partitions used. */
+	unsigned int partition_index;
+
+	/** @brief The component index of the second plane if dual plane. */
+	unsigned int dual_plane_component;
+
+	/** @brief The color endpoint encoding mode for each partition. */
+	unsigned int color_endpoint_modes[4];
+
+	/** @brief The number of color endpoint quantization levels. */
+	unsigned int color_level_count;
+
+	/** @brief The number of weight quantization levels. */
+	unsigned int weight_level_count;
+
+	/** @brief The number of weights in the X dimension. */
+	unsigned int weight_x;
+
+	/** @brief The number of weights in the Y dimension. */
+	unsigned int weight_y;
+
+	/** @brief The number of weights in the Z dimension. */
+	unsigned int weight_z;
+
+	/** @brief The unpacked color endpoints for each partition. */
+	float color_endpoints[4][2][4];
+
+	/** @brief The per-texel interpolation weights for the first weight plane of the block. */
+	float weight_values_plane1[216];
+
+	/** @brief The per-texel interpolation weights for the second weight plane of the block. */
+	float weight_values_plane2[216];
+
+	/** @brief The per-texel partition assignments for the block. */
+	uint8_t partition_assignment[216];
+};
+
+/**
+ * @brief Populate a codec config based on default settings.
+ *
+ * Power users can edit the returned config struct to fine tune before allocating the context.
+ *
+ * @param      profile   Color profile.
+ * @param      block_x   ASTC block size X dimension.
+ * @param      block_y   ASTC block size Y dimension.
+ * @param      block_z   ASTC block size Z dimension.
+ * @param      quality   Search quality preset / effort level. Either an
+ *                       @c ASTCENC_PRE_* value, or an effort level between 0
+ *                       and 100. Performance is not linear between 0 and 100.
+ *
+ * @param      flags     A valid set of @c ASTCENC_FLG_* flag bits.
+ * @param[out] config    Output config struct to populate.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if the inputs are invalid
+ * either individually, or in combination.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_config_init(
+	astcenc_profile profile,
+	unsigned int block_x,
+	unsigned int block_y,
+	unsigned int block_z,
+	float quality,
+	unsigned int flags,
+	astcenc_config* config);
+
+/**
+ * @brief Allocate a new codec context based on a config.
+ *
+ * This function allocates all of the memory resources and threads needed by the codec. This can be
+ * slow, so it is recommended that contexts are reused to serially compress or decompress multiple
+ * images to amortize setup cost.
+ *
+ * Contexts can be allocated to support only decompression using the @c ASTCENC_FLG_DECOMPRESS_ONLY
+ * flag when creating the configuration. The compression functions will fail if invoked. For a
+ * decompress-only library build the @c ASTCENC_FLG_DECOMPRESS_ONLY flag must be set when creating
+ * any context.
+ *
+ * @param[in]  config         Codec config.
+ * @param      thread_count   Thread count to configure for.
+ * @param[out] context        Location to store an opaque context pointer.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if context creation failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_context_alloc(
+	const astcenc_config* config,
+	unsigned int thread_count,
+	astcenc_context** context);
+
+/**
+ * @brief Compress an image.
+ *
+ * A single context can only compress or decompress a single image at a time.
+ *
+ * For a context configured for multi-threading, any set of the N threads can call this function.
+ * Work will be dynamically scheduled across the threads available. Each thread must have a unique
+ * @c thread_index.
+ *
+ * @param         context        Codec context.
+ * @param[in,out] image          An input image, in 2D slices.
+ * @param         swizzle        Compression data swizzle, applied before compression.
+ * @param[out]    data_out       Pointer to output data array.
+ * @param         data_len       Length of the output data array.
+ * @param         thread_index   Thread index [0..N-1] of calling thread.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if compression failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
+	astcenc_context* context,
+	astcenc_image* image,
+	const astcenc_swizzle* swizzle,
+	uint8_t* data_out,
+	size_t data_len,
+	unsigned int thread_index);
+
+/**
+ * @brief Reset the codec state for a new compression.
+ *
+ * The caller is responsible for synchronizing threads in the worker thread pool. This function must
+ * only be called when all threads have exited the @c astcenc_compress_image() function for image N,
+ * but before any thread enters it for image N + 1.
+ *
+ * Calling this is not required (but won't hurt), if the context is created for single threaded use.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
+	astcenc_context* context);
+
+/**
+ * @brief Cancel any pending compression operation.
+ *
+ * The caller must behave as if the compression completed normally, even though the data will be
+ * undefined. They are still responsible for synchronizing threads in the worker thread pool, and
+ * must call reset before starting another compression.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if cancellation failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_cancel(
+	astcenc_context* context);
+
+/**
+ * @brief Decompress an image.
+ *
+ * @param         context        Codec context.
+ * @param[in]     data           Pointer to compressed data.
+ * @param         data_len       Length of the compressed data, in bytes.
+ * @param[in,out] image_out      Output image.
+ * @param         swizzle        Decompression data swizzle, applied after decompression.
+ * @param         thread_index   Thread index [0..N-1] of calling thread.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if decompression failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_decompress_image(
+	astcenc_context* context,
+	const uint8_t* data,
+	size_t data_len,
+	astcenc_image* image_out,
+	const astcenc_swizzle* swizzle,
+	unsigned int thread_index);
+
+/**
+ * @brief Reset the codec state for a new decompression.
+ *
+ * The caller is responsible for synchronizing threads in the worker thread pool. This function must
+ * only be called when all threads have exited the @c astcenc_decompress_image() function for image
+ * N, but before any thread enters it for image N + 1.
+ *
+ * Calling this is not required (but won't hurt), if the context is created for single threaded use.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_decompress_reset(
+	astcenc_context* context);
+
+/**
+ * @brief Free the compressor context.
+ *
+ * @param context   The codec context.
+ */
+ASTCENC_PUBLIC void astcenc_context_free(
+	astcenc_context* context);
+
+/**
+ * @brief Provide a high level summary of a block's encoding.
+ *
+ * This feature is primarily useful for codec developers but may be useful for developers building
+ * advanced content packaging pipelines.
+ *
+ * @param context   Codec context.
+ * @param data      One block of compressed ASTC data.
+ * @param info      The output info structure to populate.
+ *
+ * @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
+ *         function will return success even if the block itself was an error block encoding, as the
+ *         decode was correctly handled.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_get_block_info(
+	astcenc_context* context,
+	const uint8_t data[16],
+	astcenc_block_info* info);
+
+/**
+ * @brief Get a printable string for a specific status code.
+ *
+ * @param status   The status value.
+ *
+ * @return A human readable nul-terminated string.
+ */
+ASTCENC_PUBLIC const char* astcenc_get_error_string(
+	astcenc_error status);
+
+#endif
@@ -0,0 +1,948 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2025 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for finding dominant direction of a set of colors.
+ */
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Compute the mean RGB color for every partition in a block.
+ *
+ * Texels are scanned sequentially in SIMD-width chunks. Each scanned partition
+ * owns one accumulator per color channel, and a per-lane mask restricts each
+ * accumulation to the texels that belong to that partition.
+ *
+ * Only the first N-1 partitions are summed by the scan. The total for the
+ * final partition is recovered by subtracting those sums from the block-wide
+ * total, which is rebuilt from the precomputed block mean.
+ *
+ * The scan always covers the whole block. Stopping at the highest texel index
+ * used by the first N-1 partitions would save iterations, but the erratic trip
+ * count causes more branch mispredictions and is a net loss.
+ *
+ * @param      pi         The partitioning to use.
+ * @param      blk        The block data to process.
+ * @param[out] averages   The output averages. Unused partition indices will
+ *                        not be initialized, and lane<3> will be zero.
+ */
+static void compute_partition_averages_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	vfloat4 averages[BLOCK_MAX_PARTITIONS]
+) {
+	unsigned int partition_count = pi.partition_count;
+	size_t texel_count = blk.texel_count;
+	promise(texel_count > 0);
+
+	// One partition spans the whole block, so the precomputed mean is the answer
+	if (partition_count == 1)
+	{
+		averages[0] = blk.data_mean.swz<0, 1, 2>();
+		return;
+	}
+
+	// Block-wide RGB total; the sum of the last partition is derived from it
+	vfloat4 block_sum = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
+
+	// Lanes whose texel index reaches this limit are excluded from every sum
+	const vint texel_limit = vint_from_size(texel_count);
+	vint lane_index = vint::lane_id();
+
+	// 2 partitions: scan partition 0, derive partition 1
+	if (partition_count == 2)
+	{
+		vfloatacc sums[3] {};
+
+		for (size_t base = 0; base < texel_count; base += ASTCENC_SIMD_WIDTH)
+		{
+			vint part_ids(pi.partition_of_texel + base);
+
+			vmask in_block = lane_index < texel_limit;
+			lane_index += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask in_p0 = in_block & (part_ids == vint(0));
+
+			vfloat red = loada(blk.data_r + base);
+			vfloat green = loada(blk.data_g + base);
+			vfloat blue = loada(blk.data_b + base);
+
+			haccumulate(sums[0], red, in_p0);
+			haccumulate(sums[1], green, in_p0);
+			haccumulate(sums[2], blue, in_p0);
+		}
+
+		vfloat4 sum_p0 = vfloat3(hadd_s(sums[0]),
+		                         hadd_s(sums[1]),
+		                         hadd_s(sums[2]));
+		vfloat4 sum_p1 = block_sum - sum_p0;
+
+		averages[0] = sum_p0 / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = sum_p1 / static_cast<float>(pi.partition_texel_count[1]);
+		return;
+	}
+
+	// 3 partitions: scan partitions 0 and 1, derive partition 2
+	if (partition_count == 3)
+	{
+		vfloatacc sums[2][3] {};
+
+		for (size_t base = 0; base < texel_count; base += ASTCENC_SIMD_WIDTH)
+		{
+			vint part_ids(pi.partition_of_texel + base);
+
+			vmask in_block = lane_index < texel_limit;
+			lane_index += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask in_p0 = in_block & (part_ids == vint(0));
+			vmask in_p1 = in_block & (part_ids == vint(1));
+
+			vfloat red = loada(blk.data_r + base);
+			vfloat green = loada(blk.data_g + base);
+			vfloat blue = loada(blk.data_b + base);
+
+			haccumulate(sums[0][0], red, in_p0);
+			haccumulate(sums[0][1], green, in_p0);
+			haccumulate(sums[0][2], blue, in_p0);
+
+			haccumulate(sums[1][0], red, in_p1);
+			haccumulate(sums[1][1], green, in_p1);
+			haccumulate(sums[1][2], blue, in_p1);
+		}
+
+		vfloat4 sum_p0 = vfloat3(hadd_s(sums[0][0]),
+		                         hadd_s(sums[0][1]),
+		                         hadd_s(sums[0][2]));
+
+		vfloat4 sum_p1 = vfloat3(hadd_s(sums[1][0]),
+		                         hadd_s(sums[1][1]),
+		                         hadd_s(sums[1][2]));
+
+		vfloat4 sum_p2 = block_sum - sum_p0 - sum_p1;
+
+		averages[0] = sum_p0 / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = sum_p1 / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = sum_p2 / static_cast<float>(pi.partition_texel_count[2]);
+		return;
+	}
+
+	// 4 partitions: scan partitions 0, 1 and 2, derive partition 3
+	vfloatacc sums[3][3] {};
+
+	for (size_t base = 0; base < texel_count; base += ASTCENC_SIMD_WIDTH)
+	{
+		vint part_ids(pi.partition_of_texel + base);
+
+		vmask in_block = lane_index < texel_limit;
+		lane_index += vint(ASTCENC_SIMD_WIDTH);
+
+		vmask in_p0 = in_block & (part_ids == vint(0));
+		vmask in_p1 = in_block & (part_ids == vint(1));
+		vmask in_p2 = in_block & (part_ids == vint(2));
+
+		vfloat red = loada(blk.data_r + base);
+		vfloat green = loada(blk.data_g + base);
+		vfloat blue = loada(blk.data_b + base);
+
+		haccumulate(sums[0][0], red, in_p0);
+		haccumulate(sums[0][1], green, in_p0);
+		haccumulate(sums[0][2], blue, in_p0);
+
+		haccumulate(sums[1][0], red, in_p1);
+		haccumulate(sums[1][1], green, in_p1);
+		haccumulate(sums[1][2], blue, in_p1);
+
+		haccumulate(sums[2][0], red, in_p2);
+		haccumulate(sums[2][1], green, in_p2);
+		haccumulate(sums[2][2], blue, in_p2);
+	}
+
+	vfloat4 sum_p0 = vfloat3(hadd_s(sums[0][0]),
+	                         hadd_s(sums[0][1]),
+	                         hadd_s(sums[0][2]));
+
+	vfloat4 sum_p1 = vfloat3(hadd_s(sums[1][0]),
+	                         hadd_s(sums[1][1]),
+	                         hadd_s(sums[1][2]));
+
+	vfloat4 sum_p2 = vfloat3(hadd_s(sums[2][0]),
+	                         hadd_s(sums[2][1]),
+	                         hadd_s(sums[2][2]));
+
+	vfloat4 sum_p3 = block_sum - sum_p0 - sum_p1 - sum_p2;
+
+	averages[0] = sum_p0 / static_cast<float>(pi.partition_texel_count[0]);
+	averages[1] = sum_p1 / static_cast<float>(pi.partition_texel_count[1]);
+	averages[2] = sum_p2 / static_cast<float>(pi.partition_texel_count[2]);
+	averages[3] = sum_p3 / static_cast<float>(pi.partition_texel_count[3]);
+}
+
+/**
+ * @brief Compute the mean RGBA color for every partition in a block.
+ *
+ * Texels are scanned sequentially in SIMD-width chunks. Each scanned partition
+ * owns one accumulator per color channel, and a per-lane mask restricts each
+ * accumulation to the texels that belong to that partition.
+ *
+ * Only the first N-1 partitions are summed by the scan. The total for the
+ * final partition is recovered by subtracting those sums from the block-wide
+ * total, which is rebuilt from the precomputed block mean.
+ *
+ * The scan always covers the whole block. Stopping at the highest texel index
+ * used by the first N-1 partitions would save iterations, but the erratic trip
+ * count causes more branch mispredictions and is a net loss.
+ *
+ * @param      pi         The partitioning to use.
+ * @param      blk        The block data to process.
+ * @param[out] averages   The output averages. Unused partition indices will
+ *                        not be initialized.
+ */
+static void compute_partition_averages_rgba(
+	const partition_info& pi,
+	const image_block& blk,
+	vfloat4 averages[BLOCK_MAX_PARTITIONS]
+) {
+	unsigned int partition_count = pi.partition_count;
+	size_t texel_count = blk.texel_count;
+	promise(texel_count > 0);
+
+	// One partition spans the whole block, so the precomputed mean is the answer
+	if (partition_count == 1)
+	{
+		averages[0] = blk.data_mean;
+		return;
+	}
+
+	// Block-wide RGBA total, rebuilt from the precomputed mean; the sum of the
+	// last (unscanned) partition is derived from it
+	vfloat4 block_sum = blk.data_mean * static_cast<float>(blk.texel_count);
+
+	// Lanes whose texel index reaches this limit are excluded from every sum;
+	// the lane index vector advances by one SIMD width per loop iteration
+	const vint texel_limit = vint_from_size(texel_count);
+	vint lane_index = vint::lane_id();
+
+	// 2 partitions: scan partition 0, derive partition 1
+	if (partition_count == 2)
+	{
+		vfloat4 sums[4] {};
+
+		for (size_t base = 0; base < texel_count; base += ASTCENC_SIMD_WIDTH)
+		{
+			vint part_ids(pi.partition_of_texel + base);
+
+			vmask in_block = lane_index < texel_limit;
+			lane_index += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask in_p0 = in_block & (part_ids == vint(0));
+
+			vfloat red = loada(blk.data_r + base);
+			vfloat green = loada(blk.data_g + base);
+			vfloat blue = loada(blk.data_b + base);
+			vfloat alpha = loada(blk.data_a + base);
+
+			haccumulate(sums[0], red, in_p0);
+			haccumulate(sums[1], green, in_p0);
+			haccumulate(sums[2], blue, in_p0);
+			haccumulate(sums[3], alpha, in_p0);
+		}
+
+		vfloat4 sum_p0 = vfloat4(hadd_s(sums[0]),
+		                         hadd_s(sums[1]),
+		                         hadd_s(sums[2]),
+		                         hadd_s(sums[3]));
+
+		vfloat4 sum_p1 = block_sum - sum_p0;
+
+		averages[0] = sum_p0 / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = sum_p1 / static_cast<float>(pi.partition_texel_count[1]);
+		return;
+	}
+
+	// 3 partitions: scan partitions 0 and 1, derive partition 2
+	if (partition_count == 3)
+	{
+		vfloat4 sums[2][4] {};
+
+		for (size_t base = 0; base < texel_count; base += ASTCENC_SIMD_WIDTH)
+		{
+			vint part_ids(pi.partition_of_texel + base);
+
+			vmask in_block = lane_index < texel_limit;
+			lane_index += vint(ASTCENC_SIMD_WIDTH);
+
+			vmask in_p0 = in_block & (part_ids == vint(0));
+			vmask in_p1 = in_block & (part_ids == vint(1));
+
+			vfloat red = loada(blk.data_r + base);
+			vfloat green = loada(blk.data_g + base);
+			vfloat blue = loada(blk.data_b + base);
+			vfloat alpha = loada(blk.data_a + base);
+
+			haccumulate(sums[0][0], red, in_p0);
+			haccumulate(sums[0][1], green, in_p0);
+			haccumulate(sums[0][2], blue, in_p0);
+			haccumulate(sums[0][3], alpha, in_p0);
+
+			haccumulate(sums[1][0], red, in_p1);
+			haccumulate(sums[1][1], green, in_p1);
+			haccumulate(sums[1][2], blue, in_p1);
+			haccumulate(sums[1][3], alpha, in_p1);
+		}
+
+		vfloat4 sum_p0 = vfloat4(hadd_s(sums[0][0]),
+		                         hadd_s(sums[0][1]),
+		                         hadd_s(sums[0][2]),
+		                         hadd_s(sums[0][3]));
+
+		vfloat4 sum_p1 = vfloat4(hadd_s(sums[1][0]),
+		                         hadd_s(sums[1][1]),
+		                         hadd_s(sums[1][2]),
+		                         hadd_s(sums[1][3]));
+
+		vfloat4 sum_p2 = block_sum - sum_p0 - sum_p1;
+
+		averages[0] = sum_p0 / static_cast<float>(pi.partition_texel_count[0]);
+		averages[1] = sum_p1 / static_cast<float>(pi.partition_texel_count[1]);
+		averages[2] = sum_p2 / static_cast<float>(pi.partition_texel_count[2]);
+		return;
+	}
+
+	// 4 partitions: scan partitions 0, 1 and 2, derive partition 3
+	vfloat4 sums[3][4] {};
+
+	for (size_t base = 0; base < texel_count; base += ASTCENC_SIMD_WIDTH)
+	{
+		vint part_ids(pi.partition_of_texel + base);
+
+		vmask in_block = lane_index < texel_limit;
+		lane_index += vint(ASTCENC_SIMD_WIDTH);
+
+		vmask in_p0 = in_block & (part_ids == vint(0));
+		vmask in_p1 = in_block & (part_ids == vint(1));
+		vmask in_p2 = in_block & (part_ids == vint(2));
+
+		vfloat red = loada(blk.data_r + base);
+		vfloat green = loada(blk.data_g + base);
+		vfloat blue = loada(blk.data_b + base);
+		vfloat alpha = loada(blk.data_a + base);
+
+		haccumulate(sums[0][0], red, in_p0);
+		haccumulate(sums[0][1], green, in_p0);
+		haccumulate(sums[0][2], blue, in_p0);
+		haccumulate(sums[0][3], alpha, in_p0);
+
+		haccumulate(sums[1][0], red, in_p1);
+		haccumulate(sums[1][1], green, in_p1);
+		haccumulate(sums[1][2], blue, in_p1);
+		haccumulate(sums[1][3], alpha, in_p1);
+
+		haccumulate(sums[2][0], red, in_p2);
+		haccumulate(sums[2][1], green, in_p2);
+		haccumulate(sums[2][2], blue, in_p2);
+		haccumulate(sums[2][3], alpha, in_p2);
+	}
+
+	vfloat4 sum_p0 = vfloat4(hadd_s(sums[0][0]),
+	                         hadd_s(sums[0][1]),
+	                         hadd_s(sums[0][2]),
+	                         hadd_s(sums[0][3]));
+
+	vfloat4 sum_p1 = vfloat4(hadd_s(sums[1][0]),
+	                         hadd_s(sums[1][1]),
+	                         hadd_s(sums[1][2]),
+	                         hadd_s(sums[1][3]));
+
+	vfloat4 sum_p2 = vfloat4(hadd_s(sums[2][0]),
+	                         hadd_s(sums[2][1]),
+	                         hadd_s(sums[2][2]),
+	                         hadd_s(sums[2][3]));
+
+	vfloat4 sum_p3 = block_sum - sum_p0 - sum_p1 - sum_p2;
+
+	averages[0] = sum_p0 / static_cast<float>(pi.partition_texel_count[0]);
+	averages[1] = sum_p1 / static_cast<float>(pi.partition_texel_count[1]);
+	averages[2] = sum_p2 / static_cast<float>(pi.partition_texel_count[2]);
+	averages[3] = sum_p3 / static_cast<float>(pi.partition_texel_count[3]);
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_4_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	size_t part_total = pi.partition_count;
+	promise(part_total > 0);
+
+	// The RGBA mean of every partition is computed up front
+	vfloat4 part_means[BLOCK_MAX_PARTITIONS];
+	compute_partition_averages_rgba(pi, blk, part_means);
+
+	vfloat4 zero = vfloat4::zero();
+
+	for (size_t part = 0; part < part_total; part++)
+	{
+		const uint8_t* members = pi.texels_of_partition[part];
+		size_t member_count = pi.partition_texel_count[part];
+		promise(member_count > 0);
+
+		vfloat4 mean = part_means[part];
+		pm[part].avg = mean;
+
+		// One accumulator per component; accumulator N only receives the
+		// mean-relative offset of texels selected by the "component N is
+		// above the mean" mask
+		vfloat4 acc_x = zero;
+		vfloat4 acc_y = zero;
+		vfloat4 acc_z = zero;
+		vfloat4 acc_w = zero;
+
+		for (size_t m = 0; m < member_count; m++)
+		{
+			unsigned int texel = members[m];
+			vfloat4 offset = blk.texel(texel);
+			offset = offset - mean;
+
+			vmask4 above_x = offset.swz<0,0,0,0>() > zero;
+			acc_x += select(zero, offset, above_x);
+
+			vmask4 above_y = offset.swz<1,1,1,1>() > zero;
+			acc_y += select(zero, offset, above_y);
+
+			vmask4 above_z = offset.swz<2,2,2,2>() > zero;
+			acc_z += select(zero, offset, above_z);
+
+			vmask4 above_w = offset.swz<3,3,3,3>() > zero;
+			acc_w += select(zero, offset, above_w);
+		}
+
+		// Squared length of each candidate direction
+		vfloat4 len_x = dot(acc_x, acc_x);
+		vfloat4 len_y = dot(acc_y, acc_y);
+		vfloat4 len_z = dot(acc_z, acc_z);
+		vfloat4 len_w = dot(acc_w, acc_w);
+
+		// Keep the longest candidate; on a tie the earlier one is retained
+		vfloat4 chosen_dir = acc_x;
+		vfloat4 chosen_len = len_x;
+
+		vmask4 longer = len_y > chosen_len;
+		chosen_dir = select(chosen_dir, acc_y, longer);
+		chosen_len = select(chosen_len, len_y, longer);
+
+		longer = len_z > chosen_len;
+		chosen_dir = select(chosen_dir, acc_z, longer);
+		chosen_len = select(chosen_len, len_z, longer);
+
+		longer = len_w > chosen_len;
+		chosen_dir = select(chosen_dir, acc_w, longer);
+
+		pm[part].dir = chosen_dir;
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_3_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	unsigned int omitted_component,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	// Start from the full RGBA means, then drop the omitted component below
+	vfloat4 part_means[BLOCK_MAX_PARTITIONS];
+	compute_partition_averages_rgba(pi, blk, part_means);
+
+	// The three retained channels; defaults match "alpha omitted"
+	const float* chan0 = blk.data_r;
+	const float* chan1 = blk.data_g;
+	const float* chan2 = blk.data_b;
+
+	// The swizzle is a compile-time template argument, so each case needs its
+	// own loop over the four mean slots
+	switch (omitted_component)
+	{
+	case 0:
+		for (size_t p = 0; p < 4; p++)
+		{
+			part_means[p] = part_means[p].swz<1, 2, 3>();
+		}
+
+		chan0 = blk.data_g;
+		chan1 = blk.data_b;
+		chan2 = blk.data_a;
+		break;
+	case 1:
+		for (size_t p = 0; p < 4; p++)
+		{
+			part_means[p] = part_means[p].swz<0, 2, 3>();
+		}
+
+		chan1 = blk.data_b;
+		chan2 = blk.data_a;
+		break;
+	case 2:
+		for (size_t p = 0; p < 4; p++)
+		{
+			part_means[p] = part_means[p].swz<0, 1, 3>();
+		}
+
+		chan2 = blk.data_a;
+		break;
+	default:
+		for (size_t p = 0; p < 4; p++)
+		{
+			part_means[p] = part_means[p].swz<0, 1, 2>();
+		}
+		break;
+	}
+
+	size_t part_total = pi.partition_count;
+	promise(part_total > 0);
+
+	vfloat4 zero = vfloat4::zero();
+
+	for (size_t part = 0; part < part_total; part++)
+	{
+		const uint8_t* members = pi.texels_of_partition[part];
+		size_t member_count = pi.partition_texel_count[part];
+		promise(member_count > 0);
+
+		vfloat4 mean = part_means[part];
+		pm[part].avg = mean;
+
+		// One accumulator per retained component
+		vfloat4 acc_x = zero;
+		vfloat4 acc_y = zero;
+		vfloat4 acc_z = zero;
+
+		for (size_t m = 0; m < member_count; m++)
+		{
+			unsigned int texel = members[m];
+
+			vfloat4 offset = vfloat3(chan0[texel],
+			                         chan1[texel],
+			                         chan2[texel]);
+			offset = offset - mean;
+
+			vmask4 above_x = offset.swz<0,0,0,0>() > zero;
+			acc_x += select(zero, offset, above_x);
+
+			vmask4 above_y = offset.swz<1,1,1,1>() > zero;
+			acc_y += select(zero, offset, above_y);
+
+			vmask4 above_z = offset.swz<2,2,2,2>() > zero;
+			acc_z += select(zero, offset, above_z);
+		}
+
+		// Squared length of each candidate direction
+		vfloat4 len_x = dot(acc_x, acc_x);
+		vfloat4 len_y = dot(acc_y, acc_y);
+		vfloat4 len_z = dot(acc_z, acc_z);
+
+		// Keep the longest candidate; on a tie the earlier one is retained
+		vfloat4 chosen_dir = acc_x;
+		vfloat4 chosen_len = len_x;
+
+		vmask4 longer = len_y > chosen_len;
+		chosen_dir = select(chosen_dir, acc_y, longer);
+		chosen_len = select(chosen_len, len_y, longer);
+
+		longer = len_z > chosen_len;
+		chosen_dir = select(chosen_dir, acc_z, longer);
+
+		pm[part].dir = chosen_dir;
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_3_comp_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	size_t part_total = pi.partition_count;
+	promise(part_total > 0);
+
+	// The RGB mean of every partition is computed up front
+	vfloat4 part_means[BLOCK_MAX_PARTITIONS];
+	compute_partition_averages_rgb(pi, blk, part_means);
+
+	vfloat4 zero = vfloat4::zero();
+
+	for (size_t part = 0; part < part_total; part++)
+	{
+		const uint8_t* members = pi.texels_of_partition[part];
+		size_t member_count = pi.partition_texel_count[part];
+		promise(member_count > 0);
+
+		vfloat4 mean = part_means[part];
+		pm[part].avg = mean;
+
+		// One accumulator per color component
+		vfloat4 acc_x = zero;
+		vfloat4 acc_y = zero;
+		vfloat4 acc_z = zero;
+
+		for (size_t m = 0; m < member_count; m++)
+		{
+			unsigned int texel = members[m];
+
+			vfloat4 offset = blk.texel3(texel);
+			offset = offset - mean;
+
+			vmask4 above_x = offset.swz<0,0,0,0>() > zero;
+			acc_x += select(zero, offset, above_x);
+
+			vmask4 above_y = offset.swz<1,1,1,1>() > zero;
+			acc_y += select(zero, offset, above_y);
+
+			vmask4 above_z = offset.swz<2,2,2,2>() > zero;
+			acc_z += select(zero, offset, above_z);
+		}
+
+		// Squared length of each candidate direction
+		vfloat4 len_x = dot(acc_x, acc_x);
+		vfloat4 len_y = dot(acc_y, acc_y);
+		vfloat4 len_z = dot(acc_z, acc_z);
+
+		// Keep the longest candidate; on a tie the earlier one is retained
+		vfloat4 chosen_dir = acc_x;
+		vfloat4 chosen_len = len_x;
+
+		vmask4 longer = len_y > chosen_len;
+		chosen_dir = select(chosen_dir, acc_y, longer);
+		chosen_len = select(chosen_len, len_y, longer);
+
+		longer = len_z > chosen_len;
+		chosen_dir = select(chosen_dir, acc_z, longer);
+
+		pm[part].dir = chosen_dir;
+	}
+}
+
+/* See header for documentation. */
+void compute_avgs_and_dirs_2_comp(
+	const partition_info& pt,
+	const image_block& blk,
+	unsigned int component1,
+	unsigned int component2,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]
+) {
+	// Whole-block mean of the two selected components; this is the value used
+	// when the block has a single partition
+	vfloat4 mean;
+
+	const float* chan0 = nullptr;
+	const float* chan1 = nullptr;
+
+	if (component1 == 0 && component2 == 1)
+	{
+		chan0 = blk.data_r;
+		chan1 = blk.data_g;
+
+		mean = blk.data_mean.swz<0, 1>();
+	}
+	else if (component1 == 0 && component2 == 2)
+	{
+		chan0 = blk.data_r;
+		chan1 = blk.data_b;
+
+		mean = blk.data_mean.swz<0, 2>();
+	}
+	else
+	{
+		// The only remaining supported pairing is green + blue
+		assert(component1 == 1 && component2 == 2);
+
+		chan0 = blk.data_g;
+		chan1 = blk.data_b;
+
+		mean = blk.data_mean.swz<1, 2>();
+	}
+
+	size_t part_total = pt.partition_count;
+	promise(part_total > 0);
+
+	vfloat4 zero = vfloat4::zero();
+
+	for (size_t part = 0; part < part_total; part++)
+	{
+		const uint8_t* members = pt.texels_of_partition[part];
+		size_t member_count = pt.partition_texel_count[part];
+		promise(member_count > 0);
+
+		// With several partitions each one needs its own mean; with one
+		// partition the block mean selected above is reused
+		if (part_total > 1)
+		{
+			vfloat4 total = vfloat4::zero();
+			for (size_t m = 0; m < member_count; m++)
+			{
+				unsigned int texel = members[m];
+				total += vfloat2(chan0[texel], chan1[texel]);
+			}
+
+			mean = total / static_cast<float>(member_count);
+		}
+
+		pm[part].avg = mean;
+
+		// One accumulator per selected component
+		vfloat4 acc_x = zero;
+		vfloat4 acc_y = zero;
+
+		for (size_t m = 0; m < member_count; m++)
+		{
+			unsigned int texel = members[m];
+			vfloat4 offset = vfloat2(chan0[texel], chan1[texel]);
+			offset = offset - mean;
+
+			vmask4 above_x = offset.swz<0,0,0,0>() > zero;
+			acc_x += select(zero, offset, above_x);
+
+			vmask4 above_y = offset.swz<1,1,1,1>() > zero;
+			acc_y += select(zero, offset, above_y);
+		}
+
+		// Squared length of each candidate direction
+		vfloat4 len_x = dot(acc_x, acc_x);
+		vfloat4 len_y = dot(acc_y, acc_y);
+
+		// Keep the longer candidate; on a tie the first one is retained
+		vmask4 longer = len_y > len_x;
+		pm[part].dir = select(acc_x, acc_y, longer);
+	}
+}
+
+/* See header for documentation. */
+void compute_error_squared_rgba(
+	const partition_info& pi,
+	const image_block& blk,
+	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
+	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
+	float line_lengths[BLOCK_MAX_PARTITIONS],
+	float& uncor_error,
+	float& samec_error
+) {
+	size_t part_total = pi.partition_count;
+	promise(part_total > 0);
+
+	// Error totals are accumulated across all partitions
+	vfloatacc uncor_total = vfloatacc::zero();
+	vfloatacc samec_total = vfloatacc::zero();
+
+	for (size_t part = 0; part < part_total; part++)
+	{
+		const uint8_t* members = pi.texels_of_partition[part];
+
+		processed_line4 line_u = uncor_plines[part];
+		processed_line4 line_s = samec_plines[part];
+
+		size_t member_count = pi.partition_texel_count[part];
+		promise(member_count > 0);
+
+		// Broadcast the per-line scalars across full-width vectors
+		vfloat u_bs_r(line_u.bs.lane<0>());
+		vfloat u_bs_g(line_u.bs.lane<1>());
+		vfloat u_bs_b(line_u.bs.lane<2>());
+		vfloat u_bs_a(line_u.bs.lane<3>());
+
+		vfloat u_amod_r(line_u.amod.lane<0>());
+		vfloat u_amod_g(line_u.amod.lane<1>());
+		vfloat u_amod_b(line_u.amod.lane<2>());
+		vfloat u_amod_a(line_u.amod.lane<3>());
+
+		vfloat s_bs_r(line_s.bs.lane<0>());
+		vfloat s_bs_g(line_s.bs.lane<1>());
+		vfloat s_bs_b(line_s.bs.lane<2>());
+		vfloat s_bs_a(line_s.bs.lane<3>());
+
+		// The same-chroma distance below has no amod term, so it must be zero
+		assert(all(line_s.amod == vfloat4(0.0f)));
+
+		// Running extent of the texel projections on the uncorrelated line
+		vfloat param_lo(1e10f);
+		vfloat param_hi(-1e10f);
+
+		vfloat wt_r(blk.channel_weight.lane<0>());
+		vfloat wt_g(blk.channel_weight.lane<1>());
+		vfloat wt_b(blk.channel_weight.lane<2>());
+		vfloat wt_a(blk.channel_weight.lane<3>());
+
+		// The final iteration may read past member_count. This is safe as the
+		// texel index array is initialized to extend the last value, so the
+		// min/max are unaffected, but the extra lanes must be masked out of
+		// the error sums.
+		vint lane_ids = vint::lane_id();
+		vint member_countv = vint_from_size(member_count);
+		for (size_t base = 0; base < member_count; base += ASTCENC_SIMD_WIDTH)
+		{
+			vmask valid = lane_ids < member_countv;
+			const uint8_t* idxs = members + base;
+
+			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, idxs);
+			vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, idxs);
+			vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, idxs);
+			vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, idxs);
+
+			// Uncorrelated line: projection parameter and its extent
+			vfloat u_param = (data_r * u_bs_r)
+			               + (data_g * u_bs_g)
+			               + (data_b * u_bs_b)
+			               + (data_a * u_bs_a);
+
+			param_lo = min(u_param, param_lo);
+			param_hi = max(u_param, param_hi);
+
+			vfloat u_dist_r = (u_amod_r - data_r)
+			                + (u_param * u_bs_r);
+			vfloat u_dist_g = (u_amod_g - data_g)
+			                + (u_param * u_bs_g);
+			vfloat u_dist_b = (u_amod_b - data_b)
+			                + (u_param * u_bs_b);
+			vfloat u_dist_a = (u_amod_a - data_a)
+			                + (u_param * u_bs_a);
+
+			vfloat u_err = (wt_r * u_dist_r * u_dist_r)
+			             + (wt_g * u_dist_g * u_dist_g)
+			             + (wt_b * u_dist_b * u_dist_b)
+			             + (wt_a * u_dist_a * u_dist_a);
+
+			haccumulate(uncor_total, u_err, valid);
+
+			// Same-chroma line
+			vfloat s_param = (data_r * s_bs_r)
+			               + (data_g * s_bs_g)
+			               + (data_b * s_bs_b)
+			               + (data_a * s_bs_a);
+
+			vfloat s_dist_r = s_param * s_bs_r - data_r;
+			vfloat s_dist_g = s_param * s_bs_g - data_g;
+			vfloat s_dist_b = s_param * s_bs_b - data_b;
+			vfloat s_dist_a = s_param * s_bs_a - data_a;
+
+			vfloat s_err = (wt_r * s_dist_r * s_dist_r)
+			             + (wt_g * s_dist_g * s_dist_g)
+			             + (wt_b * s_dist_b * s_dist_b)
+			             + (wt_a * s_dist_a * s_dist_a);
+
+			haccumulate(samec_total, s_err, valid);
+
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
+		}
+
+		// Turn very small numbers and NaNs into a small number
+		float extent = hmax_s(param_hi) - hmin_s(param_lo);
+		line_lengths[part] = astc::max(extent, 1e-7f);
+	}
+
+	uncor_error = hadd_s(uncor_total);
+	samec_error = hadd_s(samec_total);
+}
+
+/* See header for documentation. */
+void compute_error_squared_rgb(
+	const partition_info& pi,
+	const image_block& blk,
+	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
+	float& uncor_error,
+	float& samec_error
+) {
+	size_t part_total = pi.partition_count;
+	promise(part_total > 0);
+
+	// Error totals are accumulated across all partitions
+	vfloatacc uncor_total = vfloatacc::zero();
+	vfloatacc samec_total = vfloatacc::zero();
+
+	for (size_t part = 0; part < part_total; part++)
+	{
+		partition_lines3& lines = plines[part];
+		const uint8_t* members = pi.texels_of_partition[part];
+		size_t member_count = pi.partition_texel_count[part];
+		promise(member_count > 0);
+
+		processed_line3 line_u = lines.uncor_pline;
+		processed_line3 line_s = lines.samec_pline;
+
+		// Broadcast the per-line scalars across full-width vectors
+		vfloat u_bs_r(line_u.bs.lane<0>());
+		vfloat u_bs_g(line_u.bs.lane<1>());
+		vfloat u_bs_b(line_u.bs.lane<2>());
+
+		vfloat u_amod_r(line_u.amod.lane<0>());
+		vfloat u_amod_g(line_u.amod.lane<1>());
+		vfloat u_amod_b(line_u.amod.lane<2>());
+
+		vfloat s_bs_r(line_s.bs.lane<0>());
+		vfloat s_bs_g(line_s.bs.lane<1>());
+		vfloat s_bs_b(line_s.bs.lane<2>());
+
+		// The same-chroma distance below has no amod term, so it must be zero
+		assert(all(line_s.amod == vfloat4(0.0f)));
+
+		// Running extent of the texel projections on the uncorrelated line
+		vfloat param_lo(1e10f);
+		vfloat param_hi(-1e10f);
+
+		vfloat wt_r(blk.channel_weight.lane<0>());
+		vfloat wt_g(blk.channel_weight.lane<1>());
+		vfloat wt_b(blk.channel_weight.lane<2>());
+
+		// The final iteration may read past member_count. This is safe as the
+		// texel index array is initialized to extend the last value, so the
+		// min/max are unaffected, but the extra lanes must be masked out of
+		// the error sums.
+		vint lane_ids = vint::lane_id();
+		vint member_countv = vint_from_size(member_count);
+		for (size_t base = 0; base < member_count; base += ASTCENC_SIMD_WIDTH)
+		{
+			vmask valid = lane_ids < member_countv;
+			const uint8_t* idxs = members + base;
+
+			vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, idxs);
+			vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, idxs);
+			vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, idxs);
+
+			// Uncorrelated line: projection parameter and its extent
+			vfloat u_param = (data_r * u_bs_r)
+			               + (data_g * u_bs_g)
+			               + (data_b * u_bs_b);
+
+			param_lo = min(u_param, param_lo);
+			param_hi = max(u_param, param_hi);
+
+			vfloat u_dist_r = (u_amod_r - data_r)
+			                + (u_param * u_bs_r);
+			vfloat u_dist_g = (u_amod_g - data_g)
+			                + (u_param * u_bs_g);
+			vfloat u_dist_b = (u_amod_b - data_b)
+			                + (u_param * u_bs_b);
+
+			vfloat u_err = (wt_r * u_dist_r * u_dist_r)
+			             + (wt_g * u_dist_g * u_dist_g)
+			             + (wt_b * u_dist_b * u_dist_b);
+
+			haccumulate(uncor_total, u_err, valid);
+
+			// Same-chroma line
+			vfloat s_param = (data_r * s_bs_r)
+			               + (data_g * s_bs_g)
+			               + (data_b * s_bs_b);
+
+			vfloat s_dist_r = s_param * s_bs_r - data_r;
+			vfloat s_dist_g = s_param * s_bs_g - data_g;
+			vfloat s_dist_b = s_param * s_bs_b - data_b;
+
+			vfloat s_err = (wt_r * s_dist_r * s_dist_r)
+			             + (wt_g * s_dist_g * s_dist_g)
+			             + (wt_b * s_dist_b * s_dist_b);
+
+			haccumulate(samec_total, s_err, valid);
+
+			lane_ids += vint(ASTCENC_SIMD_WIDTH);
+		}
+
+		// Turn very small numbers and NaNs into a small number
+		float extent = hmax_s(param_hi) - hmin_s(param_lo);
+		lines.line_length = astc::max(extent, 1e-7f);
+	}
+
+	uncor_error = hadd_s(uncor_total);
+	samec_error = hadd_s(samec_total);
+}
+
+#endif
@@ -0,0 +1,941 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#include <utility>
+
+/**
+ * @brief Functions for color unquantization.
+ */
+
+#include "astcenc_internal.h"
+
+/**
+ * @brief Undo blue contraction on a color.
+ *
+ * The first two lanes are replaced by (lane + lane 2) arithmetically shifted
+ * right by one; the last two lanes are returned unchanged.
+ *
+ * @param input   The blue-contracted input color.
+ *
+ * @return The color with the contraction reversed.
+ */
+static ASTCENC_SIMD_INLINE vint4 uncontract_color(
+	vint4 input
+) {
+	vint4 averaged = asr<1>(input + input.lane<2>());
+
+	// Only lanes 0 and 1 take the averaged value
+	vmask4 first_two(true, true, false, false);
+	return select(input, averaged, first_two);
+}
+
+void rgba_delta_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	// Apply bit transfer between the delta (input1) and the base (input0)
+	bit_transfer_signed(input1, input0);
+
+	// The contraction test uses the RGB sum of the delta alone, so it has to
+	// be sampled before the base color is added in
+	bool contracted = hadd_rgb_s(input1) < 0;
+	input1 = input1 + input0;
+
+	if (contracted)
+	{
+		// Blue-uncontract both endpoints and exchange them
+		vint4 first = uncontract_color(input1);
+		vint4 second = uncontract_color(input0);
+		input0 = first;
+		input1 = second;
+	}
+
+	output0 = clamp(0, 255, input0);
+	output1 = clamp(0, 255, input1);
+}
+
+/**
+ * @brief Unpack a delta-encoded LDR RGB endpoint pair.
+ *
+ * Decoding is delegated to the RGBA delta unpacker, after which the alpha lane
+ * of both outputs is forced to 255.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color deltas.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_delta_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	rgba_delta_unpack(input0, input1, output0, output1);
+
+	// This format carries no alpha, so overwrite whatever was decoded
+	output1.set_lane<3>(255);
+	output0.set_lane<3>(255);
+}
+
+void rgba_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	// Endpoints already in order are passed straight through
+	bool contracted = hadd_rgb_s(input0) > hadd_rgb_s(input1);
+	if (!contracted)
+	{
+		output0 = input0;
+		output1 = input1;
+		return;
+	}
+
+	// Otherwise blue-uncontract both endpoints and exchange them
+	output0 = uncontract_color(input1);
+	output1 = uncontract_color(input0);
+}
+
+/**
+ * @brief Unpack a directly-encoded LDR RGB endpoint pair.
+ *
+ * Decoding is delegated to the RGBA unpacker, after which the alpha lane of
+ * both outputs is forced to 255.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      input1    The packed endpoint 1 color.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_unpack(
+	vint4 input0,
+	vint4 input1,
+	vint4& output0,
+	vint4& output1
+) {
+	rgba_unpack(input0, input1, output0, output1);
+
+	// This format carries no alpha, so overwrite whatever was decoded
+	output1.set_lane<3>(255);
+	output0.set_lane<3>(255);
+}
+
+/**
+ * @brief Unpack a scale-encoded LDR RGBA endpoint pair.
+ *
+ * Endpoint 1 is the stored color with its alpha replaced by @c alpha1.
+ * Endpoint 0 is the stored color multiplied by @c scale and arithmetically
+ * shifted right by 8, with its alpha taken unscaled from the stored color.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      alpha1    The packed endpoint 1 alpha value.
+ * @param      scale     The packed quantized scale.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_scale_alpha_unpack(
+	vint4 input0,
+	uint8_t alpha1,
+	uint8_t scale,
+	vint4& output0,
+	vint4& output1
+) {
+	// Scaled endpoint; alpha is restored because it is not scaled
+	vint4 scaled = asr<8>(input0 * scale);
+	scaled.set_lane<3>(input0.lane<3>());
+
+	// Unscaled endpoint; only the alpha comes from elsewhere
+	vint4 unscaled = input0;
+	unscaled.set_lane<3>(alpha1);
+
+	output1 = unscaled;
+	output0 = scaled;
+}
+
+/**
+ * @brief Unpack a scale-encoded LDR RGB endpoint pair.
+ *
+ * Endpoint 1 is the stored color; endpoint 0 is the stored color multiplied by
+ * @c scale and arithmetically shifted right by 8. Both get an alpha of 255.
+ *
+ * @param      input0    The packed endpoint 0 color.
+ * @param      scale     The packed scale.
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void rgb_scale_unpack(
+	vint4 input0,
+	int scale,
+	vint4& output0,
+	vint4& output1
+) {
+	vint4 scaled = asr<8>(input0 * scale);
+	scaled.set_lane<3>(255);
+
+	vint4 unscaled = input0;
+	unscaled.set_lane<3>(255);
+
+	output1 = unscaled;
+	output0 = scaled;
+}
+
+/**
+ * @brief Unpack a directly-encoded LDR luminance endpoint pair.
+ *
+ * Each luminance value is replicated across the three color lanes and the
+ * alpha lane is set to 255.
+ *
+ * @param      input     The packed endpoints (L0, L1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	int first = input[0];
+	output0 = vint4(first, first, first, 255);
+
+	int second = input[1];
+	output1 = vint4(second, second, second, 255);
+}
+
+/**
+ * @brief Unpack a delta-encoded LDR luminance endpoint pair.
+ *
+ * Endpoint 0 combines the top six bits of the first byte with the top two bits
+ * of the second byte. Endpoint 1 adds the low six bits of the second byte to
+ * endpoint 0, saturating at 255. Output alpha is 255.
+ *
+ * @param      input     The packed endpoints (L0, L1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_delta_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	int packed0 = input[0];
+	int packed1 = input[1];
+
+	int base = (packed0 >> 2) | (packed1 & 0xC0);
+	int delta = packed1 & 0x3F;
+	int top = astc::min(base + delta, 255);
+
+	output0 = vint4(base, base, base, 255);
+	output1 = vint4(top, top, top, 255);
+}
+
+/**
+ * @brief Unpack a directly-encoded LDR luminance + alpha endpoint pair.
+ *
+ * Each luminance value is replicated across the three color lanes and paired
+ * with its own alpha value.
+ *
+ * @param      input     The packed endpoints (L0, L1, A0, A1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_alpha_unpack(
+	const uint8_t input[4],
+	vint4& output0,
+	vint4& output1
+) {
+	int l_first = input[0];
+	int a_first = input[2];
+	output0 = vint4(l_first, l_first, l_first, a_first);
+
+	int l_second = input[1];
+	int a_second = input[3];
+	output1 = vint4(l_second, l_second, l_second, a_second);
+}
+
+/**
+ * @brief Unpack a delta-encoded LDR luminance + alpha endpoint pair.
+ *
+ * For each of the two channels the top bit of the delta byte is moved on top
+ * of the base byte, the remaining seven delta bits are sign-extended from
+ * bit 6, both values are halved, and the delta is added to the base. Endpoint
+ * 1 values are clamped to [0, 255].
+ *
+ * @param      input     The packed endpoints (L0, L1, A0, A1).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void luminance_alpha_delta_unpack(
+	const uint8_t input[4],
+	vint4& output0,
+	vint4& output1
+) {
+	int lum0 = input[0];
+	int lum1 = input[1];
+	int alpha0 = input[2];
+	int alpha1 = input[3];
+
+	// Luminance channel
+	lum0 |= (lum1 & 0x80) << 1;
+	lum1 &= 0x7F;
+	if (lum1 & 0x40)
+	{
+		lum1 -= 0x80;
+	}
+
+	lum0 >>= 1;
+	lum1 >>= 1;
+	lum1 = astc::clamp(lum1 + lum0, 0, 255);
+
+	// Alpha channel, decoded the same way
+	alpha0 |= (alpha1 & 0x80) << 1;
+	alpha1 &= 0x7F;
+	if (alpha1 & 0x40)
+	{
+		alpha1 -= 0x80;
+	}
+
+	alpha0 >>= 1;
+	alpha1 >>= 1;
+	alpha1 = astc::clamp(alpha1 + alpha0, 0, 255);
+
+	output0 = vint4(lum0, lum0, lum0, alpha0);
+	output1 = vint4(lum1, lum1, lum1, alpha1);
+}
+
+/**
+ * @brief Unpack an HDR RGB + offset encoding.
+ *
+ * The four bytes hold a mode, a major component selector, and red, green,
+ * blue and scale fields whose widths depend on the mode. Endpoint 1 is the
+ * decoded color; endpoint 0 is that color with the scale subtracted from each
+ * component. Both outputs have their alpha lane set to 0x7800.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgbo_unpack(
+	const uint8_t input[4],
+	vint4& output0,
+	vint4& output1
+) {
+	int v0 = input[0];
+	int v1 = input[1];
+	int v2 = input[2];
+	int v3 = input[3];
+
+	// Four mode bits gathered from the top of the first three bytes
+	int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
+
+	int majcomp = 0;
+	int mode = 5;
+	if ((modeval & 0xC) != 0xC)
+	{
+		majcomp = modeval >> 2;
+		mode = modeval & 3;
+	}
+	else if (modeval != 0xF)
+	{
+		majcomp = modeval & 3;
+		mode = 4;
+	}
+
+	// Fixed-placement low bits of each field
+	int red = v0 & 0x3F;
+	int green = v1 & 0x1F;
+	int blue = v2 & 0x1F;
+	int scale = v3 & 0x1F;
+
+	// Bits whose destination depends on the mode
+	int bit0 = (v1 >> 6) & 1;
+	int bit1 = (v1 >> 5) & 1;
+	int bit2 = (v2 >> 6) & 1;
+	int bit3 = (v2 >> 5) & 1;
+	int bit4 = (v3 >> 7) & 1;
+	int bit5 = (v3 >> 6) & 1;
+	int bit6 = (v3 >> 5) & 1;
+
+	// One-hot mode, so each test below is a "mode is in this set" check
+	int ohcomp = 1 << mode;
+
+	if (ohcomp & 0x30)
+	{
+		green |= bit0 << 6;
+		blue |= bit2 << 6;
+	}
+
+	if (ohcomp & 0x3A)
+	{
+		green |= bit1 << 5;
+		blue |= bit3 << 5;
+	}
+
+	if (ohcomp & 0x3D)
+	{
+		scale |= bit6 << 5;
+	}
+
+	if (ohcomp & 0x2D)
+	{
+		scale |= bit5 << 6;
+	}
+
+	if (ohcomp & 0x04)
+	{
+		scale |= bit4 << 7;
+		red |= bit3 << 6;
+	}
+
+	if (ohcomp & 0x3B)
+	{
+		red |= bit4 << 6;
+	}
+
+	if (ohcomp & 0x10)
+	{
+		red |= bit5 << 7;
+	}
+
+	if (ohcomp & 0x0F)
+	{
+		red |= bit2 << 7;
+	}
+
+	if (ohcomp & 0x05)
+	{
+		red |= bit1 << 8;
+		red |= bit0 << 9;
+	}
+
+	if (ohcomp & 0x0A)
+	{
+		red |= bit0 << 8;
+	}
+
+	if (ohcomp & 0x02)
+	{
+		red |= bit6 << 9;
+		red |= bit5 << 10;
+	}
+
+	if (ohcomp & 0x01)
+	{
+		red |= bit3 << 10;
+	}
+
+	// Expand to 12 bits
+	static const int shamts[6] { 1, 1, 2, 3, 4, 5 };
+	int shamt = shamts[mode];
+	red <<= shamt;
+	green <<= shamt;
+	blue <<= shamt;
+	scale <<= shamt;
+
+	// In modes 0 to 4 the stored green and blue are differences from red,
+	// not absolute values
+	if (mode != 5)
+	{
+		green = red - green;
+		blue = red - blue;
+	}
+
+	// Move the major component into place
+	switch (majcomp)
+	{
+	case 1:
+		std::swap(red, green);
+		break;
+	case 2:
+		std::swap(red, blue);
+		break;
+	default:
+		break;
+	}
+
+	// Endpoint 0 is derived from the color before it is clamped
+	int red0 = red - scale;
+	int green0 = green - scale;
+	int blue0 = blue - scale;
+
+	// Negative components are raised to zero
+	red = red < 0 ? 0 : red;
+	green = green < 0 ? 0 : green;
+	blue = blue < 0 ? 0 : blue;
+
+	red0 = red0 < 0 ? 0 : red0;
+	green0 = green0 < 0 ? 0 : green0;
+	blue0 = blue0 < 0 ? 0 : blue0;
+
+	output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
+	output1 = vint4(red << 4, green << 4, blue << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR RGB direct encoding.
+ *
+ * The six bytes hold a 3-bit mode, a 2-bit major component selector, and the
+ * fields a, b0, b1, c, d0 and d1 whose widths depend on the mode. When the
+ * major component selector is 3 the bytes are expanded directly; otherwise
+ * the endpoints are reconstructed from the fields, clamped to [0, 4095], and
+ * the major component is swapped into place. Both outputs have their alpha
+ * lane set to 0x7800.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgb_unpack(
+	const uint8_t input[6],
+	vint4& output0,
+	vint4& output1
+) {
+
+	int v0 = input[0];
+	int v1 = input[1];
+	int v2 = input[2];
+	int v3 = input[3];
+	int v4 = input[4];
+	int v5 = input[5];
+
+	// extract all the fixed-placement bitfields
+	int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
+
+	int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1);
+
+	// Selector 3 is the direct form: no fields to reassemble
+	if (majcomp == 3)
+	{
+		output0 = vint4(v0 << 8, v2 << 8, (v4 & 0x7F) << 9, 0x7800);
+		output1 = vint4(v1 << 8, v3 << 8, (v5 & 0x7F) << 9, 0x7800);
+		return;
+	}
+
+	int a = v0 | ((v1 & 0x40) << 2);
+	int b0 = v2 & 0x3f;
+	int b1 = v3 & 0x3f;
+	int c = v1 & 0x3f;
+	int d0 = v4 & 0x7f;
+	int d1 = v5 & 0x7f;
+
+	// get hold of the number of bits in 'd0' and 'd1'
+	static const int dbits_tab[8] { 7, 6, 7, 6, 5, 6, 5, 6 };
+	int dbits = dbits_tab[modeval];
+
+	// extract six variable-placement bits
+	int bit0 = (v2 >> 6) & 1;
+	int bit1 = (v3 >> 6) & 1;
+	int bit2 = (v4 >> 6) & 1;
+	int bit3 = (v5 >> 6) & 1;
+	int bit4 = (v4 >> 5) & 1;
+	int bit5 = (v5 >> 5) & 1;
+
+	// and prepend the variable-placement bits depending on mode.
+	int ohmod = 1 << modeval;	// one-hot-mode
+	if (ohmod & 0xA4)
+	{
+		a |= bit0 << 9;
+	}
+	if (ohmod & 0x8)
+	{
+		a |= bit2 << 9;
+	}
+	if (ohmod & 0x50)
+	{
+		a |= bit4 << 9;
+	}
+
+	if (ohmod & 0x50)
+	{
+		a |= bit5 << 10;
+	}
+	if (ohmod & 0xA0)
+	{
+		a |= bit1 << 10;
+	}
+
+	if (ohmod & 0xC0)
+	{
+		a |= bit2 << 11;
+	}
+
+	if (ohmod & 0x4)
+	{
+		c |= bit1 << 6;
+	}
+	if (ohmod & 0xE8)
+	{
+		c |= bit3 << 6;
+	}
+
+	if (ohmod & 0x20)
+	{
+		c |= bit2 << 7;
+	}
+
+	if (ohmod & 0x5B)
+	{
+		b0 |= bit0 << 6;
+		b1 |= bit1 << 6;
+	}
+
+	if (ohmod & 0x12)
+	{
+		b0 |= bit2 << 7;
+		b1 |= bit3 << 7;
+	}
+
+	if (ohmod & 0xAF)
+	{
+		d0 |= bit4 << 5;
+		d1 |= bit5 << 5;
+	}
+
+	if (ohmod & 0x5)
+	{
+		d0 |= bit2 << 6;
+		d1 |= bit3 << 6;
+	}
+
+	// sign-extend 'd0' and 'd1' from their 'dbits'-wide field. Bits above the
+	// field are discarded first, then the field's top bit is propagated with
+	// the xor/subtract identity. This avoids shifting into the sign bit and
+	// does not depend on how the platform right-shifts negative values.
+	int d_mask = (1 << dbits) - 1;
+	int d_sign = 1 << (dbits - 1);
+	d0 = ((d0 & d_mask) ^ d_sign) - d_sign;
+	d1 = ((d1 & d_mask) ^ d_sign) - d_sign;
+
+	// expand all values to 12 bits. 'd0' and 'd1' may be negative at this
+	// point, so they are scaled by multiplication rather than by left-shifting
+	// a negative value.
+	int val_shamt = (modeval >> 1) ^ 3;
+	int val_scale = 1 << val_shamt;
+	a <<= val_shamt;
+	b0 <<= val_shamt;
+	b1 <<= val_shamt;
+	c <<= val_shamt;
+	d0 *= val_scale;
+	d1 *= val_scale;
+
+	// then compute the actual color values.
+	int red1 = a;
+	int green1 = a - b0;
+	int blue1 = a - b1;
+	int red0 = a - c;
+	int green0 = a - b0 - c - d0;
+	int blue0 = a - b1 - c - d1;
+
+	// clamp the color components to [0,2^12 - 1]
+	red0 = astc::clamp(red0, 0, 4095);
+	green0 = astc::clamp(green0, 0, 4095);
+	blue0 = astc::clamp(blue0, 0, 4095);
+
+	red1 = astc::clamp(red1, 0, 4095);
+	green1 = astc::clamp(green1, 0, 4095);
+	blue1 = astc::clamp(blue1, 0, 4095);
+
+	// switch around the color components
+	switch (majcomp)
+	{
+	case 1:					// switch around red and green
+		std::swap(red0, green0);
+		std::swap(red1, green1);
+		break;
+	case 2:					// switch around red and blue
+		std::swap(red0, blue0);
+		std::swap(red1, blue1);
+		break;
+	default:				// 0: no switch (3 returned early above)
+		break;
+	}
+
+	output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
+	output1 = vint4(red1 << 4, green1 << 4, blue1 << 4, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR RGB + LDR A direct encoding.
+ *
+ * The first six bytes are decoded by the HDR RGB unpacker; bytes 6 and 7 then
+ * replace the alpha lane of endpoint 0 and endpoint 1 respectively.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgb_ldr_alpha_unpack(
+	const uint8_t input[8],
+	vint4& output0,
+	vint4& output1
+) {
+	hdr_rgb_unpack(input, output0, output1);
+
+	// Alpha is stored verbatim after the six color bytes
+	int alpha0 = input[6];
+	output0.set_lane<3>(alpha0);
+
+	int alpha1 = input[7];
+	output1.set_lane<3>(alpha1);
+}
+
+/**
+ * @brief Unpack an HDR L (small range) direct encoding.
+ *
+ * Bit 7 of the first byte picks one of two packings of a 12-bit base luminance plus a small
+ * unsigned delta. Endpoint 0 is the base; endpoint 1 is base + delta saturated to 0xFFF. Both
+ * are written to the RGB lanes shifted left by 4, with lane 3 set to 0x7800.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_luminance_small_range_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	const int byte0 = input[0];
+	const int byte1 = input[1];
+
+	// Low seven bits of byte 0 are payload in both layouts
+	const int payload0 = byte0 & 0x7F;
+
+	int base;
+	int delta;
+	if ((byte0 & 0x80) != 0)
+	{
+		// Layout A: 3 high bits + 7 low bits (x4), 5-bit delta (x4)
+		base = ((byte1 & 0xE0) << 4) | (payload0 << 2);
+		delta = (byte1 & 0x1F) << 2;
+	}
+	else
+	{
+		// Layout B: 4 high bits + 7 low bits (x2), 4-bit delta (x2)
+		base = ((byte1 & 0xF0) << 4) | (payload0 << 1);
+		delta = (byte1 & 0xF) << 1;
+	}
+
+	// Endpoint 1 saturates at the 12-bit maximum
+	int top = base + delta;
+	top = (top > 0xFFF) ? 0xFFF : top;
+
+	const int lum0 = base << 4;
+	const int lum1 = top << 4;
+	output0 = vint4(lum0, lum0, lum0, 0x7800);
+	output1 = vint4(lum1, lum1, lum1, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR L (large range) direct encoding.
+ *
+ * The relative order of the two stored bytes selects the decode: when the second byte is not
+ * smaller than the first, both are simply scaled by 16; otherwise the bytes swap roles and a
+ * +8 / -8 bias is applied. Results are written to the RGB lanes shifted left by a further 4,
+ * with lane 3 set to 0x7800.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_luminance_large_range_unpack(
+	const uint8_t input[2],
+	vint4& output0,
+	vint4& output1
+) {
+	const int first = input[0];
+	const int second = input[1];
+	const bool in_order = (second >= first);
+
+	// In-order pairs decode directly; out-of-order pairs swap and bias by half a step
+	const int lum0 = in_order ? (first << 4) : ((second << 4) + 8);
+	const int lum1 = in_order ? (second << 4) : ((first << 4) - 8);
+
+	const int wide0 = lum0 << 4;
+	const int wide1 = lum1 << 4;
+	output0 = vint4(wide0, wide0, wide0, 0x7800);
+	output1 = vint4(wide1, wide1, wide1, 0x7800);
+}
+
+/**
+ * @brief Unpack an HDR A direct encoding.
+ *
+ * A 2-bit selector is assembled from the top bit of each input byte. Selector 3 stores two
+ * independent 7-bit values; selectors 0-2 store a base value plus a signed offset whose widths
+ * depend on the selector. The second endpoint is clamped to [0, 0xFFF] in the offset modes, and
+ * both results are returned shifted left by 4.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 alpha.
+ * @param[out] output1   The unpacked endpoint 1 alpha.
+ */
+static void hdr_alpha_unpack(
+	const uint8_t input[2],
+	int& output0,
+	int& output1
+) {
+
+	int v6 = input[0];
+	int v7 = input[1];
+
+	// Bit 7 of each byte contributes one bit of the mode selector
+	int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
+	v6 &= 0x7F;
+	v7 &= 0x7F;
+	if (selector == 3)
+	{
+		// Two independent 7-bit values, expanded to 12 bits
+		output0 = v6 << 5;
+		output1 = v7 << 5;
+	}
+	else
+	{
+		// Move the high bits of v7 into the base value, leaving the offset in v7
+		v6 |= (v7 << (selector + 1)) & 0x780;
+		v7 &= (0x3f >> selector);
+
+		// Sign-extend the offset from its top remaining bit
+		v7 ^= 32 >> selector;
+		v7 -= 32 >> selector;
+
+		// Scale both values to 12 bits. The offset can be negative at this point, and
+		// left-shifting a negative signed value is undefined behavior before C++20, so the
+		// offset is scaled by multiplication instead of a shift.
+		int scale_shift = 4 - selector;
+		v6 <<= scale_shift;
+		v7 *= (1 << scale_shift);
+		v7 += v6;
+
+		if (v7 < 0)
+		{
+			v7 = 0;
+		}
+		else if (v7 > 0xFFF)
+		{
+			v7 = 0xFFF;
+		}
+
+		output0 = v6;
+		output1 = v7;
+	}
+
+	output0 <<= 4;
+	output1 <<= 4;
+}
+
+/**
+ * @brief Unpack an HDR RGBA direct encoding.
+ *
+ * The first six bytes are decoded by @c hdr_rgb_unpack() and the last two by
+ * @c hdr_alpha_unpack(); the decoded alpha pair is written to lane 3 of the outputs.
+ *
+ * @param      input     The packed endpoints (packed and modal).
+ * @param[out] output0   The unpacked endpoint 0 color.
+ * @param[out] output1   The unpacked endpoint 1 color.
+ */
+static void hdr_rgb_hdr_alpha_unpack(
+	const uint8_t input[8],
+	vint4& output0,
+	vint4& output1
+) {
+	// Decode the HDR alpha pair stored after the six color bytes
+	int hdr_alpha0;
+	int hdr_alpha1;
+	hdr_alpha_unpack(&input[6], hdr_alpha0, hdr_alpha1);
+
+	// Decode the color lanes, then overwrite the alpha lane
+	hdr_rgb_unpack(input, output0, output1);
+	output0.set_lane<3>(hdr_alpha0);
+	output1.set_lane<3>(hdr_alpha1);
+}
+
+/* See header for documentation. */
+/*
+ * Dispatches on the endpoint format to the matching unpack routine, then post-processes the
+ * result for the requested decode profile:
+ *
+ *  - Formats with no stored alpha that decode as HDR get a default alpha (HDR 0x7800 for the
+ *    full HDR profile, LDR 0xFF otherwise).
+ *  - For the two LDR profiles any HDR endpoint is replaced by the magenta error color.
+ *  - 8-bit LDR lanes are widened to 16 bits; HDR lanes are passed through unscaled.
+ *
+ * On return rgb_hdr / alpha_hdr report whether the RGB lanes / alpha lane hold HDR values.
+ */
+void unpack_color_endpoints(
+	astcenc_profile decode_mode,
+	int format,
+	const uint8_t* input,
+	bool& rgb_hdr,
+	bool& alpha_hdr,
+	vint4& output0,
+	vint4& output1
+) {
+	// Assume no NaNs and LDR endpoints unless set later
+	rgb_hdr = false;
+	alpha_hdr = false;
+
+	// Set for HDR formats that carry no alpha of their own
+	bool alpha_hdr_default = false;
+
+	switch (format)
+	{
+	case FMT_LUMINANCE:
+		luminance_unpack(input, output0, output1);
+		break;
+
+	case FMT_LUMINANCE_DELTA:
+		luminance_delta_unpack(input, output0, output1);
+		break;
+
+	case FMT_HDR_LUMINANCE_SMALL_RANGE:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_luminance_small_range_unpack(input, output0, output1);
+		break;
+
+	case FMT_HDR_LUMINANCE_LARGE_RANGE:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_luminance_large_range_unpack(input, output0, output1);
+		break;
+
+	case FMT_LUMINANCE_ALPHA:
+		luminance_alpha_unpack(input, output0, output1);
+		break;
+
+	case FMT_LUMINANCE_ALPHA_DELTA:
+		luminance_alpha_delta_unpack(input, output0, output1);
+		break;
+
+	case FMT_RGB_SCALE:
+		{
+			vint4 input0q(input[0], input[1], input[2], 0);
+			uint8_t scale = input[3];
+			rgb_scale_unpack(input0q, scale, output0, output1);
+		}
+		break;
+
+	case FMT_RGB_SCALE_ALPHA:
+		{
+			// Byte 3 is the scale; bytes 4 and 5 are the two alpha values
+			vint4 input0q(input[0], input[1], input[2], input[4]);
+			uint8_t alpha1q = input[5];
+			uint8_t scaleq = input[3];
+			rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1);
+		}
+		break;
+
+	case FMT_HDR_RGB_SCALE:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_rgbo_unpack(input, output0, output1);
+		break;
+
+	case FMT_RGB:
+		{
+			// Endpoint components are interleaved: even bytes = endpoint 0, odd = endpoint 1
+			vint4 input0q(input[0], input[2], input[4], 0);
+			vint4 input1q(input[1], input[3], input[5], 0);
+			rgb_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_RGB_DELTA:
+		{
+			vint4 input0q(input[0], input[2], input[4], 0);
+			vint4 input1q(input[1], input[3], input[5], 0);
+			rgb_delta_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_HDR_RGB:
+		rgb_hdr = true;
+		alpha_hdr_default = true;
+		hdr_rgb_unpack(input, output0, output1);
+		break;
+
+	case FMT_RGBA:
+		{
+			vint4 input0q(input[0], input[2], input[4], input[6]);
+			vint4 input1q(input[1], input[3], input[5], input[7]);
+			rgba_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_RGBA_DELTA:
+		{
+			vint4 input0q(input[0], input[2], input[4], input[6]);
+			vint4 input1q(input[1], input[3], input[5], input[7]);
+			rgba_delta_unpack(input0q, input1q, output0, output1);
+		}
+		break;
+
+	case FMT_HDR_RGB_LDR_ALPHA:
+		rgb_hdr = true;
+		hdr_rgb_ldr_alpha_unpack(input, output0, output1);
+		break;
+
+	case FMT_HDR_RGBA:
+		rgb_hdr = true;
+		alpha_hdr = true;
+		hdr_rgb_hdr_alpha_unpack(input, output0, output1);
+		break;
+	}
+
+	// Assign a correct default alpha
+	if (alpha_hdr_default)
+	{
+		if (decode_mode == ASTCENC_PRF_HDR)
+		{
+			output0.set_lane<3>(0x7800);
+			output1.set_lane<3>(0x7800);
+			alpha_hdr = true;
+		}
+		else
+		{
+			output0.set_lane<3>(0x00FF);
+			output1.set_lane<3>(0x00FF);
+			alpha_hdr = false;
+		}
+	}
+
+	// Handle endpoint errors and expansion
+
+	// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
+	if (decode_mode == ASTCENC_PRF_LDR)
+	{
+		// Error color - HDR endpoint in an LDR encoding
+		if (rgb_hdr || alpha_hdr)
+		{
+			output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
+			output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
+			rgb_hdr = false;
+			alpha_hdr = false;
+		}
+
+		output0 = output0 * 257;
+		output1 = output1 * 257;
+	}
+	// sRGB LDR 8-bit endpoints are expanded to 16-bit by:
+	//  - RGB = shift left by 8 bits and OR with 0x80
+	//  - A = replication
+	else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
+	{
+		// Error color - HDR endpoint in an LDR encoding
+		if (rgb_hdr || alpha_hdr)
+		{
+			output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
+			output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
+			rgb_hdr = false;
+			alpha_hdr = false;
+		}
+
+		// Only the color lanes take the shift-and-bias expansion; alpha stays linear and
+		// must be bit-replicated exactly as in the linear LDR profile
+		vmask4 rgb_lanes(true, true, true, false);
+
+		vint4 output0_rgb = lsl<8>(output0) | vint4(0x80);
+		vint4 output0_a = output0 * 257;
+		output0 = select(output0_a, output0_rgb, rgb_lanes);
+
+		vint4 output1_rgb = lsl<8>(output1) | vint4(0x80);
+		vint4 output1_a = output1 * 257;
+		output1 = select(output1_a, output1_rgb, rgb_lanes);
+	}
+	// An HDR profile decode, but may be using linear LDR endpoints
+	// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
+	// HDR endpoints are already 16-bit
+	else
+	{
+		// Per-lane scale: 1 for HDR lanes, 257 (bit replication) for LDR lanes
+		vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
+		vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
+		output0 = output0 * output_scale;
+		output1 = output1 * output_scale;
+	}
+}
@@ -0,0 +1,472 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions to calculate variance per component in an NxN footprint.
+ *
+ * We need N to be parametric, so the routine below uses summed area tables in order to execute in
+ * O(1) time independent of how big N is.
+ *
+ * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
+ * perform a binary reduction, and then distributes the results. This method means that there is no
+ * serial dependency between a given element and the next one, and also significantly improves
+ * numerical stability allowing us to use floats rather than doubles.
+ */
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
+ *
+ * The input array, of the form:
+ *     v0, v1, v2, ...
+ * ... is rewritten in place as its running sum:
+ *     v0, v0+v1, v0+v1+v2, ...
+ *
+ * Arrays with fewer than two items are left untouched.
+ *
+ * @param d      The array to prefix-sum.
+ * @param items  The number of items in the array.
+ * @param stride The item spacing in the array; i.e. dense arrays should use 1.
+ */
+static void brent_kung_prefix_sum(
+	vfloat4* d,
+	size_t items,
+	int stride
+) {
+	if (items < 2)
+	{
+		return;
+	}
+
+	size_t span = 2;
+	size_t span_log2 = 1;
+
+	// Reduction tree: fold pairs of partial sums at doubling spans
+	do {
+		size_t half_span = span >> 1;
+		size_t first_item = span - 1;
+
+		vfloat4 *cursor = d + (first_item * stride);
+		ptrdiff_t partner = -static_cast<ptrdiff_t>(half_span * stride);
+		size_t advance = stride << span_log2;
+
+		for (size_t remaining = items >> span_log2; remaining != 0; remaining--)
+		{
+			*cursor = *cursor + cursor[partner];
+			cursor += advance;
+		}
+
+		span_log2 += 1;
+		span <<= 1;
+	} while (span <= items);
+
+	// Expansion tree: push the partial sums back down at halving spans
+	do {
+		span_log2 -= 1;
+		span >>= 1;
+
+		size_t half_span = span >> 1;
+		size_t first_item = half_span + span - 1;
+
+		vfloat4 *cursor = d + (first_item * stride);
+		ptrdiff_t partner = -static_cast<ptrdiff_t>(half_span * stride);
+		size_t advance = stride << span_log2;
+
+		for (size_t remaining = (items - half_span) >> span_log2; remaining != 0; remaining--)
+		{
+			*cursor = *cursor + cursor[partner];
+			cursor += advance;
+		}
+	} while (span > 2);
+}
+
+/* See header for documentation. */
+/*
+ * Computes the box-filtered alpha average for every texel of one image region and stores the
+ * results in ctx.input_alpha_averages.
+ *
+ * The region (arg.size_* texels starting at arg.offset_*) is loaded, with a border of
+ * alpha_kernel_radius texels on each side, into two work buffers holding the swizzled texel
+ * value and its square. Source coordinates outside the image are clamped to the nearest edge.
+ * Both buffers are turned into summed-area tables in place, and the alpha lane of the first
+ * table is then sampled to produce the average over a (2 * radius + 1) wide window.
+ *
+ * arg.work_memory must provide room for at least 2 * padsize_x * padsize_y * padsize_z vfloat4
+ * entries, as computed below.
+ *
+ * NOTE(review): the table of squares (varbuf2) is built but never read by this function; only
+ * lane 3 of varbuf1 contributes to the output.
+ */
+void compute_pixel_region_variance(
+	astcenc_contexti& ctx,
+	const pixel_region_args& arg
+) {
+	// Unpack the memory structure into local variables
+	const astcenc_image* img = arg.img;
+	astcenc_swizzle swz = arg.swz;
+	bool have_z = arg.have_z;
+
+	int size_x = arg.size_x;
+	int size_y = arg.size_y;
+	int size_z = arg.size_z;
+
+	int offset_x = arg.offset_x;
+	int offset_y = arg.offset_y;
+	int offset_z = arg.offset_z;
+
+	int alpha_kernel_radius = arg.alpha_kernel_radius;
+
+	float*   input_alpha_averages = ctx.input_alpha_averages;
+	vfloat4* work_memory = arg.work_memory;
+
+	// Compute memory sizes and dimensions that we need
+	int kernel_radius = alpha_kernel_radius;
+	int kerneldim = 2 * kernel_radius + 1;
+	int kernel_radius_xy = kernel_radius;
+	int kernel_radius_z = have_z ? kernel_radius : 0;
+
+	// Padded size = region + kernel border on both sides + one leading zero row for the SAT
+	int padsize_x = size_x + kerneldim;
+	int padsize_y = size_y + kerneldim;
+	int padsize_z = size_z + (have_z ? kerneldim : 0);
+	int sizeprod = padsize_x * padsize_y * padsize_z;
+
+	// 2D images have no zero slice in Z, so data starts at z = 0 instead of z = 1
+	int zd_start = have_z ? 1 : 0;
+
+	// varbuf1 holds texel values, varbuf2 holds squared texel values
+	vfloat4 *varbuf1 = work_memory;
+	vfloat4 *varbuf2 = work_memory + sizeprod;
+
+	// Scaling factors to apply to Y and Z for accesses into the work buffers
+	int yst = padsize_x;
+	int zst = padsize_x * padsize_y;
+
+	// Scaling factors to apply to Y and Z for accesses into result buffers
+	int ydt = img->dim_x;
+	int zdt = img->dim_x * img->dim_y;
+
+	// Macros to act as accessor functions for the work-memory
+	// Note: the arguments are not parenthesized, so only pass plain identifiers or literals
+	#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
+	#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
+
+	// Load N and N^2 values into the work buffers
+	if (img->data_type == ASTCENC_TYPE_U8)
+	{
+		// Swizzle data structure 4 = ZERO, 5 = ONE
+		uint8_t data[6];
+		data[ASTCENC_SWZ_0] = 0;
+		data[ASTCENC_SWZ_1] = 255;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			// Map the padded coordinate back to an image slice, clamping at the edges
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					// Each slice is read as tightly packed rows of 4-component texels
+					data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					// Apply the swizzle by indexing the 6-entry table
+					uint8_t r = data[swz.r];
+					uint8_t g = data[swz.g];
+					uint8_t b = data[swz.b];
+					uint8_t a = data[swz.a];
+
+					// Normalize 8-bit values to the 0..1 range
+					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
+					                     g * (1.0f / 255.0f),
+					                     b * (1.0f / 255.0f),
+					                     a * (1.0f / 255.0f));
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+	else if (img->data_type == ASTCENC_TYPE_F16)
+	{
+		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
+		uint16_t data[6];
+		data[ASTCENC_SWZ_0] = 0;
+		data[ASTCENC_SWZ_1] = 0x3C00;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					// Swizzle the raw FP16 bit patterns, then convert all four lanes at once
+					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+					vfloat4 d = float16_to_float(di);
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+	else // if (img->data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img->data_type == ASTCENC_TYPE_F32);
+
+		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
+		float data[6];
+		data[ASTCENC_SWZ_0] = 0.0f;
+		data[ASTCENC_SWZ_1] = 1.0f;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			float* data32 = static_cast<float*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					float r = data[swz.r];
+					float g = data[swz.g];
+					float b = data[swz.b];
+					float a = data[swz.a];
+
+					vfloat4 d(r, g, b, a);
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+
+	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
+	vfloat4 vbz = vfloat4::zero();
+	for (int z = 0; z < padsize_z; z++)
+	{
+		// Zero the x = 0 column of every slice
+		for (int y = 0; y < padsize_y; y++)
+		{
+			VARBUF1(z, y, 0) = vbz;
+			VARBUF2(z, y, 0) = vbz;
+		}
+
+		// Zero the y = 0 row of every slice
+		for (int x = 0; x < padsize_x; x++)
+		{
+			VARBUF1(z, 0, x) = vbz;
+			VARBUF2(z, 0, x) = vbz;
+		}
+	}
+
+	// For 3D images also zero the whole z = 0 slice
+	if (have_z)
+	{
+		for (int y = 0; y < padsize_y; y++)
+		{
+			for (int x = 0; x < padsize_x; x++)
+			{
+				VARBUF1(0, y, x) = vbz;
+				VARBUF2(0, y, x) = vbz;
+			}
+		}
+	}
+
+	// Generate summed-area tables for N and N^2; this is done in-place, using
+	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
+
+	// Pass 1: prefix-sum along X for every row
+	for (int z = zd_start; z < padsize_z; z++)
+	{
+		for (int y = 1; y < padsize_y; y++)
+		{
+			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
+			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
+		}
+	}
+
+	// Pass 2: prefix-sum along Y for every column
+	for (int z = zd_start; z < padsize_z; z++)
+	{
+		for (int x = 1; x < padsize_x; x++)
+		{
+			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
+			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
+		}
+	}
+
+	// Pass 3 (3D only): prefix-sum along Z
+	if (have_z)
+	{
+		for (int y = 1; y < padsize_y; y++)
+		{
+			for (int x = 1; x < padsize_x; x++)
+			{
+				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
+				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
+			}
+		}
+	}
+
+	// Compute a few constants used in the variance-calculation.
+	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
+	float alpha_rsamples;
+
+	// Reciprocal of the number of samples in the kernel window (cube for 3D, square for 2D)
+	if (have_z)
+	{
+		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
+	}
+	else
+	{
+		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
+	}
+
+	// Use the summed-area tables to compute variance for each neighborhood
+	if (have_z)
+	{
+		for (int z = 0; z < size_z; z++)
+		{
+			// *_src is the window center in padded coordinates, *_dst the image coordinate,
+			// and *_low / *_high the SAT corners bounding the window
+			int z_src = z + kernel_radius_z;
+			int z_dst = z + offset_z;
+			int z_low  = z_src - alpha_kernel_radius;
+			int z_high = z_src + alpha_kernel_radius + 1;
+
+			for (int y = 0; y < size_y; y++)
+			{
+				int y_src = y + kernel_radius_xy;
+				int y_dst = y + offset_y;
+				int y_low  = y_src - alpha_kernel_radius;
+				int y_high = y_src + alpha_kernel_radius + 1;
+
+				for (int x = 0; x < size_x; x++)
+				{
+					int x_src = x + kernel_radius_xy;
+					int x_dst = x + offset_x;
+					int x_low  = x_src - alpha_kernel_radius;
+					int x_high = x_src + alpha_kernel_radius + 1;
+
+					// Summed-area table lookups for alpha average
+					// (the 2D box sum at z_high minus the 2D box sum at z_low)
+					float vasum = (  VARBUF1(z_high, y_low,  x_low).lane<3>()
+					               - VARBUF1(z_high, y_low,  x_high).lane<3>()
+					               - VARBUF1(z_high, y_high, x_low).lane<3>()
+					               + VARBUF1(z_high, y_high, x_high).lane<3>()) -
+					              (  VARBUF1(z_low,  y_low,  x_low).lane<3>()
+					               - VARBUF1(z_low,  y_low,  x_high).lane<3>()
+					               - VARBUF1(z_low,  y_high, x_low).lane<3>()
+					               + VARBUF1(z_low,  y_high, x_high).lane<3>());
+
+					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
+					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
+				}
+			}
+		}
+	}
+	else
+	{
+		for (int y = 0; y < size_y; y++)
+		{
+			int y_src = y + kernel_radius_xy;
+			int y_dst = y + offset_y;
+			int y_low  = y_src - alpha_kernel_radius;
+			int y_high = y_src + alpha_kernel_radius + 1;
+
+			for (int x = 0; x < size_x; x++)
+			{
+				int x_src = x + kernel_radius_xy;
+				int x_dst = x + offset_x;
+				int x_low  = x_src - alpha_kernel_radius;
+				int x_high = x_src + alpha_kernel_radius + 1;
+
+				// Summed-area table lookups for alpha average
+				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
+				            - VARBUF1(0, y_low,  x_high).lane<3>()
+				            - VARBUF1(0, y_high, x_low).lane<3>()
+				            + VARBUF1(0, y_high, x_high).lane<3>();
+
+				int out_index = y_dst * ydt + x_dst;
+				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
+			}
+		}
+	}
+}
+
+/* See header for documentation. */
+/*
+ * Fills in the image-wide fields of the averages task descriptor, sizes the per-task work
+ * buffer for the largest block that will be processed, and returns the number of tasks
+ * (Z block count multiplied by Y block count).
+ */
+unsigned int init_compute_averages(
+	const astcenc_image& img,
+	unsigned int alpha_kernel_radius,
+	const astcenc_swizzle& swz,
+	avg_args& ag
+) {
+	unsigned int dim_x = img.dim_x;
+	unsigned int dim_y = img.dim_y;
+	unsigned int dim_z = img.dim_z;
+
+	// Window width for the requested kernel radius
+	unsigned int window = 2 * alpha_kernel_radius + 1;
+
+	// Volumetric images use smaller XY tiles and up to 16 slices per task
+	bool is_volume = (dim_z > 1);
+	unsigned int tile_xy = is_volume ? 16 : 32;
+	unsigned int tile_z = astc::min(dim_z, is_volume ? 16u : 1u);
+
+	// Largest padded tile extents; Z only gains a border for volumetric images
+	unsigned int padded_xy = tile_xy + window;
+	unsigned int padded_z = tile_z + (is_volume ? window : 0);
+
+	// Image-wide arguments shared by every task
+	ag.arg.img = &img;
+	ag.arg.swz = swz;
+	ag.arg.have_z = is_volume;
+	ag.arg.alpha_kernel_radius = alpha_kernel_radius;
+
+	// Per-task arguments; these are not populated until later
+	ag.arg.size_x = 0;
+	ag.arg.size_y = 0;
+	ag.arg.size_z = 0;
+	ag.arg.offset_x = 0;
+	ag.arg.offset_y = 0;
+	ag.arg.offset_z = 0;
+	ag.arg.work_memory = nullptr;
+
+	ag.img_size_x = dim_x;
+	ag.img_size_y = dim_y;
+	ag.img_size_z = dim_z;
+	ag.blk_size_xy = tile_xy;
+	ag.blk_size_z = tile_z;
+
+	// Two work buffers (values and squared values) of the maximum padded tile size
+	ag.work_memory_size = 2 * padded_xy * padded_xy * padded_z;
+
+	// The parallel task count: one task per row of XY tiles per Z tile, rounding up
+	unsigned int tasks_z = (dim_z + tile_z - 1) / tile_z;
+	unsigned int tasks_y = (dim_y + tile_xy - 1) / tile_xy;
+	return tasks_z * tasks_y;
+}
+
+#endif
@@ -0,0 +1,622 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions to decompress a symbolic block.
+ */
+
+#include "astcenc_internal.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+/**
+ * @brief Compute the integer linear interpolation of two color endpoints.
+ *
+ * The blend is evaluated as (color0 * (64 - w) + color1 * w + 32) >> 6 per lane. Lanes flagged
+ * in @c u8_mask are then reduced to their top 8 bits and bit-replicated back to 16 bits.
+ *
+ * @param u8_mask       The mask for lanes using decode_unorm8 rather than decode_f16.
+ * @param color0        The endpoint0 color.
+ * @param color1        The endpoint1 color.
+ * @param weights       The interpolation weight (between 0 and 64).
+ *
+ * @return The interpolated color.
+ */
+static vint4 lerp_color_int(
+	vmask4 u8_mask,
+	vint4 color0,
+	vint4 color1,
+	vint4 weights
+) {
+	// Complementary weight for endpoint 0
+	vint4 inv_weights = vint4(64) - weights;
+
+	// Weighted sum with a rounding bias, scaled back down by the weight range
+	vint4 blended = asr<6>((color0 * inv_weights) + (color1 * weights) + vint4(32));
+
+	// decode_unorm8 lanes keep only the top byte, replicated into both bytes, so the rest
+	// of the codec can treat every lane as a full-range 16-bit value
+	vint4 blended_u8 = asr<8>(blended) * vint4(257);
+
+	return select(blended, blended_u8, u8_mask);
+}
+
+/**
+ * @brief Convert integer color value into a float value for the decoder.
+ *
+ * Each conversion is only evaluated when at least one lane needs it; the per-lane result is
+ * then picked with @c lns_mask and widened from FP16 to FP32.
+ *
+ * @param data       The integer color value post-interpolation.
+ * @param lns_mask   If set treat lane as HDR (LNS) else LDR (unorm16).
+ *
+ * @return The float color value.
+ */
+static inline vfloat4 decode_texel(
+	vint4 data,
+	vmask4 lns_mask
+) {
+	// HDR conversion is needed if any lane is LNS
+	vint4 hdr_bits = any(lns_mask) ? lns_to_sf16(data) : vint4::zero();
+
+	// LDR conversion is needed unless every lane is LNS
+	vint4 ldr_bits = !all(lns_mask) ? unorm16_to_sf16(data) : vint4::zero();
+
+	// Choose per lane, then convert the FP16 bit patterns to FP32
+	return float16_to_float(select(ldr_bits, hdr_bits, lns_mask));
+}
+
+/* See header for documentation. */
+/*
+ * Expands the stored weight grid to one weight per texel. Each texel's weight is the sum of
+ * its contributing grid weights multiplied by their integer contribution factors, plus a
+ * rounding bias of 8, shifted right by 4. Single-plane blocks fill weights_plane1 only;
+ * dual-plane blocks fill both arrays, with the second plane read from scb.weights + 32.
+ */
+void unpack_weights(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const decimation_info& di,
+	bool is_dual_plane,
+	int weights_plane1[BLOCK_MAX_TEXELS],
+	int weights_plane2[BLOCK_MAX_TEXELS]
+) {
+	// Texels are processed a whole vector at a time; overshooting texel_count on the final
+	// iteration is safe because all arrays are allocated to full size
+	if (is_dual_plane)
+	{
+		// One 32-entry lookup table per plane
+		vtable_32x8 lut_p1;
+		vtable_prepare(lut_p1, scb.weights);
+
+		vtable_32x8 lut_p2;
+		vtable_prepare(lut_p2, scb.weights + 32);
+
+		for (unsigned int base = 0; base < bsd.texel_count; base += ASTCENC_SIMD_WIDTH)
+		{
+			// Accumulators start at the rounding bias
+			vint acc_p1(8);
+			vint acc_p2(8);
+
+			// Iterate up to the largest contribution count in this vector of texels
+			vint contrib_counts(di.texel_weight_count + base);
+			int contrib_max = hmax_s(contrib_counts);
+
+			promise(contrib_max > 0);
+			for (int k = 0; k < contrib_max; k++)
+			{
+				vint grid_index(di.texel_weights_tr[k] + base);
+				vint grid_factor(di.texel_weight_contribs_int_tr[k] + base);
+
+				acc_p1 += vtable_lookup_32bit(lut_p1, grid_index) * grid_factor;
+				acc_p2 += vtable_lookup_32bit(lut_p2, grid_index) * grid_factor;
+			}
+
+			store(lsr<4>(acc_p1), weights_plane1 + base);
+			store(lsr<4>(acc_p2), weights_plane2 + base);
+		}
+
+		return;
+	}
+
+	// Single plane: one full 64-entry lookup table
+	vtable_64x8 lut;
+	vtable_prepare(lut, scb.weights);
+
+	for (unsigned int base = 0; base < bsd.texel_count; base += ASTCENC_SIMD_WIDTH)
+	{
+		// Accumulator starts at the rounding bias
+		vint acc(8);
+
+		vint contrib_counts(di.texel_weight_count + base);
+		int contrib_max = hmax_s(contrib_counts);
+
+		promise(contrib_max > 0);
+		for (int k = 0; k < contrib_max; k++)
+		{
+			vint grid_index(di.texel_weights_tr[k] + base);
+			vint grid_factor(di.texel_weight_contribs_int_tr[k] + base);
+
+			acc += vtable_lookup_32bit(lut, grid_index) * grid_factor;
+		}
+
+		store(lsr<4>(acc), weights_plane1 + base);
+	}
+}
+
+/**
+ * @brief Return an FP32 NaN value for use in error colors.
+ *
+ * The bit pattern is chosen so that this NaN will turn into 0xFFFF when converted to an
+ * FP16 NaN.
+ *
+ * @return The float color value.
+ */
+static float error_color_nan()
+{
+	// Write the raw bits through the integer view and read them back as a float
+	if32 nan_bits;
+	nan_bits.u = 0xFFFFE000U;
+	return nan_bits.f;
+}
+
+/* See header for documentation. */
+/*
+ * Decodes one symbolic block into per-texel float colors in blk.
+ *
+ * Three cases are handled in order:
+ *  - Error blocks: every texel is filled with the error NaN and the function returns.
+ *  - Constant color blocks (UNORM16 or FP16): every texel gets the same color and the function
+ *    returns. An FP16 constant decoded under an LDR profile yields the error NaN.
+ *  - Normal blocks: weights are unpacked, then for each partition the endpoints are decoded
+ *    and every texel of that partition is interpolated and converted to float.
+ *
+ * NOTE(review): blk.rgb_lns / blk.alpha_lns are only written on the two early-return paths;
+ * the normal path leaves them untouched - confirm callers do not rely on them there.
+ */
+void decompress_symbolic_block(
+	astcenc_profile decode_mode,
+	const block_size_descriptor& bsd,
+	int xpos,
+	int ypos,
+	int zpos,
+	const symbolic_compressed_block& scb,
+	image_block& blk
+) {
+	// Record the block position and reset the summary statistics
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	blk.data_min = vfloat4::zero();
+	blk.data_mean = vfloat4::zero();
+	blk.data_max = vfloat4::zero();
+	blk.grayscale = false;
+
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		for (unsigned int i = 0; i < bsd.texel_count; i++)
+		{
+			blk.data_r[i] = error_color_nan();
+			blk.data_g[i] = error_color_nan();
+			blk.data_b[i] = error_color_nan();
+			blk.data_a[i] = error_color_nan();
+			blk.rgb_lns[i] = 0;
+			blk.alpha_lns[i] = 0;
+		}
+
+		return;
+	}
+
+	// Constant color blocks need no weights or endpoints
+	if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
+	    (scb.block_type == SYM_BTYPE_CONST_U16))
+	{
+		vfloat4 color;
+		uint8_t use_lns = 0;
+
+		// UNORM16 constant color block
+		if (scb.block_type == SYM_BTYPE_CONST_U16)
+		{
+			vint4 colori(scb.constant_color);
+
+			// Determine the UNORM8 rounding on the decode
+			vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
+
+			// The real decoder would just use the top 8 bits, but we rescale
+			// in to a 16-bit value that rounds correctly.
+			vint4 colori_u8 = asr<8>(colori) * 257;
+			colori = select(colori, colori_u8, u8_mask);
+
+			vint4 colorf16 = unorm16_to_sf16(colori);
+			color = float16_to_float(colorf16);
+		}
+		// FLOAT16 constant color block
+		else
+		{
+			// NOTE(review): there is no default case, so color stays unset for any profile
+			// value not listed here
+			switch (decode_mode)
+			{
+			case ASTCENC_PRF_LDR_SRGB:
+			case ASTCENC_PRF_LDR:
+				// An FP16 constant cannot be represented by an LDR profile; emit the error color
+				color = vfloat4(error_color_nan());
+				break;
+			case ASTCENC_PRF_HDR_RGB_LDR_A:
+			case ASTCENC_PRF_HDR:
+				// Constant-color block; unpack from FP16 to FP32.
+				color = float16_to_float(vint4(scb.constant_color));
+				use_lns = 1;
+				break;
+			}
+		}
+
+		// Broadcast the single color to every texel in the block
+		for (unsigned int i = 0; i < bsd.texel_count; i++)
+		{
+			blk.data_r[i] = color.lane<0>();
+			blk.data_g[i] = color.lane<1>();
+			blk.data_b[i] = color.lane<2>();
+			blk.data_a[i] = color.lane<3>();
+			blk.rgb_lns[i] = use_lns;
+			blk.alpha_lns[i] = use_lns;
+		}
+
+		return;
+	}
+
+	// Get the appropriate partition-table entry
+	int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	// Get the appropriate block descriptors
+	const auto& bm = bsd.get_block_mode(scb.block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
+
+	// Unquantize and undecimate the weights
+	// NOTE(review): unpack_weights only writes plane2_weights for dual-plane blocks, yet the
+	// select below reads it for every texel; this presumably relies on plane2_mask being all
+	// false for single-plane blocks - confirm
+	int plane1_weights[BLOCK_MAX_TEXELS];
+	int plane2_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
+
+	// Now that we have endpoint colors and weights, we can unpack texel colors
+	// The mask is set only for the lane whose index equals the second plane's component
+	int plane2_component = scb.plane2_component;
+	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
+
+	vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
+
+	for (int i = 0; i < partition_count; i++)
+	{
+		// Decode the color endpoints for this partition
+		vint4 ep0;
+		vint4 ep1;
+		bool rgb_lns;
+		bool a_lns;
+
+		unpack_color_endpoints(decode_mode,
+		                       scb.color_formats[i],
+		                       scb.color_values[i],
+		                       rgb_lns, a_lns,
+		                       ep0, ep1);
+
+		// RGB lanes share one HDR flag; alpha has its own
+		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
+
+		int texel_count = pi.partition_texel_count[i];
+		for (int j = 0; j < texel_count; j++)
+		{
+			// tix is the texel's index within the block
+			int tix = pi.texels_of_partition[i][j];
+
+			// Use the plane 2 weight for the plane 2 component, plane 1 weight elsewhere
+			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
+			vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
+			vfloat4 colorf = decode_texel(color, lns_mask);
+
+			blk.data_r[tix] = colorf.lane<0>();
+			blk.data_g[tix] = colorf.lane<1>();
+			blk.data_b[tix] = colorf.lane<2>();
+			blk.data_a[tix] = colorf.lane<3>();
+		}
+	}
+}
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/* See header for documentation. */
+float compute_symbolic_block_difference_2plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk
+) {
+	// Error blocks cannot be scored, so report the default error straight away
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		return ERROR_CALC_DEFAULT;
+	}
+
+	// This variant only handles single-partition, dual-plane encodings
+	assert(scb.block_mode >= 0);
+	assert(scb.partition_count == 1);
+	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
+
+	// Look up the block mode and the weight decimation it refers to
+	const block_mode& mode = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& decimation = bsd.get_decimation_info(mode.decimation_mode);
+
+	// Expand the stored weights into per-texel weights for both planes
+	int weights_p1[BLOCK_MAX_TEXELS];
+	int weights_p2[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, decimation, true, weights_p1, weights_p2);
+
+	// Lane mask that is set only for the component assigned to the second plane
+	vmask4 plane2_lanes = vint4::lane_id() == vint4(scb.plane2_component);
+
+	vfloat4 error_sum = vfloat4::zero();
+
+	// Decode the single endpoint pair; the LNS flags are outputs that are not used here
+	vint4 endpoint0;
+	vint4 endpoint1;
+	bool rgb_lns;
+	bool a_lns;
+
+	unpack_color_endpoints(config.profile,
+	                       scb.color_formats[0],
+	                       scb.color_values[0],
+	                       rgb_lns, a_lns,
+	                       endpoint0, endpoint1);
+
+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
+
+	// The RGBM flag does not change per texel, so test it once
+	const bool use_rgbm = (config.flags & ASTCENC_FLG_MAP_RGBM) != 0;
+
+	// Reconstruct every texel and accumulate its weighted squared error
+	const unsigned int texel_total = bsd.texel_count;
+	for (unsigned int tix = 0; tix < texel_total; tix++)
+	{
+		// Each lane takes its weight from plane 1 or plane 2 depending on the lane mask
+		vint4 weight = select(vint4(weights_p1[tix]), vint4(weights_p2[tix]), plane2_lanes);
+		vfloat4 decoded = int_to_float(lerp_color_int(u8_mask, endpoint0, endpoint1, weight));
+		vfloat4 original = blk.texel(tix);
+
+		// RGBM textures are compared after applying the M multiplier to RGB
+		if (use_rgbm)
+		{
+			// Reject any encoding that decodes a texel to M == 0. This can produce visible
+			// artifacts if it ends up rejecting every useful encoding (the result is typically
+			// a max brightness encoding which looks just as bad). Users are advised to bias
+			// their stored M value so that its lower bound is 16 or 32, but small M values
+			// after quantization cannot be ruled out, especially at low bit rates.
+			const float decoded_m = decoded.lane<3>();
+			if (decoded_m == 0.0f)
+			{
+				return -ERROR_CALC_DEFAULT;
+			}
+
+			decoded = vfloat4(
+				decoded.lane<0>() * decoded_m * config.rgbm_m_scale,
+				decoded.lane<1>() * decoded_m * config.rgbm_m_scale,
+				decoded.lane<2>() * decoded_m * config.rgbm_m_scale,
+				1.0f
+			);
+
+			const float original_m = original.lane<3>();
+			original = vfloat4(
+				original.lane<0>() * original_m * config.rgbm_m_scale,
+				original.lane<1>() * original_m * config.rgbm_m_scale,
+				original.lane<2>() * original_m * config.rgbm_m_scale,
+				1.0f
+			);
+		}
+
+		// Clamp the magnitude before squaring, then clamp each texel's contribution
+		vfloat4 diff = min(abs(original - decoded), 1e15f);
+		diff = diff * diff;
+
+		error_sum += min(dot(diff, blk.channel_weight), ERROR_CALC_DEFAULT);
+	}
+
+	return error_sum.lane<0>();
+}
+
+/* See header for documentation. */
+float compute_symbolic_block_difference_1plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk
+) {
+	// If we detected an error-block, blow up immediately.
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		return ERROR_CALC_DEFAULT;
+	}
+
+	// Validate the block mode index before using it to look anything up; these checks must come
+	// after the error-block early-out, matching the order used by the dual-plane variant
+	assert(scb.block_mode >= 0);
+	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
+
+	// Get the appropriate partition-table entry
+	unsigned int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	// Get the appropriate block descriptor
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	// Unquantize and undecimate the weights; only one weight plane is used
+	int plane1_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
+
+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
+
+	// Running total of the weighted squared error over all partitions
+	vfloat4 summa = vfloat4::zero();
+	for (unsigned int i = 0; i < partition_count; i++)
+	{
+		// Decode the color endpoints for this partition; the LNS flags are outputs that are not
+		// used by this function
+		vint4 ep0;
+		vint4 ep1;
+		bool rgb_lns;
+		bool a_lns;
+
+		unpack_color_endpoints(config.profile,
+		                       scb.color_formats[i],
+		                       scb.color_values[i],
+		                       rgb_lns, a_lns,
+		                       ep0, ep1);
+
+		// Unpack and compute error for each texel in the partition
+		unsigned int texel_count = pi.partition_texel_count[i];
+		for (unsigned int j = 0; j < texel_count; j++)
+		{
+			// Map the partition-local index to the texel index within the block
+			unsigned int tix = pi.texels_of_partition[i][j];
+			vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
+			                              vint4(plane1_weights[tix]));
+
+			vfloat4 color = int_to_float(colori);
+			vfloat4 oldColor = blk.texel(tix);
+
+			// Compare error using a perceptual decode metric for RGBM textures
+			if (config.flags & ASTCENC_FLG_MAP_RGBM)
+			{
+				// Fail encodings that result in zero weight M pixels. Note that this can cause
+				// "interesting" artifacts if we reject all useful encodings - we typically get max
+				// brightness encodings instead which look just as bad. We recommend users apply a
+				// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
+				// getting small M values post-quantization, but we can't prove it would never
+				// happen, especially at low bit rates ...
+				if (color.lane<3>() == 0.0f)
+				{
+					return -ERROR_CALC_DEFAULT;
+				}
+
+				// Compute error based on decoded RGBM color
+				color = vfloat4(
+					color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
+					color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
+					color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
+					1.0f
+				);
+
+				oldColor = vfloat4(
+					oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
+					oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
+					oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
+					1.0f
+				);
+			}
+
+			// Clamp the magnitude before squaring, then clamp each texel's contribution
+			vfloat4 error = oldColor - color;
+			error = min(abs(error), 1e15f);
+			error = error * error;
+
+			summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
+		}
+	}
+
+	return summa.lane<0>();
+}
+
+/* See header for documentation. */
+float compute_symbolic_block_difference_1plane_1partition(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk
+) {
+	// Error blocks cannot be scored, so report the default error straight away
+	if (scb.block_type == SYM_BTYPE_ERROR)
+	{
+		return ERROR_CALC_DEFAULT;
+	}
+
+	// This variant only handles encodings whose partitioning has a single partition
+	assert(scb.block_mode >= 0);
+	assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
+
+	// Look up the block mode and the weight decimation it refers to
+	const block_mode& mode = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& decimation = bsd.get_decimation_info(mode.decimation_mode);
+
+	// Expand the stored weights into per-texel weights; aligned because they are read back
+	// below with aligned vector loads
+	ASTCENC_ALIGNAS int texel_weights[BLOCK_MAX_TEXELS];
+	unpack_weights(bsd, scb, decimation, false, texel_weights, nullptr);
+
+	// Decode the single endpoint pair; the LNS flags are outputs that are not used here
+	vint4 endpoint0;
+	vint4 endpoint1;
+	bool rgb_lns;
+	bool a_lns;
+
+	unpack_color_endpoints(config.profile,
+	                       scb.color_formats[0],
+	                       scb.color_values[0],
+	                       rgb_lns, a_lns,
+	                       endpoint0, endpoint1);
+
+	vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
+
+	// Broadcast the per-component endpoint values, which do not change across texels
+	vint e0_r = vint(endpoint0.lane<0>());
+	vint e0_g = vint(endpoint0.lane<1>());
+	vint e0_b = vint(endpoint0.lane<2>());
+	vint e0_a = vint(endpoint0.lane<3>());
+
+	vint e1_r = vint(endpoint1.lane<0>());
+	vint e1_g = vint(endpoint1.lane<1>());
+	vint e1_b = vint(endpoint1.lane<2>());
+	vint e1_a = vint(endpoint1.lane<3>());
+
+	// Broadcast the per-component U8 decode selection
+	vmask u8_r = vmask(u8_mask.lane<0>());
+	vmask u8_g = vmask(u8_mask.lane<1>());
+	vmask u8_b = vmask(u8_mask.lane<2>());
+	vmask u8_a = vmask(u8_mask.lane<3>());
+
+	vfloatacc error_acc = vfloatacc::zero();
+
+	// Texel index held by each vector lane for the current iteration
+	vint lane_texel = vint::lane_id();
+
+	// Process one vector of texels per iteration
+	unsigned int texel_count = bsd.texel_count;
+	for (unsigned int base = 0; base < texel_count; base += ASTCENC_SIMD_WIDTH)
+	{
+		// Weights for endpoint 1, and the complementary weights for endpoint 0
+		vint w1 = vint::loada(texel_weights + base);
+		vint w0 = vint(64) - w1;
+
+		// Interpolate: (ep0 * w0 + ep1 * w1 + 32) >> 6
+		vint dec_r = asr<6>(e0_r * w0 + e1_r * w1 + vint(32));
+		vint dec_g = asr<6>(e0_g * w0 + e1_g * w1 + vint(32));
+		vint dec_b = asr<6>(e0_b * w0 + e1_b * w1 + vint(32));
+		vint dec_a = asr<6>(e0_a * w0 + e1_a * w1 + vint(32));
+
+		// For components using a U8 decode mode, bit replicate the top 8 bits so that the rest
+		// of the codec can assume a 0xFFFF max range everywhere
+		dec_r = select(dec_r, asr<8>(dec_r) * vint(257), u8_r);
+		dec_g = select(dec_g, asr<8>(dec_g) * vint(257), u8_g);
+		dec_b = select(dec_b, asr<8>(dec_b) * vint(257), u8_b);
+		dec_a = select(dec_a, asr<8>(dec_a) * vint(257), u8_a);
+
+		// Absolute difference against the original data, clamped before squaring
+		vfloat err_r = min(abs(loada(blk.data_r + base) - int_to_float(dec_r)), vfloat(1e15f));
+		vfloat err_g = min(abs(loada(blk.data_g + base) - int_to_float(dec_g)), vfloat(1e15f));
+		vfloat err_b = min(abs(loada(blk.data_b + base) - int_to_float(dec_b)), vfloat(1e15f));
+		vfloat err_a = min(abs(loada(blk.data_a + base) - int_to_float(dec_a)), vfloat(1e15f));
+
+		// Squared error per component
+		err_r = err_r * err_r;
+		err_g = err_g * err_g;
+		err_b = err_b * err_b;
+		err_a = err_a * err_a;
+
+		// Channel-weighted sum of the squared errors
+		vfloat metric = err_r * blk.channel_weight.lane<0>()
+		              + err_g * blk.channel_weight.lane<1>()
+		              + err_b * blk.channel_weight.lane<2>()
+		              + err_a * blk.channel_weight.lane<3>();
+
+		// Only accumulate lanes that map to a real texel, then advance the lane indices
+		vmask valid = lane_texel < vint(texel_count);
+		lane_texel += vint(ASTCENC_SIMD_WIDTH);
+		haccumulate(error_acc, metric, valid);
+	}
+
+	return hadd_s(error_acc);
+}
+
+#endif
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for the diagnostic trace logger.
+ */
+
+#if defined(ASTCENC_DIAGNOSTICS)
+
+#include <cassert>
+#include <cstdarg>
+#include <cstdio>
+#include <cmath>
+#include <limits>
+#include <string>
+
+#include "astcenc_diagnostic_trace.h"
+
+/**
+ * @brief The global trace logger.
+ *
+ * Set by the TraceLog constructor and reset to nullptr by its destructor; the trace functions
+ * in this file reach the active log through this pointer.
+ */
+static TraceLog* g_TraceLog = nullptr;
+
+/** @brief The number of spaces written per indentation step in the JSON output. */
+static const size_t g_trace_indent = 2;
+
+/* See header for documentation. */
+TraceLog::TraceLog(
+	const char* file_name
+) :
+	m_file(file_name, std::ofstream::binary | std::ofstream::out)
+{
+	// Only one trace log may exist at a time
+	assert(g_TraceLog == nullptr);
+
+	// Register the log before creating the root node, because the TraceNode constructor
+	// reaches the log through the global pointer
+	g_TraceLog = this;
+	m_root = new TraceNode("root");
+}
+
+/* See header for documentation. */
+TraceNode* TraceLog::get_current_leaf()
+{
+	// The newest node is at the back of the stack; an empty stack has no current node
+	return m_stack.empty() ? nullptr : m_stack.back();
+}
+
+/* See header for documentation. */
+size_t TraceLog::get_depth()
+{
+	// The depth is the number of nodes currently on the stack
+	const size_t depth = m_stack.size();
+	return depth;
+}
+
+/* See header for documentation. */
+TraceLog::~TraceLog()
+{
+	// The log being destroyed must be the registered global log
+	assert(this == g_TraceLog);
+
+	// Destroy the root node while the global pointer is still valid, because the TraceNode
+	// destructor writes its closing brackets through that pointer
+	delete m_root;
+
+	// Unregister, allowing a new log to be created later
+	g_TraceLog = nullptr;
+}
+
+/* See header for documentation. */
+TraceNode::TraceNode(
+	const char* format,
+	...
+) {
+	// Expand the printf-style name template into a fixed-size buffer
+	char name[256];
+
+	va_list args;
+	va_start (args, format);
+	vsnprintf (name, sizeof(name), format, args);
+	va_end (args);
+
+	// Guarantee there is a nul terminator
+	name[sizeof(name) - 1] = 0;
+
+	// Capture the parent and nesting depth before this node is pushed on the stack
+	TraceNode* parent = g_TraceLog->get_current_leaf();
+	const size_t depth = g_TraceLog->get_depth();
+	g_TraceLog->m_stack.push_back(this);
+
+	auto& out = g_TraceLog->m_file;
+
+	if (parent)
+	{
+		// A separator is needed if the parent already holds an attribute or child node
+		if (parent->m_attrib_count)
+		{
+			out << ',';
+		}
+
+		// This node counts as one more entry in its parent
+		parent->m_attrib_count++;
+	}
+
+	// Every node except the outermost one starts on a new line
+	if (depth)
+	{
+		out << '\n';
+	}
+
+	// The node header uses two indent steps per level; its contents sit one step deeper
+	const std::string header_pad((depth * 2) * g_trace_indent, ' ');
+	const std::string content_pad((depth * 2 + 1) * g_trace_indent, ' ');
+
+	// Open the node, and then the list that holds its attributes and children
+	out << header_pad << "[ \"node\", \"" << name << "\",\n";
+	out << content_pad << "[";
+}
+
+/* See header for documentation. */
+void TraceNode::add_attrib(
+	std::string type,
+	std::string key,
+	std::string value
+) {
+	// The attribute type is accepted but not written to the output
+	(void)type;
+
+	auto& out = g_TraceLog->m_file;
+
+	// A separator is needed if this node already holds an attribute or child node
+	const bool need_comma = m_attrib_count != 0;
+	m_attrib_count++;
+
+	if (need_comma)
+	{
+		out << ',';
+	}
+
+	// Attributes use two indent steps per level of the current stack depth
+	const std::string pad((g_TraceLog->get_depth() * 2) * g_trace_indent, ' ');
+
+	// Emit the key quoted and the value verbatim; callers must quote the value if needed
+	out << '\n';
+	out << pad << "[ \"" << key << "\", " << value << " ]";
+}
+
+/* See header for documentation. */
+TraceNode::~TraceNode()
+{
+	// Remove the newest node from the stack; the depth read afterwards is therefore the
+	// nesting level at which this node was opened
+	g_TraceLog->m_stack.pop_back();
+	const size_t depth = g_TraceLog->get_depth();
+
+	auto& out = g_TraceLog->m_file;
+
+	const std::string header_pad((depth * 2) * g_trace_indent, ' ');
+	const std::string content_pad((depth * 2 + 1) * g_trace_indent, ' ');
+
+	// If anything was written into the content list, close it on a new line
+	if (m_attrib_count)
+	{
+		out << "\n" << content_pad;
+	}
+
+	// Close the content list, and then the node itself
+	out << "]\n";
+	out << header_pad << "]";
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	const char* format,
+	...
+) {
+	// Expand the printf-style value template into a fixed-size buffer
+	char text[256];
+
+	va_list args;
+	va_start (args, format);
+	vsnprintf (text, sizeof(text), format, args);
+	va_end (args);
+
+	// Guarantee there is a nul terminator
+	text[sizeof(text) - 1] = 0;
+
+	// add_attrib() writes the value verbatim, so string values are quoted here
+	std::string quoted("\"");
+	quoted += text;
+	quoted += "\"";
+
+	g_TraceLog->get_current_leaf()->add_attrib("str", key, quoted);
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	float value
+) {
+	// Turn infinities into parseable values by clamping to the largest finite float
+	// NOTE(review): NaN is not remapped here and is passed to the formatter unchanged - confirm
+	// whether the trace consumer needs it handled too
+	if (std::isinf(value))
+	{
+		if (value > 0.0f)
+		{
+			value = std::numeric_limits<float>::max();
+		}
+		else
+		{
+			value = -std::numeric_limits<float>::max();
+		}
+	}
+
+	// Format with a bounded write so the buffer can never be overrun, matching the bounded
+	// formatting used by the other trace functions in this file
+	char buffer[256];
+	snprintf(buffer, sizeof(buffer), "%.20g", (double)value);
+
+	// Numeric values are written unquoted
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("float", key, buffer);
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	int value
+) {
+	// Numeric values are written unquoted
+	const std::string text = std::to_string(value);
+	g_TraceLog->get_current_leaf()->add_attrib("int", key, text);
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	unsigned int value
+) {
+	// Unsigned values are also recorded with the "int" attribute type, and written unquoted
+	const std::string text = std::to_string(value);
+	g_TraceLog->get_current_leaf()->add_attrib("int", key, text);
+}
+
+#endif
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief This module provides a set of diagnostic tracing utilities.
+ *
+ * Overview
+ * ========
+ *
+ * The built-in diagnostic trace tool generates a hierarchical JSON tree structure. The tree
+ * hierarchy contains three levels:
+ *
+ *    - block
+ *        - pass
+ *           - candidate
+ *
+ * One block node exists for each compressed block in the image. One pass node exists for each major
+ * pass (N partition, M planes, O components) applied to a block. One candidate node exists for each
+ * encoding candidate trialed for a pass.
+ *
+ * Each node contains both its child nodes and a number of attributes which explain the behavior.
+ * For example, the block node contains the block coordinates in the image, the pass explains the
+ * pass configuration, and the candidate will explain the candidate encoding such as weight
+ * decimation, refinement error, etc.
+ *
+ * Trace Nodes are designed as scope-managed C++ objects with stack-like push/pop behavior.
+ * Constructing a trace node on the stack will automatically add it to the current node as a child,
+ * and then make it the current node. Destroying the current node will pop the stack and set the
+ * parent to the current node. This provides a robust mechanism for ensuring reliable nesting in the
+ * tree structure.
+ *
+ * A set of utility macros are provided to add attribute annotations to the current trace node.
+ *
+ * Usage
+ * =====
+ *
+ * Create Trace Nodes on the stack using the @c TRACE_NODE() macro. This will compile-out completely
+ * in builds with diagnostics disabled.
+ *
+ * Add annotations to the current trace node using @c trace_add_data(). This is a set of overloaded
+ * functions in diagnostics builds, and a macro that compiles out completely otherwise.
+ *
+ * If you need to add additional code to support diagnostics-only behavior wrap
+ * it in preprocessor guards:
+ *
+ *     #if defined(ASTCENC_DIAGNOSTICS)
+ *     #endif
+ */
+
+#ifndef ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
+#define ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
+
+#if defined(ASTCENC_DIAGNOSTICS)
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+/**
+ * @brief Class representing a single node in the trace hierarchy.
+ *
+ * Nodes are not copyable. Destroying a node pops the node stack of the trace log and writes the
+ * closing brackets for the node, so a copy would repeat both actions for the same node.
+ */
+class TraceNode
+{
+public:
+	/**
+	 * @brief Construct a new node.
+	 *
+	 * Constructing a node will push to the top of the stack, automatically making it a child of
+	 * the current node, and then setting it to become the current node.
+	 *
+	 * @param format   The format template for the node name.
+	 * @param ...      The format parameters.
+	 */
+	TraceNode(const char* format, ...);
+
+	/**
+	 * @brief Nodes cannot be copy constructed; see the class description.
+	 */
+	TraceNode(const TraceNode&) = delete;
+
+	/**
+	 * @brief Nodes cannot be copy assigned; see the class description.
+	 */
+	TraceNode& operator=(const TraceNode&) = delete;
+
+	/**
+	 * @brief Add an attribute to this node.
+	 *
+	 * Note that no quoting is applied to the @c value, so if quoting is needed it must be done by
+	 * the caller.
+	 *
+	 * @param type    The type of the attribute.
+	 * @param key     The key of the attribute.
+	 * @param value   The value of the attribute.
+	 */
+	void add_attrib(std::string type, std::string key, std::string value);
+
+	/**
+	 * @brief Destroy this node.
+	 *
+	 * Destroying a node will pop it from the top of the stack, making its parent the current node.
+	 * It is invalid behavior to destroy a node that is not the current node; usage must conform to
+	 * stack push-pop semantics.
+	 */
+	~TraceNode();
+
+	/**
+	 * @brief The number of attributes and child nodes in this node.
+	 */
+	unsigned int m_attrib_count { 0 };
+};
+
+/**
+ * @brief Class representing the trace log file being written.
+ */
+class TraceLog
+{
+public:
+	/**
+	 * @brief Create a new trace log.
+	 *
+	 * The trace log is global; there can be only one at a time.
+	 *
+	 * @param file_name   The name of the file to write.
+	 */
+	TraceLog(const char* file_name);
+
+	/**
+	 * @brief Destroy the trace log.
+	 *
+	 * Trace logs MUST be cleanly destroyed to ensure the file gets written.
+	 */
+	~TraceLog();
+
+	/**
+	 * @brief Get the current child node.
+	 *
+	 * @return The current leaf node, or @c nullptr if the node stack is empty.
+	 */
+	TraceNode* get_current_leaf();
+
+	/**
+	 * @brief Get the stack depth of the current child node.
+	 *
+	 * @return The current leaf node stack depth.
+	 */
+	size_t get_depth();
+
+	/**
+	 * @brief The file stream to write to.
+	 */
+	std::ofstream m_file;
+
+	/**
+	 * @brief The stack of nodes (newest at the back).
+	 */
+	std::vector<TraceNode*> m_stack;
+
+private:
+	/**
+	 * @brief The root node in the JSON file.
+	 */
+	TraceNode* m_root;
+};
+
+/**
+ * @brief Utility macro to create a trace node on the stack.
+ *
+ * The expansion already ends with a semicolon.
+ *
+ * @param name     The variable name to use.
+ * @param ...      The name template and format parameters.
+ */
+#define TRACE_NODE(name, ...) TraceNode name(__VA_ARGS__);
+
+/**
+ * @brief Add a string annotation to the current node.
+ *
+ * @param key      The name of the attribute.
+ * @param format   The format template for the attribute value.
+ * @param ...      The format parameters.
+ */
+void trace_add_data(const char* key, const char* format, ...);
+
+/**
+ * @brief Add a float annotation to the current node.
+ *
+ * @param key     The name of the attribute.
+ * @param value   The value of the attribute.
+ */
+void trace_add_data(const char* key, float value);
+
+/**
+ * @brief Add an integer annotation to the current node.
+ *
+ * @param key     The name of the attribute.
+ * @param value   The value of the attribute.
+ */
+void trace_add_data(const char* key, int value);
+
+/**
+ * @brief Add an unsigned integer annotation to the current node.
+ *
+ * @param key     The name of the attribute.
+ * @param value   The value of the attribute.
+ */
+void trace_add_data(const char* key, unsigned int value);
+
+#else
+
+// Diagnostics are disabled: both utilities expand to nothing, so their arguments are discarded
+// by the preprocessor
+#define TRACE_NODE(name, ...)
+
+#define trace_add_data(...)
+
+#endif
+
+#endif
@@ -0,0 +1,781 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2025 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for finding best partition for a block.
+ *
+ * The partition search operates in two stages. The first pass uses kmeans clustering to group
+ * texels into an ideal partitioning for the requested partition count, and then compares that
+ * against the 1024 partitionings generated by the ASTC partition hash function. The generated
+ * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
+ * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
+ * partitionings that actually generate fewer than the requested partition count, but only the top
+ * N candidates are actually put through a more detailed search. N is determined by the compressor
+ * quality preset.
+ *
+ * For the detailed search, each candidate is checked against two possible encoding methods:
+ *
+ *   - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
+ *   - The best partitioning assuming same chroma colors (RGB + scale endpoints).
+ *
+ * This is implemented by computing the mean color and dominant direction for each
+ * partition. This defines two lines, both of which go through the mean color value.
+ *
+ * - One line has a direction defined by the dominant direction; this is used to assess the error
+ *   from using an uncorrelated color representation.
+ * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
+ *   (RGB + scale) color representation.
+ *
+ * The best candidate is selected by computing the squared-errors that result from using these
+ * lines for endpoint selection.
+ */
+
+#include <limits>
+#include "astcenc_internal.h"
+
+/**
+ * @brief Choose the starting cluster centers for the kmeans search.
+ *
+ * The first center is a fixed pseudo-random texel. Each further center is drawn using the squared
+ * distance of every texel to its nearest already-chosen center as the selection weight, with
+ * thresholds taken from a fixed table so that the result is deterministic.
+ *
+ * @param      blk               The image block color data to compress.
+ * @param      texel_count       The number of texels in the block.
+ * @param      partition_count   The number of partitions in the block.
+ * @param[out] cluster_centers   The initial partition cluster center colors.
+ */
+static void kmeans_init(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	// Fixed thresholds from random.org for the weighted-random center selection
+	const float cluster_cutoffs[9] {
+		0.626220f, 0.932770f, 0.275454f,
+		0.318558f, 0.240113f, 0.009190f,
+		0.347661f, 0.731960f, 0.156391f
+	};
+
+	// Squared distance from each texel to its nearest chosen center
+	float nearest_dist[BLOCK_MAX_TEXELS];
+
+	// The first center is a fixed pseudo-random texel; 145897 is from random.org
+	unsigned int picked = 145897 % texel_count;
+	vfloat4 center = blk.texel(picked);
+	cluster_centers[0] = center;
+	unsigned int centers_chosen = 1;
+
+	// Seed the distance table with the distance to the first center
+	float dist_total = 0.0f;
+	for (unsigned int tix = 0; tix < texel_count; tix++)
+	{
+		vfloat4 delta = blk.texel(tix) - center;
+		float dist = dot_s(delta * delta, blk.channel_weight);
+		dist_total += dist;
+		nearest_dist[tix] = dist;
+	}
+
+	// Each partition count from 2 upwards starts at its own group of three thresholds
+	unsigned int cutoff_index = 3 * (partition_count - 2);
+
+	// Choose the remaining centers one at a time
+	for (;;)
+	{
+		// Walk the texels until the running distance sum reaches the threshold
+		float threshold = dist_total * cluster_cutoffs[cutoff_index++];
+		float running = 0.0f;
+		for (picked = 0; picked < texel_count; picked++)
+		{
+			running += nearest_dist[picked];
+			if (running >= threshold)
+			{
+				break;
+			}
+		}
+
+		// The walk can run off the end, so clamp to the last valid texel
+		picked = astc::min(picked, texel_count - 1);
+
+		center = blk.texel(picked);
+		cluster_centers[centers_chosen++] = center;
+		if (centers_chosen >= partition_count)
+		{
+			break;
+		}
+
+		// Fold the new center into the table, keeping the smaller distance per texel
+		dist_total = 0.0f;
+		for (unsigned int tix = 0; tix < texel_count; tix++)
+		{
+			vfloat4 delta = blk.texel(tix) - center;
+			float dist = dot_s(delta * delta, blk.channel_weight);
+			dist = astc::min(dist, nearest_dist[tix]);
+			dist_total += dist;
+			nearest_dist[tix] = dist;
+		}
+	}
+}
+
+/**
+ * @brief Assign every texel to the cluster with the nearest center.
+ *
+ * After the nearest-center assignment, any partition left without texels is given a texel so
+ * that every partition ends up with at least one.
+ *
+ * @param      blk                  The image block color data to compress.
+ * @param      texel_count          The number of texels in the block.
+ * @param      partition_count      The number of partitions in the block.
+ * @param      cluster_centers      The partition cluster center colors.
+ * @param[out] partition_of_texel   The partition assigned for each texel.
+ */
+static void kmeans_assign(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
+	uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	// Number of texels currently assigned to each partition
+	uint8_t texels_in_partition[BLOCK_MAX_PARTITIONS] { 0 };
+
+	// Assign each texel to the partition whose center is closest; ties keep the lowest index
+	for (unsigned int tix = 0; tix < texel_count; tix++)
+	{
+		vfloat4 color = blk.texel(tix);
+
+		unsigned int nearest = 0;
+		float nearest_dist = std::numeric_limits<float>::max();
+
+		for (unsigned int pix = 0; pix < partition_count; pix++)
+		{
+			vfloat4 delta = color - cluster_centers[pix];
+			float dist = dot_s(delta * delta, blk.channel_weight);
+			if (dist < nearest_dist)
+			{
+				nearest_dist = dist;
+				nearest = pix;
+			}
+		}
+
+		partition_of_texel[tix] = static_cast<uint8_t>(nearest);
+		texels_in_partition[nearest]++;
+	}
+
+	// A partition can end up with no texels at all. When that happens texel N is moved into
+	// partition N, which is arbitrary but guarantees the partition is not empty. Moving a texel
+	// can empty the partition it came from, so the scan repeats until a pass makes no changes.
+	bool reassigned = true;
+	while (reassigned)
+	{
+		reassigned = false;
+		for (unsigned int pix = 0; pix < partition_count; pix++)
+		{
+			if (texels_in_partition[pix] != 0)
+			{
+				continue;
+			}
+
+			texels_in_partition[partition_of_texel[pix]]--;
+			texels_in_partition[pix]++;
+			partition_of_texel[pix] = static_cast<uint8_t>(pix);
+			reassigned = true;
+		}
+	}
+}
+
+/**
+ * @brief Move each cluster center to the mean color of the texels assigned to it.
+ *
+ * @param       blk                  The image block color data to compress.
+ * @param       texel_count          The number of texels in the block.
+ * @param       partition_count      The number of partitions in the block.
+ * @param[out]  cluster_centers      The new cluster center colors.
+ * @param       partition_of_texel   The partition assigned for each texel.
+ */
+static void kmeans_update(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
+	const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	// Per-partition color totals and texel counts
+	vfloat4 color_total[BLOCK_MAX_PARTITIONS] {
+		vfloat4::zero(),
+		vfloat4::zero(),
+		vfloat4::zero(),
+		vfloat4::zero()
+	};
+
+	uint8_t texels_in_partition[BLOCK_MAX_PARTITIONS] { 0 };
+
+	// Accumulate every texel into the partition it is assigned to
+	for (unsigned int tix = 0; tix < texel_count; tix++)
+	{
+		uint8_t pix = partition_of_texel[tix];
+		color_total[pix] += blk.texel(tix);
+		texels_in_partition[pix]++;
+	}
+
+	// The mean of each partition becomes its new center
+	// NOTE(review): this divides by the texel count, so it relies on every partition holding
+	// at least one texel - confirm callers always run the assignment step first
+	for (unsigned int pix = 0; pix < partition_count; pix++)
+	{
+		float scale = 1.0f / static_cast<float>(texels_in_partition[pix]);
+		cluster_centers[pix] = color_total[pix] * scale;
+	}
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 2-partition mode.
+ *
+ * Both ways of pairing the two block partitions with the two table partitions are scored, and
+ * the better pairing is returned.
+ *
+ * @param a   The texel assignment bitvector for the block.
+ * @param b   The texel assignment bitvector for the partition table.
+ *
+ * @return    The number of bit mismatches.
+ */
+static inline uint8_t partition_mismatch2(
+	const uint64_t a[2],
+	const uint64_t b[2]
+) {
+	// Pairing 0-0 and 1-1
+	int straight = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
+
+	// Pairing 0-1 and 1-0
+	int swapped = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
+
+	int best = astc::min(straight, swapped);
+
+	// Halve the count because XOR registers every misplaced texel twice: once as missing from
+	// its expected partition, and once as present in the wrong one
+	return static_cast<uint8_t>(best / 2);
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 3-partition mode.
+ *
+ * Every way of pairing the three block partitions with the three table partitions is scored,
+ * and the best pairing is returned.
+ *
+ * @param a   The texel assignment bitvector for the block.
+ * @param b   The texel assignment bitvector for the partition table.
+ *
+ * @return    The number of bit mismatches.
+ */
+static inline uint8_t partition_mismatch3(
+	const uint64_t a[3],
+	const uint64_t b[3]
+) {
+	// d[i][j] is the XOR bit count between block partition i and table partition j
+	int d[3][3];
+	for (int i = 0; i < 3; i++)
+	{
+		for (int j = 0; j < 3; j++)
+		{
+			d[i][j] = popcount(a[i] ^ b[j]);
+		}
+	}
+
+	// Block partition 0 paired with table partition 0; best pairing of the remainder
+	int with_0 = d[0][0] + astc::min(d[1][1] + d[2][2], d[1][2] + d[2][1]);
+
+	// Block partition 0 paired with table partition 1; best pairing of the remainder
+	int with_1 = d[0][1] + astc::min(d[1][0] + d[2][2], d[1][2] + d[2][0]);
+
+	// Block partition 0 paired with table partition 2; best pairing of the remainder
+	int with_2 = d[0][2] + astc::min(d[1][0] + d[2][1], d[1][1] + d[2][0]);
+
+	int best = astc::min(with_0, with_1, with_2);
+
+	// Halve the count because XOR registers every misplaced texel twice: once as missing from
+	// its expected partition, and once as present in the wrong one
+	return static_cast<uint8_t>(best / 2);
+}
+
+/**
+ * @brief Count mismatched texel assignments between two 4-partition patterns.
+ *
+ * All pairings of block partitions with table partitions are scored, and the
+ * cheapest pairing is reported.
+ *
+ * @param a   The texel assignment bitvector for the block.
+ * @param b   The texel assignment bitvector for the partition table.
+ *
+ * @return    The number of bit mismatches.
+ */
+static inline uint8_t partition_mismatch4(
+	const uint64_t a[4],
+	const uint64_t b[4]
+) {
+	// cost[i][j] is the XOR popcount of block partition i vs table partition j
+	int cost[4][4];
+	for (int i = 0; i < 4; i++)
+	{
+		for (int j = 0; j < 4; j++)
+		{
+			cost[i][j] = popcount(a[i] ^ b[j]);
+		}
+	}
+
+	// tailXY is the cheapest way to pair block partitions 2 and 3 with table
+	// partitions X and Y
+	int tail01 = astc::min(cost[2][0] + cost[3][1], cost[2][1] + cost[3][0]);
+	int tail02 = astc::min(cost[2][0] + cost[3][2], cost[2][2] + cost[3][0]);
+	int tail03 = astc::min(cost[2][0] + cost[3][3], cost[2][3] + cost[3][0]);
+	int tail12 = astc::min(cost[2][1] + cost[3][2], cost[2][2] + cost[3][1]);
+	int tail13 = astc::min(cost[2][1] + cost[3][3], cost[2][3] + cost[3][1]);
+	int tail23 = astc::min(cost[2][2] + cost[3][3], cost[2][3] + cost[3][2]);
+
+	// Fix the table partition paired with block partition 0, try each choice
+	// for block partition 1, and finish with the matching tail
+	int pair0_with_b0 = cost[0][0] + astc::min(cost[1][1] + tail23, cost[1][2] + tail13, cost[1][3] + tail12);
+	int pair0_with_b1 = cost[0][1] + astc::min(cost[1][0] + tail23, cost[1][2] + tail03, cost[1][3] + tail02);
+	int pair0_with_b2 = cost[0][2] + astc::min(cost[1][0] + tail13, cost[1][1] + tail03, cost[1][3] + tail01);
+	int pair0_with_b3 = cost[0][3] + astc::min(cost[1][0] + tail12, cost[1][1] + tail02, cost[1][2] + tail01);
+
+	int best_cost = astc::min(pair0_with_b0, pair0_with_b1, pair0_with_b2, pair0_with_b3);
+
+	// Each misplaced texel sets a bit in two XOR results (absent from the
+	// expected partition, present in the wrong one), so halve the total
+	return static_cast<uint8_t>(best_cost / 2);
+}
+
+// Function pointer type taking two partition bitvector arrays.
+// NOTE(review): not referenced in the remainder of this file, and its return
+// type (unsigned int) differs from the uint8_t returned by the
+// partition_mismatchN helpers above - confirm whether it is still needed.
+using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
+
+/**
+ * @brief Count the partition table mismatches vs the data clustering.
+ *
+ * One mismatch count is written for each selected partitioning of the given
+ * partition count, in partitioning index order.
+ *
+ * @param      bsd               The block size information.
+ * @param      partition_count   The number of partitions in the block.
+ * @param      bitmaps           The block texel partition assignment patterns.
+ * @param[out] mismatch_counts   The array storing per partitioning mismatch counts.
+ */
+static void count_partition_mismatch_bits(
+	const block_size_descriptor& bsd,
+	unsigned int partition_count,
+	const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
+	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
+) {
+	unsigned int candidate_count = bsd.partitioning_count_selected[partition_count - 1];
+	promise(candidate_count > 0);
+
+	// Dispatch once on partition count; any count other than 2 or 3 uses the
+	// 4-partition tables
+	switch (partition_count)
+	{
+	case 2:
+		for (unsigned int idx = 0; idx < candidate_count; idx++)
+		{
+			mismatch_counts[idx] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[idx]);
+			assert(mismatch_counts[idx] < BLOCK_MAX_KMEANS_TEXELS);
+			assert(mismatch_counts[idx] < bsd.texel_count);
+		}
+		break;
+	case 3:
+		for (unsigned int idx = 0; idx < candidate_count; idx++)
+		{
+			mismatch_counts[idx] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[idx]);
+			assert(mismatch_counts[idx] < BLOCK_MAX_KMEANS_TEXELS);
+			assert(mismatch_counts[idx] < bsd.texel_count);
+		}
+		break;
+	default:
+		for (unsigned int idx = 0; idx < candidate_count; idx++)
+		{
+			mismatch_counts[idx] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[idx]);
+			assert(mismatch_counts[idx] < BLOCK_MAX_KMEANS_TEXELS);
+			assert(mismatch_counts[idx] < bsd.texel_count);
+		}
+		break;
+	}
+}
+
+/**
+ * @brief Use counting sort on the mismatch array to sort partition candidates.
+ *
+ * The sort is stable: partitionings with equal mismatch counts keep their
+ * index order.
+ *
+ * @param      texel_count          The number of histogram bins to prefix-sum; every
+ *                                  mismatch count is expected to be below this.
+ * @param      partitioning_count   The number of packed partitionings.
+ * @param      mismatch_count       Partitioning mismatch counts, in index order.
+ * @param[out] partition_ordering   Partition index values, in mismatch order.
+ *
+ * @return The number of active partitions in this selection.
+ */
+static unsigned int get_partition_ordering_by_mismatch_bits(
+	unsigned int texel_count,
+	unsigned int partitioning_count,
+	const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
+	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+	promise(partitioning_count > 0);
+
+	// One bucket per possible mismatch value
+	uint16_t buckets[BLOCK_MAX_KMEANS_TEXELS] { 0 };
+
+	// Pass 1: histogram of mismatch values
+	for (unsigned int part = 0; part < partitioning_count; part++)
+	{
+		uint8_t key = mismatch_count[part];
+		buckets[key]++;
+	}
+
+	// Pass 2: exclusive prefix sum, so each bucket holds the first output
+	// slot for its mismatch value
+	uint16_t running = 0;
+	for (unsigned int bucket = 0; bucket < texel_count; bucket++)
+	{
+		uint16_t occupancy = buckets[bucket];
+		buckets[bucket] = running;
+		running = static_cast<uint16_t>(running + occupancy);
+	}
+
+	// Pass 3: scatter each partitioning to the next free slot of its bucket
+	for (unsigned int part = 0; part < partitioning_count; part++)
+	{
+		uint8_t key = mismatch_count[part];
+		unsigned int slot = buckets[key];
+		buckets[key]++;
+		partition_ordering[slot] = static_cast<uint16_t>(part);
+	}
+
+	return partitioning_count;
+}
+
+/**
+ * @brief Use k-means clustering to compute a partition ordering for a block.
+ *
+ * The block texels are clustered, the clustering is compared against every
+ * selected partitioning for this partition count, and the partitionings are
+ * returned sorted by how few texels they mismatch.
+ *
+ * @param      bsd                  The block size information.
+ * @param      blk                  The image block color data to compress.
+ * @param      partition_count      The desired number of partitions in the block.
+ * @param[out] partition_ordering   The list of recommended partition indices, in priority order.
+ *
+ * @return The number of active partitionings in this selection.
+ */
+static unsigned int compute_kmeans_partition_ordering(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+	vfloat4 centers[BLOCK_MAX_PARTITIONS];
+	uint8_t assignment[BLOCK_MAX_TEXELS];
+
+	// First pass: seed the cluster centers and assign texels to them
+	kmeans_init(blk, bsd.texel_count, partition_count, centers);
+	kmeans_assign(blk, bsd.texel_count, partition_count, centers, assignment);
+
+	// Second and third passes: update the centers from the current
+	// assignment, then reassign
+	for (unsigned int pass = 1; pass < 3; pass++)
+	{
+		kmeans_update(blk, bsd.texel_count, partition_count, centers, assignment);
+		kmeans_assign(blk, bsd.texel_count, partition_count, centers, assignment);
+	}
+
+	// Build one bitmap per partition; bit N is set when the Nth k-means
+	// texel was assigned to that partition
+	uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
+	unsigned int sampled_texels = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
+	promise(sampled_texels > 0);
+	for (unsigned int bit = 0; bit < sampled_texels; bit++)
+	{
+		unsigned int texel = bsd.kmeans_texels[bit];
+		uint8_t part = assignment[texel];
+		bitmaps[part] |= 1ULL << bit;
+	}
+
+	// Score every candidate partitioning against the clustering
+	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
+	count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
+
+	// Order the candidates by ascending mismatch
+	unsigned int candidate_count = bsd.partitioning_count_selected[partition_count - 1];
+	return get_partition_ordering_by_mismatch_bits(
+	    sampled_texels, candidate_count, mismatch_counts, partition_ordering);
+}
+
+/**
+ * @brief Insert a partitioning into an ordered list of results, sorted by error.
+ *
+ * The lists hold the @c max_values lowest-error entries in ascending error
+ * order. A new entry that is not better than the current worst is dropped;
+ * otherwise it is inserted and the current worst entry falls off the end.
+ *
+ * @param      max_values      The max number of entries in the best result arrays.
+ * @param      this_error      The error of the new entry.
+ * @param      this_partition  The partition ID of the new entry.
+ * @param[out] best_errors     The array of best error values.
+ * @param[out] best_partitions The array of best partition values.
+ */
+static void insert_result(
+	unsigned int max_values,
+	float this_error,
+	unsigned int this_partition,
+	float* best_errors,
+	unsigned int* best_partitions)
+{
+	promise(max_values > 0);
+
+	// Reject early when the new entry cannot displace the current worst
+	unsigned int last = max_values - 1;
+	if (this_error >= best_errors[last])
+	{
+		return;
+	}
+
+	// Skip past every existing entry that is strictly better
+	unsigned int slot = 0;
+	while (slot < max_values && this_error > best_errors[slot])
+	{
+		slot++;
+	}
+
+	if (slot == max_values)
+	{
+		return;
+	}
+
+	// Shift the tail down by one, dropping the worst entry
+	for (unsigned int dst = last; dst > slot; dst--)
+	{
+		best_errors[dst] = best_errors[dst - 1];
+		best_partitions[dst] = best_partitions[dst - 1];
+	}
+
+	best_errors[slot] = this_error;
+	best_partitions[slot] = this_partition;
+}
+
+/* See header for documentation. */
+// Summary of the implementation below:
+//   1. Order the candidate partitionings with compute_kmeans_partition_ordering().
+//   2. For the first partition_search_limit candidates, estimate the error of
+//      an uncorrelated-chrominance fit and a same-chrominance fit, keeping
+//      the best requested_candidates of each in sorted lists.
+//   3. Interleave the two lists, drop duplicates, and write up to
+//      requested_candidates partition indices into best_partitions.
+// Returns the number of entries written to best_partitions.
+unsigned int find_best_partition_candidates(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	unsigned int partition_search_limit,
+	unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
+	unsigned int requested_candidates
+) {
+	// Constant used to estimate quantization error for a given partitioning; the optimal value for
+	// this depends on bitrate. These values have been determined empirically.
+	unsigned int texels_per_block = bsd.texel_count;
+	float weight_imprecision_estim = 0.055f;
+	if (texels_per_block <= 20)
+	{
+		weight_imprecision_estim = 0.03f;
+	}
+	else if (texels_per_block <= 31)
+	{
+		weight_imprecision_estim = 0.04f;
+	}
+	else if (texels_per_block <= 41)
+	{
+		weight_imprecision_estim = 0.05f;
+	}
+
+	promise(partition_count > 0);
+	promise(partition_search_limit > 0);
+
+	// Squared once here; it is later multiplied against squared vector components
+	weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
+
+	uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
+	unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
+	// Never search more candidates than exist, nor keep more results than are searched
+	partition_search_limit = astc::min(partition_search_limit, sequence_len);
+	requested_candidates = astc::min(partition_search_limit, requested_candidates);
+
+	bool uses_alpha = !blk.is_constant_channel(3);
+
+	// Partitioning errors assuming uncorrelated-chrominance endpoints
+	float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
+	unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
+
+	// Partitioning errors assuming same-chrominance endpoints
+	float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
+	unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
+
+	// NOTE(review): only the error arrays are initialized; the partition arrays
+	// are written solely by insert_result(). If fewer than requested_candidates
+	// candidates have an error below ERROR_CALC_DEFAULT, the interleave loop
+	// below reads indeterminate values - confirm this cannot happen.
+	for (unsigned int i = 0; i < requested_candidates; i++)
+	{
+		uncor_best_errors[i] = ERROR_CALC_DEFAULT;
+		samec_best_errors[i] = ERROR_CALC_DEFAULT;
+	}
+
+	if (uses_alpha)
+	{
+		// Four-component (RGBA) error estimation path
+		for (unsigned int i = 0; i < partition_search_limit; i++)
+		{
+			unsigned int partition = partition_sequence[i];
+			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
+
+			// Compute weighting to give to each component in each partition
+			partition_metrics pms[BLOCK_MAX_PARTITIONS];
+
+			compute_avgs_and_dirs_4_comp(pi, blk, pms);
+
+			line4 uncor_lines[BLOCK_MAX_PARTITIONS];
+			line4 samec_lines[BLOCK_MAX_PARTITIONS];
+
+			processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
+			processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
+
+			float line_lengths[BLOCK_MAX_PARTITIONS];
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				partition_metrics& pm = pms[j];
+
+				// Uncorrelated line: through the partition average along its direction
+				uncor_lines[j].a = pm.avg;
+				uncor_lines[j].b = normalize_safe(pm.dir, unit4());
+
+				uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
+				uncor_plines[j].bs = uncor_lines[j].b;
+
+				// Same-chroma line: through the origin towards the partition average
+				samec_lines[j].a = vfloat4::zero();
+				samec_lines[j].b = normalize_safe(pm.avg, unit4());
+
+				samec_plines[j].amod = vfloat4::zero();
+				samec_plines[j].bs = samec_lines[j].b;
+			}
+
+			float uncor_error = 0.0f;
+			float samec_error = 0.0f;
+
+			compute_error_squared_rgba(pi,
+			                           blk,
+			                           uncor_plines,
+			                           samec_plines,
+			                           line_lengths,
+			                           uncor_error,
+			                           samec_error);
+
+			// Compute an estimate of error introduced by weight quantization imprecision.
+			// This error is computed as follows, for each partition
+			//     1: compute the principal-axis vector (full length) in error-space
+			//     2: convert the principal-axis vector to regular RGB-space
+			//     3: scale the vector by a constant that estimates average quantization error
+			//     4: for each texel, square the vector, then do a dot-product with the texel's
+			//        error weight; sum up the results across all texels.
+			//     4(optimized): square the vector once, then do a dot-product with the average
+			//        texel error, then multiply by the number of texels.
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				float tpp = static_cast<float>(pi.partition_texel_count[j]);
+				vfloat4 error_weights(tpp * weight_imprecision_estim);
+
+				vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
+				vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
+
+				uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
+				samec_error += dot_s(samec_vector * samec_vector, error_weights);
+			}
+
+			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
+		}
+	}
+	else
+	{
+		// Three-component (RGB) error estimation path, used when alpha is constant
+		for (unsigned int i = 0; i < partition_search_limit; i++)
+		{
+			unsigned int partition = partition_sequence[i];
+			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
+
+			// Compute weighting to give to each component in each partition
+			partition_metrics pms[BLOCK_MAX_PARTITIONS];
+			compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
+
+			partition_lines3 plines[BLOCK_MAX_PARTITIONS];
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				partition_metrics& pm = pms[j];
+				partition_lines3& pl = plines[j];
+
+				pl.uncor_line.a = pm.avg;
+				pl.uncor_line.b = normalize_safe(pm.dir, unit3());
+
+				pl.samec_line.a = vfloat4::zero();
+				pl.samec_line.b = normalize_safe(pm.avg, unit3());
+
+				pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
+				pl.uncor_pline.bs   = pl.uncor_line.b;
+
+				pl.samec_pline.amod = vfloat4::zero();
+				pl.samec_pline.bs   = pl.samec_line.b;
+			}
+
+			float uncor_error = 0.0f;
+			float samec_error = 0.0f;
+
+			compute_error_squared_rgb(pi,
+			                          blk,
+			                          plines,
+			                          uncor_error,
+			                          samec_error);
+
+			// Compute an estimate of error introduced by weight quantization imprecision.
+			// This error is computed as follows, for each partition
+			//     1: compute the principal-axis vector (full length) in error-space
+			//     2: convert the principal-axis vector to regular RGB-space
+			//     3: scale the vector by a constant that estimates average quantization error
+			//     4: for each texel, square the vector, then do a dot-product with the texel's
+			//        error weight; sum up the results across all texels.
+			//     4(optimized): square the vector once, then do a dot-product with the average
+			//        texel error, then multiply by the number of texels.
+
+			for (unsigned int j = 0; j < partition_count; j++)
+			{
+				partition_lines3& pl = plines[j];
+
+				float tpp = static_cast<float>(pi.partition_texel_count[j]);
+				vfloat4 error_weights(tpp * weight_imprecision_estim);
+
+				vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
+				vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
+
+				uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
+				samec_error += dot3_s(samec_vector * samec_vector, error_weights);
+			}
+
+			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
+		}
+	}
+
+	// Alternate uncorrelated and same-chroma results, best first, translating
+	// each raw partition into its partition_index
+	unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
+	for (unsigned int i = 0; i < requested_candidates; i++)
+	{
+		interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
+		interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
+	}
+
+	// One "already emitted" bit per partition index
+	// NOTE(review): assumes partition_index < 1024 - TODO confirm
+	uint64_t bitmasks[1024/64] { 0 };
+	unsigned int emitted = 0;
+
+	// Deduplicate the first "requested" entries
+	for (unsigned int i = 0; i < requested_candidates * 2;  i++)
+	{
+		unsigned int partition = interleave[i];
+
+		unsigned int word = partition / 64;
+		unsigned int bit = partition % 64;
+
+		bool written = bitmasks[word] & (1ull << bit);
+
+		if (!written)
+		{
+			best_partitions[emitted] = partition;
+			bitmasks[word] |= 1ull << bit;
+			emitted++;
+
+			if (emitted == requested_candidates)
+			{
+				break;
+			}
+		}
+	}
+
+	return emitted;
+}
+
+#endif
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for creating in-memory ASTC image structures.
+ */
+
+#include <cassert>
+#include <cstring>
+
+#include "astcenc_internal.h"
+
+/**
+ * @brief Loader pipeline function type for data fetch from memory.
+ *
+ * Takes the image plane pointer and an offset to the texel, counted in
+ * elements of the plane's data type, and returns the texel as a vector.
+ */
+using pixel_loader = vfloat4(*)(const void*, int);
+
+/**
+ * @brief Loader pipeline function type for swizzling data in a vector.
+ *
+ * Takes the loaded texel and the swizzle to apply, and returns the
+ * rearranged texel.
+ */
+using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
+
+/**
+ * @brief Loader pipeline function type for converting data in a vector to
+ *        its encoded form (UNORM16 scale, or LNS for the masked channels).
+ *
+ * Takes the swizzled texel and the mask of channels that use LNS encoding.
+ */
+using pixel_converter = vfloat4(*)(vfloat4, vmask4);
+
+/**
+ * @brief Load an 8-bit UNORM texel from a data array.
+ *
+ * @param data          The data pointer.
+ * @param base_offset   The index offset to the start of the pixel.
+ *
+ * @return The texel, with each component scaled into the 0.0 to 1.0 range.
+ */
+static vfloat4 load_texel_u8(
+	const void* data,
+	int base_offset
+) {
+	const uint8_t* texel = static_cast<const uint8_t*>(data) + base_offset;
+	vint4 components(texel);
+	return int_to_float(components) / 255.0f;
+}
+
+/**
+ * @brief Load a 16-bit fp16 texel from a data array.
+ *
+ * @param data          The data pointer.
+ * @param base_offset   The index offset to the start of the pixel.
+ *
+ * @return The texel, widened to 32-bit float components.
+ */
+static vfloat4 load_texel_f16(
+	const void* data,
+	int base_offset
+) {
+	const uint16_t* texel = static_cast<const uint16_t*>(data) + base_offset;
+
+	// Gather the four raw half-float bit patterns into integer lanes
+	int bits_r = texel[0];
+	int bits_g = texel[1];
+	int bits_b = texel[2];
+	int bits_a = texel[3];
+
+	vint4 half_bits(bits_r, bits_g, bits_b, bits_a);
+	return float16_to_float(half_bits);
+}
+
+/**
+ * @brief Load a 32-bit float texel from a data array.
+ *
+ * @param data          The data pointer.
+ * @param base_offset   The index offset to the start of the pixel.
+ *
+ * @return The texel, loaded without conversion.
+ */
+static vfloat4 load_texel_f32(
+	const void* data,
+	int base_offset
+) {
+	const float* texel = static_cast<const float*>(data) + base_offset;
+	return vfloat4(texel);
+}
+
+/**
+ * @brief Identity swizzle, used when no channel rearrangement is needed.
+ *
+ * @param data   The source RGBA vector to swizzle.
+ * @param swz    The swizzle to use (ignored).
+ *
+ * @return The input vector, unchanged.
+ */
+static vfloat4 swz_texel_skip(
+	vfloat4 data,
+	const astcenc_swizzle& swz
+) {
+	// Parameter exists only to match the pixel_swizzler signature
+	static_cast<void>(swz);
+	return data;
+}
+
+/**
+ * @brief Swizzle a texel into a new arrangement.
+ *
+ * @param data   The source RGBA vector to swizzle.
+ * @param swz    The swizzle to use.
+ *
+ * @return The vector with each output channel picked by the swizzle.
+ */
+static vfloat4 swz_texel(
+	vfloat4 data,
+	const astcenc_swizzle& swz
+) {
+	// Lookup table indexed by swizzle selector: the four source channels
+	// followed by the constant 0 and constant 1 entries
+	ASTCENC_ALIGNAS float lut[6];
+	storea(data, lut);
+	lut[ASTCENC_SWZ_0] = 0.0f;
+	lut[ASTCENC_SWZ_1] = 1.0f;
+
+	float out_r = lut[swz.r];
+	float out_g = lut[swz.g];
+	float out_b = lut[swz.b];
+	float out_a = lut[swz.a];
+
+	return vfloat4(out_r, out_g, out_b, out_a);
+}
+
+/**
+ * @brief Encode a texel that is entirely LDR linear.
+ *
+ * @param data       The RGBA data to encode.
+ * @param lns_mask   The mask for the HDR channels that need LNS encoding (ignored).
+ *
+ * @return The texel scaled by 65535.
+ */
+static vfloat4 encode_texel_unorm(
+	vfloat4 data,
+	vmask4 lns_mask
+) {
+	// Parameter exists only to match the pixel_converter signature
+	static_cast<void>(lns_mask);
+
+	vfloat4 scaled = data * 65535.0f;
+	return scaled;
+}
+
+/**
+ * @brief Encode a texel that includes at least some HDR LNS channels.
+ *
+ * @param data       The RGBA data to encode.
+ * @param lns_mask   The mask for the HDR channels that need LNS encoding.
+ *
+ * @return The texel with masked channels LNS encoded and the remaining
+ *         channels scaled by 65535.
+ */
+static vfloat4 encode_texel_lns(
+	vfloat4 data,
+	vmask4 lns_mask
+) {
+	// Compute both encodings for all channels, then pick per channel
+	vfloat4 as_unorm = data * 65535.0f;
+	vfloat4 as_lns = float_to_lns(data);
+	return select(as_unorm, as_lns, lns_mask);
+}
+
+/* See header for documentation. */
+// Implementation notes: each texel of the block is loaded, swizzled and
+// converted through three function pointers chosen once up front, then
+// written to the block's per-channel arrays while the min, mean, max and
+// grayscale metadata are accumulated.
+void load_image_block(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	unsigned int xsize = img.dim_x;
+	unsigned int ysize = img.dim_y;
+	unsigned int zsize = img.dim_z;
+
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	// True if any non-identity swizzle
+	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
+	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
+
+	int idx = 0;
+
+	vfloat4 data_min(1e38f);
+	vfloat4 data_mean(0.0f);
+	// Each texel is pre-scaled by 1/texel_count as it is accumulated into the mean
+	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
+	vfloat4 data_max(-1e38f);
+	vmask4 grayscalev(true);
+
+	// This works because we impose the same choice everywhere during encode
+	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
+	                  (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
+	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
+	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
+	vmask4 lns_mask = use_lns != vint4::zero();
+
+	// Set up the function pointers for loading pipeline as needed
+	// Any data type other than F16 or F32 falls back to the 8-bit loader
+	pixel_loader loader = load_texel_u8;
+	if (img.data_type == ASTCENC_TYPE_F16)
+	{
+		loader = load_texel_f16;
+	}
+	else if  (img.data_type == ASTCENC_TYPE_F32)
+	{
+		loader = load_texel_f32;
+	}
+
+	pixel_swizzler swizzler = swz_texel_skip;
+	if (needs_swz)
+	{
+		swizzler = swz_texel;
+	}
+
+	pixel_converter converter = encode_texel_unorm;
+	if (any(lns_mask))
+	{
+		converter = encode_texel_lns;
+	}
+
+	for (unsigned int z = 0; z < bsd.zdim; z++)
+	{
+		// Coordinates are clamped to the last valid plane/row/column, so a
+		// block overhanging the image edge repeats the edge texels
+		unsigned int zi = astc::min(zpos + z, zsize - 1);
+		void* plane = img.data[zi];
+
+		for (unsigned int y = 0; y < bsd.ydim; y++)
+		{
+			unsigned int yi = astc::min(ypos + y, ysize - 1);
+
+			for (unsigned int x = 0; x < bsd.xdim; x++)
+			{
+				unsigned int xi = astc::min(xpos + x, xsize - 1);
+
+				// Offset counts components (4 per texel), not bytes
+				// NOTE(review): computed as unsigned int and passed as int -
+				// confirm image sizes keep this within int range
+				vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
+				datav = swizzler(datav, swz);
+				datav = converter(datav, lns_mask);
+
+				// Compute block metadata
+				data_min = min(data_min, datav);
+				data_mean += datav * data_mean_scale;
+				data_max = max(data_max, datav);
+
+				// Stays set only while R == G and R == B for every texel so far
+				grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());
+
+				blk.data_r[idx] = datav.lane<0>();
+				blk.data_g[idx] = datav.lane<1>();
+				blk.data_b[idx] = datav.lane<2>();
+				blk.data_a[idx] = datav.lane<3>();
+
+				blk.rgb_lns[idx] = rgb_lns;
+				blk.alpha_lns[idx] = a_lns;
+
+				idx++;
+			}
+		}
+	}
+
+	// Reverse the encoding so we store origin block in the original format
+	vfloat4 data_enc = blk.texel(0);
+	vfloat4 data_enc_unorm = data_enc / 65535.0f;
+	vfloat4 data_enc_lns = vfloat4::zero();
+
+	// The LNS decode is only computed when at least one channel uses LNS
+	if (rgb_lns || a_lns)
+	{
+		data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
+	}
+
+	blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);
+
+	// Store block metadata
+	blk.data_min = data_min;
+	blk.data_mean = data_mean;
+	blk.data_max = data_max;
+	blk.grayscale = all(grayscalev);
+}
+
+/* See header for documentation. */
+// Specialized loader: reads 8-bit texels from image plane 0 only, and applies
+// neither the swizzle nor any decode-mode dependent conversion.
+void load_image_block_fast_ldr(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	// Unused on this path; present to match the general loader's signature
+	(void)swz;
+	(void)decode_mode;
+
+	const unsigned int image_w = img.dim_x;
+	const unsigned int image_h = img.dim_y;
+
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	vfloat4 running_min(1e38f);
+	vfloat4 running_sum = vfloat4::zero();
+	vfloat4 running_max(-1e38f);
+	vmask4 is_gray(true);
+	int texel = 0;
+
+	const uint8_t* pixels = static_cast<const uint8_t*>(img.data[0]);
+	const unsigned int y_stop = ypos + bsd.ydim;
+	const unsigned int x_stop = xpos + bsd.xdim;
+
+	for (unsigned int y = ypos; y < y_stop; y++)
+	{
+		// Rows past the bottom of the image repeat the last row
+		unsigned int src_y = astc::min(y, image_h - 1);
+		const uint8_t* src_row = pixels + (4 * image_w * src_y);
+
+		for (unsigned int x = xpos; x < x_stop; x++)
+		{
+			// Columns past the right edge repeat the last column
+			unsigned int src_x = astc::min(x, image_w - 1);
+
+			// Expand 0..255 straight to the 0..65535 working range
+			vint4 rgba8 = vint4(src_row + (4 * src_x));
+			vfloat4 rgba = int_to_float(rgba8) * (65535.0f / 255.0f);
+
+			// Per-texel channel data
+			blk.data_r[texel] = rgba.lane<0>();
+			blk.data_g[texel] = rgba.lane<1>();
+			blk.data_b[texel] = rgba.lane<2>();
+			blk.data_a[texel] = rgba.lane<3>();
+			texel++;
+
+			// Block statistics
+			running_min = min(running_min, rgba);
+			running_sum += rgba;
+			running_max = max(running_max, rgba);
+
+			// Stays set only while R == G and R == B for every texel so far
+			is_gray = is_gray & (rgba.swz<0,0,0,0>() == rgba.swz<1,1,2,2>());
+		}
+	}
+
+	// Undo the 65535 scale so the origin texel is stored in the original format
+	blk.origin_texel = blk.texel(0) / 65535.0f;
+
+	// Only the first LNS flag entry is written on this path
+	blk.rgb_lns[0] = 0;
+	blk.alpha_lns[0] = 0;
+
+	// Publish the block statistics
+	blk.data_min = running_min;
+	blk.data_mean = running_sum / static_cast<float>(bsd.texel_count);
+	blk.data_max = running_max;
+	blk.grayscale = all(is_gray);
+}
+
+/* See header for documentation. */
+// Implementation notes: writes the part of the block that lies inside the
+// image, with one code path per output data type (U8, F16, F32). The U8 path
+// handles ASTCENC_SIMD_WIDTH texels per iteration with a masked store; the
+// F16 and F32 paths handle one texel per iteration.
+void store_image_block(
+	astcenc_image& img,
+	const image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	// x_nudge is the number of block texels per row that fall outside the
+	// image; idx skips over them after each stored row
+	unsigned int x_size = img.dim_x;
+	unsigned int x_start = xpos;
+	unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
+	unsigned int x_count = x_end - x_start;
+	unsigned int x_nudge = bsd.xdim - x_count;
+
+	// y_nudge is the number of block texels in the rows that fall outside the
+	// image; idx skips over them after each stored plane
+	unsigned int y_size = img.dim_y;
+	unsigned int y_start = ypos;
+	unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
+	unsigned int y_count = y_end - y_start;
+	unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
+
+	unsigned int z_size = img.dim_z;
+	unsigned int z_start = zpos;
+	unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
+
+	// True if any non-identity swizzle
+	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
+	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
+
+	// True if any swizzle uses Z reconstruct
+	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
+	               (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
+
+	// Running index into the block's per-channel texel arrays
+	int idx = 0;
+	if (img.data_type == ASTCENC_TYPE_U8)
+	{
+		for (unsigned int z = z_start; z < z_end; z++)
+		{
+			// Fetch the image plane
+			uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
+
+			for (unsigned int y = y_start; y < y_end; y++)
+			{
+				uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
+
+				for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
+				{
+					// The last iteration of a row may cover fewer texels than the SIMD width
+					unsigned int max_texels = ASTCENC_SIMD_WIDTH;
+					unsigned int used_texels = astc::min(x_count - x, max_texels);
+
+					// Unaligned load as rows are not always SIMD_WIDTH long
+					vfloat data_r(blk.data_r + idx);
+					vfloat data_g(blk.data_g + idx);
+					vfloat data_b(blk.data_b + idx);
+					vfloat data_a(blk.data_a + idx);
+
+					// Clamp to at most 1.0, then scale to 0..255 and convert to integer
+					vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
+					vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
+					vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
+					vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
+
+					if (needs_swz)
+					{
+						// Table indexed by swizzle selector; the Z entry is
+						// only filled in (and only read) when needs_z is set
+						vint swizzle_table[7];
+						swizzle_table[ASTCENC_SWZ_0] = vint(0);
+						swizzle_table[ASTCENC_SWZ_1] = vint(255);
+						swizzle_table[ASTCENC_SWZ_R] = data_ri;
+						swizzle_table[ASTCENC_SWZ_G] = data_gi;
+						swizzle_table[ASTCENC_SWZ_B] = data_bi;
+						swizzle_table[ASTCENC_SWZ_A] = data_ai;
+
+						if (needs_z)
+						{
+							// Treat R and A as X and Y remapped from [0, 1] to [-1, 1],
+							// compute z = sqrt(max(1 - x*x - y*y, 0)), and remap back
+							vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
+							vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
+							vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
+							data_z = max(data_z, 0.0f);
+							data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
+
+							swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
+						}
+
+						data_ri = swizzle_table[swz.r];
+						data_gi = swizzle_table[swz.g];
+						data_bi = swizzle_table[swz.b];
+						data_ai = swizzle_table[swz.a];
+					}
+
+					// Errors are NaN encoded - convert to magenta error color
+					// Branch is OK here - it is almost never true so predicts well
+					// Only the red channel is tested for NaN
+					vmask nan_mask = data_r != data_r;
+					if (any(nan_mask))
+					{
+						data_ri = select(data_ri, vint(0xFF), nan_mask);
+						data_gi = select(data_gi, vint(0x00), nan_mask);
+						data_bi = select(data_bi, vint(0xFF), nan_mask);
+						data_ai = select(data_ai, vint(0xFF), nan_mask);
+					}
+
+					// Pack to RGBA8 and write only the lanes that are inside the row
+					vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
+					vmask store_mask = vint::lane_id() < vint(used_texels);
+					store_lanes_masked(data8_row, data_rgbai, store_mask);
+
+					data8_row += ASTCENC_SIMD_WIDTH * 4;
+					idx += used_texels;
+				}
+				idx += x_nudge;
+			}
+			idx += y_nudge;
+		}
+	}
+	else if (img.data_type == ASTCENC_TYPE_F16)
+	{
+		for (unsigned int z = z_start; z < z_end; z++)
+		{
+			// Fetch the image plane
+			uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
+
+			for (unsigned int y = y_start; y < y_end; y++)
+			{
+				uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
+
+				for (unsigned int x = 0; x < x_count; x++)
+				{
+					vint4 color;
+
+					// NaNs are handled inline - no need to special case
+					if (needs_swz)
+					{
+						// Table indexed by swizzle selector; the Z entry is
+						// only filled in (and only read) when needs_z is set
+						float data[7];
+						data[ASTCENC_SWZ_0] = 0.0f;
+						data[ASTCENC_SWZ_1] = 1.0f;
+						data[ASTCENC_SWZ_R] = blk.data_r[idx];
+						data[ASTCENC_SWZ_G] = blk.data_g[idx];
+						data[ASTCENC_SWZ_B] = blk.data_b[idx];
+						data[ASTCENC_SWZ_A] = blk.data_a[idx];
+
+						if (needs_z)
+						{
+							// NOTE(review): data[0] and data[3] presumably address the
+							// R and A entries - assumes ASTCENC_SWZ_R == 0 and
+							// ASTCENC_SWZ_A == 3; TODO confirm
+							float xN = (data[0] * 2.0f) - 1.0f;
+							float yN = (data[3] * 2.0f) - 1.0f;
+							float zN = 1.0f - xN * xN - yN * yN;
+							if (zN < 0.0f)
+							{
+								zN = 0.0f;
+							}
+							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
+						}
+
+						vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+						color = float_to_float16(colorf);
+					}
+					else
+					{
+						vfloat4 colorf = blk.texel(idx);
+						color = float_to_float16(colorf);
+					}
+
+					// TODO: Vectorize with store N shorts?
+					data16_row[0] = static_cast<uint16_t>(color.lane<0>());
+					data16_row[1] = static_cast<uint16_t>(color.lane<1>());
+					data16_row[2] = static_cast<uint16_t>(color.lane<2>());
+					data16_row[3] = static_cast<uint16_t>(color.lane<3>());
+					data16_row += 4;
+					idx++;
+				}
+				idx += x_nudge;
+			}
+			idx += y_nudge;
+		}
+	}
+	else // if (img.data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img.data_type == ASTCENC_TYPE_F32);
+
+		for (unsigned int z = z_start; z < z_end; z++)
+		{
+			// Fetch the image plane
+			float* data32 = static_cast<float*>(img.data[z]);
+
+			for (unsigned int y = y_start; y < y_end; y++)
+			{
+				float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
+
+				for (unsigned int x = 0; x < x_count; x++)
+				{
+					vfloat4 color = blk.texel(idx);
+
+					// NaNs are handled inline - no need to special case
+					if (needs_swz)
+					{
+						// Table indexed by swizzle selector; the Z entry is
+						// only filled in (and only read) when needs_z is set
+						float data[7];
+						data[ASTCENC_SWZ_0] = 0.0f;
+						data[ASTCENC_SWZ_1] = 1.0f;
+						data[ASTCENC_SWZ_R] = color.lane<0>();
+						data[ASTCENC_SWZ_G] = color.lane<1>();
+						data[ASTCENC_SWZ_B] = color.lane<2>();
+						data[ASTCENC_SWZ_A] = color.lane<3>();
+
+						if (needs_z)
+						{
+							// NOTE(review): data[0] and data[3] presumably address the
+							// R and A entries - assumes ASTCENC_SWZ_R == 0 and
+							// ASTCENC_SWZ_A == 3; TODO confirm
+							float xN = (data[0] * 2.0f) - 1.0f;
+							float yN = (data[3] * 2.0f) - 1.0f;
+							float zN = 1.0f - xN * xN - yN * yN;
+							if (zN < 0.0f)
+							{
+								zN = 0.0f;
+							}
+							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
+						}
+
+						color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+					}
+
+					store(color, data32_row);
+					data32_row += 4;
+					idx++;
+				}
+				idx += x_nudge;
+			}
+			idx += y_nudge;
+		}
+	}
+}
@@ -0,0 +1,739 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for encoding/decoding Bounded Integer Sequence Encoding.
+ */
+
+#include "astcenc_internal.h"
+
+#include <array>
+
+/**
+ * @brief Unpacked quint triplets <low,middle,high> for each packed value.
+ *
+ * The table has 128 rows, one per packed 7-bit quint-block value. In this file decode_ise() looks
+ * up a row with the packed block value and applies element [0] to the first value of each group
+ * of three, element [1] to the second, and element [2] to the third.
+ */
+// TODO: Bitpack these into a uint16_t?
+static const uint8_t quints_of_integer[128][3] {
+	{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
+	{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
+	{0, 1, 0}, {1, 1, 0}, {2, 1, 0}, {3, 1, 0},
+	{4, 1, 0}, {1, 4, 0}, {4, 4, 1}, {4, 4, 4},
+	{0, 2, 0}, {1, 2, 0}, {2, 2, 0}, {3, 2, 0},
+	{4, 2, 0}, {2, 4, 0}, {4, 4, 2}, {4, 4, 4},
+	{0, 3, 0}, {1, 3, 0}, {2, 3, 0}, {3, 3, 0},
+	{4, 3, 0}, {3, 4, 0}, {4, 4, 3}, {4, 4, 4},
+	{0, 0, 1}, {1, 0, 1}, {2, 0, 1}, {3, 0, 1},
+	{4, 0, 1}, {0, 4, 1}, {4, 0, 4}, {0, 4, 4},
+	{0, 1, 1}, {1, 1, 1}, {2, 1, 1}, {3, 1, 1},
+	{4, 1, 1}, {1, 4, 1}, {4, 1, 4}, {1, 4, 4},
+	{0, 2, 1}, {1, 2, 1}, {2, 2, 1}, {3, 2, 1},
+	{4, 2, 1}, {2, 4, 1}, {4, 2, 4}, {2, 4, 4},
+	{0, 3, 1}, {1, 3, 1}, {2, 3, 1}, {3, 3, 1},
+	{4, 3, 1}, {3, 4, 1}, {4, 3, 4}, {3, 4, 4},
+	{0, 0, 2}, {1, 0, 2}, {2, 0, 2}, {3, 0, 2},
+	{4, 0, 2}, {0, 4, 2}, {2, 0, 4}, {3, 0, 4},
+	{0, 1, 2}, {1, 1, 2}, {2, 1, 2}, {3, 1, 2},
+	{4, 1, 2}, {1, 4, 2}, {2, 1, 4}, {3, 1, 4},
+	{0, 2, 2}, {1, 2, 2}, {2, 2, 2}, {3, 2, 2},
+	{4, 2, 2}, {2, 4, 2}, {2, 2, 4}, {3, 2, 4},
+	{0, 3, 2}, {1, 3, 2}, {2, 3, 2}, {3, 3, 2},
+	{4, 3, 2}, {3, 4, 2}, {2, 3, 4}, {3, 3, 4},
+	{0, 0, 3}, {1, 0, 3}, {2, 0, 3}, {3, 0, 3},
+	{4, 0, 3}, {0, 4, 3}, {0, 0, 4}, {1, 0, 4},
+	{0, 1, 3}, {1, 1, 3}, {2, 1, 3}, {3, 1, 3},
+	{4, 1, 3}, {1, 4, 3}, {0, 1, 4}, {1, 1, 4},
+	{0, 2, 3}, {1, 2, 3}, {2, 2, 3}, {3, 2, 3},
+	{4, 2, 3}, {2, 4, 3}, {0, 2, 4}, {1, 2, 4},
+	{0, 3, 3}, {1, 3, 3}, {2, 3, 3}, {3, 3, 3},
+	{4, 3, 3}, {3, 4, 3}, {0, 3, 4}, {1, 3, 4}
+};
+
+/**
+ * @brief Packed quint values for each unpacked value, indexed [hi][mid][lo].
+ *
+ * In this file encode_ise() indexes the table as [i2][i1][i0], where i0 is the quint taken from
+ * the first value of a group of three and i2 the quint taken from the last.
+ */
+static const uint8_t integer_of_quints[5][5][5] {
+	{
+		{0, 1, 2, 3, 4},
+		{8, 9, 10, 11, 12},
+		{16, 17, 18, 19, 20},
+		{24, 25, 26, 27, 28},
+		{5, 13, 21, 29, 6}
+	},
+	{
+		{32, 33, 34, 35, 36},
+		{40, 41, 42, 43, 44},
+		{48, 49, 50, 51, 52},
+		{56, 57, 58, 59, 60},
+		{37, 45, 53, 61, 14}
+	},
+	{
+		{64, 65, 66, 67, 68},
+		{72, 73, 74, 75, 76},
+		{80, 81, 82, 83, 84},
+		{88, 89, 90, 91, 92},
+		{69, 77, 85, 93, 22}
+	},
+	{
+		{96, 97, 98, 99, 100},
+		{104, 105, 106, 107, 108},
+		{112, 113, 114, 115, 116},
+		{120, 121, 122, 123, 124},
+		{101, 109, 117, 125, 30}
+	},
+	{
+		{102, 103, 70, 71, 38},
+		{110, 111, 78, 79, 46},
+		{118, 119, 86, 87, 54},
+		{126, 127, 94, 95, 62},
+		{39, 47, 55, 63, 31}
+	}
+};
+
+/**
+ * @brief Unpacked trit quintuplets <low,...,high> for each packed value.
+ *
+ * The table has 256 rows, one per packed 8-bit trit-block value. In this file decode_ise() looks
+ * up a row with the packed block value and applies elements [0] to [4] to the five values of the
+ * corresponding group, in order.
+ */
+// TODO: Bitpack these into a uint16_t?
+static const uint8_t trits_of_integer[256][5] {
+	{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
+	{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
+	{0, 2, 0, 0, 0}, {1, 2, 0, 0, 0}, {2, 2, 0, 0, 0}, {2, 0, 2, 0, 0},
+	{0, 2, 2, 0, 0}, {1, 2, 2, 0, 0}, {2, 2, 2, 0, 0}, {2, 0, 2, 0, 0},
+	{0, 0, 1, 0, 0}, {1, 0, 1, 0, 0}, {2, 0, 1, 0, 0}, {0, 1, 2, 0, 0},
+	{0, 1, 1, 0, 0}, {1, 1, 1, 0, 0}, {2, 1, 1, 0, 0}, {1, 1, 2, 0, 0},
+	{0, 2, 1, 0, 0}, {1, 2, 1, 0, 0}, {2, 2, 1, 0, 0}, {2, 1, 2, 0, 0},
+	{0, 0, 0, 2, 2}, {1, 0, 0, 2, 2}, {2, 0, 0, 2, 2}, {0, 0, 2, 2, 2},
+	{0, 0, 0, 1, 0}, {1, 0, 0, 1, 0}, {2, 0, 0, 1, 0}, {0, 0, 2, 1, 0},
+	{0, 1, 0, 1, 0}, {1, 1, 0, 1, 0}, {2, 1, 0, 1, 0}, {1, 0, 2, 1, 0},
+	{0, 2, 0, 1, 0}, {1, 2, 0, 1, 0}, {2, 2, 0, 1, 0}, {2, 0, 2, 1, 0},
+	{0, 2, 2, 1, 0}, {1, 2, 2, 1, 0}, {2, 2, 2, 1, 0}, {2, 0, 2, 1, 0},
+	{0, 0, 1, 1, 0}, {1, 0, 1, 1, 0}, {2, 0, 1, 1, 0}, {0, 1, 2, 1, 0},
+	{0, 1, 1, 1, 0}, {1, 1, 1, 1, 0}, {2, 1, 1, 1, 0}, {1, 1, 2, 1, 0},
+	{0, 2, 1, 1, 0}, {1, 2, 1, 1, 0}, {2, 2, 1, 1, 0}, {2, 1, 2, 1, 0},
+	{0, 1, 0, 2, 2}, {1, 1, 0, 2, 2}, {2, 1, 0, 2, 2}, {1, 0, 2, 2, 2},
+	{0, 0, 0, 2, 0}, {1, 0, 0, 2, 0}, {2, 0, 0, 2, 0}, {0, 0, 2, 2, 0},
+	{0, 1, 0, 2, 0}, {1, 1, 0, 2, 0}, {2, 1, 0, 2, 0}, {1, 0, 2, 2, 0},
+	{0, 2, 0, 2, 0}, {1, 2, 0, 2, 0}, {2, 2, 0, 2, 0}, {2, 0, 2, 2, 0},
+	{0, 2, 2, 2, 0}, {1, 2, 2, 2, 0}, {2, 2, 2, 2, 0}, {2, 0, 2, 2, 0},
+	{0, 0, 1, 2, 0}, {1, 0, 1, 2, 0}, {2, 0, 1, 2, 0}, {0, 1, 2, 2, 0},
+	{0, 1, 1, 2, 0}, {1, 1, 1, 2, 0}, {2, 1, 1, 2, 0}, {1, 1, 2, 2, 0},
+	{0, 2, 1, 2, 0}, {1, 2, 1, 2, 0}, {2, 2, 1, 2, 0}, {2, 1, 2, 2, 0},
+	{0, 2, 0, 2, 2}, {1, 2, 0, 2, 2}, {2, 2, 0, 2, 2}, {2, 0, 2, 2, 2},
+	{0, 0, 0, 0, 2}, {1, 0, 0, 0, 2}, {2, 0, 0, 0, 2}, {0, 0, 2, 0, 2},
+	{0, 1, 0, 0, 2}, {1, 1, 0, 0, 2}, {2, 1, 0, 0, 2}, {1, 0, 2, 0, 2},
+	{0, 2, 0, 0, 2}, {1, 2, 0, 0, 2}, {2, 2, 0, 0, 2}, {2, 0, 2, 0, 2},
+	{0, 2, 2, 0, 2}, {1, 2, 2, 0, 2}, {2, 2, 2, 0, 2}, {2, 0, 2, 0, 2},
+	{0, 0, 1, 0, 2}, {1, 0, 1, 0, 2}, {2, 0, 1, 0, 2}, {0, 1, 2, 0, 2},
+	{0, 1, 1, 0, 2}, {1, 1, 1, 0, 2}, {2, 1, 1, 0, 2}, {1, 1, 2, 0, 2},
+	{0, 2, 1, 0, 2}, {1, 2, 1, 0, 2}, {2, 2, 1, 0, 2}, {2, 1, 2, 0, 2},
+	{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 0, 2, 2, 2},
+	{0, 0, 0, 0, 1}, {1, 0, 0, 0, 1}, {2, 0, 0, 0, 1}, {0, 0, 2, 0, 1},
+	{0, 1, 0, 0, 1}, {1, 1, 0, 0, 1}, {2, 1, 0, 0, 1}, {1, 0, 2, 0, 1},
+	{0, 2, 0, 0, 1}, {1, 2, 0, 0, 1}, {2, 2, 0, 0, 1}, {2, 0, 2, 0, 1},
+	{0, 2, 2, 0, 1}, {1, 2, 2, 0, 1}, {2, 2, 2, 0, 1}, {2, 0, 2, 0, 1},
+	{0, 0, 1, 0, 1}, {1, 0, 1, 0, 1}, {2, 0, 1, 0, 1}, {0, 1, 2, 0, 1},
+	{0, 1, 1, 0, 1}, {1, 1, 1, 0, 1}, {2, 1, 1, 0, 1}, {1, 1, 2, 0, 1},
+	{0, 2, 1, 0, 1}, {1, 2, 1, 0, 1}, {2, 2, 1, 0, 1}, {2, 1, 2, 0, 1},
+	{0, 0, 1, 2, 2}, {1, 0, 1, 2, 2}, {2, 0, 1, 2, 2}, {0, 1, 2, 2, 2},
+	{0, 0, 0, 1, 1}, {1, 0, 0, 1, 1}, {2, 0, 0, 1, 1}, {0, 0, 2, 1, 1},
+	{0, 1, 0, 1, 1}, {1, 1, 0, 1, 1}, {2, 1, 0, 1, 1}, {1, 0, 2, 1, 1},
+	{0, 2, 0, 1, 1}, {1, 2, 0, 1, 1}, {2, 2, 0, 1, 1}, {2, 0, 2, 1, 1},
+	{0, 2, 2, 1, 1}, {1, 2, 2, 1, 1}, {2, 2, 2, 1, 1}, {2, 0, 2, 1, 1},
+	{0, 0, 1, 1, 1}, {1, 0, 1, 1, 1}, {2, 0, 1, 1, 1}, {0, 1, 2, 1, 1},
+	{0, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 1, 1}, {1, 1, 2, 1, 1},
+	{0, 2, 1, 1, 1}, {1, 2, 1, 1, 1}, {2, 2, 1, 1, 1}, {2, 1, 2, 1, 1},
+	{0, 1, 1, 2, 2}, {1, 1, 1, 2, 2}, {2, 1, 1, 2, 2}, {1, 1, 2, 2, 2},
+	{0, 0, 0, 2, 1}, {1, 0, 0, 2, 1}, {2, 0, 0, 2, 1}, {0, 0, 2, 2, 1},
+	{0, 1, 0, 2, 1}, {1, 1, 0, 2, 1}, {2, 1, 0, 2, 1}, {1, 0, 2, 2, 1},
+	{0, 2, 0, 2, 1}, {1, 2, 0, 2, 1}, {2, 2, 0, 2, 1}, {2, 0, 2, 2, 1},
+	{0, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {2, 2, 2, 2, 1}, {2, 0, 2, 2, 1},
+	{0, 0, 1, 2, 1}, {1, 0, 1, 2, 1}, {2, 0, 1, 2, 1}, {0, 1, 2, 2, 1},
+	{0, 1, 1, 2, 1}, {1, 1, 1, 2, 1}, {2, 1, 1, 2, 1}, {1, 1, 2, 2, 1},
+	{0, 2, 1, 2, 1}, {1, 2, 1, 2, 1}, {2, 2, 1, 2, 1}, {2, 1, 2, 2, 1},
+	{0, 2, 1, 2, 2}, {1, 2, 1, 2, 2}, {2, 2, 1, 2, 2}, {2, 1, 2, 2, 2},
+	{0, 0, 0, 1, 2}, {1, 0, 0, 1, 2}, {2, 0, 0, 1, 2}, {0, 0, 2, 1, 2},
+	{0, 1, 0, 1, 2}, {1, 1, 0, 1, 2}, {2, 1, 0, 1, 2}, {1, 0, 2, 1, 2},
+	{0, 2, 0, 1, 2}, {1, 2, 0, 1, 2}, {2, 2, 0, 1, 2}, {2, 0, 2, 1, 2},
+	{0, 2, 2, 1, 2}, {1, 2, 2, 1, 2}, {2, 2, 2, 1, 2}, {2, 0, 2, 1, 2},
+	{0, 0, 1, 1, 2}, {1, 0, 1, 1, 2}, {2, 0, 1, 1, 2}, {0, 1, 2, 1, 2},
+	{0, 1, 1, 1, 2}, {1, 1, 1, 1, 2}, {2, 1, 1, 1, 2}, {1, 1, 2, 1, 2},
+	{0, 2, 1, 1, 2}, {1, 2, 1, 1, 2}, {2, 2, 1, 1, 2}, {2, 1, 2, 1, 2},
+	{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 1, 2, 2, 2}
+};
+
+/**
+ * @brief Packed trit values for each unpacked value, indexed [hi][][][][lo].
+ *
+ * In this file encode_ise() indexes the table as [i4][i3][i2][i1][i0], where i0 is the trit taken
+ * from the first value of a group of five and i4 the trit taken from the last.
+ */
+static const uint8_t integer_of_trits[3][3][3][3][3] {
+	{
+		{
+			{
+				{0, 1, 2},
+				{4, 5, 6},
+				{8, 9, 10}
+			},
+			{
+				{16, 17, 18},
+				{20, 21, 22},
+				{24, 25, 26}
+			},
+			{
+				{3, 7, 15},
+				{19, 23, 27},
+				{12, 13, 14}
+			}
+		},
+		{
+			{
+				{32, 33, 34},
+				{36, 37, 38},
+				{40, 41, 42}
+			},
+			{
+				{48, 49, 50},
+				{52, 53, 54},
+				{56, 57, 58}
+			},
+			{
+				{35, 39, 47},
+				{51, 55, 59},
+				{44, 45, 46}
+			}
+		},
+		{
+			{
+				{64, 65, 66},
+				{68, 69, 70},
+				{72, 73, 74}
+			},
+			{
+				{80, 81, 82},
+				{84, 85, 86},
+				{88, 89, 90}
+			},
+			{
+				{67, 71, 79},
+				{83, 87, 91},
+				{76, 77, 78}
+			}
+		}
+	},
+	{
+		{
+			{
+				{128, 129, 130},
+				{132, 133, 134},
+				{136, 137, 138}
+			},
+			{
+				{144, 145, 146},
+				{148, 149, 150},
+				{152, 153, 154}
+			},
+			{
+				{131, 135, 143},
+				{147, 151, 155},
+				{140, 141, 142}
+			}
+		},
+		{
+			{
+				{160, 161, 162},
+				{164, 165, 166},
+				{168, 169, 170}
+			},
+			{
+				{176, 177, 178},
+				{180, 181, 182},
+				{184, 185, 186}
+			},
+			{
+				{163, 167, 175},
+				{179, 183, 187},
+				{172, 173, 174}
+			}
+		},
+		{
+			{
+				{192, 193, 194},
+				{196, 197, 198},
+				{200, 201, 202}
+			},
+			{
+				{208, 209, 210},
+				{212, 213, 214},
+				{216, 217, 218}
+			},
+			{
+				{195, 199, 207},
+				{211, 215, 219},
+				{204, 205, 206}
+			}
+		}
+	},
+	{
+		{
+			{
+				{96, 97, 98},
+				{100, 101, 102},
+				{104, 105, 106}
+			},
+			{
+				{112, 113, 114},
+				{116, 117, 118},
+				{120, 121, 122}
+			},
+			{
+				{99, 103, 111},
+				{115, 119, 123},
+				{108, 109, 110}
+			}
+		},
+		{
+			{
+				{224, 225, 226},
+				{228, 229, 230},
+				{232, 233, 234}
+			},
+			{
+				{240, 241, 242},
+				{244, 245, 246},
+				{248, 249, 250}
+			},
+			{
+				{227, 231, 239},
+				{243, 247, 251},
+				{236, 237, 238}
+			}
+		},
+		{
+			{
+				{28, 29, 30},
+				{60, 61, 62},
+				{92, 93, 94}
+			},
+			{
+				{156, 157, 158},
+				{188, 189, 190},
+				{220, 221, 222}
+			},
+			{
+				{31, 63, 127},
+				{159, 191, 255},
+				{252, 253, 254}
+			}
+		}
+	}
+};
+
+/**
+ * @brief The number of bits, trits, and quints needed for a quant level.
+ *
+ * The three fields are bitfields packed into a uint8_t. The trit and quint fields are one bit
+ * wide, so they can only hold 0 or 1; the code in this file only tests them for being non-zero.
+ */
+struct btq_count
+{
+	/** @brief The number of bits (6-bit field). */
+	uint8_t bits:6;
+
+	/** @brief The number of trits (1-bit field, so 0 or 1). */
+	uint8_t trits:1;
+
+	/** @brief The number of quints (1-bit field, so 0 or 1). */
+	uint8_t quints:1;
+};
+
+/**
+ * @brief The table of bits, trits, and quints needed for a quant encode.
+ *
+ * Indexed by quant level. Each row is { bits, trits, quints }; the trailing comment on each row
+ * names the quant level it describes. No row sets both the trit and the quint flag.
+ */
+static const std::array<btq_count, 21> btq_counts {{
+	{ 1, 0, 0 }, // QUANT_2
+	{ 0, 1, 0 }, // QUANT_3
+	{ 2, 0, 0 }, // QUANT_4
+	{ 0, 0, 1 }, // QUANT_5
+	{ 1, 1, 0 }, // QUANT_6
+	{ 3, 0, 0 }, // QUANT_8
+	{ 1, 0, 1 }, // QUANT_10
+	{ 2, 1, 0 }, // QUANT_12
+	{ 4, 0, 0 }, // QUANT_16
+	{ 2, 0, 1 }, // QUANT_20
+	{ 3, 1, 0 }, // QUANT_24
+	{ 5, 0, 0 }, // QUANT_32
+	{ 3, 0, 1 }, // QUANT_40
+	{ 4, 1, 0 }, // QUANT_48
+	{ 6, 0, 0 }, // QUANT_64
+	{ 4, 0, 1 }, // QUANT_80
+	{ 5, 1, 0 }, // QUANT_96
+	{ 7, 0, 0 }, // QUANT_128
+	{ 5, 0, 1 }, // QUANT_160
+	{ 6, 1, 0 }, // QUANT_192
+	{ 8, 0, 0 }  // QUANT_256
+}};
+
+/**
+ * @brief The sequence scale and encoded divisor needed to compute sizing.
+ *
+ * The length of a quantized sequence in bits is:
+ *     (scale * <sequence_len> + round) / divisor
+ *
+ * Only the scale and an encoded divisor are stored. get_ise_sequence_bitcount() expands the
+ * stored divisor value d to the real divisor (2 * d + 1), and uses (real divisor - 1) as the
+ * rounding term, so the division rounds up.
+ */
+struct ise_size
+{
+	/** @brief The scaling parameter (6-bit field). */
+	uint8_t scale:6;
+
+	/** @brief The divisor parameter, stored encoded (2-bit field); real divisor is 2 * d + 1. */
+	uint8_t divisor:2;
+};
+
+/**
+ * @brief The table of scales and encoded divisors needed for quant sizing.
+ *
+ * Indexed by quant level. Each row is { scale, encoded divisor }; the trailing comment on each
+ * row names the quant level it describes. The encoded divisors 0, 1, and 2 used here expand to
+ * real divisors of 1, 3, and 5 in get_ise_sequence_bitcount().
+ */
+static const std::array<ise_size, 21> ise_sizes {{
+	{  1, 0 }, // QUANT_2
+	{  8, 2 }, // QUANT_3
+	{  2, 0 }, // QUANT_4
+	{  7, 1 }, // QUANT_5
+	{ 13, 2 }, // QUANT_6
+	{  3, 0 }, // QUANT_8
+	{ 10, 1 }, // QUANT_10
+	{ 18, 2 }, // QUANT_12
+	{  4, 0 }, // QUANT_16
+	{ 13, 1 }, // QUANT_20
+	{ 23, 2 }, // QUANT_24
+	{  5, 0 }, // QUANT_32
+	{ 16, 1 }, // QUANT_40
+	{ 28, 2 }, // QUANT_48
+	{  6, 0 }, // QUANT_64
+	{ 19, 1 }, // QUANT_80
+	{ 33, 2 }, // QUANT_96
+	{  7, 0 }, // QUANT_128
+	{ 22, 1 }, // QUANT_160
+	{ 38, 2 }, // QUANT_192
+	{  8, 0 }  // QUANT_256
+}};
+
+/* See header for documentation. */
+unsigned int get_ise_sequence_bitcount(
+	unsigned int character_count,
+	quant_method quant_level
+) {
+	// The quant level may come from invalid input, so range check it before indexing
+	size_t level_index = static_cast<size_t>(quant_level);
+	if (level_index < ise_sizes.size())
+	{
+		const ise_size& sizing = ise_sizes[quant_level];
+
+		// The stored divisor d expands to 2d + 1; adding (divisor - 1) rounds the division up
+		unsigned int real_divisor = (sizing.divisor << 1) + 1;
+		unsigned int scaled_length = sizing.scale * character_count;
+		return (scaled_length + real_divisor - 1) / real_divisor;
+	}
+
+	// Out of range: report an arbitrary large size that no ASTC block can hold
+	return 1024;
+}
+
+/**
+ * @brief Write a bit field of up to 8 bits at an arbitrary bit offset.
+ *
+ * The whole-byte part of the offset selects the first destination byte, and the low three bits of
+ * the offset select the starting bit inside it, so a field may span two consecutive bytes. Both
+ * the first byte and the byte after it are always read and rewritten, even when the field fits
+ * entirely inside the first byte; bits outside the field keep their previous value.
+ *
+ * @param         value       The value to write; bits above @c bitcount are discarded.
+ * @param         bitcount    The number of bits to write, starting from LSB.
+ * @param         bitoffset   The bit offset to store at, relative to @c ptr.
+ * @param[in,out] ptr         The data pointer to write to.
+ */
+static inline void write_bits(
+	unsigned int value,
+	unsigned int bitcount,
+	unsigned int bitoffset,
+	uint8_t ptr[2]
+) {
+	// Split the offset into a whole-byte step and a shift within the first byte
+	uint8_t* dst = ptr + (bitoffset >> 3);
+	unsigned int shift = bitoffset & 7;
+
+	// Build the field mask and the masked payload, then move both into position
+	unsigned int field = (1 << bitcount) - 1;
+	unsigned int payload = (value & field) << shift;
+	unsigned int keep = ~(field << shift);
+
+	// Merge into the first byte, then merge any spill into the following byte
+	dst[0] = static_cast<uint8_t>((dst[0] & keep) | payload);
+	dst[1] = static_cast<uint8_t>((dst[1] & (keep >> 8)) | (payload >> 8));
+}
+
+/**
+ * @brief Read a packed bit field from two consecutive bytes.
+ *
+ * The whole-byte part of the offset selects the first source byte, and the low three bits of the
+ * offset select the starting bit inside it. The first byte and the byte after it are always read
+ * and combined with the first byte in the low bits, so the field must lie within those two bytes.
+ *
+ * @param     bitcount    The number of bits to read.
+ * @param     bitoffset   The bit offset to read from, relative to @c ptr.
+ * @param     ptr         The data pointer to read from.
+ *
+ * @return The read value.
+ */
+static inline unsigned int read_bits(
+	unsigned int bitcount,
+	unsigned int bitoffset,
+	const uint8_t* ptr
+) {
+	// Step to the byte holding the first bit; what remains is a shift within it
+	const uint8_t* src = ptr + (bitoffset >> 3);
+	unsigned int shift = bitoffset & 7;
+
+	// Combine both bytes, low byte first, then drop the leading bits and mask the field
+	unsigned int window = src[0] | (src[1] << 8);
+	return (window >> shift) & ((1 << bitcount) - 1);
+}
+
+/*
+ * See header for documentation.
+ *
+ * Implementation notes. The bit, trit, and quint counts are looked up in btq_counts using
+ * quant_level, without a range check in this function. For every input value the low "bits" bits
+ * are written as-is. When the level uses trits (or quints), the part of each value above those
+ * bits is treated as a trit (or quint); the five trits (or three quints) of a group are combined
+ * into one packed value T through integer_of_trits (or integer_of_quints), and slices of T are
+ * written directly after the low bits of each value in the group. A final partial group treats
+ * its missing values as zero for the table lookup and writes only the values that are present.
+ * Output is written with write_bits(), starting at bit_offset within output_data.
+ */
+void encode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset
+) {
+	promise(character_count > 0);
+
+	unsigned int bits = btq_counts[quant_level].bits;
+	unsigned int trits = btq_counts[quant_level].trits;
+	unsigned int quints = btq_counts[quant_level].quints;
+	unsigned int mask = (1 << bits) - 1; // Selects the low "bits" bits of each input value
+
+	// Write out trits and bits
+	if (trits)
+	{
+		unsigned int i = 0;
+		unsigned int full_trit_blocks = character_count / 5;
+
+		for (unsigned int j = 0; j < full_trit_blocks; j++)
+		{
+			unsigned int i4 = input_data[i + 4] >> bits;
+			unsigned int i3 = input_data[i + 3] >> bits;
+			unsigned int i2 = input_data[i + 2] >> bits;
+			unsigned int i1 = input_data[i + 1] >> bits;
+			unsigned int i0 = input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
+
+			// The max size of a trit bit count is 6, so we can always safely
+			// pack a single MX value with the following 1 or 2 T bits.
+			uint8_t pack;
+
+			// Element 0 + T0 + T1
+			pack = (input_data[i++] & mask) | (((T >> 0) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 1 + T2 + T3
+			pack = (input_data[i++] & mask) | (((T >> 2) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 2 + T4
+			pack = (input_data[i++] & mask) | (((T >> 4) & 0x1) << bits);
+			write_bits(pack, bits + 1, bit_offset, output_data);
+			bit_offset += bits + 1;
+
+			// Element 3 + T5 + T6
+			pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 4 + T7
+			pack = (input_data[i++] & mask) | (((T >> 7) & 0x1) << bits);
+			write_bits(pack, bits + 1, bit_offset, output_data);
+			bit_offset += bits + 1;
+		}
+
+		// Loop tail for a partial block
+		if (i != character_count)
+		{
+			// i4 cannot be present - we know the block is partial
+			// i0 must be present - we know the block isn't empty
+			unsigned int i4 =                            0;
+			unsigned int i3 = i + 3 >= character_count ? 0 : input_data[i + 3] >> bits;
+			unsigned int i2 = i + 2 >= character_count ? 0 : input_data[i + 2] >> bits;
+			unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
+			unsigned int i0 =                                input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
+
+			for (unsigned int j = 0; i < character_count; i++, j++)
+			{
+				// Truncated table as this iteration is always partial (at most four values)
+				static const uint8_t tbits[4]  { 2, 2, 1, 2 };
+				static const uint8_t tshift[4] { 0, 2, 4, 5 };
+
+				uint8_t pack = (input_data[i] & mask) |
+				               (((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
+
+				write_bits(pack, bits + tbits[j], bit_offset, output_data);
+				bit_offset += bits + tbits[j];
+			}
+		}
+	}
+	// Write out quints and bits
+	else if (quints)
+	{
+		unsigned int i = 0;
+		unsigned int full_quint_blocks = character_count / 3;
+
+		for (unsigned int j = 0; j < full_quint_blocks; j++)
+		{
+			unsigned int i2 = input_data[i + 2] >> bits;
+			unsigned int i1 = input_data[i + 1] >> bits;
+			unsigned int i0 = input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_quints[i2][i1][i0];
+
+			// The max size of a quint bit count is 5, so we can always safely
+			// pack a single M value with the following 2 or 3 T bits.
+			uint8_t pack;
+
+			// Element 0 + T0 + T1 + T2
+			pack = (input_data[i++] & mask) | (((T >> 0) & 0x7) << bits);
+			write_bits(pack, bits + 3, bit_offset, output_data);
+			bit_offset += bits + 3;
+
+			// Element 1 + T3 + T4
+			pack = (input_data[i++] & mask) | (((T >> 3) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+
+			// Element 2 + T5 + T6
+			pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
+			write_bits(pack, bits + 2, bit_offset, output_data);
+			bit_offset += bits + 2;
+		}
+
+		// Loop tail for a partial block
+		if (i != character_count)
+		{
+			// i2 cannot be present - we know the block is partial
+			// i0 must be present - we know the block isn't empty
+			unsigned int i2 =                            0;
+			unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
+			unsigned int i0 =                                input_data[i + 0] >> bits;
+
+			uint8_t T = integer_of_quints[i2][i1][i0];
+
+			for (unsigned int j = 0; i < character_count; i++, j++)
+			{
+				// Truncated table as this iteration is always partial (at most two values)
+				static const uint8_t tbits[2]  { 3, 2 };
+				static const uint8_t tshift[2] { 0, 3 };
+
+				uint8_t pack = (input_data[i] & mask) |
+				               (((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
+
+				write_bits(pack, bits + tbits[j], bit_offset, output_data);
+				bit_offset += bits + tbits[j];
+			}
+		}
+	}
+	// Write out just bits
+	else
+	{
+		for (unsigned int i = 0; i < character_count; i++)
+		{
+			write_bits(input_data[i], bits, bit_offset, output_data);
+			bit_offset += bits;
+		}
+	}
+}
+
+/*
+ * See header for documentation.
+ *
+ * Implementation notes. The bit, trit, and quint counts are looked up in btq_counts using
+ * quant_level, without a range check in this function. The first loop reads, for every value, its
+ * low "bits" bits followed by the slice of the packed trit or quint block stored after it, and
+ * ORs that slice into the per-group entry of tq_blocks. The packed blocks are then expanded
+ * through trits_of_integer or quints_of_integer and ORed into the results above the low bits.
+ * Only the first character_count results are copied to output_data.
+ */
+void decode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset
+) {
+	promise(character_count > 0);
+
+	// Note: due to how the trit/quint-block unpacking is done in this function, we may write more
+	// temporary results than the number of outputs. The maximum actual number of results is 64,
+	// but we keep 4 additional entries of padding.
+	uint8_t results[68];
+	uint8_t tq_blocks[22] { 0 }; // Trit-blocks or quint-blocks, must be zeroed
+
+	unsigned int bits = btq_counts[quant_level].bits;
+	unsigned int trits = btq_counts[quant_level].trits;
+	unsigned int quints = btq_counts[quant_level].quints;
+
+	unsigned int lcounter = 0; // Position of the current value within its group
+	unsigned int hcounter = 0; // Index of the current group in tq_blocks
+
+	// Collect bits for each element, as well as bits for any trit-blocks and quint-blocks.
+	for (unsigned int i = 0; i < character_count; i++)
+	{
+		results[i] = static_cast<uint8_t>(read_bits(bits, bit_offset, input_data));
+		bit_offset += bits;
+
+		if (trits)
+		{
+			static const uint8_t bits_to_read[5]  { 2, 2, 1, 2, 1 };
+			static const uint8_t block_shift[5]   { 0, 2, 4, 5, 7 };
+			static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 };
+			static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 };
+			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
+			bit_offset += bits_to_read[lcounter];
+			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
+			hcounter += hcounter_incr[lcounter];
+			lcounter = next_lcounter[lcounter];
+		}
+
+		if (quints)
+		{
+			static const uint8_t bits_to_read[3]  { 3, 2, 2 };
+			static const uint8_t block_shift[3]   { 0, 3, 5 };
+			static const uint8_t next_lcounter[3] { 1, 2, 0 };
+			static const uint8_t hcounter_incr[3] { 0, 0, 1 };
+			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
+			bit_offset += bits_to_read[lcounter];
+			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
+			hcounter += hcounter_incr[lcounter];
+			lcounter = next_lcounter[lcounter];
+		}
+	}
+
+	// Unpack trit-blocks or quint-blocks as needed
+	if (trits)
+	{
+		unsigned int trit_blocks = (character_count + 4) / 5; // Round up to whole groups of five
+		promise(trit_blocks > 0);
+		for (unsigned int i = 0; i < trit_blocks; i++)
+		{
+			const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
+			results[5 * i    ] |= tritptr[0] << bits;
+			results[5 * i + 1] |= tritptr[1] << bits;
+			results[5 * i + 2] |= tritptr[2] << bits;
+			results[5 * i + 3] |= tritptr[3] << bits;
+			results[5 * i + 4] |= tritptr[4] << bits;
+		}
+	}
+
+	if (quints)
+	{
+		unsigned int quint_blocks = (character_count + 2) / 3; // Round up to whole groups of three
+		promise(quint_blocks > 0);
+		for (unsigned int i = 0; i < quint_blocks; i++)
+		{
+			const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
+			results[3 * i    ] |= quintptr[0] << bits;
+			results[3 * i + 1] |= quintptr[1] << bits;
+			results[3 * i + 2] |= quintptr[2] << bits;
+		}
+	}
+
+	for (unsigned int i = 0; i < character_count; i++)
+	{
+		output_data[i] = results[i];
+	}
+}
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2025 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions and data declarations for the outer context.
+ *
+ * The outer context includes thread-pool management, which is slower to
+ * compile due to increased use of C++ stdlib. The inner context used in the
+ * majority of the codec library does not include this.
+ */
+
+#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
+#define ASTCENC_INTERNAL_ENTRY_INCLUDED
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+
+#include "astcenc_internal.h"
+
+/* ============================================================================
+  Parallel execution control
+============================================================================ */
+
+/**
+ * @brief A simple counter-based manager for parallel task execution.
+ *
+ * The task processing execution consists of:
+ *
+ *     * A single-threaded init stage.
+ *     * A multi-threaded processing stage.
+ *     * A condition variable so threads can wait for processing completion.
+ *
+ * The init stage will be executed by the first thread to arrive in the critical section, there is
+ * no main thread in the thread pool.
+ *
+ * The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
+ * basis. Threads may therefore each execute different numbers of tasks, depending on their
+ * processing complexity. The task queue and the task tickets are just counters; the caller must map
+ * these integers to an actual processing partition in a specific problem domain.
+ *
+ * The exit wait condition is needed to ensure processing has finished before a worker thread can
+ * progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
+ * because there are no new tasks to assign to it while other worker threads are still processing.
+ * Calling @c wait() will ensure that all other workers have finished before the thread can proceed.
+ *
+ * The basic usage model:
+ *
+ *     // --------- From single-threaded code ---------
+ *
+ *     // Reset the tracker state
+ *     manager->reset()
+ *
+ *     // --------- From multi-threaded code ---------
+ *
+ *     // Run the stage init; only first thread actually runs the lambda
+ *     manager->init(<lambda>)
+ *
+ *     do
+ *     {
+ *         // Request a task assignment
+ *         uint task_count;
+ *         uint base_index = manager->get_task_assignment(<granule>, task_count);
+ *
+ *         // Process any tasks we were given (task_count <= granule size)
+ *         if (task_count)
+ *         {
+ *             // Run the user task processing code for N tasks here
+ *             ...
+ *
+ *             // Flag these tasks as complete
+ *             manager->complete_task_assignment(task_count);
+ *         }
+ *     } while (task_count);
+ *
+ *     // Wait for all threads to complete tasks before progressing
+ *     manager->wait()
+ *
+ *     // Run the stage term; only first thread actually runs the lambda
+ *     manager->term(<lambda>)
+ */
+class ParallelManager
+{
+private:
+	/** @brief Lock used for critical section and condition synchronization. */
+	std::mutex m_lock;
+
+	/** @brief True if the current operation is cancelled. */
+	std::atomic<bool> m_is_cancelled;
+
+	/** @brief True if the stage init() step has been executed. */
+	bool m_init_done;
+
+	/** @brief True if the stage term() step has been executed. */
+	bool m_term_done;
+
+	/** @brief Condition variable for tracking stage processing completion. */
+	std::condition_variable m_complete;
+
+	/**
+	 * @brief Number of tasks started, but not necessarily finished.
+	 *
+	 * This is advanced by the requested granule on every assignment request, including requests
+	 * that end up being given no tasks, so it can exceed @c m_task_count.
+	 */
+	std::atomic<unsigned int> m_start_count;
+
+	/** @brief Number of tasks finished. */
+	unsigned int m_done_count;
+
+	/** @brief Number of tasks that need to be processed. */
+	unsigned int m_task_count;
+
+	/** @brief Progress callback (optional). */
+	astcenc_progress_callback m_callback;
+
+	/** @brief Lock used for callback synchronization. */
+	std::mutex m_callback_lock;
+
+	/** @brief Minimum progress, in percent, before making a callback. */
+	float m_callback_min_diff;
+
+	/** @brief Last progress callback value, in percent. */
+	float m_callback_last_value;
+
+public:
+	/** @brief Create a new ParallelManager. */
+	ParallelManager()
+	{
+		reset();
+	}
+
+	/**
+	 * @brief Reset the tracker for a new processing batch.
+	 *
+	 * This must be called from single-threaded code before starting the multi-threaded processing
+	 * operations.
+	 *
+	 * All counters and flags are cleared, and any progress callback is removed.
+	 */
+	void reset()
+	{
+		m_init_done = false;
+		m_term_done = false;
+		m_is_cancelled = false;
+		m_start_count = 0;
+		m_done_count = 0;
+		m_task_count = 0;
+		m_callback = nullptr;
+		m_callback_last_value = 0.0f;
+		m_callback_min_diff = 1.0f;
+	}
+
+	/**
+	 * @brief Flag the operation as cancelled and stop new tasks being assigned.
+	 *
+	 * This only sets the cancelled flag; no counters are modified. Once set, assignment requests
+	 * return no tasks and @c wait() no longer blocks.
+	 *
+	 * Note, all in-flight tasks in a worker will still complete normally.
+	 */
+	void cancel()
+	{
+		m_is_cancelled = true;
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage init step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * initialization. Other threads will block and wait for it to complete.
+	 *
+	 * @param init_func   Callable which executes the stage initialization. It must return the
+	 *                    total number of tasks in the stage.
+	 */
+	void init(std::function<unsigned int(void)> init_func)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_init_done)
+		{
+			m_task_count = init_func();
+			m_init_done = true;
+		}
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage init step, with a known task count and progress callback.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * initialization. Other threads will block and wait for it to complete.
+	 *
+	 * Unlike the callable overload, this also installs the progress callback and computes the
+	 * minimum progress step between callbacks from the task count.
+	 *
+	 * @param task_count   Total number of tasks needing processing.
+	 * @param callback     Function pointer for progress status callbacks.
+	 */
+	void init(unsigned int task_count, astcenc_progress_callback callback)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_init_done)
+		{
+			m_callback = callback;
+			m_task_count = task_count;
+			m_init_done = true;
+
+			// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
+			float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
+			m_callback_min_diff = astc::max(min_diff, 1.0f);
+		}
+	}
+
+	/**
+	 * @brief Request a task assignment.
+	 *
+	 * Assign up to @c granule tasks to the caller for processing.
+	 *
+	 * The start counter is advanced before the cancellation and range checks are made, so it is
+	 * advanced even when no tasks are assigned. This function does not take the lock.
+	 *
+	 * @param      granule   Maximum number of tasks that can be assigned.
+	 * @param[out] count     Actual number of tasks assigned, or zero if no tasks were assigned.
+	 *
+	 * @return Task index of the first assigned task; assigned tasks increment from this. Zero is
+	 *         returned when no tasks were assigned, so @c count must be checked first.
+	 */
+	unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
+	{
+		unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
+		if (m_is_cancelled || base >= m_task_count)
+		{
+			count = 0;
+			return 0;
+		}
+
+		count = astc::min(m_task_count - base, granule); // Final assignment may be a partial granule
+		return base;
+	}
+
+	/**
+	 * @brief Complete a task assignment.
+	 *
+	 * Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this
+	 * completes the processing of the stage.
+	 *
+	 * If a progress callback is installed it is called with 100 when the last task completes,
+	 * while both locks are held, and otherwise whenever progress has advanced by more than the
+	 * minimum step since the last reported value.
+	 *
+	 * NOTE(review): m_callback_last_value is read here while holding only m_lock, but the periodic
+	 * report below writes it while holding only m_callback_lock, so the lockless pre-test can use
+	 * a value that another thread is updating. The retest under m_callback_lock is what decides
+	 * whether a report is made. Confirm this is intentional.
+	 *
+	 * @param count   The number of completed tasks.
+	 */
+	void complete_task_assignment(unsigned int count)
+	{
+		// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
+		// update here and the wait() for other threads
+		unsigned int local_count;
+		float local_last_value;
+		{
+			std::unique_lock<std::mutex> lck(m_lock);
+			m_done_count += count;
+			local_count = m_done_count;
+			local_last_value = m_callback_last_value;
+
+			// Ensure the progress bar hits 100%
+			if (m_callback && m_done_count == m_task_count)
+			{
+				std::unique_lock<std::mutex> cblck(m_callback_lock);
+				m_callback(100.0f);
+				m_callback_last_value = 100.0f;
+			}
+
+			// Notify if nothing left to do
+			if (m_is_cancelled || m_done_count == m_task_count)
+			{
+				lck.unlock();
+				m_complete.notify_all();
+			}
+		}
+
+		// Process progress callback if we have one
+		if (m_callback)
+		{
+			// Initial lockless test - have we progressed enough to emit?
+			float num = static_cast<float>(local_count);
+			float den = static_cast<float>(m_task_count);
+			float this_value =  (num / den) * 100.0f;
+			bool report_test = (this_value - local_last_value) > m_callback_min_diff;
+
+			// Recheck under lock, because another thread might report first
+			if (report_test)
+			{
+				std::unique_lock<std::mutex> cblck(m_callback_lock);
+				bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
+				if (report_retest)
+				{
+					m_callback(this_value);
+					m_callback_last_value = this_value;
+				}
+			}
+		}
+	}
+
+	/**
+	 * @brief Wait for stage processing to complete.
+	 *
+	 * Blocks until the operation has been cancelled or the number of completed tasks equals the
+	 * number of tasks in the stage.
+	 */
+	void wait()
+	{
+		std::unique_lock<std::mutex> lck(m_lock);
+		m_complete.wait(lck, [this]{ return m_is_cancelled || m_done_count == m_task_count; });
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage term step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * work pool termination. Caller must have called @c wait() prior to calling this function to
+	 * ensure that processing is complete.
+	 *
+	 * The callable runs while the manager lock is held; later callers do nothing.
+	 *
+	 * @param term_func   Callable which executes the stage termination.
+	 */
+	void term(std::function<void(void)> term_func)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_term_done)
+		{
+			term_func();
+			m_term_done = true;
+		}
+	}
+};
+
+/**
+ * @brief The astcenc compression context.
+ *
+ * This bundles the internal context state with one ParallelManager per processing stage. The two
+ * managers used for compression are only present when ASTCENC_DECOMPRESS_ONLY is not defined.
+ */
+struct astcenc_context
+{
+	/** @brief The context internal state. */
+	astcenc_contexti context;
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	/** @brief The parallel manager for averages computation. */
+	ParallelManager manage_avg;
+
+	/** @brief The parallel manager for compression. */
+	ParallelManager manage_compress;
+#endif
+
+	/** @brief The parallel manager for decompression. */
+	ParallelManager manage_decompress;
+};
+
+#endif
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#include "astcenc_mathlib.h"
+
+/**
+ * @brief 64-bit rotate left; returns @c val rotated by @c count modulo 64.
+ *
+ * @param val   The value to rotate.
+ * @param count The rotation, in bits; masked so that 0 (or 64) never shifts by the full width.
+ */
+static inline uint64_t rotl(uint64_t val, int count)
+{
+	return (val << (count & 63)) | (val >> ((64 - count) & 63));
+}
+
+/* Load the fixed two-word seed into the state; see header for documentation. */
+void astc::rand_init(uint64_t state[2])
+{
+	state[1] = 0xf1b318cc06af5d71ULL;  // Second word of the fixed seed
+	state[0] = 0xfaf9e171cea1ec6bULL;  // First word of the fixed seed
+}
+
+/* Produce the next value and advance the state; see header for documentation. */
+uint64_t astc::rand(uint64_t state[2])
+{
+	const uint64_t lo = state[0];
+	const uint64_t hi = state[1];
+	const uint64_t mix = lo ^ hi;
+	// The output is the sum of the two words as they were before this update
+	state[0] = rotl(lo, 24) ^ mix ^ (mix << 16);
+	state[1] = rotl(mix, 37);
+	return lo + hi;
+}
@@ -0,0 +1,505 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2025 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/*
+ * This module implements a variety of mathematical data types and library
+ * functions used by the codec.
+ */
+
+#ifndef ASTC_MATHLIB_H_INCLUDED
+#define ASTC_MATHLIB_H_INCLUDED
+
+#include <cassert>
+#include <cstdint>
+#include <cmath>
+
+#ifndef ASTCENC_POPCNT  // Only auto-detect when no value has been defined already
+  #if defined(__POPCNT__)
+    #define ASTCENC_POPCNT 1
+  #else
+    #define ASTCENC_POPCNT 0
+  #endif
+#endif
+
+#ifndef ASTCENC_F16C  // Only auto-detect when no value has been defined already
+  #if defined(__F16C__)
+    #define ASTCENC_F16C 1
+  #else
+    #define ASTCENC_F16C 0
+  #endif
+#endif
+
+#ifndef ASTCENC_SSE  // Value encodes the detected level: 42, 41, 20, or 0 for none
+  #if defined(__SSE4_2__)
+    #define ASTCENC_SSE 42
+  #elif defined(__SSE4_1__)
+    #define ASTCENC_SSE 41
+  #elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
+    #define ASTCENC_SSE 20
+  #else
+    #define ASTCENC_SSE 0
+  #endif
+#endif
+
+#ifndef ASTCENC_AVX  // 2 for AVX2, 1 for AVX, 0 for none
+  #if defined(__AVX2__)
+    #define ASTCENC_AVX 2
+    #define ASTCENC_X86_GATHERS 1
+  #elif defined(__AVX__)
+    #define ASTCENC_AVX 1
+    #define ASTCENC_X86_GATHERS 1
+  #else
+    #define ASTCENC_AVX 0  // Note: ASTCENC_X86_GATHERS is left undefined on this path
+  #endif
+#endif
+
+#ifndef ASTCENC_NEON  // 1 on the 64-bit Arm targets tested below, else 0
+  #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+    #define ASTCENC_NEON 1
+  #else
+    #define ASTCENC_NEON 0
+  #endif
+#endif
+
+#ifndef ASTCENC_SVE  // 8 when the compiler fixes SVE at 256 bits, 4 for any other SVE, 0 for none
+  #if defined(__ARM_FEATURE_SVE)
+    #if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
+      #define ASTCENC_SVE 8
+    // Auto-detected SVE can only assume vector width of 4 is available, but
+    // must also allow for hardware being longer and so all use of intrinsics
+    // must explicitly use predicate masks to limit to 4-wide.
+    #else
+      #define ASTCENC_SVE 4
+    #endif
+    #else
+    #define ASTCENC_SVE 0
+  #endif
+#endif
+
+// Force vector-sized SIMD alignment: 32 for AVX or 8-wide SVE, 16 for SSE, NEON or 4-wide SVE
+#if ASTCENC_AVX || ASTCENC_SVE == 8
+  #define ASTCENC_VECALIGN 32
+#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
+  #define ASTCENC_VECALIGN 16
+// Use default alignment for non-SIMD builds (0 means "emit no alignas", see below)
+#else
+  #define ASTCENC_VECALIGN 0
+#endif
+
+// C++11 states that alignas(0) should be ignored but GCC doesn't do
+// this on some versions, so work around it and avoid emitting alignas(0)
+#if ASTCENC_VECALIGN > 0
+	#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
+#else
+	#define ASTCENC_ALIGNAS
+#endif
+
+#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0  // Any x86 feature selected above
+	#include <immintrin.h>
+#endif
+
+/* ============================================================================
+  Fast math library; note that many of the higher-order functions in this set
+  use approximations which are less accurate, but faster, than <cmath> standard
+  library equivalents.
+
+  Note: Many of these are not necessarily faster than simple C versions when
+  used on a single scalar value, but are included for testing purposes as most
+  have an option based on SSE intrinsics and therefore provide an obvious route
+  to future vectorization.
+============================================================================ */
+
+// Union for manipulation of float bit patterns; all three members share the same storage
+typedef union
+{
+	uint32_t u;  // The storage viewed as an unsigned 32-bit integer
+	int32_t s;   // The storage viewed as a signed 32-bit integer
+	float f;     // The storage viewed as a float
+} if32;
+
+// These are namespaced to avoid colliding with C standard library functions.
+namespace astc
+{
+
+static const float PI          = 3.14159265358979323846f;  // Pi, stored as a float
+static const float PI_OVER_TWO = 1.57079632679489661923f;  // Pi / 2, stored as a float
+
+/**
+ * @brief Absolute value of a single-precision float.
+ *
+ * @param v   The value to make absolute.
+ * @return The magnitude of @c v, as computed by @c std::fabs.
+ */
+static inline float fabs(float v)
+{
+	const float magnitude = std::fabs(v);
+	return magnitude;
+}
+
+/**
+ * @brief Test if a float value is a NaN.
+ *
+ * @param v   The value to test.
+ *
+ * @return @c true if @c v is a NaN (it does not compare equal to itself), @c false otherwise.
+ */
+static inline bool isnan(float v)
+{
+	return !(v == v);
+}
+
+/**
+ * @brief Return the minimum of two values.
+ *
+ * The result is @c p only when @c p < @c q holds, so a NaN in either float operand yields @c q.
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @return The smallest value.
+ */
+template<typename T>
+static inline T min(T p, T q)
+{
+	if (p < q) return p;
+	return q;
+}
+
+/**
+ * @brief Return the minimum of three values.
+ *
+ * Built from the two-value form: the result is @c min(min(p, q), r).
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @param r   The third value to compare.
+ * @return The smallest value.
+ */
+template<typename T>
+static inline T min(T p, T q, T r)
+{
+	const T first_two = min(p, q);
+	return min(first_two, r);
+}
+
+/**
+ * @brief Return the minimum of four values, computed as @c min(min(p, q), min(r, s)).
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @param r   The third value to compare.
+ * @param s   The fourth value to compare.
+ *
+ * @return The smallest value.
+ */
+template<typename T>
+static inline T min(T p, T q, T r, T s)
+{
+	const T first_pair = min(p, q);
+	const T second_pair = min(r, s);
+	return min(first_pair, second_pair);
+}
+
+/**
+ * @brief Return the maximum of two values.
+ *
+ * The result is @c p only when @c p > @c q holds, so a NaN in either float operand yields @c q.
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @return The largest value.
+ */
+template<typename T>
+static inline T max(T p, T q)
+{
+	if (p > q) return p;
+	return q;
+}
+
+/**
+ * @brief Return the maximum of three values.
+ *
+ * Built from the two-value form: the result is @c max(max(p, q), r).
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @param r   The third value to compare.
+ * @return The largest value.
+ */
+template<typename T>
+static inline T max(T p, T q, T r)
+{
+	const T first_two = max(p, q);
+	return max(first_two, r);
+}
+
+/**
+ * @brief Return the maximum of four values, computed as @c max(max(p, q), max(r, s)).
+ *
+ * @param p   The first value to compare.
+ * @param q   The second value to compare.
+ * @param r   The third value to compare.
+ * @param s   The fourth value to compare.
+ *
+ * @return The largest value.
+ */
+template<typename T>
+static inline T max(T p, T q, T r, T s)
+{
+	const T first_pair = max(p, q);
+	const T second_pair = max(r, s);
+	return max(first_pair, second_pair);
+}
+
+/**
+ * @brief Clamp a value between @c mn and @c mx.
+ *
+ * For floats, a NaN input is turned into @c mn.
+ *
+ * @param v      The value to clamp.
+ * @param mn     The min value (inclusive).
+ * @param mx     The max value (inclusive).
+ *
+ * @return The clamped value.
+ */
+template<typename T>
+inline T clamp(T v, T mn, T mx)
+{
+	// Keep this test order: any comparison against a NaN is false, so a NaN
+	// input fails both tests and ends up as the lower bound.
+	return (v > mx) ? mx
+	     : (v > mn) ? v
+	     : mn;
+}
+
+/**
+ * @brief Clamp a float value to the range [0.0f, 1.0f].
+ *
+ * NaNs are turned into 0.0f.
+ * @param v   The value to clamp.
+ * @return The clamped value.
+ */
+static inline float clamp1f(float v)
+{
+	const float lower = 0.0f;
+	const float upper = 1.0f;
+	return astc::clamp(v, lower, upper);
+}
+
+/**
+ * @brief Clamp a float value to the range [0.0f, 255.0f].
+ *
+ * NaNs are turned into 0.0f.
+ * @param v   The value to clamp.
+ * @return The clamped value.
+ */
+static inline float clamp255f(float v)
+{
+	const float lower = 0.0f;
+	const float upper = 255.0f;
+	return astc::clamp(v, lower, upper);
+}
+
+/**
+ * @brief SP float round-down, using @c std::floor.
+ *
+ * @param v   The value to round.
+ * @return The largest whole number that is not greater than @c v, as a float.
+ */
+static inline float flt_rd(float v)
+{
+	const float floored = std::floor(v);
+	return floored;
+}
+
+/**
+ * @brief SP float round-to-nearest and convert to integer.
+ *
+ * Adds 0.5f and converts; the conversion truncates towards zero, so the result
+ * is the nearest integer only for non-negative inputs.
+ * @param v   The value to round.
+ * @return The rounded value.
+ */
+static inline int flt2int_rtn(float v)
+{
+	return static_cast<int>(0.5f + v);
+}
+
+/**
+ * @brief SP float to integer conversion that discards the fractional part.
+ *        Truncation is towards zero, so this rounds down only for non-negative inputs.
+ * @param v   The value to round.
+ * @return The converted value.
+ */
+static inline int flt2int_rd(float v)
+{
+	const int truncated = static_cast<int>(v);
+	return truncated;
+}
+
+/**
+ * @brief Reinterpret the bits of an SP float as an integer.
+ *
+ * @param v   The value to bitcast.
+ *
+ * @return The integer that shares its bit pattern with @c v.
+ */
+static inline int float_as_int(float v)
+{
+	union { float as_float; int as_int; } pun;
+	pun.as_float = v;
+	return pun.as_int;
+}
+
+/**
+ * @brief Reinterpret the bits of an integer as an SP float.
+ *
+ * @param v   The value to bitcast.
+ *
+ * @return The float that shares its bit pattern with @c v.
+ */
+static inline float int_as_float(int v)
+{
+	union { float as_float; int as_int; } pun;
+	pun.as_int = v;
+	return pun.as_float;
+}
+
+/**
+ * @brief Reciprocal square root, 1.0 / sqrt(val), computed with @c std::sqrt and a divide.
+ *
+ * @param v   The input value.
+ * @return The reciprocal of the square root of @c v.
+ */
+static inline float rsqrt(float v)
+{
+	const float root = std::sqrt(v);
+	return 1.0f / root;
+}
+
+/**
+ * @brief Square root of an SP float; this version forwards to @c std::sqrt.
+ *
+ * @param v   The input value.
+ * @return The square root of @c v.
+ */
+static inline float sqrt(float v)
+{
+	const float root = std::sqrt(v);
+	return root;
+}
+
+/**
+ * @brief Extract mantissa and exponent of a float value by editing its bit fields directly.
+ *
+ * @param      v      The input value.
+ * @param[out] expo   The output exponent (the raw exponent field minus 126).
+ *
+ * @return The mantissa: the sign and fraction bits of @c v with the exponent of 0.5f.
+ */
+static inline float frexp(float v, int* expo)
+{
+	if32 bits;
+	bits.f = v;
+	*expo = ((bits.u >> 23) & 0xFF) - 126;        // 8-bit exponent field, rebased by 126
+	bits.u = (bits.u & 0x807fffff) | 0x3f000000;  // Keep sign + fraction, force exponent field to 126
+	return bits.f;
+}
+
+/**
+ * @brief Initialize the seed structure for a random number generator.
+ *
+ * Important note: For the purposes of ASTC we want sets of random numbers to
+ * use in the codec, but we want the same seed value across instances and threads
+ * to ensure that image output is stable across compressor runs and across
+ * platforms. Every PRNG created by this call will therefore return the same
+ * sequence of values.
+ *
+ * @param state The state structure to initialize.
+ */
+void rand_init(uint64_t state[2]);
+
+/**
+ * @brief Return the next random number from the generator.
+ *
+ * This RNG is an implementation of the "xoroshiro128+ 1.0" PRNG, based on the
+ * public-domain implementation given by David Blackman & Sebastiano Vigna at
+ * http://vigna.di.unimi.it/xorshift/xoroshiro128plus.c
+ * @param state The state structure to use/update.
+ * @return The next 64-bit value in the sequence.
+ */
+uint64_t rand(uint64_t state[2]);
+
+}
+
+/* ============================================================================
+  Softfloat library with fp32 and fp16 conversion functionality.
+============================================================================ */
+#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)  // Only declared when neither F16C nor NEON is enabled
+	/* Conversions between a native float and an FP16 bit pattern held in a uint16_t */
+	uint16_t float_to_sf16(float val);
+	float sf16_to_float(uint16_t val);
+#endif
+
+/*********************************
+  Vector library
+*********************************/
+#include "astcenc_vecmathlib.h"
+
+/*********************************
+  Declaration of line types
+*********************************/
+// Parametric line, 2D: the line is given by line = a + b * t.
+
+struct line2
+{
+	vfloat4 a;  // Constant term: the point on the line at t = 0
+	vfloat4 b;  // Term scaled by the parameter t
+};
+
+// Parametric line, 3D; same two-member layout as the 2D line type
+struct line3
+{
+	vfloat4 a;  // NOTE(review): presumably the constant term of a + b * t, as for the 2D line - confirm
+	vfloat4 b;  // NOTE(review): presumably the term scaled by t - confirm
+};
+
+struct line4  // NOTE(review): presumably the 4D parametric line, by analogy with the 2D/3D types - confirm
+{
+	vfloat4 a;  // Presumably the constant term of a + b * t
+	vfloat4 b;  // Presumably the term scaled by t
+};
+
+
+struct processed_line2  // NOTE(review): presumably values precomputed from a 2D line - confirm where it is filled in
+{
+	vfloat4 amod;  // Meaning not visible in this file section; assigned elsewhere
+	vfloat4 bs;    // Meaning not visible in this file section; assigned elsewhere
+};
+
+struct processed_line3  // NOTE(review): presumably values precomputed from a 3D line - confirm where it is filled in
+{
+	vfloat4 amod;  // Meaning not visible in this file section; assigned elsewhere
+	vfloat4 bs;    // Meaning not visible in this file section; assigned elsewhere
+};
+
+struct processed_line4  // NOTE(review): presumably values precomputed from a 4D line - confirm where it is filled in
+{
+	vfloat4 amod;  // Meaning not visible in this file section; assigned elsewhere
+	vfloat4 bs;    // Meaning not visible in this file section; assigned elsewhere
+};
+
+#endif
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Soft-float library for IEEE-754.
+ */
+#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
+
+#include "astcenc_mathlib.h"
+
+/*	Sized soft-float types. These are mapped to the sized integer
+    types of C99, instead of C's floating-point types; this is because
+    the library needs to maintain exact, bit-level control on all
+    operations on these data types. */
+typedef uint16_t sf16;  // An FP16 value held as its 16-bit pattern
+typedef uint32_t sf32;  // An FP32 value held as its 32-bit pattern
+
+/******************************************
+  helper functions and their lookup tables
+ ******************************************/
+/* Count leading zeros functions. Only used when the input is nonzero. */
+
+#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))  // clz32 uses x86 inline asm: no table
+#elif defined(__arm__) && defined(__ARMCC_VERSION)              // clz32 uses the armcc builtin: no table
+#elif defined(__arm__) && defined(__GNUC__)                     // clz32 uses Arm inline asm: no table
+#else
+	/* Table used for the slow default version: entry b is the leading-zero count of the 8-bit value b. */
+	static const uint8_t clz_table[256] =
+	{
+		8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+		3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	};
+#endif
+
+/* 32-bit count-leading-zeros function: uses an assembly instruction or compiler builtin where one is
+   selected below, otherwise the byte table. The x86 path ORs in 1, so it returns 31 for a zero input. */
+static uint32_t clz32(uint32_t inp)
+{
+	#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
+		uint32_t bsr;
+		__asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));
+		return 31 - bsr;			/* bsr holds the index of the highest set bit */
+	#else
+		#if defined(__arm__) && defined(__ARMCC_VERSION)
+			return __clz(inp);			/* armcc builtin */
+		#else
+			#if defined(__arm__) && defined(__GNUC__)
+				uint32_t lz;
+				__asm__("clz %0, %1": "=r"(lz):"r"(inp));
+				return lz;
+			#else
+				/* slow default version: narrow to the top non-zero byte, then use the table */
+				uint32_t summa = 24;
+				if (inp >= UINT32_C(0x10000))
+				{
+					inp >>= 16;
+					summa -= 16;
+				}
+				if (inp >= UINT32_C(0x100))
+				{
+					inp >>= 8;
+					summa -= 8;
+				}
+				return summa + clz_table[inp];	/* inp is below 0x100 at this point */
+			#endif
+		#endif
+	#endif
+}
+
+/* The five rounding modes that IEEE-754r defines. sf32_to_sf16 adds the numeric value to a table entry to pick its case. */
+typedef enum
+{
+	SF_UP = 0,				/* round towards positive infinity */
+	SF_DOWN = 1,			/* round towards negative infinity */
+	SF_TOZERO = 2,			/* round towards zero */
+	SF_NEARESTEVEN = 3,		/* round toward nearest value; if mid-between, round to even value */
+	SF_NEARESTAWAY = 4		/* round toward nearest value; if mid-between, round away from zero */
+} roundmode;
+
+/* Shift 'inp' right by 'shamt' bits, rounding to nearest with ties going to the even result. */
+static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
+{
+	uint32_t vl1 = UINT32_C(1) << shamt;	/* weight of the lowest bit that survives the shift */
+	uint32_t inp2 = inp + (vl1 >> 1);	/* added 0.5 ULP */
+	uint32_t msk = (inp | UINT32_C(1)) & vl1;	/* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */
+	msk--;						/* negative if even, nonnegative if odd. */
+	inp2 -= (msk >> 31);		/* subtract epsilon before shift if even. */
+	inp2 >>= shamt;
+	return inp2;
+}
+
+/* Shift right by shamt, rounding to nearest with exact ties rounded up. */
+static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
+{
+	// Half the weight of the lowest bit kept after the shift (0 when shamt is 0)
+	const uint32_t half_ulp = (UINT32_C(1) << shamt) >> 1;
+	return (inp + half_ulp) >> shamt;
+}
+
+/* Shift right by shamt, rounding up whenever any discarded bit is set. */
+static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
+{
+	// Adding (2^shamt - 1) carries into the kept bits unless the discarded bits are all zero
+	const uint32_t all_low_bits = (UINT32_C(1) << shamt) - 1;
+	const uint32_t biased = inp + all_low_bits;
+	return biased >> shamt;
+}
+
+/* Convert from FP16 to FP32. Takes an FP16 bit pattern and returns the FP32 bit pattern; NaNs come back quietened. */
+static sf32 sf16_to_sf32(sf16 inp)
+{
+	uint32_t inpx = inp;
+
+	/*
+		This table contains, for every FP16 sign/exponent value combination,
+		the difference between the input FP16 value and the value obtained
+		by shifting the correct FP32 result right by 13 bits.
+		This table allows us to handle every case except denormals and NaN
+		with just 1 table lookup, 2 shifts and 1 add.
+	*/
+
+	#define WITH_MSB(a) (UINT32_C(a) | (1u << 31))	/* Tags entries for exponent field 0 or 31. NOTE(review): never #undef'd */
+	static const uint32_t tbl[64] =
+	{
+		WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
+		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
+		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,          0x1C000,
+		         0x1C000,  0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
+		WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
+		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
+		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,          0x54000,
+		         0x54000,  0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
+	};
+
+	uint32_t res = tbl[inpx >> 10];	/* index is the 6-bit sign + exponent field */
+	res += inpx;
+
+	/* Normal cases: MSB of 'res' not set. */
+	if ((res & WITH_MSB(0)) == 0)
+	{
+		return res << 13;
+	}
+
+	/* Infinity and Zero: 10 LSB of 'res' not set. The tag bit is shifted out by the << 13. */
+	if ((res & 0x3FF) == 0)
+	{
+		return res << 13;
+	}
+
+	/* NaN: the exponent field of 'inp' is non-zero. */
+	if ((inpx & 0x7C00) != 0)
+	{
+		/* All NaNs are quietened. */
+		return (res << 13) | 0x400000;
+	}
+
+	/* Denormal cases: renormalize the mantissa and rebuild the exponent from the shift count. */
+	uint32_t sign = (inpx & 0x8000) << 16;
+	uint32_t mskval = inpx & 0x7FFF;
+	uint32_t leadingzeroes = clz32(mskval);
+	mskval <<= leadingzeroes;	/* leading one now sits at bit 31 */
+	return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;	/* the leading one lands on bit 23 and so adds 1 to the exponent field */
+}
+
+/* Conversion routine that converts from FP32 to FP16, both as bit patterns. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
+static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
+{
+	/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table. Entries are multiples of 5 so that the rounding mode (0-4) can be added on. */
+	static const uint8_t tab[512] {
+		0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+		10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+		20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+		30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+		40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
+
+		5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+		25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
+		35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
+	};
+
+	/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
+	   size. It is indexed by the same case index that drives the switch. */
+	static const uint32_t tabx[60] {
+		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
+		UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
+		UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
+		UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
+		UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
+		UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
+	};
+
+	uint32_t p;
+	uint32_t idx = rmode + tab[inp >> 23];	/* inp >> 23 is the 9-bit sign + exponent field */
+	uint32_t vlx = tabx[idx];
+	switch (idx)
+	{
+		/*
+			Number of either sign which may be Infinity or NaN.
+			We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
+			(If we don't do this quieting, then a NaN that is distinguished only by having
+			its low-order bits set, would be turned into an INF.) */
+	case 50:
+	case 51:
+	case 52:
+	case 53:
+	case 54:
+	case 55:
+	case 56:
+	case 57:
+	case 58:
+	case 59:
+		/*
+			the input value is 0x7F800000 or 0xFF800000 if it is INF.
+			By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
+			For NaNs, however, this operation will keep bit 23 with the value 1.
+			We can then extract bit 23, and logical-OR bit 9 of the result with this
+			bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
+			of the mantissa is set.)
+		*/
+		p = (inp - 1) & UINT32_C(0x800000);	/* zero if INF, nonzero if NaN. */
+		return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
+		/*
+			positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
+			If it is, then return 0, else return 1 (the smallest representable nonzero number)
+		*/
+	case 0:
+		/*
+			-inp will set the MSB if the input number is nonzero.
+			Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
+		*/
+		return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
+
+		/*
+			negative, exponent = 0, round-mode == DOWN, need to check whether number is
+			actually 0. If it is, return 0x8000 ( float -0.0 )
+			Else return the smallest negative number ( 0x8001 ) */
+	case 6:
+		/*
+			in this case 'vlx' is 0x80000000. By subtracting the input value from it,
+			we obtain a value that is 0 if the input value is in fact zero and has
+			the MSB set if it isn't. We then right-shift the value by 31 places to
+			get a value that is 0 if the input is -0.0 and 1 otherwise.
+		*/
+		return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
+
+		/*
+			for all other cases involving underflow/overflow, we don't need to
+			do actual tests; we just return 'vlx'.
+		*/
+	case 1:
+	case 2:
+	case 3:
+	case 4:
+	case 5:
+	case 7:
+	case 8:
+	case 9:
+	case 10:
+	case 11:
+	case 12:
+	case 13:
+	case 14:
+	case 15:
+	case 16:
+	case 17:
+	case 18:
+	case 19:
+	case 40:
+	case 41:
+	case 42:
+	case 43:
+	case 44:
+	case 45:
+	case 46:
+	case 47:
+	case 48:
+	case 49:
+		return static_cast<sf16>(vlx);
+
+		/*
+			for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
+			FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
+			baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
+			from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
+			for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
+			except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
+
+		/* normal number, all rounding modes except round-to-nearest-even: */
+	case 30:
+	case 31:
+	case 32:
+	case 34:
+	case 35:
+	case 36:
+	case 37:
+	case 39:
+		return static_cast<sf16>((inp + vlx) >> 13);
+
+		/* normal number, round-to-nearest-even. */
+	case 33:
+	case 38:
+		p = inp + vlx;
+		p += (inp >> 13) & 1;	/* the extra 1 for odd inputs described above */
+		return static_cast<sf16>(p >> 13);
+
+		/*
+			the various denormal cases. These are not expected to be common, so their performance is a bit
+			less important. For each of these cases, we need to extract an exponent and a mantissa
+			(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
+			depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
+			sign of the resulting denormal number.
+		*/
+	case 21:
+	case 22:
+	case 25:
+	case 27:
+		/* denormal, round towards zero. */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
+	case 20:
+	case 26:
+		/* denormal, round away from zero. */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
+	case 24:
+	case 29:
+		/* denormal, round to nearest-away */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
+	case 23:
+	case 28:
+		/* denormal, round to nearest-even. */
+		p = 126 - ((inp >> 23) & 0xFF);
+		return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
+	}
+
+	return 0;	/* only reached if idx is outside 0..59, i.e. rmode is not one of the five enumerators */
+}
+
+/* Convert an FP16 bit pattern to a native float via the FP32 soft-float conversion. */
+float sf16_to_float(uint16_t p)
+{
+	if32 bits;
+	bits.u = sf16_to_sf32(p);
+	return bits.f;
+}
+
+/* Convert a native float to an FP16 bit pattern, using round-to-nearest-even. */
+uint16_t float_to_sf16(float p)
+{
+	if32 bits;
+	bits.f = p;
+	return sf32_to_sf16(bits.u, SF_NEARESTEVEN);
+}
+
+#endif
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for generating partition tables on demand.
+ */
+
+#include "astcenc_internal.h"
+
+/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */
+#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64)
+
+/**
+ * @brief Generate a canonical representation of a partition pattern.
+ *
+ * The pattern stores two bits per texel holding that texel's partition index after the partitions
+ * have been renumbered in order of first use. Assignments that differ only in the partition labels
+ * chosen by the hash therefore produce the same canonical value.
+ *
+ * @param      texel_count          The number of texels in the block.
+ * @param      partition_of_texel   The partition assignments, in hash order.
+ * @param[out] bit_pattern          The output bit pattern representation.
+ */
+static void generate_canonical_partitioning(
+	unsigned int texel_count,
+	const uint8_t* partition_of_texel,
+	uint64_t bit_pattern[BIT_PATTERN_WORDS]
+) {
+	// Start from an empty pattern
+	for (unsigned int word = 0; word < BIT_PATTERN_WORDS; word++)
+	{
+		bit_pattern[word] = 0;
+	}
+
+	// Canonical index assigned to each raw partition, or -1 if that partition has not been seen yet.
+	// Indices are handed out in order of first use, so the lowest texel index in canonical
+	// partition N is always smaller than the lowest texel index in canonical partition N + 1.
+	int canonical_of[BLOCK_MAX_PARTITIONS];
+	for (unsigned int part = 0; part < BLOCK_MAX_PARTITIONS; part++)
+	{
+		canonical_of[part] = -1;
+	}
+
+	int canonical_count = 0;
+	for (unsigned int texel = 0; texel < texel_count; texel++)
+	{
+		int raw = partition_of_texel[texel];
+		if (canonical_of[raw] < 0)
+		{
+			canonical_of[raw] = canonical_count++;
+		}
+
+		// Each 64-bit word holds 32 texels at two bits per texel
+		uint64_t bits = canonical_of[raw];
+		bit_pattern[texel / 32] |= bits << (2 * (texel % 32));
+	}
+}
+
+/**
+ * @brief Compare two canonical patterns to see if they are the same.
+ *
+ * All @c BIT_PATTERN_WORDS words are compared, stopping at the first word that differs. The word
+ * count is a compile-time constant, so the whole pattern stays covered if the maximum block size,
+ * and therefore the pattern length, is ever changed.
+ *
+ * Both patterns must have every word initialized, as done by generate_canonical_partitioning().
+ *
+ * @param part1   The first canonical bit pattern to check.
+ * @param part2   The second canonical bit pattern to check.
+ *
+ * @return @c true if the patterns are the same, @c false otherwise.
+ */
+static bool compare_canonical_partitionings(
+	const uint64_t part1[BIT_PATTERN_WORDS],
+	const uint64_t part2[BIT_PATTERN_WORDS]
+) {
+	// An empty pattern would make every comparison trivially succeed
+	static_assert(BIT_PATTERN_WORDS > 0, "Canonical patterns must contain at least one word");
+
+	// The trip count is a small compile-time constant, so the compiler is free to unroll this
+	// loop; exit as soon as a mismatching word is found
+	for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++)
+	{
+		if (part1[i] != part2[i])
+		{
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/**
+ * @brief Integer hash used for procedural partition assignment.
+ *
+ * @param inp   The hash seed.
+ *
+ * @return The hashed value.
+ */
+static uint32_t hash52(
+	uint32_t inp
+) {
+	uint32_t h = inp ^ (inp >> 15);
+
+	// The multiplier equals, modulo 2^32, the negation of (2^4 + 1) * (2^7 + 1) * (2^17 - 1)
+	h *= 0xEEDE0891;
+	h = h ^ (h >> 5);
+	h = h + (h << 16);
+	h = h ^ (h >> 7);
+	h = h ^ (h >> 3);
+	h = h ^ (h << 6);
+	h = h ^ (h >> 17);
+	return h;
+}
+
+/**
+ * @brief Select the partition assignment for a single texel coordinate.
+ *
+ * @param seed              The seed - the partition index from the block.
+ * @param x                 The texel X coordinate in the block.
+ * @param y                 The texel Y coordinate in the block.
+ * @param z                 The texel Z coordinate in the block.
+ * @param partition_count   The total partition count of this encoding.
+ * @param small_block       @c true if the block has fewer than 32 texels.
+ *
+ * @return The assigned partition index for this texel, in the range 0 to 3.
+ */
+static uint8_t select_partition(
+	int seed,
+	int x,
+	int y,
+	int z,
+	int partition_count,
+	bool small_block
+) {
+	// For small blocks bias the coordinates to get better distribution
+	if (small_block)
+	{
+		x <<= 1;
+		y <<= 1;
+		z <<= 1;
+	}
+
+	seed += (partition_count - 1) * 1024;  // Mix the partition count into the hash input
+
+	uint32_t rnum = hash52(seed);
+	// Slice the hash into twelve 4-bit values; seed9 to seed12 reuse bits already used by other seeds
+	uint8_t seed1 = rnum & 0xF;
+	uint8_t seed2 = (rnum >> 4) & 0xF;
+	uint8_t seed3 = (rnum >> 8) & 0xF;
+	uint8_t seed4 = (rnum >> 12) & 0xF;
+	uint8_t seed5 = (rnum >> 16) & 0xF;
+	uint8_t seed6 = (rnum >> 20) & 0xF;
+	uint8_t seed7 = (rnum >> 24) & 0xF;
+	uint8_t seed8 = (rnum >> 28) & 0xF;
+	uint8_t seed9 = (rnum >> 18) & 0xF;
+	uint8_t seed10 = (rnum >> 22) & 0xF;
+	uint8_t seed11 = (rnum >> 26) & 0xF;
+	uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;  // Top two hash bits joined with the bottom two
+
+	// Square the seeds to bias their distribution towards lower values; each is at most 15, so the square fits in a uint8_t.
+	seed1 *= seed1;
+	seed2 *= seed2;
+	seed3 *= seed3;
+	seed4 *= seed4;
+	seed5 *= seed5;
+	seed6 *= seed6;
+	seed7 *= seed7;
+	seed8 *= seed8;
+	seed9 *= seed9;
+	seed10 *= seed10;
+	seed11 *= seed11;
+	seed12 *= seed12;
+	// Choose the right-shifts that scale the squared seeds; the low seed bits tested here are unchanged by the multiple of 1024 added above
+	int sh1, sh2;
+	if (seed & 1)
+	{
+		sh1 = (seed & 2 ? 4 : 5);
+		sh2 = (partition_count == 3 ? 6 : 5);
+	}
+	else
+	{
+		sh1 = (partition_count == 3 ? 6 : 5);
+		sh2 = (seed & 2 ? 4 : 5);
+	}
+
+	int sh3 = (seed & 0x10) ? sh1 : sh2;
+
+	seed1 >>= sh1;
+	seed2 >>= sh2;
+	seed3 >>= sh1;
+	seed4 >>= sh2;
+	seed5 >>= sh1;
+	seed6 >>= sh2;
+	seed7 >>= sh1;
+	seed8 >>= sh2;
+
+	seed9 >>= sh3;
+	seed10 >>= sh3;
+	seed11 >>= sh3;
+	seed12 >>= sh3;
+	// One weighted sum of the coordinates per candidate partition, each offset by different hash bits
+	int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
+	int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
+	int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
+	int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+
+	// Apply the saw: keep only the low 6 bits so each value wraps in the range 0 to 63
+	a &= 0x3F;
+	b &= 0x3F;
+	c &= 0x3F;
+	d &= 0x3F;
+
+	// Remove some of the components if we are to output < 4 partitions.
+	if (partition_count <= 3)
+	{
+		d = 0;
+	}
+
+	if (partition_count <= 2)
+	{
+		c = 0;
+	}
+
+	if (partition_count <= 1)
+	{
+		b = 0;
+	}
+	// Pick the largest component; ties go to the lowest partition index
+	uint8_t partition;
+	if (a >= b && a >= c && a >= d)
+	{
+		partition = 0;
+	}
+	else if (b >= c && b >= d)
+	{
+		partition = 1;
+	}
+	else if (c >= d)
+	{
+		partition = 2;
+	}
+	else
+	{
+		partition = 3;
+	}
+
+	return partition;
+}
+
+/**
+ * @brief Generate a single partition info structure.
+ *
+ * @param[in,out] bsd                     The block size information; its coverage bitmaps are updated.
+ * @param         partition_count         The partition count of this partitioning.
+ * @param         partition_index         The partition index / seed of this partitioning.
+ * @param         partition_remap_index   The remapped partition index of this partitioning.
+ * @param[out]    pi                      The partition info structure to populate.
+ *
+ * @return True if this is a useful partition index, False if we can skip it.
+ */
+static bool generate_one_partition_info_entry(
+	block_size_descriptor& bsd,
+	unsigned int partition_count,
+	unsigned int partition_index,
+	unsigned int partition_remap_index,
+	partition_info& pi
+) {
+	int block_texels = bsd.texel_count;
+	bool small_block = block_texels < 32;
+
+	// Assign every texel to a partition, visiting the block in Z, Y, X order, and record the result
+	// both per texel and as a list of texels per partition
+	int texel_counts[BLOCK_MAX_PARTITIONS] { 0 };
+	int texel = 0;
+	for (unsigned int z = 0; z < bsd.zdim; z++)
+	{
+		for (unsigned int y = 0; y < bsd.ydim; y++)
+		{
+			for (unsigned int x = 0; x < bsd.xdim; x++)
+			{
+				uint8_t owner = select_partition(partition_index, x, y, z, partition_count, small_block);
+				pi.texels_of_partition[owner][texel_counts[owner]] = static_cast<uint8_t>(texel);
+				pi.partition_of_texel[texel] = owner;
+				texel_counts[owner]++;
+				texel++;
+			}
+		}
+	}
+
+	// Pad each texel list up to a SIMD multiple with copies of its last entry so we can overfetch later
+	for (unsigned int part = 0; part < partition_count; part++)
+	{
+		size_t used = texel_counts[part];
+		size_t padded = round_up_to_simd_multiple_vla(used);
+		for (size_t slot = used; slot < padded; slot++)
+		{
+			pi.texels_of_partition[part][slot] = pi.texels_of_partition[part][used - 1];
+		}
+	}
+
+	// Populate the actual procedural partition count: the index of the first empty partition
+	if (texel_counts[0] == 0)
+	{
+		pi.partition_count = 0;
+	}
+	else if (texel_counts[1] == 0)
+	{
+		pi.partition_count = 1;
+	}
+	else if (texel_counts[2] == 0)
+	{
+		pi.partition_count = 2;
+	}
+	else if (texel_counts[3] == 0)
+	{
+		pi.partition_count = 3;
+	}
+	else
+	{
+		pi.partition_count = 4;
+	}
+
+	// Valid partitionings have texels in all of the requested partitions
+	bool valid = pi.partition_count == partition_count;
+
+	// Populate the partition index and the per-partition texel counts
+	pi.partition_index = static_cast<uint16_t>(partition_index);
+	for (unsigned int part = 0; part < BLOCK_MAX_PARTITIONS; part++)
+	{
+		pi.partition_texel_count[part] = static_cast<uint8_t>(texel_counts[part]);
+	}
+
+	// Select the coverage bitmaps for 2/3/4 partitions; other counts have none
+	uint64_t* coverage { nullptr };
+	if (partition_count == 2)
+	{
+		coverage = bsd.coverage_bitmaps_2[partition_remap_index];
+	}
+	else if (partition_count == 3)
+	{
+		coverage = bsd.coverage_bitmaps_3[partition_remap_index];
+	}
+	else if (partition_count == 4)
+	{
+		coverage = bsd.coverage_bitmaps_4[partition_remap_index];
+	}
+
+	if (coverage)
+	{
+		// Populate the partition coverage bitmap
+		for (unsigned int part = 0; part < partition_count; part++)
+		{
+			coverage[part] = 0ULL;
+		}
+
+		unsigned int sample_count = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
+		for (unsigned int i = 0; i < sample_count; i++)
+		{
+			unsigned int sample = bsd.kmeans_texels[i];
+			coverage[pi.partition_of_texel[sample]] |= 1ULL << i;
+		}
+	}
+
+	return valid;
+}
+
+/**
+ * @brief Build the partition table for a single partition count.
+ *
+ * @param[out] bsd                      The block size information; counts and packed indices are updated.
+ * @param      can_omit_partitionings   True if partitionings that are not selected can be dropped.
+ * @param      partition_count_cutoff   The largest partition count to build, if we can omit partitionings.
+ * @param      partition_count          The partition count of the table to build.
+ * @param[out] ptab                     The partition table to populate.
+ * @param[out] canonical_patterns       Scratch storage for the canonical patterns of kept entries.
+ */
+static void build_partition_table_for_one_partition_count(
+	block_size_descriptor& bsd,
+	bool can_omit_partitionings,
+	unsigned int partition_count_cutoff,
+	unsigned int partition_count,
+	partition_info* ptab,
+	uint64_t* canonical_patterns
+) {
+	unsigned int next_index = 0;
+	bsd.partitioning_count_selected[partition_count - 1] = 0;
+	bsd.partitioning_count_all[partition_count - 1] = 0;
+
+	// Nothing to build for counts above the cutoff when partitionings may be omitted
+	if (can_omit_partitionings && (partition_count > partition_count_cutoff))
+	{
+		return;
+	}
+
+	// Pass 0 keeps the selected partitionings; pass 1 appends the remaining ones and is skipped
+	// entirely when partitionings can be omitted
+	unsigned int pass_count = can_omit_partitionings ? 1 : 2;
+
+	// Seeds that were already emitted by pass 0
+	uint8_t emitted[BLOCK_MAX_PARTITIONINGS] { 0 };
+	for (unsigned int pass = 0; pass < pass_count; pass++)
+	{
+		for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++)
+		{
+			// The second pass only visits seeds the first pass did not keep
+			if ((pass == 1) && emitted[i])
+			{
+				continue;
+			}
+
+			bool keep_useful = generate_one_partition_info_entry(bsd, partition_count, i, next_index, ptab[next_index]);
+			if ((pass == 0) && !keep_useful)
+			{
+				continue;
+			}
+
+			uint64_t* candidate = canonical_patterns + next_index * BIT_PATTERN_WORDS;
+			generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, candidate);
+			bool keep_canonical = true;
+			for (unsigned int j = 0; keep_canonical && (j < next_index); j++)
+			{
+				keep_canonical = !compare_canonical_partitionings(candidate, canonical_patterns + j * BIT_PATTERN_WORDS);
+			}
+
+			bool selected = keep_useful && keep_canonical;
+			if ((pass == 0) && selected)
+			{
+				bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
+				bsd.partitioning_count_selected[partition_count - 1]++;
+				bsd.partitioning_count_all[partition_count - 1]++;
+				emitted[i] = 1;
+				next_index++;
+			}
+			else if ((pass == 1) && !selected)
+			{
+				bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
+				bsd.partitioning_count_all[partition_count - 1]++;
+				next_index++;
+			}
+		}
+	}
+}
+
+/* See header for documentation. */
+void init_partition_tables(
+	block_size_descriptor& bsd,
+	bool can_omit_partitionings,
+	unsigned int partition_count_cutoff
+) {
+	// The 2, 3, and 4 partition tables are stored back to back, then the single 1 partition entry
+	partition_info* single_entry = bsd.partitionings + 3 * BLOCK_MAX_PARTITIONINGS;
+	generate_one_partition_info_entry(bsd, 1, 0, 0, *single_entry);
+	bsd.partitioning_count_selected[0] = 1;
+	bsd.partitioning_count_all[0] = 1;
+
+	uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS];
+
+	partition_info* table = bsd.partitionings;
+	for (unsigned int count = 2; count <= 4; count++)
+	{
+		build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, count, table, canonical_patterns);
+		table += BLOCK_MAX_PARTITIONINGS;
+	}
+
+	delete[] canonical_patterns;
+}
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Platform-specific function implementations.
+ *
+ * This module contains functions for querying the host extended ISA support.
+ */
+
+// Include before the defines below to pick up any auto-setup based on compiler
+// built-in config, if not being set explicitly by the build system
+#include "astcenc_internal.h"
+
+#if (ASTCENC_SSE > 0)    || (ASTCENC_AVX > 0) || \
+    (ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
+
+static bool g_init { false };  // Set to true by detect_cpu_isa() once the flags below are populated
+
+/** Does this CPU support SSE 4.1? Stays false until detect_cpu_isa() has run. */
+static bool g_cpu_has_sse41 { false };
+
+/** Does this CPU support AVX2? Stays false until detect_cpu_isa() has run. */
+static bool g_cpu_has_avx2 { false };
+
+/** Does this CPU support POPCNT? Stays false until detect_cpu_isa() has run. */
+static bool g_cpu_has_popcnt { false };
+
+/** Does this CPU support F16C? Stays false until detect_cpu_isa() has run. */
+static bool g_cpu_has_f16c { false };
+
+/* ============================================================================
+   Platform code for Visual Studio
+============================================================================ */
+#if !defined(__clang__) && defined(_MSC_VER)
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#include <intrin.h>
+
+/**
+ * @brief Query CPUID for the ISA extensions of interest and cache the results in the global flags.
+ */
+static void detect_cpu_isa()
+{
+	int regs[4];
+	// CPUID leaf 0 returns the highest supported leaf number in EAX (regs[0])
+	__cpuid(regs, 0);
+	int max_leaf = regs[0];
+
+	if (max_leaf >= 1)
+	{
+		__cpuidex(regs, 1, 0);
+		// SSE41 = Bank 1, ECX, bit 19
+		g_cpu_has_sse41 = (regs[2] & (1 << 19)) != 0;
+		// POPCNT = Bank 1, ECX, bit 23
+		g_cpu_has_popcnt = (regs[2] & (1 << 23)) != 0;
+		// F16C = Bank 1, ECX, bit 29
+		g_cpu_has_f16c = (regs[2] & (1 << 29)) != 0;
+	}
+
+	if (max_leaf >= 7)
+	{
+		__cpuidex(regs, 7, 0);
+		// AVX2 = Bank 7, EBX, bit 5
+		g_cpu_has_avx2 = (regs[1] & (1 << 5)) != 0;
+	}
+
+	// Publish the feature flags before the init flag becomes visible
+	MemoryBarrier();
+	g_init = true;
+}
+
+/* ============================================================================
+   Platform code for GCC and Clang
+============================================================================ */
+#else
+#include <cpuid.h>
+
+/**
+ * @brief Query CPUID for the ISA extensions of interest and cache the results in the global flags.
+ */
+static void detect_cpu_isa()
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	if (__get_cpuid_count(1, 0, &eax, &ebx, &ecx, &edx))
+	{
+		// SSE41 = Bank 1, ECX, bit 19
+		g_cpu_has_sse41 = (ecx & (1 << 19)) != 0;
+		// POPCNT = Bank 1, ECX, bit 23
+		g_cpu_has_popcnt = (ecx & (1 << 23)) != 0;
+		// F16C = Bank 1, ECX, bit 29
+		g_cpu_has_f16c = (ecx & (1 << 29)) != 0;
+	}
+
+	g_cpu_has_avx2 = false;
+	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
+	{
+		// AVX2 = Bank 7, EBX, bit 5
+		g_cpu_has_avx2 = (ebx & (1 << 5)) != 0;
+	}
+
+	// Publish the feature flags before the init flag becomes visible
+	__sync_synchronize();
+	g_init = true;
+}
+#endif
+
+/* See header for documentation. */
+bool cpu_supports_popcnt()
+{
+	// Probe the CPU on first use; later calls return the cached flag
+	if (g_init == false)
+	{
+		detect_cpu_isa();
+	}
+	return g_cpu_has_popcnt;
+}
+
+/* See header for documentation. */
+bool cpu_supports_f16c()
+{
+	// Probe the CPU on first use; later calls return the cached flag
+	if (g_init == false)
+	{
+		detect_cpu_isa();
+	}
+	return g_cpu_has_f16c;
+}
+
+/* See header for documentation. */
+bool cpu_supports_sse41()
+{
+	// Probe the CPU on first use; later calls return the cached flag
+	if (g_init == false)
+	{
+		detect_cpu_isa();
+	}
+	return g_cpu_has_sse41;
+}
+
+/* See header for documentation. */
+bool cpu_supports_avx2()
+{
+	// Probe the CPU on first use; later calls return the cached flag
+	if (g_init == false)
+	{
+		detect_cpu_isa();
+	}
+	return g_cpu_has_avx2;
+}
+
+#endif
@@ -0,0 +1,903 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions and data tables for numeric quantization.
+ */
+
+#include "astcenc_internal.h"
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+// Not scrambled, starts from QUANT_6
+const uint8_t color_unquant_to_uquant_tables[17][512] {
+	{ // QUANT_6
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51, 102, 102, 102, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
+		153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 153, 153, 153, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+		255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_8
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,   0,   0,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,
+		 36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,
+		 36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  36,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,
+		 73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,
+		 73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73,  73, 109, 109, 109, 109, 109, 109, 109, 109, 109,
+		109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
+		109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
+		146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
+		146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
+		146, 146, 146, 146, 146, 146, 146, 146, 146, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
+		182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
+		182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
+		219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
+		219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 255, 255, 255, 255, 255,
+		255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_10
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  28,  28,  28,
+		 28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,
+		 28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  28,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,
+		 56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,
+		 56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  56,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,
+		 84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,
+		 84,  84,  84,  84,  84,  84, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
+		113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
+		142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
+		142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 171, 171, 171, 171, 171, 171,
+		171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171,
+		171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
+		199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
+		199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
+		227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
+		227, 227, 227, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_12
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  23,  23,  23,  23,  23,  23,  23,  23,
+		 23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,
+		 23,  23,  23,  23,  23,  23,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,
+		 46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  46,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,
+		 69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,
+		 69,  69,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,
+		 92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
+		116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
+		139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139,
+		139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163,
+		163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 186, 186,
+		186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
+		186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209,
+		209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 232, 232, 232, 232, 232, 232,
+		232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
+		232, 232, 232, 232, 232, 232, 232, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_16
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,
+		 17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,
+		 34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  34,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,
+		 51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  51,  68,  68,  68,  68,  68,  68,  68,  68,
+		 68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  68,  85,  85,  85,  85,  85,  85,
+		 85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85,  85, 102, 102, 102, 102,
+		102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119, 119,
+		119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
+		136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
+		136, 136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
+		153, 153, 153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
+		170, 170, 170, 170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
+		187, 187, 187, 187, 187, 187, 187, 187, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+		204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221,
+		221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238,
+		238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_20
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
+		 13,  13,  13,  13,  13,  13,  13,  13,  13,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,
+		 27,  27,  27,  27,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  40,  54,
+		 54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  67,  67,  67,  67,  67,  67,
+		 67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  67,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,
+		 80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  80,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,  94,
+		 94,  94,  94,  94,  94,  94,  94,  94,  94,  94, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
+		107, 107, 107, 107, 107, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
+		134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 148, 148, 148, 148, 148,
+		148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161,
+		161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175,
+		175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188,
+		188, 188, 188, 188, 188, 188, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201,
+		201, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 228, 228, 228, 228,
+		228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 242, 242, 242, 242, 242, 242, 242, 242, 242,
+		242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_24
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,
+		 11,  11,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  33,  33,  33,  33,  33,  33,  33,  33,
+		 33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,  44,
+		 44,  44,  44,  44,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  55,  66,  66,  66,  66,  66,  66,
+		 66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  77,  77,  77,  77,  77,  77,  77,  77,  77,  77,  77,  77,  77,  77,  77,  77,
+		 77,  77,  77,  77,  77,  77,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  88,  99,  99,  99,  99,
+		 99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110,
+		110, 110, 110, 110, 110, 110, 110, 110, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
+		134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 145, 145, 145, 145, 145, 145, 145, 145,
+		145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156,
+		156, 156, 156, 156, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 178, 178, 178, 178, 178, 178,
+		178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189,
+		189, 189, 189, 189, 189, 189, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 211, 211, 211, 211,
+		211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222,
+		222, 222, 222, 222, 222, 222, 222, 222, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 244, 244,
+		244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_32
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,  16,  16,  16,  16,  16,  16,  16,
+		 16,  16,  16,  16,  16,  16,  16,  16,  16,  24,  24,  24,  24,  24,  24,  24,  24,  24,  24,  24,  24,  24,  24,  24,  24,  24,  33,  33,  33,  33,  33,  33,
+		 33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  33,  41,  41,  41,  41,  41,  41,  41,  41,  41,  41,  41,  41,  41,  41,  41,  41,  49,  49,  49,  49,  49,
+		 49,  49,  49,  49,  49,  49,  49,  49,  49,  49,  49,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  66,  66,  66,  66,
+		 66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  66,  74,  74,  74,  74,  74,  74,  74,  74,  74,  74,  74,  74,  74,  74,  74,  74,  82,  82,  82,
+		 82,  82,  82,  82,  82,  82,  82,  82,  82,  82,  82,  82,  82,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  90,  99,  99,
+		 99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99,  99, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 115,
+		115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
+		132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140,
+		140, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156,
+		156, 156, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173,
+		173, 173, 173, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189,
+		189, 189, 189, 189, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
+		206, 206, 206, 206, 206, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222,
+		222, 222, 222, 222, 222, 222, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 239, 239, 239, 239, 239, 239, 239, 239, 239,
+		239, 239, 239, 239, 239, 239, 239, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 255, 255, 255, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_40
+		  0,   0,   0,   0,   0,   0,   0,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
+		 13,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  26,  26,  26,  26,  26,  26,  26,  26,  26,  26,  26,  26,  26,  32,  32,  32,  32,  32,
+		 32,  32,  32,  32,  32,  32,  32,  32,  39,  39,  39,  39,  39,  39,  39,  39,  39,  39,  39,  39,  39,  45,  45,  45,  45,  45,  45,  45,  45,  45,  45,  45,
+		 45,  45,  52,  52,  52,  52,  52,  52,  52,  52,  52,  52,  52,  52,  52,  58,  58,  58,  58,  58,  58,  58,  58,  58,  58,  58,  58,  58,  65,  65,  65,  65,
+		 65,  65,  65,  65,  65,  65,  65,  65,  65,  71,  71,  71,  71,  71,  71,  71,  71,  71,  71,  71,  71,  71,  78,  78,  78,  78,  78,  78,  78,  78,  78,  78,
+		 78,  78,  78,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  84,  91,  91,  91,  91,  91,  91,  91,  91,  91,  91,  91,  91,  91,  97,  97,  97,
+		 97,  97,  97,  97,  97,  97,  97,  97,  97,  97, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 110, 110, 110, 110, 110, 110, 110, 110, 110,
+		110, 110, 110, 110, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
+		132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 145, 145, 145, 145,
+		145, 145, 145, 145, 145, 145, 145, 145, 145, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158,
+		158, 158, 158, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 177, 177, 177,
+		177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 190, 190, 190, 190, 190, 190, 190, 190, 190,
+		190, 190, 190, 190, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 210, 210,
+		210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 223, 223, 223, 223, 223, 223, 223, 223,
+		223, 223, 223, 223, 223, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 242,
+		242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 255, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_48
+		  0,   0,   0,   0,   0,   0,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  16,  16,  16,  16,
+		 16,  16,  16,  16,  16,  16,  21,  21,  21,  21,  21,  21,  21,  21,  21,  21,  21,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  27,  32,  32,  32,  32,
+		 32,  32,  32,  32,  32,  32,  32,  38,  38,  38,  38,  38,  38,  38,  38,  38,  38,  38,  43,  43,  43,  43,  43,  43,  43,  43,  43,  43,  48,  48,  48,  48,
+		 48,  48,  48,  48,  48,  48,  48,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  54,  59,  59,  59,  59,  59,  59,  59,  59,  59,  59,  59,  65,  65,  65,
+		 65,  65,  65,  65,  65,  65,  65,  65,  70,  70,  70,  70,  70,  70,  70,  70,  70,  70,  70,  76,  76,  76,  76,  76,  76,  76,  76,  76,  76,  76,  81,  81,
+		 81,  81,  81,  81,  81,  81,  81,  81,  86,  86,  86,  86,  86,  86,  86,  86,  86,  86,  86,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  92,  97,  97,
+		 97,  97,  97,  97,  97,  97,  97,  97,  97, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 113, 113,
+		113, 113, 113, 113, 113, 113, 113, 113, 113, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+		131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 142, 142, 142, 142, 142, 142, 142, 142, 142,
+		142, 142, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 158, 158, 158, 158, 158, 158, 158, 158, 158,
+		158, 158, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 169, 169, 169, 169, 169, 169, 169, 169, 169, 169, 169, 174, 174, 174, 174, 174, 174, 174, 174,
+		174, 174, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 190, 190, 190, 190, 190, 190, 190, 190,
+		190, 190, 190, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 207, 207, 207, 207, 207, 207, 207,
+		207, 207, 207, 207, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 223, 223, 223, 223, 223, 223, 223,
+		223, 223, 223, 223, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 239, 239, 239, 239, 239, 239,
+		239, 239, 239, 239, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 255, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_64
+		  0,   0,   0,   0,   0,   4,   4,   4,   4,   4,   4,   4,   4,   8,   8,   8,   8,   8,   8,   8,   8,  12,  12,  12,  12,  12,  12,  12,  12,  16,  16,  16,
+		 16,  16,  16,  16,  16,  20,  20,  20,  20,  20,  20,  20,  20,  24,  24,  24,  24,  24,  24,  24,  24,  28,  28,  28,  28,  28,  28,  28,  28,  32,  32,  32,
+		 32,  32,  32,  32,  32,  36,  36,  36,  36,  36,  36,  36,  36,  40,  40,  40,  40,  40,  40,  40,  40,  44,  44,  44,  44,  44,  44,  44,  44,  48,  48,  48,
+		 48,  48,  48,  48,  48,  52,  52,  52,  52,  52,  52,  52,  52,  56,  56,  56,  56,  56,  56,  56,  56,  60,  60,  60,  60,  60,  60,  60,  60,  60,  65,  65,
+		 65,  65,  65,  65,  65,  65,  65,  69,  69,  69,  69,  69,  69,  69,  69,  73,  73,  73,  73,  73,  73,  73,  73,  77,  77,  77,  77,  77,  77,  77,  77,  81,
+		 81,  81,  81,  81,  81,  81,  81,  85,  85,  85,  85,  85,  85,  85,  85,  89,  89,  89,  89,  89,  89,  89,  89,  93,  93,  93,  93,  93,  93,  93,  93,  97,
+		 97,  97,  97,  97,  97,  97,  97, 101, 101, 101, 101, 101, 101, 101, 101, 105, 105, 105, 105, 105, 105, 105, 105, 109, 109, 109, 109, 109, 109, 109, 109, 113,
+		113, 113, 113, 113, 113, 113, 113, 117, 117, 117, 117, 117, 117, 117, 117, 121, 121, 121, 121, 121, 121, 121, 121, 125, 125, 125, 125, 125, 125, 125, 125, 125,
+		130, 130, 130, 130, 130, 130, 130, 130, 130, 134, 134, 134, 134, 134, 134, 134, 134, 138, 138, 138, 138, 138, 138, 138, 138, 142, 142, 142, 142, 142, 142, 142,
+		142, 146, 146, 146, 146, 146, 146, 146, 146, 150, 150, 150, 150, 150, 150, 150, 150, 154, 154, 154, 154, 154, 154, 154, 154, 158, 158, 158, 158, 158, 158, 158,
+		158, 162, 162, 162, 162, 162, 162, 162, 162, 166, 166, 166, 166, 166, 166, 166, 166, 170, 170, 170, 170, 170, 170, 170, 170, 174, 174, 174, 174, 174, 174, 174,
+		174, 178, 178, 178, 178, 178, 178, 178, 178, 182, 182, 182, 182, 182, 182, 182, 182, 186, 186, 186, 186, 186, 186, 186, 186, 190, 190, 190, 190, 190, 190, 190,
+		190, 190, 195, 195, 195, 195, 195, 195, 195, 195, 195, 199, 199, 199, 199, 199, 199, 199, 199, 203, 203, 203, 203, 203, 203, 203, 203, 207, 207, 207, 207, 207,
+		207, 207, 207, 211, 211, 211, 211, 211, 211, 211, 211, 215, 215, 215, 215, 215, 215, 215, 215, 219, 219, 219, 219, 219, 219, 219, 219, 223, 223, 223, 223, 223,
+		223, 223, 223, 227, 227, 227, 227, 227, 227, 227, 227, 231, 231, 231, 231, 231, 231, 231, 231, 235, 235, 235, 235, 235, 235, 235, 235, 239, 239, 239, 239, 239,
+		239, 239, 239, 243, 243, 243, 243, 243, 243, 243, 243, 247, 247, 247, 247, 247, 247, 247, 247, 251, 251, 251, 251, 251, 251, 251, 251, 255, 255, 255, 255, 255
+	},
+	{ // QUANT_80
+		  0,   0,   0,   0,   3,   3,   3,   3,   3,   3,   6,   6,   6,   6,   6,   6,   9,   9,   9,   9,   9,   9,   9,  13,  13,  13,  13,  13,  13,  13,  16,  16,
+		 16,  16,  16,  16,  19,  19,  19,  19,  19,  19,  22,  22,  22,  22,  22,  22,  25,  25,  25,  25,  25,  25,  25,  29,  29,  29,  29,  29,  29,  29,  32,  32,
+		 32,  32,  32,  32,  35,  35,  35,  35,  35,  35,  38,  38,  38,  38,  38,  38,  38,  42,  42,  42,  42,  42,  42,  42,  45,  45,  45,  45,  45,  45,  48,  48,
+		 48,  48,  48,  48,  51,  51,  51,  51,  51,  51,  54,  54,  54,  54,  54,  54,  54,  58,  58,  58,  58,  58,  58,  58,  61,  61,  61,  61,  61,  61,  64,  64,
+		 64,  64,  64,  64,  67,  67,  67,  67,  67,  67,  67,  71,  71,  71,  71,  71,  71,  71,  74,  74,  74,  74,  74,  74,  77,  77,  77,  77,  77,  77,  80,  80,
+		 80,  80,  80,  80,  83,  83,  83,  83,  83,  83,  83,  87,  87,  87,  87,  87,  87,  87,  90,  90,  90,  90,  90,  90,  93,  93,  93,  93,  93,  93,  96,  96,
+		 96,  96,  96,  96,  96, 100, 100, 100, 100, 100, 100, 100, 103, 103, 103, 103, 103, 103, 106, 106, 106, 106, 106, 106, 109, 109, 109, 109, 109, 109, 112, 112,
+		112, 112, 112, 112, 112, 116, 116, 116, 116, 116, 116, 116, 119, 119, 119, 119, 119, 119, 122, 122, 122, 122, 122, 122, 125, 125, 125, 125, 125, 125, 125, 125,
+		130, 130, 130, 130, 130, 130, 130, 130, 133, 133, 133, 133, 133, 133, 136, 136, 136, 136, 136, 136, 139, 139, 139, 139, 139, 139, 139, 143, 143, 143, 143, 143,
+		143, 143, 146, 146, 146, 146, 146, 146, 149, 149, 149, 149, 149, 149, 152, 152, 152, 152, 152, 152, 155, 155, 155, 155, 155, 155, 155, 159, 159, 159, 159, 159,
+		159, 159, 162, 162, 162, 162, 162, 162, 165, 165, 165, 165, 165, 165, 168, 168, 168, 168, 168, 168, 168, 172, 172, 172, 172, 172, 172, 172, 175, 175, 175, 175,
+		175, 175, 178, 178, 178, 178, 178, 178, 181, 181, 181, 181, 181, 181, 184, 184, 184, 184, 184, 184, 184, 188, 188, 188, 188, 188, 188, 188, 191, 191, 191, 191,
+		191, 191, 194, 194, 194, 194, 194, 194, 197, 197, 197, 197, 197, 197, 197, 201, 201, 201, 201, 201, 201, 201, 204, 204, 204, 204, 204, 204, 207, 207, 207, 207,
+		207, 207, 210, 210, 210, 210, 210, 210, 213, 213, 213, 213, 213, 213, 213, 217, 217, 217, 217, 217, 217, 217, 220, 220, 220, 220, 220, 220, 223, 223, 223, 223,
+		223, 223, 226, 226, 226, 226, 226, 226, 226, 230, 230, 230, 230, 230, 230, 230, 233, 233, 233, 233, 233, 233, 236, 236, 236, 236, 236, 236, 239, 239, 239, 239,
+		239, 239, 242, 242, 242, 242, 242, 242, 242, 246, 246, 246, 246, 246, 246, 246, 249, 249, 249, 249, 249, 249, 252, 252, 252, 252, 252, 252, 255, 255, 255, 255
+	},
+	{ // QUANT_96
+		  0,   0,   0,   2,   2,   2,   2,   2,   5,   5,   5,   5,   5,   5,   8,   8,   8,   8,   8,  10,  10,  10,  10,  10,  13,  13,  13,  13,  13,  13,  16,  16,
+		 16,  16,  16,  18,  18,  18,  18,  18,  21,  21,  21,  21,  21,  21,  24,  24,  24,  24,  24,  26,  26,  26,  26,  26,  29,  29,  29,  29,  29,  29,  32,  32,
+		 32,  32,  32,  32,  35,  35,  35,  35,  35,  37,  37,  37,  37,  37,  40,  40,  40,  40,  40,  40,  43,  43,  43,  43,  43,  45,  45,  45,  45,  45,  48,  48,
+		 48,  48,  48,  48,  51,  51,  51,  51,  51,  53,  53,  53,  53,  53,  56,  56,  56,  56,  56,  56,  59,  59,  59,  59,  59,  61,  61,  61,  61,  61,  64,  64,
+		 64,  64,  64,  64,  67,  67,  67,  67,  67,  67,  70,  70,  70,  70,  70,  72,  72,  72,  72,  72,  75,  75,  75,  75,  75,  75,  78,  78,  78,  78,  78,  80,
+		 80,  80,  80,  80,  83,  83,  83,  83,  83,  83,  86,  86,  86,  86,  86,  88,  88,  88,  88,  88,  91,  91,  91,  91,  91,  91,  94,  94,  94,  94,  94,  96,
+		 96,  96,  96,  96,  99,  99,  99,  99,  99,  99, 102, 102, 102, 102, 102, 104, 104, 104, 104, 104, 107, 107, 107, 107, 107, 107, 110, 110, 110, 110, 110, 112,
+		112, 112, 112, 112, 115, 115, 115, 115, 115, 115, 118, 118, 118, 118, 118, 120, 120, 120, 120, 120, 123, 123, 123, 123, 123, 123, 126, 126, 126, 126, 126, 126,
+		129, 129, 129, 129, 129, 129, 132, 132, 132, 132, 132, 132, 135, 135, 135, 135, 135, 137, 137, 137, 137, 137, 140, 140, 140, 140, 140, 140, 143, 143, 143, 143,
+		143, 145, 145, 145, 145, 145, 148, 148, 148, 148, 148, 148, 151, 151, 151, 151, 151, 153, 153, 153, 153, 153, 156, 156, 156, 156, 156, 156, 159, 159, 159, 159,
+		159, 161, 161, 161, 161, 161, 164, 164, 164, 164, 164, 164, 167, 167, 167, 167, 167, 169, 169, 169, 169, 169, 172, 172, 172, 172, 172, 172, 175, 175, 175, 175,
+		175, 177, 177, 177, 177, 177, 180, 180, 180, 180, 180, 180, 183, 183, 183, 183, 183, 185, 185, 185, 185, 185, 188, 188, 188, 188, 188, 188, 191, 191, 191, 191,
+		191, 191, 194, 194, 194, 194, 194, 196, 196, 196, 196, 196, 199, 199, 199, 199, 199, 199, 202, 202, 202, 202, 202, 204, 204, 204, 204, 204, 207, 207, 207, 207,
+		207, 207, 210, 210, 210, 210, 210, 212, 212, 212, 212, 212, 215, 215, 215, 215, 215, 215, 218, 218, 218, 218, 218, 220, 220, 220, 220, 220, 223, 223, 223, 223,
+		223, 223, 226, 226, 226, 226, 226, 226, 229, 229, 229, 229, 229, 231, 231, 231, 231, 231, 234, 234, 234, 234, 234, 234, 237, 237, 237, 237, 237, 239, 239, 239,
+		239, 239, 242, 242, 242, 242, 242, 242, 245, 245, 245, 245, 245, 247, 247, 247, 247, 247, 250, 250, 250, 250, 250, 250, 253, 253, 253, 253, 253, 255, 255, 255
+	},
+	{ // QUANT_128
+		  0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6,   8,   8,   8,   8,  10,  10,  10,  10,  12,  12,  12,  12,  14,  14,  14,  14,  16,
+		 16,  16,  16,  18,  18,  18,  18,  20,  20,  20,  20,  22,  22,  22,  22,  24,  24,  24,  24,  26,  26,  26,  26,  28,  28,  28,  28,  30,  30,  30,  30,  32,
+		 32,  32,  32,  34,  34,  34,  34,  36,  36,  36,  36,  38,  38,  38,  38,  40,  40,  40,  40,  42,  42,  42,  42,  44,  44,  44,  44,  46,  46,  46,  46,  48,
+		 48,  48,  48,  50,  50,  50,  50,  52,  52,  52,  52,  54,  54,  54,  54,  56,  56,  56,  56,  58,  58,  58,  58,  60,  60,  60,  60,  62,  62,  62,  62,  64,
+		 64,  64,  64,  66,  66,  66,  66,  68,  68,  68,  68,  70,  70,  70,  70,  72,  72,  72,  72,  74,  74,  74,  74,  76,  76,  76,  76,  78,  78,  78,  78,  80,
+		 80,  80,  80,  82,  82,  82,  82,  84,  84,  84,  84,  86,  86,  86,  86,  88,  88,  88,  88,  90,  90,  90,  90,  92,  92,  92,  92,  94,  94,  94,  94,  96,
+		 96,  96,  96,  98,  98,  98,  98, 100, 100, 100, 100, 102, 102, 102, 102, 104, 104, 104, 104, 106, 106, 106, 106, 108, 108, 108, 108, 110, 110, 110, 110, 112,
+		112, 112, 112, 114, 114, 114, 114, 116, 116, 116, 116, 118, 118, 118, 118, 120, 120, 120, 120, 122, 122, 122, 122, 124, 124, 124, 124, 126, 126, 126, 126, 126,
+		129, 129, 129, 129, 129, 131, 131, 131, 131, 133, 133, 133, 133, 135, 135, 135, 135, 137, 137, 137, 137, 139, 139, 139, 139, 141, 141, 141, 141, 143, 143, 143,
+		143, 145, 145, 145, 145, 147, 147, 147, 147, 149, 149, 149, 149, 151, 151, 151, 151, 153, 153, 153, 153, 155, 155, 155, 155, 157, 157, 157, 157, 159, 159, 159,
+		159, 161, 161, 161, 161, 163, 163, 163, 163, 165, 165, 165, 165, 167, 167, 167, 167, 169, 169, 169, 169, 171, 171, 171, 171, 173, 173, 173, 173, 175, 175, 175,
+		175, 177, 177, 177, 177, 179, 179, 179, 179, 181, 181, 181, 181, 183, 183, 183, 183, 185, 185, 185, 185, 187, 187, 187, 187, 189, 189, 189, 189, 191, 191, 191,
+		191, 193, 193, 193, 193, 195, 195, 195, 195, 197, 197, 197, 197, 199, 199, 199, 199, 201, 201, 201, 201, 203, 203, 203, 203, 205, 205, 205, 205, 207, 207, 207,
+		207, 209, 209, 209, 209, 211, 211, 211, 211, 213, 213, 213, 213, 215, 215, 215, 215, 217, 217, 217, 217, 219, 219, 219, 219, 221, 221, 221, 221, 223, 223, 223,
+		223, 225, 225, 225, 225, 227, 227, 227, 227, 229, 229, 229, 229, 231, 231, 231, 231, 233, 233, 233, 233, 235, 235, 235, 235, 237, 237, 237, 237, 239, 239, 239,
+		239, 241, 241, 241, 241, 243, 243, 243, 243, 245, 245, 245, 245, 247, 247, 247, 247, 249, 249, 249, 249, 251, 251, 251, 251, 253, 253, 253, 253, 255, 255, 255
+	},
+	{ // QUANT_160
+		  0,   0,   1,   1,   1,   3,   3,   3,   4,   4,   4,   6,   6,   6,   6,   8,   8,   8,   9,   9,   9,  11,  11,  11,  12,  12,  12,  14,  14,  14,  14,  16,
+		 16,  16,  17,  17,  17,  19,  19,  19,  20,  20,  20,  22,  22,  22,  22,  24,  24,  24,  25,  25,  25,  27,  27,  27,  28,  28,  28,  30,  30,  30,  30,  32,
+		 32,  32,  33,  33,  33,  35,  35,  35,  36,  36,  36,  38,  38,  38,  38,  40,  40,  40,  41,  41,  41,  43,  43,  43,  44,  44,  44,  46,  46,  46,  46,  48,
+		 48,  48,  49,  49,  49,  51,  51,  51,  52,  52,  52,  54,  54,  54,  54,  56,  56,  56,  57,  57,  57,  59,  59,  59,  60,  60,  60,  62,  62,  62,  62,  64,
+		 64,  64,  65,  65,  65,  67,  67,  67,  68,  68,  68,  70,  70,  70,  70,  72,  72,  72,  73,  73,  73,  75,  75,  75,  76,  76,  76,  78,  78,  78,  78,  80,
+		 80,  80,  81,  81,  81,  83,  83,  83,  84,  84,  84,  86,  86,  86,  86,  88,  88,  88,  89,  89,  89,  91,  91,  91,  92,  92,  92,  94,  94,  94,  94,  96,
+		 96,  96,  97,  97,  97,  99,  99,  99, 100, 100, 100, 102, 102, 102, 102, 104, 104, 104, 105, 105, 105, 107, 107, 107, 108, 108, 108, 110, 110, 110, 110, 112,
+		112, 112, 113, 113, 113, 115, 115, 115, 116, 116, 116, 118, 118, 118, 118, 120, 120, 120, 121, 121, 121, 123, 123, 123, 124, 124, 124, 126, 126, 126, 126, 126,
+		129, 129, 129, 129, 129, 131, 131, 131, 132, 132, 132, 134, 134, 134, 135, 135, 135, 137, 137, 137, 137, 139, 139, 139, 140, 140, 140, 142, 142, 142, 143, 143,
+		143, 145, 145, 145, 145, 147, 147, 147, 148, 148, 148, 150, 150, 150, 151, 151, 151, 153, 153, 153, 153, 155, 155, 155, 156, 156, 156, 158, 158, 158, 159, 159,
+		159, 161, 161, 161, 161, 163, 163, 163, 164, 164, 164, 166, 166, 166, 167, 167, 167, 169, 169, 169, 169, 171, 171, 171, 172, 172, 172, 174, 174, 174, 175, 175,
+		175, 177, 177, 177, 177, 179, 179, 179, 180, 180, 180, 182, 182, 182, 183, 183, 183, 185, 185, 185, 185, 187, 187, 187, 188, 188, 188, 190, 190, 190, 191, 191,
+		191, 193, 193, 193, 193, 195, 195, 195, 196, 196, 196, 198, 198, 198, 199, 199, 199, 201, 201, 201, 201, 203, 203, 203, 204, 204, 204, 206, 206, 206, 207, 207,
+		207, 209, 209, 209, 209, 211, 211, 211, 212, 212, 212, 214, 214, 214, 215, 215, 215, 217, 217, 217, 217, 219, 219, 219, 220, 220, 220, 222, 222, 222, 223, 223,
+		223, 225, 225, 225, 225, 227, 227, 227, 228, 228, 228, 230, 230, 230, 231, 231, 231, 233, 233, 233, 233, 235, 235, 235, 236, 236, 236, 238, 238, 238, 239, 239,
+		239, 241, 241, 241, 241, 243, 243, 243, 244, 244, 244, 246, 246, 246, 247, 247, 247, 249, 249, 249, 249, 251, 251, 251, 252, 252, 252, 254, 254, 254, 255, 255
+	},
+	{ // QUANT_192
+		  0,   0,   1,   1,   2,   2,   2,   4,   4,   4,   5,   5,   6,   6,   6,   8,   8,   8,   9,   9,  10,  10,  10,  12,  12,  12,  13,  13,  14,  14,  14,  16,
+		 16,  16,  17,  17,  18,  18,  18,  20,  20,  20,  21,  21,  22,  22,  22,  24,  24,  24,  25,  25,  26,  26,  26,  28,  28,  28,  29,  29,  30,  30,  30,  32,
+		 32,  32,  33,  33,  34,  34,  34,  36,  36,  36,  37,  37,  38,  38,  38,  40,  40,  40,  41,  41,  42,  42,  42,  44,  44,  44,  45,  45,  46,  46,  46,  48,
+		 48,  48,  49,  49,  50,  50,  50,  52,  52,  52,  53,  53,  54,  54,  54,  56,  56,  56,  57,  57,  58,  58,  58,  60,  60,  60,  61,  61,  62,  62,  62,  64,
+		 64,  64,  65,  65,  66,  66,  66,  68,  68,  68,  69,  69,  70,  70,  70,  72,  72,  72,  73,  73,  74,  74,  74,  76,  76,  76,  77,  77,  78,  78,  78,  80,
+		 80,  80,  81,  81,  82,  82,  82,  84,  84,  84,  85,  85,  86,  86,  86,  88,  88,  88,  89,  89,  90,  90,  90,  92,  92,  92,  93,  93,  94,  94,  94,  96,
+		 96,  96,  97,  97,  98,  98,  98, 100, 100, 100, 101, 101, 102, 102, 102, 104, 104, 104, 105, 105, 106, 106, 106, 108, 108, 108, 109, 109, 110, 110, 110, 112,
+		112, 112, 113, 113, 114, 114, 114, 116, 116, 116, 117, 117, 118, 118, 118, 120, 120, 120, 121, 121, 122, 122, 122, 124, 124, 124, 125, 125, 126, 126, 126, 126,
+		129, 129, 129, 129, 130, 130, 131, 131, 131, 133, 133, 133, 134, 134, 135, 135, 135, 137, 137, 137, 138, 138, 139, 139, 139, 141, 141, 141, 142, 142, 143, 143,
+		143, 145, 145, 145, 146, 146, 147, 147, 147, 149, 149, 149, 150, 150, 151, 151, 151, 153, 153, 153, 154, 154, 155, 155, 155, 157, 157, 157, 158, 158, 159, 159,
+		159, 161, 161, 161, 162, 162, 163, 163, 163, 165, 165, 165, 166, 166, 167, 167, 167, 169, 169, 169, 170, 170, 171, 171, 171, 173, 173, 173, 174, 174, 175, 175,
+		175, 177, 177, 177, 178, 178, 179, 179, 179, 181, 181, 181, 182, 182, 183, 183, 183, 185, 185, 185, 186, 186, 187, 187, 187, 189, 189, 189, 190, 190, 191, 191,
+		191, 193, 193, 193, 194, 194, 195, 195, 195, 197, 197, 197, 198, 198, 199, 199, 199, 201, 201, 201, 202, 202, 203, 203, 203, 205, 205, 205, 206, 206, 207, 207,
+		207, 209, 209, 209, 210, 210, 211, 211, 211, 213, 213, 213, 214, 214, 215, 215, 215, 217, 217, 217, 218, 218, 219, 219, 219, 221, 221, 221, 222, 222, 223, 223,
+		223, 225, 225, 225, 226, 226, 227, 227, 227, 229, 229, 229, 230, 230, 231, 231, 231, 233, 233, 233, 234, 234, 235, 235, 235, 237, 237, 237, 238, 238, 239, 239,
+		239, 241, 241, 241, 242, 242, 243, 243, 243, 245, 245, 245, 246, 246, 247, 247, 247, 249, 249, 249, 250, 250, 251, 251, 251, 253, 253, 253, 254, 254, 255, 255
+	},
+	{ // QUANT_256
+		  0,   0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,   6,   7,   7,   8,   8,   9,   9,  10,  10,  11,  11,  12,  12,  13,  13,  14,  14,  15,  15,
+		 16,  16,  17,  17,  18,  18,  19,  19,  20,  20,  21,  21,  22,  22,  23,  23,  24,  24,  25,  25,  26,  26,  27,  27,  28,  28,  29,  29,  30,  30,  31,  31,
+		 32,  32,  33,  33,  34,  34,  35,  35,  36,  36,  37,  37,  38,  38,  39,  39,  40,  40,  41,  41,  42,  42,  43,  43,  44,  44,  45,  45,  46,  46,  47,  47,
+		 48,  48,  49,  49,  50,  50,  51,  51,  52,  52,  53,  53,  54,  54,  55,  55,  56,  56,  57,  57,  58,  58,  59,  59,  60,  60,  61,  61,  62,  62,  63,  63,
+		 64,  64,  65,  65,  66,  66,  67,  67,  68,  68,  69,  69,  70,  70,  71,  71,  72,  72,  73,  73,  74,  74,  75,  75,  76,  76,  77,  77,  78,  78,  79,  79,
+		 80,  80,  81,  81,  82,  82,  83,  83,  84,  84,  85,  85,  86,  86,  87,  87,  88,  88,  89,  89,  90,  90,  91,  91,  92,  92,  93,  93,  94,  94,  95,  95,
+		 96,  96,  97,  97,  98,  98,  99,  99, 100, 100, 101, 101, 102, 102, 103, 103, 104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
+		112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127,
+		128, 128, 129, 129, 130, 130, 131, 131, 132, 132, 133, 133, 134, 134, 135, 135, 136, 136, 137, 137, 138, 138, 139, 139, 140, 140, 141, 141, 142, 142, 143, 143,
+		144, 144, 145, 145, 146, 146, 147, 147, 148, 148, 149, 149, 150, 150, 151, 151, 152, 152, 153, 153, 154, 154, 155, 155, 156, 156, 157, 157, 158, 158, 159, 159,
+		160, 160, 161, 161, 162, 162, 163, 163, 164, 164, 165, 165, 166, 166, 167, 167, 168, 168, 169, 169, 170, 170, 171, 171, 172, 172, 173, 173, 174, 174, 175, 175,
+		176, 176, 177, 177, 178, 178, 179, 179, 180, 180, 181, 181, 182, 182, 183, 183, 184, 184, 185, 185, 186, 186, 187, 187, 188, 188, 189, 189, 190, 190, 191, 191,
+		192, 192, 193, 193, 194, 194, 195, 195, 196, 196, 197, 197, 198, 198, 199, 199, 200, 200, 201, 201, 202, 202, 203, 203, 204, 204, 205, 205, 206, 206, 207, 207,
+		208, 208, 209, 209, 210, 210, 211, 211, 212, 212, 213, 213, 214, 214, 215, 215, 216, 216, 217, 217, 218, 218, 219, 219, 220, 220, 221, 221, 222, 222, 223, 223,
+		224, 224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239,
+		240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255
+	},
+};
+
+// Starts from QUANT_6
+// Scrambled
+const uint8_t color_uquant_to_scrambled_pquant_tables[17][256] {
+	{ // QUANT_6
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1
+	},
+	{ // QUANT_8
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+		  0,   0,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7
+	},
+	{ // QUANT_10
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1
+	},
+	{ // QUANT_12
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+		  2,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,
+		 11,  11,  11,  11,  11,  11,  11,  11,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,   5,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1
+	},
+	{ // QUANT_16
+		  0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   1,   1,   1,   1,   1,   1,
+		  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   4,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   7,
+		  7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,
+		  9,   9,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  10,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,  11,
+		 11,  11,  11,  11,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,
+		 12,  12,  12,  12,  12,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,
+		 13,  13,  13,  13,  13,  13,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,
+		 14,  14,  14,  14,  14,  14,  14,  15,  15,  15,  15,  15,  15,  15,  15,  15
+	},
+	{ // QUANT_20
+		  0,   0,   0,   0,   0,   0,   0,   4,   4,   4,   4,   4,   4,   4,   4,   4,
+		  4,   4,   4,   4,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,   8,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  12,  16,
+		 16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   6,   6,   6,   6,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  10,  10,  10,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,  14,
+		 14,  14,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,
+		 19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  15,  15,
+		 15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  11,  11,  11,  11,  11,
+		 11,  11,  11,  11,  11,  11,  11,  11,  11,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,   7,   7,   7,   7,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,
+		 17,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,  13,   9,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   5,   5,   5,   5,
+		  5,   5,   5,   5,   5,   5,   5,   5,   5,   1,   1,   1,   1,   1,   1,   1
+	},
+	{ // QUANT_24
+		  0,   0,   0,   0,   0,   0,   8,   8,   8,   8,   8,   8,   8,   8,   8,   8,
+		  8,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,  16,   2,   2,   2,   2,
+		  2,   2,   2,   2,   2,   2,   2,  10,  10,  10,  10,  10,  10,  10,  10,  10,
+		 10,  10,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,  18,   4,   4,   4,
+		  4,   4,   4,   4,   4,   4,   4,   4,  12,  12,  12,  12,  12,  12,  12,  12,
+		 12,  12,  12,  20,  20,  20,  20,  20,  20,  20,  20,  20,  20,  20,   6,   6,
+		  6,   6,   6,   6,   6,   6,   6,   6,   6,  14,  14,  14,  14,  14,  14,  14,
+		 14,  14,  14,  14,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,  22,
+		 23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  23,  15,  15,  15,  15,
+		 15,  15,  15,  15,  15,  15,  15,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+		  7,   7,  21,  21,  21,  21,  21,  21,  21,  21,  21,  21,  21,  13,  13,  13,
+		 13,  13,  13,  13,  13,  13,  13,  13,   5,   5,   5,   5,   5,   5,   5,   5,
+		  5,   5,   5,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  19,  11,  11,
+		 11,  11,  11,  11,  11,  11,  11,  11,  11,   3,   3,   3,   3,   3,   3,   3,
+		  3,   3,   3,   3,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,  17,   9,
+		  9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   1,   1,   1,   1,   1,   1
+	},
+	{ // QUANT_32
+		  0,   0,   0,   0,   1,   1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,
+		  2,   2,   2,   2,   3,   3,   3,   3,   3,   3,   3,   3,   3,   4,   4,   4,
+		  4,   4,   4,   4,   4,   5,   5,   5,   5,   5,   5,   5,   5,   6,   6,   6,
+		  6,   6,   6,   6,   6,   7,   7,   7,   7,   7,   7,   7,   7,   7,   8,   8,
+		  8,   8,   8,   8,   8,   8,   9,   9,   9,   9,   9,   9,   9,   9,  10,  10,
+		 10,  10,  10,  10,  10,  10,  11,  11,  11,  11,  11,  11,  11,  11,  11,  12,
+		 12,  12,  12,  12,  12,  12,  12,  13,  13,  13,  13,  13,  13,  13,  13,  14,
+		 14,  14,  14,  14,  14,  14,  14,  15,  15,  15,  15,  15,  15,  15,  15,  15,
+		 16,  16,  16,  16,  16,  16,  16,  16,  17,  17,  17,  17,  17,  17,  17,  17,
+		 18,  18,  18,  18,  18,  18,  18,  18,  19,  19,  19,  19,  19,  19,  19,  19,
+		 19,  20,  20,  20,  20,  20,  20,  20,  20,  21,  21,  21,  21,  21,  21,  21,
+		 21,  22,  22,  22,  22,  22,  22,  22,  22,  23,  23,  23,  23,  23,  23,  23,
+		 23,  23,  24,  24,  24,  24,  24,  24,  24,  24,  25,  25,  25,  25,  25,  25,
+		 25,  25,  26,  26,  26,  26,  26,  26,  26,  26,  27,  27,  27,  27,  27,  27,
+		 27,  27,  27,  28,  28,  28,  28,  28,  28,  28,  28,  29,  29,  29,  29,  29,
+		 29,  29,  29,  30,  30,  30,  30,  30,  30,  30,  30,  31,  31,  31,  31,  31
+	},
+	{ // QUANT_40
+		  0,   0,   0,   8,   8,   8,   8,   8,   8,   8,  16,  16,  16,  16,  16,  16,
+		 24,  24,  24,  24,  24,  24,  24,  32,  32,  32,  32,  32,  32,  32,   2,   2,
+		  2,   2,   2,   2,  10,  10,  10,  10,  10,  10,  18,  18,  18,  18,  18,  18,
+		 18,  26,  26,  26,  26,  26,  26,  34,  34,  34,  34,  34,  34,  34,   4,   4,
+		  4,   4,   4,   4,  12,  12,  12,  12,  12,  12,  12,  20,  20,  20,  20,  20,
+		 20,  28,  28,  28,  28,  28,  28,  28,  36,  36,  36,  36,  36,  36,  36,   6,
+		  6,   6,   6,   6,   6,  14,  14,  14,  14,  14,  14,  22,  22,  22,  22,  22,
+		 22,  22,  30,  30,  30,  30,  30,  30,  38,  38,  38,  38,  38,  38,  38,  38,
+		 39,  39,  39,  39,  39,  39,  39,  39,  31,  31,  31,  31,  31,  31,  23,  23,
+		 23,  23,  23,  23,  23,  15,  15,  15,  15,  15,  15,   7,   7,   7,   7,   7,
+		  7,  37,  37,  37,  37,  37,  37,  37,  29,  29,  29,  29,  29,  29,  29,  21,
+		 21,  21,  21,  21,  21,  13,  13,  13,  13,  13,  13,  13,   5,   5,   5,   5,
+		  5,   5,  35,  35,  35,  35,  35,  35,  35,  27,  27,  27,  27,  27,  27,  19,
+		 19,  19,  19,  19,  19,  19,  11,  11,  11,  11,  11,  11,   3,   3,   3,   3,
+		  3,   3,  33,  33,  33,  33,  33,  33,  33,  25,  25,  25,  25,  25,  25,  25,
+		 17,  17,  17,  17,  17,  17,   9,   9,   9,   9,   9,   9,   9,   1,   1,   1
+	},
+	{ // QUANT_48
+		  0,   0,   0,  16,  16,  16,  16,  16,  32,  32,  32,  32,  32,  32,   2,   2,
+		  2,   2,   2,  18,  18,  18,  18,  18,  34,  34,  34,  34,  34,  34,   4,   4,
+		  4,   4,   4,  20,  20,  20,  20,  20,  20,  36,  36,  36,  36,  36,   6,   6,
+		  6,   6,   6,  22,  22,  22,  22,  22,  22,  38,  38,  38,  38,  38,  38,   8,
+		  8,   8,   8,   8,  24,  24,  24,  24,  24,  40,  40,  40,  40,  40,  40,  10,
+		 10,  10,  10,  10,  26,  26,  26,  26,  26,  42,  42,  42,  42,  42,  42,  12,
+		 12,  12,  12,  12,  28,  28,  28,  28,  28,  28,  44,  44,  44,  44,  44,  14,
+		 14,  14,  14,  14,  30,  30,  30,  30,  30,  30,  46,  46,  46,  46,  46,  46,
+		 47,  47,  47,  47,  47,  47,  31,  31,  31,  31,  31,  31,  15,  15,  15,  15,
+		 15,  45,  45,  45,  45,  45,  29,  29,  29,  29,  29,  29,  13,  13,  13,  13,
+		 13,  43,  43,  43,  43,  43,  43,  27,  27,  27,  27,  27,  11,  11,  11,  11,
+		 11,  41,  41,  41,  41,  41,  41,  25,  25,  25,  25,  25,   9,   9,   9,   9,
+		  9,  39,  39,  39,  39,  39,  39,  23,  23,  23,  23,  23,  23,   7,   7,   7,
+		  7,   7,  37,  37,  37,  37,  37,  21,  21,  21,  21,  21,  21,   5,   5,   5,
+		  5,   5,  35,  35,  35,  35,  35,  35,  19,  19,  19,  19,  19,   3,   3,   3,
+		  3,   3,  33,  33,  33,  33,  33,  33,  17,  17,  17,  17,  17,   1,   1,   1
+	},
+	{ // QUANT_64
+		  0,   0,   1,   1,   1,   1,   2,   2,   2,   2,   3,   3,   3,   3,   4,   4,
+		  4,   4,   5,   5,   5,   5,   6,   6,   6,   6,   7,   7,   7,   7,   8,   8,
+		  8,   8,   9,   9,   9,   9,  10,  10,  10,  10,  11,  11,  11,  11,  12,  12,
+		 12,  12,  13,  13,  13,  13,  14,  14,  14,  14,  15,  15,  15,  15,  15,  16,
+		 16,  16,  16,  17,  17,  17,  17,  18,  18,  18,  18,  19,  19,  19,  19,  20,
+		 20,  20,  20,  21,  21,  21,  21,  22,  22,  22,  22,  23,  23,  23,  23,  24,
+		 24,  24,  24,  25,  25,  25,  25,  26,  26,  26,  26,  27,  27,  27,  27,  28,
+		 28,  28,  28,  29,  29,  29,  29,  30,  30,  30,  30,  31,  31,  31,  31,  31,
+		 32,  32,  32,  32,  33,  33,  33,  33,  34,  34,  34,  34,  35,  35,  35,  35,
+		 36,  36,  36,  36,  37,  37,  37,  37,  38,  38,  38,  38,  39,  39,  39,  39,
+		 40,  40,  40,  40,  41,  41,  41,  41,  42,  42,  42,  42,  43,  43,  43,  43,
+		 44,  44,  44,  44,  45,  45,  45,  45,  46,  46,  46,  46,  47,  47,  47,  47,
+		 47,  48,  48,  48,  48,  49,  49,  49,  49,  50,  50,  50,  50,  51,  51,  51,
+		 51,  52,  52,  52,  52,  53,  53,  53,  53,  54,  54,  54,  54,  55,  55,  55,
+		 55,  56,  56,  56,  56,  57,  57,  57,  57,  58,  58,  58,  58,  59,  59,  59,
+		 59,  60,  60,  60,  60,  61,  61,  61,  61,  62,  62,  62,  62,  63,  63,  63
+	},
+	{ // QUANT_80
+		  0,   0,  16,  16,  16,  32,  32,  32,  48,  48,  48,  64,  64,  64,  64,   2,
+		  2,   2,  18,  18,  18,  34,  34,  34,  50,  50,  50,  66,  66,  66,  66,   4,
+		  4,   4,  20,  20,  20,  36,  36,  36,  52,  52,  52,  52,  68,  68,  68,   6,
+		  6,   6,  22,  22,  22,  38,  38,  38,  54,  54,  54,  54,  70,  70,  70,   8,
+		  8,   8,  24,  24,  24,  40,  40,  40,  40,  56,  56,  56,  72,  72,  72,  10,
+		 10,  10,  26,  26,  26,  42,  42,  42,  42,  58,  58,  58,  74,  74,  74,  12,
+		 12,  12,  28,  28,  28,  28,  44,  44,  44,  60,  60,  60,  76,  76,  76,  14,
+		 14,  14,  30,  30,  30,  30,  46,  46,  46,  62,  62,  62,  78,  78,  78,  78,
+		 79,  79,  79,  79,  63,  63,  63,  47,  47,  47,  31,  31,  31,  31,  15,  15,
+		 15,  77,  77,  77,  61,  61,  61,  45,  45,  45,  29,  29,  29,  29,  13,  13,
+		 13,  75,  75,  75,  59,  59,  59,  43,  43,  43,  43,  27,  27,  27,  11,  11,
+		 11,  73,  73,  73,  57,  57,  57,  41,  41,  41,  41,  25,  25,  25,   9,   9,
+		  9,  71,  71,  71,  55,  55,  55,  55,  39,  39,  39,  23,  23,  23,   7,   7,
+		  7,  69,  69,  69,  53,  53,  53,  53,  37,  37,  37,  21,  21,  21,   5,   5,
+		  5,  67,  67,  67,  67,  51,  51,  51,  35,  35,  35,  19,  19,  19,   3,   3,
+		  3,  65,  65,  65,  65,  49,  49,  49,  33,  33,  33,  17,  17,  17,   1,   1
+	},
+	{ // QUANT_96
+		  0,  32,  32,  32,  64,  64,  64,   2,   2,  34,  34,  34,  66,  66,  66,   4,
+		  4,  36,  36,  36,  68,  68,  68,   6,   6,  38,  38,  38,  70,  70,  70,   8,
+		  8,   8,  40,  40,  72,  72,  72,  10,  10,  10,  42,  42,  74,  74,  74,  12,
+		 12,  12,  44,  44,  76,  76,  76,  14,  14,  14,  46,  46,  78,  78,  78,  16,
+		 16,  16,  48,  48,  48,  80,  80,  80,  18,  18,  50,  50,  50,  82,  82,  82,
+		 20,  20,  52,  52,  52,  84,  84,  84,  22,  22,  54,  54,  54,  86,  86,  86,
+		 24,  24,  56,  56,  56,  88,  88,  88,  26,  26,  58,  58,  58,  90,  90,  90,
+		 28,  28,  60,  60,  60,  92,  92,  92,  30,  30,  62,  62,  62,  94,  94,  94,
+		 95,  95,  95,  63,  63,  63,  31,  31,  93,  93,  93,  61,  61,  61,  29,  29,
+		 91,  91,  91,  59,  59,  59,  27,  27,  89,  89,  89,  57,  57,  57,  25,  25,
+		 87,  87,  87,  55,  55,  55,  23,  23,  85,  85,  85,  53,  53,  53,  21,  21,
+		 83,  83,  83,  51,  51,  51,  19,  19,  81,  81,  81,  49,  49,  49,  17,  17,
+		 17,  79,  79,  79,  47,  47,  15,  15,  15,  77,  77,  77,  45,  45,  13,  13,
+		 13,  75,  75,  75,  43,  43,  11,  11,  11,  73,  73,  73,  41,  41,   9,   9,
+		  9,  71,  71,  71,  39,  39,  39,   7,   7,  69,  69,  69,  37,  37,  37,   5,
+		  5,  67,  67,  67,  35,  35,  35,   3,   3,  65,  65,  65,  33,  33,  33,   1
+	},
+	{ // QUANT_128
+		  0,   1,   1,   2,   2,   3,   3,   4,   4,   5,   5,   6,   6,   7,   7,   8,
+		  8,   9,   9,  10,  10,  11,  11,  12,  12,  13,  13,  14,  14,  15,  15,  16,
+		 16,  17,  17,  18,  18,  19,  19,  20,  20,  21,  21,  22,  22,  23,  23,  24,
+		 24,  25,  25,  26,  26,  27,  27,  28,  28,  29,  29,  30,  30,  31,  31,  32,
+		 32,  33,  33,  34,  34,  35,  35,  36,  36,  37,  37,  38,  38,  39,  39,  40,
+		 40,  41,  41,  42,  42,  43,  43,  44,  44,  45,  45,  46,  46,  47,  47,  48,
+		 48,  49,  49,  50,  50,  51,  51,  52,  52,  53,  53,  54,  54,  55,  55,  56,
+		 56,  57,  57,  58,  58,  59,  59,  60,  60,  61,  61,  62,  62,  63,  63,  63,
+		 64,  64,  65,  65,  66,  66,  67,  67,  68,  68,  69,  69,  70,  70,  71,  71,
+		 72,  72,  73,  73,  74,  74,  75,  75,  76,  76,  77,  77,  78,  78,  79,  79,
+		 80,  80,  81,  81,  82,  82,  83,  83,  84,  84,  85,  85,  86,  86,  87,  87,
+		 88,  88,  89,  89,  90,  90,  91,  91,  92,  92,  93,  93,  94,  94,  95,  95,
+		 96,  96,  97,  97,  98,  98,  99,  99, 100, 100, 101, 101, 102, 102, 103, 103,
+		104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
+		112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119,
+		120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127
+	},
+	{ // QUANT_160
+		  0,  32,  64,  64,  96, 128, 128, 128,   2,  34,  66,  66,  98, 130, 130, 130,
+		  4,  36,  68,  68, 100, 132, 132, 132,   6,  38,  70,  70, 102, 134, 134, 134,
+		  8,  40,  72,  72, 104, 136, 136, 136,  10,  42,  74,  74, 106, 138, 138, 138,
+		 12,  44,  76,  76, 108, 140, 140, 140,  14,  46,  78,  78, 110, 142, 142, 142,
+		 16,  48,  80,  80, 112, 144, 144, 144,  18,  50,  82,  82, 114, 146, 146, 146,
+		 20,  52,  84,  84, 116, 148, 148, 148,  22,  54,  86,  86, 118, 150, 150, 150,
+		 24,  56,  88,  88, 120, 152, 152, 152,  26,  58,  90,  90, 122, 154, 154, 154,
+		 28,  60,  92,  92, 124, 156, 156, 156,  30,  62,  94,  94, 126, 158, 158, 158,
+		159, 159, 159, 127,  95,  95,  63,  31, 157, 157, 157, 125,  93,  93,  61,  29,
+		155, 155, 155, 123,  91,  91,  59,  27, 153, 153, 153, 121,  89,  89,  57,  25,
+		151, 151, 151, 119,  87,  87,  55,  23, 149, 149, 149, 117,  85,  85,  53,  21,
+		147, 147, 147, 115,  83,  83,  51,  19, 145, 145, 145, 113,  81,  81,  49,  17,
+		143, 143, 143, 111,  79,  79,  47,  15, 141, 141, 141, 109,  77,  77,  45,  13,
+		139, 139, 139, 107,  75,  75,  43,  11, 137, 137, 137, 105,  73,  73,  41,   9,
+		135, 135, 135, 103,  71,  71,  39,   7, 133, 133, 133, 101,  69,  69,  37,   5,
+		131, 131, 131,  99,  67,  67,  35,   3, 129, 129, 129,  97,  65,  65,  33,   1
+	},
+	{ // QUANT_192
+		  0,  64, 128, 128,   2,  66, 130, 130,   4,  68, 132, 132,   6,  70, 134, 134,
+		  8,  72, 136, 136,  10,  74, 138, 138,  12,  76, 140, 140,  14,  78, 142, 142,
+		 16,  80, 144, 144,  18,  82, 146, 146,  20,  84, 148, 148,  22,  86, 150, 150,
+		 24,  88, 152, 152,  26,  90, 154, 154,  28,  92, 156, 156,  30,  94, 158, 158,
+		 32,  96, 160, 160,  34,  98, 162, 162,  36, 100, 164, 164,  38, 102, 166, 166,
+		 40, 104, 168, 168,  42, 106, 170, 170,  44, 108, 172, 172,  46, 110, 174, 174,
+		 48, 112, 176, 176,  50, 114, 178, 178,  52, 116, 180, 180,  54, 118, 182, 182,
+		 56, 120, 184, 184,  58, 122, 186, 186,  60, 124, 188, 188,  62, 126, 190, 190,
+		191, 191, 127,  63, 189, 189, 125,  61, 187, 187, 123,  59, 185, 185, 121,  57,
+		183, 183, 119,  55, 181, 181, 117,  53, 179, 179, 115,  51, 177, 177, 113,  49,
+		175, 175, 111,  47, 173, 173, 109,  45, 171, 171, 107,  43, 169, 169, 105,  41,
+		167, 167, 103,  39, 165, 165, 101,  37, 163, 163,  99,  35, 161, 161,  97,  33,
+		159, 159,  95,  31, 157, 157,  93,  29, 155, 155,  91,  27, 153, 153,  89,  25,
+		151, 151,  87,  23, 149, 149,  85,  21, 147, 147,  83,  19, 145, 145,  81,  17,
+		143, 143,  79,  15, 141, 141,  77,  13, 139, 139,  75,  11, 137, 137,  73,   9,
+		135, 135,  71,   7, 133, 133,  69,   5, 131, 131,  67,   3, 129, 129,  65,   1
+	},
+	{ // QUANT_256
+		  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+		 16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+		 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+		 48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+		 64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+		 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+		 96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+		112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+		128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+		144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+		160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+		176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+		192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+		208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+		224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+		240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
+	}
+};
+
+#endif
+
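+// Per-level lookup tables, one for each quantization level starting from QUANT_6.
+// Each table is indexed by the scrambled packed value and yields the unquantized value.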
+// 6-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q6[6] {
+	  0, 255,  51, 204, 102, 153
+};
+
+// 8-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. For this level the entries are stored in ascending order.
+static const uint8_t color_scrambled_pquant_to_uquant_q8[8] {
+	  0,  36,  73, 109, 146, 182, 219, 255
+};
+
+// 10-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q10[10] {
+	  0, 255,  28, 227,  56, 199,  84, 171, 113, 142
+};
+
+// 12-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q12[12] {
+	  0, 255,  69, 186,  23, 232,  92, 163,  46, 209, 116, 139
+};
+
+// 16-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. For this level the entries are stored in ascending order.
+static const uint8_t color_scrambled_pquant_to_uquant_q16[16] {
+	  0,  17,  34,  51,  68,  85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255
+};
+
+// 20-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q20[20] {
+	  0, 255,  67, 188,  13, 242,  80, 175,  27, 228,  94, 161,  40, 215, 107, 148,
+	 54, 201, 121, 134
+};
+
+// 24-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q24[24] {
+	  0, 255,  33, 222,  66, 189,  99, 156,  11, 244,  44, 211,  77, 178, 110, 145,
+	 22, 233,  55, 200,  88, 167, 121, 134
+};
+
+// 32-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. For this level the entries are stored in ascending order.
+static const uint8_t color_scrambled_pquant_to_uquant_q32[32] {
+	  0,   8,  16,  24,  33,  41,  49,  57,  66,  74,  82,  90,  99, 107, 115, 123,
+	132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255
+};
+
+// 40-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q40[40] {
+	  0, 255,  32, 223,  65, 190,  97, 158,   6, 249,  39, 216,  71, 184, 104, 151,
+	 13, 242,  45, 210,  78, 177, 110, 145,  19, 236,  52, 203,  84, 171, 117, 138,
+	 26, 229,  58, 197,  91, 164, 123, 132
+};
+
+// 48-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q48[48] {
+	  0, 255,  16, 239,  32, 223,  48, 207,  65, 190,  81, 174,  97, 158, 113, 142,
+	  5, 250,  21, 234,  38, 217,  54, 201,  70, 185,  86, 169, 103, 152, 119, 136,
+	 11, 244,  27, 228,  43, 212,  59, 196,  76, 179,  92, 163, 108, 147, 124, 131
+};
+
+// 64-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. For this level the entries are stored in ascending order.
+static const uint8_t color_scrambled_pquant_to_uquant_q64[64] {
+	  0,   4,   8,  12,  16,  20,  24,  28,  32,  36,  40,  44,  48,  52,  56,  60,
+	 65,  69,  73,  77,  81,  85,  89,  93,  97, 101, 105, 109, 113, 117, 121, 125,
+	130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190,
+	195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255,
+};
+
+// 80-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q80[80] {
+	  0, 255,  16, 239,  32, 223,  48, 207,  64, 191,  80, 175,  96, 159, 112, 143,
+	  3, 252,  19, 236,  35, 220,  51, 204,  67, 188,  83, 172, 100, 155, 116, 139,
+	  6, 249,  22, 233,  38, 217,  54, 201,  71, 184,  87, 168, 103, 152, 119, 136,
+	  9, 246,  25, 230,  42, 213,  58, 197,  74, 181,  90, 165, 106, 149, 122, 133,
+	 13, 242,  29, 226,  45, 210,  61, 194,  77, 178,  93, 162, 109, 146, 125, 130
+};
+
+// 96-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q96[96] {
+	  0, 255,   8, 247,  16, 239,  24, 231,  32, 223,  40, 215,  48, 207,  56, 199,
+	 64, 191,  72, 183,  80, 175,  88, 167,  96, 159, 104, 151, 112, 143, 120, 135,
+	  2, 253,  10, 245,  18, 237,  26, 229,  35, 220,  43, 212,  51, 204,  59, 196,
+	 67, 188,  75, 180,  83, 172,  91, 164,  99, 156, 107, 148, 115, 140, 123, 132,
+	  5, 250,  13, 242,  21, 234,  29, 226,  37, 218,  45, 210,  53, 202,  61, 194,
+	 70, 185,  78, 177,  86, 169,  94, 161, 102, 153, 110, 145, 118, 137, 126, 129
+};
+
+// 128-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. For this level the entries are stored in ascending order.
+static const uint8_t color_scrambled_pquant_to_uquant_q128[128] {
+	  0,   2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24,  26,  28,  30,
+	 32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,  60,  62,
+	 64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,  86,  88,  90,  92,  94,
+	 96,  98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
+	129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
+	161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
+	193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
+	225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255
+};
+
+// 160-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q160[160] {
+	  0, 255,   8, 247,  16, 239,  24, 231,  32, 223,  40, 215,  48, 207,  56, 199,
+	 64, 191,  72, 183,  80, 175,  88, 167,  96, 159, 104, 151, 112, 143, 120, 135,
+	  1, 254,   9, 246,  17, 238,  25, 230,  33, 222,  41, 214,  49, 206,  57, 198,
+	 65, 190,  73, 182,  81, 174,  89, 166,  97, 158, 105, 150, 113, 142, 121, 134,
+	  3, 252,  11, 244,  19, 236,  27, 228,  35, 220,  43, 212,  51, 204,  59, 196,
+	 67, 188,  75, 180,  83, 172,  91, 164,  99, 156, 107, 148, 115, 140, 123, 132,
+	  4, 251,  12, 243,  20, 235,  28, 227,  36, 219,  44, 211,  52, 203,  60, 195,
+	 68, 187,  76, 179,  84, 171,  92, 163, 100, 155, 108, 147, 116, 139, 124, 131,
+	  6, 249,  14, 241,  22, 233,  30, 225,  38, 217,  46, 209,  54, 201,  62, 193,
+	 70, 185,  78, 177,  86, 169,  94, 161, 102, 153, 110, 145, 118, 137, 126, 129
+};
+
+// 192-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. Entries alternate between the low and high ends of the 0..255 range.
+static const uint8_t color_scrambled_pquant_to_uquant_q192[192] {
+	  0, 255,   4, 251,   8, 247,  12, 243,  16, 239,  20, 235,  24, 231,  28, 227,
+	 32, 223,  36, 219,  40, 215,  44, 211,  48, 207,  52, 203,  56, 199,  60, 195,
+	 64, 191,  68, 187,  72, 183,  76, 179,  80, 175,  84, 171,  88, 167,  92, 163,
+	 96, 159, 100, 155, 104, 151, 108, 147, 112, 143, 116, 139, 120, 135, 124, 131,
+	  1, 254,   5, 250,   9, 246,  13, 242,  17, 238,  21, 234,  25, 230,  29, 226,
+	 33, 222,  37, 218,  41, 214,  45, 210,  49, 206,  53, 202,  57, 198,  61, 194,
+	 65, 190,  69, 186,  73, 182,  77, 178,  81, 174,  85, 170,  89, 166,  93, 162,
+	 97, 158, 101, 154, 105, 150, 109, 146, 113, 142, 117, 138, 121, 134, 125, 130,
+	  2, 253,   6, 249,  10, 245,  14, 241,  18, 237,  22, 233,  26, 229,  30, 225,
+	 34, 221,  38, 217,  42, 213,  46, 209,  50, 205,  54, 201,  58, 197,  62, 193,
+	 66, 189,  70, 185,  74, 181,  78, 177,  82, 173,  86, 169,  90, 165,  94, 161,
+	 98, 157, 102, 153, 106, 149, 110, 145, 114, 141, 118, 137, 122, 133, 126, 129
+};
+
+// 256-level color table: the index is the scrambled packed value, the entry is the unquantized
+// value. For this level the table is the identity mapping.
+static const uint8_t color_scrambled_pquant_to_uquant_q256[256] {
+	  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+	 16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+	 32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+	 48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+	 64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+	 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+	 96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+	112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+	128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+	144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+	160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+	176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+	192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+	208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+	224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+	240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
+};
+
+// Per-level lookup tables in increasing level order. Entry 0 is the 6-level table, so the
+// decoder selects a table with (color quant mode - QUANT_6) and then indexes it with the
+// scrambled packed value to recover the unquantized color value.
+const uint8_t* color_scrambled_pquant_to_uquant_tables[17] {
+	color_scrambled_pquant_to_uquant_q6,
+	color_scrambled_pquant_to_uquant_q8,
+	color_scrambled_pquant_to_uquant_q10,
+	color_scrambled_pquant_to_uquant_q12,
+	color_scrambled_pquant_to_uquant_q16,
+	color_scrambled_pquant_to_uquant_q20,
+	color_scrambled_pquant_to_uquant_q24,
+	color_scrambled_pquant_to_uquant_q32,
+	color_scrambled_pquant_to_uquant_q40,
+	color_scrambled_pquant_to_uquant_q48,
+	color_scrambled_pquant_to_uquant_q64,
+	color_scrambled_pquant_to_uquant_q80,
+	color_scrambled_pquant_to_uquant_q96,
+	color_scrambled_pquant_to_uquant_q128,
+	color_scrambled_pquant_to_uquant_q160,
+	color_scrambled_pquant_to_uquant_q192,
+	color_scrambled_pquant_to_uquant_q256
+};
+
+// quant_mode_table[integer_count / 2][bits] gives the quantization level for a given integer
+// count and number of bits that the integers must fit into.
+//
+// Row 0 and the lowest bit counts of every other row hold -1. The block decoder treats any
+// entry below QUANT_6 as an invalid block, so those combinations are never decoded.
+const int8_t quant_mode_table[10][128] {
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+    },
+    {
+         -1, -1,  0,  0,  2,  3,  5,  6,  8,  9, 11, 12, 14, 15, 17, 18,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1,  0,  0,  0,  1,  2,  2,  3,  4,  5,  5,  6,  7,
+          8,  8,  9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  1,  1,  2,  2,  3,  3,
+          4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11,
+         12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  1,  1,  1,
+          2,  2,  2,  3,  3,  4,  4,  4,  5,  5,  5,  6,  6,  7,  7,  7,
+          8,  8,  8,  9,  9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13,
+         14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,  0,  0,
+          1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  4,  4,  4,  4,  5,  5,
+          5,  5,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9, 10, 10,
+         10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14,
+         15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,  0,  0,
+          0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+          4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
+          8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11,
+         12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
+         16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0,
+          0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,
+          2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,
+          6,  6,  6,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,
+          9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13,
+         13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
+         16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19,
+         20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
+          2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,
+          5,  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,
+          8,  8,  8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
+         11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13,
+         14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+         17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19
+    },
+    {
+         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,
+          1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  4,
+          4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  6,  6,  6,  6,
+          6,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  9,  9,
+          9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+         12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
+         14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17
+    }
+};
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for converting between symbolic and physical encodings.
+ */
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Reverse the order of the low 8 bits of a value.
+ *
+ * Only bits 0-7 of the input contribute to the result; higher bits are discarded.
+ *
+ * @param p   The value to reverse.
+ *
+ * @return The bit-reversed byte, in the range 0 to 255.
+ */
+static inline int bitrev8(int p)
+{
+	// Swap neighbouring bits, then neighbouring bit pairs, then the two nibbles
+	int swapped = ((p >> 1) & 0x55) | ((p & 0x55) << 1);
+	swapped = ((swapped >> 2) & 0x33) | ((swapped & 0x33) << 2);
+	swapped = ((swapped >> 4) & 0x0F) | ((swapped & 0x0F) << 4);
+	return swapped;
+}
+
+
+/**
+ * @brief Read a small bit field from an arbitrary bit offset in a byte array.
+ *
+ * The field is extracted from a 16-bit little-endian window that starts at the byte containing
+ * the first bit, so the offset within that byte plus @c bitcount must not exceed 16. Both bytes
+ * of the window are always read, even if the field lies entirely in the first one.
+ *
+ * @param bitcount    The number of bits to read.
+ * @param bitoffset   The bit offset to read from, relative to the start of @c ptr.
+ * @param ptr         The data pointer to read from.
+ *
+ * @return The read value.
+ */
+static inline int read_bits(
+	int bitcount,
+	int bitoffset,
+	const uint8_t* ptr
+) {
+	// Whole bytes of the offset move the window; the remainder is a shift within it
+	const uint8_t* base = ptr + (bitoffset >> 3);
+	int shift = bitoffset & 7;
+
+	int window = (base[1] << 8) | base[0];
+	return (window >> shift) & ((1 << bitcount) - 1);
+}
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Write a small bit field at an arbitrary bit offset in a byte array.
+ *
+ * The field is stored little-endian across at most two bytes, starting at the byte containing
+ * the first bit, so the offset within that byte plus @c bitcount must not exceed 16. Bits
+ * outside the field are preserved. The second byte is only accessed when the field actually
+ * reaches into it, so a field that ends in the last byte of a buffer is safe to write.
+ *
+ * @param         value       The value to write.
+ * @param         bitcount    The number of bits to write, starting from LSB.
+ * @param         bitoffset   The bit offset to store at, relative to the start of @c ptr.
+ * @param[in,out] ptr         The data pointer to write to.
+ */
+static inline void write_bits(
+	int value,
+	int bitcount,
+	int bitoffset,
+	uint8_t* ptr
+) {
+	// Use unsigned arithmetic throughout; right-shifting the inverted mask as a negative
+	// signed value would be implementation-defined
+	unsigned int field_mask = (1u << bitcount) - 1u;
+	unsigned int field = value & field_mask;
+
+	ptr += bitoffset >> 3;
+	unsigned int shift = bitoffset & 7;
+
+	field <<= shift;
+	unsigned int keep_mask = ~(field_mask << shift);
+
+	ptr[0] &= keep_mask;
+	ptr[0] |= field;
+
+	// Only touch the following byte if the field spills over into it
+	if (shift + bitcount > 8)
+	{
+		ptr[1] &= keep_mask >> 8;
+		ptr[1] |= field >> 8;
+	}
+}
+
+/* See header for documentation. */
+void symbolic_to_physical(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	uint8_t pcb[16]
+) {
+	assert(scb.block_type != SYM_BTYPE_ERROR);
+
+	// Constant color blocks: a fixed 8 byte header followed by four 16-bit color values
+	bool is_const_u16 = scb.block_type == SYM_BTYPE_CONST_U16;
+	bool is_const_f16 = scb.block_type == SYM_BTYPE_CONST_F16;
+	if (is_const_u16 || is_const_f16)
+	{
+		// There is currently no attempt to coalesce larger void-extents, so the header is a
+		// constant that differs between the UNORM16 and FP16 variants only in its second byte
+		static const uint8_t header_u16[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+		static const uint8_t header_f16[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+		const uint8_t* header = is_const_u16 ? header_u16 : header_f16;
+
+		for (unsigned int i = 0; i < 8; i++)
+		{
+			pcb[i] = header[i];
+		}
+
+		// Each color component is stored low byte first
+		for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
+		{
+			pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
+			pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
+		}
+
+		return;
+	}
+
+	// Look up the weight grid and weight quantization used by this block mode
+	const auto& bm = bsd.get_block_mode(scb.block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+	quant_method weight_quant_method = bm.get_weight_quant_mode();
+	const auto& qat = quant_and_xfer_tables[weight_quant_method];
+
+	int is_dual_plane = bm.is_dual_plane;
+	int weight_count = di.weight_count;
+	int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
+	int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
+
+	// Requantize the stored weights and scramble them into their packed form. For dual-plane
+	// blocks the two planes are interleaved, with plane 1 first in each pair
+	float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method));
+	int plane_count = is_dual_plane ? 2 : 1;
+
+	uint8_t weights[64];
+	for (int i = 0; i < weight_count; i++)
+	{
+		for (int plane = 0; plane < plane_count; plane++)
+		{
+			float uqw = static_cast<float>(scb.weights[i + plane * WEIGHTS_PLANE2_OFFSET]);
+			float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
+			int qwi = static_cast<int>(qw + 0.5f);
+			weights[plane_count * i + plane] = qat.scramble_map[qwi];
+		}
+	}
+
+	// The weights are encoded as an ordinary integer sequence into a scratch buffer, and then
+	// copied into the block with both the byte order and the bit order reversed
+	uint8_t weightbuf[16] { 0 };
+	encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
+
+	for (int i = 15; i >= 0; i--)
+	{
+		pcb[15 - i] = static_cast<uint8_t>(bitrev8(weightbuf[i]));
+	}
+
+	// Block header: block mode and partition count
+	unsigned int partition_count = scb.partition_count;
+	write_bits(scb.block_mode, 11, 0, pcb);
+	write_bits(partition_count - 1, 2, 11, pcb);
+
+	// Tracks the first bit below the weights that has not been claimed by another field
+	int below_weights_pos = 128 - bits_for_weights;
+
+	if (partition_count <= 1)
+	{
+		// Single partition: just the one color endpoint format
+		write_bits(scb.color_formats[0], 4, 13, pcb);
+	}
+	else
+	{
+		// Multiple partitions: partition index, written in two pieces, then the formats
+		write_bits(scb.partition_index, 6, 13, pcb);
+		write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb);
+
+		if (scb.color_formats_matched)
+		{
+			// All partitions share one format; the two low bits of the field stay zero
+			write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb);
+		}
+		else
+		{
+			// Find the lowest endpoint class in use; each partition then stores one bit
+			// selecting that class or the next one up
+			int low_class = 4;
+			for (unsigned int i = 0; i < partition_count; i++)
+			{
+				int class_of_format = scb.color_formats[i] >> 2;
+				low_class = astc::min(class_of_format, low_class);
+			}
+
+			if (low_class == 3)
+			{
+				low_class = 2;
+			}
+
+			// Layout from bit 0: base class + 1 (2 bits), one class bit per partition, then
+			// two format low bits per partition
+			int encoded_type = low_class + 1;
+			for (unsigned int i = 0; i < partition_count; i++)
+			{
+				int format = scb.color_formats[i];
+				encoded_type |= ((format >> 2) - low_class) << (2 + i);
+				encoded_type |= (format & 3) << (2 + partition_count + 2 * i);
+			}
+
+			// The low 6 bits sit after the partition index; the rest sits just below the weights
+			int encoded_type_highpart_size = (3 * partition_count) - 4;
+			int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
+			write_bits(encoded_type & 0x3F, 6, 13 + PARTITION_INDEX_BITS, pcb);
+			write_bits(encoded_type >> 6, encoded_type_highpart_size, encoded_type_highpart_pos, pcb);
+			below_weights_pos -= encoded_type_highpart_size;
+		}
+	}
+
+	// In dual-plane mode, encode the color component of the second plane of weights
+	if (is_dual_plane)
+	{
+		write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb);
+	}
+
+	// Gather the packed color endpoint values for all partitions into one sequence
+	uint8_t values_to_encode[32];
+	int valuecount_to_encode = 0;
+
+	const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
+	for (unsigned int i = 0; i < scb.partition_count; i++)
+	{
+		// The endpoint class determines how many integers this partition stores
+		int vals = 2 * (scb.color_formats[i] >> 2) + 2;
+		assert(vals <= 8);
+
+		uint8_t* dst = values_to_encode + valuecount_to_encode;
+		for (int j = 0; j < vals; j++)
+		{
+			dst[j] = pack_table[scb.color_values[i][j]];
+		}
+
+		valuecount_to_encode += vals;
+	}
+
+	// The color data starts directly after the header fields
+	encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb,
+	           scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
+}
+
+#endif
+
+/* See header for documentation. */
+// Decodes a 16 byte physical block into its symbolic form. On any validation failure
+// scb.block_type is set to SYM_BTYPE_ERROR and the function returns immediately; fields of scb
+// written before that point are left as they are.
+void physical_to_symbolic(
+	const block_size_descriptor& bsd,
+	const uint8_t pcb[16],
+	symbolic_compressed_block& scb
+) {
+	uint8_t bswapped[16];
+
+	scb.block_type = SYM_BTYPE_NONCONST;
+
+	// Extract header fields
+	int block_mode = read_bits(11, 0, pcb);
+	if ((block_mode & 0x1FF) == 0x1FC)
+	{
+		// Constant color block
+
+		// Check what format the data has
+		if (block_mode & 0x200)
+		{
+			scb.block_type = SYM_BTYPE_CONST_F16;
+		}
+		else
+		{
+			scb.block_type = SYM_BTYPE_CONST_U16;
+		}
+
+		// The four color components are stored as 16-bit values, low byte first
+		scb.partition_count = 0;
+		for (int i = 0; i < 4; i++)
+		{
+			scb.constant_color[i] = pcb[2 * i + 8] | (pcb[2 * i + 9] << 8);
+		}
+
+		// Additionally, check that the void-extent coordinates are valid: every low bound
+		// must be below its high bound, unless all of the coordinates are all-ones
+		if (bsd.zdim == 1)
+		{
+			// 2D void-extent
+			int rsvbits = read_bits(2, 10, pcb);
+			if (rsvbits != 3)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+
+			// Low values span 3 bytes so need two read_bits calls
+			int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
+			int vx_high_s = read_bits(13, 25, pcb);
+			int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
+			int vx_high_t = read_bits(13, 51, pcb);
+
+			int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF &&
+			               vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+		}
+		else
+		{
+			// 3D void-extent
+			int vx_low_s = read_bits(9, 10, pcb);
+			int vx_high_s = read_bits(9, 19, pcb);
+			int vx_low_t = read_bits(9, 28, pcb);
+			int vx_high_t = read_bits(9, 37, pcb);
+			int vx_low_r = read_bits(9, 46, pcb);
+			int vx_high_r = read_bits(9, 55, pcb);
+
+			int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF &&
+			               vx_low_t == 0x1FF && vx_high_t == 0x1FF &&
+			               vx_low_r == 0x1FF && vx_high_r == 0x1FF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_r >= vx_high_r) && !all_ones)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+		}
+
+		return;
+	}
+
+	// Reject block modes that this block size descriptor marks as bad
+	unsigned int packed_index = bsd.block_mode_packed_index[block_mode];
+	if (packed_index == BLOCK_BAD_BLOCK_MODE)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	const auto& bm = bsd.get_block_mode(block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	int weight_count = di.weight_count;
+	promise(weight_count > 0);
+
+	quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
+	int is_dual_plane = bm.is_dual_plane;
+
+	// Dual-plane blocks store two weights per grid position
+	int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
+
+	int partition_count = read_bits(2, 11, pcb) + 1;
+	promise(partition_count > 0);
+
+	scb.block_mode = static_cast<uint16_t>(block_mode);
+	scb.partition_count = static_cast<uint8_t>(partition_count);
+
+	// Reverse both the byte order and the bit order of the block, so that the weights can be
+	// decoded as an ordinary integer sequence starting from bit 0
+	for (int i = 0; i < 16; i++)
+	{
+		bswapped[i] = static_cast<uint8_t>(bitrev8(pcb[15 - i]));
+	}
+
+	int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
+
+	// Tracks the first bit below the weights that has not been claimed by another field
+	int below_weights_pos = 128 - bits_for_weights;
+
+	uint8_t indices[64];
+	const auto& qat = quant_and_xfer_tables[weight_quant_method];
+
+	decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
+
+	if (is_dual_plane)
+	{
+		// The two planes are interleaved in the decoded sequence, plane 1 first in each pair
+		for (int i = 0; i < weight_count; i++)
+		{
+			scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]];
+			scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]];
+		}
+	}
+	else
+	{
+		for (int i = 0; i < weight_count; i++)
+		{
+			scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]];
+		}
+	}
+
+	// Dual-plane blocks with four partitions are rejected as invalid
+	if (is_dual_plane && partition_count == 4)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	scb.color_formats_matched = 0;
+
+	// Determine the format of each endpoint pair
+	int color_formats[BLOCK_MAX_PARTITIONS];
+	int encoded_type_highpart_size = 0;
+	if (partition_count == 1)
+	{
+		color_formats[0] = read_bits(4, 13, pcb);
+		scb.partition_index = 0;
+	}
+	else
+	{
+		// The format field is split: 6 bits after the partition index, and the remainder
+		// directly below the weights
+		encoded_type_highpart_size = (3 * partition_count) - 4;
+		below_weights_pos -= encoded_type_highpart_size;
+		int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb) |
+		                  (read_bits(encoded_type_highpart_size, below_weights_pos, pcb) << 6);
+		int baseclass = encoded_type & 0x3;
+		if (baseclass == 0)
+		{
+			// All partitions share a single format, stored in the next four bits
+			for (int i = 0; i < partition_count; i++)
+			{
+				color_formats[i] = (encoded_type >> 2) & 0xF;
+			}
+
+			// No high part is stored in this case, so give those bits back
+			below_weights_pos += encoded_type_highpart_size;
+			scb.color_formats_matched = 1;
+			encoded_type_highpart_size = 0;
+		}
+		else
+		{
+			int bitpos = 2;
+			baseclass--;
+
+			// One bit per partition selects the base class or the class above it
+			for (int i = 0; i < partition_count; i++)
+			{
+				color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2;
+				bitpos++;
+			}
+
+			// Two further bits per partition give the low bits of the format
+			for (int i = 0; i < partition_count; i++)
+			{
+				color_formats[i] |= (encoded_type >> bitpos) & 3;
+				bitpos += 2;
+			}
+		}
+		scb.partition_index = static_cast<uint16_t>(read_bits(10, 13, pcb));
+	}
+
+	for (int i = 0; i < partition_count; i++)
+	{
+		scb.color_formats[i] = static_cast<uint8_t>(color_formats[i]);
+	}
+
+	// Determine number of color endpoint integers
+	int color_integer_count = 0;
+	for (int i = 0; i < partition_count; i++)
+	{
+		int endpoint_class = color_formats[i] >> 2;
+		color_integer_count += (endpoint_class + 1) * 2;
+	}
+
+	if (color_integer_count > 18)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	// Determine the color endpoint format to use
+	// The bit budget for colors is what remains after the header (indexed by partition
+	// count), the weights, any format high part and the dual-plane component selector
+	static const int color_bits_arr[5] { -1, 115 - 4, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS };
+	int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size;
+	if (is_dual_plane)
+	{
+		color_bits -= 2;
+	}
+
+	if (color_bits < 0)
+	{
+		color_bits = 0;
+	}
+
+	int color_quant_level = quant_mode_table[color_integer_count >> 1][color_bits];
+	if (color_quant_level < QUANT_6)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	// Unpack the integer color values and assign to endpoints
+	scb.quant_mode = static_cast<quant_method>(color_quant_level);
+
+	uint8_t values_to_decode[32];
+	decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb,
+	           values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
+
+	int valuecount_to_decode = 0;
+	const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
+	for (int i = 0; i < partition_count; i++)
+	{
+		// The endpoint class determines how many integers this partition stores
+		int vals = 2 * (color_formats[i] >> 2) + 2;
+		for (int j = 0; j < vals; j++)
+		{
+			scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
+		}
+		valuecount_to_decode += vals;
+	}
+
+	// Fetch component for second-plane in the case of dual plane of weights.
+	scb.plane2_component = -1;
+	if (is_dual_plane)
+	{
+		scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb));
+	}
+}
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2025 Arm Limited
+// Copyright 2008 Jose Fonseca
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/*
+ * This module implements vector support for floats, ints, and vector lane
+ * control masks. It provides access to both explicit vector width types, and
+ * flexible N-wide types where N can be determined at compile time.
+ *
+ * The design of this module encourages use of vector length agnostic code, via
+ * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
+ * width that is available at compile time. The current vector width is
+ * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
+ *
+ * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
+ * These are provided primarily for prototyping and algorithm debug of VLA
+ * implementations.
+ *
+ * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
+ * types. These are provided for use by VLA code, but are also expected to be
+ * used as a fixed-width type and are supported by a reference C++ fallback for
+ * use on platforms without SIMD intrinsics.
+ *
+ * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
+ * types. These are provided for use by VLA code, and are not expected to be
+ * used as a fixed-width type in normal code. No reference C implementation is
+ * provided on platforms without underlying SIMD intrinsics.
+ *
+ * With the current implementation ISA support is provided for:
+ *
+ *     * 1-wide for scalar reference
+ *     * 4-wide for Armv8-A NEON
+ *     * 4-wide for x86-64 SSE2
+ *     * 4-wide for x86-64 SSE4.1
+ *     * 8-wide for Armv8-A SVE
+ *     * 8-wide for x86-64 AVX2
+ */
+
+#ifndef ASTC_VECMATHLIB_H_INCLUDED
+#define ASTC_VECMATHLIB_H_INCLUDED
+
+#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
+	#include <immintrin.h>
+#endif
+
+#if ASTCENC_SVE != 0
+	#include <arm_sve.h>
+	#include <arm_neon_sve_bridge.h>
+#endif
+
+#if ASTCENC_NEON != 0
+	#include <arm_neon.h>
+#endif
+
+#if !defined(__clang__) && defined(_MSC_VER)
+	#define ASTCENC_SIMD_INLINE __forceinline
+	#define ASTCENC_NO_INLINE
+#elif defined(__GNUC__) && !defined(__clang__)
+	#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
+	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
+#else
+	#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
+	#define ASTCENC_NO_INLINE __attribute__ ((noinline))
+#endif
+
+// Forward declaration of the byte-indexed float gather template; no definition
+// is provided in this header.
+// NOTE(review): presumably specialized per vector type by the ISA-specific
+// headers included below - confirm.
+template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
+
+#if ASTCENC_AVX >= 2
+	// If we have AVX2 expose 8-wide VLA.
+	#include "astcenc_vecmathlib_sse_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+	#include "astcenc_vecmathlib_avx2_8.h"
+
+	#define ASTCENC_SIMD_WIDTH 8
+
+	using vfloat = vfloat8;
+
+	#if defined(ASTCENC_NO_INVARIANCE)
+		using vfloatacc = vfloat8;
+	#else
+		using vfloatacc = vfloat4;
+	#endif
+
+	using vint = vint8;
+	using vmask = vmask8;
+
+	using vtable_16x8 = vtable8_16x8;
+	using vtable_32x8 = vtable8_32x8;
+	using vtable_64x8 = vtable8_64x8;
+
+	constexpr auto loada = vfloat8::loada;
+	constexpr auto load1 = vfloat8::load1;
+	constexpr auto vint_from_size = vint8_from_size;
+
+#elif ASTCENC_SSE >= 20
+	// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
+	#include "astcenc_vecmathlib_sse_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vfloatacc = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	using vtable_16x8 = vtable4_16x8;
+	using vtable_32x8 = vtable4_32x8;
+	using vtable_64x8 = vtable4_64x8;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
+
+#elif ASTCENC_SVE == 8
+	// Check the compiler is configured with fixed-length 256-bit SVE.
+	#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
+		#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
+	#endif
+
+	// If we have SVE configured as 8-wide, expose 8-wide VLA.
+	#include "astcenc_vecmathlib_neon_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+	#include "astcenc_vecmathlib_sve_8.h"
+
+	#define ASTCENC_SIMD_WIDTH 8
+
+	using vfloat = vfloat8;
+
+	#if defined(ASTCENC_NO_INVARIANCE)
+		using vfloatacc = vfloat8;
+	#else
+		using vfloatacc = vfloat4;
+	#endif
+
+	using vint = vint8;
+	using vmask = vmask8;
+
+	using vtable_16x8 = vtable8_16x8;
+	using vtable_32x8 = vtable8_32x8;
+	using vtable_64x8 = vtable8_64x8;
+
+	constexpr auto loada = vfloat8::loada;
+	constexpr auto load1 = vfloat8::load1;
+	constexpr auto vint_from_size = vint8_from_size;
+
+#elif ASTCENC_NEON > 0
+	// If we have NEON expose 4-wide VLA.
+	#include "astcenc_vecmathlib_neon_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vfloatacc = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	using vtable_16x8 = vtable4_16x8;
+	using vtable_32x8 = vtable4_32x8;
+	using vtable_64x8 = vtable4_64x8;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
+
+#else
+	// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
+
+	// Note: We no longer expose the 1-wide scalar fallback because it is not
+	// invariant with the 4-wide path due to algorithms that use horizontal
+	// operations that accumulate a local vector sum before accumulating into
+	// a running sum.
+	//
+	// For 4 items adding into an accumulator using 1-wide vectors the sum is:
+	//
+	//     result = ((((sum + l0) + l1) + l2) + l3)
+	//
+    // ... whereas the accumulator for a 4-wide vector sum is:
+	//
+	//     result = sum + ((l0 + l2) + (l1 + l3))
+	//
+	// In "normal maths" this is the same, but the floating point reassociation
+	// differences mean that these will not produce the same result.
+
+	#include "astcenc_vecmathlib_none_4.h"
+	#include "astcenc_vecmathlib_common_4.h"
+
+	#define ASTCENC_SIMD_WIDTH 4
+
+	using vfloat = vfloat4;
+	using vfloatacc = vfloat4;
+	using vint = vint4;
+	using vmask = vmask4;
+
+	using vtable_16x8 = vtable4_16x8;
+	using vtable_32x8 = vtable4_32x8;
+	using vtable_64x8 = vtable4_64x8;
+
+	constexpr auto loada = vfloat4::loada;
+	constexpr auto load1 = vfloat4::load1;
+	constexpr auto vint_from_size = vint4_from_size;
+#endif
+
+/**
+ * @brief Round a count down to a multiple of the SIMD width.
+ *
+ * Relies on the vector width being a power of two, so that the rounding can
+ * be done by masking off the low bits.
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The largest multiple of the SIMD width that is not above @c count.
+ */
+ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
+{
+	// The inverted mask is a negative int, so widening keeps the upper bits set
+	const size_t keep_mask = static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
+	return count & keep_mask;
+}
+
+/**
+ * @brief Round a count up to a multiple of the SIMD width.
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The smallest multiple of the SIMD width that is not below @c count.
+ */
+ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
+{
+	// Bias by (width - 1), then strip the remainder to land on a multiple
+	const size_t biased = count + ASTCENC_SIMD_WIDTH - 1;
+	return biased - (biased % ASTCENC_SIMD_WIDTH);
+}
+
+/**
+ * @brief Return @c a with the sign flipped in lanes where @c b has its sign bit set.
+ */
+ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
+{
+	// Isolate bit 31 (the float sign bit) of b, then XOR it into the bits of a
+	vint b_sign = float_as_int(b) & vint(static_cast<int>(0x80000000));
+	return int_as_float(float_as_int(a) ^ b_sign);
+}
+
+/**
+ * @brief Return fast, but approximate, vector atan(x).
+ *
+ * Lanes with |x| > 1 are evaluated on 1/x and reflected about +/- PI/2.
+ *
+ * Max error of this implementation is 0.004883.
+ */
+ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
+{
+	vfloat one(1.0f);
+	vmask is_large = abs(x) > one;
+
+	// PI/2 carrying the sign of x, used to reflect the large-magnitude lanes
+	vfloat half_pi = change_sign(vfloat(astc::PI_OVER_TWO), x);
+
+	// Rational approximation evaluated on x, or on 1/x for large lanes
+	vfloat t = select(x, one / x, is_large);
+	vfloat approx = t / (t * t * vfloat(0.28f) + one);
+
+	return select(approx, half_pi - approx, is_large);
+}
+
+/**
+ * @brief Return fast, but approximate, vector atan2(y, x).
+ */
+ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
+{
+	// Angle of the ratio magnitude
+	vfloat angle = atan(abs(y / x));
+
+	// Reflect about PI for lanes with negative x, then apply the sign of y
+	vmask x_negative = x < vfloat::zero();
+	vfloat reflected = select(angle, vfloat(astc::PI) - angle, x_negative);
+	return change_sign(reflected, y);
+}
+
+/**
+ * @brief Factory that returns a unit length 4 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit4()
+{
+	// 4 * (0.5 * 0.5) == 1.0, so a vector of four 0.5 lanes has unit length
+	const float val = 0.5f;
+	return vfloat4(val);
+}
+
+/**
+ * @brief Factory that returns a unit length 3 component vfloat4.
+ *
+ * The fourth lane is set to zero.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit3()
+{
+	// 1 / sqrt(3)
+	const float inv_sqrt3 = 0.577350258827209473f;
+	return vfloat4(inv_sqrt3, inv_sqrt3, inv_sqrt3, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a unit length 2 component vfloat4.
+ *
+ * The third and fourth lanes are set to zero.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit2()
+{
+	// 1 / sqrt(2)
+	const float inv_sqrt2 = 0.707106769084930420f;
+	return vfloat4(inv_sqrt2, inv_sqrt2, 0.0f, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a 3 component vfloat4, with lane 3 set to zero.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
+{
+	vfloat4 result(a, b, c, 0.0f);
+	return result;
+}
+
+/**
+ * @brief Factory that returns a 2 component vfloat4, with lanes 2 and 3 set to zero.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
+{
+	vfloat4 result(a, b, 0.0f, 0.0f);
+	return result;
+}
+
+/**
+ * @brief Normalize a non-zero length vector to unit length.
+ *
+ * No zero-length check is made; use normalize_safe() if the input may be zero.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
+{
+	vfloat4 length_sq = dot(a, a);
+	vfloat4 length = sqrt(length_sq);
+	return a / length;
+}
+
+/**
+ * @brief Normalize a vector, returning @c safe if its length is zero.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
+{
+	vfloat4 length_sq = dot(a, a);
+
+	// Only an exactly-zero squared length takes the fallback
+	if (length_sq.lane<0>() == 0.0f)
+	{
+		return safe;
+	}
+
+	return a / sqrt(length_sq);
+}
+
+
+
+// Polynomial evaluation helpers using Horner's method. POLYn(x, c0, ..., cn)
+// expands to:
+//
+//     c0 + c1 * x + c2 * x^2 + ... + cn * x^n
+//
+// Every argument is parenthesized in the expansion so that compound
+// expressions (e.g. "a + b") can be passed safely. Note that x is expanded
+// multiple times, so it must not have side effects.
+#define POLY0(x, c0)                     (c0)
+#define POLY1(x, c0, c1)                 ((POLY0(x, c1) * (x))                 + (c0))
+#define POLY2(x, c0, c1, c2)             ((POLY1(x, c1, c2) * (x))             + (c0))
+#define POLY3(x, c0, c1, c2, c3)         ((POLY2(x, c1, c2, c3) * (x))         + (c0))
+#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3(x, c1, c2, c3, c4) * (x))     + (c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * (x)) + (c0))
+
+/**
+ * @brief Compute an approximate exp2(x) for each lane in the vector.
+ *
+ * The input is clamped, then split into an integer part, which is applied
+ * directly through the float exponent bits, and a fractional part, which is
+ * evaluated with a polynomial.
+ *
+ * Based on 5th degree minimax polynomials, ported from this blog
+ * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+ */
+static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
+{
+	x = clamp(-126.99999f, 129.0f, x);
+
+	vint4 whole = float_to_int(x - 0.5f);
+	vfloat4 frac = x - int_to_float(whole);
+
+	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5),
+	// evaluated with Horner's method from the highest order coefficient down
+	vfloat4 frac_exp = 1.8775767e-3f * frac;
+	frac_exp = (frac_exp + 8.9893397e-3f) * frac;
+	frac_exp = (frac_exp + 5.5826318e-2f) * frac;
+	frac_exp = (frac_exp + 2.4015361e-1f) * frac;
+	frac_exp = (frac_exp + 6.9315308e-1f) * frac;
+	frac_exp = frac_exp + 9.9999994e-1f;
+
+	// Integer contrib, built by writing the biased value into the exponent bits
+	vfloat4 whole_exp = int_as_float(lsl<23>(whole + 127));
+
+	return whole_exp * frac_exp;
+}
+
+/**
+ * @brief Compute an approximate log2(x) for each lane in the vector.
+ *
+ * The float is split into its exponent field, which gives the integer part
+ * of the result, and its mantissa remapped into [1, 2), which is evaluated
+ * with a polynomial.
+ *
+ * Based on 5th degree minimax polynomials, ported from this blog
+ * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+ */
+static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
+{
+	vint4 bits = float_as_int(x);
+
+	// Unbiased exponent field, as a float
+	vint4 exp_field = bits & vint4(0x7F800000);
+	vfloat4 e = int_to_float(lsr<23>(exp_field) - 127);
+
+	// Mantissa bits with the exponent forced to that of 1.0f
+	vint4 mant_field = bits & vint4(0x007FFFFF);
+	vfloat4 m = int_as_float(mant_field | vint4(0x3F800000));
+
+	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
+	vfloat4 p = POLY4(m,
+	                  2.8882704548164776201f,
+	                 -2.52074962577807006663f,
+	                  1.48116647521213171641f,
+	                 -0.465725644288844778798f,
+	                  0.0596515482674574969533f);
+
+	// Increases the polynomial degree, but ensures that log2(1) == 0
+	vfloat4 scaled = p * (m - 1.0f);
+
+	return scaled + e;
+}
+
+/**
+ * @brief Compute an approximate pow(x, y) for each lane in the vector.
+ *
+ * Power function based on the exp2(log2(x) * y) transform, with lanes where
+ * @c y is zero forced to return exactly 1.0f.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
+{
+	vfloat4 approx = exp2(log2(x) * y);
+
+	// Guarantee that y == 0 returns exactly 1.0f
+	vmask4 y_is_zero = y == vfloat4(0.0f);
+	return select(approx, vfloat4(1.0f), y_is_zero);
+}
+
+/**
+ * @brief Count the leading zeros for each lane in @c a.
+ *
+ * Valid for all data values of @c a; will return a per-lane value [0, 32].
+ */
+static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
+{
+	// This function abuses floating point exponents: converting the integer
+	// to a float encodes the position of its top set bit in the exponent.
+
+	// Convert to float without risk of rounding up by keeping only top 8 bits.
+	// This trick is guaranteed to keep top 8 bits and clear the 9th.
+	vint4 truncated = (~lsr<8>(a)) & a;
+	vint4 float_bits = float_as_int(int_to_float(truncated));
+
+	// Extract and unbias exponent
+	vint4 zeros = vint4(127 + 31) - lsr<23>(float_bits);
+
+	// Clamp result to a valid 32-bit range
+	return clamp(0, 32, zeros);
+}
+
+/**
+ * @brief Return lanewise 2^a for each lane in @c a.
+ *
+ * Use of signed int means that every lane must be less than 31; this is
+ * checked by an assert.
+ */
+static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
+{
+	// 2^30 is the largest power of two that a signed 32-bit int can represent
+	assert(all(a < vint4(31)));
+
+	// This function abuses floating point: write the biased value into the
+	// exponent bits to build the float 2^a, then convert that float to an int.
+	vint4 exp_bits = lsl<23>(a + 127);
+	return float_to_int(int_as_float(exp_bits));
+}
+
+/**
+ * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
+ *
+ * Two input ranges are special-cased: lanes equal to 0xFFFF return the
+ * constant 0x3C00, and lanes less than 4 return the input shifted left by 8.
+ * All other lanes are converted by normalizing the value using its leading
+ * zero count.
+ *
+ * @param p   The unorm16 values, one per lane.
+ *
+ * @return The float16 bit patterns, one per lane.
+ */
+static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
+{
+	// Results for the two special-cased input ranges
+	vint4 fp16_one = vint4(0x3C00);
+	vint4 fp16_small = lsl<8>(p);
+
+	vmask4 is_one = p == vint4(0xFFFF);
+	vmask4 is_small = p < vint4(4);
+
+	// Manually inline clz() on Visual Studio to avoid release build codegen bug
+	// see https://github.com/ARM-software/astc-encoder/issues/259
+#if !defined(__clang__) && defined(_MSC_VER)
+	vint4 a = (~lsr<8>(p)) & p;
+	a = float_as_int(int_to_float(a));
+	a = vint4(127 + 31) - lsr<23>(a);
+	vint4 lz = clamp(0, 32, a) - 16;
+#else
+	// Leading zero count relative to a 16-bit value
+	vint4 lz = clz(p) - 16;
+#endif
+
+	// Shift the value left by (lz + 1) and keep the low 16 bits
+	p = p * two_to_the_n(lz + 1);
+	p = p & vint4(0xFFFF);
+
+	// Keep the top 10 of those 16 bits
+	p = lsr<6>(p);
+
+	// Merge in (14 - lz) starting at bit 10
+	p = p | lsl<10>(vint4(14) - lz);
+
+	// Apply the special cases; is_small is applied last so it takes priority
+	vint4 r = select(p, fp16_one, is_one);
+	r = select(r, fp16_small, is_small);
+	return r;
+}
+
+/**
+ * @brief Convert 16-bit LNS to float16.
+ *
+ * The low 11 bits are remapped by a three-segment piecewise linear function
+ * and the remaining upper bits are used as the exponent. The result is
+ * clamped to a maximum of 0x7BFF.
+ */
+static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
+{
+	vint4 mantissa = p & 0x7FF;
+	vint4 exponent = lsr<11>(p);
+
+	// Start from the upper segment, then override for the lower two segments
+	vint4 mapped = mantissa * 5 - 2048;
+	mapped = select(mapped, mantissa * 4 - 512, mantissa < vint4(1536));
+	mapped = select(mapped, mantissa * 3, mantissa < vint4(512));
+
+	vint4 packed = lsl<10>(exponent) | lsr<3>(mapped);
+	return min(packed, vint4(0x7BFF));
+}
+
+/**
+ * @brief Extract mantissa and exponent of a float value.
+ *
+ * The split is done purely with bit operations on the float representation.
+ *
+ * @param      a      The input value.
+ * @param[out] exp    The output exponent.
+ *
+ * @return The mantissa.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
+{
+	// Interpret the bits as an integer
+	vint4 bits = float_as_int(a);
+
+	// Extract the 8-bit exponent field and remove the bias
+	exp = (lsr<23>(bits) & 0xFF) - 126;
+
+	// Keep the sign and mantissa bits, and force the exponent field to 126
+	vint4 sign_and_mant = bits & static_cast<int>(0x807FFFFF);
+	return int_as_float(sign_and_mant | 0x3F000000);
+}
+
+/**
+ * @brief Convert float to 16-bit LNS.
+ *
+ * The result is returned as a float vector holding the LNS value; no
+ * conversion to integer is made here. Lanes that are not greater than 2^-26
+ * return zero, and lanes of 65536.0f or above return 65535.0f.
+ *
+ * @param a   The input values.
+ *
+ * @return The LNS values, stored as floats.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
+{
+	vint4 exp;
+	vfloat4 mant = frexp(a, exp);
+
+	// Do these early before we start messing about ...
+	// The underflow mask is the inverse of a greater-than compare
+	// NOTE(review): the mask name implies NaN lanes fail the compare and so
+	// are caught here too - confirm against the vector compare semantics
+	vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
+	vmask4 mask_infinity = a >= vfloat4(65536.0f);
+
+	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
+	vmask4 exp_lt_m13 = exp < vint4(-13);
+
+	// Small input path: scaled value with a zero exponent
+	vfloat4 a1a = a * 33554432.0f;
+	vint4 expa = vint4::zero();
+
+	// Normal input path: mantissa rescaled from [0.5, 1) and biased exponent
+	vfloat4 a1b = (mant - 0.5f) * 4096;
+	vint4 expb = exp + 14;
+
+	a = select(a1b, a1a, exp_lt_m13);
+	exp = select(expb, expa, exp_lt_m13);
+
+	// Segment selection for the three-piece linear mapping below
+	vmask4 a_lt_384 = a < vfloat4(384.0f);
+	vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
+
+	vfloat4 a2a = a * (4.0f / 3.0f);
+	vfloat4 a2b = a + 128.0f;
+	vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
+
+	// Start from the upper segment, then override for the lower two segments
+	a = a2c;
+	a = select(a, a2b, a_lt_1408);
+	a = select(a, a2a, a_lt_384);
+
+	// Combine with the exponent, which is weighted by 2048, and add one
+	a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
+
+	// Apply the range limits; the underflow mask is applied last
+	a = select(a, vfloat4(65535.0f), mask_infinity);
+	a = select(a, vfloat4::zero(), mask_underflow_nan);
+
+	return a;
+}
+
+namespace astc
+{
+
+/**
+ * @brief Compute an approximate scalar pow(x, y).
+ *
+ * Implemented by calling the vfloat4 pow() and returning lane 0 of the result.
+ */
+static ASTCENC_SIMD_INLINE float pow(float x, float y)
+{
+	vfloat4 result = pow(vfloat4(x), vfloat4(y));
+	return result.lane<0>();
+}
+
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2025 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Generic 4x32-bit vector functions.
+ *
+ * This module implements generic 4-wide vector functions that are valid for
+ * all instruction sets, typically implemented using lower level 4-wide
+ * operations that are ISA-specific.
+ */
+
+#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
+#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+#include <limits>
+
+// ============================================================================
+// vint4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by scalar addition.
+ *
+ * The scalar is converted with the vint4(int) constructor before adding.
+ */
+ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
+{
+	vint4 rhs(b);
+	return a + rhs;
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ *
+ * @return A reference to the updated @c a.
+ */
+ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
+{
+	return a = a + b;
+}
+
+/**
+ * @brief Overload: vector by scalar subtraction.
+ *
+ * The scalar is converted with the vint4(int) constructor before subtracting.
+ */
+ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
+{
+	vint4 rhs(b);
+	return a - rhs;
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ *
+ * The scalar is converted with the vint4(int) constructor before multiplying.
+ */
+ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
+{
+	vint4 rhs(b);
+	return a * rhs;
+}
+
+/**
+ * @brief Overload: vector by scalar bitwise or.
+ *
+ * The scalar is converted with the vint4(int) constructor before combining.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
+{
+	vint4 rhs(b);
+	return a | rhs;
+}
+
+/**
+ * @brief Overload: vector by scalar bitwise and.
+ *
+ * The scalar is converted with the vint4(int) constructor before combining.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
+{
+	vint4 rhs(b);
+	return a & rhs;
+}
+
+/**
+ * @brief Overload: vector by scalar bitwise xor.
+ *
+ * The scalar is converted with the vint4(int) constructor before combining.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
+{
+	vint4 rhs(b);
+	return a ^ rhs;
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ *
+ * The lower bound is applied first, then the upper bound.
+ */
+ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
+{
+	vint4 raised = max(a, vint4(minv));
+	return min(raised, vint4(maxv));
+}
+
+/**
+ * @brief Return the horizontal sum of RGB vector lanes as a scalar.
+ *
+ * Only lanes 0, 1, and 2 contribute; lane 3 is ignored.
+ */
+ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
+{
+	int r = a.lane<0>();
+	int g = a.lane<1>();
+	int b = a.lane<2>();
+	return r + g + b;
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
+{
+	vint4 reduced = hmin(a);
+	return reduced.lane<0>();
+}
+
+/**
+ * @brief Generate a vint4 from a size_t.
+ *
+ * The value must fit in an int; this is checked by an assert.
+ */
+ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
+{
+	const size_t int_max = static_cast<size_t>(std::numeric_limits<int>::max());
+	assert(a <= int_max);
+	return vint4(static_cast<int>(a));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
+{
+	vint4 reduced = hmax(a);
+	return reduced.lane<0>();
+}
+
+// ============================================================================
+// vfloat4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ *
+ * @return A reference to the updated @c a.
+ */
+ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
+{
+	return a = a + b;
+}
+
+/**
+ * @brief Overload: vector by scalar addition.
+ *
+ * The scalar is converted with the vfloat4(float) constructor before adding.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
+{
+	vfloat4 rhs(b);
+	return a + rhs;
+}
+
+/**
+ * @brief Overload: vector by scalar subtraction.
+ *
+ * The scalar is converted with the vfloat4(float) constructor before
+ * subtracting.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
+{
+	vfloat4 rhs(b);
+	return a - rhs;
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ *
+ * The scalar is converted with the vfloat4(float) constructor before
+ * multiplying.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
+{
+	vfloat4 rhs(b);
+	return a * rhs;
+}
+
+/**
+ * @brief Overload: scalar by vector multiplication.
+ *
+ * The scalar is converted with the vfloat4(float) constructor before
+ * multiplying.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
+{
+	vfloat4 lhs(a);
+	return lhs * b;
+}
+
+/**
+ * @brief Overload: vector by scalar division.
+ *
+ * The scalar is converted with the vfloat4(float) constructor before dividing.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
+{
+	vfloat4 divisor(b);
+	return a / divisor;
+}
+
+/**
+ * @brief Overload: scalar by vector division.
+ *
+ * The scalar is converted with the vfloat4(float) constructor before dividing.
+ */
+ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
+{
+	vfloat4 dividend(a);
+	return dividend / b;
+}
+
+/**
+ * @brief Return the min vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
+{
+	vfloat4 rhs(b);
+	return min(a, rhs);
+}
+
+/**
+ * @brief Return the max vector of a vector and a scalar.
+ *
+ * If either lane value is NaN, @c b will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
+{
+	vfloat4 rhs(b);
+	return max(a, rhs);
+}
+
+/**
+ * @brief Return the clamped value between min and max.
+ *
+ * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
+ * then @c min will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	vfloat4 raised = max(a, minv);
+	return min(raised, maxv);
+}
+
+/**
+ * @brief Return the clamped value between 0.0f and 1.0f.
+ *
+ * If @c a is NaN then zero will be returned for that lane.
+ */
+ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
+{
+	// Do not reorder - second operand will return if either is NaN
+	vfloat4 non_negative = max(a, vfloat4::zero());
+	return min(non_negative, 1.0f);
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
+{
+	vfloat4 reduced = hmin(a);
+	return reduced.lane<0>();
+}
+
+/**
+ * @brief Return the horizontal min of RGB vector lanes as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
+{
+	// Overwrite lane 3 with a copy of lane 0 so that it cannot change the
+	// result of the 4-lane minimum
+	float r = a.lane<0>();
+	a.set_lane<3>(r);
+	return hmin(a).lane<0>();
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector, as a scalar.
+ */
+ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
+{
+	vfloat4 reduced = hmax(a);
+	return reduced.lane<0>();
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a vector.
+ *
+ * @param[in,out] accum   The running per-lane sum.
+ * @param         a       The values to add.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
+{
+	accum += a;
+}
+
+/**
+ * @brief Accumulate lane-wise sums for a masked vector.
+ *
+ * @param[in,out] accum   The running per-lane sum.
+ * @param         a       The values to add.
+ * @param         m       The lane mask; zero is added for unselected lanes.
+ */
+ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
+{
+	vfloat4 masked = select(vfloat4::zero(), a, m);
+	haccumulate(accum, masked);
+}
+
+/**
+ * @brief Return the horizontal sum of RGB vector lanes as a scalar.
+ *
+ * Only lanes 0, 1, and 2 contribute, summed in that order; lane 3 is ignored.
+ */
+ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
+{
+	float r = a.lane<0>();
+	float g = a.lane<1>();
+	float b = a.lane<2>();
+	return r + g + b;
+}
+
+#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning scalar.
+ */
+ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
+{
+	return hadd_s(a * b);
+}
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning vector.
+ *
+ * The scalar result is passed to the vfloat4(float) constructor.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
+{
+	float sum = hadd_s(a * b);
+	return vfloat4(sum);
+}
+
+/**
+ * @brief Return the dot product for the bottom 3 lanes, returning scalar.
+ */
+ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
+{
+	return hadd_rgb_s(a * b);
+}
+
+/**
+ * @brief Return the dot product for the bottom 3 lanes, returning vector.
+ *
+ * The result is placed in lanes 0, 1, and 2; lane 3 is set to zero.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
+{
+	float sum = hadd_rgb_s(a * b);
+	return vfloat4(sum, sum, sum, 0.0f);
+}
+
+#endif
+
+#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
+
+/**
+ * @brief Population bit count.
+ *
+ * Branch-free parallel reduction: bit counts are accumulated in 2-bit, then
+ * 4-bit, then 8-bit fields, and the byte counts are then summed with a
+ * multiply that gathers the total into the top byte.
+ *
+ * @param v   The value to population count.
+ *
+ * @return The number of 1 bits.
+ */
+static inline int popcount(uint64_t v)
+{
+	// Count of set bits in each 2-bit field
+	const uint64_t pairs = v - ((v >> 1) & 0x5555555555555555ULL);
+
+	// Count of set bits in each 4-bit field
+	const uint64_t nibbles = (pairs & 0x3333333333333333ULL)
+	                       + ((pairs >> 2) & 0x3333333333333333ULL);
+
+	// Count of set bits in each byte
+	const uint64_t bytes = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
+
+	// Sum all bytes into the top byte, then move it down
+	return static_cast<int>((bytes * 0x0101010101010101ULL) >> 56);
+}
+
+#endif
+
+/**
+ * @brief Apply signed bit transfer.
+ *
+ * Bit 7 of @c input0 is moved into bit 7 of @c input1, with the rest of
+ * @c input1 shifted right by one. The remaining bits of @c input0 are shifted
+ * right by one and treated as a 6-bit signed value.
+ *
+ * @param input0   The first encoded endpoint.
+ * @param input1   The second encoded endpoint.
+ */
+static ASTCENC_SIMD_INLINE void bit_transfer_signed(
+	vint4& input0,
+	vint4& input1
+) {
+	// Transfer the top bit of input0 into input1
+	vint4 moved_bit = input0 & 0x80;
+	input1 = lsr<1>(input1) | moved_bit;
+
+	// Keep the 6-bit payload of input0
+	vint4 payload = lsr<1>(input0) & 0x3F;
+
+	// Sign extend from bit 5 by subtracting 64 where it is set
+	vmask4 is_negative = (payload & 0x20) != vint4::zero();
+	input0 = select(payload, payload - 0x40, is_negative);
+}
+
+/**
+ * @brief Debug function to print a vector of ints, in decimal, to stdout.
+ */
+ASTCENC_SIMD_INLINE void print(vint4 a)
+{
+	ASTCENC_ALIGNAS int lanes[4];
+	storea(a, lanes);
+
+	printf("v4_i32:\n  %8d %8d %8d %8d\n",
+	       lanes[0], lanes[1], lanes[2], lanes[3]);
+}
+
+/**
+ * @brief Debug function to print a vector of ints, in hexadecimal, to stdout.
+ */
+ASTCENC_SIMD_INLINE void printx(vint4 a)
+{
+	ASTCENC_ALIGNAS int lanes[4];
+	storea(a, lanes);
+
+	// Copy the bits into unsigned storage to match the %x format specifier
+	unsigned int ulanes[4];
+	std::memcpy(ulanes, lanes, sizeof(ulanes));
+
+	printf("v4_i32:\n  %08x %08x %08x %08x\n",
+	       ulanes[0], ulanes[1], ulanes[2], ulanes[3]);
+}
+
+/**
+ * @brief Debug function to print a vector of floats to stdout.
+ */
+ASTCENC_SIMD_INLINE void print(vfloat4 a)
+{
+	ASTCENC_ALIGNAS float lanes[4];
+	storea(a, lanes);
+
+	// Explicitly widen each lane to double to match the %f format specifier
+	double l0 = static_cast<double>(lanes[0]);
+	double l1 = static_cast<double>(lanes[1]);
+	double l2 = static_cast<double>(lanes[2]);
+	double l3 = static_cast<double>(lanes[3]);
+
+	printf("v4_f32:\n  %0.4f %0.4f %0.4f %0.4f\n",
+	       l0, l1, l2, l3);
+}
+
+/**
+ * @brief Debug function to print a vector of masks.
+ *
+ * Each lane is printed as the integer 1 if the mask lane is set, else 0.
+ */
+ASTCENC_SIMD_INLINE void print(vmask4 a)
+{
+	vint4 as_ints = select(vint4(0), vint4(1), a);
+	print(as_ints);
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for angular-sum algorithm for weight alignment.
+ *
+ * This algorithm works as follows:
+ * - we compute a complex number P as (cos s*i, sin s*i) for each weight,
+ *   where i is the input value and s is a scaling factor based on the spacing between the weights.
+ * - we then add together complex numbers for all the weights.
+ * - we then compute the length and angle of the resulting sum.
+ *
+ * This should produce the following results:
+ * - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
+ * - even distribution results in a vector of length 0.
+ * - all samples identical results in perfect alignment for every scaling.
+ *
+ * For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
+ * should then result in some scalings standing out as having particularly good alignment factors;
+ * we can use this to produce a set of candidate scale/shift values for various quantization levels;
+ * we should then actually try them and see what happens.
+ */
+
+#include "astcenc_internal.h"
+#include "astcenc_vecmathlib.h"
+
+#include <stdio.h>
+#include <cassert>
+#include <cstring>
+#include <cfloat>
+
+// The number of angular steps stored per row of the sin/cos tables, and the
+// length of the per-step working arrays used by the functions below.
+static constexpr unsigned int ANGULAR_STEPS { 32 };
+
+// Per-step arrays are processed one full SIMD vector at a time
+static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
+              "ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
+
+static_assert(ANGULAR_STEPS >= 32,
+              "ANGULAR_STEPS must be at least max(steps_for_quant_level)");
+
+// Store a reduced sin/cos table for 64 possible weight values; this causes
+// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
+static constexpr unsigned int SINCOS_STEPS { 64 };
+
+// The number of quantization steps for each weight quantization level that is
+// considered here, indexed by quantization level.
+static const uint8_t steps_for_quant_level[12] {
+	2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
+};
+
+// Lookup tables indexed as [weight sample index][angular step index]; they are
+// filled in by prepare_angular_tables().
+ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
+ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	// Limits the "unable to find full encoding" diagnostic to a single print
+	static bool print_once { true };
+#endif
+
+/* See header for documentation. */
+void prepare_angular_tables()
+{
+	// Angle between adjacent weight samples for an angular step size of one;
+	// the type is deduced so the arithmetic matches the constants involved
+	const auto sample_angle = 2.0f * astc::PI / (SINCOS_STEPS - 1.0f);
+
+	for (unsigned int step = 0; step < ANGULAR_STEPS; step++)
+	{
+		// Table column "step" stores the values for step size (step + 1)
+		const auto step_angle = sample_angle * static_cast<float>(step + 1);
+
+		for (unsigned int sample = 0; sample < SINCOS_STEPS; sample++)
+		{
+			const auto angle = step_angle * static_cast<float>(sample);
+			sin_table[sample][step] = sinf(angle);
+			cos_table[sample][step] = cosf(angle);
+		}
+	}
+}
+
+/**
+ * @brief Compute the angular alignment factors and offsets.
+ *
+ * Every weight is mapped to a row of the sin/cos tables. For each angular
+ * step the cos and sin entries of all weights are summed, and the angle of
+ * the summed vector, divided by 2 * PI, is stored as the offset for that step.
+ *
+ * Angular steps are processed one SIMD vector at a time, so @c offsets may be
+ * written past @c max_angular_steps up to the next multiple of the SIMD width.
+ *
+ * @param      weight_count              The number of (decimated) weights.
+ * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
+ * @param      max_angular_steps         The maximum number of steps to be tested.
+ * @param[out] offsets                   The output angular offsets array.
+ */
+static void compute_angular_offsets(
+	unsigned int weight_count,
+	const float* dec_weight_ideal_value,
+	unsigned int max_angular_steps,
+	float* offsets
+) {
+	promise(weight_count > 0);
+	promise(max_angular_steps > 0);
+
+	ASTCENC_ALIGNAS int table_rows[BLOCK_MAX_WEIGHTS];
+
+	// Precompute the table row for every weight, a full vector at a time;
+	// arrays are always allocated 64 elements long
+	for (unsigned int base = 0; base < weight_count; base += ASTCENC_SIMD_WIDTH)
+	{
+		// Ideal weight can be outside [0, 1] range, so clamp to fit table
+		vfloat clamped = clampzo(loada(dec_weight_ideal_value + base));
+
+		// Scale the weight to a sincos table index and round to an integer
+		vint rows = float_to_int_rtn(clamped * (SINCOS_STEPS - 1.0f));
+		storea(rows, table_rows + base);
+	}
+
+	// Converts an angle in radians into a fraction of a full turn
+	vfloat inv_two_pi(1.0f / (2.0f * astc::PI));
+
+	// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
+	for (unsigned int step = 0; step < max_angular_steps; step += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat cos_sum = vfloat::zero();
+		vfloat sin_sum = vfloat::zero();
+
+		for (unsigned int w = 0; w < weight_count; w++)
+		{
+			int row = table_rows[w];
+			cos_sum += loada(cos_table[row] + step);
+			sin_sum += loada(sin_table[row] + step);
+		}
+
+		vfloat turn_fraction = atan2(sin_sum, cos_sum) * inv_two_pi;
+		storea(turn_fraction, offsets + step);
+	}
+}
+
+/**
+ * @brief For a given step size compute the lowest and highest weight.
+ *
+ * Compute the lowest and highest weight that results from quantizing using the given stepsize and
+ * offset, and then compute the resulting error. The cut errors indicate the error that results from
+ * forcing samples that should have had one weight value one step up or down.
+ *
+ * Angular steps are processed one SIMD vector at a time, one step per lane, so the per-step output
+ * arrays may be written past @c max_angular_steps up to the next multiple of the SIMD width.
+ *
+ * @param      weight_count              The number of (decimated) weights.
+ * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
+ * @param      max_angular_steps         The maximum number of steps to be tested.
+ * @param      max_quant_steps           The maximum quantization level to be tested.
+ * @param      offsets                   The angular offsets array.
+ * @param[out] lowest_weight             Per angular step, the lowest weight.
+ * @param[out] weight_span               Per angular step, the span between lowest and highest weight.
+ * @param[out] error                     Per angular step, the error.
+ * @param[out] cut_low_weight_error      Per angular step, the low weight cut error.
+ * @param[out] cut_high_weight_error     Per angular step, the high weight cut error.
+ */
+static void compute_lowest_and_highest_weight(
+	unsigned int weight_count,
+	const float* dec_weight_ideal_value,
+	unsigned int max_angular_steps,
+	unsigned int max_quant_steps,
+	const float* offsets,
+	float* lowest_weight,
+	int* weight_span,
+	float* error,
+	float* cut_low_weight_error,
+	float* cut_high_weight_error
+) {
+	promise(weight_count > 0);
+	promise(max_angular_steps > 0);
+
+	// Reciprocal step size per lane: lane index + 1 for the first vector of
+	// steps, advanced by the SIMD width at the end of each outer iteration
+	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
+
+	// Compute minimum/maximum weights in the weight array. Our remapping
+	// is monotonic, so the min/max rounded weights relate to the min/max
+	// unrounded weights in a straightforward way.
+	vfloat min_weight(FLT_MAX);
+	vfloat max_weight(-FLT_MAX);
+
+	vint lane_id = vint::lane_id();
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		// Lanes at or beyond weight_count keep the running value, so padding
+		// in the final vector does not affect the min/max
+		vmask active = lane_id < vint(weight_count);
+		lane_id += vint(ASTCENC_SIMD_WIDTH);
+
+		vfloat weights = loada(dec_weight_ideal_value + i);
+		min_weight = min(min_weight, select(min_weight, weights, active));
+		max_weight = max(max_weight, select(max_weight, weights, active));
+	}
+
+	// NOTE(review): hmin/hmax presumably reduce across lanes and broadcast
+	// the result to every lane - confirm against the vecmathlib definition
+	min_weight = hmin(min_weight);
+	max_weight = hmax(max_weight);
+
+	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
+	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat errval = vfloat::zero();
+		vfloat cut_low_weight_err = vfloat::zero();
+		vfloat cut_high_weight_err = vfloat::zero();
+		vfloat offset = loada(offsets + sp);
+
+		// We know the min and max weight values, so we can figure out
+		// the corresponding indices before we enter the loop.
+		vfloat minidx = round(min_weight * rcp_stepsize - offset);
+		vfloat maxidx = round(max_weight * rcp_stepsize - offset);
+
+		for (unsigned int j = 0; j < weight_count; j++)
+		{
+			// Weight j in units of the step size, relative to the offset;
+			// diff is the residual left after rounding to the nearest index
+			vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
+			vfloat svalrte = round(sval);
+			vfloat diff = sval - svalrte;
+			errval += diff * diff;
+
+			// Accumulate errors for minimum index
+			// Moving a sample one index up changes its residual to (diff - 1),
+			// which adds (diff - 1)^2 - diff^2 = 1 - 2 * diff to the error
+			vmask mask = svalrte == minidx;
+			vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
+			cut_low_weight_err = select(cut_low_weight_err, accum, mask);
+
+			// Accumulate errors for maximum index
+			// Moving a sample one index down changes its residual to (diff + 1),
+			// which adds (diff + 1)^2 - diff^2 = 1 + 2 * diff to the error
+			mask = svalrte == maxidx;
+			accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
+			cut_high_weight_err = select(cut_high_weight_err, accum, mask);
+		}
+
+		// Write out min weight and weight span; clamp span to a usable range
+		// The caller indexes its results array with span, span - 1 and span - 2,
+		// so span must stay within [2, max_quant_steps + 3]
+		vint span = float_to_int(maxidx - minidx + vfloat(1));
+		span = min(span, vint(max_quant_steps + 3));
+		span = max(span, vint(2));
+		storea(minidx, lowest_weight + sp);
+		storea(span, weight_span + sp);
+
+		// The cut_(lowest/highest)_weight_error indicate the error that results from forcing
+		// samples that should have had the weight value one step (up/down).
+		// Errors were accumulated in units of the step size, so scale by the
+		// squared step size before writing them out
+		vfloat ssize = 1.0f / rcp_stepsize;
+		vfloat errscale = ssize * ssize;
+		storea(errval * errscale, error + sp);
+		storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
+		storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
+
+		// Advance every lane to the next vector of angular steps
+		rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
+	}
+}
+
+/**
+ * @brief The main function for the angular algorithm.
+ *
+ * Computes the angular offsets and the per-step weight ranges and errors, selects the lowest
+ * error angular step for each possible weight span, and then converts the selection for each
+ * quantization level into a low and high weight value.
+ *
+ * @param      weight_count              The number of (decimated) weights.
+ * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
+ * @param      max_quant_level           The maximum quantization level to be tested.
+ * @param[out] low_value                 Per quantization level, the lowest weight value.
+ * @param[out] high_value                Per quantization level, the highest weight value.
+ */
+static void compute_angular_endpoints_for_quant_levels(
+	unsigned int weight_count,
+	const float* dec_weight_ideal_value,
+	unsigned int max_quant_level,
+	float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
+	float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
+) {
+	// Both limits come from the same table entry: the number of angular steps
+	// tested equals the number of quantization steps of the highest level
+	unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
+	unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
+
+	ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
+
+	compute_angular_offsets(weight_count, dec_weight_ideal_value,
+	                        max_angular_steps, angular_offsets);
+
+	ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
+	ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
+
+	compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
+	                                  max_angular_steps, max_quant_steps,
+	                                  angular_offsets, lowest_weight, weight_span, error,
+	                                  cut_low_weight_error, cut_high_weight_error);
+
+	// For each quantization level, find the best error terms. Use packed vectors so data-dependent
+	// branches can become selects. This involves some integer to float casts, but the values are
+	// small enough so they never round the wrong way.
+	// The array is indexed by weight span. The span is clamped to at most max_quant_steps + 3,
+	// and steps_for_quant_level tops out at 32, so 36 entries cover every index used.
+	vfloat4 best_results[36];
+
+	// Initialize the array to some safe defaults
+	promise(max_quant_steps > 0);
+	for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
+	{
+		// Lane<0> = Best error
+		// Lane<1> = Best scale; -1 indicates no solution found
+		// Lane<2> = Cut low weight
+		best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
+	}
+
+	promise(max_angular_steps > 0);
+	for (unsigned int i = 0; i < max_angular_steps; i++)
+	{
+		float i_flt = static_cast<float>(i);
+
+		// The span was clamped to at least 2, so idx_span - 2 is a valid index
+		int idx_span = weight_span[i];
+
+		float error_cut_low = error[i] + cut_low_weight_error[i];
+		float error_cut_high = error[i] + cut_high_weight_error[i];
+		float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
+
+		// Check best error against record N
+		vfloat4 best_result = best_results[idx_span];
+		vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
+		vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
+		best_results[idx_span] = select(best_result, new_result, mask);
+
+		// Check best error against record N-1 with either cut low or cut high
+		best_result = best_results[idx_span - 1];
+
+		// Lane<2> is 1.0 for the cut low candidate, as the lowest weight moves up by one
+		new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
+		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
+		best_result = select(best_result, new_result, mask);
+
+		new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
+		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
+		best_results[idx_span - 1] = select(best_result, new_result, mask);
+
+		// Check best error against record N-2 with both cut low and high
+		best_result = best_results[idx_span - 2];
+		new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
+		mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
+		best_results[idx_span - 2] = select(best_result, new_result, mask);
+	}
+
+	// Convert the best record for each quantization level into output values
+	for (unsigned int i = 0; i <= max_quant_level; i++)
+	{
+		// Look up the record whose span equals this level's step count
+		unsigned int q = steps_for_quant_level[i];
+		int bsi = static_cast<int>(best_results[q].lane<1>());
+
+		// Did we find anything?
+#if defined(ASTCENC_DIAGNOSTICS)
+		if ((bsi < 0) && print_once)
+		{
+			print_once = false;
+			printf("INFO: Unable to find full encoding within search error limit.\n\n");
+		}
+#endif
+
+		// Fall back to the first angular step if no solution was recorded
+		bsi = astc::max(0, bsi);
+
+		// Lowest index, moved up by one if the low end was cut; the highest
+		// index then follows from the number of steps in this level
+		float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
+		float hwi = lwi + static_cast<float>(q) - 1.0f;
+
+		// Angular step bsi corresponds to a step size of 1 / (bsi + 1)
+		float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
+		low_value[i]  = (angular_offsets[bsi] + lwi) * stepsize;
+		high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
+	}
+}
+
+/* See header for documentation. */
+void compute_angular_endpoints_1plane(
+	bool only_always,
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf
+) {
+	// Per block mode results
+	auto& low_value = tmpbuf.weight_low_value1;
+	auto& high_value = tmpbuf.weight_high_value1;
+
+	// Per decimation mode and quantization level results
+	auto& low_values = tmpbuf.weight_low_values1;
+	auto& high_values = tmpbuf.weight_high_values1;
+
+	// The precision tested is never higher than either the angular search
+	// limit or the caller's maximum weight quantization
+	unsigned int precision_limit = TUNE_MAX_ANGULAR_QUANT;
+	if (precision_limit > max_weight_quant)
+	{
+		precision_limit = max_weight_quant;
+	}
+
+	unsigned int decimation_mode_count = bsd.decimation_mode_count_selected;
+	if (only_always)
+	{
+		decimation_mode_count = bsd.decimation_mode_count_always;
+	}
+
+	promise(decimation_mode_count > 0);
+	for (unsigned int mode = 0; mode < decimation_mode_count; mode++)
+	{
+		const decimation_mode& dm = bsd.decimation_modes[mode];
+
+		// Only process decimation modes referenced by a 1 plane block mode
+		if (dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
+		{
+			unsigned int weight_count = bsd.get_decimation_info(mode).weight_count;
+
+			unsigned int max_precision = dm.maxprec_1plane;
+			if (max_precision > precision_limit)
+			{
+				max_precision = precision_limit;
+			}
+
+			const float* mode_weights = dec_weight_ideal_value + mode * BLOCK_MAX_WEIGHTS;
+			compute_angular_endpoints_for_quant_levels(
+			    weight_count, mode_weights,
+			    max_precision, low_values[mode], high_values[mode]);
+		}
+	}
+
+	unsigned int block_mode_count = bsd.block_mode_count_1plane_selected;
+	if (only_always)
+	{
+		block_mode_count = bsd.block_mode_count_1plane_always;
+	}
+
+	promise(block_mode_count > 0);
+	for (unsigned int mode = 0; mode < block_mode_count; mode++)
+	{
+		const block_mode& bm = bsd.block_modes[mode];
+		assert(!bm.is_dual_plane);
+
+		unsigned int quant_mode = bm.quant_mode;
+		unsigned int decim_mode = bm.decimation_mode;
+
+		// Quantization levels above the angular limit use the full 0 to 1 range
+		bool has_angular_result = quant_mode <= TUNE_MAX_ANGULAR_QUANT;
+		low_value[mode] = has_angular_result ? low_values[decim_mode][quant_mode] : 0.0f;
+		high_value[mode] = has_angular_result ? high_values[decim_mode][quant_mode] : 1.0f;
+	}
+}
+
+/* See header for documentation. */
+void compute_angular_endpoints_2planes(
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf
+) {
+	// Per block mode results for each plane
+	auto& low_value1 = tmpbuf.weight_low_value1;
+	auto& high_value1 = tmpbuf.weight_high_value1;
+	auto& low_value2 = tmpbuf.weight_low_value2;
+	auto& high_value2 = tmpbuf.weight_high_value2;
+
+	// Per decimation mode and quantization level results for each plane
+	auto& low_values1 = tmpbuf.weight_low_values1;
+	auto& high_values1 = tmpbuf.weight_high_values1;
+	auto& low_values2 = tmpbuf.weight_low_values2;
+	auto& high_values2 = tmpbuf.weight_high_values2;
+
+	// The precision tested is never higher than either the angular search
+	// limit or the caller's maximum weight quantization
+	unsigned int precision_limit = TUNE_MAX_ANGULAR_QUANT;
+	if (precision_limit > max_weight_quant)
+	{
+		precision_limit = max_weight_quant;
+	}
+
+	promise(bsd.decimation_mode_count_selected > 0);
+	for (unsigned int mode = 0; mode < bsd.decimation_mode_count_selected; mode++)
+	{
+		const decimation_mode& dm = bsd.decimation_modes[mode];
+
+		// Only process decimation modes referenced by a 2 plane block mode
+		if (dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
+		{
+			unsigned int weight_count = bsd.get_decimation_info(mode).weight_count;
+
+			unsigned int max_precision = dm.maxprec_2planes;
+			if (max_precision > precision_limit)
+			{
+				max_precision = precision_limit;
+			}
+
+			// The second plane's weights are stored at a fixed offset from the first
+			const float* plane1_weights = dec_weight_ideal_value + mode * BLOCK_MAX_WEIGHTS;
+			const float* plane2_weights = plane1_weights + WEIGHTS_PLANE2_OFFSET;
+
+			compute_angular_endpoints_for_quant_levels(
+			    weight_count, plane1_weights,
+			    max_precision, low_values1[mode], high_values1[mode]);
+
+			compute_angular_endpoints_for_quant_levels(
+			    weight_count, plane2_weights,
+			    max_precision, low_values2[mode], high_values2[mode]);
+		}
+	}
+
+	// The range of block modes processed starts after the 1 plane modes
+	unsigned int first_mode = bsd.block_mode_count_1plane_selected;
+	unsigned int mode_limit = bsd.block_mode_count_1plane_2plane_selected;
+	for (unsigned int mode = first_mode; mode < mode_limit; mode++)
+	{
+		const block_mode& bm = bsd.block_modes[mode];
+		unsigned int quant_mode = bm.quant_mode;
+		unsigned int decim_mode = bm.decimation_mode;
+
+		// Quantization levels above the angular limit use the full 0 to 1 range
+		bool has_angular_result = quant_mode <= TUNE_MAX_ANGULAR_QUANT;
+		low_value1[mode] = has_angular_result ? low_values1[decim_mode][quant_mode] : 0.0f;
+		high_value1[mode] = has_angular_result ? high_values1[decim_mode][quant_mode] : 1.0f;
+		low_value2[mode] = has_angular_result ? low_values2[decim_mode][quant_mode] : 0.0f;
+		high_value2[mode] = has_angular_result ? high_values2[decim_mode][quant_mode] : 1.0f;
+	}
+}
+
+#endif
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Data tables for quantization transfer.
+ */
+
+#include "astcenc_internal.h"
+
+#define _ 0 // Using _ to indicate an entry that will not be used.
+
+const quant_and_transfer_table quant_and_xfer_tables[12] {
+	// QUANT_2, range 0..1
+	{
+		{0, 64},
+		{0, 1},
+		{0, 64},
+		{0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 0x4000}
+	},
+	// QUANT_3, range 0..2
+	{
+		{0, 32, 64},
+		{0, 1, 2},
+		{0, 32, 64},
+		{0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,0x4020}
+	},
+	// QUANT_4, range 0..3
+	{
+		{0, 21, 43, 64},
+		{0, 1, 2, 3},
+		{0, 21, 43, 64},
+		{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
+		 _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,_,_,_,_,0x402b}
+	},
+	// QUANT_5, range 0..4
+	{
+		{0, 16, 32, 48, 64},
+		{0, 1, 2, 3, 4},
+		{0, 16, 32, 48, 64},
+		{0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_,
+		 _,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
+		 _,_,_,_,_,_,_,_,_,_,_,_,0x4030}
+	},
+	// QUANT_6, range 0..5
+	{
+		{0, 12, 25, 39, 52, 64},
+		{0, 2, 4, 5, 3, 1},
+		{0, 64, 12, 52, 25, 39},
+		{0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_,
+		 0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
+		 _,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
+	},
+	// QUANT_8, range 0..7
+	{
+		{0, 9, 18, 27, 37, 46, 55, 64},
+		{0, 1, 2, 3, 4, 5, 6, 7},
+		{0, 9, 18, 27, 37, 46, 55, 64},
+		{0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_,
+		 _,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
+		 0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
+	},
+	// QUANT_10, range 0..9
+	{
+		{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
+		{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
+		{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
+		{0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_,
+		 0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_,
+		 _,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
+		 _,0x4039}
+	},
+	// QUANT_12, range 0..11
+	{
+		{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
+		{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
+		{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
+		{0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_,
+		 0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_,
+		 0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
+		 0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
+	},
+	// QUANT_16, range 0..15
+	{
+		{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+		{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
+		{0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_,
+		 0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_,
+		 _,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
+		 _,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
+	},
+	// QUANT_20, range 0..19
+	{
+		{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
+		{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
+		{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
+		{0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_,
+		 0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_,
+		 0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_,
+		 0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
+		 0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
+	},
+	// QUANT_24, range 0..23
+	{
+		{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
+		{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
+		{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
+		{0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08,
+		 _,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_,
+		 0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_,
+		 0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d,
+		 _,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
+		 0x403b,_,0x403e}
+	},
+	// QUANT_32, range 0..31
+	{
+		{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
+		{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
+		{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
+		{0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_,
+		 0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_,
+		 0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_,
+		 0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_,
+		 0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_,
+		 0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_,
+		 0x403c,_,0x403e}
+	}
+};
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2020-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Application entry point.
+ *
+ * This module contains the first command line entry point veneer, used to
+ * validate that the host extended ISA availability matches the tool build.
+ * It is compiled without any extended ISA support so it's guaranteed to be
+ * executable without any invalid instruction errors.
+ */
+
+#include <cstdio>
+
+/**
+ * @brief The main veneer entry point.
+ *
+ * @param argc   The number of arguments.
+ * @param argv   The vector of arguments.
+ *
+ * @return 0 on success, non-zero otherwise.
+ */
+int astcenc_main_veneer(
+	int argc,
+	char **argv);
+
+// x86-64 builds
+#if (ASTCENC_SSE > 20)    || (ASTCENC_AVX > 0) || \
+    (ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
+
+static bool g_init { false };
+
+/** Does this CPU support SSE 4.1? Only meaningful once g_init is true. */
+static bool g_cpu_has_sse41 { false };
+
+/** Does this CPU support AVX2? Only meaningful once g_init is true. */
+static bool g_cpu_has_avx2 { false };
+
+/** Does this CPU support POPCNT? Only meaningful once g_init is true. */
+static bool g_cpu_has_popcnt { false };
+
+/** Does this CPU support F16C? Only meaningful once g_init is true. */
+static bool g_cpu_has_f16c { false };
+
+/* ============================================================================
+   Platform code for Visual Studio
+============================================================================ */
+#if !defined(__clang__) && defined(_MSC_VER)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <intrin.h>
+
+/**
+ * @brief Detect platform CPU ISA support and update global trackers.
+ *
+ * Queries CPUID banks 1 and 7, when the CPU reports them as available, and
+ * records the SSE 4.1, POPCNT, F16C and AVX2 feature bits before marking the
+ * detection as complete.
+ */
+static void detect_cpu_isa()
+{
+	int regs[4];
+
+	// Bank 0 is used to find the highest bank that can be queried
+	__cpuid(regs, 0);
+	const int max_bank = regs[0];
+
+	if (max_bank >= 1)
+	{
+		__cpuidex(regs, 1, 0);
+		const int ecx = regs[2];
+
+		// SSE41 = Bank 1, ECX, bit 19
+		g_cpu_has_sse41 = (ecx & (1 << 19)) != 0;
+		// POPCNT = Bank 1, ECX, bit 23
+		g_cpu_has_popcnt = (ecx & (1 << 23)) != 0;
+		// F16C = Bank 1, ECX, bit 29
+		g_cpu_has_f16c = (ecx & (1 << 29)) != 0;
+	}
+
+	if (max_bank >= 7)
+	{
+		__cpuidex(regs, 7, 0);
+		const int ebx = regs[1];
+
+		// AVX2 = Bank 7, EBX, bit 5
+		g_cpu_has_avx2 = (ebx & (1 << 5)) != 0;
+	}
+
+	// Ensure state bits are updated before init flag is updated
+	MemoryBarrier();
+	g_init = true;
+}
+
+/* ============================================================================
+   Platform code for GCC and Clang
+============================================================================ */
+#else
+#include <cpuid.h>
+
+/**
+ * @brief Detect platform CPU ISA support and update global trackers.
+ *
+ * Queries CPUID banks 1 and 7 and records the SSE 4.1, POPCNT, F16C and AVX2
+ * feature bits before marking the detection as complete. The bank 1 flags are
+ * left unchanged if that query fails.
+ */
+static void detect_cpu_isa()
+{
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+
+	if (__get_cpuid_count(1, 0, &eax, &ebx, &ecx, &edx))
+	{
+		// SSE41 = Bank 1, ECX, bit 19
+		g_cpu_has_sse41 = (ecx & (1 << 19)) != 0;
+		// POPCNT = Bank 1, ECX, bit 23
+		g_cpu_has_popcnt = (ecx & (1 << 23)) != 0;
+		// F16C = Bank 1, ECX, bit 29
+		g_cpu_has_f16c = (ecx & (1 << 29)) != 0;
+	}
+
+	// AVX2 is reported as unsupported unless bank 7 can be queried
+	g_cpu_has_avx2 = false;
+	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
+	{
+		// AVX2 = Bank 7, EBX, bit 5
+		g_cpu_has_avx2 = (ebx & (1 << 5)) != 0;
+	}
+
+	// Ensure state bits are updated before init flag is updated
+	__sync_synchronize();
+	g_init = true;
+}
+#endif
+
+#if ASTCENC_POPCNT > 0
+/**
+ * @brief Run-time detection if the host CPU supports the POPCNT extension.
+ *
+ * The CPU is probed on the first query; later queries reuse the stored flag.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+static bool cpu_supports_popcnt()
+{
+	const bool already_probed = g_init;
+	if (!already_probed)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_popcnt;
+}
+#endif
+
+#if ASTCENC_F16C > 0
+/**
+ * @brief Run-time detection if the host CPU supports F16C extension.
+ *
+ * The CPU is probed on the first query; later queries reuse the stored flag.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+static bool cpu_supports_f16c()
+{
+	const bool already_probed = g_init;
+	if (!already_probed)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_f16c;
+}
+#endif
+
+#if ASTCENC_SSE >= 41
+/**
+ * @brief Run-time detection if the host CPU supports SSE 4.1 extension.
+ *
+ * The CPU is probed on the first query; later queries reuse the stored flag.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+static bool cpu_supports_sse41()
+{
+	const bool already_probed = g_init;
+	if (!already_probed)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_sse41;
+}
+#endif
+
+#if ASTCENC_AVX >= 2
+/**
+ * @brief Run-time detection if the host CPU supports AVX 2 extension.
+ *
+ * The CPU is probed on the first query; later queries reuse the stored flag.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+static bool cpu_supports_avx2()
+{
+	const bool already_probed = g_init;
+	if (!already_probed)
+	{
+		detect_cpu_isa();
+	}
+
+	return g_cpu_has_avx2;
+}
+#endif
+
+/**
+ * @brief Print a string to stderr.
+ *
+ * The string is written verbatim; it is not interpreted as a format string.
+ *
+ * @param format   The string to print.
+ */
+static inline void print_error(
+	const char* format
+) {
+	fputs(format, stderr);
+}
+
+/**
+ * @brief Validate CPU ISA support meets the requirements of this build of the library.
+ *
+ * Each library build is statically compiled for a particular set of CPU ISA features, such as the
+ * SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
+ * actually supports everything this build needs, and reports the first missing feature on stderr.
+ *
+ * @return Return @c true if validated, @c false otherwise.
+ */
+static bool validate_cpu_isa()
+{
+	// Message for the first missing extension; checks stop after a failure
+	const char* failure = nullptr;
+
+	#if ASTCENC_AVX >= 2
+		if ((failure == nullptr) && !cpu_supports_avx2())
+		{
+			failure = "ERROR: Host does not support AVX2 ISA extension\n";
+		}
+	#endif
+
+	#if ASTCENC_F16C >= 1
+		if ((failure == nullptr) && !cpu_supports_f16c())
+		{
+			failure = "ERROR: Host does not support F16C ISA extension\n";
+		}
+	#endif
+
+	#if ASTCENC_SSE >= 41
+		if ((failure == nullptr) && !cpu_supports_sse41())
+		{
+			failure = "ERROR: Host does not support SSE4.1 ISA extension\n";
+		}
+	#endif
+
+	#if ASTCENC_POPCNT >= 1
+		if ((failure == nullptr) && !cpu_supports_popcnt())
+		{
+			failure = "ERROR: Host does not support POPCNT ISA extension\n";
+		}
+	#endif
+
+	if (failure != nullptr)
+	{
+		print_error(failure);
+		return false;
+	}
+
+	return true;
+}
+
+// Validate Arm SVE availability
+#elif ASTCENC_SVE != 0
+
+#include <sys/auxv.h>
+/**
+ * @brief Run-time detection if the host CPU supports the SVE extension.
+ *
+ * @return @c true if the SVE bit is set in the hardware capabilities,
+ *         @c false if not.
+ */
+static bool cpu_supports_sve()
+{
+	long capabilities = getauxval(AT_HWCAP);
+	long sve_bit = capabilities & HWCAP_SVE;
+	return sve_bit != 0;
+}
+
+/**
+ * @brief Print a string to stderr.
+ *
+ * The string is written verbatim; it is not interpreted as a format string.
+ *
+ * @param format   The string to print.
+ */
+static inline void print_error(
+	const char* format
+) {
+	fputs(format, stderr);
+}
+
+/**
+ * @brief Validate that SVE is supported.
+ *
+ * Note that this function checks that SVE is supported, but because it
+ * runs in the veneer which is compiled without SVE support, we cannot
+ * check the SVE width is correct. This is checked later.
+ *
+ * @return Return @c true if validated, @c false otherwise.
+ */
+static bool validate_cpu_isa()
+{
+	const bool has_sve = cpu_supports_sve();
+	if (has_sve)
+	{
+		return true;
+	}
+
+	print_error("ERROR: Host does not support SVE ISA extension\n");
+	return false;
+}
+
+#else
+
+/**
+ * @brief Validate CPU ISA support for builds with no dynamic ISA availability.
+ *
+ * This fallback performs no run-time checks.
+ *
+ * @return Always returns @c true.
+ */
+static bool validate_cpu_isa()
+{
+	return true;
+}
+
+#endif
+
+/**
+ * @brief The application entry point.
+ *
+ * Validates the host CPU before handing over to the main veneer.
+ *
+ * @param argc   The number of arguments.
+ * @param argv   The vector of arguments.
+ *
+ * @return 1 if the CPU validation fails, otherwise the veneer's return value.
+ */
+int main(
+	int argc,
+	char **argv
+) {
+	const bool host_is_valid = validate_cpu_isa();
+	return host_is_valid ? astcenc_main_veneer(argc, argv) : 1;
+}
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Application entry point second veneer.
+ *
+ * This module contains the second command line entry point veneer, used to
+ * validate that Arm SVE vector width matches the tool build. When used, it is
+ * compiled with SVE ISA support but without any vector length override, so it
+ * will see the native SVE vector length exposed to the application.
+ */
+
+#include <cstdio>
+
+#if ASTCENC_SVE != 0
+	#include <arm_sve.h>
+#endif
+
+/**
+ * @brief The main entry point.
+ *
+ * @param argc   The number of arguments.
+ * @param argv   The vector of arguments.
+ *
+ * @return 0 on success, non-zero otherwise.
+ */
+int astcenc_main(
+	int argc,
+	char **argv);
+
+/**
+ * @brief Print a formatted string to stderr.
+ *
+ * @param format   The printf-style format string.
+ * @param args     The values consumed by the format specifiers.
+ */
+template<typename ... Args>
+static inline void print_error(
+	const char* format,
+	Args...args
+) {
+	// Note: the pack was previously named with a leading underscore followed by
+	// an uppercase letter, which is an identifier reserved for the implementation
+	fprintf(stderr, format, args...);
+}
+
+/**
+ * @brief The second entry point veneer.
+ *
+ * For builds using a fixed SVE vector length wider than 128 bits, check that
+ * the host vector length matches the build before running the application.
+ *
+ * @param argc   The number of arguments.
+ * @param argv   The vector of arguments.
+ *
+ * @return 1 if the SVE width check fails, otherwise the result of astcenc_main().
+ */
+int astcenc_main_veneer(
+	int argc,
+	char **argv
+) {
+	// We don't need this check for 128-bit SVE, because that is compiled as
+	// VLA code, using predicate masks in the augmented NEON.
+#if ASTCENC_SVE > 4
+	// svcntw() returns compile-time length if used with -msve-vector-bits
+	if (svcntw() != ASTCENC_SVE)
+	{
+		// ASTCENC_SVE is a count of 32-bit words; keep the bit count unsigned
+		// so that the argument type matches the %u conversion specifier
+		unsigned int bits = ASTCENC_SVE * 32u;
+		print_error("ERROR: Host SVE support is not a %u-bit implementation\n", bits);
+		return 1;
+	}
+#endif
+
+	return astcenc_main(argc, argv);
+}
@@ -0,0 +1,413 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for computing image error metrics.
+ */
+
+#include <cassert>
+#include <cstdio>
+
+#include "astcenccli_internal.h"
+
+/**
+ * @brief An accumulator for errors.
+ *
+ * Holds four independent double-precision running sums, all starting at zero.
+ */
+class error_accum4
+{
+public:
+	/** @brief The running sum for the first (r) lane. */
+	double sum_r { 0.0 };
+	/** @brief The running sum for the second (g) lane. */
+	double sum_g { 0.0 };
+	/** @brief The running sum for the third (b) lane. */
+	double sum_b { 0.0 };
+	/** @brief The running sum for the fourth (a) lane. */
+	double sum_a { 0.0 };
+};
+
+/**
+ * @brief Add a four-lane increment into an error accumulator, in place.
+ *
+ * Each lane is widened to double before it is added to its matching sum.
+ *
+ * @param val The accumulator to update
+ * @param inc The per-lane values to add
+ *
+ * @return A reference to @c val after the update
+ */
+static error_accum4& operator+=(
+	error_accum4 &val,
+	vfloat4 inc
+) {
+	const double add_r = static_cast<double>(inc.lane<0>());
+	const double add_g = static_cast<double>(inc.lane<1>());
+	const double add_b = static_cast<double>(inc.lane<2>());
+	const double add_a = static_cast<double>(inc.lane<3>());
+
+	val.sum_r = val.sum_r + add_r;
+	val.sum_g = val.sum_g + add_g;
+	val.sum_b = val.sum_b + add_b;
+	val.sum_a = val.sum_a + add_a;
+
+	return val;
+}
+
+/**
+ * @brief mPSNR tone-mapping operator for HDR images.
+ *
+ * Scales the value by 2^fstop, applies a 1/2.2 gamma curve, and rescales to
+ * an 8-bit range.
+ *
+ * @param val     The color value to tone map
+ * @param fstop   The exposure fstop; should be in range [-125, 125]
+ *
+ * @return The mapped color value in [0.0f, 255.0f] range
+ */
+static float mpsnr_operator(
+	float val,
+	int fstop
+) {
+	// Build the scale factor by adding fstop to the exponent field of 1.0f.
+	// The shift is done on an unsigned value because left-shifting a negative
+	// signed integer is undefined behavior before C++20; the resulting bit
+	// pattern matches the two's complement signed computation.
+	if32 p;
+	p.u = 0x3f800000u + (static_cast<unsigned int>(fstop) << 23);  // 0x3f800000 is 1.0f
+	val *= p.f;
+	val = powf(val, (1.0f / 2.2f));
+	val *= 255.0f;
+
+	return astc::clamp(val, 0.0f, 255.0f);
+}
+
+/**
+ * @brief Sum of squared mPSNR differences between two values.
+ *
+ * Both values are tone mapped at every fstop in the inclusive range
+ * [fstop_lo, fstop_hi]; the per-stop difference is "val1 - val2".
+ *
+ * @param val1       The first color value
+ * @param val2       The second color value
+ * @param fstop_lo   The low exposure fstop; should be in range [-125, 125]
+ * @param fstop_hi   The high exposure fstop; should be in range [-125, 125]
+ *
+ * @return The summed squared difference across all active fstop levels
+ */
+static float mpsnr_sumdiff(
+	float val1,
+	float val2,
+	int fstop_lo,
+	int fstop_hi
+) {
+	float total = 0.0f;
+	int fstop = fstop_lo;
+
+	while (fstop <= fstop_hi)
+	{
+		const float mapped1 = mpsnr_operator(val1, fstop);
+		const float mapped2 = mpsnr_operator(val2, fstop);
+		const float delta = mapped1 - mapped2;
+		total += delta * delta;
+		fstop++;
+	}
+
+	return total;
+}
+
+/**
+ * @brief Load a single texel from an image as a floating-point RGBA color.
+ *
+ * U8 data is divided by 255; F16 and F32 data is clamped to [0, 65504].
+ *
+ * @param img   The image to read; texels are stored as 4 components each.
+ * @param x     The texel X coordinate.
+ * @param y     The texel Y coordinate.
+ * @param z     The texel Z coordinate, used as the slice index.
+ *
+ * @return The texel color.
+ */
+static vfloat4 load_metric_texel(
+	const astcenc_image* img,
+	unsigned int x,
+	unsigned int y,
+	unsigned int z
+) {
+	// Index in size_t so that large images cannot wrap 32-bit arithmetic
+	size_t idx = (static_cast<size_t>(img->dim_x) * y + x) * 4;
+
+	if (img->data_type == ASTCENC_TYPE_U8)
+	{
+		uint8_t* data8 = static_cast<uint8_t*>(img->data[z]);
+
+		vfloat4 color = vfloat4(
+		    data8[idx    ],
+		    data8[idx + 1],
+		    data8[idx + 2],
+		    data8[idx + 3]);
+
+		return color / 255.0f;
+	}
+
+	if (img->data_type == ASTCENC_TYPE_F16)
+	{
+		uint16_t* data16 = static_cast<uint16_t*>(img->data[z]);
+
+		vint4 colori = vint4(
+		    data16[idx    ],
+		    data16[idx + 1],
+		    data16[idx + 2],
+		    data16[idx + 3]);
+
+		return clamp(0.0f, 65504.0f, float16_to_float(colori));
+	}
+
+	assert(img->data_type == ASTCENC_TYPE_F32);
+	float* data32 = static_cast<float*>(img->data[z]);
+
+	vfloat4 color = vfloat4(
+	    data32[idx    ],
+	    data32[idx + 1],
+	    data32[idx + 2],
+	    data32[idx + 3]);
+
+	return clamp(0.0f, 65504.0f, color);
+}
+
+/* See header for documentation */
+void compute_error_metrics(
+	bool compute_hdr_metrics,
+	bool compute_normal_metrics,
+	int input_components,
+	const astcenc_image* img1,
+	const astcenc_image* img2,
+	int fstop_lo,
+	int fstop_hi
+) {
+	// Active channel mask per component count: bit 0 = R, 1 = G, 2 = B, 3 = A
+	static const int componentmasks[5] { 0x00, 0x07, 0x0C, 0x07, 0x0F };
+	assert(input_components >= 0 && input_components <= 4);
+	int componentmask = componentmasks[input_components];
+
+	error_accum4 errorsum;
+	error_accum4 alpha_scaled_errorsum;
+	error_accum4 log_errorsum;
+	error_accum4 mpsnr_errorsum;
+	double mean_angular_errorsum = 0.0;
+	double worst_angular_errorsum = 0.0;
+
+	// Only the region covered by both images is compared
+	unsigned int dim_x = astc::min(img1->dim_x, img2->dim_x);
+	unsigned int dim_y = astc::min(img1->dim_y, img2->dim_y);
+	unsigned int dim_z = astc::min(img1->dim_z, img2->dim_z);
+
+	if (img1->dim_x != img2->dim_x ||
+	    img1->dim_y != img2->dim_y ||
+	    img1->dim_z != img2->dim_z)
+	{
+		// Dimensions are unsigned, so print them with %u
+		printf("WARNING: Only intersection of images will be compared:\n"
+		       "  Image 1: %ux%ux%u\n"
+		       "  Image 2: %ux%ux%u\n",
+		       img1->dim_x, img1->dim_y, img1->dim_z,
+		       img2->dim_x, img2->dim_y, img2->dim_z);
+	}
+
+	// Compute the texel count in double; the product of three 32-bit
+	// dimensions can wrap if evaluated in unsigned int arithmetic
+	const double pixels = static_cast<double>(dim_x) *
+	                      static_cast<double>(dim_y) *
+	                      static_cast<double>(dim_z);
+
+	double rgb_peak = 0.0;
+
+	for (unsigned int z = 0; z < dim_z; z++)
+	{
+		for (unsigned int y = 0; y < dim_y; y++)
+		{
+			for (unsigned int x = 0; x < dim_x; x++)
+			{
+				vfloat4 color1 = load_metric_texel(img1, x, y, z);
+				vfloat4 color2 = load_metric_texel(img2, x, y, z);
+
+				// Track the largest RGB value seen in the first image
+				rgb_peak = astc::max(static_cast<double>(color1.lane<0>()),
+				                     static_cast<double>(color1.lane<1>()),
+				                     static_cast<double>(color1.lane<2>()),
+				                     rgb_peak);
+
+				vfloat4 diffcolor = color1 - color2;
+				vfloat4 diffcolor_sq = diffcolor * diffcolor;
+				errorsum += diffcolor_sq;
+
+				// RGB differences are weighted by the first image's alpha
+				vfloat4 alpha_scaled_diffcolor = vfloat4(
+				    diffcolor.lane<0>() * color1.lane<3>(),
+				    diffcolor.lane<1>() * color1.lane<3>(),
+				    diffcolor.lane<2>() * color1.lane<3>(),
+				    diffcolor.lane<3>());
+
+				vfloat4 alpha_scaled_diffcolor_sq = alpha_scaled_diffcolor * alpha_scaled_diffcolor;
+				alpha_scaled_errorsum += alpha_scaled_diffcolor_sq;
+
+				if (compute_hdr_metrics)
+				{
+					vfloat4 log_input_color1 = log2(color1);
+					vfloat4 log_input_color2 = log2(color2);
+
+					vfloat4 log_diffcolor = log_input_color1 - log_input_color2;
+
+					log_errorsum += log_diffcolor * log_diffcolor;
+
+					vfloat4 mpsnr_error = vfloat4(
+					    mpsnr_sumdiff(color1.lane<0>(), color2.lane<0>(), fstop_lo, fstop_hi),
+					    mpsnr_sumdiff(color1.lane<1>(), color2.lane<1>(), fstop_lo, fstop_hi),
+					    mpsnr_sumdiff(color1.lane<2>(), color2.lane<2>(), fstop_lo, fstop_hi),
+					    mpsnr_sumdiff(color1.lane<3>(), color2.lane<3>(), fstop_lo, fstop_hi));
+
+					mpsnr_errorsum += mpsnr_error;
+				}
+
+				if (compute_normal_metrics)
+				{
+					// Decode the normal vector
+					vfloat4 normal1 = (color1 - 0.5f) * 2.0f;
+					normal1 = normalize_safe(normal1.swz<0, 1, 2>(), unit3());
+
+					vfloat4 normal2 = (color2 - 0.5f) * 2.0f;
+					normal2 = normalize_safe(normal2.swz<0, 1, 2>(), unit3());
+
+					// Float error can push this outside of valid range for acos, so clamp to avoid NaN issues
+					float normal_cos = clamp(-1.0f, 1.0f, dot3(normal1, normal2)).lane<0>();
+					float rad_to_degrees = 180.0f / astc::PI;
+					double error_degrees = std::acos(static_cast<double>(normal_cos)) * static_cast<double>(rad_to_degrees);
+
+					mean_angular_errorsum += error_degrees / pixels;
+					worst_angular_errorsum = astc::max(worst_angular_errorsum, error_degrees);
+				}
+			}
+		}
+	}
+
+	double samples = 0.0;
+
+	double num = 0.0;
+	double alpha_num = 0.0;
+	double log_num = 0.0;
+	double mpsnr_num = 0.0;
+
+	// Fold in only the channels that are active for this component count
+	if (componentmask & 1)
+	{
+		num += errorsum.sum_r;
+		alpha_num += alpha_scaled_errorsum.sum_r;
+		log_num += log_errorsum.sum_r;
+		mpsnr_num += mpsnr_errorsum.sum_r;
+		samples += pixels;
+	}
+
+	if (componentmask & 2)
+	{
+		num += errorsum.sum_g;
+		alpha_num += alpha_scaled_errorsum.sum_g;
+		log_num += log_errorsum.sum_g;
+		mpsnr_num += mpsnr_errorsum.sum_g;
+		samples += pixels;
+	}
+
+	if (componentmask & 4)
+	{
+		num += errorsum.sum_b;
+		alpha_num += alpha_scaled_errorsum.sum_b;
+		log_num += log_errorsum.sum_b;
+		mpsnr_num += mpsnr_errorsum.sum_b;
+		samples += pixels;
+	}
+
+	// Alpha does not contribute to the log or mPSNR sums
+	if (componentmask & 8)
+	{
+		num += errorsum.sum_a;
+		alpha_num += alpha_scaled_errorsum.sum_a;
+		samples += pixels;
+	}
+
+	double denom = samples;
+	double stopcount = static_cast<double>(fstop_hi - fstop_lo + 1);
+	double mpsnr_denom = pixels * 3.0 * stopcount * 255.0 * 255.0;
+
+	// A zero error sum is reported as 999 dB rather than infinity
+	double psnr;
+	if (num == 0.0)
+	{
+		psnr = 999.0;
+	}
+	else
+	{
+		psnr = 10.0 * log10(denom / num);
+	}
+
+	double rgb_psnr = psnr;
+
+	printf("Quality metrics\n");
+	printf("===============\n\n");
+
+	if (componentmask & 8)
+	{
+		printf("    PSNR (LDR-RGBA):          %9.4f dB\n", psnr);
+
+		double alpha_psnr;
+		if (alpha_num == 0.0)
+		{
+			alpha_psnr = 999.0;
+		}
+		else
+		{
+			alpha_psnr = 10.0 * log10(denom / alpha_num);
+		}
+		printf("    Alpha-weighted PSNR:      %9.4f dB\n", alpha_psnr);
+
+		double rgb_num = errorsum.sum_r + errorsum.sum_g + errorsum.sum_b;
+		if (rgb_num == 0.0)
+		{
+			rgb_psnr = 999.0;
+		}
+		else
+		{
+			rgb_psnr = 10.0 * log10(pixels * 3.0 / rgb_num);
+		}
+		printf("    PSNR (LDR-RGB):           %9.4f dB\n", rgb_psnr);
+	}
+	else
+	{
+		printf("    PSNR (LDR-RGB):           %9.4f dB\n", psnr);
+	}
+
+	if (compute_hdr_metrics)
+	{
+		printf("    PSNR (RGB norm to peak):  %9.4f dB (peak %f)\n",
+		       rgb_psnr + 20.0 * log10(rgb_peak), rgb_peak);
+
+		double mpsnr;
+		if (mpsnr_num == 0.0)
+		{
+			mpsnr = 999.0;
+		}
+		else
+		{
+			mpsnr = 10.0 * log10(mpsnr_denom / mpsnr_num);
+		}
+
+		printf("    mPSNR (RGB):              %9.4f dB (fstops %+d to %+d)\n",
+		       mpsnr, fstop_lo, fstop_hi);
+
+		double logrmse = sqrt(log_num / pixels);
+		printf("    LogRMSE (RGB):            %9.4f\n", logrmse);
+	}
+
+	if (compute_normal_metrics)
+	{
+		printf("    Mean Angular Error:       %9.4f degrees\n", mean_angular_errorsum);
+		printf("    Worst Angular Error:      %9.4f degrees\n", worst_angular_errorsum);
+	}
+
+	printf("\n");
+}
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for creating in-memory ASTC image structures.
+ */
+
+#include <cassert>
+#include <cstring>
+
+#include "astcenccli_internal.h"
+
+/* See header for documentation. */
+astcenc_image *alloc_image(
+	unsigned int bitness,
+	unsigned int dim_x,
+	unsigned int dim_y,
+	unsigned int dim_z
+) {
+	astcenc_image *img = new astcenc_image;
+	img->dim_x = dim_x;
+	img->dim_y = dim_y;
+	img->dim_z = dim_z;
+
+	// One pointer per Z slice; each slice is allocated separately below
+	void** data = new void*[dim_z];
+	img->data = data;
+
+	// Compute the per-slice component count in size_t; the product of two
+	// 32-bit dimensions and 4 components can wrap in unsigned int arithmetic,
+	// which would silently under-allocate the slice
+	size_t slice_components = static_cast<size_t>(dim_x) * dim_y * 4;
+
+	if (bitness == 8)
+	{
+		img->data_type = ASTCENC_TYPE_U8;
+		for (unsigned int z = 0; z < dim_z; z++)
+		{
+			data[z] = new uint8_t[slice_components];
+		}
+	}
+	else if (bitness == 16)
+	{
+		img->data_type = ASTCENC_TYPE_F16;
+		for (unsigned int z = 0; z < dim_z; z++)
+		{
+			data[z] = new uint16_t[slice_components];
+		}
+	}
+	else // if (bitness == 32)
+	{
+		assert(bitness == 32);
+		img->data_type = ASTCENC_TYPE_F32;
+		for (unsigned int z = 0; z < dim_z; z++)
+		{
+			data[z] = new float[slice_components];
+		}
+	}
+
+	return img;
+}
+
+/* See header for documentation. */
+void free_image(astcenc_image * img)
+{
+	if (img == nullptr)
+	{
+		return;
+	}
+
+	// Release each slice through a pointer of the element type selected by
+	// data_type. An array delete through a pointer of a different type than
+	// the one used for the array new is undefined behavior.
+	// NOTE(review): assumes slices were allocated as alloc_image() does, with
+	// the element type matching data_type - confirm no other allocator exists.
+	for (unsigned int z = 0; z < img->dim_z; z++)
+	{
+		if (img->data_type == ASTCENC_TYPE_U8)
+		{
+			delete[] static_cast<uint8_t*>(img->data[z]);
+		}
+		else if (img->data_type == ASTCENC_TYPE_F16)
+		{
+			delete[] static_cast<uint16_t*>(img->data[z]);
+		}
+		else // if (img->data_type == ASTCENC_TYPE_F32)
+		{
+			assert(img->data_type == ASTCENC_TYPE_F32);
+			delete[] static_cast<float*>(img->data[z]);
+		}
+	}
+
+	delete[] img->data;
+	delete img;
+}
+
+/* See header for documentation. */
+int determine_image_components(const astcenc_image * img)
+{
+	const unsigned int width = img->dim_x;
+	const unsigned int height = img->dim_y;
+	const unsigned int depth = img->dim_z;
+
+	// Scan every texel: the image stays "luma" while R, G and B all match, and
+	// gains "alpha" as soon as any alpha value differs from 1.0
+	bool is_luma = true;
+	bool has_alpha = false;
+
+	if (img->data_type == ASTCENC_TYPE_U8)
+	{
+		for (unsigned int z = 0; z < depth; z++)
+		{
+			const uint8_t* slice = static_cast<const uint8_t*>(img->data[z]);
+
+			for (unsigned int y = 0; y < height; y++)
+			{
+				for (unsigned int x = 0; x < width; x++)
+				{
+					const uint8_t* texel = slice + ((4 * width * y) + (4 * x));
+					int r = texel[0];
+					int g = texel[1];
+					int b = texel[2];
+					int a = texel[3];
+
+					if (r != g || r != b)
+					{
+						is_luma = false;
+					}
+
+					if (a != 0xFF)
+					{
+						has_alpha = true;
+					}
+				}
+			}
+		}
+	}
+	else if (img->data_type == ASTCENC_TYPE_F16)
+	{
+		for (unsigned int z = 0; z < depth; z++)
+		{
+			const uint16_t* slice = static_cast<const uint16_t*>(img->data[z]);
+
+			for (unsigned int y = 0; y < height; y++)
+			{
+				for (unsigned int x = 0; x < width; x++)
+				{
+					const uint16_t* texel = slice + ((4 * width * y) + (4 * x));
+					int r = texel[0];
+					int g = texel[1];
+					int b = texel[2];
+					int a = texel[3];
+
+					if (r != g || r != b)
+					{
+						is_luma = false;
+					}
+
+					// 0x3C00 is the 16-bit pattern for which (a ^ 0xC3FF) == 0xFFFF,
+					// i.e. the half-float encoding of 1.0
+					if (a != 0x3C00)
+					{
+						has_alpha = true;
+					}
+				}
+			}
+		}
+	}
+	else // if (img->data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img->data_type == ASTCENC_TYPE_F32);
+
+		for (unsigned int z = 0; z < depth; z++)
+		{
+			const float* slice = static_cast<const float*>(img->data[z]);
+
+			for (unsigned int y = 0; y < height; y++)
+			{
+				for (unsigned int x = 0; x < width; x++)
+				{
+					const float* texel = slice + ((4 * width * y) + (4 * x));
+					float r = texel[0];
+					float g = texel[1];
+					float b = texel[2];
+					float a = texel[3];
+
+					if (r != g || r != b)
+					{
+						is_luma = false;
+					}
+
+					if (a != 1.0f)
+					{
+						has_alpha = true;
+					}
+				}
+			}
+		}
+	}
+
+	// 1 = L, 2 = LA, 3 = RGB, 4 = RGBA
+	int image_components = 1;
+	if (!is_luma)
+	{
+		image_components += 2;
+	}
+
+	if (has_alpha)
+	{
+		image_components += 1;
+	}
+
+	return image_components;
+}
+
+/* See header for documentation. */
+astcenc_image* astc_img_from_floatx4_array(
+	const float* data,
+	unsigned int dim_x,
+	unsigned int dim_y,
+	bool y_flip
+) {
+	// The result is a single-slice image with 16-bit storage
+	astcenc_image* img = alloc_image(16, dim_x, dim_y, 1);
+	uint16_t* slice = static_cast<uint16_t*>(img->data[0]);
+
+	for (unsigned int y = 0; y < dim_y; y++)
+	{
+		// When flipping, destination row y reads from the mirrored source row
+		unsigned int y_src = y_flip ? (dim_y - y - 1) : y;
+		const float* src_row = data + 4 * dim_x * y_src;
+		uint16_t* dst_row = slice + (4 * dim_x * y);
+
+		for (unsigned int x = 0; x < dim_x; x++)
+		{
+			const float* src_texel = src_row + 4 * x;
+			uint16_t* dst_texel = dst_row + 4 * x;
+
+			vfloat4 colorf32(src_texel[0], src_texel[1], src_texel[2], src_texel[3]);
+			vint4 colorf16 = float_to_float16(colorf32);
+
+			dst_texel[0] = static_cast<uint16_t>(colorf16.lane<0>());
+			dst_texel[1] = static_cast<uint16_t>(colorf16.lane<1>());
+			dst_texel[2] = static_cast<uint16_t>(colorf16.lane<2>());
+			dst_texel[3] = static_cast<uint16_t>(colorf16.lane<3>());
+		}
+	}
+
+	return img;
+}
+
+/* See header for documentation. */
+astcenc_image* astc_img_from_unorm8x4_array(
+	const uint8_t* data,
+	unsigned int dim_x,
+	unsigned int dim_y,
+	bool y_flip
+) {
+	// The result is a single-slice image with 8-bit storage
+	astcenc_image* img = alloc_image(8, dim_x, dim_y, 1);
+	uint8_t* slice = static_cast<uint8_t*>(img->data[0]);
+
+	for (unsigned int y = 0; y < dim_y; y++)
+	{
+		// When flipping, destination row y reads from the mirrored source row
+		unsigned int y_src = y_flip ? (dim_y - y - 1) : y;
+		const uint8_t* src_row = data + 4 * dim_x * y_src;
+		uint8_t* dst_row = slice + (4 * dim_x * y);
+
+		for (unsigned int x = 0; x < dim_x; x++)
+		{
+			const unsigned int offset = 4 * x;
+
+			dst_row[offset    ] = src_row[offset    ];
+			dst_row[offset + 1] = src_row[offset + 1];
+			dst_row[offset + 2] = src_row[offset + 2];
+			dst_row[offset + 3] = src_row[offset + 3];
+		}
+	}
+
+	return img;
+}
+
+// Build a flattened array of float values from one slice of an ASTC codec image.
+// The returned array is allocated with new[] and must be deleted with delete[].
+/* See header for documentation. */
+float* floatx4_array_from_astc_img(
+	const astcenc_image* img,
+	bool y_flip,
+	unsigned int z_index
+) {
+	const unsigned int width = img->dim_x;
+	const unsigned int height = img->dim_y;
+	float *out = new float[4 * width * height];
+
+	assert(z_index < img->dim_z);
+
+	if (img->data_type == ASTCENC_TYPE_U8)
+	{
+		const uint8_t* slice = static_cast<const uint8_t*>(img->data[z_index]);
+
+		for (unsigned int y = 0; y < height; y++)
+		{
+			// When flipping, output row y reads from the mirrored source row
+			unsigned int src_y = y_flip ? height - y - 1 : y;
+			const uint8_t* src_row = slice + (4 * width * src_y);
+			float* dst_row = out + y * width * 4;
+
+			for (unsigned int x = 0; x < width; x++)
+			{
+				// Scale 8-bit values into the [0, 1] range
+				for (unsigned int c = 0; c < 4; c++)
+				{
+					dst_row[4 * x + c] = src_row[4 * x + c] * (1.0f / 255.0f);
+				}
+			}
+		}
+	}
+	else if (img->data_type == ASTCENC_TYPE_F16)
+	{
+		const uint16_t* slice = static_cast<const uint16_t*>(img->data[z_index]);
+
+		for (unsigned int y = 0; y < height; y++)
+		{
+			unsigned int src_y = y_flip ? height - y - 1 : y;
+			const uint16_t* src_row = slice + (4 * width * src_y);
+			float* dst_row = out + y * width * 4;
+
+			for (unsigned int x = 0; x < width; x++)
+			{
+				const uint16_t* texel = src_row + 4 * x;
+				vint4 texel_bits(texel[0], texel[1], texel[2], texel[3]);
+
+				vfloat4 texel_f32 = float16_to_float(texel_bits);
+				store(texel_f32, dst_row + 4 * x);
+			}
+		}
+	}
+	else // if (img->data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img->data_type == ASTCENC_TYPE_F32);
+		const float* slice = static_cast<const float*>(img->data[z_index]);
+
+		for (unsigned int y = 0; y < height; y++)
+		{
+			unsigned int src_y = y_flip ? height - y - 1 : y;
+			const float* src_row = slice + (4 * width * src_y);
+			float* dst_row = out + y * width * 4;
+
+			for (unsigned int x = 0; x < width; x++)
+			{
+				for (unsigned int c = 0; c < 4; c++)
+				{
+					dst_row[4 * x + c] = src_row[4 * x + c];
+				}
+			}
+		}
+	}
+
+	return out;
+}
+
+/* See header for documentation. */
+uint8_t* unorm8x4_array_from_astc_img(
+	const astcenc_image* img,
+	bool y_flip
+) {
+	const unsigned int width = img->dim_x;
+	const unsigned int height = img->dim_y;
+	uint8_t* out = new uint8_t[4 * width * height];
+
+	// Only the first slice of the image is converted
+	if (img->data_type == ASTCENC_TYPE_U8)
+	{
+		const uint8_t* slice = static_cast<const uint8_t*>(img->data[0]);
+
+		for (unsigned int y = 0; y < height; y++)
+		{
+			// When flipping, output row y reads from the mirrored source row
+			unsigned int src_y = y_flip ? height - y - 1 : y;
+			const uint8_t* src_row = slice + (4 * width * src_y);
+			uint8_t* dst_row = out + y * width * 4;
+
+			for (unsigned int x = 0; x < width; x++)
+			{
+				for (unsigned int c = 0; c < 4; c++)
+				{
+					dst_row[4 * x + c] = src_row[4 * x + c];
+				}
+			}
+		}
+	}
+	else if (img->data_type == ASTCENC_TYPE_F16)
+	{
+		const uint16_t* slice = static_cast<const uint16_t*>(img->data[0]);
+
+		for (unsigned int y = 0; y < height; y++)
+		{
+			unsigned int src_y = y_flip ? height - y - 1 : y;
+			const uint16_t* src_row = slice + (4 * width * src_y);
+			uint8_t* dst_row = out + y * width * 4;
+
+			for (unsigned int x = 0; x < width; x++)
+			{
+				const uint16_t* texel = src_row + 4 * x;
+				vint4 texel_bits(texel[0], texel[1], texel[2], texel[3]);
+
+				// Clamp to [0, 1] and scale up to the 8-bit range before rounding
+				vfloat4 texel_f32 = float16_to_float(texel_bits);
+				texel_f32 = clamp(0.0f, 1.0f, texel_f32) * 255.0f;
+
+				texel_bits = float_to_int_rtn(texel_f32);
+				pack_and_store_low_bytes(texel_bits, dst_row + 4 * x);
+			}
+		}
+	}
+	else // if (img->data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img->data_type == ASTCENC_TYPE_F32);
+		const float* slice = static_cast<const float*>(img->data[0]);
+
+		for (unsigned int y = 0; y < height; y++)
+		{
+			unsigned int src_y = y_flip ? height - y - 1 : y;
+			const float* src_row = slice + (4 * width * src_y);
+			uint8_t* dst_row = out + y * width * 4;
+
+			for (unsigned int x = 0; x < width; x++)
+			{
+				for (unsigned int c = 0; c < 4; c++)
+				{
+					float scaled = astc::clamp1f(src_row[4 * x + c]) * 255.0f;
+					dst_row[4 * x + c] = static_cast<uint8_t>(astc::flt2int_rtn(scaled));
+				}
+			}
+		}
+	}
+
+	return out;
+}
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for building the implementation of stb_image and tinyexr.
+ */
+
+#include <cstdlib>
+#include <cstdio>
+#include <fstream>
+#include <vector>
+
+#include "astcenccli_internal.h"
+
+// Configure the STB image write library build.
+#define STB_IMAGE_IMPLEMENTATION
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#define STBI_NO_GIF
+#define STBI_NO_PIC
+#define STBI_NO_PNM
+#define STBI_NO_PNG
+#define STBI_NO_PSD
+
+// Configure the TinyEXR library build.
+#define TINYEXR_IMPLEMENTATION
+
+// Configure the Wuffs library build.
+#define WUFFS_IMPLEMENTATION
+#define WUFFS_CONFIG__MODULES
+#define WUFFS_CONFIG__MODULE__ADLER32
+#define WUFFS_CONFIG__MODULE__BASE
+#define WUFFS_CONFIG__MODULE__CRC32
+#define WUFFS_CONFIG__MODULE__DEFLATE
+#define WUFFS_CONFIG__MODULE__PNG
+#define WUFFS_CONFIG__MODULE__ZLIB
+#include "wuffs-v0.3.c"
+
+// For both libraries force asserts (which can be triggered by corrupt input
+// images) to be handled at runtime in release builds to avoid security issues.
+#define STBI_ASSERT(x) astcenc_runtime_assert(x)
+#define TEXR_ASSERT(x) astcenc_runtime_assert(x)
+
+/**
+ * @brief Runtime replacement for the image loader asserts.
+ *
+ * A failed condition prints an error and terminates the process with exit
+ * code 1; a passing condition does nothing.
+ *
+ * @param condition   The condition that is expected to hold.
+ */
+static void astcenc_runtime_assert(bool condition)
+{
+	if (condition)
+	{
+		return;
+	}
+
+	print_error("ERROR: Corrupt input image\n");
+	exit(1);
+}
+
+#include "ThirdParty/stb_image.h"
+#include "ThirdParty/stb_image_write.h"
+#include "ThirdParty/tinyexr.h"
+
+/**
+ * @brief Load an image using Wuffs to provide the loader.
+ *
+ * The whole file is read into memory and decoded to 8-bit RGBA. All temporary
+ * allocations are released on every return path.
+ *
+ * @param      filename          The name of the file to load.
+ * @param      y_flip            Should the image be vertically flipped?
+ * @param[out] is_hdr            Is this an HDR image load? Always set to @c false.
+ * @param[out] component_count   The number of components in the data. Always set to 4.
+ *
+ * @return The loaded image data in a canonical 4 channel format, or @c nullptr on error.
+ */
+astcenc_image* load_png_with_wuffs(
+	const char* filename,
+	bool y_flip,
+	bool& is_hdr,
+	unsigned int& component_count
+) {
+	// Owns the C allocations so that every return path releases them
+	struct decode_resources
+	{
+		wuffs_png__decoder* dec { nullptr };
+		uint8_t* workbuf { nullptr };
+		uint8_t* pixbuf { nullptr };
+
+		~decode_resources()
+		{
+			free(pixbuf);
+			free(workbuf);
+			free(dec);
+		}
+	} res;
+
+	is_hdr = false;
+	component_count = 4;
+
+	// Open at the end of the file so that tellg() reports the file size
+	std::ifstream file(filename, std::ios::binary | std::ios::ate);
+	if (!file)
+	{
+		print_error("ERROR: Failed to load image %s (can't fopen)\n", filename);
+		return nullptr;
+	}
+
+	// tellg() reports failure as -1, which must not be used as a buffer size
+	std::streamsize size = file.tellg();
+	if (size < 0)
+	{
+		return nullptr;
+	}
+
+	file.seekg(0, std::ios::beg);
+
+	std::vector<uint8_t> buffer(static_cast<size_t>(size));
+	if (!file.read(reinterpret_cast<char*>(buffer.data()), size))
+	{
+		return nullptr;
+	}
+
+	res.dec = wuffs_png__decoder__alloc();
+	if (!res.dec)
+	{
+		return nullptr;
+	}
+
+	wuffs_base__image_config ic;
+	wuffs_base__io_buffer src = wuffs_base__ptr_u8__reader(buffer.data(), buffer.size(), true);
+	wuffs_base__status status = wuffs_png__decoder__decode_image_config(res.dec, &ic, &src);
+	if (status.repr)
+	{
+		return nullptr;
+	}
+
+	uint32_t dim_x = wuffs_base__pixel_config__width(&ic.pixcfg);
+	uint32_t dim_y = wuffs_base__pixel_config__height(&ic.pixcfg);
+
+	// Widen before multiplying; a 32-bit product would wrap before it could be
+	// range checked. The 4-component byte count must fit both size_t and the
+	// unsigned int arithmetic that astc_img_from_unorm8x4_array() indexes with.
+	uint64_t num_pixels = static_cast<uint64_t>(dim_x) * dim_y;
+	if (num_pixels > (SIZE_MAX / 4) || num_pixels > (UINT32_MAX / 4))
+	{
+		return nullptr;
+	}
+
+	size_t pixbuf_len = static_cast<size_t>(num_pixels) * 4;
+
+	// Override the image's native pixel format to be RGBA_NONPREMUL
+	wuffs_base__pixel_config__set(
+	    &ic.pixcfg,
+	    WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL,
+	    WUFFS_BASE__PIXEL_SUBSAMPLING__NONE,
+	    dim_x, dim_y);
+
+	// Configure the work buffer; range check the 64-bit length before it is
+	// narrowed to size_t
+	uint64_t workbuf_len64 = wuffs_png__decoder__workbuf_len(res.dec).max_incl;
+	if (workbuf_len64 > SIZE_MAX)
+	{
+		return nullptr;
+	}
+
+	size_t workbuf_len = static_cast<size_t>(workbuf_len64);
+
+	res.workbuf = static_cast<uint8_t*>(malloc(workbuf_len));
+	if (!res.workbuf)
+	{
+		return nullptr;
+	}
+
+	wuffs_base__slice_u8 workbuf_slice = wuffs_base__make_slice_u8(res.workbuf, workbuf_len);
+
+	res.pixbuf = static_cast<uint8_t*>(malloc(pixbuf_len));
+	if (!res.pixbuf)
+	{
+		return nullptr;
+	}
+
+	wuffs_base__slice_u8 pixbuf_slice = wuffs_base__make_slice_u8(res.pixbuf, pixbuf_len);
+
+	wuffs_base__pixel_buffer pb;
+	status = wuffs_base__pixel_buffer__set_from_slice(&pb, &ic.pixcfg, pixbuf_slice);
+	if (status.repr)
+	{
+		return nullptr;
+	}
+
+	// Decode the pixels
+	status = wuffs_png__decoder__decode_frame(res.dec, &pb, &src, WUFFS_BASE__PIXEL_BLEND__SRC, workbuf_slice, NULL);
+	if (status.repr)
+	{
+		return nullptr;
+	}
+
+	// The decoded pixels are copied into the image, so the temporary buffers
+	// can be released when res goes out of scope
+	return astc_img_from_unorm8x4_array(res.pixbuf, dim_x, dim_y, y_flip);
+}
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions and data declarations.
+ */
+
+#ifndef ASTCENCCLI_INTERNAL_INCLUDED
+#define ASTCENCCLI_INTERNAL_INCLUDED
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+#include "astcenc.h"
+#include "astcenc_mathlib.h"
+
+/**
+ * @brief The payload stored in a compressed ASTC image.
+ *
+ * Groups the block footprint, the image extent, and the raw compressed bytes
+ * together with their length.
+ *
+ * NOTE(review): ownership of @c data is not established by this declaration;
+ * confirm against the load and store functions that fill it in.
+ */
+struct astc_compressed_image
+{
+	/** @brief The block width in texels. */
+	unsigned int block_x;
+
+	/** @brief The block height in texels. */
+	unsigned int block_y;
+
+	/** @brief The block depth in texels. */
+	unsigned int block_z;
+
+	/** @brief The image width in texels. */
+	unsigned int dim_x;
+
+	/** @brief The image height in texels. */
+	unsigned int dim_y;
+
+	/** @brief The image depth in texels. */
+	unsigned int dim_z;
+
+	/** @brief The binary data payload. */
+	uint8_t* data;
+
+	/** @brief The binary data length in bytes. */
+	size_t data_len;
+};
+
+/**
+ * @brief Config options that have been read from command line.
+ *
+ * A plain aggregate of settings; no defaults are assigned by this declaration.
+ */
+struct cli_config_options
+{
+	/** @brief The number of threads to use for processing. */
+	unsigned int thread_count;
+
+	/** @brief The number of repeats to execute for benchmarking. */
+	unsigned int repeat_count;
+
+	/** @brief The number of image slices to load for a 3D image. */
+	unsigned int array_size;
+
+	/** @brief @c true if running in silent mode with minimal output. */
+	bool silentmode;
+
+	/** @brief @c true if the images should be y-flipped. */
+	bool y_flip;
+
+	/** @brief @c true if diagnostic images should be stored. */
+	bool diagnostic_images;
+
+	/** @brief The low exposure fstop for error computation. */
+	int low_fstop;
+
+	/** @brief The high exposure fstop for error computation. */
+	int high_fstop;
+
+	/** @brief The pre-encode swizzle. */
+	astcenc_swizzle swz_encode;
+
+	/** @brief The post-decode swizzle. */
+	astcenc_swizzle swz_decode;
+};
+
+/**
+ * @brief Write a plain message to stderr.
+ *
+ * The message is emitted verbatim; it is never interpreted as a format string.
+ *
+ * @param format   The message text to print.
+ */
+static inline void print_error(
+	const char* format
+) {
+	fputs(format, stderr);
+}
+
+/**
+ * @brief Print a formatted string to stderr.
+ *
+ * The arguments are forwarded unchanged to @c fprintf, so @c format must be a
+ * printf-style format string whose conversions match @c args.
+ *
+ * @param format   The printf-style format string.
+ * @param args     The values consumed by the format string.
+ */
+template<typename ... Args>
+static inline void print_error(
+	const char* format,
+	Args...args
+) {
+	// The pack was previously named with a leading underscore and capital
+	// letter, which is an identifier reserved for the implementation
+	fprintf(stderr, format, args...);
+}
+
+/**
+ * @brief Load uncompressed image.
+ *
+ * @param filename               The file path on disk.
+ * @param y_flip                 Should this image be Y flipped?
+ * @param[out] is_hdr            Is the loaded image HDR?
+ * @param[out] component_count   The number of components in the loaded image.
+ *
+ * @return The astc image file, or nullptr on error.
+ */
+astcenc_image* load_ncimage(
+	const char* filename,
+	bool y_flip,
+	bool& is_hdr,
+	unsigned int& component_count);
+
+/**
+ * @brief Load uncompressed PNG image.
+ *
+ * @param filename               The file path on disk.
+ * @param y_flip                 Should this image be Y flipped?
+ * @param[out] is_hdr            Is the loaded image HDR?
+ * @param[out] component_count   The number of components in the loaded image.
+ *
+ * @return The astc image file, or nullptr on error.
+ */
+astcenc_image* load_png_with_wuffs(
+	const char* filename,
+	bool y_flip,
+	bool& is_hdr,
+	unsigned int& component_count);
+
+/**
+ * @brief Save an uncompressed image.
+ *
+ * @param img        The source data for the image.
+ * @param filename   The name of the file to save.
+ * @param y_flip     Should the image be vertically flipped?
+ *
+ * @return @c true if the image saved OK, @c false on error.
+ */
+bool store_ncimage(
+	const astcenc_image* img,
+	const char* filename,
+	int y_flip);
+
+/**
+ * @brief Check if the output file type requires a specific bitness.
+ *
+ * @param filename The file name, containing the extension to check.
+ *
+ * @return Valid values are:
+ *     * -1 - error - unknown file type.
+ *     *  0 - no enforced bitness.
+ *     *  8 - enforced 8-bit UNORM.
+ *     * 16 - enforced 16-bit FP16.
+ */
+int get_output_filename_enforced_bitness(
+	const char* filename);
+
+/**
+ * @brief Allocate a new image in a canonical format.
+ *
+ * Allocated images must be freed with a @c free_image() call.
+ *
+ * @param bitness   The number of bits per component (8, 16, or 32).
+ * @param dim_x     The width of the image, in texels.
+ * @param dim_y     The height of the image, in texels.
+ * @param dim_z     The depth of the image, in texels.
+ *
+ * @return The allocated image, or @c nullptr on error.
+ */
+astcenc_image* alloc_image(
+	unsigned int bitness,
+	unsigned int dim_x,
+	unsigned int dim_y,
+	unsigned int dim_z);
+
+/**
+ * @brief Free an image.
+ *
+ * @param img   The image to free.
+ */
+void free_image(
+	astcenc_image* img);
+
+/**
+ * @brief Determine the number of active components in an image.
+ *
+ * @param img   The image to analyze.
+ *
+ * @return The number of active components in the image.
+ */
+int determine_image_components(
+	const astcenc_image* img);
+
+/**
+ * @brief Load a compressed .astc image.
+ *
+ * @param filename   The file to load.
+ * @param img        The image to populate with loaded data.
+ *
+ * @return Non-zero on error, zero on success.
+ */
+int load_cimage(
+	const char* filename,
+	astc_compressed_image& img);
+
+/**
+ * @brief Store a compressed .astc image.
+ *
+ * @param img        The image to store.
+ * @param filename   The file to save.
+ *
+ * @return Non-zero on error, zero on success.
+ */
+int store_cimage(
+	const astc_compressed_image& img,
+	const char* filename);
+
+/**
+ * @brief Load a compressed .ktx image.
+ *
+ * @param filename   The file to load.
+ * @param is_srgb    Is this an sRGB encoded file?
+ * @param img        The image to populate with loaded data.
+ *
+ * @return Non-zero on error, zero on success.
+ */
+bool load_ktx_compressed_image(
+	const char* filename,
+	bool& is_srgb,
+	astc_compressed_image& img) ;
+
+/**
+ * @brief Store a compressed .ktx image.
+ *
+ * @param img        The image to store.
+ * @param filename   The file to store.
+ * @param is_srgb    Is this an sRGB encoded file?
+ *
+ * @return Non-zero on error, zero on success.
+ */
+bool store_ktx_compressed_image(
+	const astc_compressed_image& img,
+	const char* filename,
+	bool is_srgb);
+
+/**
+ * @brief Create an image from a 2D float data array.
+ *
+ * @param data     The raw input data.
+ * @param dim_x    The width of the image, in texels.
+ * @param dim_y    The height of the image, in texels.
+ * @param y_flip   Should this image be vertically flipped?
+ *
+ * @return The populated image.
+ */
+astcenc_image* astc_img_from_floatx4_array(
+	const float* data,
+	unsigned int dim_x,
+	unsigned int dim_y,
+	bool y_flip);
+
+/**
+ * @brief Create an image from a 2D byte data array.
+ *
+ * @param data     The raw input data.
+ * @param dim_x    The width of the image, in texels.
+ * @param dim_y    The height of the image, in texels.
+ * @param y_flip   Should this image be vertically flipped?
+ *
+ * @return The populated image.
+ */
+astcenc_image* astc_img_from_unorm8x4_array(
+	const uint8_t* data,
+	unsigned int dim_x,
+	unsigned int dim_y,
+	bool y_flip);
+
+/**
+ * @brief Create a flattened RGBA FLOAT32 data array for a single slice from an image structure.
+ *
+ * The returned data array is allocated with @c new[] and must be freed with a @c delete[] call.
+ *
+ * @param img       The input image.
+ * @param y_flip    Should the data in the array be Y flipped?
+ * @param z_index   The slice index to convert.
+ *
+ * @return The data array.
+ */
+float* floatx4_array_from_astc_img(
+	const astcenc_image* img,
+	bool y_flip,
+	unsigned int z_index);
+
+/**
+ * @brief Create a flattened RGBA UNORM8 data array from an image structure.
+ *
+ * The returned data array is allocated with @c new[] and must be freed with a @c delete[] call.
+ *
+ * @param img      The input image.
+ * @param y_flip   Should the data in the array be Y flipped?
+ *
+ * @return The data array.
+ */
+uint8_t* unorm8x4_array_from_astc_img(
+	const astcenc_image* img,
+	bool y_flip);
+
+/* ============================================================================
+  Functions for printing build info and help messages
+============================================================================ */
+
+/**
+ * @brief Print the tool copyright and version header to stdout.
+ */
+void astcenc_print_header();
+
+/**
+ * @brief Print the tool copyright, version, and short-form help to stdout.
+ */
+void astcenc_print_shorthelp();
+
+/**
+ * @brief Print the tool copyright, version, and long-form help to stdout.
+ */
+void astcenc_print_longhelp();
+
+/**
+ * @brief Compute error metrics comparing two images.
+ *
+ * @param compute_hdr_metrics      True if HDR metrics should be computed.
+ * @param compute_normal_metrics   True if normal map metrics should be computed.
+ * @param input_components         The number of input color components.
+ * @param img1                     The original image.
+ * @param img2                     The compressed image.
+ * @param fstop_lo                 The low exposure fstop (HDR only).
+ * @param fstop_hi                 The high exposure fstop (HDR only).
+ */
+void compute_error_metrics(
+	bool compute_hdr_metrics,
+	bool compute_normal_metrics,
+	int input_components,
+	const astcenc_image* img1,
+	const astcenc_image* img2,
+	int fstop_lo,
+	int fstop_hi);
+
+/**
+ * @brief Get the current time.
+ *
+ * @return The current time in seconds since arbitrary epoch.
+ */
+double get_time();
+
+/**
+ * @brief Get the number of CPU cores.
+ *
+ * @return The number of online or onlineable CPU cores in the system.
+ */
+int get_cpu_count();
+
+/**
+ * @brief Launch N worker threads and wait for them to complete.
+ *
+ * All threads run the same thread function, and have the same thread payload, but are given a
+ * unique thread ID (0 .. N-1) as a parameter to the run function to allow thread-specific behavior.
+ *
+ * @param operation      The name of the operation for this async task.
+ * @param thread_count   The number of threads to spawn.
+ * @param func           The function to execute. Must have the signature:
+ *                       void (int thread_count, int thread_id, void* payload)
+ * @param payload        Pointer to an opaque thread payload object.
+ */
+void launch_threads(
+	const char* operation,
+	int thread_count,
+	void (*func)(int, int, void*),
+	void *payload);
+
+/**
+ * @brief Set the current thread name to a string value.
+ *
+ * For portability strings should be no longer than 16 characters.
+ *
+ * @param name   The thread name.
+ */
+void set_thread_name(
+	const char* name);
+
+/**
+ * @brief The main entry point.
+ *
+ * @param argc   The number of arguments.
+ * @param argv   The vector of arguments.
+ *
+ * @return 0 on success, non-zero otherwise.
+ */
+int astcenc_main(
+	int argc,
+	char **argv);
+
+#endif
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2024 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Platform-specific function implementations.
+ *
+ * This module contains functions with strongly OS-dependent implementations:
+ *
+ *  * CPU count queries
+ *  * Threading
+ *  * Time
+ *
+ * In addition to the basic thread abstraction (which is native pthreads on
+ * all platforms, except Windows where it is an emulation of pthreads), a
+ * utility function to create N threads and wait for them to complete a batch
+ * task has also been provided.
+ */
+
+#include "astcenccli_internal.h"
+
+/* ============================================================================
+   Platform code for Windows using the Win32 APIs.
+============================================================================ */
+#if defined(_WIN32) && !defined(__CYGWIN__)
+
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <Processthreadsapi.h>
+#include <algorithm>
+#include <cstring>
+
+/** @brief Alias pthread_t to one of the internal Windows types. */
+typedef HANDLE pthread_t;
+
+/** @brief Alias pthread_attr_t to one of the internal Windows types. */
+typedef int pthread_attr_t;
+
+/**
+ * @brief Minimal pthreads-style thread creation built on Windows @c CreateThread.
+ *
+ * @param[out] thread   Receives the handle returned by @c CreateThread.
+ * @param attribs       Ignored; present only for signature compatibility.
+ * @param threadfunc    The thread entry point.
+ * @param thread_arg    The argument passed to the entry point.
+ *
+ * @return 0 if the thread was created, 1 otherwise.
+ */
+static int pthread_create(
+	pthread_t* thread,
+	const pthread_attr_t* attribs,
+	void* (*threadfunc)(void*),
+	void* thread_arg
+) {
+	static_cast<void>(attribs);
+
+	// Adapt the pthreads-style entry point to the type CreateThread expects
+	LPTHREAD_START_ROUTINE entry = reinterpret_cast<LPTHREAD_START_ROUTINE>(threadfunc);
+	HANDLE handle = CreateThread(nullptr, 0, entry, thread_arg, 0, nullptr);
+	*thread = handle;
+
+	// A null handle means creation failed; map that to a non-zero error code
+	return handle == NULL ? 1 : 0;
+}
+
+/**
+ * @brief Manually set CPU group and thread affinity.
+ *
+ * This is needed on Windows 10 or older to allow benefit from large core count
+ * systems with more than 64 logical CPUs. The assignment is skipped on systems
+ * with a single processor group, as it is not necessary.
+ *
+ * Threads are mapped to groups in index order: the thread index (wrapped to
+ * the CPU count) selects the first group whose cumulative CPU count exceeds
+ * it, and the thread is allowed to run on every CPU slot of that group.
+ *
+ * @param thread         The handle of the thread to assign.
+ * @param thread_index   The index of the thread in the thread pool.
+ */
+static void set_group_affinity(
+	pthread_t thread,
+	int thread_index
+) {
+	// Skip thread assignment for hardware with a single CPU group, or if the
+	// group count could not be queried
+	int group_count = GetActiveProcessorGroupCount();
+	if (group_count <= 1)
+	{
+		return;
+	}
+
+	// Without a usable CPU count the modulo below would divide by zero
+	int cpu_count = get_cpu_count();
+	if (cpu_count <= 0)
+	{
+		return;
+	}
+
+	// Ensure we have a valid assign if user creates more threads than cores
+	int assign_index = thread_index % cpu_count;
+	int assign_group { 0 };
+	int assign_group_cpu_count { 0 };
+
+	// Determine which core group and core in the group to use for this thread
+	int group_cpu_count_sum { 0 };
+	for (int group = 0; group < group_count; group++)
+	{
+		int group_cpu_count = static_cast<int>(GetMaximumProcessorCount(static_cast<WORD>(group)));
+		group_cpu_count_sum += group_cpu_count;
+
+		if (assign_index < group_cpu_count_sum)
+		{
+			assign_group = group;
+			assign_group_cpu_count = group_cpu_count;
+			break;
+		}
+	}
+
+	// Set the affinity to the assigned group, and all supported cores. The
+	// mask is built at KAFFINITY width, with a full group handled separately,
+	// because shifting a plain int overflows for groups of 31 or more CPUs
+	// and shifting by the full type width is undefined.
+	constexpr int mask_bits = static_cast<int>(sizeof(KAFFINITY) * 8);
+
+	GROUP_AFFINITY affinity {};
+	if (assign_group_cpu_count >= mask_bits)
+	{
+		affinity.Mask = ~static_cast<KAFFINITY>(0);
+	}
+	else
+	{
+		affinity.Mask = (static_cast<KAFFINITY>(1) << assign_group_cpu_count) - 1;
+	}
+
+	affinity.Group = static_cast<WORD>(assign_group);
+	SetThreadGroupAffinity(thread, &affinity, nullptr);
+}
+
+/**
+ * @brief Proxy Windows @c WaitForSingleObject underneath a pthreads-like wrapper.
+ *
+ * Blocks until the thread has finished, then closes its handle so that the
+ * handle is not leaked. As with a real @c pthread_join, the thread handle must
+ * not be used again after this call.
+ *
+ * @param thread   The handle of the thread to wait for.
+ * @param value    Ignored; the thread return value is not retrieved.
+ *
+ * @return Always 0.
+ */
+static int pthread_join(
+	pthread_t thread,
+	void** value
+) {
+	static_cast<void>(value);
+	WaitForSingleObject(thread, INFINITE);
+
+	// The handle from CreateThread stays open until explicitly closed
+	CloseHandle(thread);
+	return 0;
+}
+
+/* See header for documentation */
+int get_cpu_count()
+{
+	// Count the active logical processors across every processor group
+	return static_cast<int>(GetActiveProcessorCount(ALL_PROCESSOR_GROUPS));
+}
+
+/* See header for documentation */
+double get_time()
+{
+	FILETIME now;
+	GetSystemTimePreciseAsFileTime(&now);
+
+	// Join the two 32-bit halves of the timestamp into one 64-bit tick count
+	unsigned long long high_part = now.dwHighDateTime;
+	unsigned long long low_part = now.dwLowDateTime;
+	unsigned long long ticks = (high_part << 32) | low_part;
+
+	// Scale the tick count down to seconds
+	return static_cast<double>(ticks) / 1.0e7;
+}
+
+/* See header for documentation */
+void set_thread_name(
+	const char* name
+) {
+	// Names are clamped to 15 characters plus the terminating null
+	wchar_t wide_name[16] { 0 };
+	size_t copy_len = std::min<size_t>(std::strlen(name), 15);
+
+	// Only basic 7-bit ASCII is expected, so a per-character widen is enough
+	size_t pos = 0;
+	while (pos < copy_len)
+	{
+		wide_name[pos] = static_cast<wchar_t>(name[pos]);
+		pos++;
+	}
+
+	SetThreadDescription(GetCurrentThread(), wide_name);
+}
+
+/* ============================================================================
+   Platform code for a platform using POSIX APIs.
+============================================================================ */
+#else
+
+#include <pthread.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+/* See header for documentation */
+int get_cpu_count()
+{
+	// sysconf() returns -1 when the value cannot be determined; report a
+	// single core in that case so callers never see a non-positive count
+	long cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
+	if (cpu_count < 1)
+	{
+		return 1;
+	}
+
+	return static_cast<int>(cpu_count);
+}
+
+/* See header for documentation */
+double get_time()
+{
+	timeval now;
+	gettimeofday(&now, nullptr);
+
+	// Whole seconds plus the microsecond remainder scaled to seconds
+	double whole_seconds = static_cast<double>(now.tv_sec);
+	double micro_seconds = static_cast<double>(now.tv_usec);
+	return whole_seconds + micro_seconds * 1.0e-6;
+}
+
+/* See header for documentation */
+void set_thread_name(
+	const char* name
+) {
+	// Thread naming is non-standard; only use the variants we know about
+#if defined(__APPLE__)
+	// The Apple variant names the calling thread and takes no thread handle
+	pthread_setname_np(name);
+#elif defined(__linux__)
+	pthread_setname_np(pthread_self(), name);
+#else
+	// No known naming API on this platform, so the name is unused
+	static_cast<void>(name);
+#endif
+}
+
+#endif
+
+/**
+ * @brief Worker thread helper payload for launch_threads.
+ *
+ * One descriptor is filled in per worker thread and passed to the thread entry
+ * point, which uses it to call the user function with its pool size, index,
+ * and payload.
+ */
+struct launch_desc
+{
+	/** @brief The native thread handle. */
+	pthread_t thread_handle;
+	/** @brief The total number of threads in the thread pool. */
+	int thread_count;
+	/** @brief The thread index in the thread pool. */
+	int thread_id;
+	/** @brief The user thread function to execute. */
+	void (*func)(int, int, void*);
+	/** @brief The user thread payload. */
+	void* payload;
+};
+
+/**
+ * @brief Thread entry point adapter used by launch_threads.
+ *
+ * Unpacks the launch descriptor and calls the user function with the pool
+ * size, this thread's index in the pool, and the user payload.
+ *
+ * @param p The thread launch helper payload; must point to a launch_desc.
+ *
+ * @return Always @c nullptr.
+ */
+static void* launch_threads_helper(
+	void *p
+) {
+	const launch_desc& desc = *static_cast<launch_desc*>(p);
+	desc.func(desc.thread_count, desc.thread_id, desc.payload);
+	return nullptr;
+}
+
+/* See header for documentation */
+void launch_threads(
+	const char* operation,
+	int thread_count,
+	void (*func)(int, int, void*),
+	void *payload
+) {
+	// A single threaded workload needs no workers; run it on this thread
+	if (thread_count <= 1)
+	{
+		func(1, 0, payload);
+		return;
+	}
+
+	// Otherwise try to spawn one worker per requested thread
+	launch_desc *descs = new launch_desc[thread_count];
+	int spawned { 0 };
+
+	for (int attempt = 0; attempt < thread_count; attempt++)
+	{
+		// Workers are told the requested pool size, and are numbered by the
+		// count of threads successfully created so far
+		launch_desc& desc = descs[spawned];
+		desc.thread_count = thread_count;
+		desc.thread_id = spawned;
+		desc.payload = payload;
+		desc.func = func;
+
+		int error = pthread_create(
+			&desc.thread_handle,
+			nullptr,
+			launch_threads_helper,
+			reinterpret_cast<void*>(&desc));
+
+		// A failed creation leaves this slot free for the next attempt, so
+		// we simply end up using fewer threads
+		if (error)
+		{
+			continue;
+		}
+
+		// Windows needs explicit thread assignment to handle large core count systems
+		#if defined(_WIN32) && !defined(__CYGWIN__)
+			set_group_affinity(desc.thread_handle, spawned);
+		#endif
+
+		spawned++;
+	}
+
+	// Warn if fewer threads than requested are doing the work
+	if (spawned != thread_count)
+	{
+		int log_count = spawned == 0 ? 1 : spawned;
+		printf("WARNING: %s using %d thread%s due to thread creation error\n\n",
+		       operation, log_count, log_count == 1 ? "" : "s");
+	}
+
+	if (spawned == 0)
+	{
+		// Nothing could be spawned, so fall back to using this thread
+		func(1, 0, payload);
+	}
+	else
+	{
+		// Wait for every worker that was started to complete
+		for (int worker = 0; worker < spawned; worker++)
+		{
+			pthread_join(descs[worker].thread_handle, nullptr);
+		}
+	}
+
+	delete[] descs;
+}
--- a/Show More
+++ b/Show More