This commit is contained in:
2026-06-14 19:09:18 +01:00
parent 14bd1a9271
commit 13fa90a0e9
3958 changed files with 999286 additions and 4 deletions
+7
View File
@@ -0,0 +1,7 @@
# Copyright 2024 The Khronos Group Inc.
# SPDX-License-Identifier: Apache-2.0
---
# Disable clang-format in this directory
DisableFormat: true
SortIncludes: false
...
+12
View File
@@ -0,0 +1,12 @@
<!-- Copyright 2025 Mark Callow -->
<!-- SPDX-License-Identifier: Apache-2.0 -->
SDL_gesture.h
-------------
The Gesture API was removed from SDL3. As a migration path, the SDL developers provide an equivalent single-header library, `SDL_gesture.h`, that can be dropped into an SDL3-based project.
They do not make formal releases of this code; they say "just grab the latest and drop it into your project!"
This copy of the file comes from the fork https://github.com/MarkCallow/SDL_gesture.git, whose upstream is
https://github.com/libsdl-org/SDL_gesture. The fork adds robustness modifications that prevent spurious GESTURE\_MULTIGESTURE events from being produced.
+966
View File
@@ -0,0 +1,966 @@
/*
Simple DirectMedia Layer
Copyright (C) 1997-2022 Sam Lantinga <slouken@libsdl.org>
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
/* Touch gestures were removed from SDL3, so this is the SDL2 implementation copied in here, and tweaked a little. */
#ifndef INCL_SDL_GESTURE_H
#define INCL_SDL_GESTURE_H
#if !defined(SDL_MAJOR_VERSION)
#error Please include SDL.h before including this header.
#elif SDL_MAJOR_VERSION < 2
#error This header requires SDL2 or later.
#elif SDL_MAJOR_VERSION == 2
/* building against SDL2? Just use the built-in SDL2 implementation. */
#define Gesture_Init() (0)
#define Gesture_Quit()
#define Gesture_ID SDL_GestureID
#define Gesture_LoadDollarTemplates SDL_LoadDollarTemplates
#define Gesture_RecordGesture SDL_RecordGesture
#define Gesture_SaveAllDollarTemplates SDL_SaveAllDollarTemplates
#define Gesture_SaveDollarTemplate SDL_SaveDollarTemplate
#define GESTURE_DOLLARGESTURE SDL_DOLLARGESTURE
#define GESTURE_DOLLARRECORD SDL_DOLLARRECORD
#define GESTURE_MULTIGESTURE SDL_MULTIGESTURE
#define Gesture_MultiGestureEvent SDL_MultiGestureEvent
#define Gesture_DollarGestureEvent SDL_DollarGestureEvent
#else
#include <cmath>
/* Set up for C function definitions, even when using C++ */
#ifdef __cplusplus
extern "C" {
#endif
typedef Sint64 Gesture_ID;
/* events... */
/* generally you shouldn't hardcode event type numbers--and doubly so in
the reserved range!--but these match SDL2 and SDL3 promises to preserve
these values to help sdl2-compat. */
#define GESTURE_DOLLARGESTURE 0x800
#define GESTURE_DOLLARRECORD 0x801
#define GESTURE_MULTIGESTURE 0x802
/* Event payload for GESTURE_MULTIGESTURE. GestureSendMulti() fills one of
   these in and hands it to SDL_PushEvent(). */
typedef struct Gesture_MultiGestureEvent
{
Uint32 type; /* GESTURE_MULTIGESTURE */
Uint32 reserved; /* carries no gesture data */
Uint64 timestamp; /* the sender stores 0 here */
SDL_TouchID touchID; /* touch device the gesture was seen on */
float dTheta; /* rotation angle computed for the motion event (an SDL_atan2f() result) */
float dDist; /* change in the finger's distance from the centroid for the motion event */
float x; /* x of the tracked finger centroid when the event was sent */
float y; /* y of the tracked finger centroid when the event was sent */
Uint16 numFingers; /* number of fingers tracked as down on the device */
Uint16 padding; /* carries no gesture data */
} Gesture_MultiGestureEvent;
/* Event payload shared by GESTURE_DOLLARGESTURE (a stroke was matched
   against a template) and GESTURE_DOLLARRECORD (a recording finished). */
typedef struct Gesture_DollarGestureEvent
{
Uint32 type; /* GESTURE_DOLLARGESTURE or GESTURE_DOLLARRECORD */
Uint32 reserved; /* carries no gesture data */
Uint64 timestamp; /* the senders store 0 here */
SDL_TouchID touchID; /* touch device the stroke was seen on */
Gesture_ID gestureId; /* hash of the matched/recorded template; -1 is sent when a recording could not be stored */
Uint32 numFingers; /* for DOLLARGESTURE: fingers still down plus the one that was just lifted */
float error; /* for DOLLARGESTURE: difference score of the best matching template */
float x; /* for DOLLARGESTURE: x of the tracked finger centroid */
float y; /* for DOLLARGESTURE: y of the tracked finger centroid */
} Gesture_DollarGestureEvent;
/* Function prototypes */
/**
* Call this once, AFTER SDL_Init, to set up the Gesture API.
*
* \returns 0 on success, -1 on error. Call SDL_GetError() for specifics.
*/
extern int SDLCALL Gesture_Init(void);
/**
* Call this once, BEFORE SDL_Quit, to clean up the Gesture API.
*/
extern void SDLCALL Gesture_Quit(void);
/**
* Begin recording a gesture on a specified touch device or all touch devices.
*
* If the parameter `touchID` is -1 (i.e., all devices), this function will
* always return 1, regardless of whether there actually are any devices.
*
* The recording is completed when a finger is lifted on a recording device;
* a GESTURE_DOLLARRECORD event is then pushed.
*
* \param touchID the touch device id, or -1 for all touch devices
* \returns 1 on success or 0 if the specified device could not be found.
*/
extern int SDLCALL Gesture_RecordGesture(SDL_TouchID touchID);
/**
* Save all currently loaded Dollar Gesture templates.
*
* \param dst a SDL_IOStream to save to
* \returns the number of saved templates on success or 0 on failure; call
* SDL_GetError() for more information.
*
* \since This function is available since SDL 2.0.0.
*
* \sa Gesture_LoadDollarTemplates
* \sa Gesture_SaveDollarTemplate
*/
extern int SDLCALL Gesture_SaveAllDollarTemplates(SDL_IOStream *dst);
/**
* Save a currently loaded Dollar Gesture template.
*
* \param gestureId a gesture id
* \param dst a SDL_IOStream to save to
* \returns 1 on success or 0 on failure; call SDL_GetError() for more
* information.
*
* \since This function is available since SDL 2.0.0.
*
* \sa Gesture_LoadDollarTemplates
* \sa Gesture_SaveAllDollarTemplates
*/
extern int SDLCALL Gesture_SaveDollarTemplate(Gesture_ID gestureId, SDL_IOStream *dst);
/**
* Load Dollar Gesture templates from a stream.
*
* \param touchID a touch id; ids below SDL_PEN_TOUCHID select a single
* device, any other value adds the templates to every known device
* \param src a SDL_IOStream to load from
* \returns the number of loaded templates on success or a negative error code
* (or 0) on failure; call SDL_GetError() for more information.
*
* \since This function is available since SDL 2.0.0.
*
* \sa Gesture_SaveAllDollarTemplates
* \sa Gesture_SaveDollarTemplate
*/
extern int SDLCALL Gesture_LoadDollarTemplates(SDL_TouchID touchID, SDL_IOStream *src);
/* Ends C function definitions when using C++ */
#ifdef __cplusplus
}
#endif
#if defined(SDL_GESTURE_IMPLEMENTATION)
/* Capacity of the raw point buffer in GestureDollarPath. */
#define GESTURE_MAX_DOLLAR_PATH_SIZE 1024
/* Number of points in a normalized gesture (and in every stored template). */
#define GESTURE_DOLLARNPOINTS 64
/* Factor the normalized coordinates are scaled by in GestureDollarNormalize(). */
#define GESTURE_DOLLARSIZE 256
/* Interval-splitting weight used by the angle search in
   GestureBestDollarDifference() (the golden-ratio conjugate). */
#define GESTURE_PHI 0.618033989
/* Raw stroke collected between a finger-down and a finger-up event. */
typedef struct
{
float length; /* running sum of the distances between consecutive points */
int numPoints; /* number of valid entries in p[] */
SDL_FPoint p[GESTURE_MAX_DOLLAR_PATH_SIZE]; /* recorded points; further points are dropped once full */
} GestureDollarPath;
/* One stored gesture template. */
typedef struct
{
SDL_FPoint path[GESTURE_DOLLARNPOINTS]; /* normalized points of the gesture */
Sint64 hash; /* GestureHashDollar() of path; doubles as the gesture id in events and lookups */
} GestureDollarTemplate;
/* Per-touch-device recognizer state; one entry per device in GestureTouches. */
typedef struct
{
SDL_TouchID touchID; /* device this entry belongs to */
SDL_FPoint centroid; /* tracked centroid of the fingers that are down */
GestureDollarPath dollarPath; /* stroke being collected for the current touch */
int numDownFingers; /* number of fingers currently considered down */
int numDollarTemplates; /* number of entries in dollarTemplate */
GestureDollarTemplate *dollarTemplate; /* heap array of templates, grown with SDL_realloc() */
bool recording; /* true while the next finger-up should store a new template */
} GestureTouch;
/* Heap array of known touch devices (grown by GestureAddTouch()). */
static GestureTouch *GestureTouches = NULL;
/* Number of valid entries in GestureTouches. */
static int GestureNumTouches = 0;
/* Set when a recording was requested for all devices at once. */
static bool GestureRecordAll = false;
/* Defined further down; declared here for GestureEventWatch(). */
static void GestureProcessEvent(const SDL_Event *event);
/* Event-watch callback installed by Gesture_Init(): passes every event to
   the gesture recognizer. The user-data pointer is not used. Always
   returns true. */
static bool SDLCALL GestureEventWatch(void *userdata, SDL_Event *event)
{
    (void)userdata; /* intentionally unused */
    GestureProcessEvent(event);
    return true;
}
/* Set up the Gesture API: drop any state left from a previous
   initialization, then register the event watch that feeds touch events to
   the recognizer.

   Returns 0 on success, or -1 if the event watch could not be registered
   (call SDL_GetError() for details). */
int Gesture_Init(void)
{
    Gesture_Quit(); /* makes repeated initialization safe */
    if (!SDL_AddEventWatch(GestureEventWatch, NULL)) {
        return -1; /* honour the documented error return instead of always reporting success */
    }
    return 0;
}
static GestureTouch *GestureAddTouch(const SDL_TouchID touchID)
{
GestureTouch *gestureTouch = (GestureTouch *)SDL_realloc(GestureTouches, (GestureNumTouches + 1) * sizeof(GestureTouch));
if (gestureTouch == NULL) {
SDL_OutOfMemory();
return NULL;
}
GestureTouches = gestureTouch;
SDL_zero(GestureTouches[GestureNumTouches]);
GestureTouches[GestureNumTouches].touchID = touchID;
return &GestureTouches[GestureNumTouches++];
}
/* Compiled out: removal of a touch device from the table. Kept for
   reference only; nothing in this file calls it. */
#if 0
/* Remove the entry for `touchID`, freeing its templates and moving the last
   entry into the vacated slot. Returns 0 on success, -1 if not found. */
static int GestureDelTouch(const SDL_TouchID touchID)
{
int i;
for (i = 0; i < GestureNumTouches; i++) {
if (GestureTouches[i].touchID == touchID) {
break;
}
}
if (i == GestureNumTouches) {
/* not found */
return -1;
}
SDL_free(GestureTouches[i].dollarTemplate);
SDL_zero(GestureTouches[i]);
GestureNumTouches--;
/* fill the hole with the last entry unless the hole was the last entry */
if (i != GestureNumTouches) {
SDL_copyp(&GestureTouches[i], &GestureTouches[GestureNumTouches]);
}
return 0;
}
#endif
static GestureTouch *GestureGetTouch(const SDL_TouchID touchID)
{
int i;
for (i = 0; i < GestureNumTouches; i++) {
/* printf("%i ?= %i\n",GestureTouches[i].touchID,touchID); */
if (GestureTouches[i].touchID == touchID) {
return &GestureTouches[i];
}
}
return NULL;
}
/* Start recording a gesture on one touch device, or on all of them when
   `touchID` is -1.

   The table of known devices is first synchronised with the devices SDL
   currently reports. Returns 1 on success, or 0 if a specific device was
   requested but is not known. */
int Gesture_RecordGesture(SDL_TouchID touchID)
{
    SDL_TouchID *devices;
    int i;

    devices = SDL_GetTouchDevices(NULL);
    if (devices) {
        /* make sure we know about all the devices SDL3 knows about, since we aren't connected as tightly as we were in SDL2. */
        for (i = 0; devices[i]; i++) {
            if (!GestureGetTouch(devices[i])) {
                GestureAddTouch(devices[i]); /* best effort; a failed add just leaves the device unknown */
            }
        }
        SDL_free(devices);
    }

    /* -1 is the documented "all devices" value. Testing for any non-zero id
       here would make it impossible to ever record on a single device. */
    if (touchID == (SDL_TouchID)-1) {
        GestureRecordAll = true;
        for (i = 0; i < GestureNumTouches; i++) {
            GestureTouches[i].recording = true;
        }
    } else {
        GestureTouch *touch = GestureGetTouch(touchID);
        if (!touch) {
            return 0; /* bogus touchid */
        }
        /* A single-device request must not inherit an earlier record-all
           request, otherwise the gesture would be stored on every device. */
        GestureRecordAll = false;
        touch->recording = true;
    }

    return 1;
}
/* Tear down the Gesture API: unregister the event watch and release all
   recognizer state, including every device's template array. Safe to call
   when nothing was initialized. */
void Gesture_Quit(void)
{
    int i;

    SDL_RemoveEventWatch(GestureEventWatch, NULL);

    /* Each device owns a separately allocated template array; free those
       before the table itself so they are not leaked. */
    for (i = 0; i < GestureNumTouches; i++) {
        SDL_free(GestureTouches[i].dollarTemplate);
    }
    SDL_free(GestureTouches);

    GestureTouches = NULL;
    GestureNumTouches = 0;
    GestureRecordAll = false;
}
/* Hash the GESTURE_DOLLARNPOINTS points of a normalized gesture.
   Starting from 5381, every coordinate (x then y of each point, truncated to
   unsigned long) is folded in as hash = hash * 33 + value.
   NOTE(review): coordinates are converted straight from float to unsigned
   long; confirm how negative coordinates are expected to behave here. */
static unsigned long GestureHashDollar(SDL_FPoint *points)
{
    unsigned long acc = 5381;
    const SDL_FPoint *pt = points;
    int remaining = GESTURE_DOLLARNPOINTS;

    while (remaining-- > 0) {
        acc = acc * 33 + (unsigned long)pt->x;
        acc = acc * 33 + (unsigned long)pt->y;
        pt++;
    }
    return acc;
}
/* Write the points of one template to `dst` as little-endian floats.
   The hash is not stored; it is recomputed when templates are loaded.
   Returns 1 when all bytes were written, 0 otherwise (including dst == NULL). */
static int GestureSaveTemplate(GestureDollarTemplate *templ, SDL_IOStream *dst)
{
    const size_t pathBytes = sizeof(templ->path[0]) * GESTURE_DOLLARNPOINTS;
    const void *payload;
#if SDL_BYTEORDER != SDL_LIL_ENDIAN
    GestureDollarTemplate swapped;
    int n;
#endif

    if (!dst) {
        return 0;
    }

#if SDL_BYTEORDER == SDL_LIL_ENDIAN
    /* already in file byte order */
    payload = templ->path;
#else
    /* byte-swap a private copy so the caller's template stays untouched */
    swapped = *templ;
    for (n = 0; n < GESTURE_DOLLARNPOINTS; n++) {
        swapped.path[n].x = SDL_SwapFloatLE(swapped.path[n].x);
        swapped.path[n].y = SDL_SwapFloatLE(swapped.path[n].y);
    }
    payload = swapped.path;
#endif

    if (SDL_WriteIO(dst, payload, pathBytes) != pathBytes) {
        return 0;
    }
    return 1;
}
/* Write every template of every known touch device to `dst`.
   Returns the number of templates that were written successfully. */
SDL_DECLSPEC int SDLCALL
Gesture_SaveAllDollarTemplates(SDL_IOStream *dst)
{
    int saved = 0;
    int dev;

    for (dev = 0; dev < GestureNumTouches; dev++) {
        GestureTouch *owner = &GestureTouches[dev];
        int t = 0;

        while (t < owner->numDollarTemplates) {
            saved += GestureSaveTemplate(&owner->dollarTemplate[t], dst);
            t++;
        }
    }
    return saved;
}
/* Write the first template whose hash equals `gestureId` to `dst`.
   Returns the result of GestureSaveTemplate() for that template; if no
   device holds such a template, sets an SDL error and returns the value of
   SDL_SetError(). */
SDL_DECLSPEC int SDLCALL
Gesture_SaveDollarTemplate(Gesture_ID gestureId, SDL_IOStream *dst)
{
    int dev;

    for (dev = 0; dev < GestureNumTouches; dev++) {
        GestureDollarTemplate *list = GestureTouches[dev].dollarTemplate;
        const int count = GestureTouches[dev].numDollarTemplates;
        int t;

        for (t = 0; t < count; t++) {
            if (list[t].hash != gestureId) {
                continue;
            }
            return GestureSaveTemplate(&list[t], dst);
        }
    }
    return SDL_SetError("Unknown gestureId");
}
/* Append `path` (an already sampled set of GESTURE_DOLLARNPOINTS points) as
   a new template of `inTouch` and compute its hash.
   Returns the index of the new template on success, or -1 if the template
   array could not be grown (the SDL error is set to out-of-memory). */
static int GestureAddDollar_one(GestureTouch *inTouch, SDL_FPoint *path)
{
    GestureDollarTemplate *dollarTemplate;
    GestureDollarTemplate *templ;
    int index;

    index = inTouch->numDollarTemplates;
    dollarTemplate = (GestureDollarTemplate *)SDL_realloc(inTouch->dollarTemplate, (index + 1) * sizeof(GestureDollarTemplate));
    if (dollarTemplate == NULL) {
        /* SDL_OutOfMemory() evaluates to false (0) in SDL3, which callers
           would mistake for "stored at index 0"; report -1 explicitly. */
        SDL_OutOfMemory();
        return -1;
    }
    inTouch->dollarTemplate = dollarTemplate;

    templ = &inTouch->dollarTemplate[index];
    SDL_memcpy(templ->path, path, GESTURE_DOLLARNPOINTS * sizeof(SDL_FPoint));
    templ->hash = GestureHashDollar(templ->path);
    inTouch->numDollarTemplates++;

    return index;
}
/* Store `path` as a template on `inTouch`, or on every known device when
   `inTouch` is NULL.
   Returns the index of the stored template (for the all-devices case, the
   index on the last device), or -1 on failure. In the all-devices case a
   failure stops the loop, so earlier devices may already hold the template. */
static int GestureAddDollar(GestureTouch *inTouch, SDL_FPoint *path)
{
    int index = -1;
    int i = 0;

    if (inTouch == NULL) {
        if (GestureNumTouches == 0) {
            /* SDL_SetError() evaluates to false (0) in SDL3, which callers
               would read as a valid index; report -1 explicitly. */
            SDL_SetError("no gesture touch devices registered");
            return -1;
        }
        for (i = 0; i < GestureNumTouches; i++) {
            inTouch = &GestureTouches[i];
            index = GestureAddDollar_one(inTouch, path);
            if (index < 0) {
                return -1;
            }
        }
        /* Use the index of the last one added. */
        return index;
    }
    return GestureAddDollar_one(inTouch, path);
}
/* Read templates from `src` until a complete template can no longer be
   read, storing each on one device or on every known device.

   Returns the number of templates loaded. Returns 0 when `src` is NULL, and
   the value of SDL_SetError() when the requested device is unknown or when
   not even one template could be read. */
SDL_DECLSPEC int SDLCALL
Gesture_LoadDollarTemplates(SDL_TouchID touchID, SDL_IOStream *src)
{
    /* In SDL2 this test was `touchID >= 0`, which draws gcc warnings now
       that SDL_TouchID is Uint64 (it was Sint64 in SDL2). The documentation
       does not say what a negative id means here; the only defined negative
       id was SDL_MOUSE_TOUCHID (-1), and SDL3 added SDL_PEN_TOUCHID (-2),
       hence the comparison against SDL_PEN_TOUCHID. Given the lack of
       documentation it is impossible to say if this updated test is
       correct. */
    const bool oneDevice = (touchID < SDL_PEN_TOUCHID);
    GestureTouch *target = NULL;
    int count = 0;
    int n;

    if (!src) {
        return 0;
    }

    if (oneDevice) {
        /* scan from the end so the highest-index match is the one kept */
        for (n = GestureNumTouches - 1; n >= 0; n--) {
            if (GestureTouches[n].touchID == touchID) {
                target = &GestureTouches[n];
                break;
            }
        }
        if (!target) {
            return SDL_SetError("given touch id not found");
        }
    }

    for (;;) {
        GestureDollarTemplate incoming;
        const size_t want = sizeof(incoming.path[0]) * GESTURE_DOLLARNPOINTS;

        if (SDL_ReadIO(src, incoming.path, want) < want) {
            if (count != 0) {
                break; /* end of data after at least one template */
            }
            return SDL_SetError("could not read any dollar gesture from rwops");
        }

#if SDL_BYTEORDER != SDL_LIL_ENDIAN
        /* the stream stores little-endian floats */
        for (n = 0; n < GESTURE_DOLLARNPOINTS; n++) {
            SDL_FPoint *pt = &incoming.path[n];
            pt->x = SDL_SwapFloatLE(pt->x);
            pt->y = SDL_SwapFloatLE(pt->y);
        }
#endif

        /* same single-device test as explained at the top of the function */
        if (oneDevice) {
            if (GestureAddDollar(target, incoming.path) >= 0) {
                count++;
            }
        } else {
            for (n = 0; n < GestureNumTouches; n++) {
                /* TODO: What if this fails? */
                GestureAddDollar(&GestureTouches[n], incoming.path);
            }
            count++;
        }
    }

    return count;
}
/* Average point-to-point distance between `points` rotated by `ang`
   (radians, about the origin) and `templ`. Both arrays hold
   GESTURE_DOLLARNPOINTS points. */
static float GestureDollarDifference(SDL_FPoint *points, SDL_FPoint *templ, float ang)
{
    /* The rotation is the same for every point, so evaluate the
       trigonometric functions once instead of four times per point. */
    const float cosAng = SDL_cosf(ang);
    const float sinAng = SDL_sinf(ang);
    float dist = 0;
    int i;

    for (i = 0; i < GESTURE_DOLLARNPOINTS; i++) {
        const float rx = points[i].x * cosAng - points[i].y * sinAng;
        const float ry = points[i].x * sinAng + points[i].y * cosAng;
        const float ex = rx - templ[i].x;
        const float ey = ry - templ[i].y;
        dist += SDL_sqrtf(ex * ex + ey * ey);
    }
    return dist / GESTURE_DOLLARNPOINTS;
}
/* Find the smallest GestureDollarDifference() between `points` and `templ`
   over rotations in [-pi/4, pi/4], using a golden-section style interval
   search that stops once the interval is no wider than pi/90.
   Returns the smaller of the two difference values held when the search
   ends. */
static float GestureBestDollarDifference(SDL_FPoint *points, SDL_FPoint *templ)
{
/*------------BEGIN DOLLAR BLACKBOX------------------
-TRANSLATED DIRECTLY FROM PSEUDO-CODE AVAILABLE AT-
-"http://depts.washington.edu/aimgroup/proj/dollar/"
*/
/* search interval bounds and the width at which the search stops */
double ta = -SDL_PI_D / 4;
double tb = SDL_PI_D / 4;
double dt = SDL_PI_D / 90;
/* two probe angles inside the interval and the difference at each */
float x1 = (float)(GESTURE_PHI * ta + (1 - GESTURE_PHI) * tb);
float f1 = GestureDollarDifference(points, templ, x1);
float x2 = (float)((1 - GESTURE_PHI) * ta + GESTURE_PHI * tb);
float f2 = GestureDollarDifference(points, templ, x2);
while (SDL_fabs(ta - tb) > dt) {
if (f1 < f2) {
/* minimum lies in [ta, x2]: shrink from the right and reuse x1 as the new x2 */
tb = x2;
x2 = x1;
f2 = f1;
x1 = (float)(GESTURE_PHI * ta + (1 - GESTURE_PHI) * tb);
f1 = GestureDollarDifference(points, templ, x1);
} else {
/* minimum lies in [x1, tb]: shrink from the left and reuse x2 as the new x1 */
ta = x1;
x1 = x2;
f1 = f2;
x2 = (float)((1 - GESTURE_PHI) * ta + GESTURE_PHI * tb);
f2 = GestureDollarDifference(points, templ, x2);
}
}
/*
if (f1 <= f2)
printf("Min angle (x1): %f\n",x1);
else if (f1 > f2)
printf("Min angle (x2): %f\n",x2);
*/
return SDL_min(f1, f2);
}
/* Turn a raw stroke into a normalized gesture.

   `path` contains raw points, plus (possibly) the calculated length; the
   length is computed here when it is not positive. The stroke is resampled
   to GESTURE_DOLLARNPOINTS points, rotated about the centroid so that the
   first point lies to the left of it, then translated to the origin and
   scaled by GESTURE_DOLLARSIZE relative to its bounding box.

   `points` must have room for GESTURE_DOLLARNPOINTS entries.

   Returns GESTURE_DOLLARNPOINTS on success, or 0 if the stroke produced too
   few resampled points (in which case `points` is not a valid gesture; an
   SDL error is set only when `is_recording` is true). */
static int GestureDollarNormalize(const GestureDollarPath *path, SDL_FPoint *points, bool is_recording)
{
    int i;
    float interval;
    float dist;
    int numPoints = 0;
    SDL_FPoint centroid;
    float xmin, xmax, ymin, ymax;
    float ang;
    float w, h;
    float length = path->length;

    /* Calculate length if it hasn't already been done */
    if (length <= 0) {
        for (i = 1; i < path->numPoints; i++) {
            const float dx = path->p[i].x - path->p[i - 1].x;
            const float dy = path->p[i].y - path->p[i - 1].y;
            length += SDL_sqrtf(dx * dx + dy * dy);
        }
    }

    /* Resample */
    interval = length / (GESTURE_DOLLARNPOINTS - 1);
    dist = interval;

    centroid.x = 0;
    centroid.y = 0;

    for (i = 1; i < path->numPoints; i++) {
        const float d = SDL_sqrtf((path->p[i - 1].x - path->p[i].x) * (path->p[i - 1].x - path->p[i].x) + (path->p[i - 1].y - path->p[i].y) * (path->p[i - 1].y - path->p[i].y));
        /* The numPoints bound keeps accumulated rounding error (or an
           inconsistent stored length) from writing past the end of the
           caller's GESTURE_DOLLARNPOINTS-sized array. */
        while (dist + d > interval && numPoints < GESTURE_DOLLARNPOINTS) {
            points[numPoints].x = path->p[i - 1].x +
                                  ((interval - dist) / d) * (path->p[i].x - path->p[i - 1].x);
            points[numPoints].y = path->p[i - 1].y +
                                  ((interval - dist) / d) * (path->p[i].y - path->p[i - 1].y);
            centroid.x += points[numPoints].x;
            centroid.y += points[numPoints].y;
            numPoints++;

            dist -= interval;
        }
        dist += d;
    }
    if (numPoints < GESTURE_DOLLARNPOINTS - 1) {
        if (is_recording) {
            SDL_SetError("ERROR: NumPoints = %i", numPoints);
        }
        return 0;
    }
    /* copy the last point */
    points[GESTURE_DOLLARNPOINTS - 1] = path->p[path->numPoints - 1];
    numPoints = GESTURE_DOLLARNPOINTS;

    centroid.x /= numPoints;
    centroid.y /= numPoints;

    /* Rotate Points so point 0 is left of centroid and solve for the bounding box */
    xmin = centroid.x;
    xmax = centroid.x;
    ymin = centroid.y;
    ymax = centroid.y;

    ang = SDL_atan2f(centroid.y - points[0].y, centroid.x - points[0].x);

    for (i = 0; i < numPoints; i++) {
        const float px = points[i].x;
        const float py = points[i].y;
        points[i].x = (px - centroid.x) * SDL_cosf(ang) - (py - centroid.y) * SDL_sinf(ang) + centroid.x;
        points[i].y = (px - centroid.x) * SDL_sinf(ang) + (py - centroid.y) * SDL_cosf(ang) + centroid.y;

        if (points[i].x < xmin) {
            xmin = points[i].x;
        }
        if (points[i].x > xmax) {
            xmax = points[i].x;
        }
        if (points[i].y < ymin) {
            ymin = points[i].y;
        }
        if (points[i].y > ymax) {
            ymax = points[i].y;
        }
    }

    /* Scale points to GESTURE_DOLLARSIZE, and translate to the origin */
    w = xmax - xmin;
    h = ymax - ymin;

    for (i = 0; i < numPoints; i++) {
        /* A zero extent means every point sits exactly on the centroid in
           that axis; the translated coordinate is then 0, and dividing
           would only turn it into NaN (0/0). */
        points[i].x = (w > 0) ? (points[i].x - centroid.x) * GESTURE_DOLLARSIZE / w : 0.0f;
        points[i].y = (h > 0) ? (points[i].y - centroid.y) * GESTURE_DOLLARSIZE / h : 0.0f;
    }
    return numPoints;
}
/* Compare the stroke in `path` against every template of `touch`.
   Stores the index of the closest template in *bestTempl (-1 when no
   template scores below the initial threshold of 10000) and returns the
   corresponding difference score. */
static float GestureDollarRecognize(const GestureDollarPath *path, int *bestTempl, GestureTouch *touch)
{
    SDL_FPoint normalized[GESTURE_DOLLARNPOINTS];
    float lowest = 10000;
    int t = 0;

    /* start from all-zero points; the normalization result is not checked */
    SDL_memset(normalized, 0, sizeof(normalized));
    GestureDollarNormalize(path, normalized, false);

    *bestTempl = -1;
    while (t < touch->numDollarTemplates) {
        const float score = GestureBestDollarDifference(normalized, touch->dollarTemplate[t].path);
        if (score < lowest) {
            *bestTempl = t;
            lowest = score;
        }
        t++;
    }
    return lowest;
}
/* Push a GESTURE_MULTIGESTURE event for `touch` carrying the given rotation
   and distance deltas, if that event type is enabled. */
static void GestureSendMulti(GestureTouch *touch, float dTheta, float dDist)
{
    if (SDL_EventEnabled(GESTURE_MULTIGESTURE)) {
        Gesture_MultiGestureEvent mgesture;
        SDL_Event event;

        SDL_zero(mgesture); /* no indeterminate bytes in reserved/padding */
        mgesture.type = GESTURE_MULTIGESTURE;
        mgesture.timestamp = 0;
        mgesture.touchID = touch->touchID;
        mgesture.x = touch->centroid.x;
        mgesture.y = touch->centroid.y;
        mgesture.dTheta = dTheta;
        mgesture.dDist = dDist;
        mgesture.numFingers = (Uint16)touch->numDownFingers;

        /* SDL_PushEvent() copies a whole SDL_Event. The gesture struct is
           smaller than that, so it is placed in a real, zeroed SDL_Event
           rather than cast, which would let SDL read past the end of the
           stack object. */
        SDL_zero(event);
        SDL_memcpy(&event, &mgesture, sizeof(mgesture));
        SDL_PushEvent(&event);
    }
}
/* Push a GESTURE_DOLLARGESTURE event reporting that the stroke on `touch`
   matched template `gestureId` with difference score `error`, if that event
   type is enabled. */
static void GestureSendDollar(GestureTouch *touch, Gesture_ID gestureId, float error)
{
    if (SDL_EventEnabled(GESTURE_DOLLARGESTURE)) {
        Gesture_DollarGestureEvent dgesture;
        SDL_Event event;

        SDL_zero(dgesture); /* no indeterminate bytes in reserved */
        dgesture.type = GESTURE_DOLLARGESTURE;
        dgesture.timestamp = 0;
        dgesture.touchID = touch->touchID;
        dgesture.x = touch->centroid.x;
        dgesture.y = touch->centroid.y;
        dgesture.gestureId = gestureId;
        dgesture.error = error;
        /* A finger came up to trigger this event. */
        dgesture.numFingers = touch->numDownFingers + 1;

        /* SDL_PushEvent() copies a whole SDL_Event. The gesture struct is
           smaller than that, so it is placed in a real, zeroed SDL_Event
           rather than cast, which would let SDL read past the end of the
           stack object. */
        SDL_zero(event);
        SDL_memcpy(&event, &dgesture, sizeof(dgesture));
        SDL_PushEvent(&event);
    }
}
/* Push a GESTURE_DOLLARRECORD event reporting the id of the template that
   was just recorded on `touch` (the caller passes -1 when recording
   failed), if that event type is enabled. Only type, timestamp, touchID and
   gestureId carry information; the remaining fields are zero. */
static void GestureSendDollarRecord(GestureTouch *touch, Gesture_ID gestureId)
{
    if (SDL_EventEnabled(GESTURE_DOLLARRECORD)) {
        Gesture_DollarGestureEvent dgesture;
        SDL_Event event;

        /* x, y, error and numFingers are not meaningful for this event;
           zero them instead of sending indeterminate stack contents. */
        SDL_zero(dgesture);
        dgesture.type = GESTURE_DOLLARRECORD;
        dgesture.timestamp = 0;
        dgesture.touchID = touch->touchID;
        dgesture.gestureId = gestureId;

        /* SDL_PushEvent() copies a whole SDL_Event. The gesture struct is
           smaller than that, so it is placed in a real, zeroed SDL_Event
           rather than cast, which would let SDL read past the end of the
           stack object. */
        SDL_zero(event);
        SDL_memcpy(&event, &dgesture, sizeof(dgesture));
        SDL_PushEvent(&event);
    }
}
#if !defined(GESTURE_LOG_UP_DOWN_EVENTS)
#define GESTURE_LOG_UP_DOWN_EVENTS 0
#endif
#if !defined(GESTURE_LOG_MOTION_EVENTS)
#define GESTURE_LOG_MOTION_EVENTS 0
#endif
static void GestureProcessEvent(const SDL_Event *event)
{
float x, y;
int index;
int i;
float pathDx, pathDy;
SDL_FPoint lastP;
SDL_FPoint lastCentroid;
float lDist;
float Dist;
float dtheta;
float dDist;
if (event->type == SDL_EVENT_FINGER_MOTION || event->type == SDL_EVENT_FINGER_DOWN || event->type == SDL_EVENT_FINGER_UP) {
GestureTouch *inTouch = GestureGetTouch(event->tfinger.touchID);
if (inTouch == NULL) { /* we maybe didn't see this one before. */
inTouch = GestureAddTouch(event->tfinger.touchID);
if (!inTouch) {
return; /* oh well. */
}
}
int numDownFingersReported;
SDL_Finger** fingers = SDL_GetTouchFingers(event->tfinger.touchID, &numDownFingersReported);
x = event->tfinger.x;
y = event->tfinger.y;
/* Finger Up */
if (event->type == SDL_EVENT_FINGER_UP) {
#if GESTURE_LOG_UP_DOWN_EVENTS
SDL_Log("GPE: Finger: %#" SDL_PRIx64 " UP. Device: %#" SDL_PRIx64 ", fingers: %i, x: %f, y: %f, press: %f",
event->tfinger.fingerID, event->tfinger.touchID, numDownFingersReported,
event->tfinger.x, event->tfinger.y, event->tfinger.pressure);
#endif
SDL_FPoint path[GESTURE_DOLLARNPOINTS];
#if SDL_PLATFORM_MACOS
/* Workaround issue https://github.com/libsdl-org/SDL/issues/13428,
Extra SDL_EVENT_FINGER_{UP,DOWN} with mouse button press, by
ignoring events with fingerID of SDL_BUTTON_LEFT.
N.B. If SDL_HINT_MOUSE_TOUCH_EVENTS is set to 0 no touch
events are received from the trackpad. */
if (event->tfinger.fingerID == SDL_BUTTON_LEFT) return;
#endif
/* Using the number of fingers returned by SDL_GetTouchFingers
is much more robust than counting finger up and down events.
With counting it is easy for the counted number to be higher
than the actual number. Unfortunately it has not been possible
to identify a sequence of actions that reliably reproduces
this but asserts have shown it happens often. Perhaps
sometimes a single UP or DOWN event is received for multiple
fingers.
Using the reported number is independent of how many events
are actually received. But, and this is a big one, in the
case of FINGER_UP SDL_GetTouchFingers reports the number of
fingers down *before* the up event.
N.B. In the case of a left button press on macOS,
SDL_GetTouchFingers reports 1 for the event that is not
ignored.
*/
inTouch->numDownFingers = numDownFingersReported - 1;
assert(inTouch->numDownFingers >= 0);
#if (GESTURE_LOG_UP_DOWN_EVENTS)
SDL_Log("GPE FINGER_UP, numDownFingers now = %i", inTouch->numDownFingers);
#endif
if (inTouch->recording) {
inTouch->recording = false;
GestureDollarNormalize(&inTouch->dollarPath, path, true);
/* PrintPath(path); */
if (GestureRecordAll) {
index = GestureAddDollar(NULL, path);
for (i = 0; i < GestureNumTouches; i++) {
GestureTouches[i].recording = false;
}
} else {
index = GestureAddDollar(inTouch, path);
}
if (index >= 0) {
GestureSendDollarRecord(inTouch, inTouch->dollarTemplate[index].hash);
} else {
GestureSendDollarRecord(inTouch, -1);
}
} else {
int bestTempl = -1;
const float error = GestureDollarRecognize(&inTouch->dollarPath, &bestTempl, inTouch);
if (bestTempl >= 0) {
/* Send Event */
const Gesture_ID gestureId = inTouch->dollarTemplate[bestTempl].hash;
GestureSendDollar(inTouch, gestureId, error);
/* printf ("%s\n",);("Dollar error: %f\n",error); */
}
}
/* inTouch->gestureLast[j] = inTouch->gestureLast[inTouch->numDownFingers]; */
if (inTouch->numDownFingers > 0) {
inTouch->centroid.x = (inTouch->centroid.x * (inTouch->numDownFingers + 1) - x) / inTouch->numDownFingers;
inTouch->centroid.y = (inTouch->centroid.y * (inTouch->numDownFingers + 1) - y) / inTouch->numDownFingers;
} else {
inTouch->centroid.x = inTouch->centroid.y = 0.0f;
}
} else if (event->type == SDL_EVENT_FINGER_MOTION) {
/* There is one FINGER_MOTION event per down finger. x,y gives
the position of the finger whose id is in the event. */
const float dx = event->tfinger.dx;
const float dy = event->tfinger.dy;
GestureDollarPath *path = &inTouch->dollarPath;
#if GESTURE_LOG_MOTION_EVENTS
SDL_Log("GPE: Finger: %#" SDL_PRIx64 " MOTION: device: %#" SDL_PRIx64 ", timestamp = %"
SDL_PRIu64 ", fingers: %i, x: %f, y: %f, press: %f, numDownFingers: %i",
event->tfinger.fingerID, event->tfinger.touchID, event->tfinger.timestamp,
numDownFingersReported, event->tfinger.x, event->tfinger.y, event->tfinger.pressure,
inTouch->numDownFingers);
#endif
assert(numDownFingersReported > 0);
#if SDL_PLATFORM_MACOS
/* Workaround issue https://github.com/libsdl-org/SDL/issues/13428.
See comment at line 753 for more details. */
if (event->tfinger.fingerID == SDL_BUTTON_LEFT) return;
/* SDL_GetTouchFingers reports 2 fingers down in the motion event
for the other finger during button press. Fix up the number of
fingers. */
uint32_t reportedNumFingers = numDownFingersReported;
for (uint32_t i = 0; i < reportedNumFingers; i++) {
if (fingers[i]->id == SDL_BUTTON_LEFT) {
numDownFingersReported--;
break;
}
}
#endif
/* See comment at line 762. One case where the count reliably
differs from reported is on iOS. When touching, dragging and
releasing 2 fingers, iOS sends a BUTTON_DOWN and BUTTON_UP
for one of the fingers. When the finger corresponding to the
button is raised, it sends the BUTTON_UP followed by the
FINGER_UP but FINGER_MOTION events can come before the
FINGER_UP and those events have only one finger down. */
inTouch->numDownFingers = numDownFingersReported;
if (path->numPoints < GESTURE_MAX_DOLLAR_PATH_SIZE) {
path->p[path->numPoints].x = inTouch->centroid.x;
path->p[path->numPoints].y = inTouch->centroid.y;
pathDx = (path->p[path->numPoints].x - path->p[path->numPoints - 1].x);
pathDy = (path->p[path->numPoints].y - path->p[path->numPoints - 1].y);
path->length += (float)SDL_sqrt(pathDx * pathDx + pathDy * pathDy);
path->numPoints++;
}
lastP.x = x - dx;
lastP.y = y - dy;
lastCentroid = inTouch->centroid;
inTouch->centroid.x += dx / inTouch->numDownFingers;
inTouch->centroid.y += dy / inTouch->numDownFingers;
/* printf("Centroid : (%f,%f)\n",inTouch->centroid.x,inTouch->centroid.y); */
if (inTouch->numDownFingers > 1) {
SDL_FPoint lv; /* Vector from centroid to last x,y position */
SDL_FPoint v; /* Vector from centroid to current x,y position */
/* lv = inTouch->gestureLast[j].cv; */
lv.x = lastP.x - lastCentroid.x;
lv.y = lastP.y - lastCentroid.y;
lDist = SDL_sqrtf(lv.x * lv.x + lv.y * lv.y);
/* printf("lDist = %f\n",lDist); */
v.x = x - inTouch->centroid.x;
v.y = y - inTouch->centroid.y;
/* inTouch->gestureLast[j].cv = v; */
Dist = SDL_sqrtf(v.x * v.x + v.y * v.y);
/* SDL_cosf(dTheta) = (v . lv)/(|v| * |lv|) */
/* Normalize Vectors to simplify angle calculation */
lv.x /= lDist;
lv.y /= lDist;
v.x /= Dist;
v.y /= Dist;
dtheta = SDL_atan2f(lv.x * v.y - lv.y * v.x, lv.x * v.x + lv.y * v.y);
dDist = (Dist - lDist);
if (lDist == 0) {
/* To avoid impossible values */
dDist = 0;
dtheta = 0;
}
/* inTouch->gestureLast[j].dDist = dDist;
inTouch->gestureLast[j].dtheta = dtheta;
printf("dDist = %f, dTheta = %f\n",dDist,dtheta);
gdtheta = gdtheta*.9 + dtheta*.1;
gdDist = gdDist*.9 + dDist*.1
knob.r += dDist/numDownFingers;
knob.ang += dtheta;
printf("thetaSum = %f, distSum = %f\n",gdtheta,gdDist);
printf("id: %i dTheta = %f, dDist = %f\n",j,dtheta,dDist); */
GestureSendMulti(inTouch, dtheta, dDist);
} else {
/* inTouch->gestureLast[j].dDist = 0;
inTouch->gestureLast[j].dtheta = 0;
inTouch->gestureLast[j].cv.x = 0;
inTouch->gestureLast[j].cv.y = 0; */
}
/* inTouch->gestureLast[j].f.p.x = x;
inTouch->gestureLast[j].f.p.y = y;
break;
pressure? */
} else if (event->type == SDL_EVENT_FINGER_DOWN) {
#if (GESTURE_LOG_UP_DOWN_EVENTS)
SDL_Log("GPE: Finger: %#" SDL_PRIx64 " DOWN. Device: %#" SDL_PRIx64 ", fingers: %i, x: %f, y: %f, press: %f",
event->tfinger.fingerID, event->tfinger.touchID, numDownFingersReported,
event->tfinger.x, event->tfinger.y, event->tfinger.pressure);
#endif
#if SDL_PLATFORM_MACOS
/* See comment starting at line 753. */
if (event->tfinger.fingerID == SDL_BUTTON_LEFT) return;
#endif
/* Using the number of fingers returned by SDL_GetTouchFingers
is much more robust than counting finger up and down events.
With counting it is easy for the counted number to be higher
than the actual number. Unfortunately it has not been possible
to identify a sequence of actions that reliably reproduces
this. Using the reported number is independent of how many
events are actually received. */
inTouch->numDownFingers = numDownFingersReported;
inTouch->centroid.x = inTouch->centroid.y = 0.0;
for (i = 0; i < numDownFingersReported; i++) {
inTouch->centroid.x += fingers[i]->x;
inTouch->centroid.y += fingers[i]->y;
}
inTouch->centroid.x /= numDownFingersReported;
inTouch->centroid.y /= numDownFingersReported;
//printf("Finger Down: (%f,%f). Centroid: (%f,%f\n",x,y,
// inTouch->centroid.x,inTouch->centroid.y);
inTouch->dollarPath.length = 0;
inTouch->dollarPath.p[0].x = x;
inTouch->dollarPath.p[0].y = y;
inTouch->dollarPath.numPoints = 1;
}
SDL_free(fingers);
}
}
#endif /* defined(SDL_GESTURE_IMPLEMENTATION) */
#endif /* SDL version > 2 */
#endif /* INCL_SDL_GESTURE_H */
/* vi: set sts=4 ts=4 sw=4 expandtab: */
+21
View File
@@ -0,0 +1,21 @@
# Text type files use auto line endings
* text=auto
# Explicitly declare text file types for this repo
*.c text
*.cpp text
*.h text
*.md text
Jenkinsfile text
# VS solutions always use Windows line endings
*.sln text eol=crlf
*.vcxproj text eol=crlf
# Bash scripts always use *nux line endings
*.sh text eol=lf
# Denote all files that are truly binary and should not be modified.
*.png binary
*.hdr binary
*.exe binary
@@ -0,0 +1,385 @@
name: post-weekly-release
run-name: Build, test, generate signed artifacts and optionally prepare release
on:
workflow_dispatch:
push:
branches:
- main
tags:
- '*'
schedule:
- cron: '17 2 * * 1'
jobs:
coverity:
if: ${{ (!startsWith(github.event.ref, 'refs/tags/')) && (github.repository_owner == 'Arm-software') }}
name: Run Coverity static analysis
runs-on: [self-hosted-ubuntu-latest-x64]
steps:
- name: Clean workspace
uses: AutoModality/action-clean@v1
- name: Git checkout
uses: actions/checkout@v4
with:
submodules: 'true'
- name: Coverity preparation
run: |
export PATH=$PATH:/usr/local/cov-analysis/bin
mkdir build_cov
cd build_cov
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_AVX2=ON ..
cov-configure --config ${GITHUB_WORKSPACE}/coverity.conf --template --compiler cc --comptype gcc
cov-configure --config ${GITHUB_WORKSPACE}/coverity.conf --template --compiler c++ --comptype g++
- name: Coverity build
run: |
export PATH=$PATH:/usr/local/cov-analysis/bin
cd build_cov
cov-build --config ${GITHUB_WORKSPACE}/coverity.conf --dir ${GITHUB_WORKSPACE}/intermediate make install
- name: Coverity analyze
run: |
export PATH=$PATH:/usr/local/cov-analysis/bin
cd build_cov
cov-analyze --dir ${GITHUB_WORKSPACE}/intermediate
- name: Coverity upload
env:
COVERITY_KEY: ${{ secrets.COVERITY_KEY }}
run: |
export PATH=$PATH:/usr/local/cov-analysis/bin
echo "${COVERITY_KEY}" > coverity.key
chmod 400 coverity.key
cd build_cov
cov-commit-defects \
--dir ${GITHUB_WORKSPACE}/intermediate \
--stream astcenc-master \
--url https://coverity.cambridge.arm.com \
--auth-key-file ../coverity.key \
--strip-path ${GITHUB_WORKSPACE}
build-ubuntu-arm64:
name: Ubuntu arm64
runs-on: ubuntu-24.04-arm
steps:
- name: Git checkout
uses: actions/checkout@v4
with:
submodules: 'true'
- name: Update apt packages
run: sudo apt-get update
- name: Install ImageMagick
run: sudo apt-get install imagemagick
- name: Build release
run: |
export CXX=clang++
mkdir build_rel
cd build_rel
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_NEON=ON -DASTCENC_ISA_SVE_128=ON -DASTCENC_ISA_SVE_256=ON -DASTCENC_PACKAGE=arm64 ..
make install package -j4
- name: Upload binaries
uses: actions/upload-artifact@v4
with:
name: astcenc-linux-arm64
path: |
build_rel/*.zip
build_rel/*.zip.sha256
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Get Python modules
run: |
python -m pip install --upgrade pip
pip install numpy Pillow
- name: Run system tests
# Disable SVE testing for now
run: |
python ./Test/astc_test_functional.py --encoder neon
python ./Test/astc_test_image.py --encoder neon --test-set Small
build-ubuntu-x64:
name: Ubuntu x64
runs-on: ubuntu-22.04
steps:
- name: Git checkout
uses: actions/checkout@v4
with:
submodules: 'true'
- name: Build release
run: |
export CXX=clang++
mkdir build_rel
cd build_rel
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON -DASTCENC_PACKAGE=x64 ..
make install package -j4
- name: Upload binaries
uses: actions/upload-artifact@v4
with:
name: astcenc-linux-x86_64
path: |
build_rel/*.zip
build_rel/*.zip.sha256
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Get Python modules
run: |
python -m pip install --upgrade pip
pip install numpy Pillow
- name: Run system tests
run: |
python ./Test/astc_test_functional.py
python ./Test/astc_test_image.py --encoder all-x86 --test-set Small
build-macos-universal:
name: macOS universal
runs-on: macos-14
steps:
- name: Git checkout
uses: actions/checkout@v4
with:
submodules: 'true'
- name: Build release
run: |
mkdir build_rel
cd build_rel
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_PACKAGE=universal ..
make install package -j4
- name: Upload binaries
uses: actions/upload-artifact@v4
with:
name: astcenc-macos-universal
path: |
build_rel/*.zip
build_rel/*.zip.sha256
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Get Python modules
run: |
python -m pip install --upgrade pip
pip install numpy Pillow
- name: Run system tests
run: |
python ./Test/astc_test_image.py --test-set Small --encoder universal
build-windows-multi:
name: Windows multi
runs-on: windows-2022
steps:
- name: Git checkout
uses: actions/checkout@v4
with:
submodules: 'true'
- name: Setup Visual Studio x86_64
uses: ilammy/msvc-dev-cmd@v1
- name: Build release x64
run: |
mkdir build_rel
cd build_rel
cmake -G "Visual Studio 17 2022" -T ClangCL -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON -DASTCENC_PACKAGE=x64 ..
msbuild astcencoder.sln -property:Configuration=Release
msbuild PACKAGE.vcxproj -property:Configuration=Release
msbuild INSTALL.vcxproj -property:Configuration=Release
shell: cmd
- name: Setup Visual Studio arm64
uses: ilammy/msvc-dev-cmd@v1
with:
arch: x86_arm64
- name: Build release arm64
run: |
mkdir build_rel_arm64
cd build_rel_arm64
cmake -G "Visual Studio 17 2022" -A ARM64 -T ClangCL -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_NEON=ON -DASTCENC_PACKAGE=arm64 ..
msbuild astcencoder.sln -property:Configuration=Release
msbuild PACKAGE.vcxproj -property:Configuration=Release
msbuild INSTALL.vcxproj -property:Configuration=Release
shell: cmd
- name: Upload binaries
uses: actions/upload-artifact@v4
with:
name: astcenc-windows-multi-cl
path: |
build_rel/*.zip
build_rel/*.zip.sha256
build_rel_arm64/*.zip
build_rel_arm64/*.zip.sha256
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Get Python modules
run: |
python -m pip install --upgrade pip
pip install numpy Pillow
shell: cmd
- name: Run system tests
run: |
python ./Test/astc_test_image.py --test-set Small
shell: cmd
sign-binaries:
if: github.repository_owner == 'Arm-software'
name: Sign Windows and macOS
runs-on: [self-hosted-ubuntu-latest-x64]
needs: [build-macos-universal, build-windows-multi]
steps:
- name: Clean workspace
uses: AutoModality/action-clean@v1
- name: Checkout signing code
env:
SIGNING_REPO_URL: ${{ secrets.SIGNING_REPO_URL }}
run: |
git clone --depth 1 ${SIGNING_REPO_URL}
- name: Install code sign v2 client
env:
ARTIFACTORY_USER: ${{ secrets.ARTIFACTORY_USER }}
ARTIFACTORY_APIKEY: ${{ secrets.ARTIFACTORY_APIKEY }}
ARTIFACTORY_FQDN: ${{ secrets.ARTIFACTORY_FQDN }}
run: |
python3.11 -m venv cs
. ./cs/bin/activate
pip install -i https://${ARTIFACTORY_USER}:${ARTIFACTORY_APIKEY}@${ARTIFACTORY_FQDN}/artifactory/api/pypi/dsgcore.pypi/simple code-signer-client
- name: Download macOS binaries
uses: actions/download-artifact@v4
with:
name: astcenc-macos-universal
path: mac
- name: Download Windows binaries
uses: actions/download-artifact@v4
with:
name: astcenc-windows-multi-cl
path: windows
- name: Sign macOS binaries
env:
CODESIGNER_USER: ${{ secrets.CODESIGNER_USER }}
run: |
. ./cs/bin/activate
cd mac
python3 ${GITHUB_WORKSPACE}/signing/macos-client-wrapper.py ${CODESIGNER_USER} *.zip
- name: Sign Windows binaries
env:
ARTIFACTORY_APIKEY: ${{ secrets.ARTIFACTORY_APIKEY }}
run: |
. ./cs/bin/activate
cd windows
for FILENAME in */*; do mv ${FILENAME} .; done
for ZIPFILE in *.zip; do python3 ../signing/windows-client-wrapper.py -b ${GITHUB_RUN_NUMBER} -t ${ARTIFACTORY_APIKEY} ${ZIPFILE}; done
- name: Upload signed binaries
uses: actions/upload-artifact@v4
with:
name: signed-binaries
path: |
windows/*
mac/*
- name: Tidy intermediate artifacts
uses: geekyeggo/delete-artifact@v5
with:
name: |
astcenc-windows-multi-cl
astcenc-macos-universal
prepare-release:
if: ${{ (startsWith(github.event.ref, 'refs/tags/')) && (github.repository_owner == 'Arm-software') }}
name: Prepare release
runs-on: ubuntu-22.04
needs: [sign-binaries, build-ubuntu-x64]
steps:
- name: Git checkout
uses: actions/checkout@v4
- name: Download signed binaries
uses: actions/download-artifact@v4
with:
name: signed-binaries
path: prepare-release
- name: Download Linux x86_64 binaries
uses: actions/download-artifact@v4
with:
name: astcenc-linux-x86_64
path: prepare-release
- name: Download Linux arm64 binaries
uses: actions/download-artifact@v4
with:
name: astcenc-linux-arm64
path: prepare-release
- name: Flatten file structure
run: |
cd prepare-release
for FILENAME in */*; do mv ${FILENAME} .; done
rmdir windows
rmdir mac
- name: Create checksum file
run: |
cd prepare-release
cat *.sha256 > release-sha256.txt
rm *.sha256
- name: Create release body
run: |
export STATUS_DATE=$(date "+%B %Y")
GITHUB_REF=${{ github.ref }} ; export RELEASE_VERSION=${GITHUB_REF##*/}
export SHA_CHECKSUMS=$(cat prepare-release/release-sha256.txt)
envsubst < .github/workflows/release_body_template.md > prepare-release/release_body.md
- name: Create release
id: create_release
uses: comnoco/create-release-action@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ github.ref }}
release_name: ${{ github.ref }}
body_path: prepare-release/release_body.md
draft: true
- name: Attach artifacts
uses: AButler/upload-release-assets@v3.0
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
release-id: ${{ steps.create_release.outputs.id }}
files: "prepare-release/astcenc-*-*-*.zip;prepare-release/release-sha256.txt"
@@ -0,0 +1,13 @@
**Status:** ${STATUS_DATE}
The ${RELEASE_VERSION} release is a minor/major maintenance release.
* **General:**
* **Bug fix:** Text here
* **Feature:** Text here
## Binary release sha256 checksums
```
${SHA_CHECKSUMS}
```
+47
View File
@@ -0,0 +1,47 @@
# Editor and engineering scratch files
.cache
.vs
.vscode
.DS_Store
*.log
*.diff
*.user
*.o
*.a
__pycache__
Scratch
Proto
# Precompiled reference binaries for comparison tests
bin
lib
Binaries
# Build artifacts
astcenc
build*
# General build artifacts
Test/DocOut
# Test images we download from other sources
Test/Images/Kodak*/**/*.png
Test/Images/Scratch*
# Test output
TestOutput
/*.xlsx
/*.jpg
/*.json
/*.log
/*.txt
/*.hdr
/*.png
/*.exr
/*.astc
astc_reference-main*
Docs/Profiling.md
Source/astcenccli_version.h
# Do not ignore workflows
!.github/workflows/
+12
View File
@@ -0,0 +1,12 @@
; DO NOT EDIT (unless you know what you are doing)
;
; This subdirectory is a git "subrepo", and this file is maintained by the
; git-subrepo command. See https://github.com/git-commands/git-subrepo#readme
;
[subrepo]
remote = https://github.com/ARM-software/astc-encoder.git
branch = 5.3.0
commit = 30aabb3f42406df45a910d8496f9bee17eeba9bb
parent = f9c73388a58de9b83f260f11008b043d8f7c0954
method = merge
cmdver = 0.4.9
+532
View File
@@ -0,0 +1,532 @@
[MASTER]
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=pylint.extensions.docparams
# Pickle collected data for later comparisons.
persistent=yes
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# Ignore specific directories we don't author ourselves
ignore=Test/DocSource
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
confidence=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=print-statement,
parameter-unpacking,
unpacking-in-except,
old-raise-syntax,
backtick,
long-suffix,
old-ne-operator,
old-octal-literal,
import-star-module-level,
non-ascii-bytes-literal,
raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
apply-builtin,
basestring-builtin,
buffer-builtin,
cmp-builtin,
coerce-builtin,
execfile-builtin,
file-builtin,
long-builtin,
raw_input-builtin,
reduce-builtin,
standarderror-builtin,
unicode-builtin,
xrange-builtin,
coerce-method,
delslice-method,
getslice-method,
setslice-method,
no-absolute-import,
old-division,
dict-iter-method,
dict-view-method,
next-method-called,
metaclass-assignment,
indexing-exception,
raising-string,
reload-builtin,
oct-method,
hex-method,
nonzero-method,
cmp-method,
input-builtin,
round-builtin,
intern-builtin,
unichr-builtin,
map-builtin-not-iterating,
zip-builtin-not-iterating,
range-builtin-not-iterating,
filter-builtin-not-iterating,
using-cmp-argument,
eq-without-hash,
div-method,
idiv-method,
rdiv-method,
exception-message-attribute,
invalid-str-codec,
sys-max-int,
bad-python3-import,
deprecated-string-function,
deprecated-str-translate-call,
deprecated-itertools-function,
deprecated-types-field,
next-method-defined,
dict-items-not-iterating,
dict-keys-not-iterating,
dict-values-not-iterating,
deprecated-operator-function,
deprecated-urllib-function,
xreadlines-attribute,
deprecated-sys-function,
exception-escape,
comprehension-escape
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by commas (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'error', 'warning', 'refactor', and 'convention'
# which contain the number of messages in each category, as well as 'statement'
# which is the total number of statements analyzed. This score is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
# Complete names of functions that never return.
never-returning-functions=sys.exit
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=camelCase
# Regular expression matching correct argument names. Overrides argument-
# naming-style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=camelCase
# Regular expression matching correct attribute names. Overrides attr-naming-
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style.
#class-attribute-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=UPPER_CASE
# Regular expression matching correct constant names. Overrides const-naming-
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,j,k,x,y,z,w,r,g,b,a,ex,Run,_
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=snake_case
# Regular expression matching correct module names. Overrides module-naming-
# style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty
# Naming style matching correct variable names.
variable-naming-style=camelCase
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Maximum number of characters on a single line.
max-line-length=79
# Maximum number of lines in a module.
max-module-lines=1000
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[LOGGING]
# Format style used to check logging format string. `old` means using %
# formatting, `new` is for `{}` formatting, and `fstr` is for f-strings.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[SIMILARITIES]
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
# Minimum lines number of a similarity.
min-similarity-lines=4
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[STRING]
# This flag controls whether the implicit-str-concat-in-sequence should
# generate a warning on implicit string concatenation in sequences defined over
# several lines.
check-str-concat-over-line-jumps=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=signal
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
# List of decorators that change the signature of a decorated function.
signature-mutators=
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored. Default to name
# with leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls
[DESIGN]
# Maximum number of arguments for function / method.
max-args=7
# Maximum number of attributes for a class (see R0902).
max-attributes=16
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branches for function / method body.
max-branches=12
# Maximum number of locals for function / method body.
max-locals=16
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body.
max-returns=6
# Maximum number of statements in function / method body.
max-statements=50
# Minimum number of public methods for a class (see R0903).
min-public-methods=0
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=optparse,tkinter.tix
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled).
ext-import-graph=
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled).
import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
Exception
+315
View File
@@ -0,0 +1,315 @@
# Building ASTC Encoder
This page provides instructions for building `astcenc` from the sources in
this repository.
Builds must use CMake 3.15 or higher as the build system generator. The
examples on this page show how to use it to generate build systems for NMake
(Windows) and Make (Linux and macOS), but CMake supports other build system
backends.
## Windows
Builds for Windows are tested with CMake 3.17, and Visual Studio 2019 or newer.
### Configuring the build
To use CMake you must first configure the build. Create a build directory in
the root of the `astcenc` checkout, and then run `cmake` inside that directory
to generate the build system.
```shell
# Create a build directory
mkdir build
cd build
# Configure your build of choice, for example:
# x86-64 using a Visual Studio solution
cmake -G "Visual Studio 16 2019" -T ClangCL -DCMAKE_INSTALL_PREFIX=..\ ^
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
# x86-64 using NMake
cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=..\ ^
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
```
A single CMake configure can build multiple binaries for a single target CPU
architecture, for example building x64 for both SSE2 and AVX2. Each binary name
will include the build variant as a postfix. It is possible to build any set of
the supported SIMD variants by enabling only the ones you require.
Using the Visual Studio Clang-CL LLVM toolchain (`-T ClangCL`) is optional but
produces significantly faster binaries than the default toolchain. The C++ LLVM
toolchain component must be installed via the Visual Studio installer.
### Building
Once you have configured the build you can use NMake to compile the project
from your build dir, and install to your target install directory.
```shell
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
cd build
nmake install
```
## macOS and Linux using Make
Builds for macOS and Linux are tested with CMake 3.17, and clang++ 9.0 or
newer.
> Compiling using g++ is supported, but clang++ builds are faster by ~15%.
### Configuring the build
To use CMake you must first configure the build. Create a build directory
in the root of the astcenc checkout, and then run `cmake` inside that directory
to generate the build system.
```shell
# Select your compiler (clang++ recommended, but g++ works)
export CXX=clang++
# Create a build directory
mkdir build
cd build
# Configure your build of choice, for example:
# Arm aarch64
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
-DASTCENC_ISA_NEON=ON ..
# x86-64
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
# macOS universal binary build
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ ..
```
A single CMake configure can build multiple binaries for a single target CPU
architecture, for example building x64 for both SSE2 and AVX2. Each binary name
will include the build variant as a postfix. It is possible to build any set of
the supported SIMD variants by enabling only the ones you require.
For macOS, we additionally support the ability to build a universal binary.
This build includes SSE4.1 (`x86_64`), AVX2 (`x86_64h`), and NEON (`arm64`)
build slices in a single output binary. The OS will select the correct variant
to run for the machine being used. This is the default build target for a macOS
build, but single-target binaries can still be built by setting
`-DASTCENC_UNIVERSAL_BINARY=OFF` and then manually selecting the specific ISA
variants that are required.
### Building
Once you have configured the build you can use Make to compile the project from
your build dir, and install to your target install directory.
```shell
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
# for executable binaries and `${CMAKE_INSTALL_PREFIX}/lib/` for libraries
cd build
make install -j16
```
## macOS using XCode
Builds for macOS using XCode are tested with CMake 3.17, and XCode 14.0 or
newer.
### Configuring the build
To use CMake you must first configure the build. Create a build directory
in the root of the astcenc checkout, and then run `cmake` inside that directory
to generate the build system.
```shell
# Create a build directory
mkdir build
cd build
# Configure a universal build
cmake -G Xcode -DCMAKE_INSTALL_PREFIX=../ ..
```
### Building
Once you have configured the build you can use CMake to compile the project
from your build dir, and install to your target install directory.
```shell
cmake --build . --config Release
# Optionally install the binaries to the installation directory
cmake --install . --config Release
```
## Advanced build options
For codec developers and power users there are a number of useful features in
the build system.
### Build Types
We support and test the following `CMAKE_BUILD_TYPE` options.
| Value | Description |
| ---------------- | -------------------------------------------------------- |
| Release | Optimized release build |
| RelWithDebInfo | Optimized release build with debug info |
| Debug | Unoptimized debug build with debug info |
Note that optimized release builds are compiled with link-time optimization,
which can make profiling more challenging ...
### Shared Libraries
We support building the core library as a shared object by setting the CMake
option `-DASTCENC_SHAREDLIB=ON` at configure time. For macOS build targets the
shared library supports the same universal build configuration as the command
line utility.
Note that the command line tool is always statically linked; the shared objects
are an extra build output that is not currently used by the command line tool.
### Constrained block size builds
All normal builds will support all ASTC block sizes, including the worst case
6x6x6 3D block size (216 texels per block). Compressor memory footprint and
performance can be improved by limiting the block sizes supported in the build
by adding `-DASTCENC_BLOCK_MAX_TEXELS=<texel_count>` to the CMake command line
when configuring. Legal block sizes that are unavailable in a restricted build
will return the error `ASTCENC_ERR_NOT_IMPLEMENTED` during context creation.
### Non-invariant builds
All normal builds are designed to be invariant, so any build from the same git
revision will produce bit-identical results for all compilers and CPU
architectures. To achieve this we sacrifice some performance, so if this is
not required you can specify `-DASTCENC_INVARIANCE=OFF` to enable additional
optimizations. This has most benefit for AVX2 builds where we are able to
enable use of the FMA instruction set extensions.
### No intrinsics builds
All normal builds will use SIMD accelerated code paths using intrinsics, as all
supported target architectures (x86 and arm64) guarantee SIMD availability. For
development purposes it is possible to build an intrinsic-free build which uses
no explicit SIMD acceleration (the compiler may still auto-vectorize).
To enable this binary variant add `-DASTCENC_ISA_NONE=ON` to the CMake command
line when configuring. It is NOT recommended to use this for production; it is
significantly slower than the vectorized SIMD builds.
### No x86 gather instruction builds
On many x86 microarchitectures the native AVX gather instructions are slower
than simply performing manual scalar loads and combining the results. Gathers
are enabled by default, but can be disabled by setting the CMake option
`-DASTCENC_X86_GATHERS=OFF` on the command line when configuring.
Note that we have seen mixed results when compiling the scalar fallback path,
so we would recommend testing which option works best for the compiler and
microarchitecture pairing that you are targeting.
### Test builds
We support building unit tests. These use the `googletest` framework, which is
pulled in through a git submodule. On first use, you must fetch the submodule
dependency:
```shell
git submodule init
git submodule update
```
To build unit tests add `-DASTCENC_UNITTEST=ON` to the CMake command line when
configuring.
To run unit tests use the CMake `ctest` utility from your build directory after
you have built the tests.
```shell
cd build
ctest --verbose
```
### Sanitizer builds
We support building with sanitizers on Linux and macOS when using Clang.
To build binaries with ASAN checking enabled add `-DASTCENC_ASAN=ON` to the
CMake command line when configuring.
To build binaries with UBSAN checking enabled add `-DASTCENC_UBSAN=ON` to the
CMake command line when configuring.
### Android builds
Builds of the command line utility for Android are not officially supported, but can be a useful
development build for testing on e.g. different Arm CPU microarchitectures.
The build script below shows one possible route to building the command line tool for Android. Once
built the application can be pushed to e.g. `/data/local/tmp` and executed from an Android shell
terminal over `adb`.
```shell
ANDROID_ABI=arm64-v8a
ANDROID_NDK=/work/tools/android/ndk/22.1.7171670
BUILD_TYPE=RelWithDebInfo
BUILD_DIR=build
mkdir -p ${BUILD_DIR}
cd ${BUILD_DIR}
cmake \
-DCMAKE_INSTALL_PREFIX=./ \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=${ANDROID_ABI} \
-DANDROID_ARM_NEON=ON \
-DANDROID_PLATFORM=android-21 \
-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=clang \
-DANDROID_TOOLCHAIN=clang \
-DANDROID_STL=c++_static \
-DARCH=aarch64 \
-DASTCENC_ISA_NEON=ON \
..
make -j16
```
## Packaging a release bundle
We support building a release bundle of all enabled binary configurations in
the current CMake configuration using the `package` build target.
Configure CMake with:
* `-DASTCENC_PACKAGE=<arch>` to set the package architecture/variant name used
to name the package archive (not set by default).
```shell
# Run a build and package build outputs in `./astcenc-<ver>-<os>-<arch>.<fmt>`
cd build
make package -j16
```
Windows packages will use the `.zip` format, other packages will use the
`.tar.gz` format.
## Integrating as a library into another project
The core codec of `astcenc` is built as a library, and so can be easily
integrated into other projects using CMake. An example of the CMake integration
and the codec API usage can be found in the `./Utils/Example` directory in the
repository. See the [Example Readme](../Utils/Example/README.md) for more
details.
- - -
_Copyright © 2019-2024, Arm Limited and contributors. All rights reserved._
+328
View File
@@ -0,0 +1,328 @@
# 2.x series change log
This page summarizes the major functional and performance changes in each
release of the 2.x series.
All performance data on this page is measured on an Intel Core i5-9600K
clocked at 4.2 GHz, running astcenc using 6 threads.
<!-- ---------------------------------------------------------------------- -->
## 2.5
**Status:** Released, March 2021
The 2.5 release is the last major release in the 2.x series. After this release
a `2.x` branch will provide stable long-term support, and the `main` branch
will switch to focusing on more radical changes for the 3.x series.
Reminder for users of the library interface - the API is not designed to be
stable across versions, and this release is not compatible with earlier 2.x
releases. Please update and rebuild your client-side code using the updated
`astcenc.h` header.
**General:**
* **Feature:** The `ISA_INVARIANCE` build option is no longer supported, as
there is no longer any performance benefit from the variant paths. All
builds are now using the equivalent of the `ISA_INVARIANCE=ON` setting, and
all builds (except Armv7) are now believed to be invariant across operating
systems, compilers, CPU architectures, and SIMD instruction sets.
* **Feature:** Armv8 32-bit builds with NEON are now supported, with
out-of-the-box support for Arm Linux soft-float and hard-float ABIs. There
are no pre-built binaries for these targets; support is included for
library users targeting older 32-bit Android and iOS devices.
* **Feature:** A compressor mode for encoding HDR textures that have been
encoded into LDR RGBM wrapper format is now supported. Note that this
encoding has some strong recommendations for how the RGBM encoding is
implemented to avoid block artifacts in the compressed image.
* **Core API:**
* **API Change:** The core API has been changed to be a pure C API, making it
easier to wrap the codec in a stable shared library ABI. Some entry points
that used to accept references now expect pointers.
* **API Change:** The decompression functionality in the core API has been
changed to allow use of multiple threads. The design pattern matches the
compression functionality, requiring the caller to create the threads,
synchronize them between images, and to call the new
`astcenc_decompress_reset()` function between images.
* **API Feature:** Defines to support exporting public API entry point
symbols from a shared object are provided, but not exposed off-the-shelf by
the CMake provided by the project.
* **API Feature:** New `astcenc_get_block_info()` function added to the core
API to allow users to perform high level analysis of compressed data. This
API is not implemented in decompressor-only builds.
* **API Feature:** Codec configuration structure has been extended to expose
the new RGBM compression mode. See the API header for details.
<!-- ---------------------------------------------------------------------- -->
## 2.4
**Status:** Released, February 2021
The 2.4 release is the fifth release in the 2.x series. It is primarily a bug
fix release for HDR image handling, which impacts all earlier 2.x series
releases.
**General:**
* **Feature:** When using the `-a` option, or the equivalent config option
for the API, any 2D blocks that are entirely zero alpha after the alpha
filter radius is taken into account are replaced by transparent black
constant color blocks. This is an RDO-like technique to improve compression
ratios of any additional application packaging compression that is applied.
**Command Line:**
* **Bug fix:** The command line wrapper now correctly loads HDR images that
have a non-square aspect ratio.
<!-- ---------------------------------------------------------------------- -->
## 2.3
**Status:** Released, January 2021
The 2.3 release is the fourth release in the 2.x series. It includes a number
of performance improvements and new features.
Reminder for users of the library interface - the API is not designed to be
stable across versions, and this release is not compatible with 2.2. Please
recompile your client-side code using the updated `astcenc.h` header.
* **General:**
* **Feature:** Decompressor-only builds of the codec are supported again.
While this is primarily a feature for library users who want to shrink
binary size, a variant command line tool `astcdec` can be built by
specifying `DECOMPRESSOR=ON` on the CMake configure command line.
* **Feature:** Diagnostic builds of the codec can now be built. These builds
generate a JSON file containing a trace of the compressor execution.
Diagnostic builds are only suitable for codec development; they are slower
and JSON generation cannot be disabled. Build by setting `DIAGNOSTICS=ON`
on the CMake configure command line.
* **Feature:** Code compatibility improved with older versions of GCC,
earliest compiler now tested is GCC 7.5 (was GCC 9.3).
* **Feature:** Code compatibility improved with newer versions of LLVM,
latest compiler now tested is Clang 12.0 (was Clang 9.0).
* **Feature:** Code compatibility improved with the Visual Studio 2019 LLVM
toolset (`clang-cl`). Using the LLVM toolset gives 25% performance
improvements and is recommended.
* **Command Line:**
* **Feature:** Quality level now accepts either a preset (`-fast`, etc) or a
float value between 0 and 100, allowing more control over the compression
quality vs performance trade-off. The presets are not evenly spaced in the
float range; they have been spaced to give the best distribution of points
between the fast and thorough presets.
* `-fastest`: 0.0
* `-fast`: 10.0
* `-medium`: 60.0
* `-thorough`: 98.0
* `-exhaustive`: 100.0
* **Core API:**
* **API Change:** Quality level preset enum replaced with a float value
between 0 (`-fastest`) and 100 (`-exhaustive`). See above for more info.
### Performance
This release includes a number of optimizations to improve performance.
* New compressor algorithm for handling encoding candidates and refinement.
* Vectorized implementation of `compute_error_of_weight_set()`.
* Unrolled implementation of `encode_ise()`.
* Many other small improvements!
The most significant change is the change to the compressor path, which now
uses an adaptive approach to candidate trials and block refinement.
In earlier releases the quality level will determine the number of encoding
candidates and the number of iterative refinement passes that are used for each
major encoding trial. This is a fixed behavior; it will always try the full N
candidates and M refinement iterations specified by the quality level for each
encoding trial.
The new approach implements two optimizations for this:
* Compression will complete when a block candidate hits the specified target
quality, after its M refinement iterations have been applied. Later block
candidates are simply abandoned.
* Block candidates will predict how much refinement can improve them, and
abandon refinement if they are unlikely to improve upon the best known
encoding already in-hand.
This pair of optimizations provides significant performance improvement to the
high quality modes which use the most block candidates and refinement
iterations. A minor loss of image quality is expected, as the blocks we no
longer test or refine may have been better coding choices.
**Absolute performance vs 2.2 release:**
![Absolute scores 2.3 vs 2.2](./ChangeLogImg/absolute-2.2-to-2.3.png)
**Relative performance vs 2.2 release:**
![Relative scores 2.3 vs 2.2](./ChangeLogImg/relative-2.2-to-2.3.png)
<!-- ---------------------------------------------------------------------- -->
## 2.2
**Status:** Released, January 2021
The 2.2 release is the third release in the 2.x series. It includes a number
of performance improvements and new features.
Reminder for users of the library interface - the API is not designed to be
stable across versions, and this release is not compatible with 2.1. Please
recompile your client-side code using the updated `astcenc.h` header.
* **General:**
* **Feature:** New Arm aarch64 NEON accelerated vector library support.
* **Improvement:** New CMake build system for all platforms.
* **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more
accurately reflects the feature set used.
* **Binary releases:**
* **Improvement:** Linux binaries changed to use Clang 9.0, which gives
up to 15% performance improvement.
* **Improvement:** Windows binaries are now code signed.
* **Improvement:** macOS binaries for Apple silicon platforms now provided.
* **Improvement:** macOS binaries are now code signed and notarized.
* **Command Line:**
* **Feature:** New image preprocess `-pp-normalize` option added. This forces
normal vectors to be unit length, which is useful when compressing source
textures that use normal length to encode an NDF, which is incompatible
with ASTC's two channel encoding.
* **Feature:** New image preprocess `-pp-premultiply` option added. This
scales RGB values by the alpha value. This can be useful to minimize
cross-channel color bleed caused by GPU post-multiply filtering/blending.
* **Improvements:** Command line tool cleanly traps and reports errors for
corrupt input images rather than relying on standard library `assert()`
calls in release builds.
* **Core API:**
* **API Change:** Images using region-based metrics no longer need to include
padding; all input images should be tightly packed and `dim_pad` is removed
from the `astcenc_image` structure. This makes it easier to directly use
images loaded from other libraries.
* **API Change:** Image `data` is no longer a 3D array accessed using
`data[z][y][x]` indexing, it's an array of 2D slices. This makes it easier
to directly use images loaded from other libraries.
* **API Change:** New `ASTCENC_FLG_SELF_DECOMPRESS_ONLY` flag added to the
codec config. Using this flag enables additional optimizations that
aggressively exploit implementation- and configuration-specific behavior
to gain performance. When using this flag the codec can only reliably
decompress images that were compressed in the same context session. Images
produced via other means may fail to decompress correctly, even if they are
otherwise valid ASTC files.
### Performance
There is one major set of optimizations in this release, related to the new
`ASTCENC_FLG_SELF_DECOMPRESS_ONLY` mode. These allow the compressor to only
create data tables it knows that it is going to use, based on its current set
of heuristics, rather than needing the full set the format allows.
The first benefit of these changes is a reduced context creation time, which
can be reduced by up to 250ms on our test machine. This is a significant
percentage of the command line utility runtime for a small image when using a
quick search preset. Compressing the whole Kodak test suite using the command
line utility and the `-fastest` preset is ~30% faster with this release, which
is mostly due to faster startup.
The reduction in the data table size in this mode also improves the core codec
speed. Our test sets show an average of 12% improvement in the codec for
`-fastest` mode, and an average of 3% for `-medium` mode.
Key for performance charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Absolute performance vs 2.1 release:**
![Absolute scores 2.2 vs 2.1](./ChangeLogImg/absolute-2.1-to-2.2.png)
**Relative performance vs 2.1 release:**
![Relative scores 2.2 vs 2.1](./ChangeLogImg/relative-2.1-to-2.2.png)
<!-- ---------------------------------------------------------------------- -->
## 2.1
**Status:** Released, November 2020
The 2.1 release is the second release in the 2.x series. It includes a number
of performance optimizations and new features.
Reminder for users of the library interface - the API is not designed to be
stable across versions, and this release is not compatible with 2.0. Please
recompile your client-side code using the updated `astcenc.h` header.
### Features:
* **Command line:**
* **Bug fix:** The meaning of the `-tH\cH\dH` and `-th\ch\dh` compression
modes was inverted. They now match the documentation; use `-*H` for HDR
RGBA, and `-*h` for HDR RGB with LDR alpha.
* **Feature:** A new `-fastest` quality preset is now available. This is
designed for fast "roughing out" of new content, and sacrifices significant
image quality compared to `-fast`. We do not recommend its use for
production builds.
* **Feature:** A new `-candidatelimit` compression tuning option is now
available. This is a power-user control to determine how many candidates
are returned for each block mode encoding trial. This feature is used
automatically by the search presets; see `-help` for details.
* **Improvement:** The compression test modes (`-tl\ts\th\tH`) now emit a
MTex/s performance metric, in addition to coding time.
* **Core API:**
* **Feature:** A new quality preset `ASTCENC_PRE_FASTEST` is available. See
`-fastest` above for details.
* **Feature:** A new tuning option `tune_candidate_limit` is available in
the config structure. See `-candidatelimit` above for details.
* **Feature:** Image input/output can now use `ASTCENC_TYPE_F32` data types.
* **Stability:**
* **Feature:** The SSE2, SSE4.2, and AVX2 variants now produce identical
compressed output when run on the same CPU when compiled with the
preprocessor define `ASTCENC_ISA_INVARIANCE=1`. For Make builds this can
be set on the command line by setting `ISA_INV=1`. ISA invariance is off
by default; it reduces performance by 1-3%.
### Performance
Key for performance charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Absolute performance vs 2.0 release:**
![Absolute scores 2.1 vs 2.0](./ChangeLogImg/absolute-2.0-to-2.1.png)
**Relative performance vs 2.0 release:**
![Relative scores 2.1 vs 2.0](./ChangeLogImg/relative-2.0-to-2.1.png)
<!-- ---------------------------------------------------------------------- -->
## 2.0
**Status:** Released, August 2020
The 2.0 release is the first release in the 2.x series. It includes a number of
major changes over the earlier 1.7 series, and is not command-line compatible.
### Features:
* The core codec can be built as a library, exposed via a new codec API.
* The core codec supports accelerated SIMD paths for SSE2, SSE4.2, and AVX2.
* The command line syntax has a clearer mapping to Khronos feature profiles.
### Performance:
Key for performance charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Absolute performance vs 1.7 release:**
![Absolute scores 2.0 vs 1.7](./ChangeLogImg/absolute-1.7-to-2.0.png)
**Relative performance vs 1.7 release:**
![Relative scores 2.0 vs 1.7](./ChangeLogImg/relative-1.7-to-2.0.png)
- - -
_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
+308
View File
@@ -0,0 +1,308 @@
# 3.x series change log
This page summarizes the major functional and performance changes in each
release of the 3.x series.
All performance data on this page is measured on an Intel Core i5-9600K
clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
<!-- ---------------------------------------------------------------------- -->
## 3.7
**Status:** April 2022
The 3.7 release contains another round of performance optimizations, including
significant improvements to the command line front-end (faster PNG loader) and
the arm64 build of the codec (faster NEON implementation).
* **General:**
* **Feature:** The command line tool PNG loader has been switched to use
the Wuffs library, which is robust and significantly faster than the
current stb_image implementation.
* **Feature:** Support for non-invariant builds returns. Opt-in to slightly
faster, but not bit-exact, builds by setting `-DNO_INVARIANCE=ON` for the
CMake configuration. This improves performance by around 2%.
* **Optimization:** Changed SIMD `select()` so that it matches the default
NEON behavior (bitwise select), rather than the default x86-64 behavior
(lane select on MSB). Specialization `select_msb()` added for the one case
we want to select on a sign-bit, where NEON needs a different
implementation. This provides a significant (>25%) performance uplift on
NEON implementations.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 3.6 release:**
![Relative scores 3.7 vs 3.6](./ChangeLogImg/relative-3.6-to-3.7.png)
<!-- ---------------------------------------------------------------------- -->
## 3.6
**Status:** April 2022
The 3.6 release contains another round of performance optimizations.
There are no interface changes in this release, but in general the API is not
designed to be binary compatible across versions. We always recommend
rebuilding your client-side code using the updated `astcenc.h` header.
* **General:**
* **Feature:** Data tables are now optimized for contexts without the
`SELF_DECOMPRESS_ONLY` flag set. The flag therefore no longer improves
compression performance, but still reduces context creation time and
context data table memory footprint.
* **Feature:** Image quality for 4x4 `-fastest` configuration has been
improved.
* **Optimization:** Decimation modes are reliably excluded from processing
when they are only partially selected in the compressor configuration (e.g.
if used for single plane, but not dual plane modes). This is a significant
performance optimization for all quality levels.
* **Optimization:** Fast-path block load function variant added for 2D LDR
images with no swizzle. This is a moderate performance optimization for the
fast and fastest quality levels.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 3.5 release:**
![Relative scores 3.6 vs 3.5](./ChangeLogImg/relative-3.5-to-3.6.png)
<!-- ---------------------------------------------------------------------- -->
## 3.5
**Status:** March 2022
The 3.5 release contains another round of performance optimizations.
There are no interface changes in this release, but in general the API is not
designed to be binary compatible across versions. We always recommend
rebuilding your client-side code using the updated `astcenc.h` header.
* **General:**
* **Feature:** Compressor configurations using `SELF_DECOMPRESS_ONLY` mode
store compacted partition tables, which significantly improves both
context create time and runtime performance.
* **Feature:** Bilinear infill for decimated weight grids supports a new
variant for half-decimated grids which are only decimated in one axis.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 3.4 release:**
![Relative scores 3.5 vs 3.4](./ChangeLogImg/relative-3.4-to-3.5.png)
<!-- ---------------------------------------------------------------------- -->
## 3.4
**Status:** February 2022
The 3.4 release introduces another round of optimizations, removing a number
of power-user configuration options to simplify the core compressor data path.
Reminder for users of the library interface - the API is not designed to be
binary compatible across versions, and this release is not compatible with
earlier releases. Please update and rebuild your client-side code using the
updated `astcenc.h` header.
* **General:**
* **Feature:** Many memory allocations have been moved off the stack into
dynamically allocated working memory. This significantly reduces the peak
stack usage, allowing the compressor to run in systems with 128KB stack
limits.
* **Feature:** Builds now support `-DBLOCK_MAX_TEXELS=<count>` to allow a
compressor to support a subset of block sizes. This can reduce binary size
and runtime memory footprint, and improve performance.
* **Feature:** The `-v` and `-va` options to set a per-texel error weight
function are no longer supported.
* **Feature:** The `-b` option to set a per-texel error weight boost for
block border texels is no longer supported.
* **Feature:** The `-a` option to set a per-texel error weight based on texel
alpha value is no longer supported as an error weighting tool, but is still
supported for providing sprite-sheet RDO.
* **Feature:** The `-mask` option to set an error metric for mask map
textures is still supported, but is currently a no-op in the compressor.
* **Feature:** The `-perceptual` option to set a perceptual error metric is
still supported, but is currently a no-op in the compressor for mask map
and normal map textures.
* **Bug-fix:** Corrected decompression of error blocks in some cases, so now
returning the expected error color (magenta for LDR, NaN for HDR). Note
that astcenc determines the error color to use based on the output image
data type not the decoder profile.
* **Binary releases:**
* **Improvement:** Windows binaries changed to use ClangCL 12.0, which gives
up to 10% performance improvement.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 3.3 release:**
![Relative scores 3.4 vs 3.3](./ChangeLogImg/relative-3.3-to-3.4.png)
<!-- ---------------------------------------------------------------------- -->
## 3.3
**Status:** November 2021
The 3.3 release improves image quality for normal maps, and two component
textures. Normal maps are expected to compress 25% slower than the 3.2
release, although it should be noted that they are still faster to compress
in 3.3 than when using the 2.5 series. This release also fixes one reported
stability issue.
* **General:**
* **Feature:** Normal map image quality has been improved.
* **Feature:** Two component image quality has been improved, provided
that unused components are correctly zero-weighted using e.g. `-cw` on the
command line.
* **Bug-fix:** Improved stability when trying to compress complex blocks that
could not beat even the starting quality threshold. These will now always
compress into constant color blocks.
<!-- ---------------------------------------------------------------------- -->
## 3.2
**Status:** August 2021
The 3.2 release is a bugfix release; no significant image quality or
performance differences are expected.
* **General:**
* **Bug-fix:** Improved stability when new contexts were created while other
contexts were compressing or decompressing an image.
* **Bug-fix:** Improved stability when decompressing blocks with invalid
block encodings.
<!-- ---------------------------------------------------------------------- -->
## 3.1
**Status:** July 2021
The 3.1 release gives another performance boost, typically between 5 and 20%
faster than the 3.0 release, as well as further incremental improvements to
image quality. A number of build system improvements make astcenc easier and
faster to integrate into other projects as a library, including support for
building universal binaries on macOS. Full change list is shown below.
Reminder for users of the library interface - the API is not designed to be
binary compatible across versions, and this release is not compatible with
earlier releases. Please update and rebuild your client-side code using the
updated `astcenc.h` header.
* **General:**
* **Feature:** RGB color data now supports `-perceptual` operation. The
current implementation is simple, weighting color channel errors by their
contribution to perceived luminance. This mimics the behavior of the human
visual system, which is most sensitive to green, then red, then blue.
* **Feature:** Codec supports a new low weight search mode, which is a
simpler weight assignment for encodings with a low number of weights in the
weight grid. The weight threshold can be overridden using the new
`-lowweightmodelimit` command line option.
* **Feature:** All platform builds now support building a native binary.
Native binaries automatically select the SIMD level based on the default
configuration of the compiler in use. Native binaries built on one machine
may use different SIMD options than native binaries built on another.
* **Feature:** macOS platform builds now support building universal binaries
containing both `x86_64` and `arm64` target support.
* **Feature:** Building the command line can be disabled when using as a
library in another project. Set `-DCLI=OFF` during the CMake configure
step.
* **Feature:** A standalone minimal example of the core codec API usage has
been added in the `./Utils/Example/` directory.
* **Core API:**
* **Feature:** Config flag `ASTCENC_FLG_USE_PERCEPTUAL` works for color data.
* **Feature:** Config option `tune_low_weight_count_limit` added.
* **Feature:** New heuristic added which prunes dual weight plane searches if
they are unlikely to help. This heuristic is not user controllable.
* **Feature:** Image quality has been improved. In general we see significant
improvements (up to 0.2dB) for high bitrate encodings (4x4, 5x4), and a
smaller improvement (up to 0.1dB) for lower bitrate encodings.
* **Bug fix:** Arm "none" SIMD builds could be invariant with other builds.
This fix has also been back-ported to the 2.x LTS branch.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 3.0 release:**
![Relative scores 3.1 vs 3.0](./ChangeLogImg/relative-3.0-to-3.1.png)
<!-- ---------------------------------------------------------------------- -->
## 3.0
**Status:** June 2021
The 3.0 release is the first in a series of updates to the compressor that are
making more radical changes than we felt we could make with the 2.x series.
The primary goals of the 3.x series are to keep the image quality ~static or
better compared to the 2.5 release, but continue to improve performance.
Reminder for users of the library interface - the API is not designed to be
binary compatible across versions, and this release is not compatible with
earlier releases. Please update and rebuild your client-side code using the
updated `astcenc.h` header.
* **General:**
* **Feature:** The code has been significantly cleaned up, with improved
comments, API documentation, function naming, and variable naming.
* **Core API:**
* **API Change:** The core APIs for `astcenc_compress_image()` and for
`astcenc_decompress_image()` now accept swizzle structures by `const`
pointer, instead of pass-by-value.
* **API Change:** Calling the `astcenc_compress_reset()` and the
`astcenc_decompress_reset()` functions between images is no longer required
if the context was created for use by a single thread.
* **Feature:** New heuristics have been added for controlling when to search
beyond 2 partitions and 1 plane, and when to search beyond 3 partitions and
1 plane. The previous `tune_partition_early_out_limit` config option has
been removed, and replaced with two new options
`tune_2_partition_early_out_limit_factor` and
`tune_3_partition_early_out_limit_factor`. See command line help for more
detailed documentation.
* **Feature:** New heuristics have been added for controlling when to use
dual weight planes. The previous `tune_two_plane_early_out_limit` has been
renamed to `tune_2_plane_early_out_limit_correlation`. See command line help
for more detailed documentation.
* **Feature:** Support for using dual weight planes has been restricted to
single partition blocks; it rarely helps blocks with 2 or more partitions
and takes considerable compression search time.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 2.5 release:**
![Relative scores 3.0 vs 2.5](./ChangeLogImg/relative-2.5-to-3.0.png)
- - -
_Copyright © 2021-2022, Arm Limited and contributors. All rights reserved._
+416
View File
@@ -0,0 +1,416 @@
# 4.x series change log
This page summarizes the major functional and performance changes in each
release of the 4.x series.
All performance data on this page is measured on an Intel Core i5-9600K
clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
<!-- ---------------------------------------------------------------------- -->
## 4.8.0
**Status:** May 2024
The 4.8.0 release is a minor maintenance release.
* **General:**
* **Bug fix:** Native builds on macOS will now correctly build for arm64 when
run outside of Rosetta on an Apple silicon device.
* **Bug fix:** Multiple small improvements to remove use of undefined
language behavior, to improve support for deployment using Emscripten.
* **Feature:** Builds using Clang can now build with undefined behavior
sanitizer by setting `-DASTCENC_UBSAN=ON` on the CMake configure line.
* **Feature:** Updated to Wuffs library 0.3.4, which ignores tRNS alpha
chunks for type 4 (LA) and 6 (RGBA) PNGs, to improve compatibility with
libpng.
<!-- ---------------------------------------------------------------------- -->
## 4.7.0
**Status:** January 2024
The 4.7.0 release is a major maintenance release, fixing rounding behavior in
the decompressor to match the Khronos specification. This fix includes the
addition of explicit support for optimizing for `decode_unorm8` rounding.
Reminder - the codec library API is not designed to be binary compatible across
versions. We always recommend rebuilding your client-side code using the
updated `astcenc.h` header.
* **General:**
* **Bug fix:** sRGB LDR decompression now uses the correct endpoint expansion
method to create the 16-bit RGB endpoint colors, and removes the previous
correction code from the interpolation function. This bug could result in
LSB bit flips relative to the standard specification.
* **Bug fix:** Decompressing to an 8-bit per component output image now
matches the `decode_unorm8` extension rounding rules. This bug could result
in LSB bit flips relative to the standard specification.
* **Bug fix:** Code now avoids using `alignas()` in the reference C
implementation, as the default `alignas(16)` is narrower than the
native minimum alignment requirement on some CPUs.
* **Feature:** Library configuration supports a new flag,
`ASTCENC_FLG_USE_DECODE_UNORM8`. This flag indicates that the image will be
used with the `decode_unorm8` decode mode. When set during compression
this allows the compressor to use the correct rounding when determining the
best encoding.
* **Feature:** Command line tool supports a new option, `-decode_unorm8`.
This option indicates that the image will be used with the `decode_unorm8`
decode mode. This option will automatically be set for decompression
(`-d*`) and trial (`-t*`) tool operation if the decompressed output image
is stored to an 8-bit per component file format. This option must be set
manually for compression (`-c*`) tool operation, as the desired decode mode
cannot be reliably determined.
* **Feature:** Library configuration supports a new optional progress
reporting callback. This is called during compression to
allow interactive tooling use cases to display incremental progress. The
command line tool uses this feature to show compression progress unless
`-silent` is used.
<!-- ---------------------------------------------------------------------- -->
## 4.6.1
**Status:** November 2023
The 4.6.1 release is a minor maintenance release to fix a scaling bug on
large core count Windows systems.
* **General:**
* **Optimization:** Windows builds of the `astcenc` command line tool can now
use more than 64 cores on large core count systems. This change doubled
command line performance for `-exhaustive` compression when testing on a
96 core/192 thread system.
* **Feature:** Windows Arm64 native builds of the `astcenc` command line tool
are now included in the prebuilt release binaries.
<!-- ---------------------------------------------------------------------- -->
## 4.6.0
**Status:** November 2023
The 4.6.0 release retunes the compressor heuristics to give improvements to
performance for trivial losses to image quality. It also includes some minor
bug fixes and code quality improvements.
Reminder - the codec library API is not designed to be binary compatible across
versions. We always recommend rebuilding your client-side code using the updated
`astcenc.h` header.
* **General:**
* **Bug-fix:** Fixed context allocation for contexts allocated with the
`ASTCENC_FLG_DECOMPRESS_ONLY` flag.
* **Bug-fix:** Reduced use of `reinterpret_cast` in the core codec to
avoid strict aliasing violations.
* **Optimization:** `-medium` search quality no longer tests 4 partition
encodings for block sizes between 25 and 83 texels (inclusive). This
improves performance for a tiny drop in image quality.
* **Optimization:** `-thorough` and higher search qualities no longer test the
mode0 first search for block sizes between 25 and 83 texels (inclusive).
This improves performance for a tiny drop in image quality.
* **Optimization:** `TUNE_MAX_PARTITIONING_CANDIDATES` reduced from 32 to 8
to reduce the size of stack allocated data structures. This causes a tiny
drop in image quality for the `-verythorough` and `-exhaustive` presets.
<!-- ---------------------------------------------------------------------- -->
## 4.5.0
**Status:** June 2023
The 4.5.0 release is a maintenance release with small image quality
improvements, and a number of build system quality of life improvements.
* **General:**
* **Bug-fix:** Improved handling of compiler arguments in CMake, including
consistent use of MSVC-style command line arguments for ClangCL.
* **Bug-fix:** Invariant Clang builds now use `-ffp-model=precise` with
`-ffp-contract=off` which is needed to restore invariance due to recent
changes in compiler defaults.
* **Change:** macOS binary releases are now distributed as a single universal
binary for all platforms.
* **Change:** Windows binary releases are now compiled with VS2022.
* **Change:** Invariant MSVC builds for VS2022 now use `/fp:precise` instead
of `/fp:strict`, which is now possible because precise no longer implies
contraction. This should improve performance for MSVC builds.
* **Change:** Non-invariant Clang builds now use `-ffp-model=precise` with
`-ffp-contract=on`. This should improve performance on older Clang
versions which defaulted to no contraction.
* **Change:** Non-invariant MSVC builds for VS2022 now use `/fp:precise`
with `/fp:contract`. This should improve performance for MSVC builds.
* **Change:** CMake config variables now use an `ASTCENC_` prefix to add a
namespace and group options when the library is used in a larger project.
* **Change:** CMake config `ASTCENC_UNIVERSAL_BUILD` for building macOS
universal binaries has been improved to include the `x86_64h` slice for
AVX2 builds. Universal builds are now on by default for macOS, and always
include NEON (arm64), SSE4.1 (x86_64), and AVX2 (x86_64h) variants.
* **Change:** CMake config `ASTCENC_NO_INVARIANCE` has been inverted to
remove the negated option, and is now `ASTCENC_INVARIANCE` with a default
of `ON`. Disabling this option can substantially improve performance, but
images can differ across platforms and compilers.
* **Optimization:** Color quantization and packing for LDR RGB and RGBA has
been vectorized to improve performance.
* **Change:** Color quantization for LDR RGB and RGBA endpoints will now try
multiple quantization packing methods, and pick the one with the lowest
endpoint encoding error. This gives a minor image quality improvement, for
no significant performance impact when combined with the vectorization
optimizations.
<!-- ---------------------------------------------------------------------- -->
## 4.4.0
**Status:** March 2023
The 4.4.0 release is a minor release with image quality improvements, a small
performance boost, and a few new quality-of-life features.
* **General:**
* **Change:** Core library no longer checks availability of required
instruction set extensions, such as SSE4.1 or AVX2. Checking compatibility
is now the responsibility of the caller. See `astcenccli_entry.cpp` for
an example of code performing this check.
* **Change:** Core library can be built as a shared object by setting the
`-DSHAREDLIB=ON` CMake option, resulting in e.g. `libastcenc-avx2-shared.so`.
Note that the command line tool is always statically linked.
* **Change:** Decompressed 3D images will now write one output file per
slice, if the target format is a 2D image format.
* **Change:** Command line errors print to stderr instead of stdout.
* **Change:** Color encoding uses new quantization tables, that now factor
in floating-point rounding if a distance tie is found when using the
integer quant256 value. This improves image quality for 4x4 and 5x5 block
sizes.
* **Optimization:** Partition selection uses a simplified line calculation
with a faster approximation. This improves performance for all block sizes.
* **Bug-fix:** Fixed missing symbol error in decompressor-only builds.
* **Bug-fix:** Fixed infinity handling in debug trace JSON files.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 4.3 release:**
![Relative scores 4.4 vs 4.3](./ChangeLogImg/relative-4.3-to-4.4.png)
<!-- ---------------------------------------------------------------------- -->
## 4.3.1
**Status:** January 2023
The 4.3.1 release is a minor maintenance release. No performance or image
quality changes are expected.
* **General:**
* **Bug-fix:** Fixed typo in `-2/3/4partitioncandidatelimit` CLI options.
* **Bug-fix:** Fixed handling for `-3/4partitionindexlimit` CLI options.
* **Bug-fix:** Updated to `stb_image.h` v2.28, which includes multiple fixes
and improvements for image loading.
<!-- ---------------------------------------------------------------------- -->
## 4.3.0
**Status:** January 2023
The 4.3.0 release is an optimization release. There are minor performance
and image quality improvements in this release.
Reminder - the codec library API is not designed to be binary compatible across
versions. We always recommend rebuilding your client-side code using the updated
`astcenc.h` header.
* **General:**
* **Bug-fix:** Use lower case `windows.h` include for MinGW compatibility.
* **Change:** The `-mask` command line option, `ASTCENC_FLG_MAP_MASK` in the
library API, has been removed.
* **Optimization:** Always skip blue-contraction for `QUANT_256` encodings.
This gives a small image quality improvement for the 4x4 block size.
* **Optimization:** Always skip RGBO vector calculation for LDR encodings.
* **Optimization:** Defer color packing and scrambling to physical layer.
* **Optimization:** Remove folded `decimation_info` lookup tables. This
significantly reduces compressor memory footprint and improves context
creation time. Impact increases with the active block size.
* **Optimization:** Increased trial and refinement pruning by using stricter
target errors when determining whether to skip iterations.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 4.2 release:**
![Relative scores 4.3 vs 4.2](./ChangeLogImg/relative-4.2-to-4.3.png)
<!-- ---------------------------------------------------------------------- -->
## 4.2.0
**Status:** November 2022
The 4.2.0 release is an optimization release. There are significant performance
improvements, minor image quality improvements, and library interface changes in
this release.
Reminder - the codec library API is not designed to be binary compatible across
versions. We always recommend rebuilding your client-side code using the updated
`astcenc.h` header.
* **General:**
* **Bug-fix:** Compression for RGB and RGBA base+offset encodings no
longer generates endpoints with the incorrect blue-contract behavior.
* **Bug-fix:** Lowest channel correlation calculation now correctly ignores
constant color channels for the purposes of filtering 2 plane encodings.
On average this improves both performance and image quality.
* **Bug-fix:** ISA compatibility now checked in `config_init()` as well as
in `context_alloc()`.
* **Change:** Removed the low-weight count optimization, as more recent
changes had significantly reduced its performance benefit. Option removed
from both command line and configuration structure.
* **Feature:** The `-exhaustive` mode now runs full trials on more
partitioning candidates and block candidates. This improves image quality
by 0.1 to 0.25 dB, but slows down compression by 3x. The `-verythorough`
and `-thorough` modes also test more candidates.
* **Feature:** A new preset, `-verythorough`, has been introduced to provide
a standard performance point between `-thorough` and the re-tuned
`-exhaustive` mode. This new mode is faster and higher quality than the
`-exhaustive` preset in the 4.1 release.
* **Feature:** The compressor can now independently vary the number of
partitionings considered for error estimation for 2/3/4 partitions. This
allows heuristics to put more effort into 2 partitions, and less in to
3/4 partitions.
* **Feature:** The compressor can now run trials on a variable number of
candidate partitionings, allowing high quality modes to explore more of the
search space at the expense of slower compression. The number of trials is
independently configurable for 2/3/4 partition cases.
* **Optimization:** Introduce early-out threshold for 2/3/4 partition
searches based on the results after 1 of 2 trials. This significantly
improves performance for `-medium` and `-thorough` searches, for a minor
loss in image quality.
* **Optimization:** Reduce early-out threshold for 3/4 partition searches
based on 2/3 partition results. This significantly improves performance,
especially for `-thorough` searches, for a minor loss in image quality.
* **Optimization:** Use direct vector compare to create a SIMD mask instead
of a scalar compare that is broadcast to a vector mask.
* **Optimization:** Remove obsolete partition validity masks from the
partition selection algorithm.
* **Optimization:** Removed obsolete channel scaling from partition
`avgs_and_dirs()` calculation.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 4.0 and 4.1 release:**
![Relative scores 4.2 vs 4.0](./ChangeLogImg/relative-4.0-to-4.2.png)
<!-- ---------------------------------------------------------------------- -->
## 4.1.0
**Status:** August 2022
The 4.1.0 release is a maintenance release. There is no performance or image
quality change in this release.
* **General:**
* **Change:** Command line decompressor no longer uses the legacy
`GL_LUMINANCE` or `GL_LUMINANCE_ALPHA` format enums when writing KTX
output files. Luminance textures now use the `GL_RED` format and
luminance_alpha textures now use the `GL_RG` format.
* **Change:** Command line tool gains a new `-dimage` option to generate
diagnostic images showing aspects of the compression encoding. The output
file name with its extension stripped is used as the stem of the diagnostic
image file names.
* **Bug-fix:** Library decompressor builds for SSE no longer use masked store
`maskmovdqu` instructions, as they can generate faults on masked lanes.
* **Bug-fix:** Command line decompressor now correctly uses sized type enums
for the internal format when writing output KTX files.
* **Bug-fix:** Command line compressor now correctly loads 16 and 32-bit per
component input KTX files.
* **Bug-fix:** Fixed GCC9 compiler warnings on Arm aarch64.
<!-- ---------------------------------------------------------------------- -->
## 4.0.0
**Status:** July 2022
The 4.0.0 release introduces some major performance enhancements, and a number
of larger changes to the heuristics used in the codec to find a more effective
cost:quality trade off.
* **General:**
* **Change:** The `-array` option for specifying the number of image planes
for ASTC 3D volumetric block compression has been renamed to `-zdim`.
* **Change:** The build root package directory is now `bin` instead of
`astcenc`, allowing the CMake install step to write binaries into
`/usr/local/bin` if the user wishes to do so.
* **Feature:** A new `-ssw` option for specifying the shader sampling swizzle
has been added as a convenience alternative to the `-cw` option. This is
needed to correct error weighting during compression if not all components
are read in the shader. For example, to extract and compress two components
from an RGBA input image, weighting the two components equally when
sampling through .ra in the shader, use `-esw ggga -ssw ra`. In this
example `-ssw ra` is equivalent to the alternative `-cw 1 0 0 1` encoding.
* **Feature:** The `-a` alpha weighting option has been re-enabled in the
backend, and now again applies alpha scaling to the RGB error metrics when
encoding. This is based on the maximum alpha in each block, not the
individual texel alpha values used in the earlier implementation.
* **Feature:** The command line tool now has `-repeats <count>` for testing,
which will iterate around compression and decompression `count` times.
Reported performance metrics also now separate compression and
decompression scores.
* **Feature:** The core codec is now warning clean up to /W4 for both MSVC
`cl.exe` and `clangcl.exe` compilers.
* **Feature:** The core codec now supports arm64 for both MSVC `cl.exe` and
`clangcl.exe` compilers.
* **Feature:** `NO_INVARIANCE` builds will enable the `-ffp-contract=fast`
option for all targets when using Clang or GCC. In addition AVX2 targets
will also set the `-mfma` option. This reduces image quality by up to 0.2dB
(normally much less), but improves performance by up to 5-20%.
* **Optimization:** Angular endpoint min/max weight selection is restricted
to weight `QUANT_11` or lower. Higher quantization levels assume default
0-1 range, which is less accurate but much faster.
* **Optimization:** Maximum weight quantization for later trials is selected
based on the weight quantization of the best encoding from the 1 plane 1
partition trial. This significantly reduces the search space for the later
trials with more planes or partitions.
* **Optimization:** Small data tables now use in-register SIMD permutes
rather than gathers (AVX2) or unrolled scalar lookups (SSE/NEON). This can
be a significant optimization for paths that are load unit limited.
* **Optimization:** Decompressed image block writes in the decompressor now
use a vectorized approach to writing each row of texels in the block,
including the ability to exploit masked stores if the target supports them.
* **Optimization:** Weight scrambling has been moved into the physical layer;
the rest of the codec now uses linear order weights.
* **Optimization:** Weight packing has been moved into the physical layer;
the rest of the codec now uses unpacked weights in the 0-64 range.
* **Optimization:** Consistently vectorize the creation of unquantized weight
grids when they are needed.
* **Optimization:** Remove redundant per-decimation mode copies of endpoint
and weight structures, which were really read-only duplicates.
* **Optimization:** Early-out the same endpoint mode color calculation if it
cannot be applied.
* **Optimization:** Numerous type size reductions applied to arrays to reduce
both context working buffer size usage and stack usage.
### Performance:
Key for charts:
* Color = block size (see legend).
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
**Relative performance vs 3.7 release:**
![Relative scores 4.0 vs 3.7](./ChangeLogImg/relative-3.7-to-4.0.png)
- - -
_Copyright © 2022-2024, Arm Limited and contributors. All rights reserved._
+105
View File
@@ -0,0 +1,105 @@
# 5.x series change log
This page summarizes the major functional and performance changes in each
release of the 5.x series.
All performance data on this page is measured on an Intel Core i5-9600K
clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
<!-- ---------------------------------------------------------------------- -->
## 5.3.0
**Status:** March 2025
The 5.3.0 release is a minor maintenance release.
* **General:**
* **Feature:** Reference C builds (`ASTCENC_ISA_NONE`) now support compiling
for big-endian CPUs. Compile with `-DASTCENC_BIG_ENDIAN=ON` when compiling
for a big-endian target; it is not auto-detected.
* **Improvement:** Builds using GCC now specify `-flto=auto` to allow
parallel link steps, and remove the log warnings about not setting a CPU
count parameter value.
* **Bug fix:** Builds using MSVC `cl.exe` that do not specify an explicit
ISA using the preprocessor configuration defines will now correctly
default to the SSE2 backend on x86-64 and the NEON backend on Arm64. Previously they would have defaulted to the reference C implementation,
which is around 3.25 times slower.
<!-- ---------------------------------------------------------------------- -->
## 5.2.0
**Status:** February 2025
The 5.2.0 release is a minor maintenance release.
This release includes changes to the public interface in the `astcenc.h`
header. We always recommend rebuilding your client-side code using the
header from the same release to avoid compatibility issues.
* **General:**
* **Change:** Changed sRGB alpha channel endpoint expansion to match the
revised Khronos Data Format Specification (v1.4.0), which reverts an
unintended specification change. Compared to previous releases, this change
can cause LSB bit differences in the alpha channel of compressed images.
* **Feature:** Arm64 builds for Linux added to the GitHub Actions builds, and
Arm64 binaries for NEON, 128-bit SVE, and 256-bit SVE added to release
builds.
* **Feature:** Added a new codec API, `astcenc_compress_cancel()`, which can
be used to cancel an in-flight compression. This is designed to help make
it easier to integrate the codec into an interactive user interface that
can respond to user events with low latency.
* **Bug fix:** Removed incorrect `static` variable qualifier, which could
result in an incorrect `tune_mse_overshoot` heuristic threshold being used
if a user ran multiple concurrent compressions with different settings.
<!-- ---------------------------------------------------------------------- -->
## 5.1.0
**Status:** November 2024
The 5.1.0 release is an optimization release, giving moderate performance
improvements on all platforms. There are no image quality differences.
* **General:**
* **Feature:** Added a new CMake build option to control use of native
gathers, as they can be slower than scalar loads on some common x86
microarchitectures. Build with `-DASTCENC_X86_GATHERS=OFF` to disable use
of native gathers in AVX2 builds.
* **Optimization:** Added new `gather()` abstraction for gathers using byte
indices, allowing implementations without gather hardware to skip the
byte-to-int index conversion.
* **Optimization:** Optimized `compute_lowest_and_highest_weight()` to
pre-compute min/max outside of the main loop.
* **Optimization:** Added improved intrinsics sequence for SSE and AVX2
integer `hmin()` and `hmax()`.
* **Optimization:** Added improved intrinsics sequence for `vint4(uint8_t*)`
on systems implementing Arm SVE.
<!-- ---------------------------------------------------------------------- -->
## 5.0.0
**Status:** November 2024
The 5.0.0 release is the first stable release in the 5.x series. The main new
feature is support for the Arm Scalable Vector Extensions (SVE) SIMD instruction
set.
* **General:**
* **Bug fix:** Fixed incorrect return type in "None" vector library
reference implementation.
* **Bug fix:** Fixed sincos table index under/overflow.
* **Feature:** Changed `ASTCENC_ISA_NATIVE` builds to use `-march=native` and
`-mcpu=native`.
* **Feature:** Added backend for Arm SVE fixed-width 256-bit builds. These
can only run on hardware implementing 256-bit SVE.
* **Feature:** Added backend for Arm SVE 128-bit builds. These are portable
builds and can run on hardware implementing any SVE vector length, but the
explicit SVE use is augmented NEON and will only use the bottom 128-bits of
each SVE vector.
* **Feature:** Optimized NEON mask `any()` and `all()` functions.
* **Feature:** Migrated build and test to GitHub Actions pipelines.
- - -
_Copyright © 2022-2025, Arm Limited and contributors. All rights reserved._
Binary file not shown.

After

Width:  |  Height:  |  Size: 111 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 148 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 141 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 149 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 134 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 116 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 110 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 125 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 127 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 120 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 124 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 126 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 116 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 108 KiB

+235
View File
@@ -0,0 +1,235 @@
# Effective ASTC Encoding
Most texture compression schemes encode a single color format at a single
bitrate, so there are relatively few configuration options available to content
creators beyond selecting which compressed format to use.
ASTC on the other hand is an extremely flexible container format which can
compress multiple color formats at multiple bit rates. Inevitably this
flexibility gives rise to questions about how to best use ASTC to encode a
specific color format, or what the equivalent settings are to get a close
match to another compression format.
This page aims to give some guidelines, but note that they are only guidelines
and are not exhaustive so please deviate from them as needed.
## Traditional format reference
The most commonly used non-ASTC compressed formats, their color format, and
their compressed bitrate are shown in the table below.
| Name | Color Format | Bits/Pixel | Notes |
| -------- | ------------ | ---------- | ---------------- |
| BC1 | RGB+A | 4 | RGB565 + 1-bit A |
| BC3 | RGB+A | 8 | BC1 RGB + BC4 A |
| BC3nm | G+R | 8 | BC1 G + BC4 R |
| BC4 | R | 4 | L8 |
| BC5      | R+G          | 8          | BC4 R + BC4 G    |
| BC6H | RGB (HDR) | 8 | |
| BC7 | RGB / RGBA | 8 | |
| EAC_R11 | R | 4 | R11 |
| EAC_RG11 | RG | 8 | RG11 |
| ETC1 | RGB | 4 | RGB565 |
| ETC2 | RGB+A | 4 | RGB565 + 1-bit A |
| ETC2+EAC | RGB+A | 8 | RGB565 + EAC A |
| PVRTC | RGBA | 2 or 4 | |
**Note:** BC2 (RGB+A) is not included in the table because it's rarely used in
practice due to poor quality alpha encoding; BC3 is nearly always used instead.
**Note:** Color representations shown with a `+` symbol indicate non-correlated
compression groups; e.g. an `RGB + A` format compresses `RGB` and `A`
independently and does not assume the two signals are correlated. This can be
a strength (it improves quality when compressing non-correlated signals), but
also a weakness (it reduces quality when compressing correlated signals).
# ASTC Format Mapping
The main question which arises with the mapping of another format on to ASTC
is how to handle cases where the input isn't a 4 component RGBA input. ASTC is
a container format which always decompresses in to a 4 component RGBA result.
However, the internal compressed representation is very flexible and can store
1-4 components as needed on a per-block basis.
To get the best quality for a given bitrate, or the lowest bitrate for a given
quality, it is important that as few components as possible are stored in the
internal representation to avoid wasting coding space.
Specific optimizations in the ASTC coding scheme exist for:
* Encoding the RGB components as a single luminance component, so only a single
value needs to be stored in the coding instead of three.
* Encoding the A component as a constant 1.0 value, so the coding doesn't
actually need to store a per-pixel alpha value at all.
... so mapping your inputs given to the compressor to hit these paths is
really important if you want to get the best output quality for your chosen
bitrate.
## Encoding 1-4 component data
The table below shows the recommended component usage for data with different
numbers of color components present in the data.
The coding swizzle should be applied when compressing an image. This can be
handled by the compressor when reading an uncompressed input image by
specifying the swizzle using the `-esw` command line option.
The sampling swizzle is what you should use in your shader programs to read
the data from the compressed texture, assuming no additional API-level
component swizzling is specified by the application.
| Input components | ASTC Endpoint | Coding Swizzle | Sampling Swizzle |
| -------------- | ------------- | -------------- | ------------------ |
| 1 | L + 1 | `rrr1` | `.g` <sup>1</sup> |
| 2 | L + A | `rrrg` | `.ga` <sup>1</sup> |
| 3 | RGB + 1 | `rgb1` | `.rgb` |
| 4 | RGB + A | `rgba` | `.rgba` |
**1:** Sampling from `g` is preferred to sampling from `r` because it allows a
single shader to be compatible with ASTC, BC1, or ETC formats. BC1 and ETC1
store color endpoints as RGB565 data, so the `g` component will have higher
precision. For ASTC it doesn't actually make any difference; the same single
component luminance will be returned for all three of the `.rgb` components.
## Equivalence with other formats
Based on these component encoding requirements we can now derive the ASTC
coding equivalents for most of the other texture compression formats in common
use today.
| Format | ASTC Coding Swizzle | ASTC Sampling Swizzle | Notes |
| -------- | ------------------- | --------------------- | ---------------- |
| BC1 | `rgba` <sup>1</sup> | `.rgba` | |
| BC3 | `rgba` | `.rgba` | |
| BC3nm | `gggr` | `.ag` | |
| BC4 | `rrr1` | `.r` | |
| BC5 | `rrrg` | `.ra` <sup>2</sup> | |
| BC6H | `rgb1` | `.rgb` <sup>3</sup> | HDR profile only |
| BC7 | `rgba` | `.rgba` | |
| EAC_R11 | `rrr1` | `.r` | |
| EAC_RG11 | `rrrg` | `.ra` <sup>2</sup> | |
| ETC1 | `rgb1` | `.rgb` | |
| ETC2 | `rgba` <sup>1</sup> | `.rgba` | |
| ETC2+EAC | `rgba` | `.rgba` | |
**1:** ASTC has no equivalent of the 1-bit punch-through alpha encoding
supported by BC1 or ETC2; if alpha is present it will be a full alpha
component.
**2:** ASTC relies on using the L+A color endpoint type for coding efficiency
for two component data. It therefore has no direct equivalent of a two-plane
format sampled through the `.rg` components such as BC5 or EAC_RG11. This can
be emulated by setting texture component swizzles in the runtime API - e.g. via
`glTexParameteri()` for OpenGL ES - although it has been noted that API
controlled swizzles are not available in WebGL.
**3:** ASTC can only store unsigned values, and has no equivalent of the BC6
signed endpoint mode.
# Other Considerations
This section outlines some of the other things to consider when encoding
textures using ASTC.
## Decode mode extensions
ASTC is specified to decompress into a 16-bit per component RGBA output by
default, with the exception of the sRGB format which uses an 8-bit value for the
RGB components.
A 16-bit per component output format often provides higher precision than
many use cases require, especially for LDR textures which originally came from
an 8-bit per component source image. Most implementations of ASTC support the
decode mode extensions, which allow an application to opt-in to a lower
precision decompressed format (RGBA8 for LDR, RGB9E5 for HDR). Using these
extensions can improve GPU texture cache efficiency, and even improve texture
filtering throughput, for use cases that do not need the higher precision.
The ASTC format uses different data rounding rules when the decode mode
extensions are used. To ensure that the compressor chooses the best encodings
for the RGBA8 rounding rules, you can specify `-decode_unorm8` when compressing
textures that will be decompressed into the RGBA8 intermediate. This gives a
small image quality boost.
**Note:** This mode is automatically enabled if you use the `astcenc`
decompressor to write an 8-bit per component output image.
## Encoding non-correlated components
Most other texture compression formats have a static component assignment in
terms of the expected data correlation. For example, ETC2+EAC assumes that RGB
are always correlated and that alpha is non-correlated. ASTC can automatically
encode data as either fully correlated across all 4 components, or with any one
component assigned to a separate non-correlated partition to the other three.
The non-correlated component can be changed on a block-by-block basis, so the
compressor can dynamically adjust the coding based on the data present in the
image. This means that there is no need for non-correlated data to be stored
in a specific component in the input image.
It is however worth noting that the alpha component is treated differently to
the RGB color components in some circumstances:
* When coding for sRGB the alpha component will always be stored in linear
space.
* When coding for HDR the alpha component can optionally be kept as LDR data.
## Encoding normal maps
The best way to store normal maps using ASTC is similar to the scheme used by
BC5; store the X and Y components of a unit-length normal. The Z component of
the normal can be reconstructed in shader code based on the knowledge that the
vector is unit length.
To encode this we need to store only two input components in the compressed
data, and therefore use the `rrrg` coding swizzle to align the data with the
ASTC luminance+alpha endpoint. We can sample this in shader code using the
`.ga` sampling swizzle, and reconstruct the Z value with:
vec3 nml;
nml.xy = texture(...).ga; // Load normals (range 0 to 1)
nml.xy = nml.xy * 2.0 - 1.0; // Unpack normals (range -1 to +1)
nml.z = sqrt(1.0 - dot(nml.xy, nml.xy)); // Compute Z, given unit length
The encoding swizzle and appropriate component weighting is enabled by using
the `-normal` command line option. If you wish to use a different pair of
components you can specify a custom swizzle after setting the `-normal`
parameter. For example, to match BC5n component ordering use
`-normal -esw gggr` for compression and `-normal -dsw arz1` for decompression.
## Encoding sRGB data
The ASTC LDR profile can compress sRGB encoded color, which is a more
efficient use of bits than storing linear encoded color because the gamma
corrected value distribution more closely matches human perception of
luminance.
For color data it is nearly always a perceptual quality win to use sRGB input
source textures that are then compressed using the ASTC sRGB compression mode
(compress using the `-cs` command line option rather than the `-cl` command
line option). Note that sRGB gamma correction is only applied to the RGB
components during decode; the alpha component is always treated as linear
encoded data.
*Important:* The uncompressed input texture provided on the command line must
be stored in the sRGB color space for `-cs` to function correctly.
## Encoding HDR data
HDR data can be encoded just like LDR data, but with some caveats around
handling the alpha component.
For many use cases the alpha component is an actual alpha opacity component and
is therefore used for storing an LDR value between 0 and 1. For these cases use
the `-ch` compressor option which will treat the RGB components as HDR, but the
A component as LDR.
For other use cases the alpha component is simply a fourth data component which
is also storing an HDR value. For these cases use the `-cH` compressor option
which will treat all components as HDR data.
- - -
_Copyright © 2019-2024, Arm Limited and contributors. All rights reserved._
+71
View File
@@ -0,0 +1,71 @@
# The .astc File Format
The default file format for compressed textures generated by `astcenc`, as well
as from many other ASTC compressors, is the `.astc` format. This is a very
simple format consisting of a small header followed immediately by the binary
payload for a single image surface.
Header
======
The header is a fixed 16 byte structure, defined as storing only bytes to avoid
any endianness issues or padding overhead.
```
struct astc_header
{
uint8_t magic[4];
uint8_t block_x;
uint8_t block_y;
uint8_t block_z;
uint8_t dim_x[3];
uint8_t dim_y[3];
uint8_t dim_z[3];
};
```
Magic number
------------
The 4 byte magic number at the start of the file acts as a format identifier.
```
magic[0] = 0x13;
magic[1] = 0xAB;
magic[2] = 0xA1;
magic[3] = 0x5C;
```
Block size
----------
The `block_*` fields store the ASTC block dimensions in texels. For 2D images
the Z dimension must be set to 1.
Image dimensions
----------------
The `dim_*` fields store the image dimensions in texels. For 2D images the
Z dimension must be set to 1.
Note that the image is not required to be an exact multiple of the compressed
block size; the compressed data may include padding that is discarded during
decompression.
Each dimension is a 24 bit unsigned value that is reconstructed from the stored
byte values as:
```
decoded_dim = dim[0] + (dim[1] << 8) + (dim[2] << 16);
```
Binary payload
==============
The binary payload is a byte stream that immediately follows the header. It
contains 16 bytes per compressed block. The number of compressed blocks is
determined from the header information.
- - -
_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
+488
View File
@@ -0,0 +1,488 @@
# ASTC Format Overview
Adaptive Scalable Texture Compression (ASTC) is an advanced lossy texture
compression technology developed by Arm and AMD. It has been adopted as an
official Khronos extension to the OpenGL and OpenGL ES APIs, and as a standard
optional feature for the Vulkan API.
ASTC offers a number of advantages over earlier texture compression formats:
* **Format flexibility:** ASTC supports compressing between 1 and 4 channels of
data, including support for one non-correlated channel such as RGB+A
(correlated RGB, non-correlated alpha).
* **Bit rate flexibility:** ASTC supports compressing images with a fine
grained choice of bit rates between 0.89 and 8 bits per texel (bpt). The bit
rate choice is independent of the color format choice.
* **Advanced format support:** ASTC supports compressing images in either low
dynamic range (LDR), LDR sRGB, or high dynamic range (HDR) color spaces, as
well as support for compressing 3D volumetric textures.
* **Improved image quality:** Despite the high degree of format flexibility,
ASTC manages to beat nearly all legacy texture compression formats -- such as
ETC2, PVRTC, and the BC formats -- on image quality at equivalent bit
rates.
This article explores the ASTC format, and how it manages to generate the
flexibility and quality improvements that it achieves.
Why ASTC?
=========
Before the creation of ASTC, the format and bit rate coverage of the available
formats was very sparse:
![Legacy texture compression formats and bit rates](./FormatOverviewImg/coverage-legacy.svg)
In reality the situation is even worse than this diagram shows, as many of
these formats are proprietary or simply not available on some operating
systems, so any single platform will have very limited compression choices.
For developers this situation makes developing content which is portable across
multiple platforms a tricky proposition. It's almost certain that differently
compressed assets will be needed for different platforms. Each asset pack would
likely then need to use different levels of compression, and may even have to
fall back to no compression for some assets on some platforms, which leaves
either some image quality or some memory bandwidth efficiency untapped.
It was clear a better way was needed, so the Khronos group asked members to
submit proposals for a new compression algorithm to be adopted in the same
manner that the earlier ETC algorithm was adopted for OpenGL ES. ASTC was the
result of this, and has been adopted as an official algorithm for OpenGL,
OpenGL ES, and Vulkan.
Format overview
===============
Given the fragmentation issues with the existing compression formats, it should
be no surprise that the high level design objectives for ASTC were to have
something which could be used across the whole range of art assets found in
modern content, and which allows artists to have more control over the quality
to bit rate tradeoff.
There are quite a few technical components which make up the ASTC format, so
before we dive into detail it will be useful to give an overview of how ASTC
works at a higher level.
Block compression
-----------------
Compression formats for real-time graphics need the ability to quickly and
efficiently make random samples into a texture. This places two technical
requirements on any compression format:
* It must be possible to compute the address of data in memory given only a
sample coordinate.
* It must be possible to decompress random samples without decompressing too
much surrounding data.
The standard solution for this used by all contemporary real-time formats,
including ASTC, is to divide the image into fixed-size blocks of texels, each
of which is compressed into a fixed number of output bits. This feature makes
it possible to access texels quickly, in any order, and with a well-bounded
decompression cost.
The 2D block footprints in ASTC range from 4x4 texels up to 12x12 texels, which
all compress into 128-bit output blocks. By dividing 128 bits by the number of
texels in the footprint, we derive the format bit rates which range from 8 bpt
(`128/(4*4)`) down to 0.89 bpt (`128/(12*12)`).
Color encoding
--------------
ASTC uses gradients to assign the color values of each texel. Each compressed
block stores the end-point colors for a gradient, and an interpolation weight
for each texel which defines the texel's location along that gradient. During
decompression the color value for each texel is generated by interpolating
between the two end-point colors, based on the per-texel weight.
![One partition gradient storage](./FormatOverviewImg/gradient-1p.svg)
In many cases a block will contain a complex distribution of colors, for
example a red ball sitting on green grass. In these scenarios a single color
gradient will not be able to accurately represent all of the texels' values. To
support this ASTC allows a block to define up to four distinct color gradients,
known as partitions, and can assign each texel to a single partition. For our
example we require two partitions, one for our ball texels and one for our
grass texels.
![Two partition gradient storage](./FormatOverviewImg/gradient-2p.svg)
Now that you know the high level operation of the format, we can dive into more
detail.
Integer encoding
================
Initially the idea of fractional bits per texel sounds implausible, or even
impossible, because we're so used to storing numbers as a whole number of bits.
However, it's not quite as strange as it sounds. ASTC uses an encoding
technique called Bounded Integer Sequence Encoding (BISE), which makes heavy
use of storing numbers with a fractional number of bits to pack information
more efficiently.
Storing alphabets
-----------------
Even though color and weight values per texel are notionally floating-point
values, we have far too few bits available to directly store the actual values,
so they must be quantized during compression to reduce the storage size. For
example, if we have a floating-point weight for each texel in the range 0.0 to
1.0 we could choose to quantize it to five values - 0.0, 0.25, 0.5, 0.75, and
1.0 - which we can then represent in storage using the integer values 0 to 4.
In the general case we need to be able to efficiently store characters of an
alphabet containing N symbols if we choose to quantize to N levels. An N symbol
alphabet contains `log2(N)` bits of information per character. If we have an
alphabet of 5 possible symbols then each character contains ~2.32 bits of
information, but simple binary storage would require us to round up to 3 bits.
This wastes 22.6% of our storage capacity. The chart below shows the percentage
of our bit-space wasted when using simple binary encoding to store an arbitrary
N symbol alphabet:
![Binary encoding efficiency](./FormatOverviewImg/binary.png)
... which shows for most alphabet sizes we waste a lot of our storage capacity
when using an integer number of bits per character. Efficiency is of critical
importance to a compression format, so this is something we needed to be able
to improve.
**Note:** We could have chosen to round-up the quantization level to the next
power of two, and at least use the bits we're spending. However, this forces
the encoder to spend bits which could be used elsewhere for a bigger benefit,
so it will reduce image quality and is a sub-optimal solution.
Quints
------
Instead of rounding up a 5 symbol alphabet - called a "quint" in BISE - to
three bits, we could choose to instead pack three quint characters together.
Three characters in a 5-symbol alphabet have 5<sup>3</sup> (125) combinations,
and contain 6.97 bits of information. We can store this in 7 bits and have a
storage waste of only 0.5%.
Trits
-----
We can similarly construct a 3-symbol alphabet - called a "trit" in BISE - and
pack trit characters in groups of five. Each character group has 3<sup>5</sup>
(243) combinations, and contains 7.92 bits of information. We can store this in
8 bits and have a storage waste of only 1%.
BISE
----
The BISE encoding used by ASTC allows storage of character sequences using
arbitrary alphabets of up to 256 symbols, encoding each alphabet size in the
most space-efficient choice of bits, trits, and quints.
* Alphabets with up to (2<sup>n</sup> - 1) symbols can be encoded using n bits
per character.
* Alphabets with up to (3 * 2<sup>n</sup> - 1) symbols can be encoded using n bits
(m) and a trit (t) per character, and reconstructed using the equation
(t * 2<sup>n</sup> + m).
* Alphabets with up to (5 * 2<sup>n</sup> - 1) symbols can be encoded using n
bits (m) and a quint (q) per character, and reconstructed using the equation
(q * 2<sup>n</sup> + m).
When the number of characters in a sequence is not a multiple of three or five
we need to avoid wasting storage at the end of the sequence, so we add another
constraint on the encoding. If the last few values in the sequence to encode
are zero, the last few bits in the encoded bit string must also be zero.
Ideally, the number of non-zero bits should be easily calculated and not depend
on the magnitudes of the previous encoded values. This is a little tricky to
arrange during compression, but it is possible. This means that we do not need
to store any padding after the end of the bit sequence, as we can safely assume
that they are zero bits.
With this constraint in place - and by some smart packing of the bits, trits,
and quints - BISE encodes a string of S characters in an N symbol alphabet using a
fixed number of bits:
* S values up to (2<sup>n</sup> - 1) use (nS) bits.
* S values up to (3 * 2<sup>n</sup> - 1) use (nS + ceil(8S / 5)) bits.
* S values up to (5 * 2<sup>n</sup> - 1) use (nS + ceil(7S / 3)) bits.
... and the compressor will choose the one of these which produces the smallest
storage for the alphabet size being stored; some will use binary, some will use
bits and a trit, and some will use bits and a quint. If we compare the storage
efficiency of BISE against simple binary for the range of possible alphabet
sizes we might want to encode we can see that it is much more efficient.
![BISE encoding efficiency](./FormatOverviewImg/bise.png)
Block sizes
===========
ASTC always compresses blocks of texels into 128-bit outputs, but allows the
developer to select from a range of block sizes to enable a fine-grained
tradeoff between image quality and size.
| Block footprint | Bits/texel | | Block footprint | Bits/texel |
| --------------- | ---------- | --- | --------------- | ---------- |
| 4x4 | 8.00 | | 10x5 | 2.56 |
| 5x4 | 6.40 | | 10x6 | 2.13 |
| 5x5 | 5.12 | | 8x8 | 2.00 |
| 6x5 | 4.27 | | 10x8 | 1.60 |
| 6x6 | 3.56 | | 10x10 | 1.28 |
| 8x5 | 3.20 | | 12x10 | 1.07 |
| 8x6 | 2.67 | | 12x12 | 0.89 |
Color endpoints
===============
The color data for a block is encoded as a gradient between two color
endpoints, with each texel selecting a position along that gradient which is
then interpolated during decompression. ASTC supports 16 color endpoint
encoding schemes, known as "endpoint modes". Options for endpoint modes
include:
* Varying the number of color channels: e.g. luminance, luminance + alpha, rgb,
and rgba.
* Varying the encoding method: e.g. direct, base+offset, base+scale,
quantization level.
* Varying the data range: e.g. low dynamic range, or high dynamic range
The endpoint modes, and the endpoint color BISE quantization level, can be
chosen on a per-block basis.
Color partitions
================
Colors within a block are often complex, and cannot be accurately captured by a
single color gradient, as discussed earlier with our example of a red ball
lying on green grass. ASTC allows up to four color gradients - known as
"partitions" - to be assigned to a single block. Each texel is then assigned to
a single partition for the purposes of decompression.
Rather than directly storing the partition assignment for each texel, which
would need a lot of decompressor hardware to store it for all block sizes, we
generate it procedurally. Each block only needs to store the partition index -
which is the seed for the procedural generator - and the per texel assignment
can then be generated on-the-fly during decompression. The image below shows
the generated texel assignments for two (top), three (middle), and four
(bottom) partitions for the 8x8 block size.
![ASTC partition table](./FormatOverviewImg/hash.png)
The number of partitions and the partition index can be chosen on a per-block
basis, and a different color endpoint mode can be chosen per partition.
**Note:** ASTC uses a 10-bit seed to drive the partition assignments. The hash
used will introduce horizontal bias in a third of the partitions, vertical bias
in a third, and no bias in the rest. As they are procedurally generated not all
of the partitions are useful, in particular with the smaller block sizes.
* Many partitions are duplicates.
* Many partitions are degenerate (an N partition hash results in at least one
partition assignment that contains no texels).
Texel weights
=============
Each texel requires a weight, which defines the relative contribution of each
color endpoint when interpolating the color gradient.
For smaller block sizes we can choose to store the weight directly, with one
weight per texel, but for the larger block sizes we simply do not have enough
bits of storage to do this. To work around this ASTC allows the weight grid to
be stored at a lower resolution than the texel grid. The per-texel weights are
interpolated from the stored weight grid during decompression using a bilinear
interpolation.
The number of texel weights, and the weight value BISE quantization level, can
be chosen on a per-block basis.
Dual-plane weights
------------------
Using a single weight for all color channels works well when there is good
correlation across the channels, but this is not always the case. Common
examples where we would expect to get low correlation at least some of the time
are textures storing RGBA data - alpha masks are not usually closely
correlated with the color value - or normal data - the X and Y normal values
often change independently.
ASTC allows a dual-plane mode, which uses two separate weight grids for each
texel. A single channel can be assigned to a second plane of weights, while
the other three use the first plane of weights.
The use of dual-plane mode can be chosen on a per-block basis, but its use
prevents the use of four color partitions as we do not have enough bits to
concurrently store both an extra plane of weights and an extra set of color
endpoints.
End results
===========
So, if we pull all of this together what do we end up with?
Adaptive
--------
The first word in the name of ASTC is "adaptive", and it should now hopefully
be clear why. Each block always compresses into 128 bits of storage, but the
developer can choose from a wide range of texel block sizes and the compressor
gets a huge amount of latitude to determine how those 128 bits are used.
The compressor can trade off the number of bits assigned to colors (number of
partitions, endpoint mode, and stored quantization level) and weights (number
of weights per block, use of dual-plane, and stored quantization level) on a
per-block basis to get the best image quality possible.
![ASTC compressed parrot at various bit rates](./FormatOverviewImg/astc-quality.png)
Format support
--------------
The compression scheme used by ASTC effectively compresses arbitrary sequences
of floating point numbers, with a flexible number of channels, across any of
the supported block sizes. There is no real notion of "color format" in the
format itself at all, beyond the color endpoint mode selection, although a
sensible compressor will want to use some format-specific heuristics to drive
an efficient state-space search.
The orthogonal encoding design allows ASTC to provide almost complete coverage
of our desirable format matrix from earlier, across a wide range of bit rates:
![ASTC 2D formats and bit rates](./FormatOverviewImg/coverage-astc.svg)
The only significant omission is the absence of a dedicated two channel
encoding for HDR textures. We simply ran out of entries in the space we had for
encoding color endpoint modes, and this one didn't make the cut.
The flexibility allowed by ASTC ticks the requirement that almost any asset can
be compressed to some degree, at an appropriate bitrate for its quality needs.
This is a powerful enabler for a compression format, because it puts control in
the hands of content creators and not arbitrary format restrictions.
Image quality
-------------
The normal expectation would be that this level of format flexibility would
come at a cost of image quality; it has to cost something, right? Luckily this
isn't true. The high packing efficiency allowed by BISE encoding, and the
ability to dynamically choose where to spend encoding space on a per-block
basis, means that an ASTC compressor is not forced to spend bits on things that
don't help image quality.
This gives some significant improvements in image quality compared to the older
texture formats, even though ASTC also handles a much wider range of options.
* ASTC at 2 bpt outperforms PVRTC at 2 bpt by ~2.0dB.
* ASTC at 3.56 bpt outperforms PVRTC and BC1 at 4 bpt by ~1.5dB, and ETC2 by
~0.7dB, despite a 10% bit rate disadvantage.
* ASTC at 8 bpt for LDR formats is comparable in quality to BC7 at 8 bpt.
* ASTC at 8 bpt for HDR formats is comparable in quality to BC6H at 8 bpt.
Differences as small as 0.25dB are visible to the human eye, and remember that
dB uses a logarithmic scale, so these are significant image quality
improvements.
3D compression
--------------
One of the nice bonus features of ASTC is that the techniques which underpin
the format generalize to compressing volumetric texture data without needing
very much additional decompression hardware.
ASTC is therefore also able to optionally support compression of 3D textures,
which is a unique feature not found in any earlier format, at the following
bit rates:
| Block footprint | Bits/texel | | Block footprint | Bits/texel |
| --------------- | ---------- | --- | --------------- | ---------- |
| 3x3x3 | 4.74 | | 5x5x4 | 1.28 |
| 4x3x3 | 3.56 | | 5x5x5 | 1.02 |
| 4x4x3 | 2.67 | | 6x5x5 | 0.85 |
| 4x4x4 | 2.00 | | 6x6x5 | 0.71 |
| 5x4x4 | 1.60 | | 6x6x6 | 0.59 |
Availability
============
The ASTC functionality is specified as a set of feature profiles, allowing
GPU hardware manufacturers to select which parts of the standard they
implement. There are four commonly seen profiles:
* "LDR":
* 2D blocks.
* LDR and sRGB color space.
* [KHR_texture_compression_astc_ldr][astc_ldr]: KHR OpenGL ES extension.
* "LDR + Sliced 3D":
* 2D blocks and sliced 3D blocks.
* LDR and sRGB color space.
* [KHR_texture_compression_astc_sliced_3d][astc_3d]: KHR OpenGL ES extension.
* "HDR":
* 2D and sliced 3D blocks.
* LDR, sRGB, and HDR color spaces.
* [KHR_texture_compression_astc_hdr][astc_ldr]: KHR OpenGL ES extension.
* "Full":
* 2D, sliced 3D, and volumetric 3D blocks.
* LDR, sRGB, and HDR color spaces.
* [OES_texture_compression_astc][astc_full]: OES OpenGL ES extension.
The LDR profile is mandatory in OpenGL ES 3.2 and a standardized optional
feature for Vulkan, and therefore widely supported on contemporary mobile
devices. The 2D HDR profile is not mandatory, but is widely supported.
3D texturing
------------
The APIs expose 3D textures in two flavors.
The sliced 3D texture support builds a 3D texture from an array of 2D image
slices that have each been individually compressed using 2D ASTC compression.
This is required for the HDR profile, so is also widely supported.
The volumetric 3D texture support uses the native 3D block sizes provided by
ASTC to implement true volumetric compression. This enables a wider choice of
low bitrate options than the 2D blocks, which is particularly important for 3D
textures of any non-trivial size. Volumetric formats are not widely supported,
but are supported on all of the Arm Mali GPUs that support ASTC.
ASTC decode mode
----------------
ASTC is specified to decompress texels into fp16 intermediate values, except
for sRGB which always decompresses into 8-bit UNORM intermediates. For many use
cases this gives more dynamic range and precision than required. This can cause
a reduction in both texture cache efficiency and texture filtering performance
due to the larger decompressed data size.
A pair of extensions exist, and are widely supported on recent mobile GPUs,
which allow applications to reduce the intermediate precision to either UNORM8
(recommended for LDR textures) or RGB9e5 (recommended for HDR textures).
* [EXT_texture_compression_astc_decode_mode][astc_decode]: Allow UNORM8
intermediates
* [EXT_texture_compression_astc_decode_mode_rgb9e5][astc_decode]: Allow RGB9e5
intermediates
[astc_ldr]: https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_texture_compression_astc_hdr.txt
[astc_3d]: https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_texture_compression_astc_sliced_3d.txt
[astc_full]: https://www.khronos.org/registry/OpenGL/extensions/OES/OES_texture_compression_astc.txt
[astc_decode]: https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_texture_compression_astc_decode_mode.txt
- - -
_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
Binary file not shown.

After

Width:  |  Height:  |  Size: 115 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 122 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 76 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 55 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 79 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

+79
View File
@@ -0,0 +1,79 @@
# Terminology for the ASTC Encoder
Like most software, the `astcenc` code base has a set of naming conventions
for variables which are used to ensure both accuracy and reasonable brevity.
:construction: These conventions are being used for new patches, so new code
will conform to this, but older code is still being cleaned up to follow
these conventions.
## Counts
For counts of things prefer `<x>_count` rather than `<x>s`. For example:
* `plane_count`
* `weight_count`
* `texel_count`
Where possible aim for descriptive loop variables, as these are more literate
than simple `i` or `j` variables. For example:
* `plane_index`
* `weight_index`
* `texel_index`
## Ideal, Unpacked Quantized, vs Packed Quantized
Variables that are quantized, such as endpoint colors and weights, have
multiple states depending on how they are being used.
**Ideal values** represent arbitrary numeric values that can take any value.
These are often used during compression to work out the best value before
any quantization is applied. For example, integer weights in the 0-64 range can
take any of the 65 values available.
**Quant uvalues** represent the unpacked numeric value after any quantization
rounding has been applied. These are often used during compression to work out
the error for the quantized value compared to the ideal value. For example,
`QUANT_3` weights in the 0-64 range can only take one of `[0, 32, 64]`.
**Quant pvalues** represent the packed numeric value in the quantized alphabet.
This is what ends up encoded in the ASTC data, although note that the encoded
ordering is scrambled to simplify hardware. For example, `QUANT_3` weights
originally in the 0-64 range can only take one of `[0, 1, 2]`.
For example:
* `weights_ideal_value`
* `weights_quant_uvalue`
* `weights_quant_pvalue`
## Full vs Decimated interpolation weights
Weight grids have multiple states depending on how they are being used.
**full_weights** represent per texel weight grids, storing one weight per texel.
**decimated_weights** represent reduced weight grids, which can store fewer
weights and which are bilinear interpolated to generate the full weight grid.
Full weights have no variable prefix, but decimated weights are stored with
a `dec_` prefix.
* `dec_weights_ideal_value`
* `dec_weights_quant_uvalue`
* `dec_weights_quant_pvalue`
## Weight vs Significance
The original encoder used "weight" for multiple purposes - texel significance
(weight the error), color channel significance (weight the error), and endpoint
interpolation weights. This gets very confusing in functions using all three!
We are slowly refactoring the code to only use "weight" to mean the endpoint
interpolation weights. The error weighting factors used for other purposes are
being updated to use the term "significance".
- - -
_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
+120
View File
@@ -0,0 +1,120 @@
# Testing astcenc
The repository contains a small suite of tests which can be used to sanity
check source code changes to the compressor. It must be noted that this test
suite is relatively limited in scope and does not cover every feature or
bitrate of the standard.
# Required software
Running the tests requires Python 3.7 to be installed on the host machine, and
an `astcenc-avx2` release build to have been previously compiled and installed
into a directory called `astcenc` in the root of the git checkout. This
can be achieved by configuring the CMake build using the install prefix
`-DCMAKE_INSTALL_PREFIX=../` and then running a build with the `install` build
target.
# Running C++ unit tests
We support a small (but growing) number of C++ unit tests, which are written
using the `googletest` framework and integrated in the CMake "CTest" test
framework.
To build unit tests pull the `googletest` git submodule and add
`-DASTCENC_UNITTEST=ON` to the CMake command line when configuring.
To run unit tests use the CMake `ctest` utility from your build directory after
you have built the tests.
```shell
cd build
ctest --verbose
```
# Running command line tests
To run the command line tests, which aim to get coverage of the command line
options and core codec stability without testing the compression quality
itself, run the command line:
python3 -m unittest discover -s Test -p astc_test*.py -v
# Running image tests
To run the image test suite run the following command from the root directory
of the repository:
python3 ./Test/astc_test_image.py
This will run through a series of image compression tests, comparing the image
PSNR against a set of reference results from the last stable baseline. The test
will fail if any reduction in PSNR above a set threshold is detected. Note that
performance information is reported, but regressions will not flag a failure.
For debug purposes, all decompressed test output images and result CSV files
are stored in the `TestOutput` directory, using the same test set structure as
the `Test/Images` folder.
## Test selection
The runner supports a number of options to filter down what is run, enabling
developers to focus local testing on the parts of the code they are working on.
* `--encoder` selects which encoder to run. By default the `avx2` encoder is
selected. Note that some out-of-tree reference encoders (older encoders, and
some third-party encoders) are supported for comparison purposes. These will
not work without the binaries being manually provided; they are not
distributed here.
* `--test-set` selects which image set to run. By default the `Small` image
test set is selected, which aims to provide basic coverage of many different
color formats and color profiles.
* `--block-size` selects which block size to run. By default a range of
block sizes (2D and 3D) are used.
* `--color-profile` selects which color profiles from the standard should be
used (LDR, LDR sRGB, or HDR) to select images. By default all are selected.
* `--color-format` selects which color formats should be used (L, XY, RGB,
RGBA) to select images. By default all are selected.
## Performance tests
To provide less noisy performance results the test suite supports compressing
each image multiple times and returning the best measured performance. To
enable this mode use the following options:
* `--repeats <M>` : Run M test compression passes which are timed.
**Note:** The reference CSV contains performance results measured on an Intel
Core i5 9600K running at 4.3GHz, running each test 5 times.
## Updating reference data
The reference PSNR and performance scores are stored in CSVs committed to the
repository. This data is created by running the tests using the last stable
release on a standard test machine we use for performance testing builds.
It can be useful for developers to rebuild the reference results for their
local machine, in particular for measuring performance improvements. To build
new reference CSVs, download the current reference `astcenc` binary (1.7) from
GitHub for your host OS and place it in to the `./Binaries/1.7/` directory.
Once this is done, run the command:
python3 ./Test/astc_test_image.py --encoder 1.7 --test-set all --repeats 5
... to regenerate the reference CSV files.
**WARNING:** This can take some hours to complete, and it is best done when the
test suite gets exclusive use of the machine to avoid other processing slowing
down the compression and disturbing the performance data. It is recommended to
shutdown or disable any background applications that are running.
## Valgrind memcheck
It is always worth running the Valgrind memcheck tool to validate that we have
not introduced any obvious memory errors. Build a release build with symbol
information with `-DCMAKE_BUILD_TYPE=RelWithDebInfo` and then run:
valgrind --tool=memcheck --track-origins=yes <command>
- - -
_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
+250
View File
@@ -0,0 +1,250 @@
# About
The Arm® Adaptive Scalable Texture Compression (ASTC) Encoder, `astcenc`, is
a command-line tool for compressing and decompressing images using the ASTC
texture compression standard.
## The ASTC format
The ASTC compressed data format, developed by Arm® and AMD, has been adopted as
an official extension to the OpenGL®, OpenGL ES, and Vulkan® graphics APIs. It
provides a major step forward in terms of both the image quality at a given
bitrate, and the format and bitrate flexibility available to content creators.
This allows more assets to use compression, often at a reduced bitrate compared
to other formats, reducing memory storage and bandwidth requirements.
Read the [ASTC Format Overview][1] for a quick introduction to the format, or
read the full [Khronos Data Format Specification][2] for all the details.
## License
This project is licensed under the Apache 2.0 license. By downloading any
component from this repository you acknowledge that you accept terms specified
in the [LICENSE.txt](LICENSE.txt) file.
# Encoder feature support
The encoder supports compression of low dynamic range (BMP, JPEG, PNG, TGA) and
high dynamic range (EXR, HDR) images, as well as a subset of image data wrapped
in the DDS and KTX container formats, into ASTC or KTX format output images.
The decoder supports decompression of ASTC or KTX format input images into low
dynamic range (BMP, PNG, TGA), high dynamic range (EXR, HDR), or DDS and KTX
wrapped output images.
The encoder allows control over the compression time/quality tradeoff with
`exhaustive`, `verythorough`, `thorough`, `medium`, `fast`, and `fastest`
encoding quality presets.
The encoder allows compression time and quality analysis by reporting the
compression time, and the Peak Signal-to-Noise Ratio (PSNR) between the input
image and the compressed output.
## ASTC format support
The `astcenc` compressor supports generation of images for all three profiles
allowed by the ASTC specification:
* 2D Low Dynamic Range (LDR profile)
* 2D LDR and High Dynamic Range (HDR profile)
* 2D and 3D, LDR and HDR (Full profile)
It also supports all of the ASTC block sizes and compression modes, allowing
content creators to use the full spectrum of quality-to-bitrate options ranging
from 0.89 bits/pixel up to 8 bits/pixel.
# Prebuilt binaries
Release build binaries for the `astcenc` stable releases are provided in the
[GitHub Releases page][3].
* Change log: [5.x series](./Docs/ChangeLog-5x.md)
Binaries are provided for 64-bit builds on Windows, macOS, and Linux.
## Windows and Linux
For Windows and Linux, builds of `astcenc` are provided as multiple
binaries, each tuned for a specific SIMD instruction set.
For x86-64 we provide, in order of increasing performance:
* `astcenc-sse2` - uses SSE2
* `astcenc-sse4.1` - uses SSE4.1 and POPCNT
* `astcenc-avx2` - uses AVX2, SSE4.2, POPCNT, and F16C
The x86-64 SSE2 build will work on all x86-64 machines, but it is the slowest
of the three. The other two require extended CPU instruction set support which
is not universally available, but each step gains ~15% more performance.
For Arm we provide, in order of increasing performance:
* `astcenc-sve_256` - uses 256-bit SVE
* `astcenc-sve_128` - uses 128-bit SVE
* `astcenc-neon` - uses NEON
Note: The Arm Scalable Vector Extensions (SVE) allow CPUs to have a variable
vector length. The astcenc implementation is not written in a length-agnostic
style and requires the binary to match the vector length on the host CPU.
## macOS
For macOS devices we provide a single universal binary `astcenc`, which allows
the OS to automatically use the correct binary variant for the current host
machine. Support is provided for three architecture slices:
* `x86_64` - uses the `astcenc-sse4.1` build defined above.
* `x86_64h` - uses the `astcenc-avx2` build defined above.
* `arm64` - uses the `astcenc-neon` build defined above.
## Repository branches
The `main` branch is an active development branch for the compressor. It aims
to be a stable branch for the latest major release series, but as it is used
for ongoing development expect it to have some volatility. We recommend using
the latest stable release tag for production development.
The `4.x` branch is a stable branch for the older 4.x release series. It is no
longer under active development, but is a supported branch that continues to
get back-ported bug fixes.
The `1.x`, `2.x`, and `3.x` branches are stable branches for older releases.
They are no longer under active development or getting bug fixes.
Any other branches you might find are development branches for new features or
optimizations, so might be interesting to play with but should be considered
transient and unstable.
# Getting started
Open a terminal, change to the appropriate directory for your system, and run
the astcenc encoder program, like this on Linux or macOS:
./astcenc
... or like this on Windows:
astcenc
Invoking `astcenc -help` gives an extensive help message, including usage
instructions and details of all available command line options. A summary of
the main encoder options is shown below.
## Compressing an image
Compress an image using the `-cl` \ `-cs` \ `-ch` \ `-cH` modes. For example:
astcenc -cl example.png example.astc 6x6 -medium
This compresses `example.png` using the LDR color profile and a 6x6 block
footprint (3.56 bits/pixel). The `-medium` quality preset gives a reasonable
image quality for a relatively fast compression speed, so is a good starting
point for compression. The output is stored to a linear color space compressed
image, `example.astc`.
The modes available are:
* `-cl` : use the linear LDR color profile.
* `-cs` : use the sRGB LDR color profile.
* `-ch` : use the HDR color profile, tuned for HDR RGB and LDR A.
* `-cH` : use the HDR color profile, tuned for HDR RGBA.
If you intend to use the resulting image with the decode mode extensions to
limit the decompressed precision to UNORM8, it is recommended that you also
specify the `-decode_unorm8` flag. This will ensure that the compressor uses
the correct rounding rules when choosing encodings.
## Decompressing an image
Decompress an image using the `-dl` \ `-ds` \ `-dh` \ `-dH` modes. For example:
astcenc -dh example.astc example.tga
This decompresses `example.astc` using the full HDR feature profile, storing
the decompressed output to `example.tga`.
The modes available mirror the options used for compression, but use a `d`
prefix. Note that for decompression there is no difference between the two HDR
modes, they are both provided simply to maintain symmetry across operations.
## Measuring image quality
Review the compression quality using the `-tl` \ `-ts` \ `-th` \ `-tH` modes.
For example:
astcenc -tl example.png example.tga 5x5 -thorough
This is equivalent to using the LDR color profile and a 5x5 block size
to compress the image, using the `-thorough` quality preset, and then
immediately decompressing the image and saving the result. This can be used
to enable a visual inspection of the compressed image quality. In addition
this mode also prints out some image quality metrics to the console.
The modes available mirror the options used for compression, but use a `t`
prefix.
## Experimenting
Efficient real-time graphics benefits from minimizing compressed texture size,
as it reduces memory footprint, reduces memory bandwidth, saves energy, and can
improve texture cache efficiency. However, like any lossy compression format
there will come a point where the compressed image quality is unacceptable
because there are simply not enough bits to represent the output with the
precision needed. We recommend experimenting with the block footprint to find
the optimum balance between size and quality, as the finely adjustable
compression ratio is one of the major strengths of the ASTC format.
The compression speed can be controlled from `-fastest`, through `-fast`,
`-medium` and `-thorough`, up to `-exhaustive`. In general, the more time the
encoder has to spend looking for good encodings the better the results, but it
does result in increasingly small improvements for the amount of time required.
There are many other command line options for tuning the encoder parameters
which can be used to fine tune the compression algorithm. See the command line
help message for more details.
# Documentation
The [ASTC Format Overview](./Docs/FormatOverview.md) page provides a high level
introduction to the ASTC texture format, how it encodes data, and why it is
both flexible and efficient.
The [Effective ASTC Encoding](./Docs/Encoding.md) page looks at some of the
guidelines that should be followed when compressing data using `astcenc`.
It covers:
* How to efficiently encode data with fewer than 4 channels.
* How to efficiently encode normal maps, sRGB data, and HDR data.
* Coding equivalents to other compression formats.
The [ASTC Developer Guide][5] document (external link) provides a more detailed
guide for developers using the `astcenc` compressor.
The [.astc File Format](./Docs/FileFormat.md) page provides a light-weight
specification for the `.astc` file format and how to read or write it.
The [Building ASTC Encoder](./Docs/Building.md) page provides instructions on
how to build `astcenc` from the sources in this repository.
The [Testing ASTC Encoder](./Docs/Testing.md) page provides instructions on
how to test any modifications to the source code in this repository.
# Support
If you have issues with the `astcenc` encoder, or questions about the ASTC
texture format itself, please raise them in the GitHub issue tracker.
If you have any questions about Arm GPUs, application development for Arm GPUs,
or general mobile graphics development or technology please submit them on the
[Arm Community graphics forums][4].
- - -
_Copyright © 2013-2025, Arm Limited and contributors. All rights reserved._
[1]: ./Docs/FormatOverview.md
[2]: https://www.khronos.org/registry/DataFormat/specs/1.4/dataformat.1.4.html#ASTC
[3]: https://github.com/ARM-software/astc-encoder/releases
[4]: https://community.arm.com/support-forums/f/graphics-gaming-and-vr-forum/
[5]: https://developer.arm.com/documentation/102162/latest/?lang=en
+126
View File
@@ -0,0 +1,126 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2025 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------
# Overwrite the LTO flags to force fat LTO; worth 3-4% performance
# See https://gitlab.kitware.com/cmake/cmake/-/issues/16808
#
# The override is only applied when the command line tool is being built
# (ASTCENC_CLI), and the flag chosen depends on the C++ compiler in use.
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND ${ASTCENC_CLI})
set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto")
endif()
# GCC builds use "-flto=auto" rather than the plain "-flto" used for Clang
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND ${ASTCENC_CLI})
set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto=auto")
endif()
# Pick the codec suffix used to build target names of the form
# astc${ASTCENC_CODEC}: "dec" for decompressor-only builds, "enc" otherwise
if(${ASTCENC_DECOMPRESSOR})
set(ASTCENC_CODEC dec)
else()
set(ASTCENC_CODEC enc)
endif()
# Two parallel lists: the artifact name for each ISA variant, and the value of
# the option that enables it. Entries are paired up by index in the loop below,
# so the two lists must be kept in the same order.
set(ASTCENC_ARTIFACTS native none sve_256 sve_128 neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_SVE_128} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
# foreach(RANGE N) iterates from 0 to N inclusive, so convert the list length
# into the index of the last element
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
# Only generate build rules for the ISA variants that are enabled
if(${ASTCENC_CONFIG})
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
# Select the macOS architecture slice that corresponds to this ISA
if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
# Not supported on macOS
elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
# Not supported on macOS
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
set(CMAKE_OSX_ARCHITECTURES arm64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
set(CMAKE_OSX_ARCHITECTURES x86_64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
set(CMAKE_OSX_ARCHITECTURES x86_64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
set(CMAKE_OSX_ARCHITECTURES x86_64h)
elseif(${ASTCENC_ISA_SIMD} MATCHES "none")
# Using "none" uses implicit architecture
elseif(${ASTCENC_ISA_SIMD} MATCHES "native")
# Using "native" uses implicit architecture
else()
message(FATAL_ERROR "'${ASTCENC_ISA_SIMD}' is unknown ISA")
endif()
# Included once per enabled ISA, with ASTCENC_ISA_SIMD naming the variant
include(cmake_core.cmake)
endif()
endforeach()
# Universal build of the command line tool: use lipo to merge the sse4.1
# (x86_64), avx2 (x86_64h), and neon (arm64) executables into a single binary
# written alongside the sse4.1 executable, and install the merged binary
if(${ASTCENC_CLI} AND ${ASTCENC_UNIVERSAL_BUILD})
add_custom_target(
astc${ASTCENC_CODEC}
ALL
COMMAND
lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC} -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon>
VERBATIM)
# The three per-architecture executables must be built before they are merged
add_dependencies(
astc${ASTCENC_CODEC}
astc${ASTCENC_CODEC}-sse4.1
astc${ASTCENC_CODEC}-avx2
astc${ASTCENC_CODEC}-neon)
install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC}
DESTINATION bin)
endif()
# Universal build of the shared library: use lipo to merge the sse4.1
# (x86_64), avx2 (x86_64h), and neon (arm64) libraries into a single dylib
# written alongside the sse4.1 library, and install the merged library
if(${ASTCENC_SHAREDLIB} AND ${ASTCENC_UNIVERSAL_BUILD})
add_custom_target(
astc${ASTCENC_CODEC}-shared
ALL
COMMAND
lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1-shared> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2-shared> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon-shared>
VERBATIM)
# The three per-architecture libraries must be built before they are merged
add_dependencies(
astc${ASTCENC_CODEC}-shared
astc${ASTCENC_CODEC}-sse4.1-shared
astc${ASTCENC_CODEC}-avx2-shared
astc${ASTCENC_CODEC}-neon-shared)
install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib
DESTINATION lib)
endif()
# - - - - - - - - - - - - - - - - - -
# Unit testing
if(${ASTCENC_UNITTEST})
# GoogleTest is only a build-time dependency, so do not install it
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
# Request both macOS architecture slices for the targets added below
set(CMAKE_OSX_ARCHITECTURES x86_64;arm64)
add_subdirectory(GoogleTest)
# Work around a GoogleTest CRT selection issue by forcing the static runtime
# See https://github.com/google/googletest/issues/4067
set_property(
TARGET gtest
PROPERTY
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
set_property(
TARGET gtest_main
PROPERTY
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
enable_testing()
add_subdirectory(UnitTest)
endif()
@@ -0,0 +1,106 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Fuzz target for physical_to_symbolic().
*
* This function is the first entrypoint for decompressing a 16 byte block of
* input ASTC data from disk. The 16 bytes can contain arbitrary data; they
* are read from an external source, but the block size used must be a valid
* ASTC block footprint.
*/
#include "astcenc_internal.h"
#include <fuzzer/FuzzedDataProvider.h>
#include <array>
#include <vector>
/** @brief The dimensions of one block footprint to fuzz. */
struct BlockSizes
{
// Block size along each axis; passed to init_block_size_descriptor()
int x;
int y;
int z;
};
/** @brief The block footprints exercised by the fuzzer. */
std::array<BlockSizes, 3> testSz {{
{ 4, 4, 1}, // Highest bitrate
{12, 12, 1}, // Largest 2D block
{6, 6, 6} // Largest 3D block
}};
/** @brief One block size descriptor per testSz entry, matched by index. */
std::array<block_size_descriptor, 3> testBSD;
/**
 * @brief Utility function to create all of the block size descriptors needed.
 *
 * This is triggered once via a static initializer.
 *
 * Triggering once is important so that we only create a single BSD per block
 * size we need, rather than one per fuzzer iteration (it's expensive). This
 * improves fuzzer throughput by ~ 1000x!
 *
 * Triggering via a static initializer, rather than a lazy init in the fuzzer
 * function, is important because it means that the BSD is allocated before
 * fuzzing starts. This means that leaksanitizer will ignore the fact that we
 * "leak" the dynamic allocations inside the BSD (we never call term()).
 *
 * @return Always @c true; the value exists only so that the call can be used
 *         as the initializer of a static variable.
 */
bool bsd_initializer()
{
	// Index with size_t to match std::array::size() and avoid a
	// signed/unsigned comparison
	for (size_t i = 0; i < testSz.size(); i++)
	{
		// NOTE(review): the three literal arguments are configuration values
		// whose meaning is declared with init_block_size_descriptor() in
		// astcenc_internal.h - confirm there before changing them
		init_block_size_descriptor(
		    testSz[i].x,
		    testSz[i].y,
		    testSz[i].z,
		    false,
		    4,
		    1.0f,
		    testBSD[i]);
	}

	return true;
}
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
{
// Preinitialize the block size descriptors we need
static bool init = bsd_initializer();
// Must have 4 (select block size) and 16 (payload) bytes
if (size < 4 + 16)
{
return 0;
}
FuzzedDataProvider stream(data, size);
// Select a block size to test
int i = stream.ConsumeIntegralInRange<int>(0, testSz.size() - 1);
// Populate the physical block
uint8_t pcb[16];
std::vector<uint8_t> buffer = stream.ConsumeBytes<uint8_t>(16);
std::memcpy(pcb, buffer.data(), 16);
// Call the function under test
symbolic_compressed_block scb;
physical_to_symbolic(testBSD[i], pcb, scb);
return 0;
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,51 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2024 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------
# Two parallel lists: the artifact name for each ISA variant, and the value of
# the option that enables it. Entries are paired up by index in the loop below,
# so the two lists must be kept in the same order.
set(ASTCENC_ARTIFACTS native none sve_256 sve_128 neon avx2 sse4.1 sse2)
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_SVE_128} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
# foreach(RANGE N) iterates from 0 to N inclusive, so convert the list length
# into the index of the last element
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
# Only generate build rules for the ISA variants that are enabled
if(${ASTCENC_CONFIG})
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
# Select the macOS architecture slice that corresponds to this ISA
if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
# Not supported on macOS
elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
# Not supported on macOS
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
set(CMAKE_OSX_ARCHITECTURES arm64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
set(CMAKE_OSX_ARCHITECTURES x86_64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
set(CMAKE_OSX_ARCHITECTURES x86_64)
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
set(CMAKE_OSX_ARCHITECTURES x86_64h)
elseif(${ASTCENC_ISA_SIMD} MATCHES "none")
# Using "none" uses implicit architecture
elseif(${ASTCENC_ISA_SIMD} MATCHES "native")
# Using "native" uses implicit architecture
else()
message(FATAL_ERROR "'${ASTCENC_ISA_SIMD}' is unknown ISA")
endif()
# Included once per enabled ISA, with ASTCENC_ISA_SIMD naming the variant
include(cmake_core.cmake)
endif()
endforeach()
@@ -0,0 +1,198 @@
# SPDX-License-Identifier: Apache-2.0
# ----------------------------------------------------------------------------
# Copyright 2020-2025 Arm Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy
# of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------
# NOTE(review): presumably this defines the is_msvc_fe / is_msvccl / is_gnu_fe /
# is_clang / GNU_LIKE values used below - confirm in cmake_compiler.cmake
include(../cmake_compiler.cmake)
# One unit test executable is created per SIMD ISA variant
set(ASTCENC_TEST test-unit-${ASTCENC_ISA_SIMD})
add_executable(${ASTCENC_TEST})
set_property(TARGET ${ASTCENC_TEST}
PROPERTY
CXX_STANDARD 17)
# Enable LTO under the conditions where the codec library will use LTO.
# The library link will fail if the settings don't match
if(${ASTCENC_CLI})
set_property(TARGET ${ASTCENC_TEST}
PROPERTY
INTERPROCEDURAL_OPTIMIZATION_RELEASE True)
endif()
# Use a static runtime on MSVC builds (ignored on non-MSVC compilers)
set_property(TARGET ${ASTCENC_TEST}
PROPERTY
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
# The test sources compiled into every ISA variant of the test executable
target_sources(${ASTCENC_TEST}
PRIVATE
test_simd.cpp
test_softfloat.cpp
test_decode.cpp)
# GoogleTest headers, needed for "gtest/gtest.h"
target_include_directories(${ASTCENC_TEST}
PRIVATE
${gtest_SOURCE_DIR}/include)
# Link against the static codec library built for the same ISA variant
target_link_libraries(${ASTCENC_TEST}
PRIVATE
astcenc-${ASTCENC_ISA_SIMD}-static)
# Warning and language options, each guarded by a generator expression so that
# it is only passed to the compiler front-end it applies to
target_compile_options(${ASTCENC_TEST}
PRIVATE
# Use pthreads on Linux/macOS
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>
# MSVC compiler defines
$<${is_msvc_fe}:/EHsc>
$<$<AND:$<BOOL:${ASTCENC_WERROR}>,${is_msvc_fe}>:/WX>
$<${is_msvccl}:/wd4324>
# G++ and Clang++ compiler defines
$<${is_gnu_fe}:-Wall>
$<${is_gnu_fe}:-Wextra>
$<${is_gnu_fe}:-Wpedantic>
$<$<AND:$<BOOL:${ASTCENC_WERROR}>,${is_gnu_fe}>:-Werror>
$<${is_gnu_fe}:-Wshadow>
$<${is_gnu_fe}:-Wdouble-promotion>
$<${is_clang}:-Wdocumentation>
# Hide noise thrown up by Clang 10 and clang-cl
$<${is_gnu_fe}:-Wno-unknown-warning-option>
$<${is_gnu_fe}:-Wno-c++98-compat-pedantic>
$<${is_gnu_fe}:-Wno-c++98-c++11-compat-pedantic>
$<${is_gnu_fe}:-Wno-float-equal>
$<${is_gnu_fe}:-Wno-overriding-option>
$<${is_gnu_fe}:-Wno-unsafe-buffer-usage>
$<${is_clang}:-Wno-switch-default>
# Ignore things that the googletest build triggers
# NOTE(review): -Wno-unknown-warning-option repeats the entry above, and
# -Wno-double-promotion switches off the -Wdouble-promotion requested earlier
$<${is_gnu_fe}:-Wno-unknown-warning-option>
$<${is_gnu_fe}:-Wno-double-promotion>
$<${is_gnu_fe}:-Wno-undef>
$<${is_gnu_fe}:-Wno-reserved-identifier>
$<${is_gnu_fe}:-Wno-global-constructors>)
# Set up configuration for SIMD ISA builds
#
# Each branch defines the full set of ASTCENC_* ISA macros for the test build
# and adds any compiler flags needed to enable that instruction set. The
# "native" variant matches no branch, so it gets no explicit definitions here.
if(${ASTCENC_ISA_SIMD} MATCHES "none")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
# Big-endian support is only configurable for the "none" variant
if(${ASTCENC_BIG_ENDIAN})
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_BIG_ENDIAN=1)
endif()
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SVE=0
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
# NOTE(review): ASTCENC_SVE presumably holds the vector width in 32-bit lanes
# (8 for 256-bit, 4 for 128-bit) - confirm against the codec sources
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SVE=8
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
# Enable SVE
target_compile_options(${ASTCENC_TEST}
PRIVATE
-march=armv8-a+sve -msve-vector-bits=256)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=1
ASTCENC_SVE=4
ASTCENC_SSE=0
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
# Enable SVE
target_compile_options(${ASTCENC_TEST}
PRIVATE
-march=armv8-a+sve)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=20
ASTCENC_AVX=0
ASTCENC_POPCNT=0
ASTCENC_F16C=0)
# Only GNU-like compilers are given an explicit SSE2 flag
target_compile_options(${ASTCENC_TEST}
PRIVATE
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-msse2>)
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=41
ASTCENC_AVX=0
ASTCENC_POPCNT=1
ASTCENC_F16C=0)
# Non-MSVC compilers need explicit flags; MSVC is given none for this variant
target_compile_options(${ASTCENC_TEST}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-msse4.1 -mpopcnt>)
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
target_compile_definitions(${ASTCENC_TEST}
PRIVATE
ASTCENC_NEON=0
ASTCENC_SVE=0
ASTCENC_SSE=41
ASTCENC_AVX=2
ASTCENC_POPCNT=1
ASTCENC_F16C=1)
# MSVC uses /arch:AVX2; other compilers use the individual -m flags
target_compile_options(${ASTCENC_TEST}
PRIVATE
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mavx2 -mpopcnt -mf16c>
$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
endif()
# gtest_main is linked in addition to the codec library; the test sources
# listed for this target do not need to define their own main()
target_link_libraries(${ASTCENC_TEST}
PRIVATE
gtest_main)
# Register the executable as a test, named after the executable itself
add_test(NAME ${ASTCENC_TEST}
COMMAND ${ASTCENC_TEST})
@@ -0,0 +1,80 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Unit tests for the image decompression functionality.
*/
#include <limits>
#include "gtest/gtest.h"
#include "../astcenc.h"
namespace astcenc
{

/**
 * @brief Test harness for exploring issue #447.
 *
 * Decompresses a single hand-written 12x12 block into an RGBA8 image and
 * checks that every codec API call reports success. The decoded pixel values
 * are not validated; enable the disabled dump loop at the end of the test to
 * inspect them manually.
 */
TEST(decode, decode12x12)
{
	astcenc_error status;
	astcenc_config config;
	astcenc_context* context = nullptr;

	static const astcenc_swizzle swizzle {
		ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
	};

	// One 16 byte compressed block; the disabled payload is an alternative
	// input kept for manual experiments
	uint8_t data[16] {
#if 0
		0x84,0x00,0x38,0xC8,0x00,0x00,0x00,0x00,
		0x00,0x00,0x00,0x00,0x00,0xB3,0x4D,0x78
#else
		0x29,0x00,0x1A,0x97,0x01,0x00,0x00,0x00,
		0x00,0x00,0x00,0x00,0x00,0xCF,0x97,0x86
#endif
	};

	// Output storage for 12x12 texels with 4 bytes per texel
	uint8_t output[12*12*4];

	// Abort the test on setup failure; later calls must not run with an
	// invalid config or an unallocated context
	status = astcenc_config_init(ASTCENC_PRF_LDR, 12, 12, 1, ASTCENC_PRE_MEDIUM, 0, &config);
	ASSERT_EQ(status, ASTCENC_SUCCESS);

	status = astcenc_context_alloc(&config, 1, &context);
	ASSERT_EQ(status, ASTCENC_SUCCESS);

	astcenc_image image;
	image.dim_x = 12;
	image.dim_y = 12;
	image.dim_z = 1;
	image.data_type = ASTCENC_TYPE_U8;

	// The image data is an array of slice pointers; here a single slice
	uint8_t* slices = output;
	image.data = reinterpret_cast<void**>(&slices);

	// Non-fatal check so that the context is always released below
	status = astcenc_decompress_image(context, data, 16, &image, &swizzle, 0);
	EXPECT_EQ(status, ASTCENC_SUCCESS);

#if 0
	for (int y = 0; y < 12; y++)
	{
		for (int x = 0; x < 12; x++)
		{
			uint8_t* pixel = output + (12 * 4 * y) + (4 * x);
			printf("[%2dx%2d] = %03d, %03d, %03d, %03d\n", x, y, pixel[0], pixel[1], pixel[2], pixel[3]);
		}
	}
#endif

	// Release the codec context so that the test does not leak it
	astcenc_context_free(context);
}

}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,68 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Unit tests for the software half-float library.
*/
#include "gtest/gtest.h"
#include "../astcenc_internal.h"
namespace astcenc
{

// The software half-float conversion is only tested for builds that set both
// ASTCENC_F16C and ASTCENC_NEON to zero
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)

/** @brief Test normal numbers. */
TEST(softfloat, FP16NormalNumbers)
{
	// Exponent field 15 with the lowest mantissa bit set
	float result = sf16_to_float((15 << 10) + 1);
	EXPECT_NEAR(result, 1.00098f, 0.00005f);
}

/** @brief Test denormal numbers. */
TEST(softfloat, FP16DenormalNumbers)
{
	// Exponent field 0 with the lowest mantissa bit set
	float result = sf16_to_float((0 << 10) + 1);

	// The tolerance must be far smaller than the expected value, otherwise
	// a conversion that flushes the denormal to zero would still pass
	EXPECT_NEAR(result, 5.96046e-08f, 1.0e-12f);
}

/** @brief Test zero. */
TEST(softfloat, FP16Zero)
{
	float result = sf16_to_float(0x0000);
	EXPECT_EQ(result, 0.0f);
}

/** @brief Test infinity. */
TEST(softfloat, FP16Infinity)
{
	// Exponent field 31 (all ones) with a zero mantissa
	float result = sf16_to_float((31 << 10) + 0);
	EXPECT_TRUE(std::isinf(result));
}

/** @brief Test NaN. */
TEST(softfloat, FP16NaN)
{
	// All bits set: exponent field all ones with a non-zero mantissa
	float result = sf16_to_float(0xFFFF);
	EXPECT_TRUE(std::isnan(result));
}

#endif

}
+874
View File
@@ -0,0 +1,874 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief The core astcenc codec library interface.
*
* This interface is the entry point to the core astcenc codec. It aims to be easy to use for
* non-experts, but also to allow experts to have fine control over the compressor heuristics if
* needed. The core codec only handles compression and decompression, transferring all inputs and
* outputs via memory buffers. To catch obvious input/output buffer sizing issues, which can cause
* security and stability problems, all transfer buffers are explicitly sized.
*
* While the aim is that we keep this interface mostly stable, it should be viewed as a mutable
* interface tied to a specific source version. We are not trying to maintain backwards
* compatibility across codec versions.
*
* The API state management is based around an explicit context object, which is the context for all
* allocated memory resources needed to compress and decompress a single image. A context can be
* used to sequentially compress multiple images using the same configuration, allowing setup
* overheads to be amortized over multiple images, which is particularly important when images are
* small.
*
* Multi-threading can be used in two ways.
*
* * An application wishing to process multiple images in parallel can allocate multiple
* contexts and assign each context to a thread.
* * An application wishing to process a single image using multiple threads can configure
* contexts for multi-threaded use, and invoke astcenc_compress/decompress() once per thread
* for faster processing. The caller is responsible for creating the worker threads, and
* synchronizing between images.
*
* Extended instruction set support
* ================================
*
* This library supports use of extended instruction sets, such as SSE4.1 and AVX2. These are
* enabled at compile time when building the library. There is no runtime checking in the core
* library that the instruction sets used are actually available. Checking compatibility is the
* responsibility of the calling code.
*
* Threading
* =========
*
* In pseudo-code, the usage for manual user threading looks like this:
*
* // Configure the compressor run
* astcenc_config my_config;
* astcenc_config_init(..., &my_config);
*
* // Power users can tweak <my_config> settings here ...
*
* // Allocate working state given config and thread_count
* astcenc_context* my_context;
* astcenc_context_alloc(&my_config, thread_count, &my_context);
*
* // Compress each image using these config settings
* foreach image:
* // For each thread in the thread pool
* for i in range(0, thread_count):
* astcenc_compress_image(my_context, &my_input, my_output, i);
*
* astcenc_compress_reset(my_context);
*
* // Clean up
* astcenc_context_free(my_context);
*
* Images
* ======
*
* The codec supports compressing single images, which can be either 2D images or volumetric 3D
* images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
* texture arrays, or sliced 3D textures.
*
* Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
* half-float, or 32-bit float, as indicated by the data_type field.
*
* Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
*
* Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
* within an image slice is always tightly packed without padding. Addressing looks like this:
*
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 ] // Red
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1] // Green
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 2] // Blue
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 3] // Alpha
*
* Common compressor usage
* =======================
*
* One of the most important things for coding image quality is to align the input data component
* count with the ASTC color endpoint mode. This avoids wasting bits encoding components you don't
* actually need in the endpoint colors.
*
* | Input data | Encoding swizzle | Sampling swizzle |
* | ------------ | ---------------- | ---------------- |
* | 1 component | RRR1 | .[rgb] |
* | 2 components | RRRG | .[rgb]a |
* | 3 components | RGB1 | .rgb |
* | 4 components | RGBA | .rgba |
*
* The 1 and 2 component modes recommend sampling from "g" to recover the luminance value as this
* provides best compatibility with other texture formats where the green component may be stored at
* higher precision than the others, such as RGB565. For ASTC any of the RGB components can be used;
* the luminance endpoint component will be returned for all three.
*
* When using the normal map compression mode ASTC will store normals as a two component X+Y map.
* Input images must contain unit-length normalized vectors and should be passed in using a two
* component swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers
* prefer to use GGGR for compatibility with BC5n, which will work just as well. The Z component can
* be recovered programmatically in shader code, using knowledge that the vector is unit length and
* that Z must be positive for a tangent-space normal map.
*
* Decompress-only usage
* =====================
*
* For some use cases it is useful to have a cut-down context and/or library which supports
* decompression but not compression.
*
* A context can be made decompress-only using the ASTCENC_FLG_DECOMPRESS_ONLY flag when the context
* is allocated. These contexts have lower dynamic memory footprint than a full context.
*
* The entire library can be made decompress-only by building the files with the define
* ASTCENC_DECOMPRESS_ONLY set. In this build the context will be smaller, and the library will
* exclude the functionality which is only needed for compression. This reduces the binary size by
* ~180KB. For these builds contexts must be created with the ASTCENC_FLG_DECOMPRESS_ONLY flag.
*
* Note that context structures returned by a library built as decompress-only are incompatible with
* a library built with compression included, and vice versa, as they have different sizes and
* memory layout.
*
* Self-decompress-only usage
* ==========================
*
* ASTC is a complex format with a large search space. The parts of this search space that are
* searched are determined by heuristics that are, in part, tied to the quality level used when
* creating the context.
*
* A normal context is capable of decompressing any ASTC texture, including those generated by other
* compressors with unknown heuristics. This is the most flexible implementation, but forces the
* data tables used by the codec to include entries that are not needed during compression. This
* can slow down context creation by a significant amount, especially for the faster compression
* modes where few data table entries are actually used. To optimize this use case the context can
* be created with the ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will
* only be asked to decompress images that it compressed itself, allowing the data tables to
* exclude entries that are not needed by the current compression configuration. This reduces the
* size of the context data tables in memory and improves context creation performance. Note that,
* as of the 3.6 release, this flag no longer affects compression performance.
*
* Using this flag while attempting to decompress a valid image which was created by another
* compressor, or even another astcenc compressor version or configuration, may result in blocks
* returning as solid magenta or NaN value error blocks.
*/
#ifndef ASTCENC_INCLUDED
#define ASTCENC_INCLUDED
#include <cstddef>
#include <cstdint>
/*
* ASTCENC_PUBLIC is the decoration placed in front of every public API function declaration.
*
* When ASTCENC_DYNAMIC_LIBRARY is defined the functions are given C linkage and are exported from
* the shared library. Otherwise the macro expands to nothing.
*/
#if defined(ASTCENC_DYNAMIC_LIBRARY)
#if defined(_MSC_VER)
// MSVC: export the symbol from the DLL
#define ASTCENC_PUBLIC extern "C" __declspec(dllexport)
#else
// Other compilers: give the symbol default visibility
#define ASTCENC_PUBLIC extern "C" __attribute__ ((visibility ("default")))
#endif
#else
// Not a dynamic library build: no decoration
#define ASTCENC_PUBLIC
#endif
/* ============================================================================
Data declarations
============================================================================ */
/**
* @brief An opaque structure; see astcenc_internal.h for definition.
*
* Instances are obtained from @c astcenc_context_alloc() and released with
* @c astcenc_context_free(); API users only ever hold a pointer to one.
*/
struct astcenc_context;
/**
 * @brief The status codes returned by the codec API functions.
 */
enum astcenc_error
{
    /** @brief The call was successful. */
    ASTCENC_SUCCESS = 0,
    /** @brief The call failed due to low memory, or undersized I/O buffers. */
    ASTCENC_ERR_OUT_OF_MEM = 1,
    /** @brief The call failed due to the build using fast math. */
    ASTCENC_ERR_BAD_CPU_FLOAT = 2,
    /** @brief The call failed due to an out-of-spec parameter. */
    ASTCENC_ERR_BAD_PARAM = 3,
    /** @brief The call failed due to an out-of-spec block size. */
    ASTCENC_ERR_BAD_BLOCK_SIZE = 4,
    /** @brief The call failed due to an out-of-spec color profile. */
    ASTCENC_ERR_BAD_PROFILE = 5,
    /** @brief The call failed due to an out-of-spec quality value. */
    ASTCENC_ERR_BAD_QUALITY = 6,
    /** @brief The call failed due to an out-of-spec component swizzle. */
    ASTCENC_ERR_BAD_SWIZZLE = 7,
    /** @brief The call failed due to an out-of-spec flag set. */
    ASTCENC_ERR_BAD_FLAGS = 8,
    /** @brief The call failed due to the context not supporting the operation. */
    ASTCENC_ERR_BAD_CONTEXT = 9,
    /** @brief The call failed due to unimplemented functionality. */
    ASTCENC_ERR_NOT_IMPLEMENTED = 10,
    /** @brief The call failed due to an out-of-spec decode mode flag set. */
    ASTCENC_ERR_BAD_DECODE_MODE = 11,
#if defined(ASTCENC_DIAGNOSTICS)
    /** @brief The call failed due to an issue with diagnostic tracing. */
    ASTCENC_ERR_DTRACE_FAILURE = 12,
#endif
};
/**
 * @brief The color profiles supported by the codec.
 */
enum astcenc_profile
{
    /** @brief The LDR sRGB color profile. */
    ASTCENC_PRF_LDR_SRGB = 0,
    /** @brief The LDR linear color profile. */
    ASTCENC_PRF_LDR = 1,
    /** @brief The HDR RGB with LDR alpha color profile. */
    ASTCENC_PRF_HDR_RGB_LDR_A = 2,
    /** @brief The HDR RGBA color profile. */
    ASTCENC_PRF_HDR = 3
};
/*
* Search quality presets. These are the named values that can be passed as the quality argument of
* astcenc_config_init(), which also accepts any other effort level between 0 and 100.
*/
/** @brief The fastest, lowest quality, search preset. */
static const float ASTCENC_PRE_FASTEST = 0.0f;
/** @brief The fast search preset. */
static const float ASTCENC_PRE_FAST = 10.0f;
/** @brief The medium quality search preset. */
static const float ASTCENC_PRE_MEDIUM = 60.0f;
/** @brief The thorough quality search preset. */
static const float ASTCENC_PRE_THOROUGH = 98.0f;
/** @brief The very thorough quality search preset. */
static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
/** @brief The exhaustive, highest quality, search preset. */
static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
/**
* @brief A codec component swizzle selector.
*
* Each selector names the source of one output component: one of the four color components, a
* constant zero or one, or a reconstructed normal Z value.
*/
enum astcenc_swz
{
/** @brief Select the red component. */
ASTCENC_SWZ_R = 0,
/** @brief Select the green component. */
ASTCENC_SWZ_G = 1,
/** @brief Select the blue component. */
ASTCENC_SWZ_B = 2,
/** @brief Select the alpha component. */
ASTCENC_SWZ_A = 3,
/** @brief Use a constant zero component. */
ASTCENC_SWZ_0 = 4,
/** @brief Use a constant one component. */
ASTCENC_SWZ_1 = 5,
/** @brief Use a reconstructed normal vector Z component. */
ASTCENC_SWZ_Z = 6
};
/**
* @brief A texel component swizzle.
*
* Holds one selector per output component. The compression functions apply the swizzle before
* compression and the decompression functions apply it after decompression.
*/
struct astcenc_swizzle
{
/** @brief The red component selector. */
astcenc_swz r;
/** @brief The green component selector. */
astcenc_swz g;
/** @brief The blue component selector. */
astcenc_swz b;
/** @brief The alpha component selector. */
astcenc_swz a;
};
/**
* @brief A texel component data format.
*
* Used by the @c data_type field of @c astcenc_image to describe how each component is stored.
*/
enum astcenc_type
{
/** @brief Unorm 8-bit data per component. */
ASTCENC_TYPE_U8 = 0,
/** @brief 16-bit float per component. */
ASTCENC_TYPE_F16 = 1,
/** @brief 32-bit float per component. */
ASTCENC_TYPE_F32 = 2
};
/**
* @brief Function pointer type for compression progress reporting callback.
*
* The callback has C linkage, takes a single @c float and returns nothing. Per the documentation
* of the @c progress_callback config field, the value passed is the compression progress as a
* percentage between 0 and 100.
*/
extern "C" typedef void (*astcenc_progress_callback)(float);
/**
* @brief Enable normal map compression.
*
* Input data will be treated as a two component normal map, storing X and Y, and the codec will
* optimize for angular error rather than simple linear PSNR. In this mode the input swizzle should
* be e.g. rrrg (the default ordering for ASTC normals on the command line) or gggr (the ordering
* used by BC5n).
*/
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
/**
* @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
*
* The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
* flag during compression will allow the compressor to use the correct rounding when selecting
* encodings. This will improve the compressed image quality if your application is using the
* decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
*
* Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
* this setting.
*/
static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1;
/**
* @brief Enable alpha weighting.
*
* The input alpha value is used for transparency, so errors in the RGB components are weighted by
* the transparency level. This allows the codec to more accurately encode the alpha value in areas
* where the color value is less significant.
*
* See the @c a_scale_radius config field for the associated alpha-weight scaling radius.
*/
static const unsigned int ASTCENC_FLG_USE_ALPHA_WEIGHT = 1 << 2;
/**
* @brief Enable perceptual error metrics.
*
* This flag enables perceptual compression mode, which will optimize for perceptual error rather
* than best PSNR. Only some input modes support perceptual error metrics.
*/
static const unsigned int ASTCENC_FLG_USE_PERCEPTUAL = 1 << 3;
/**
* @brief Create a decompression-only context.
*
* This mode disables support for compression. This enables context allocation to skip some
* transient buffer allocation, resulting in lower memory usage.
*
* The compression functions will fail if invoked on such a context. For a decompress-only library
* build this flag must be set when creating any context.
*/
static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY = 1 << 4;
/**
* @brief Create a self-decompression context.
*
* This mode configures the compressor so that it is only guaranteed to be able to decompress images
* that were actually created using the current context. This is the common case for compression use
* cases, and setting this flag enables additional optimizations, but does mean that the context
* cannot reliably decompress arbitrary ASTC images.
*
* Decompressing an image created by another compressor, or by another astcenc version or
* configuration, may result in blocks returning as solid magenta or NaN value error blocks.
*/
static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
/**
* @brief Enable RGBM map compression.
*
* Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
* format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
* compression function; this flag is only used to control the use of RGBM-specific heuristics and
* error metrics.
*
* IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
* M values can round to zero due to quantization and result in black or white pixels. It is highly
* recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
* 16 or 32). Applying this threshold reduces the number of very dark colors that can be
* represented, but is still higher precision than 8-bit LDR.
*
* When this flag is set the value of @c rgbm_m_scale in the config must be set to the RGBM scale
* factor used during reconstruction. This defaults to 5 when in RGBM mode.
*
* It is recommended that the value of @c cw_a_weight in the config is set to twice the value of the
* multiplier scale, ensuring that the M value is accurately encoded. This defaults to 10 when in
* RGBM mode, matching the default scale factor.
*/
static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6;
/**
 * @brief The bit mask of all valid flags.
 *
 * This is the union of every @c ASTCENC_FLG_* bit, listed here in ascending bit order.
 */
static const unsigned int ASTCENC_ALL_FLAGS =
    ASTCENC_FLG_MAP_NORMAL
    | ASTCENC_FLG_USE_DECODE_UNORM8
    | ASTCENC_FLG_USE_ALPHA_WEIGHT
    | ASTCENC_FLG_USE_PERCEPTUAL
    | ASTCENC_FLG_DECOMPRESS_ONLY
    | ASTCENC_FLG_SELF_DECOMPRESS_ONLY
    | ASTCENC_FLG_MAP_RGBM;
/**
* @brief The config structure.
*
* This structure will initially be populated by a call to astcenc_config_init, but power users may
* modify it before calling astcenc_context_alloc. See astcenccli_toplevel_help.cpp for full user
* documentation of the power-user settings.
*
* Where a field corresponds to a command line option, the option name is given in parentheses.
*
* Note for any settings which are associated with a specific color component, the value in the
* config applies to the component that exists after any compression data swizzle is applied.
*/
struct astcenc_config
{
/** @brief The color profile. */
astcenc_profile profile;
/** @brief The set of enabled @c ASTCENC_FLG_* flag bits. */
unsigned int flags;
/** @brief The ASTC block size X dimension. */
unsigned int block_x;
/** @brief The ASTC block size Y dimension. */
unsigned int block_y;
/** @brief The ASTC block size Z dimension. */
unsigned int block_z;
/** @brief The red component weight scale for error weighting (-cw). */
float cw_r_weight;
/** @brief The green component weight scale for error weighting (-cw). */
float cw_g_weight;
/** @brief The blue component weight scale for error weighting (-cw). */
float cw_b_weight;
/** @brief The alpha component weight scale for error weighting (-cw). */
float cw_a_weight;
/**
* @brief The radius for any alpha-weight scaling (-a).
*
* It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
* will be sampled using linear texture filtering to minimize color bleed out of transparent
* texels that are adjacent to non-transparent texels.
*/
unsigned int a_scale_radius;
/** @brief The RGBM scale factor for the shared multiplier (-rgbm). */
float rgbm_m_scale;
/**
* @brief The maximum number of partitions searched (-partitioncountlimit).
*
* Valid values are between 1 and 4.
*/
unsigned int tune_partition_count_limit;
/**
* @brief The maximum number of partition indices searched for 2 partitions
* (-2partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_2partition_index_limit;
/**
* @brief The maximum number of partition indices searched for 3 partitions
* (-3partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_3partition_index_limit;
/**
* @brief The maximum number of partition indices searched for 4 partitions
* (-4partitionindexlimit).
*
* Valid values are between 1 and 1024.
*/
unsigned int tune_4partition_index_limit;
/**
* @brief The maximum centile for block modes searched (-blockmodelimit).
*
* Valid values are between 1 and 100.
*/
unsigned int tune_block_mode_limit;
/**
* @brief The maximum iterative refinements applied (-refinementlimit).
*
* Valid values are between 1 and N; there is no technical upper limit
* but little benefit is expected after N=4.
*/
unsigned int tune_refinement_limit;
/**
* @brief The number of trial candidates per mode search (-candidatelimit).
*
* Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES.
*/
unsigned int tune_candidate_limit;
/**
* @brief The number of trial partitionings per search for 2 partitions
* (-2partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_2partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search for 3 partitions
* (-3partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_3partitioning_candidate_limit;
/**
* @brief The number of trial partitionings per search for 4 partitions
* (-4partitioncandidatelimit).
*
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
*/
unsigned int tune_4partitioning_candidate_limit;
/**
* @brief The dB threshold for stopping block search (-dblimit).
*
* This option is ineffective for HDR textures.
*/
float tune_db_limit;
/**
* @brief The amount of MSE overshoot needed to early-out trials.
*
* The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
* the high probability block modes. This can short-cut compression for simple blocks.
*
* The second early-out is for refinement trials, where we can exit refinement once quality is
* reached.
*/
float tune_mse_overshoot;
/**
* @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
*
* This option is further scaled for normal maps, so it skips less often.
*/
float tune_2partition_early_out_limit_factor;
/**
* @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
*
* This option is further scaled for normal maps, so it skips less often.
*/
float tune_3partition_early_out_limit_factor;
/**
* @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
*
* This option is ineffective for normal maps.
*/
float tune_2plane_early_out_limit_correlation;
/**
* @brief The config enable for the mode0 fast-path search.
*
* If this is set to TUNE_MIN_TEXELS_MODE0 or higher then the early-out fast mode0
* search is enabled. This option is ineffective for 3D block sizes.
*/
float tune_search_mode0_enable;
/**
* @brief The progress callback, can be @c nullptr.
*
* If this is specified the codec will periodically report progress for
* compression as a percentage between 0 and 100. The callback is called from one
* of the compressor threads, so doing significant work in the callback will
* reduce compression performance.
*/
astcenc_progress_callback progress_callback;
#if defined(ASTCENC_DIAGNOSTICS)
/**
* @brief The path to save the diagnostic trace data to.
*
* This option is not part of the public API, and requires special builds
* of the library.
*/
const char* trace_file_path;
#endif
};
/**
* @brief An uncompressed 2D or 3D image.
*
* 3D images are passed in as an array of 2D slices. Each slice has identical
* size and color format.
*/
struct astcenc_image
{
/** @brief The X dimension of the image, in texels. */
unsigned int dim_x;
/** @brief The Y dimension of the image, in texels. */
unsigned int dim_y;
/** @brief The Z dimension of the image, in texels. */
unsigned int dim_z;
/** @brief The data type per component. */
astcenc_type data_type;
/** @brief The array of 2D slices, of length @c dim_z. */
void** data;
};
/**
* @brief A block encoding metadata query result.
*
* If the block is an error block or a constant color block all fields other than the profile,
* block dimensions, and error/constant indicator will be zero.
*/
struct astcenc_block_info
{
/** @brief The block encoding color profile. */
astcenc_profile profile;
/** @brief The number of texels in the X dimension. */
unsigned int block_x;
/** @brief The number of texels in the Y dimension. */
unsigned int block_y;
/** @brief The number of texels in the Z dimension. */
unsigned int block_z;
/** @brief The number of texels in the block. */
unsigned int texel_count;
/** @brief True if this block is an error block. */
bool is_error_block;
/** @brief True if this block is a constant color block. */
bool is_constant_block;
/** @brief True if this block is an HDR block. */
bool is_hdr_block;
/** @brief True if this block uses two weight planes. */
bool is_dual_plane_block;
/** @brief The number of partitions if not constant color. */
unsigned int partition_count;
/** @brief The partition index if 2 - 4 partitions used. */
unsigned int partition_index;
/** @brief The component index of the second plane if dual plane. */
unsigned int dual_plane_component;
/** @brief The color endpoint encoding mode for each partition. */
unsigned int color_endpoint_modes[4];
/** @brief The number of color endpoint quantization levels. */
unsigned int color_level_count;
/** @brief The number of weight quantization levels. */
unsigned int weight_level_count;
/** @brief The number of weights in the X dimension. */
unsigned int weight_x;
/** @brief The number of weights in the Y dimension. */
unsigned int weight_y;
/** @brief The number of weights in the Z dimension. */
unsigned int weight_z;
/** @brief The unpacked color endpoints for each partition. */
float color_endpoints[4][2][4];
// NOTE(review): the three per-texel arrays below hold 216 entries, presumably the texel count
// of the largest supported block size - confirm against the block size limits.
/** @brief The per-texel interpolation weights for the first weight plane of the block. */
float weight_values_plane1[216];
/**
* @brief The per-texel interpolation weights for the second weight plane of the block.
*
* Presumably only meaningful when @c is_dual_plane_block is true - confirm.
*/
float weight_values_plane2[216];
/** @brief The per-texel partition assignments for the block. */
uint8_t partition_assignment[216];
};
/**
* @brief Populate a codec config based on default settings.
*
* Power users can edit the returned config struct to fine tune before allocating the context.
*
* @param profile Color profile.
* @param block_x ASTC block size X dimension.
* @param block_y ASTC block size Y dimension.
* @param block_z ASTC block size Z dimension.
* @param quality Search quality preset / effort level. Either an
* @c ASTCENC_PRE_* value, or an effort level between 0
* and 100. Performance is not linear between 0 and 100.
* @param flags A valid set of @c ASTCENC_FLG_* flag bits.
* @param[out] config Output config struct to populate.
*
* @return @c ASTCENC_SUCCESS on success, or an error if the inputs are invalid
* either individually, or in combination.
*/
ASTCENC_PUBLIC astcenc_error astcenc_config_init(
astcenc_profile profile,
unsigned int block_x,
unsigned int block_y,
unsigned int block_z,
float quality,
unsigned int flags,
astcenc_config* config);
/**
* @brief Allocate a new codec context based on a config.
*
* This function allocates all of the memory resources and threads needed by the codec. This can be
* slow, so it is recommended that contexts are reused to serially compress or decompress multiple
* images to amortize setup cost.
*
* NOTE(review): the file-level documentation states that the caller is responsible for creating
* the worker threads, which conflicts with "and threads" above - confirm which is correct.
*
* Contexts can be allocated to support only decompression using the @c ASTCENC_FLG_DECOMPRESS_ONLY
* flag when creating the configuration. The compression functions will fail if invoked. For a
* decompress-only library build the @c ASTCENC_FLG_DECOMPRESS_ONLY flag must be set when creating
* any context.
*
* The returned context should be released with @c astcenc_context_free() when no longer needed.
*
* @param[in] config Codec config.
* @param thread_count Thread count to configure for.
* @param[out] context Location to store an opaque context pointer.
*
* @return @c ASTCENC_SUCCESS on success, or an error if context creation failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_context_alloc(
const astcenc_config* config,
unsigned int thread_count,
astcenc_context** context);
/**
* @brief Compress an image.
*
* A single context can only compress or decompress a single image at a time.
*
* For a context configured for multi-threading, any set of the N threads can call this function.
* Work will be dynamically scheduled across the threads available. Each thread must have a unique
* @c thread_index.
*
* After all threads have returned, @c astcenc_compress_reset() must be called before the same
* multi-threaded context is used to compress another image.
*
* @param context Codec context.
* @param[in,out] image An input image, in 2D slices.
* @param swizzle Compression data swizzle, applied before compression.
* @param[out] data_out Pointer to output data array.
* @param data_len Length of the output data array (presumably in bytes, as for
* @c astcenc_decompress_image() - confirm).
* @param thread_index Thread index [0..N-1] of calling thread.
*
* @return @c ASTCENC_SUCCESS on success, or an error if compression failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
astcenc_context* context,
astcenc_image* image,
const astcenc_swizzle* swizzle,
uint8_t* data_out,
size_t data_len,
unsigned int thread_index);
/**
* @brief Reset the codec state for a new compression.
*
* The caller is responsible for synchronizing threads in the worker thread pool. This function must
* only be called when all threads have exited the @c astcenc_compress_image() function for image N,
* but before any thread enters it for image N + 1.
*
* Calling this is not required (but won't hurt) if the context is created for single threaded use.
*
* @param context Codec context.
*
* @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
astcenc_context* context);
/**
* @brief Cancel any pending compression operation.
*
* The caller must behave as if the compression completed normally, even though the output data
* will be undefined. They are still responsible for synchronizing threads in the worker thread
* pool, and must call @c astcenc_compress_reset() before starting another compression.
*
* @param context Codec context.
*
* @return @c ASTCENC_SUCCESS on success, or an error if cancellation failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_compress_cancel(
astcenc_context* context);
/**
* @brief Decompress an image.
*
* Each calling thread passes its own @c thread_index. After all threads have returned,
* @c astcenc_decompress_reset() must be called before the same multi-threaded context is used to
* decompress another image.
*
* @param context Codec context.
* @param[in] data Pointer to compressed data.
* @param data_len Length of the compressed data, in bytes.
* @param[in,out] image_out Output image.
* @param swizzle Decompression data swizzle, applied after decompression.
* @param thread_index Thread index [0..N-1] of calling thread.
*
* @return @c ASTCENC_SUCCESS on success, or an error if decompression failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_decompress_image(
astcenc_context* context,
const uint8_t* data,
size_t data_len,
astcenc_image* image_out,
const astcenc_swizzle* swizzle,
unsigned int thread_index);
/**
* @brief Reset the codec state for a new decompression.
*
* The caller is responsible for synchronizing threads in the worker thread pool. This function must
* only be called when all threads have exited the @c astcenc_decompress_image() function for image
* N, but before any thread enters it for image N + 1.
*
* Calling this is not required (but won't hurt) if the context is created for single threaded use.
*
* @param context Codec context.
*
* @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
*/
ASTCENC_PUBLIC astcenc_error astcenc_decompress_reset(
astcenc_context* context);
/**
* @brief Free the compressor context.
*
* Releases a context previously returned by @c astcenc_context_alloc().
*
* @param context The codec context.
*/
ASTCENC_PUBLIC void astcenc_context_free(
astcenc_context* context);
/**
* @brief Provide a high level summary of a block's encoding.
*
* This feature is primarily useful for codec developers but may be useful for developers building
* advanced content packaging pipelines.
*
* @param context Codec context.
* @param[in] data One block of compressed ASTC data (16 bytes).
* @param[out] info The output info structure to populate.
*
* @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
* function will return success even if the block itself was an error block encoding, as the
* decode was correctly handled.
*/
ASTCENC_PUBLIC astcenc_error astcenc_get_block_info(
astcenc_context* context,
const uint8_t data[16],
astcenc_block_info* info);
/**
* @brief Get a printable string for a specific status code.
*
* @param status The status value, as returned by one of the codec API functions.
*
* @return A human readable nul-terminated string.
*/
ASTCENC_PUBLIC const char* astcenc_get_error_string(
astcenc_error status);
#endif
@@ -0,0 +1,948 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for finding dominant direction of a set of colors.
*/
#if !defined(ASTCENC_DECOMPRESS_ONLY)
#include "astcenc_internal.h"
#include <cassert>
/**
* @brief Compute the average RGB color of each partition.
*
* The algorithm here uses a vectorized sequential scan and per-partition
* color accumulators, using select() to mask texel lanes in other partitions.
*
* We only accumulate sums for N-1 partitions during the scan; the value for
* the last partition can be computed given that we know the block-wide average
* already.
*
* Because of this we could reduce the loop iteration count so it "just" spans
* the max texel index needed for the N-1 partitions, which could need fewer
* iterations than the full block texel count. However, this makes the loop
* count erratic and causes more branch mispredictions so is a net loss.
*
* @param pi The partitioning to use.
* @param blk The block data to process.
* @param[out] averages The output averages. Unused partition indices will
* not be initialized, and lane<3> will be zero.
*/
static void compute_partition_averages_rgb(
const partition_info& pi,
const image_block& blk,
vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
unsigned int partition_count = pi.partition_count;
size_t texel_count = blk.texel_count;
promise(texel_count > 0);
// For 1 partition just use the precomputed mean
if (partition_count == 1)
{
averages[0] = blk.data_mean.swz<0, 1, 2>();
}
// For 2 partitions scan results for partition 0, compute partition 1
else if (partition_count == 2)
{
vfloatacc pp_avg_rgb[3] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgb[0], data_r, p0_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgb[1], data_g, p0_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgb[2], data_b, p0_mask);
}
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
hadd_s(pp_avg_rgb[1]),
hadd_s(pp_avg_rgb[2]));
vfloat4 p1_total = block_total - p0_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
}
// For 3 partitions scan results for partition 0/1, compute partition 2
else if (partition_count == 3)
{
vfloatacc pp_avg_rgb[2][3] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
}
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
hadd_s(pp_avg_rgb[0][1]),
hadd_s(pp_avg_rgb[0][2]));
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
hadd_s(pp_avg_rgb[1][1]),
hadd_s(pp_avg_rgb[1][2]));
vfloat4 p2_total = block_total - p0_total - p1_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
}
else
{
// For 4 partitions scan results for partition 0/1/2, compute partition 3
vfloatacc pp_avg_rgb[3][3] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vmask p2_mask = lane_mask & (texel_partition == vint(2));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
}
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
hadd_s(pp_avg_rgb[0][1]),
hadd_s(pp_avg_rgb[0][2]));
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
hadd_s(pp_avg_rgb[1][1]),
hadd_s(pp_avg_rgb[1][2]));
vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
hadd_s(pp_avg_rgb[2][1]),
hadd_s(pp_avg_rgb[2][2]));
vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
}
}
/**
* @brief Compute the average RGBA color of each partition.
*
* The algorithm here uses a vectorized sequential scan and per-partition
* color accumulators, using select() to mask texel lanes in other partitions.
*
* We only accumulate sums for N-1 partitions during the scan; the value for
* the last partition can be computed given that we know the block-wide average
* already.
*
* Because of this we could reduce the loop iteration count so it "just" spans
* the max texel index needed for the N-1 partitions, which could need fewer
* iterations than the full block texel count. However, this makes the loop
* count erratic and causes more branch mispredictions so is a net loss.
*
* @param pi The partitioning to use.
* @param blk The block data to process.
* @param[out] averages The output averages. Unused partition indices will
* not be initialized.
*/
static void compute_partition_averages_rgba(
const partition_info& pi,
const image_block& blk,
vfloat4 averages[BLOCK_MAX_PARTITIONS]
) {
unsigned int partition_count = pi.partition_count;
size_t texel_count = blk.texel_count;
promise(texel_count > 0);
// For 1 partition just use the precomputed mean
if (partition_count == 1)
{
averages[0] = blk.data_mean;
}
// For 2 partitions scan results for partition 0, compute partition 1
else if (partition_count == 2)
{
vfloat4 pp_avg_rgba[4] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgba[0], data_r, p0_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgba[1], data_g, p0_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgba[2], data_b, p0_mask);
vfloat data_a = loada(blk.data_a + i);
haccumulate(pp_avg_rgba[3], data_a, p0_mask);
}
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
hadd_s(pp_avg_rgba[1]),
hadd_s(pp_avg_rgba[2]),
hadd_s(pp_avg_rgba[3]));
vfloat4 p1_total = block_total - p0_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
}
// For 3 partitions scan results for partition 0/1, compute partition 2
else if (partition_count == 3)
{
vfloat4 pp_avg_rgba[2][4] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
vfloat data_a = loada(blk.data_a + i);
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
}
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
hadd_s(pp_avg_rgba[0][1]),
hadd_s(pp_avg_rgba[0][2]),
hadd_s(pp_avg_rgba[0][3]));
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
hadd_s(pp_avg_rgba[1][1]),
hadd_s(pp_avg_rgba[1][2]),
hadd_s(pp_avg_rgba[1][3]));
vfloat4 p2_total = block_total - p0_total - p1_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
}
else
{
// For 4 partitions scan results for partition 0/1/2, compute partition 3
vfloat4 pp_avg_rgba[3][4] {};
vint lane_id = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
vint texel_partition(pi.partition_of_texel + i);
vmask lane_mask = lane_id < vint_from_size(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vmask p0_mask = lane_mask & (texel_partition == vint(0));
vmask p1_mask = lane_mask & (texel_partition == vint(1));
vmask p2_mask = lane_mask & (texel_partition == vint(2));
vfloat data_r = loada(blk.data_r + i);
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
vfloat data_g = loada(blk.data_g + i);
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
vfloat data_b = loada(blk.data_b + i);
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
vfloat data_a = loada(blk.data_a + i);
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
}
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
hadd_s(pp_avg_rgba[0][1]),
hadd_s(pp_avg_rgba[0][2]),
hadd_s(pp_avg_rgba[0][3]));
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
hadd_s(pp_avg_rgba[1][1]),
hadd_s(pp_avg_rgba[1][2]),
hadd_s(pp_avg_rgba[1][3]));
vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
hadd_s(pp_avg_rgba[2][1]),
hadd_s(pp_avg_rgba[2][2]),
hadd_s(pp_avg_rgba[2][3]));
vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_4_comp(
const partition_info& pi,
const image_block& blk,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
size_t partition_count = pi.partition_count;
promise(partition_count > 0);
// Pre-compute partition_averages
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
compute_partition_averages_rgba(pi, blk, partition_averages);
for (size_t partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
size_t texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
vfloat4 average = partition_averages[partition];
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
vfloat4 sum_zp = vfloat4::zero();
vfloat4 sum_wp = vfloat4::zero();
for (size_t i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = blk.texel(iwt);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
sum_zp += select(zero, texel_datum, tdm2);
vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
sum_wp += select(zero, texel_datum, tdm3);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
vfloat4 prod_wp = dot(sum_wp, sum_wp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
best_sum = select(best_sum, prod_yp, mask);
mask = prod_zp > best_sum;
best_vector = select(best_vector, sum_zp, mask);
best_sum = select(best_sum, prod_zp, mask);
mask = prod_wp > best_sum;
best_vector = select(best_vector, sum_wp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_3_comp(
	const partition_info& pi,
	const image_block& blk,
	unsigned int omitted_component,
	partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
	// Pre-compute partition_averages. The helper only writes the entries for
	// partitions that are in use, but every entry is swizzled unconditionally
	// below, so the array is zero-initialized to avoid reading indeterminate
	// values when fewer than BLOCK_MAX_PARTITIONS partitions are active.
	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS] {};
	compute_partition_averages_rgba(pi, blk, partition_averages);

	// The three retained channels, in order; defaults to RGB (alpha omitted)
	const float* data_vr = blk.data_r;
	const float* data_vg = blk.data_g;
	const float* data_vb = blk.data_b;

	// Drop the omitted channel from each average and point the data pointers
	// at the three channels that remain.
	// TODO: Data-driven permute would be useful to avoid this ...
	if (omitted_component == 0)
	{
		partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
		partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
		partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
		partition_averages[3] = partition_averages[3].swz<1, 2, 3>();

		data_vr = blk.data_g;
		data_vg = blk.data_b;
		data_vb = blk.data_a;
	}
	else if (omitted_component == 1)
	{
		partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
		partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
		partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
		partition_averages[3] = partition_averages[3].swz<0, 2, 3>();

		data_vg = blk.data_b;
		data_vb = blk.data_a;
	}
	else if (omitted_component == 2)
	{
		partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
		partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
		partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
		partition_averages[3] = partition_averages[3].swz<0, 1, 3>();

		data_vb = blk.data_a;
	}
	else
	{
		partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
		partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
		partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
	}

	size_t partition_count = pi.partition_count;
	promise(partition_count > 0);

	for (size_t partition = 0; partition < partition_count; partition++)
	{
		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
		size_t texel_count = pi.partition_texel_count[partition];
		promise(texel_count > 0);

		vfloat4 average = partition_averages[partition];
		pm[partition].avg = average;

		// One candidate direction per retained channel: the sum of the
		// mean-relative offsets of texels on the positive side of that channel
		vfloat4 sum_xp = vfloat4::zero();
		vfloat4 sum_yp = vfloat4::zero();
		vfloat4 sum_zp = vfloat4::zero();

		for (size_t i = 0; i < texel_count; i++)
		{
			unsigned int iwt = texel_indexes[i];

			vfloat4 texel_datum = vfloat3(data_vr[iwt],
			                              data_vg[iwt],
			                              data_vb[iwt]);
			texel_datum = texel_datum - average;

			vfloat4 zero = vfloat4::zero();

			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
			sum_xp += select(zero, texel_datum, tdm0);

			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
			sum_yp += select(zero, texel_datum, tdm1);

			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
			sum_zp += select(zero, texel_datum, tdm2);
		}

		// Squared length of each candidate
		vfloat4 prod_xp = dot(sum_xp, sum_xp);
		vfloat4 prod_yp = dot(sum_yp, sum_yp);
		vfloat4 prod_zp = dot(sum_zp, sum_zp);

		// Keep the longest candidate; on a tie the earlier channel wins
		vfloat4 best_vector = sum_xp;
		vfloat4 best_sum = prod_xp;

		vmask4 mask = prod_yp > best_sum;
		best_vector = select(best_vector, sum_yp, mask);
		best_sum = select(best_sum, prod_yp, mask);

		mask = prod_zp > best_sum;
		best_vector = select(best_vector, sum_zp, mask);

		pm[partition].dir = best_vector;
	}
}
/* See header for documentation. */
void compute_avgs_and_dirs_3_comp_rgb(
const partition_info& pi,
const image_block& blk,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
size_t partition_count = pi.partition_count;
promise(partition_count > 0);
// Pre-compute partition_averages
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
compute_partition_averages_rgb(pi, blk, partition_averages);
for (size_t partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
size_t texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
vfloat4 average = partition_averages[partition];
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
vfloat4 sum_zp = vfloat4::zero();
for (size_t i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = blk.texel3(iwt);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
sum_zp += select(zero, texel_datum, tdm2);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 prod_zp = dot(sum_zp, sum_zp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
best_sum = select(best_sum, prod_yp, mask);
mask = prod_zp > best_sum;
best_vector = select(best_vector, sum_zp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
void compute_avgs_and_dirs_2_comp(
const partition_info& pt,
const image_block& blk,
unsigned int component1,
unsigned int component2,
partition_metrics pm[BLOCK_MAX_PARTITIONS]
) {
vfloat4 average;
const float* data_vr = nullptr;
const float* data_vg = nullptr;
if (component1 == 0 && component2 == 1)
{
average = blk.data_mean.swz<0, 1>();
data_vr = blk.data_r;
data_vg = blk.data_g;
}
else if (component1 == 0 && component2 == 2)
{
average = blk.data_mean.swz<0, 2>();
data_vr = blk.data_r;
data_vg = blk.data_b;
}
else // (component1 == 1 && component2 == 2)
{
assert(component1 == 1 && component2 == 2);
average = blk.data_mean.swz<1, 2>();
data_vr = blk.data_g;
data_vg = blk.data_b;
}
size_t partition_count = pt.partition_count;
promise(partition_count > 0);
for (size_t partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pt.texels_of_partition[partition];
size_t texel_count = pt.partition_texel_count[partition];
promise(texel_count > 0);
// Only compute a partition mean if more than one partition
if (partition_count > 1)
{
average = vfloat4::zero();
for (size_t i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
average += vfloat2(data_vr[iwt], data_vg[iwt]);
}
average = average / static_cast<float>(texel_count);
}
pm[partition].avg = average;
vfloat4 sum_xp = vfloat4::zero();
vfloat4 sum_yp = vfloat4::zero();
for (size_t i = 0; i < texel_count; i++)
{
unsigned int iwt = texel_indexes[i];
vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
texel_datum = texel_datum - average;
vfloat4 zero = vfloat4::zero();
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
sum_xp += select(zero, texel_datum, tdm0);
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
sum_yp += select(zero, texel_datum, tdm1);
}
vfloat4 prod_xp = dot(sum_xp, sum_xp);
vfloat4 prod_yp = dot(sum_yp, sum_yp);
vfloat4 best_vector = sum_xp;
vfloat4 best_sum = prod_xp;
vmask4 mask = prod_yp > best_sum;
best_vector = select(best_vector, sum_yp, mask);
pm[partition].dir = best_vector;
}
}
/* See header for documentation. */
// Summary of what the body below does: for every partition, each texel is
// projected onto two candidate lines (the "uncor" line and the "samec" line),
// the channel-weighted squared distance from the texel to each line is
// accumulated into a block-wide total per line type, and the spread of the
// projection parameter along the uncor line is written out as that
// partition's line length.
void compute_error_squared_rgba(
const partition_info& pi,
const image_block& blk,
const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
float line_lengths[BLOCK_MAX_PARTITIONS],
float& uncor_error,
float& samec_error
) {
size_t partition_count = pi.partition_count;
promise(partition_count > 0);
// Error accumulators are shared by all partitions; they are reduced to
// scalars only once, after the partition loop
vfloatacc uncor_errorsumv = vfloatacc::zero();
vfloatacc samec_errorsumv = vfloatacc::zero();
for (size_t partition = 0; partition < partition_count; partition++)
{
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
processed_line4 l_uncor = uncor_plines[partition];
processed_line4 l_samec = samec_plines[partition];
size_t texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
// Vectorize some useful scalar inputs
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
vfloat l_samec_bs0(l_samec.bs.lane<0>());
vfloat l_samec_bs1(l_samec.bs.lane<1>());
vfloat l_samec_bs2(l_samec.bs.lane<2>());
vfloat l_samec_bs3(l_samec.bs.lane<3>());
// The samec distance computation below has no amod term, so it is only
// valid when the samec line's amod is zero
assert(all(l_samec.amod == vfloat4(0.0f)));
// Running min/max of the uncor projection parameter, seeded with
// sentinels that any real value will replace
vfloat uncor_loparamv(1e10f);
vfloat uncor_hiparamv(-1e10f);
vfloat ew_r(blk.channel_weight.lane<0>());
vfloat ew_g(blk.channel_weight.lane<1>());
vfloat ew_b(blk.channel_weight.lane<2>());
vfloat ew_a(blk.channel_weight.lane<3>());
// This implementation over-shoots, but this is safe as we initialize the texel_indexes
// array to extend the last value. This means min/max are not impacted, but we need to mask
// out the dummy values when we compute the line weighting.
vint lane_ids = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
// Lanes at or beyond texel_count hold dummy texels; exclude them from
// the error sums
vmask mask = lane_ids < vint_from_size(texel_count);
const uint8_t* texel_idxs = texel_indexes + i;
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);
// Projection of the texel onto the uncor line: dot(texel, bs)
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2)
+ (data_a * l_uncor_bs3);
uncor_loparamv = min(uncor_param, uncor_loparamv);
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
// Per-channel offset between the texel and the point on the line
// selected by the parameter: amod + param * bs - texel
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+ (uncor_param * l_uncor_bs0);
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+ (uncor_param * l_uncor_bs1);
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+ (uncor_param * l_uncor_bs2);
vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
+ (uncor_param * l_uncor_bs3);
// Channel-weighted squared distance
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+ (ew_g * uncor_dist1 * uncor_dist1)
+ (ew_b * uncor_dist2 * uncor_dist2)
+ (ew_a * uncor_dist3 * uncor_dist3);
haccumulate(uncor_errorsumv, uncor_err, mask);
// Process samechroma data
vfloat samec_param = (data_r * l_samec_bs0)
+ (data_g * l_samec_bs1)
+ (data_b * l_samec_bs2)
+ (data_a * l_samec_bs3);
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+ (ew_g * samec_dist1 * samec_dist1)
+ (ew_b * samec_dist2 * samec_dist2)
+ (ew_a * samec_dist3 * samec_dist3);
haccumulate(samec_errorsumv, samec_err, mask);
lane_ids += vint(ASTCENC_SIMD_WIDTH);
}
// Turn very small numbers and NaNs into a small number
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
}
// Reduce the vector accumulators to the two scalar outputs
uncor_error = hadd_s(uncor_errorsumv);
samec_error = hadd_s(samec_errorsumv);
}
/* See header for documentation. */
// Summary of what the body below does: for every partition, each texel's RGB
// value is projected onto two candidate lines (the "uncor" line and the
// "samec" line), the channel-weighted squared distance from the texel to each
// line is accumulated into a block-wide total per line type, and the spread
// of the projection parameter along the uncor line is stored in that
// partition's line_length field.
void compute_error_squared_rgb(
const partition_info& pi,
const image_block& blk,
partition_lines3 plines[BLOCK_MAX_PARTITIONS],
float& uncor_error,
float& samec_error
) {
size_t partition_count = pi.partition_count;
promise(partition_count > 0);
// Error accumulators are shared by all partitions; they are reduced to
// scalars only once, after the partition loop
vfloatacc uncor_errorsumv = vfloatacc::zero();
vfloatacc samec_errorsumv = vfloatacc::zero();
for (size_t partition = 0; partition < partition_count; partition++)
{
// Held by reference: line_length is written back at the end of the iteration
partition_lines3& pl = plines[partition];
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
size_t texel_count = pi.partition_texel_count[partition];
promise(texel_count > 0);
processed_line3 l_uncor = pl.uncor_pline;
processed_line3 l_samec = pl.samec_pline;
// Vectorize some useful scalar inputs
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
vfloat l_samec_bs0(l_samec.bs.lane<0>());
vfloat l_samec_bs1(l_samec.bs.lane<1>());
vfloat l_samec_bs2(l_samec.bs.lane<2>());
// The samec distance computation below has no amod term, so it is only
// valid when the samec line's amod is zero
assert(all(l_samec.amod == vfloat4(0.0f)));
// Running min/max of the uncor projection parameter, seeded with
// sentinels that any real value will replace
vfloat uncor_loparamv(1e10f);
vfloat uncor_hiparamv(-1e10f);
vfloat ew_r(blk.channel_weight.lane<0>());
vfloat ew_g(blk.channel_weight.lane<1>());
vfloat ew_b(blk.channel_weight.lane<2>());
// This implementation over-shoots, but this is safe as we initialize the texel_indexes
// array to extend the last value. This means min/max are not impacted, but we need to mask
// out the dummy values when we compute the line weighting.
vint lane_ids = vint::lane_id();
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
// Lanes at or beyond texel_count hold dummy texels; exclude them from
// the error sums
vmask mask = lane_ids < vint_from_size(texel_count);
const uint8_t* texel_idxs = texel_indexes + i;
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
// Projection of the texel onto the uncor line: dot(texel, bs)
vfloat uncor_param = (data_r * l_uncor_bs0)
+ (data_g * l_uncor_bs1)
+ (data_b * l_uncor_bs2);
uncor_loparamv = min(uncor_param, uncor_loparamv);
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
// Per-channel offset between the texel and the point on the line
// selected by the parameter: amod + param * bs - texel
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
+ (uncor_param * l_uncor_bs0);
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
+ (uncor_param * l_uncor_bs1);
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
+ (uncor_param * l_uncor_bs2);
// Channel-weighted squared distance
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
+ (ew_g * uncor_dist1 * uncor_dist1)
+ (ew_b * uncor_dist2 * uncor_dist2);
haccumulate(uncor_errorsumv, uncor_err, mask);
// Process samechroma data
vfloat samec_param = (data_r * l_samec_bs0)
+ (data_g * l_samec_bs1)
+ (data_b * l_samec_bs2);
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
+ (ew_g * samec_dist1 * samec_dist1)
+ (ew_b * samec_dist2 * samec_dist2);
haccumulate(samec_errorsumv, samec_err, mask);
lane_ids += vint(ASTCENC_SIMD_WIDTH);
}
// Turn very small numbers and NaNs into a small number
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
pl.line_length = astc::max(uncor_linelen, 1e-7f);
}
// Reduce the vector accumulators to the two scalar outputs
uncor_error = hadd_s(uncor_errorsumv);
samec_error = hadd_s(samec_errorsumv);
}
#endif
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,941 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#include <utility>
/**
* @brief Functions for color unquantization.
*/
#include "astcenc_internal.h"
/**
 * @brief Reverse blue contraction on a color.
 *
 * Lanes 0 and 1 are each replaced by the arithmetic-shifted average of that
 * lane and lane 2; lanes 2 and 3 pass through untouched.
 *
 * @param input   The input color that has been blue-contracted.
 *
 * @return The uncontracted color.
 */
static ASTCENC_SIMD_INLINE vint4 uncontract_color(
	vint4 input
) {
	// (channel + blue) >> 1, computed for every lane and filtered below
	vint4 averaged = asr<1>(input + input.lane<2>());

	// Only the first two lanes take the averaged value
	vmask4 first_two_lanes(true, true, false, false);
	return select(input, averaged, first_two_lanes);
}
/*
 * Unpack an LDR RGBA endpoint pair stored as a base color (input0) plus
 * deltas (input1). The result is clamped to the 0..255 range.
 */
void rgba_delta_unpack(
	vint4 input0,
	vint4 input1,
	vint4& output0,
	vint4& output1
) {
	// Apply bit transfer
	bit_transfer_signed(input1, input0);

	// The sign of the RGB delta sum, taken before the base is added, decides
	// whether blue-uncontraction is needed
	int delta_rgb_sum = hadd_rgb_s(input1);

	vint4 first = input0;
	vint4 second = input1 + input0;

	if (delta_rgb_sum < 0)
	{
		// Uncontract both endpoints and exchange them
		vint4 uncontracted_first = uncontract_color(first);
		first = uncontract_color(second);
		second = uncontracted_first;
	}

	output0 = clamp(0, 255, first);
	output1 = clamp(0, 255, second);
}
/**
 * @brief Unpack a delta-encoded LDR RGB endpoint pair.
 *
 * The color channels are decoded by rgba_delta_unpack(); the alpha lane of
 * both results is then forced to 255.
 *
 * @param      input0    The packed endpoint 0 color.
 * @param      input1    The packed endpoint 1 color deltas.
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void rgb_delta_unpack(
	vint4 input0,
	vint4 input1,
	vint4& output0,
	vint4& output1
) {
	// Decode as RGBA first ...
	rgba_delta_unpack(input0, input1, output0, output1);

	// ... then overwrite whatever alpha that produced with opaque
	output0.set_lane<3>(255);
	output1.set_lane<3>(255);
}
/*
 * Unpack an LDR RGBA endpoint pair that uses direct encoding. When the RGB sum
 * of endpoint 0 exceeds that of endpoint 1 the pair is blue-uncontracted and
 * the two endpoints are exchanged; otherwise both pass through unchanged.
 */
void rgba_unpack(
	vint4 input0,
	vint4 input1,
	vint4& output0,
	vint4& output1
) {
	// Apply blue-uncontraction if needed
	if (hadd_rgb_s(input0) > hadd_rgb_s(input1))
	{
		vint4 uncontracted0 = uncontract_color(input0);
		vint4 uncontracted1 = uncontract_color(input1);

		// Endpoints trade places
		output0 = uncontracted1;
		output1 = uncontracted0;
		return;
	}

	output0 = input0;
	output1 = input1;
}
/**
 * @brief Unpack a directly-encoded LDR RGB endpoint pair.
 *
 * The color channels are decoded by rgba_unpack(); the alpha lane of both
 * results is then forced to 255.
 *
 * @param      input0    The packed endpoint 0 color.
 * @param      input1    The packed endpoint 1 color.
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void rgb_unpack(
	vint4 input0,
	vint4 input1,
	vint4& output0,
	vint4& output1
) {
	// Decode as RGBA first ...
	rgba_unpack(input0, input1, output0, output1);

	// ... then overwrite whatever alpha that produced with opaque
	output0.set_lane<3>(255);
	output1.set_lane<3>(255);
}
/**
 * @brief Unpack a scale-encoded LDR RGBA endpoint pair.
 *
 * Only the RGB channels use the scaled encoding; each endpoint carries its
 * own directly-encoded alpha.
 *
 * @param      input0    The packed endpoint 0 color.
 * @param      alpha1    The packed endpoint 1 alpha value.
 * @param      scale     The packed quantized scale.
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void rgb_scale_alpha_unpack(
	vint4 input0,
	uint8_t alpha1,
	uint8_t scale,
	vint4& output0,
	vint4& output1
) {
	// Endpoint 1 is the stored color, with its own alpha
	vint4 unscaled = input0;
	unscaled.set_lane<3>(alpha1);

	// Endpoint 0 is the stored color multiplied by scale / 256; its alpha
	// lane is restored to the stored, unscaled value
	vint4 scaled = asr<8>(input0 * scale);
	scaled.set_lane<3>(input0.lane<3>());

	output1 = unscaled;
	output0 = scaled;
}
/**
 * @brief Unpack a scale-encoded LDR RGB endpoint pair.
 *
 * Alpha is 255 in both outputs.
 *
 * @param      input0    The packed endpoint 0 color.
 * @param      scale     The packed scale.
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void rgb_scale_unpack(
	vint4 input0,
	int scale,
	vint4& output0,
	vint4& output1
) {
	// Endpoint 1 is the stored color, made opaque
	vint4 unscaled = input0;
	unscaled.set_lane<3>(255);

	// Endpoint 0 is the stored color multiplied by scale / 256, made opaque
	vint4 scaled = asr<8>(input0 * scale);
	scaled.set_lane<3>(255);

	output1 = unscaled;
	output0 = scaled;
}
/**
 * @brief Unpack a directly-encoded LDR luminance endpoint pair.
 *
 * Each luminance value is replicated across R, G and B; alpha is 255.
 *
 * @param      input     The packed endpoints.
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void luminance_unpack(
	const uint8_t input[2],
	vint4& output0,
	vint4& output1
) {
	const int first = input[0];
	const int second = input[1];

	output0 = vint4(first, first, first, 255);
	output1 = vint4(second, second, second, 255);
}
/**
 * @brief Unpack a delta-encoded LDR luminance endpoint pair.
 *
 * Each luminance value is replicated across R, G and B; alpha is 255.
 *
 * @param      input     The packed endpoints (L0, L1).
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void luminance_delta_unpack(
	const uint8_t input[2],
	vint4& output0,
	vint4& output1
) {
	const int packed0 = input[0];
	const int packed1 = input[1];

	// Base: low six bits come from the top of byte 0, high two bits from the
	// top of byte 1
	const int base = (packed0 >> 2) | (packed1 & 0xC0);

	// Second endpoint: base plus the unsigned six-bit delta in the low bits
	// of byte 1, saturated at 255
	const int top = astc::min(base + (packed1 & 0x3F), 255);

	output0 = vint4(base, base, base, 255);
	output1 = vint4(top, top, top, 255);
}
/**
 * @brief Unpack an LDR LA color that uses direct encoding.
 *
 * Each endpoint's luminance is broadcast to R, G and B; alpha is taken directly.
 *
 * @param      input     The packed endpoints (L0, L1, A0, A1).
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void luminance_alpha_unpack(
    const uint8_t input[4],
    vint4& output0,
    vint4& output1
) {
    // Endpoint 0 uses (L0, A0)
    const int l0 = input[0];
    const int a0 = input[2];
    output0 = vint4(l0, l0, l0, a0);

    // Endpoint 1 uses (L1, A1)
    const int l1 = input[1];
    const int a1 = input[3];
    output1 = vint4(l1, l1, l1, a1);
}
/**
 * @brief Unpack an LDR LA color that uses delta encoding.
 *
 * For each of luminance and alpha the first byte, extended by the top bit of the second byte,
 * forms a 9-bit base, and the low seven bits of the second byte form a signed delta. Both are
 * halved, the delta is added to the base, and the sum is clamped to [0, 255].
 *
 * @param      input     The packed endpoints (L0, L1, A0, A1).
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void luminance_alpha_delta_unpack(
    const uint8_t input[4],
    vint4& output0,
    vint4& output1
) {
    // 9-bit bases: the second byte's MSB becomes bit 8 of the base
    int lum_base = input[0] | ((input[1] & 0x80) << 1);
    int alpha_base = input[2] | ((input[3] & 0x80) << 1);

    // 7-bit deltas, sign-extended from bit 6
    int lum_delta = ((input[1] & 0x7F) ^ 0x40) - 0x40;
    int alpha_delta = ((input[3] & 0x7F) ^ 0x40) - 0x40;

    // Drop the extra precision bit from all four values
    lum_base >>= 1;
    lum_delta >>= 1;
    alpha_base >>= 1;
    alpha_delta >>= 1;

    // Second endpoint is base + delta, clamped to the 8-bit range
    int lum_end = astc::clamp(lum_base + lum_delta, 0, 255);
    int alpha_end = astc::clamp(alpha_base + alpha_delta, 0, 255);

    output0 = vint4(lum_base, lum_base, lum_base, alpha_base);
    output1 = vint4(lum_end, lum_end, lum_end, alpha_end);
}
/**
 * @brief Unpack an HDR RGB + offset encoding.
 *
 * The four input bytes hold a mode selector, a major-component selector, and the red, green,
 * blue and scale fields. The width of each field depends on the mode, so the spare bits of
 * bytes 1..3 are routed to different fields per mode. Endpoint 1 is the decoded color and
 * endpoint 0 is that color minus the scale on every channel.
 *
 * Both outputs store each color value shifted left by 4, with the alpha lane set to the
 * constant 0x7800.
 *
 * @param input The packed endpoints (packed and modal).
 * @param[out] output0 The unpacked endpoint 0 color.
 * @param[out] output1 The unpacked endpoint 1 color.
 */
static void hdr_rgbo_unpack(
const uint8_t input[4],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int v2 = input[2];
int v3 = input[3];
// 4-bit mode value: bits 0-1 from the top two bits of v0, bit 2 from the top bit of v1,
// bit 3 from the top bit of v2.
int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
int majcomp;
int mode;
// Split the mode value into a major component (0..2) and a mode (0..5).
if ((modeval & 0xC) != 0xC)
{
majcomp = modeval >> 2;
mode = modeval & 3;
}
else if (modeval != 0xF)
{
majcomp = modeval & 3;
mode = 4;
}
else
{
majcomp = 0;
mode = 5;
}
// Fixed-placement low bits of each field.
int red = v0 & 0x3F;
int green = v1 & 0x1F;
int blue = v2 & 0x1F;
int scale = v3 & 0x1F;
// Variable-placement bits; which field each one extends depends on the mode.
int bit0 = (v1 >> 6) & 1;
int bit1 = (v1 >> 5) & 1;
int bit2 = (v2 >> 6) & 1;
int bit3 = (v2 >> 5) & 1;
int bit4 = (v3 >> 7) & 1;
int bit5 = (v3 >> 6) & 1;
int bit6 = (v3 >> 5) & 1;
// One-hot encoding of the mode: in each mask below, bit N is set when the assignment
// applies in mode N.
int ohcomp = 1 << mode;
if (ohcomp & 0x30)
green |= bit0 << 6;
if (ohcomp & 0x3A)
green |= bit1 << 5;
if (ohcomp & 0x30)
blue |= bit2 << 6;
if (ohcomp & 0x3A)
blue |= bit3 << 5;
if (ohcomp & 0x3D)
scale |= bit6 << 5;
if (ohcomp & 0x2D)
scale |= bit5 << 6;
if (ohcomp & 0x04)
scale |= bit4 << 7;
if (ohcomp & 0x3B)
red |= bit4 << 6;
if (ohcomp & 0x04)
red |= bit3 << 6;
if (ohcomp & 0x10)
red |= bit5 << 7;
if (ohcomp & 0x0F)
red |= bit2 << 7;
if (ohcomp & 0x05)
red |= bit1 << 8;
if (ohcomp & 0x0A)
red |= bit0 << 8;
if (ohcomp & 0x05)
red |= bit0 << 9;
if (ohcomp & 0x02)
red |= bit6 << 9;
if (ohcomp & 0x01)
red |= bit3 << 10;
if (ohcomp & 0x02)
red |= bit5 << 10;
// Expand to 12 bits using a per-mode left shift.
static const int shamts[6] { 1, 1, 2, 3, 4, 5 };
int shamt = shamts[mode];
red <<= shamt;
green <<= shamt;
blue <<= shamt;
scale <<= shamt;
// In modes 0 to 4 the values stored for "green" and "blue" are differentials
// relative to red, not absolute values.
if (mode != 5)
{
green = red - green;
blue = red - blue;
}
// Move the major component into place by swapping red with green (1) or blue (2).
int temp;
switch (majcomp)
{
case 1:
temp = red;
red = green;
green = temp;
break;
case 2:
temp = red;
red = blue;
blue = temp;
break;
default:
break;
}
// Endpoint 0 is endpoint 1 with the scale subtracted from every channel.
int red0 = red - scale;
int green0 = green - scale;
int blue0 = blue - scale;
// Clamp negative values to zero; only the lower bound is enforced here.
if (red < 0)
red = 0;
if (green < 0)
green = 0;
if (blue < 0)
blue = 0;
if (red0 < 0)
red0 = 0;
if (green0 < 0)
green0 = 0;
if (blue0 < 0)
blue0 = 0;
// Store each color value shifted left by 4; alpha lane is the constant 0x7800.
output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
output1 = vint4(red << 4, green << 4, blue << 4, 0x7800);
}
/**
 * @brief Unpack an HDR RGB direct encoding.
 *
 * The six input bytes hold a 3-bit mode, a 2-bit major-component selector, and the fields
 * a, b0, b1, c, d0 and d1 whose widths depend on the mode. The endpoint colors are rebuilt
 * from these fields, clamped to 12 bits, and reordered according to the major component.
 * A major component of 3 selects a direct encoding that bypasses the modal decode.
 *
 * Both outputs have the alpha lane set to the constant 0x7800.
 *
 * @param      input     The packed endpoints (packed and modal).
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void hdr_rgb_unpack(
    const uint8_t input[6],
    vint4& output0,
    vint4& output1
) {
    int v0 = input[0];
    int v1 = input[1];
    int v2 = input[2];
    int v3 = input[3];
    int v4 = input[4];
    int v5 = input[5];

    // Extract all the fixed-placement bitfields
    int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
    int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1);

    // Major component 3 is the direct encoding; no modal decode is needed
    if (majcomp == 3)
    {
        output0 = vint4(v0 << 8, v2 << 8, (v4 & 0x7F) << 9, 0x7800);
        output1 = vint4(v1 << 8, v3 << 8, (v5 & 0x7F) << 9, 0x7800);
        return;
    }

    int a = v0 | ((v1 & 0x40) << 2);
    int b0 = v2 & 0x3f;
    int b1 = v3 & 0x3f;
    int c = v1 & 0x3f;
    int d0 = v4 & 0x7f;
    int d1 = v5 & 0x7f;

    // Get hold of the number of bits in 'd0' and 'd1'
    static const int dbits_tab[8] { 7, 6, 7, 6, 5, 6, 5, 6 };
    int dbits = dbits_tab[modeval];

    // Extract six variable-placement bits
    int bit0 = (v2 >> 6) & 1;
    int bit1 = (v3 >> 6) & 1;
    int bit2 = (v4 >> 6) & 1;
    int bit3 = (v5 >> 6) & 1;
    int bit4 = (v4 >> 5) & 1;
    int bit5 = (v5 >> 5) & 1;

    // Prepend the variable-placement bits depending on mode. In each mask below,
    // bit N is set when the assignment applies in mode N.
    int ohmod = 1 << modeval; // one-hot-mode

    if (ohmod & 0xA4)
    {
        a |= bit0 << 9;
    }
    if (ohmod & 0x8)
    {
        a |= bit2 << 9;
    }
    if (ohmod & 0x50)
    {
        a |= bit4 << 9;
    }
    if (ohmod & 0x50)
    {
        a |= bit5 << 10;
    }
    if (ohmod & 0xA0)
    {
        a |= bit1 << 10;
    }
    if (ohmod & 0xC0)
    {
        a |= bit2 << 11;
    }

    if (ohmod & 0x4)
    {
        c |= bit1 << 6;
    }
    if (ohmod & 0xE8)
    {
        c |= bit3 << 6;
    }
    if (ohmod & 0x20)
    {
        c |= bit2 << 7;
    }

    if (ohmod & 0x5B)
    {
        b0 |= bit0 << 6;
        b1 |= bit1 << 6;
    }
    if (ohmod & 0x12)
    {
        b0 |= bit2 << 7;
        b1 |= bit3 << 7;
    }

    if (ohmod & 0xAF)
    {
        d0 |= bit4 << 5;
        d1 |= bit5 << 5;
    }
    if (ohmod & 0x5)
    {
        d0 |= bit2 << 6;
        d1 |= bit3 << 6;
    }

    // Sign-extend the low 'dbits' bits of 'd0' and 'd1'. This keeps only those bits, flips
    // the sign bit, and subtracts it back out. Unlike a shift-left / shift-right pair on a
    // signed value, this does not depend on how signed shifts behave.
    const int d_sign = 1 << (dbits - 1);
    const int d_mask = (d_sign << 1) - 1;
    d0 = ((d0 & d_mask) ^ d_sign) - d_sign;
    d1 = ((d1 & d_mask) ^ d_sign) - d_sign;

    // Expand all values to 12 bits. 'd0' and 'd1' may be negative, so scale by
    // multiplication rather than by left-shifting a negative value.
    int val_shamt = (modeval >> 1) ^ 3;
    const int val_scale = 1 << val_shamt;
    a *= val_scale;
    b0 *= val_scale;
    b1 *= val_scale;
    c *= val_scale;
    d0 *= val_scale;
    d1 *= val_scale;

    // Then compute the actual color values
    int red1 = a;
    int green1 = a - b0;
    int blue1 = a - b1;
    int red0 = a - c;
    int green0 = a - b0 - c - d0;
    int blue0 = a - b1 - c - d1;

    // Clamp the color components to [0, 2^12 - 1]
    red0 = astc::clamp(red0, 0, 4095);
    green0 = astc::clamp(green0, 0, 4095);
    blue0 = astc::clamp(blue0, 0, 4095);

    red1 = astc::clamp(red1, 0, 4095);
    green1 = astc::clamp(green1, 0, 4095);
    blue1 = astc::clamp(blue1, 0, 4095);

    // Switch around the color components
    int temp0, temp1;
    switch (majcomp)
    {
    case 1: // switch around red and green
        temp0 = red0;
        temp1 = red1;
        red0 = green0;
        red1 = green1;
        green0 = temp0;
        green1 = temp1;
        break;
    case 2: // switch around red and blue
        temp0 = red0;
        temp1 = red1;
        red0 = blue0;
        red1 = blue1;
        blue0 = temp0;
        blue1 = temp1;
        break;
    case 0: // no switch
        break;
    }

    // Store each color value shifted left by 4; alpha lane is the constant 0x7800
    output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
    output1 = vint4(red1 << 4, green1 << 4, blue1 << 4, 0x7800);
}
/**
 * @brief Unpack an HDR RGB + LDR A direct encoding.
 *
 * The first six bytes are decoded as HDR RGB; bytes 6 and 7 are then written unmodified
 * into the alpha lanes of endpoint 0 and endpoint 1 respectively.
 *
 * @param      input     The packed endpoints (packed and modal).
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void hdr_rgb_ldr_alpha_unpack(
    const uint8_t input[8],
    vint4& output0,
    vint4& output1
) {
    // Color channels come from the shared HDR RGB decoder
    hdr_rgb_unpack(input, output0, output1);

    // Alpha is stored directly, one byte per endpoint
    output0.set_lane<3>(static_cast<int>(input[6]));
    output1.set_lane<3>(static_cast<int>(input[7]));
}
/**
* @brief Unpack an HDR L (small range) direct encoding.
*
* @param input The packed endpoints (packed and modal).
* @param[out] output0 The unpacked endpoint 0 color.
* @param[out] output1 The unpacked endpoint 1 color.
*/
static void hdr_luminance_small_range_unpack(
const uint8_t input[2],
vint4& output0,
vint4& output1
) {
int v0 = input[0];
int v1 = input[1];
int y0, y1;
if (v0 & 0x80)
{
y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
y1 = (v1 & 0x1F) << 2;
}
else
{
y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
y1 = (v1 & 0xF) << 1;
}
y1 += y0;
if (y1 > 0xFFF)
{
y1 = 0xFFF;
}
output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
}
/**
 * @brief Unpack an HDR L (large range) direct encoding.
 *
 * When the second byte is not smaller than the first, both are used in order, shifted left
 * by 4. Otherwise they are swapped and offset by half a step (+8 / -8).
 *
 * Both outputs store the luminance shifted left by a further 4 in R, G and B, with the alpha
 * lane set to the constant 0x7800.
 *
 * @param      input     The packed endpoints (packed and modal).
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void hdr_luminance_large_range_unpack(
    const uint8_t input[2],
    vint4& output0,
    vint4& output1
) {
    const int byte0 = input[0];
    const int byte1 = input[1];

    // Byte ordering selects between the plain and the swapped-with-offset form
    const bool in_order = byte1 >= byte0;
    const int lum0 = in_order ? (byte0 << 4) : ((byte1 << 4) + 8);
    const int lum1 = in_order ? (byte1 << 4) : ((byte0 << 4) - 8);

    output0 = vint4(lum0 << 4, lum0 << 4, lum0 << 4, 0x7800);
    output1 = vint4(lum1 << 4, lum1 << 4, lum1 << 4, 0x7800);
}
/**
 * @brief Unpack an HDR A direct encoding.
 *
 * The top bit of each input byte forms a 2-bit selector. Selector 3 stores both endpoints
 * directly as 7-bit values. Selectors 0..2 store a base value in the first byte, extended by
 * high bits of the second byte, plus a signed delta in the remaining low bits of the second
 * byte. The second endpoint is base + delta, clamped to [0, 0xFFF].
 *
 * Both results are returned shifted left by a final 4 bits.
 *
 * @param      input     The packed endpoints (packed and modal).
 * @param[out] output0   The unpacked endpoint 0 alpha.
 * @param[out] output1   The unpacked endpoint 1 alpha.
 */
static void hdr_alpha_unpack(
    const uint8_t input[2],
    int& output0,
    int& output1
) {
    int v6 = input[0];
    int v7 = input[1];

    // Selector bit 0 is the MSB of byte 0, selector bit 1 is the MSB of byte 1
    int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
    v6 &= 0x7F;
    v7 &= 0x7F;

    if (selector == 3)
    {
        // Direct mode: two independent 7-bit values
        output0 = v6 << 5;
        output1 = v7 << 5;
    }
    else
    {
        // Move the high bits of the second byte on top of the base value
        v6 |= (v7 << (selector + 1)) & 0x780;

        // Keep the delta bits and sign-extend them (xor / subtract of the sign bit)
        v7 &= (0x3f >> selector);
        v7 ^= 32 >> selector;
        v7 -= 32 >> selector;

        // Scale both values up. The delta may be negative here, so multiply rather than
        // left-shifting a negative value.
        const int scale = 1 << (4 - selector);
        v6 *= scale;
        v7 *= scale;

        // Second endpoint is base + delta, clamped to 12 bits
        v7 += v6;
        if (v7 < 0)
        {
            v7 = 0;
        }
        else if (v7 > 0xFFF)
        {
            v7 = 0xFFF;
        }

        output0 = v6;
        output1 = v7;
    }

    output0 <<= 4;
    output1 <<= 4;
}
/**
 * @brief Unpack an HDR RGBA direct encoding.
 *
 * The first six bytes are decoded as HDR RGB and the last two as HDR alpha; the decoded
 * alpha values then replace lane 3 of each endpoint.
 *
 * @param      input     The packed endpoints (packed and modal).
 * @param[out] output0   The unpacked endpoint 0 color.
 * @param[out] output1   The unpacked endpoint 1 color.
 */
static void hdr_rgb_hdr_alpha_unpack(
    const uint8_t input[8],
    vint4& output0,
    vint4& output1
) {
    // Color channels
    hdr_rgb_unpack(input, output0, output1);

    // Alpha channel, stored in the trailing two bytes
    int ep0_alpha;
    int ep1_alpha;
    hdr_alpha_unpack(&input[6], ep0_alpha, ep1_alpha);

    output0.set_lane<3>(ep0_alpha);
    output1.set_lane<3>(ep1_alpha);
}
/* See header for documentation. */
/*
 * Implementation notes:
 *  - The format selects one of the per-format unpack helpers, which produce 8-bit LDR or
 *    already-expanded HDR endpoint values and set the rgb_hdr / alpha_hdr flags.
 *  - HDR formats without a stored alpha get a default alpha that depends on the profile.
 *  - The final stage expands LDR endpoints to 16 bits, and substitutes the error color when
 *    an HDR endpoint is found while decoding with an LDR profile.
 */
void unpack_color_endpoints(
    astcenc_profile decode_mode,
    int format,
    const uint8_t* input,
    bool& rgb_hdr,
    bool& alpha_hdr,
    vint4& output0,
    vint4& output1
) {
    // Assume no NaNs and LDR endpoints unless set later
    rgb_hdr = false;
    alpha_hdr = false;

    bool alpha_hdr_default = false;

    switch (format)
    {
    case FMT_LUMINANCE:
        luminance_unpack(input, output0, output1);
        break;

    case FMT_LUMINANCE_DELTA:
        luminance_delta_unpack(input, output0, output1);
        break;

    case FMT_HDR_LUMINANCE_SMALL_RANGE:
        rgb_hdr = true;
        alpha_hdr_default = true;
        hdr_luminance_small_range_unpack(input, output0, output1);
        break;

    case FMT_HDR_LUMINANCE_LARGE_RANGE:
        rgb_hdr = true;
        alpha_hdr_default = true;
        hdr_luminance_large_range_unpack(input, output0, output1);
        break;

    case FMT_LUMINANCE_ALPHA:
        luminance_alpha_unpack(input, output0, output1);
        break;

    case FMT_LUMINANCE_ALPHA_DELTA:
        luminance_alpha_delta_unpack(input, output0, output1);
        break;

    case FMT_RGB_SCALE:
        {
            vint4 input0q(input[0], input[1], input[2], 0);
            uint8_t scale = input[3];
            rgb_scale_unpack(input0q, scale, output0, output1);
        }
        break;

    case FMT_RGB_SCALE_ALPHA:
        {
            vint4 input0q(input[0], input[1], input[2], input[4]);
            uint8_t alpha1q = input[5];
            uint8_t scaleq = input[3];
            rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1);
        }
        break;

    case FMT_HDR_RGB_SCALE:
        rgb_hdr = true;
        alpha_hdr_default = true;
        hdr_rgbo_unpack(input, output0, output1);
        break;

    case FMT_RGB:
        {
            // Endpoint values are interleaved: even bytes are endpoint 0, odd are endpoint 1
            vint4 input0q(input[0], input[2], input[4], 0);
            vint4 input1q(input[1], input[3], input[5], 0);
            rgb_unpack(input0q, input1q, output0, output1);
        }
        break;

    case FMT_RGB_DELTA:
        {
            vint4 input0q(input[0], input[2], input[4], 0);
            vint4 input1q(input[1], input[3], input[5], 0);
            rgb_delta_unpack(input0q, input1q, output0, output1);
        }
        break;

    case FMT_HDR_RGB:
        rgb_hdr = true;
        alpha_hdr_default = true;
        hdr_rgb_unpack(input, output0, output1);
        break;

    case FMT_RGBA:
        {
            vint4 input0q(input[0], input[2], input[4], input[6]);
            vint4 input1q(input[1], input[3], input[5], input[7]);
            rgba_unpack(input0q, input1q, output0, output1);
        }
        break;

    case FMT_RGBA_DELTA:
        {
            vint4 input0q(input[0], input[2], input[4], input[6]);
            vint4 input1q(input[1], input[3], input[5], input[7]);
            rgba_delta_unpack(input0q, input1q, output0, output1);
        }
        break;

    case FMT_HDR_RGB_LDR_ALPHA:
        rgb_hdr = true;
        hdr_rgb_ldr_alpha_unpack(input, output0, output1);
        break;

    case FMT_HDR_RGBA:
        rgb_hdr = true;
        alpha_hdr = true;
        hdr_rgb_hdr_alpha_unpack(input, output0, output1);
        break;
    }

    // Assign a correct default alpha
    if (alpha_hdr_default)
    {
        if (decode_mode == ASTCENC_PRF_HDR)
        {
            output0.set_lane<3>(0x7800);
            output1.set_lane<3>(0x7800);
            alpha_hdr = true;
        }
        else
        {
            output0.set_lane<3>(0x00FF);
            output1.set_lane<3>(0x00FF);
            alpha_hdr = false;
        }
    }

    // Handle endpoint errors and expansion

    // Linear LDR 8-bit endpoints are expanded to 16-bit by replication
    if (decode_mode == ASTCENC_PRF_LDR)
    {
        // Error color - HDR endpoint in an LDR encoding
        if (rgb_hdr || alpha_hdr)
        {
            output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
            output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
            rgb_hdr = false;
            alpha_hdr = false;
        }

        output0 = output0 * 257;
        output1 = output1 * 257;
    }
    // sRGB LDR 8-bit endpoints are expanded to 16 bit by:
    //  - RGB = shift left by 8 bits and OR with 0x80
    //  - A = replication
    else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
    {
        // Error color - HDR endpoint in an LDR encoding
        if (rgb_hdr || alpha_hdr)
        {
            output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
            output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
            rgb_hdr = false;
            alpha_hdr = false;
        }

        // Only the color lanes use the sRGB-style expansion. Alpha uses the same
        // replication as linear LDR, so that 0xFF expands to 0xFFFF rather than 0xFF80.
        vmask4 rgb_lanes(true, true, true, false);

        vint4 output0rgb = lsl<8>(output0) | vint4(0x80);
        vint4 output0a = output0 * 257;
        output0 = select(output0a, output0rgb, rgb_lanes);

        vint4 output1rgb = lsl<8>(output1) | vint4(0x80);
        vint4 output1a = output1 * 257;
        output1 = select(output1a, output1rgb, rgb_lanes);
    }
    // An HDR profile decode, but may be using linear LDR endpoints
    // Linear LDR 8-bit endpoints are expanded to 16-bit by replication
    // HDR endpoints are already 16-bit
    else
    {
        vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
        vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
        output0 = output0 * output_scale;
        output1 = output1 * output_scale;
    }
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,472 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Functions to calculate variance per component in a NxN footprint.
*
* We need N to be parametric, so the routine below uses summed area tables in order to execute in
* O(1) time independent of how big N is.
*
* The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
* perform a binary reduction, and then distributes the results. This method means that there is no
* serial dependency between a given element and the next one, and also significantly improves
* numerical stability allowing us to use floats rather than doubles.
*/
#include "astcenc_internal.h"
#include <cassert>
/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify in-place to turn it into a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
 * The work is done in two passes. The reduction pass accumulates partial sums over spans
 * that double in size at each level; the expansion pass then walks the levels in reverse
 * to fill in the remaining elements from those partial sums.
 *
 * @param d        The array to prefix-sum.
 * @param items    The number of items in the array.
 * @param stride   The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
    vfloat4* d,
    size_t items,
    int stride
) {
    // Zero or one item is already its own prefix sum
    if (items < 2)
    {
        return;
    }

    size_t span = 2;    // Span covered by a node at the current tree level
    size_t level = 1;   // log2(span)

    // The reduction-tree loop
    do
    {
        const size_t half = span >> 1;
        const size_t count = items >> level;

        for (size_t k = 0; k < count; k++)
        {
            // Last element of the k'th span accumulates the last element of its first half
            const size_t idx = (span - 1) + k * span;
            d[idx * stride] = d[idx * stride] + d[(idx - half) * stride];
        }

        level += 1;
        span <<= 1;
    } while (span <= items);

    // The expansion-tree loop
    do
    {
        level -= 1;
        span >>= 1;

        const size_t half = span >> 1;
        const size_t first = half + span - 1;
        const size_t count = (items - half) >> level;

        for (size_t k = 0; k < count; k++)
        {
            const size_t idx = first + k * span;
            d[idx * stride] = d[idx * stride] + d[(idx - half) * stride];
        }
    } while (span > 2);
}
/* See header for documentation. */
/*
 * Implementation notes:
 *  - The region described by arg (offset and size) is loaded into work memory together with
 *    a border of kernel-radius texels on each side, clamping source coordinates to the image.
 *  - Two buffers are filled: VARBUF1 with the swizzled texel values and VARBUF2 with their
 *    squares. Both are turned into summed-area tables in place.
 *  - The final stage in this function reads only the alpha lane of VARBUF1, and writes the
 *    average alpha over the kernel footprint into ctx.input_alpha_averages.
 */
void compute_pixel_region_variance(
astcenc_contexti& ctx,
const pixel_region_args& arg
) {
// Unpack the memory structure into local variables
const astcenc_image* img = arg.img;
astcenc_swizzle swz = arg.swz;
bool have_z = arg.have_z;
int size_x = arg.size_x;
int size_y = arg.size_y;
int size_z = arg.size_z;
int offset_x = arg.offset_x;
int offset_y = arg.offset_y;
int offset_z = arg.offset_z;
int alpha_kernel_radius = arg.alpha_kernel_radius;
float* input_alpha_averages = ctx.input_alpha_averages;
vfloat4* work_memory = arg.work_memory;
// Compute memory sizes and dimensions that we need
int kernel_radius = alpha_kernel_radius;
int kerneldim = 2 * kernel_radius + 1;
int kernel_radius_xy = kernel_radius;
int kernel_radius_z = have_z ? kernel_radius : 0;
// Padded sizes: the region plus one kernel width; index 0 on each padded axis is the
// zero border of the summed-area table
int padsize_x = size_x + kerneldim;
int padsize_y = size_y + kerneldim;
int padsize_z = size_z + (have_z ? kerneldim : 0);
int sizeprod = padsize_x * padsize_y * padsize_z;
// For 2D images there is no Z border, so data starts at Z index 0
int zd_start = have_z ? 1 : 0;
// The two work buffers are packed back-to-back in the work memory
vfloat4 *varbuf1 = work_memory;
vfloat4 *varbuf2 = work_memory + sizeprod;
// Scaling factors to apply to Y and Z for accesses into the work buffers
int yst = padsize_x;
int zst = padsize_x * padsize_y;
// Scaling factors to apply to Y and Z for accesses into result buffers
int ydt = img->dim_x;
int zdt = img->dim_x * img->dim_y;
// Macros to act as accessor functions for the work-memory
// Note: the macro arguments are not parenthesized, so only pass simple expressions
#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
// Load N and N^2 values into the work buffers
if (img->data_type == ASTCENC_TYPE_U8)
{
// Swizzle data structure 4 = ZERO, 5 = ONE
uint8_t data[6];
data[ASTCENC_SWZ_0] = 0;
data[ASTCENC_SWZ_1] = 255;
for (int z = zd_start; z < padsize_z; z++)
{
// Map the padded coordinate back to a source coordinate, clamped to the image
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
for (int y = 1; y < padsize_y; y++)
{
int y_src = (y - 1) + offset_y - kernel_radius_xy;
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
for (int x = 1; x < padsize_x; x++)
{
int x_src = (x - 1) + offset_x - kernel_radius_xy;
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
// Four components per texel, rows of dim_x texels
data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )];
data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
// Apply the swizzle; selectors index into the 6-entry table above
uint8_t r = data[swz.r];
uint8_t g = data[swz.g];
uint8_t b = data[swz.b];
uint8_t a = data[swz.a];
// Scale the 8-bit values by 1/255
vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
g * (1.0f / 255.0f),
b * (1.0f / 255.0f),
a * (1.0f / 255.0f));
VARBUF1(z, y, x) = d;
VARBUF2(z, y, x) = d * d;
}
}
}
}
else if (img->data_type == ASTCENC_TYPE_F16)
{
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
uint16_t data[6];
data[ASTCENC_SWZ_0] = 0;
data[ASTCENC_SWZ_1] = 0x3C00;
for (int z = zd_start; z < padsize_z; z++)
{
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
for (int y = 1; y < padsize_y; y++)
{
int y_src = (y - 1) + offset_y - kernel_radius_xy;
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
for (int x = 1; x < padsize_x; x++)
{
int x_src = (x - 1) + offset_x - kernel_radius_xy;
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )];
data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
// Swizzle the raw FP16 bit patterns, then convert all four lanes to float
vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
vfloat4 d = float16_to_float(di);
VARBUF1(z, y, x) = d;
VARBUF2(z, y, x) = d * d;
}
}
}
}
else // if (img->data_type == ASTCENC_TYPE_F32)
{
assert(img->data_type == ASTCENC_TYPE_F32);
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
float data[6];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
for (int z = zd_start; z < padsize_z; z++)
{
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
float* data32 = static_cast<float*>(img->data[z_src]);
for (int y = 1; y < padsize_y; y++)
{
int y_src = (y - 1) + offset_y - kernel_radius_xy;
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
for (int x = 1; x < padsize_x; x++)
{
int x_src = (x - 1) + offset_x - kernel_radius_xy;
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )];
data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
float r = data[swz.r];
float g = data[swz.g];
float b = data[swz.b];
float a = data[swz.a];
vfloat4 d(r, g, b, a);
VARBUF1(z, y, x) = d;
VARBUF2(z, y, x) = d * d;
}
}
}
}
// Pad with an extra layer of 0s; this forms the edge of the SAT tables
vfloat4 vbz = vfloat4::zero();
for (int z = 0; z < padsize_z; z++)
{
// Zero the x == 0 column of every slice
for (int y = 0; y < padsize_y; y++)
{
VARBUF1(z, y, 0) = vbz;
VARBUF2(z, y, 0) = vbz;
}
// Zero the y == 0 row of every slice
for (int x = 0; x < padsize_x; x++)
{
VARBUF1(z, 0, x) = vbz;
VARBUF2(z, 0, x) = vbz;
}
}
// For 3D images also zero the whole z == 0 slice
if (have_z)
{
for (int y = 0; y < padsize_y; y++)
{
for (int x = 0; x < padsize_x; x++)
{
VARBUF1(0, y, x) = vbz;
VARBUF2(0, y, x) = vbz;
}
}
}
// Generate summed-area tables for N and N^2; this is done in-place, using
// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
// First pass: prefix-sum along X for every row
for (int z = zd_start; z < padsize_z; z++)
{
for (int y = 1; y < padsize_y; y++)
{
brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
}
}
// Second pass: prefix-sum along Y for every column
for (int z = zd_start; z < padsize_z; z++)
{
for (int x = 1; x < padsize_x; x++)
{
brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
}
}
// Third pass (3D only): prefix-sum along Z
if (have_z)
{
for (int y = 1; y < padsize_y; y++)
{
for (int x = 1; x < padsize_x; x++)
{
brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
}
}
}
// Compute a few constants used in the variance-calculation.
float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
// Reciprocal of the number of samples in the kernel footprint
float alpha_rsamples;
if (have_z)
{
alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
}
else
{
alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
}
// Use the summed-area tables to compute variance for each neighborhood
if (have_z)
{
for (int z = 0; z < size_z; z++)
{
// *_src is the position in the padded buffer, *_dst the position in the image;
// *_low / *_high bracket the kernel footprint in the table
int z_src = z + kernel_radius_z;
int z_dst = z + offset_z;
int z_low = z_src - alpha_kernel_radius;
int z_high = z_src + alpha_kernel_radius + 1;
for (int y = 0; y < size_y; y++)
{
int y_src = y + kernel_radius_xy;
int y_dst = y + offset_y;
int y_low = y_src - alpha_kernel_radius;
int y_high = y_src + alpha_kernel_radius + 1;
for (int x = 0; x < size_x; x++)
{
int x_src = x + kernel_radius_xy;
int x_dst = x + offset_x;
int x_low = x_src - alpha_kernel_radius;
int x_high = x_src + alpha_kernel_radius + 1;
// Summed-area table lookups for alpha average
// Box sum = 2D box at z_high minus the same 2D box at z_low
float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
- VARBUF1(z_high, y_low, x_high).lane<3>()
- VARBUF1(z_high, y_high, x_low).lane<3>()
+ VARBUF1(z_high, y_high, x_high).lane<3>()) -
( VARBUF1(z_low, y_low, x_low).lane<3>()
- VARBUF1(z_low, y_low, x_high).lane<3>()
- VARBUF1(z_low, y_high, x_low).lane<3>()
+ VARBUF1(z_low, y_high, x_high).lane<3>());
int out_index = z_dst * zdt + y_dst * ydt + x_dst;
input_alpha_averages[out_index] = (vasum * alpha_rsamples);
}
}
}
}
else
{
for (int y = 0; y < size_y; y++)
{
int y_src = y + kernel_radius_xy;
int y_dst = y + offset_y;
int y_low = y_src - alpha_kernel_radius;
int y_high = y_src + alpha_kernel_radius + 1;
for (int x = 0; x < size_x; x++)
{
int x_src = x + kernel_radius_xy;
int x_dst = x + offset_x;
int x_low = x_src - alpha_kernel_radius;
int x_high = x_src + alpha_kernel_radius + 1;
// Summed-area table lookups for alpha average
// Standard four-corner box sum on the single 2D slice
float vasum = VARBUF1(0, y_low, x_low).lane<3>()
- VARBUF1(0, y_low, x_high).lane<3>()
- VARBUF1(0, y_high, x_low).lane<3>()
+ VARBUF1(0, y_high, x_high).lane<3>();
int out_index = y_dst * ydt + x_dst;
input_alpha_averages[out_index] = (vasum * alpha_rsamples);
}
}
}
}
/* See header for documentation. */
/*
 * Implementation notes:
 *  - Fills in the image-wide fields of ag; the per-task region fields (size, offset and
 *    work memory) are zeroed here and populated later.
 *  - Returns the number of block-sized tasks needed to cover the image in Y and Z.
 */
unsigned int init_compute_averages(
    const astcenc_image& img,
    unsigned int alpha_kernel_radius,
    const astcenc_swizzle& swz,
    avg_args& ag
) {
    const unsigned int dim_x = img.dim_x;
    const unsigned int dim_y = img.dim_y;
    const unsigned int dim_z = img.dim_z;
    const bool is_3d = (dim_z > 1);

    // Largest task block, and from that the padded block the work buffers must hold
    const unsigned int kernel_width = 2 * alpha_kernel_radius + 1;
    const unsigned int blk_xy = is_3d ? 16 : 32;
    const unsigned int blk_z = astc::min(dim_z, is_3d ? 16u : 1u);
    const unsigned int pad_xy = blk_xy + kernel_width;
    const unsigned int pad_z = blk_z + (is_3d ? kernel_width : 0);

    // Settings shared by every task
    ag.arg.img = &img;
    ag.arg.swz = swz;
    ag.arg.have_z = is_3d;
    ag.arg.alpha_kernel_radius = alpha_kernel_radius;

    // Per-task fields which are not populated until later
    ag.arg.size_x = 0;
    ag.arg.size_y = 0;
    ag.arg.size_z = 0;
    ag.arg.offset_x = 0;
    ag.arg.offset_y = 0;
    ag.arg.offset_z = 0;
    ag.arg.work_memory = nullptr;

    // Image and block dimensions used to split the work
    ag.img_size_x = dim_x;
    ag.img_size_y = dim_y;
    ag.img_size_z = dim_z;
    ag.blk_size_xy = blk_xy;
    ag.blk_size_z = blk_z;

    // Two work buffers, each one padded block in size
    ag.work_memory_size = 2 * pad_xy * pad_xy * pad_z;

    // The parallel task count: blocks needed in Z times blocks needed in Y, rounding up
    const unsigned int tasks_z = (dim_z + blk_z - 1) / blk_z;
    const unsigned int tasks_y = (dim_y + blk_xy - 1) / blk_xy;
    return tasks_z * tasks_y;
}
#endif
@@ -0,0 +1,622 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions to decompress a symbolic block.
*/
#include "astcenc_internal.h"
#include <stdio.h>
#include <assert.h>
/**
 * @brief Compute the integer linear interpolation of two color endpoints.
 *
 * The result is (color0 * (64 - w) + color1 * w + 32) >> 6 per lane. Lanes selected by
 * @c u8_mask are additionally reduced to their top 8 bits and multiplied by 257.
 *
 * @param u8_mask   The mask for lanes using decode_unorm8 rather than decode_f16.
 * @param color0    The endpoint0 color.
 * @param color1    The endpoint1 color.
 * @param weights   The interpolation weight (between 0 and 64).
 *
 * @return The interpolated color.
 */
static vint4 lerp_color_int(
    vmask4 u8_mask,
    vint4 color0,
    vint4 color1,
    vint4 weights
) {
    // Complementary weights for the two endpoints
    vint4 w1 = weights;
    vint4 w0 = vint4(64) - w1;

    // Weighted sum with rounding, scaled back down by 64
    vint4 blended = asr<6>((color0 * w0) + (color1 * w1) + vint4(32));

    // For decode_unorm8 values force the codec to bit replicate. This allows the
    // rest of the codec to assume the full 0xFFFF range for everything and ignore
    // the decode_mode setting
    vint4 replicated = asr<8>(blended) * vint4(257);

    return select(blended, replicated, u8_mask);
}
/**
 * @brief Convert integer color value into a float value for the decoder.
 *
 * Each conversion is only run when at least one lane needs it; a skipped conversion
 * contributes zeros, which are never selected.
 *
 * @param data       The integer color value post-interpolation.
 * @param lns_mask   If set treat lane as HDR (LNS) else LDR (unorm16).
 *
 * @return The float color value.
 */
static inline vfloat4 decode_texel(
    vint4 data,
    vmask4 lns_mask
) {
    // HDR path, needed if any lane is LNS
    vint4 as_lns = any(lns_mask) ? lns_to_sf16(data) : vint4::zero();

    // LDR path, needed unless every lane is LNS
    vint4 as_unorm = all(lns_mask) ? vint4::zero() : unorm16_to_sf16(data);

    // Pick components and then convert to FP16
    vint4 sf16 = select(as_unorm, as_lns, lns_mask);
    return float16_to_float(sf16);
}
/* See header for documentation. */
/*
 * Implementation notes:
 *  - Texels are processed ASTCENC_SIMD_WIDTH at a time. For each group the stored weights
 *    referenced by each texel are looked up, multiplied by the texel's integer
 *    contributions, summed with a rounding term of 8, and shifted right by 4.
 *  - Dual-plane blocks use two 32-entry tables, with the second plane's weights starting
 *    at offset 32 in scb.weights. Single-plane blocks use one 64-entry table.
 */
void unpack_weights(
    const block_size_descriptor& bsd,
    const symbolic_compressed_block& scb,
    const decimation_info& di,
    bool is_dual_plane,
    int weights_plane1[BLOCK_MAX_TEXELS],
    int weights_plane2[BLOCK_MAX_TEXELS]
) {
    // Safe to overshoot as all arrays are allocated to full size
    if (is_dual_plane)
    {
        // Build a 32-entry weight lookup table per plane
        vtable_32x8 lut_plane1;
        vtable_prepare(lut_plane1, scb.weights);

        vtable_32x8 lut_plane2;
        vtable_prepare(lut_plane2, scb.weights + 32);

        for (unsigned int base = 0; base < bsd.texel_count; base += ASTCENC_SIMD_WIDTH)
        {
            // Accumulators start at the rounding term
            vint acc_plane1(8);
            vint acc_plane2(8);

            // Only iterate as far as the texel with the most contributing weights
            vint contrib_count(di.texel_weight_count + base);
            int max_contribs = hmax_s(contrib_count);
            promise(max_contribs > 0);

            for (int tap = 0; tap < max_contribs; tap++)
            {
                vint weight_idx(di.texel_weights_tr[tap] + base);
                vint weight_scale(di.texel_weight_contribs_int_tr[tap] + base);

                acc_plane1 += vtable_lookup_32bit(lut_plane1, weight_idx) * weight_scale;
                acc_plane2 += vtable_lookup_32bit(lut_plane2, weight_idx) * weight_scale;
            }

            store(lsr<4>(acc_plane1), weights_plane1 + base);
            store(lsr<4>(acc_plane2), weights_plane2 + base);
        }

        return;
    }

    // Single plane: build full 64-entry weight lookup table
    vtable_64x8 lut;
    vtable_prepare(lut, scb.weights);

    for (unsigned int base = 0; base < bsd.texel_count; base += ASTCENC_SIMD_WIDTH)
    {
        // Accumulator starts at the rounding term
        vint acc(8);

        // Only iterate as far as the texel with the most contributing weights
        vint contrib_count(di.texel_weight_count + base);
        int max_contribs = hmax_s(contrib_count);
        promise(max_contribs > 0);

        for (int tap = 0; tap < max_contribs; tap++)
        {
            vint weight_idx(di.texel_weights_tr[tap] + base);
            vint weight_scale(di.texel_weight_contribs_int_tr[tap] + base);

            acc += vtable_lookup_32bit(lut, weight_idx) * weight_scale;
        }

        store(lsr<4>(acc), weights_plane1 + base);
    }
}
/**
* @brief Return an FP32 NaN value for use in error colors.
*
* This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
*
* @return The float color value.
*/
static float error_color_nan()
{
if32 v;
v.u = 0xFFFFE000U;
return v.f;
}
/* See header for documentation. */
void decompress_symbolic_block(
    astcenc_profile decode_mode,
    const block_size_descriptor& bsd,
    int xpos,
    int ypos,
    int zpos,
    const symbolic_compressed_block& scb,
    image_block& blk
) {
    // Stamp the block position and reset the per-block summary fields
    blk.xpos = xpos;
    blk.ypos = ypos;
    blk.zpos = zpos;

    blk.data_min = vfloat4::zero();
    blk.data_mean = vfloat4::zero();
    blk.data_max = vfloat4::zero();
    blk.grayscale = false;

    // Error blocks: every channel of every texel gets the error color
    if (scb.block_type == SYM_BTYPE_ERROR)
    {
        const float error_value = error_color_nan();

        for (unsigned int t = 0; t < bsd.texel_count; t++)
        {
            blk.data_r[t] = error_value;
            blk.data_g[t] = error_value;
            blk.data_b[t] = error_value;
            blk.data_a[t] = error_value;
            blk.rgb_lns[t] = 0;
            blk.alpha_lns[t] = 0;
        }

        return;
    }

    // Constant color blocks: one color is replicated across the whole block
    const bool is_const_u16 = (scb.block_type == SYM_BTYPE_CONST_U16);
    const bool is_const_f16 = (scb.block_type == SYM_BTYPE_CONST_F16);

    if (is_const_f16 || is_const_u16)
    {
        vfloat4 const_color;
        uint8_t lns_flag = 0;

        if (is_const_u16)
        {
            // UNORM16 constant color block
            vint4 color_u16(scb.constant_color);

            // Determine the UNORM8 rounding on the decode
            vmask4 const_u8_mask = get_u8_component_mask(decode_mode, blk);

            // The real decoder would just use the top 8 bits, but we rescale
            // in to a 16-bit value that rounds correctly.
            vint4 color_u8 = asr<8>(color_u16) * 257;
            color_u16 = select(color_u16, color_u8, const_u8_mask);

            vint4 color_f16 = unorm16_to_sf16(color_u16);
            const_color = float16_to_float(color_f16);
        }
        else
        {
            // FLOAT16 constant color block
            switch (decode_mode)
            {
            case ASTCENC_PRF_HDR_RGB_LDR_A:
            case ASTCENC_PRF_HDR:
                // Unpack the stored FP16 color to FP32 and flag the texels as LNS
                const_color = float16_to_float(vint4(scb.constant_color));
                lns_flag = 1;
                break;
            case ASTCENC_PRF_LDR_SRGB:
            case ASTCENC_PRF_LDR:
                // The LDR profiles decode this block type to the error color
                const_color = vfloat4(error_color_nan());
                break;
            }
        }

        for (unsigned int t = 0; t < bsd.texel_count; t++)
        {
            blk.data_r[t] = const_color.lane<0>();
            blk.data_g[t] = const_color.lane<1>();
            blk.data_b[t] = const_color.lane<2>();
            blk.data_a[t] = const_color.lane<3>();
            blk.rgb_lns[t] = lns_flag;
            blk.alpha_lns[t] = lns_flag;
        }

        return;
    }

    // Everything else is a regular block: look up its partitioning, block
    // mode, and weight decimation descriptors
    int partition_count = scb.partition_count;
    const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

    const auto& bm = bsd.get_block_mode(scb.block_mode);
    const auto& di = bsd.get_decimation_info(bm.decimation_mode);

    bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);

    // Unquantize and undecimate the weights
    int plane1_weights[BLOCK_MAX_TEXELS];
    int plane2_weights[BLOCK_MAX_TEXELS];
    unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);

    // Lane mask selecting the component that takes its weight from plane 2
    int plane2_component = scb.plane2_component;
    vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);

    vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);

    for (int part = 0; part < partition_count; part++)
    {
        // Decode the color endpoints for this partition
        vint4 ep0;
        vint4 ep1;
        bool rgb_lns;
        bool a_lns;

        unpack_color_endpoints(decode_mode,
                               scb.color_formats[part],
                               scb.color_values[part],
                               rgb_lns, a_lns,
                               ep0, ep1);

        vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);

        // Interpolate and decode each texel that belongs to this partition
        int part_texels = pi.partition_texel_count[part];
        for (int k = 0; k < part_texels; k++)
        {
            int tix = pi.texels_of_partition[part][k];

            vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
            vint4 color_int = lerp_color_int(u8_mask, ep0, ep1, weight);
            vfloat4 color_fp = decode_texel(color_int, lns_mask);

            blk.data_r[tix] = color_fp.lane<0>();
            blk.data_g[tix] = color_fp.lane<1>();
            blk.data_b[tix] = color_fp.lane<2>();
            blk.data_a[tix] = color_fp.lane<3>();
        }
    }
}
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/* See header for documentation. */
float compute_symbolic_block_difference_2plane(
const astcenc_config& config,
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const image_block& blk
) {
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
return ERROR_CALC_DEFAULT;
}
assert(scb.block_mode >= 0);
assert(scb.partition_count == 1);
assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
// Get the appropriate block descriptor
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
int plane1_weights[BLOCK_MAX_TEXELS];
int plane2_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
vfloat4 summa = vfloat4::zero();
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(config.profile,
scb.color_formats[0],
scb.color_values[0],
rgb_lns, a_lns,
ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i++)
{
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
vfloat4 color = int_to_float(colori);
vfloat4 oldColor = blk.texel(i);
// Compare error using a perceptual decode metric for RGBM textures
if (config.flags & ASTCENC_FLG_MAP_RGBM)
{
// Fail encodings that result in zero weight M pixels. Note that this can cause
// "interesting" artifacts if we reject all useful encodings - we typically get max
// brightness encodings instead which look just as bad. We recommend users apply a
// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
// getting small M values post-quantization, but we can't prove it would never
// happen, especially at low bit rates ...
if (color.lane<3>() == 0.0f)
{
return -ERROR_CALC_DEFAULT;
}
// Compute error based on decoded RGBM color
color = vfloat4(
color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
1.0f
);
oldColor = vfloat4(
oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
1.0f
);
}
vfloat4 error = oldColor - color;
error = min(abs(error), 1e15f);
error = error * error;
summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
}
return summa.lane<0>();
}
/* See header for documentation. */
float compute_symbolic_block_difference_1plane(
    const astcenc_config& config,
    const block_size_descriptor& bsd,
    const symbolic_compressed_block& scb,
    const image_block& blk
) {
    // If we detected an error-block, blow up immediately.
    if (scb.block_type == SYM_BTYPE_ERROR)
    {
        return ERROR_CALC_DEFAULT;
    }

    // Validate the block mode index before using it for a lookup. These checks
    // sit after the error-block early-out, as in the sibling difference
    // functions, so an error block is never used to index the block mode table.
    assert(scb.block_mode >= 0);
    assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);

    // Get the appropriate partition-table entry
    unsigned int partition_count = scb.partition_count;
    const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

    // Get the appropriate block descriptor
    const block_mode& bm = bsd.get_block_mode(scb.block_mode);
    const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);

    // Unquantize and undecimate the weights; there is no second plane
    int plane1_weights[BLOCK_MAX_TEXELS];
    unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);

    vmask4 u8_mask = get_u8_component_mask(config.profile, blk);

    // Running total of the weighted squared error across all partitions
    vfloat4 summa = vfloat4::zero();
    for (unsigned int i = 0; i < partition_count; i++)
    {
        // Decode the color endpoints for this partition
        vint4 ep0;
        vint4 ep1;
        bool rgb_lns;
        bool a_lns;

        unpack_color_endpoints(config.profile,
                               scb.color_formats[i],
                               scb.color_values[i],
                               rgb_lns, a_lns,
                               ep0, ep1);

        // Unpack and compute error for each texel in the partition
        unsigned int texel_count = pi.partition_texel_count[i];
        for (unsigned int j = 0; j < texel_count; j++)
        {
            unsigned int tix = pi.texels_of_partition[i][j];
            vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
                                          vint4(plane1_weights[tix]));

            vfloat4 color = int_to_float(colori);
            vfloat4 oldColor = blk.texel(tix);

            // Compare error using a perceptual decode metric for RGBM textures
            if (config.flags & ASTCENC_FLG_MAP_RGBM)
            {
                // Fail encodings that result in zero weight M pixels. Note that this can cause
                // "interesting" artifacts if we reject all useful encodings - we typically get max
                // brightness encodings instead which look just as bad. We recommend users apply a
                // bias to their stored M value, limiting the lower value to 16 or 32 to avoid
                // getting small M values post-quantization, but we can't prove it would never
                // happen, especially at low bit rates ...
                if (color.lane<3>() == 0.0f)
                {
                    return -ERROR_CALC_DEFAULT;
                }

                // Compute error based on decoded RGBM color
                color = vfloat4(
                    color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
                    color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
                    color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
                    1.0f
                );

                oldColor = vfloat4(
                    oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
                    oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
                    oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
                    1.0f
                );
            }

            // Clamp the magnitude before squaring, then clamp the texel's contribution
            vfloat4 error = oldColor - color;
            error = min(abs(error), 1e15f);
            error = error * error;

            summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
        }
    }

    return summa.lane<0>();
}
/* See header for documentation. */
float compute_symbolic_block_difference_1plane_1partition(
const astcenc_config& config,
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
const image_block& blk
) {
// If we detected an error-block, blow up immediately.
if (scb.block_type == SYM_BTYPE_ERROR)
{
return ERROR_CALC_DEFAULT;
}
assert(scb.block_mode >= 0);
assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
// Get the appropriate block descriptor
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
// Unquantize and undecimate the weights
ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
// Decode the color endpoints for this partition
vint4 ep0;
vint4 ep1;
bool rgb_lns;
bool a_lns;
unpack_color_endpoints(config.profile,
scb.color_formats[0],
scb.color_values[0],
rgb_lns, a_lns,
ep0, ep1);
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
// Unpack and compute error for each texel in the partition
vfloatacc summav = vfloatacc::zero();
vint lane_id = vint::lane_id();
unsigned int texel_count = bsd.texel_count;
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
{
// Compute EP1 contribution
vint weight1 = vint::loada(plane1_weights + i);
vint ep1_r = vint(ep1.lane<0>()) * weight1;
vint ep1_g = vint(ep1.lane<1>()) * weight1;
vint ep1_b = vint(ep1.lane<2>()) * weight1;
vint ep1_a = vint(ep1.lane<3>()) * weight1;
// Compute EP0 contribution
vint weight0 = vint(64) - weight1;
vint ep0_r = vint(ep0.lane<0>()) * weight0;
vint ep0_g = vint(ep0.lane<1>()) * weight0;
vint ep0_b = vint(ep0.lane<2>()) * weight0;
vint ep0_a = vint(ep0.lane<3>()) * weight0;
// Combine contributions
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
// If using a U8 decode mode bit replicate top 8 bits
// so rest of codec can assume 0xFFFF max range everywhere
vint colori_r8 = asr<8>(colori_r) * vint(257);
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
vint colori_g8 = asr<8>(colori_g) * vint(257);
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
vint colori_b8 = asr<8>(colori_b) * vint(257);
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
vint colori_a8 = asr<8>(colori_a) * vint(257);
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
// Compute color diff
vfloat color_r = int_to_float(colori_r);
vfloat color_g = int_to_float(colori_g);
vfloat color_b = int_to_float(colori_b);
vfloat color_a = int_to_float(colori_a);
vfloat color_orig_r = loada(blk.data_r + i);
vfloat color_orig_g = loada(blk.data_g + i);
vfloat color_orig_b = loada(blk.data_b + i);
vfloat color_orig_a = loada(blk.data_a + i);
vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
// Compute squared error metric
color_error_r = color_error_r * color_error_r;
color_error_g = color_error_g * color_error_g;
color_error_b = color_error_b * color_error_b;
color_error_a = color_error_a * color_error_a;
vfloat metric = color_error_r * blk.channel_weight.lane<0>()
+ color_error_g * blk.channel_weight.lane<1>()
+ color_error_b * blk.channel_weight.lane<2>()
+ color_error_a * blk.channel_weight.lane<3>();
// Mask off bad lanes
vmask mask = lane_id < vint(texel_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
haccumulate(summav, metric, mask);
}
return hadd_s(summav);
}
#endif
@@ -0,0 +1,245 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for the diagnostic trace utilities.
*/
#if defined(ASTCENC_DIAGNOSTICS)
#include <cassert>
#include <cstdarg>
#include <cstdio>
#include <cmath>
#include <limits>
#include <string>
#include "astcenc_diagnostic_trace.h"
/**
* @brief The global trace logger.
*
* Set by the TraceLog constructor and reset to nullptr by the TraceLog destructor. The TraceNode
* and trace_add_data() functions in this file dereference it without a null check.
*/
static TraceLog* g_TraceLog = nullptr;
/** @brief The number of spaces written per JSON indentation step. */
static const size_t g_trace_indent = 2;
/* See header for documentation. */
TraceLog::TraceLog(
const char* file_name):
m_file(file_name, std::ofstream::out | std::ofstream::binary)
{
// Only one trace log may be registered at a time
assert(!g_TraceLog);
// Register this log before creating the root node; the TraceNode constructor
// reaches the log through g_TraceLog
g_TraceLog = this;
m_root = new TraceNode("root");
}
/* See header for documentation. */
TraceNode* TraceLog::get_current_leaf()
{
    // The newest node sits at the back of the stack; an empty stack has no leaf
    return m_stack.empty() ? nullptr : m_stack.back();
}
/* See header for documentation. */
size_t TraceLog::get_depth()
{
// The depth is the number of nodes currently on the stack
return m_stack.size();
}
/* See header for documentation. */
TraceLog::~TraceLog()
{
assert(g_TraceLog == this);
// Delete the root node while the global pointer is still set; the TraceNode
// destructor writes its closing brackets through g_TraceLog
delete m_root;
g_TraceLog = nullptr;
}
/* See header for documentation. */
TraceNode::TraceNode(
    const char* format,
    ...
) {
    // Expand the printf-style template into the node name
    constexpr size_t name_capacity = 256;
    char name[name_capacity];

    va_list params;
    va_start(params, format);
    vsnprintf(name, name_capacity, format, params);
    va_end(params);

    // Guarantee there is a nul terminator
    name[name_capacity - 1] = 0;

    // Capture the parent and the nesting depth before this node is pushed
    TraceNode* parent = g_TraceLog->get_current_leaf();
    const size_t depth = g_TraceLog->get_depth();
    g_TraceLog->m_stack.push_back(this);

    auto& out = g_TraceLog->m_file;

    if (parent)
    {
        // The parent already holds an entry, so separate this node from it
        if (parent->m_attrib_count)
        {
            out << ',';
        }

        // This node counts as one more entry of its parent
        parent->m_attrib_count++;
    }

    // Every node except the outermost one starts on a fresh line
    if (depth)
    {
        out << '\n';
    }

    // Each nesting level spans two indentation steps: one for the node
    // bracket and one for the bracket that holds its entries
    const std::string out_indents((depth * 2) * g_trace_indent, ' ');
    const std::string in_indents((depth * 2 + 1) * g_trace_indent, ' ');

    out << out_indents << "[ \"node\", \"" << name << "\",\n";
    out << in_indents << "[";
}
/* See header for documentation. */
void TraceNode::add_attrib(
    std::string type,
    std::string key,
    std::string value
) {
    // The attribute type is accepted but is not written to the output
    (void)type;

    // Every entry after the first is separated from its predecessor by a comma
    const bool needs_comma = m_attrib_count != 0;
    m_attrib_count++;

    auto& out = g_TraceLog->m_file;
    if (needs_comma)
    {
        out << ',';
    }

    // The value is written verbatim, so the caller supplies any quoting
    const size_t indent = (g_TraceLog->get_depth() * 2) * g_trace_indent;
    out << '\n';
    out << std::string(indent, ' ') << "[ "
        << "\"" << key << "\", "
        << value << " ]";
}
/* See header for documentation. */
TraceNode::~TraceNode()
{
    // Pop first, so the depth read below matches the depth at which this node was opened
    g_TraceLog->m_stack.pop_back();

    const size_t depth = g_TraceLog->get_depth();
    const std::string out_indents((depth * 2) * g_trace_indent, ' ');
    const std::string in_indents((depth * 2 + 1) * g_trace_indent, ' ');

    auto& out = g_TraceLog->m_file;

    // A node with entries closes its entry list on a new line
    if (m_attrib_count)
    {
        out << "\n" << in_indents;
    }

    out << "]\n";
    out << out_indents << "]";
}
/* See header for documentation. */
void trace_add_data(
    const char* key,
    const char* format,
    ...
) {
    // Expand the printf-style template into the attribute text
    constexpr size_t text_capacity = 256;
    char text[text_capacity];

    va_list params;
    va_start(params, format);
    vsnprintf(text, text_capacity, format, params);
    va_end(params);

    // Guarantee there is a nul terminator
    text[text_capacity - 1] = 0;

    // Wrap the text in double quotes before handing it to the node
    std::string quoted("\"");
    quoted += text;
    quoted += "\"";

    g_TraceLog->get_current_leaf()->add_attrib("str", key, quoted);
}
/* See header for documentation. */
void trace_add_data(
    const char* key,
    float value
) {
    // Turn infinities into parseable values
    if (std::isinf(value))
    {
        if (value > 0.0f)
        {
            value = std::numeric_limits<float>::max();
        }
        else
        {
            value = -std::numeric_limits<float>::max();
        }
    }

    // Format with an explicit bound, in line with the vsnprintf() calls used
    // for the other formatted output in this file
    constexpr size_t bufsz = 256;
    char buffer[bufsz];
    snprintf(buffer, bufsz, "%.20g", static_cast<double>(value));

    // The number is passed on as unquoted text
    TraceNode* node = g_TraceLog->get_current_leaf();
    node->add_attrib("float", key, buffer);
}
/* See header for documentation. */
void trace_add_data(
    const char* key,
    int value
) {
    // The integer is passed on as unquoted decimal text
    const std::string text = std::to_string(value);
    g_TraceLog->get_current_leaf()->add_attrib("int", key, text);
}
/* See header for documentation. */
void trace_add_data(
    const char* key,
    unsigned int value
) {
    // The unsigned integer is passed on as unquoted decimal text, tagged "int"
    const std::string text = std::to_string(value);
    g_TraceLog->get_current_leaf()->add_attrib("int", key, text);
}
#endif
@@ -0,0 +1,219 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2021-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief This module provides a set of diagnostic tracing utilities.
*
* Overview
* ========
*
* The built-in diagnostic trace tool generates a hierarchical JSON tree structure. The tree
* hierarchy contains three levels:
*
* - block
* - pass
* - candidate
*
* One block node exists for each compressed block in the image. One pass node exists for each major
* pass (N partition, M planes, O components) applied to a block. One candidate node exists for each
* encoding candidate trialed for a pass.
*
* Each node contains both its child hierarchy and a number of attributes which explain the behavior.
* For example, the block node contains the block coordinates in the image, the pass explains the
* pass configuration, and the candidate will explain the candidate encoding such as weight
* decimation, refinement error, etc.
*
* Trace Nodes are designed as scope-managed C++ objects with stack-like push/pop behavior.
* Constructing a trace node on the stack will automatically add it to the current node as a child,
* and then make it the current node. Destroying the current node will pop the stack and set the
* parent to the current node. This provides a robust mechanism for ensuring reliable nesting in the
* tree structure.
*
* A set of utility macros are provided to add attribute annotations to the current trace node.
*
* Usage
* =====
*
* Create Trace Nodes on the stack using the @c TRACE_NODE() macro. This will compile-out completely
* in builds with diagnostics disabled.
*
* Add annotations to the current trace node using the @c trace_add_data() macro. This will
* similarly compile out completely in builds with diagnostics disabled.
*
* If you need to add additional code to support diagnostics-only behavior wrap
* it in preprocessor guards:
*
* #if defined(ASTCENC_DIAGNOSTICS)
* #endif
*/
#ifndef ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
#define ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
#if defined(ASTCENC_DIAGNOSTICS)
#include <iostream>
#include <fstream>
#include <vector>
/**
* @brief Class representing a single node in the trace hierarchy.
*/
class TraceNode
{
public:
/**
* @brief Construct a new node.
*
* Constructing a node will push it to the top of the stack, automatically making it a child of
* the current node, and then setting it to become the current node.
*
* @param format The format template for the node name.
* @param ... The format parameters.
*/
TraceNode(const char* format, ...);
/**
* @brief Add an attribute to this node.
*
* Note that no quoting is applied to the @c value, so if quoting is needed it must be done by
* the caller.
*
* @param type The type of the attribute.
* @param key The key of the attribute.
* @param value The value of the attribute.
*/
void add_attrib(std::string type, std::string key, std::string value);
/**
* @brief Destroy this node.
*
* Destroying a node will pop it from the top of the stack, making its parent the current node.
* It is invalid behavior to destroy a node that is not the current node; usage must conform to
* stack push-pop semantics.
*/
~TraceNode();
/**
* @brief The number of attributes and child nodes in this node.
*
* A non-zero count means the next entry written to this node must be preceded by a comma.
*/
unsigned int m_attrib_count { 0 };
};
/**
* @brief Class representing the trace log file being written.
*/
class TraceLog
{
public:
/**
* @brief Create a new trace log.
*
* The trace log is global; there can be only one at a time.
*
* @param file_name The name of the file to write.
*/
TraceLog(const char* file_name);
/**
* @brief Destroy the trace log.
*
* Trace logs MUST be cleanly destroyed to ensure the file gets written.
*/
~TraceLog();
/**
* @brief Get the current child node.
*
* @return The current leaf node, or @c nullptr if the node stack is empty.
*/
TraceNode* get_current_leaf();
/**
* @brief Get the stack depth of the current child node.
*
* @return The current leaf node stack depth.
*/
size_t get_depth();
/**
* @brief The file stream to write to.
*/
std::ofstream m_file;
/**
* @brief The stack of nodes (newest at the back).
*/
std::vector<TraceNode*> m_stack;
private:
/**
* @brief The root node in the JSON file.
*
* Allocated by the constructor and deleted by the destructor.
*/
TraceNode* m_root;
};
/**
* @brief Utility macro to create a trace node on the stack.
*
* @param name The variable name to use.
* @param ... The name template and format parameters.
*/
#define TRACE_NODE(name, ...) TraceNode name(__VA_ARGS__);
/**
* @brief Add a string annotation to the current node.
*
* @param key The name of the attribute.
* @param format The format template for the attribute value.
* @param ... The format parameters.
*/
void trace_add_data(const char* key, const char* format, ...);
/**
* @brief Add a float annotation to the current node.
*
* @param key The name of the attribute.
* @param value The value of the attribute.
*/
void trace_add_data(const char* key, float value);
/**
* @brief Add an integer annotation to the current node.
*
* @param key The name of the attribute.
* @param value The value of the attribute.
*/
void trace_add_data(const char* key, int value);
/**
* @brief Add an unsigned integer annotation to the current node.
*
* @param key The name of the attribute.
* @param value The value of the attribute.
*/
void trace_add_data(const char* key, unsigned int value);
#else
#define TRACE_NODE(name, ...)
#define trace_add_data(...)
#endif
#endif
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,781 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Functions for finding best partition for a block.
*
* The partition search operates in two stages. The first pass uses kmeans clustering to group
* texels into an ideal partitioning for the requested partition count, and then compares that
* against the 1024 partitionings generated by the ASTC partition hash function. The generated
* partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
* clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
* partitionings that actually generate fewer than the requested partition count, but only the top
* N candidates are actually put through a more detailed search. N is determined by the compressor
* quality preset.
*
* For the detailed search, each candidate is checked against two possible encoding methods:
*
* - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
* - The best partitioning assuming same chroma colors (RGB + scale endpoints).
*
* This is implemented by computing the mean color and dominant direction for each
* partition. This defines two lines, both of which go through the mean color value.
*
* - One line has a direction defined by the dominant direction; this is used to assess the error
* from using an uncorrelated color representation.
* - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
* (RGB + scale) color representation.
*
* The best candidate is selected by computing the squared-errors that result from using these
* lines for endpoint selection.
*/
#include <limits>
#include "astcenc_internal.h"
/**
 * @brief Pick some initial kmeans cluster centers.
 *
 * The first center is a fixed pseudo-random texel. Each further center is picked in a
 * weighted-random fashion, where a texel's weight is its distance to the nearest center that has
 * already been chosen. The "random" numbers are fixed constants, so the result is deterministic.
 *
 * @param      blk                The image block color data to compress.
 * @param      texel_count        The number of texels in the block.
 * @param      partition_count    The number of partitions in the block.
 * @param[out] cluster_centers    The initial partition cluster center colors.
 */
static void kmeans_init(
    const image_block& blk,
    unsigned int texel_count,
    unsigned int partition_count,
    vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
) {
    promise(texel_count > 0);
    promise(partition_count > 0);

    unsigned int clusters_selected = 0;
    float distances[BLOCK_MAX_TEXELS];

    // Pick a random sample as first cluster center; 145897 from random.org
    unsigned int sample = 145897 % texel_count;
    vfloat4 center_color = blk.texel(sample);
    cluster_centers[clusters_selected] = center_color;
    clusters_selected++;

    // Nothing more to pick for a single partition. Returning here also keeps
    // the cutoff table index below from wrapping around in unsigned
    // arithmetic when partition_count is 1.
    if (clusters_selected >= partition_count)
    {
        return;
    }

    // Compute the distance to the first cluster center
    float distance_sum = 0.0f;
    for (unsigned int i = 0; i < texel_count; i++)
    {
        vfloat4 color = blk.texel(i);
        vfloat4 diff = color - center_color;
        float distance = dot_s(diff * diff, blk.channel_weight);
        distance_sum += distance;
        distances[i] = distance;
    }

    // More numbers from random.org for weighted-random center selection
    const float cluster_cutoffs[9] {
        0.626220f, 0.932770f, 0.275454f,
        0.318558f, 0.240113f, 0.009190f,
        0.347661f, 0.731960f, 0.156391f
    };

    // Each partition count from 2 upwards starts at its own group of three cutoffs
    unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);

    // Pick the remaining samples as needed
    while (true)
    {
        // Pick the next center in a weighted-random fashion.
        float summa = 0.0f;
        float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
        for (sample = 0; sample < texel_count; sample++)
        {
            summa += distances[sample];
            if (summa >= distance_cutoff)
            {
                break;
            }
        }

        // Clamp to a valid range and store the selected cluster center
        sample = astc::min(sample, texel_count - 1);

        center_color = blk.texel(sample);
        cluster_centers[clusters_selected++] = center_color;
        if (clusters_selected >= partition_count)
        {
            break;
        }

        // Compute the distance to the new cluster center, keep the min dist
        distance_sum = 0.0f;
        for (unsigned int i = 0; i < texel_count; i++)
        {
            vfloat4 color = blk.texel(i);
            vfloat4 diff = color - center_color;
            float distance = dot_s(diff * diff, blk.channel_weight);
            distance = astc::min(distance, distances[i]);
            distance_sum += distance;
            distances[i] = distance;
        }
    }
}
/**
 * @brief Assign every texel to the cluster with the nearest center.
 *
 * @param      blk                   The image block color data to compress.
 * @param      texel_count           The number of texels in the block.
 * @param      partition_count       The number of partitions in the block.
 * @param      cluster_centers       The partition cluster center colors.
 * @param[out] partition_of_texel    The partition assigned for each texel.
 */
static void kmeans_assign(
    const image_block& blk,
    unsigned int texel_count,
    unsigned int partition_count,
    const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
    uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
) {
    promise(texel_count > 0);
    promise(partition_count > 0);

    uint8_t texels_in_partition[BLOCK_MAX_PARTITIONS] { 0 };

    // Attach each texel to the cluster center it is closest to
    for (unsigned int texel = 0; texel < texel_count; texel++)
    {
        vfloat4 color = blk.texel(texel);

        unsigned int nearest = 0;
        float nearest_distance = std::numeric_limits<float>::max();

        for (unsigned int part = 0; part < partition_count; part++)
        {
            vfloat4 delta = color - cluster_centers[part];
            float distance = dot_s(delta * delta, blk.channel_weight);

            // Strict comparison, so on a tie the lowest-numbered partition is kept
            if (distance < nearest_distance)
            {
                nearest_distance = distance;
                nearest = part;
            }
        }

        partition_of_texel[texel] = static_cast<uint8_t>(nearest);
        texels_in_partition[nearest]++;
    }

    // A partition can end up with no texels at all. When that happens texel N is moved into
    // partition N, which is crude but leaves no partition empty. Moving a texel can empty the
    // partition it came from, so keep sweeping until a full pass makes no reassignment.
    bool reassigned = true;
    while (reassigned)
    {
        reassigned = false;

        for (unsigned int part = 0; part < partition_count; part++)
        {
            if (texels_in_partition[part] != 0)
            {
                continue;
            }

            texels_in_partition[partition_of_texel[part]]--;
            texels_in_partition[part]++;
            partition_of_texel[part] = static_cast<uint8_t>(part);
            reassigned = true;
        }
    }
}
/**
 * @brief Move each cluster center to the mean color of its assigned texels.
 *
 * @param      blk                   The image block color data to compress.
 * @param      texel_count           The number of texels in the block.
 * @param      partition_count       The number of partitions in the block.
 * @param[out] cluster_centers       The new cluster center colors.
 * @param      partition_of_texel    The partition assigned for each texel.
 */
static void kmeans_update(
    const image_block& blk,
    unsigned int texel_count,
    unsigned int partition_count,
    vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
    const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
) {
    promise(texel_count > 0);
    promise(partition_count > 0);

    vfloat4 sums[BLOCK_MAX_PARTITIONS] {
        vfloat4::zero(),
        vfloat4::zero(),
        vfloat4::zero(),
        vfloat4::zero()
    };

    uint8_t counts[BLOCK_MAX_PARTITIONS] { 0 };

    // Accumulate the color total and the texel count of each cluster
    for (unsigned int texel = 0; texel < texel_count; texel++)
    {
        const uint8_t part = partition_of_texel[texel];
        sums[part] += blk.texel(texel);
        counts[part]++;
    }

    // The mean color of each cluster becomes its new center
    for (unsigned int part = 0; part < partition_count; part++)
    {
        const float inv_count = 1.0f / static_cast<float>(counts[part]);
        cluster_centers[part] = sums[part] * inv_count;
    }
}
/**
 * @brief Compute bit-mismatch for partitioning in 2-partition mode.
 *
 * Both ways of pairing the block's partitions with the table's partitions are scored, and the
 * lower score is used.
 *
 * @param a   The texel assignment bitvector for the block.
 * @param b   The texel assignment bitvector for the partition table.
 *
 * @return    The number of bit mismatches.
 */
static inline uint8_t partition_mismatch2(
    const uint64_t a[2],
    const uint64_t b[2]
) {
    // Partitions paired in order, and paired with the table's two partitions swapped
    const int in_order = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
    const int swapped = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);

    const int best = astc::min(in_order, swapped);

    // Divide by 2 because XOR always counts errors twice, once when missing
    // in the expected position, and again when present in the wrong partition
    return static_cast<uint8_t>(best / 2);
}
/**
 * @brief Compute bit-mismatch for partitioning in 3-partition mode.
 *
 * All six ways of pairing the three block bitmaps with the three table bitmaps
 * are scored, and the cheapest pairing is reported.
 *
 * @param a   The texel assignment bitvector for the block.
 * @param b   The texel assignment bitvector for the partition table.
 *
 * @return    The number of bit mismatches.
 */
static inline uint8_t partition_mismatch3(
	const uint64_t a[3],
	const uint64_t b[3]
) {
	// diff[i][j] is the number of bits that differ between a[i] and b[j]
	int diff[3][3];
	for (unsigned int i = 0; i < 3; i++)
	{
		for (unsigned int j = 0; j < 3; j++)
		{
			diff[i][j] = popcount(a[i] ^ b[j]);
		}
	}

	// Fix which b entry a[0] pairs with, then take the cheaper of the two
	// remaining pairings for a[1] and a[2]
	int with_b0 = diff[0][0] + astc::min(diff[1][1] + diff[2][2], diff[1][2] + diff[2][1]);
	int with_b1 = diff[0][1] + astc::min(diff[1][0] + diff[2][2], diff[1][2] + diff[2][0]);
	int with_b2 = diff[0][2] + astc::min(diff[1][0] + diff[2][1], diff[1][1] + diff[2][0]);

	// Halve the result: XOR sees each misplaced texel twice, once as missing
	// from its expected partition and once as present in the wrong one
	return static_cast<uint8_t>(astc::min(with_b0, with_b1, with_b2) / 2);
}
/**
 * @brief Compute bit-mismatch for partitioning in 4-partition mode.
 *
 * All 24 ways of pairing the four block bitmaps with the four table bitmaps
 * are scored, and the cheapest pairing is reported.
 *
 * @param a   The texel assignment bitvector for the block.
 * @param b   The texel assignment bitvector for the partition table.
 *
 * @return    The number of bit mismatches.
 */
static inline uint8_t partition_mismatch4(
	const uint64_t a[4],
	const uint64_t b[4]
) {
	// diff[i][j] is the number of bits that differ between a[i] and b[j]
	int diff[4][4];
	for (unsigned int i = 0; i < 4; i++)
	{
		for (unsigned int j = 0; j < 4; j++)
		{
			diff[i][j] = popcount(a[i] ^ b[j]);
		}
	}

	// restXY is the cheaper way of pairing a[2] and a[3] with b[X] and b[Y]
	int rest23 = astc::min(diff[2][2] + diff[3][3], diff[2][3] + diff[3][2]);
	int rest13 = astc::min(diff[2][1] + diff[3][3], diff[2][3] + diff[3][1]);
	int rest12 = astc::min(diff[2][1] + diff[3][2], diff[2][2] + diff[3][1]);
	int rest03 = astc::min(diff[2][0] + diff[3][3], diff[2][3] + diff[3][0]);
	int rest02 = astc::min(diff[2][0] + diff[3][2], diff[2][2] + diff[3][0]);
	int rest01 = astc::min(diff[2][1] + diff[3][0], diff[2][0] + diff[3][1]);

	// Fix which b entry a[0] pairs with, try each remaining choice for a[1],
	// and complete the pairing with the matching restXY term
	int with_b0 = diff[0][0] + astc::min(diff[1][1] + rest23, diff[1][2] + rest13, diff[1][3] + rest12);
	int with_b1 = diff[0][1] + astc::min(diff[1][0] + rest23, diff[1][2] + rest03, diff[1][3] + rest02);
	int with_b2 = diff[0][2] + astc::min(diff[1][1] + rest03, diff[1][0] + rest13, diff[1][3] + rest01);
	int with_b3 = diff[0][3] + astc::min(diff[1][1] + rest02, diff[1][2] + rest01, diff[1][0] + rest12);

	// Halve the result: XOR sees each misplaced texel twice, once as missing
	// from its expected partition and once as present in the wrong one
	return static_cast<uint8_t>(astc::min(with_b0, with_b1, with_b2, with_b3) / 2);
}
// Function pointer type for a routine that takes two bitmap arrays and returns a count.
// NOTE(review): no use of this alias is visible in this part of the file, and its
// unsigned int return type differs from the uint8_t returned by
// partition_mismatch2/3/4 - confirm whether it is still needed.
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
/**
 * @brief Count the partition table mismatches vs the data clustering.
 *
 * One mismatch count is written for each selected partitioning of the given
 * partition count. Any partition count other than 2 or 3 is handled by the
 * 4-partition path.
 *
 * @param      bsd               The block size information.
 * @param      partition_count   The number of partitions in the block.
 * @param      bitmaps           The block texel partition assignment patterns.
 * @param[out] mismatch_counts   The array storing per partitioning mismatch counts.
 */
static void count_partition_mismatch_bits(
	const block_size_descriptor& bsd,
	unsigned int partition_count,
	const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
) {
	unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
	promise(active_count > 0);

	switch (partition_count)
	{
	case 2:
		for (unsigned int p = 0; p < active_count; p++)
		{
			uint8_t mismatch = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[p]);
			mismatch_counts[p] = mismatch;
			assert(mismatch < BLOCK_MAX_KMEANS_TEXELS);
			assert(mismatch < bsd.texel_count);
		}
		break;

	case 3:
		for (unsigned int p = 0; p < active_count; p++)
		{
			uint8_t mismatch = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[p]);
			mismatch_counts[p] = mismatch;
			assert(mismatch < BLOCK_MAX_KMEANS_TEXELS);
			assert(mismatch < bsd.texel_count);
		}
		break;

	default:
		for (unsigned int p = 0; p < active_count; p++)
		{
			uint8_t mismatch = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[p]);
			mismatch_counts[p] = mismatch;
			assert(mismatch < BLOCK_MAX_KMEANS_TEXELS);
			assert(mismatch < bsd.texel_count);
		}
		break;
	}
}
/**
 * @brief Use counting sort on the mismatch array to sort partition candidates.
 *
 * Partitionings with equal mismatch counts keep their original index order.
 *
 * @param      texel_count          The number of histogram buckets that are turned into
 *                                  start offsets; every mismatch count is expected to be
 *                                  below this value.
 * @param      partitioning_count   The number of packed partitionings.
 * @param      mismatch_count       Partitioning mismatch counts, in index order.
 * @param[out] partition_ordering   Partition index values, in mismatch order.
 *
 * @return The number of active partitions in this selection.
 */
static unsigned int get_partition_ordering_by_mismatch_bits(
	unsigned int texel_count,
	unsigned int partitioning_count,
	const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
) {
	promise(partitioning_count > 0);

	// Step 1: histogram of how many partitionings have each mismatch count
	uint16_t bucket[BLOCK_MAX_KMEANS_TEXELS] { 0 };
	for (unsigned int p = 0; p < partitioning_count; p++)
	{
		bucket[mismatch_count[p]]++;
	}

	// Step 2: turn the histogram into an exclusive prefix sum, so that each
	// bucket holds the output position of its first member
	uint16_t running = 0;
	for (unsigned int m = 0; m < texel_count; m++)
	{
		uint16_t population = bucket[m];
		bucket[m] = running;
		running += population;
	}

	// Step 3: scatter each partitioning to its bucket's next free position
	for (unsigned int p = 0; p < partitioning_count; p++)
	{
		uint8_t m = mismatch_count[p];
		unsigned int slot = bucket[m];
		bucket[m]++;
		partition_ordering[slot] = static_cast<uint16_t>(p);
	}

	return partitioning_count;
}
/**
 * @brief Use k-means clustering to compute a partition ordering for a block.
 *
 * The block is clustered with three k-means assignment passes. The resulting
 * per-partition texel bitmaps are compared against the partition tables, and
 * the partitionings are returned sorted by ascending mismatch count.
 *
 * @param      bsd                  The block size information.
 * @param      blk                  The image block color data to compress.
 * @param      partition_count      The desired number of partitions in the block.
 * @param[out] partition_ordering   The list of recommended partition indices, in priority order.
 *
 * @return The number of active partitionings in this selection.
 */
static unsigned int compute_kmeans_partition_ordering(
	const block_size_descriptor& bsd,
	const image_block& blk,
	unsigned int partition_count,
	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
) {
	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
	uint8_t texel_partitions[BLOCK_MAX_TEXELS];

	// First pass starts from freshly initialized centers ...
	kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
	kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);

	// ... and the two refinement passes re-center on the previous assignment
	for (unsigned int pass = 1; pass < 3; pass++)
	{
		kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
		kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
	}

	// Build one bitmap per partition, with one bit per sampled k-means texel
	uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
	unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
	promise(texels_to_process > 0);
	for (unsigned int bit = 0; bit < texels_to_process; bit++)
	{
		unsigned int texel = bsd.kmeans_texels[bit];
		uint8_t owner = texel_partitions[texel];
		bitmaps[owner] |= 1ULL << bit;
	}

	// Score every selected partitioning against the clustered bitmaps
	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
	count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);

	// Order the partitionings from fewest to most mismatched bits
	unsigned int candidate_count = bsd.partitioning_count_selected[partition_count - 1];
	return get_partition_ordering_by_mismatch_bits(
	    texels_to_process,
	    candidate_count,
	    mismatch_counts, partition_ordering);
}
/**
 * @brief Insert a partitioning into an ordered list of results, sorted by error.
 *
 * The lists hold @c max_values entries in ascending error order. A new entry
 * that is not strictly better than the current worst entry is dropped;
 * otherwise it is inserted ahead of the first entry whose error is not lower,
 * and the previous worst entry falls off the end.
 *
 * @param      max_values        The max number of entries in the best result arrays.
 * @param      this_error        The error of the new entry.
 * @param      this_partition    The partition ID of the new entry.
 * @param[out] best_errors       The array of best error values.
 * @param[out] best_partitions   The array of best partition values.
 */
static void insert_result(
	unsigned int max_values,
	float this_error,
	unsigned int this_partition,
	float* best_errors,
	unsigned int* best_partitions)
{
	promise(max_values > 0);

	const unsigned int last = max_values - 1;

	// Nothing to do if the new entry cannot displace the current worst entry
	if (this_error >= best_errors[last])
	{
		return;
	}

	// Skip past every existing entry that is strictly better than the new one
	unsigned int slot = 0;
	while (slot < max_values && this_error > best_errors[slot])
	{
		slot++;
	}

	if (slot == max_values)
	{
		return;
	}

	// Open a gap at the insertion point, discarding the old worst entry
	for (unsigned int j = last; j > slot; j--)
	{
		best_errors[j] = best_errors[j - 1];
		best_partitions[j] = best_partitions[j - 1];
	}

	best_errors[slot] = this_error;
	best_partitions[slot] = this_partition;
}
/* See header for documentation. */
// Implementation outline:
//   1. Order the partitionings with k-means clustering (best guess first).
//   2. For the first partition_search_limit entries of that order, estimate the
//      error under two endpoint models: "uncorrelated chrominance" (line through
//      the partition average along its dominant direction) and "same chrominance"
//      (line through the origin towards the partition average).
//   3. Keep the requested_candidates lowest-error partitionings for each model.
//   4. Interleave the two lists, drop duplicates, and write the result to
//      best_partitions. The return value is the number of entries written.
unsigned int find_best_partition_candidates(
const block_size_descriptor& bsd,
const image_block& blk,
unsigned int partition_count,
unsigned int partition_search_limit,
unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
unsigned int requested_candidates
) {
// Constant used to estimate quantization error for a given partitioning; the optimal value for
// this depends on bitrate. These values have been determined empirically.
unsigned int texels_per_block = bsd.texel_count;
float weight_imprecision_estim = 0.055f;
if (texels_per_block <= 20)
{
weight_imprecision_estim = 0.03f;
}
else if (texels_per_block <= 31)
{
weight_imprecision_estim = 0.04f;
}
else if (texels_per_block <= 41)
{
weight_imprecision_estim = 0.05f;
}
promise(partition_count > 0);
promise(partition_search_limit > 0);
// The estimate is only ever applied to squared vectors, so square it once up front
weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
// Never search more partitionings than exist, nor return more than are searched
partition_search_limit = astc::min(partition_search_limit, sequence_len);
requested_candidates = astc::min(partition_search_limit, requested_candidates);
// Alpha is only included in the error computation if it varies across the block
bool uses_alpha = !blk.is_constant_channel(3);
// Partitioning errors assuming uncorrelated-chrominance endpoints
float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
// Partitioning errors assuming same-chrominance endpoints
float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
// NOTE(review): only the error arrays are initialized here. The partition arrays are
// read below for every index under requested_candidates, so this relies on at least
// that many results being accepted by insert_result - confirm.
for (unsigned int i = 0; i < requested_candidates; i++)
{
uncor_best_errors[i] = ERROR_CALC_DEFAULT;
samec_best_errors[i] = ERROR_CALC_DEFAULT;
}
if (uses_alpha)
{
// RGBA path: all four channels contribute to the error
for (unsigned int i = 0; i < partition_search_limit; i++)
{
unsigned int partition = partition_sequence[i];
const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
// Compute weighting to give to each component in each partition
partition_metrics pms[BLOCK_MAX_PARTITIONS];
compute_avgs_and_dirs_4_comp(pi, blk, pms);
line4 uncor_lines[BLOCK_MAX_PARTITIONS];
line4 samec_lines[BLOCK_MAX_PARTITIONS];
processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
float line_lengths[BLOCK_MAX_PARTITIONS];
for (unsigned int j = 0; j < partition_count; j++)
{
partition_metrics& pm = pms[j];
// Uncorrelated line: through the partition average, along its direction
uncor_lines[j].a = pm.avg;
uncor_lines[j].b = normalize_safe(pm.dir, unit4());
// amod is the line base point with its component along the direction removed
uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
uncor_plines[j].bs = uncor_lines[j].b;
// Same-chroma line: through the origin, towards the partition average
samec_lines[j].a = vfloat4::zero();
samec_lines[j].b = normalize_safe(pm.avg, unit4());
samec_plines[j].amod = vfloat4::zero();
samec_plines[j].bs = samec_lines[j].b;
}
float uncor_error = 0.0f;
float samec_error = 0.0f;
compute_error_squared_rgba(pi,
blk,
uncor_plines,
samec_plines,
line_lengths,
uncor_error,
samec_error);
// Compute an estimate of error introduced by weight quantization imprecision.
// This error is computed as follows, for each partition
// 1: compute the principal-axis vector (full length) in error-space
// 2: convert the principal-axis vector to regular RGB-space
// 3: scale the vector by a constant that estimates average quantization error
// 4: for each texel, square the vector, then do a dot-product with the texel's
// error weight; sum up the results across all texels.
// 4(optimized): square the vector once, then do a dot-product with the average
// texel error, then multiply by the number of texels.
for (unsigned int j = 0; j < partition_count; j++)
{
float tpp = static_cast<float>(pi.partition_texel_count[j]);
vfloat4 error_weights(tpp * weight_imprecision_estim);
// Both models are scaled by the same line_lengths[j] value
vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
samec_error += dot_s(samec_vector * samec_vector, error_weights);
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
else
{
// RGB path: alpha is constant, so only three channels contribute to the error
for (unsigned int i = 0; i < partition_search_limit; i++)
{
unsigned int partition = partition_sequence[i];
const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
// Compute weighting to give to each component in each partition
partition_metrics pms[BLOCK_MAX_PARTITIONS];
compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
partition_lines3 plines[BLOCK_MAX_PARTITIONS];
for (unsigned int j = 0; j < partition_count; j++)
{
partition_metrics& pm = pms[j];
partition_lines3& pl = plines[j];
// Uncorrelated line: through the partition average, along its direction
pl.uncor_line.a = pm.avg;
pl.uncor_line.b = normalize_safe(pm.dir, unit3());
// Same-chroma line: through the origin, towards the partition average
pl.samec_line.a = vfloat4::zero();
pl.samec_line.b = normalize_safe(pm.avg, unit3());
// amod is the line base point with its component along the direction removed
pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
pl.uncor_pline.bs = pl.uncor_line.b;
pl.samec_pline.amod = vfloat4::zero();
pl.samec_pline.bs = pl.samec_line.b;
}
float uncor_error = 0.0f;
float samec_error = 0.0f;
compute_error_squared_rgb(pi,
blk,
plines,
uncor_error,
samec_error);
// Compute an estimate of error introduced by weight quantization imprecision.
// This error is computed as follows, for each partition
// 1: compute the principal-axis vector (full length) in error-space
// 2: convert the principal-axis vector to regular RGB-space
// 3: scale the vector by a constant that estimates average quantization error
// 4: for each texel, square the vector, then do a dot-product with the texel's
// error weight; sum up the results across all texels.
// 4(optimized): square the vector once, then do a dot-product with the average
// texel error, then multiply by the number of texels.
for (unsigned int j = 0; j < partition_count; j++)
{
partition_lines3& pl = plines[j];
float tpp = static_cast<float>(pi.partition_texel_count[j]);
vfloat4 error_weights(tpp * weight_imprecision_estim);
// pl.line_length is not assigned in this function; presumably it is filled in
// by compute_error_squared_rgb above - TODO confirm
vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
samec_error += dot3_s(samec_vector * samec_vector, error_weights);
}
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
}
}
// Interleave the two result lists (uncor first, then samec, for each rank), converting
// each stored index to the partition_index of its partition info
unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
for (unsigned int i = 0; i < requested_candidates; i++)
{
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
}
// One "already emitted" bit per partition index; 1024/64 words gives 1024 bits, so this
// assumes partition_index is always below 1024 - TODO confirm
uint64_t bitmasks[1024/64] { 0 };
unsigned int emitted = 0;
// Deduplicate the first "requested" entries
for (unsigned int i = 0; i < requested_candidates * 2; i++)
{
unsigned int partition = interleave[i];
unsigned int word = partition / 64;
unsigned int bit = partition % 64;
bool written = bitmasks[word] & (1ull << bit);
if (!written)
{
best_partitions[emitted] = partition;
bitmasks[word] |= 1ull << bit;
emitted++;
// Stop as soon as the caller's quota of unique candidates is filled
if (emitted == requested_candidates)
{
break;
}
}
}
return emitted;
}
#endif
File diff suppressed because it is too large Load Diff
+558
View File
@@ -0,0 +1,558 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for creating in-memory ASTC image structures.
*/
#include <cassert>
#include <cstring>
#include "astcenc_internal.h"
/**
 * @brief Loader pipeline function type for data fetch from memory.
 *
 * The arguments are the data pointer and the index offset to the start of the
 * pixel, as taken by the load_texel_* functions in this file.
 */
using pixel_loader = vfloat4(*)(const void*, int);
/**
 * @brief Loader pipeline function type for swizzling data in a vector.
 *
 * The arguments are the source RGBA vector and the swizzle to apply.
 */
using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
/**
 * @brief Loader pipeline function type for converting data in a vector to LNS.
 *
 * The arguments are the RGBA data to encode and the mask of channels that
 * need LNS encoding.
 */
using pixel_converter = vfloat4(*)(vfloat4, vmask4);
/**
 * @brief Load a 8-bit UNORM texel from a data array.
 *
 * @param data          The data pointer.
 * @param base_offset   The index offset to the start of the pixel.
 *
 * @return The four channel values, each divided by 255.
 */
static vfloat4 load_texel_u8(
	const void* data,
	int base_offset
) {
	const uint8_t* texel = static_cast<const uint8_t*>(data) + base_offset;
	vfloat4 channels = int_to_float(vint4(texel));
	return channels / 255.0f;
}
/**
 * @brief Load a 16-bit fp16 texel from a data array.
 *
 * @param data          The data pointer.
 * @param base_offset   The index offset to the start of the pixel.
 *
 * @return The four channel values converted from fp16 to float.
 */
static vfloat4 load_texel_f16(
	const void* data,
	int base_offset
) {
	const uint16_t* texel = static_cast<const uint16_t*>(data) + base_offset;

	// Widen the four raw 16-bit patterns to int lanes before conversion
	int bits_r = texel[0];
	int bits_g = texel[1];
	int bits_b = texel[2];
	int bits_a = texel[3];

	vint4 packed(bits_r, bits_g, bits_b, bits_a);
	return float16_to_float(packed);
}
/**
 * @brief Load a 32-bit float texel from a data array.
 *
 * @param data          The data pointer.
 * @param base_offset   The index offset to the start of the pixel.
 *
 * @return The four channel values, loaded without modification.
 */
static vfloat4 load_texel_f32(
	const void* data,
	int base_offset
) {
	const float* texel = static_cast<const float*>(data) + base_offset;
	return vfloat4(texel);
}
/**
 * @brief Dummy no-op swizzle function.
 *
 * @param data   The source RGBA vector to swizzle.
 * @param swz    The swizzle to use; ignored.
 *
 * @return The input vector, unchanged.
 */
static vfloat4 swz_texel_skip(
	vfloat4 data,
	const astcenc_swizzle& swz
) {
	// The selector is accepted only so that the signature matches swz_texel
	(void)swz;
	return data;
}
/**
 * @brief Swizzle a texel into a new arrangement.
 *
 * @param data   The source RGBA vector to swizzle.
 * @param swz    The swizzle to use.
 *
 * @return The vector with each output channel taken from the selected source.
 */
static vfloat4 swz_texel(
	vfloat4 data,
	const astcenc_swizzle& swz
) {
	// Lookup table: the four source channels first, then the two constants
	ASTCENC_ALIGNAS float lut[6];
	storea(data, lut);
	lut[ASTCENC_SWZ_0] = 0.0f;
	lut[ASTCENC_SWZ_1] = 1.0f;

	float out_r = lut[swz.r];
	float out_g = lut[swz.g];
	float out_b = lut[swz.b];
	float out_a = lut[swz.a];
	return vfloat4(out_r, out_g, out_b, out_a);
}
/**
 * @brief Encode a texel that is entirely LDR linear.
 *
 * @param data       The RGBA data to encode.
 * @param lns_mask   The mask for the HDR channels that need LNS encoding; ignored.
 *
 * @return The input scaled by 65535.
 */
static vfloat4 encode_texel_unorm(
	vfloat4 data,
	vmask4 lns_mask
) {
	// No channel uses LNS on this path, so the mask has nothing to select
	(void)lns_mask;

	vfloat4 scaled = data * 65535.0f;
	return scaled;
}
/**
 * @brief Encode a texel that includes at least some HDR LNS texels.
 *
 * @param data       The RGBA data to encode.
 * @param lns_mask   The mask for the HDR channels that need LNS encoding.
 *
 * @return The LNS encoding for masked channels, and the input scaled by 65535
 *         for the remaining channels.
 */
static vfloat4 encode_texel_lns(
	vfloat4 data,
	vmask4 lns_mask
) {
	// Compute both encodings for every lane, then pick per channel
	vfloat4 as_unorm = data * 65535.0f;
	vfloat4 as_lns = float_to_lns(data);
	return select(as_unorm, as_lns, lns_mask);
}
/* See header for documentation. */
void load_image_block(
	astcenc_profile decode_mode,
	const astcenc_image& img,
	image_block& blk,
	const block_size_descriptor& bsd,
	unsigned int xpos,
	unsigned int ypos,
	unsigned int zpos,
	const astcenc_swizzle& swz
) {
	const unsigned int size_x = img.dim_x;
	const unsigned int size_y = img.dim_y;
	const unsigned int size_z = img.dim_z;

	blk.xpos = xpos;
	blk.ypos = ypos;
	blk.zpos = zpos;

	// Decide which channels are stored as LNS; this works because we impose
	// the same choice everywhere during encode
	const bool hdr_rgb = (decode_mode == ASTCENC_PRF_HDR) ||
	                     (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
	const bool hdr_a = decode_mode == ASTCENC_PRF_HDR;

	uint8_t rgb_lns = hdr_rgb ? 1 : 0;
	uint8_t a_lns = hdr_a ? 1 : 0;

	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
	vmask4 lns_mask = use_lns != vint4::zero();

	// Stage 1 of the pipeline: fetch, chosen by the image data type
	pixel_loader loader;
	switch (img.data_type)
	{
	case ASTCENC_TYPE_F16:
		loader = load_texel_f16;
		break;
	case ASTCENC_TYPE_F32:
		loader = load_texel_f32;
		break;
	default:
		loader = load_texel_u8;
		break;
	}

	// Stage 2 of the pipeline: swizzle, skipped for the identity swizzle
	const bool identity_swz = (swz.r == ASTCENC_SWZ_R) && (swz.g == ASTCENC_SWZ_G) &&
	                          (swz.b == ASTCENC_SWZ_B) && (swz.a == ASTCENC_SWZ_A);
	pixel_swizzler swizzler = identity_swz ? swz_texel_skip : swz_texel;

	// Stage 3 of the pipeline: encode, using LNS only if any channel needs it
	pixel_converter converter = any(lns_mask) ? encode_texel_lns : encode_texel_unorm;

	// Running block statistics
	vfloat4 data_min(1e38f);
	vfloat4 data_mean(0.0f);
	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
	vfloat4 data_max(-1e38f);
	vmask4 grayscalev(true);

	int idx = 0;
	for (unsigned int z = 0; z < bsd.zdim; z++)
	{
		// Coordinates beyond the image edge are clamped to the last valid texel
		unsigned int src_z = astc::min(zpos + z, size_z - 1);
		void* plane = img.data[src_z];

		for (unsigned int y = 0; y < bsd.ydim; y++)
		{
			unsigned int src_y = astc::min(ypos + y, size_y - 1);
			unsigned int row_base = 4 * size_x * src_y;

			for (unsigned int x = 0; x < bsd.xdim; x++)
			{
				unsigned int src_x = astc::min(xpos + x, size_x - 1);

				vfloat4 texel = loader(plane, row_base + (4 * src_x));
				texel = swizzler(texel, swz);
				texel = converter(texel, lns_mask);

				// Compute block metadata
				data_min = min(data_min, texel);
				data_mean += texel * data_mean_scale;
				data_max = max(data_max, texel);
				grayscalev = grayscalev & (texel.swz<0,0,0,0>() == texel.swz<1,1,2,2>());

				blk.data_r[idx] = texel.lane<0>();
				blk.data_g[idx] = texel.lane<1>();
				blk.data_b[idx] = texel.lane<2>();
				blk.data_a[idx] = texel.lane<3>();

				blk.rgb_lns[idx] = rgb_lns;
				blk.alpha_lns[idx] = a_lns;

				idx++;
			}
		}
	}

	// Reverse the encoding so we store origin block in the original format
	vfloat4 first_enc = blk.texel(0);
	vfloat4 first_unorm = first_enc / 65535.0f;
	vfloat4 first_lns = vfloat4::zero();
	if (hdr_rgb || hdr_a)
	{
		first_lns = float16_to_float(lns_to_sf16(float_to_int(first_enc)));
	}

	blk.origin_texel = select(first_unorm, first_lns, lns_mask);

	// Store block metadata
	blk.data_min = data_min;
	blk.data_mean = data_mean;
	blk.data_max = data_max;
	blk.grayscale = all(grayscalev);
}
/* See header for documentation. */
void load_image_block_fast_ldr(
	astcenc_profile decode_mode,
	const astcenc_image& img,
	image_block& blk,
	const block_size_descriptor& bsd,
	unsigned int xpos,
	unsigned int ypos,
	unsigned int zpos,
	const astcenc_swizzle& swz
) {
	// This path reads 8-bit data from plane 0 only, and applies no swizzle
	(void)swz;
	(void)decode_mode;

	const unsigned int size_x = img.dim_x;
	const unsigned int size_y = img.dim_y;

	blk.xpos = xpos;
	blk.ypos = ypos;
	blk.zpos = zpos;

	// Running block statistics; the mean is kept as a sum until the end
	vfloat4 data_min(1e38f);
	vfloat4 data_sum = vfloat4::zero();
	vfloat4 data_max(-1e38f);
	vmask4 grayscalev(true);

	const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
	const unsigned int x_end = xpos + bsd.xdim;
	const unsigned int y_end = ypos + bsd.ydim;

	int idx = 0;
	for (unsigned int y = ypos; y < y_end; y++)
	{
		// Coordinates beyond the image edge are clamped to the last valid texel
		unsigned int src_y = astc::min(y, size_y - 1);
		const uint8_t* row = plane + (4 * size_x * src_y);

		for (unsigned int x = xpos; x < x_end; x++)
		{
			unsigned int src_x = astc::min(x, size_x - 1);

			vint4 raw = vint4(row + (4 * src_x));
			vfloat4 texel = int_to_float(raw) * (65535.0f / 255.0f);

			// Compute block metadata
			data_min = min(data_min, texel);
			data_sum += texel;
			data_max = max(data_max, texel);
			grayscalev = grayscalev & (texel.swz<0,0,0,0>() == texel.swz<1,1,2,2>());

			blk.data_r[idx] = texel.lane<0>();
			blk.data_g[idx] = texel.lane<1>();
			blk.data_b[idx] = texel.lane<2>();
			blk.data_a[idx] = texel.lane<3>();

			idx++;
		}
	}

	// Reverse the encoding so we store origin block in the original format
	blk.origin_texel = blk.texel(0) / 65535.0f;

	// Store block metadata; only the first LNS flag entry is written on this path
	blk.rgb_lns[0] = 0;
	blk.alpha_lns[0] = 0;
	blk.data_min = data_min;
	blk.data_mean = data_sum / static_cast<float>(bsd.texel_count);
	blk.data_max = data_max;
	blk.grayscale = all(grayscalev);
}
/* See header for documentation. */
// Implementation outline: the block texels in blk.data_r/g/b/a are written into
// the image at (xpos, ypos, zpos), clipped to the image dimensions. The output
// swizzle is applied on the way out, including optional Z reconstruction. There
// is one code path per image data type: U8, F16 and F32.
void store_image_block(
astcenc_image& img,
const image_block& blk,
const block_size_descriptor& bsd,
unsigned int xpos,
unsigned int ypos,
unsigned int zpos,
const astcenc_swizzle& swz
) {
// Clip the block footprint against the image. The x_count/y_count values are the
// number of block columns/rows that land inside the image. The nudge values are
// the number of block texels to skip in idx for the clipped columns of one row
// (x_nudge) and for the clipped rows of one plane (y_nudge).
unsigned int x_size = img.dim_x;
unsigned int x_start = xpos;
unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
unsigned int x_count = x_end - x_start;
unsigned int x_nudge = bsd.xdim - x_count;
unsigned int y_size = img.dim_y;
unsigned int y_start = ypos;
unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
unsigned int y_count = y_end - y_start;
unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;
unsigned int z_size = img.dim_z;
unsigned int z_start = zpos;
unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);
// True if any non-identity swizzle
bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
(swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
// True if any swizzle uses Z reconstruct
bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
(swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);
// Index of the next block texel to read; advanced across all planes
int idx = 0;
if (img.data_type == ASTCENC_TYPE_U8)
{
// U8 path: process up to ASTCENC_SIMD_WIDTH texels of a row per iteration
for (unsigned int z = z_start; z < z_end; z++)
{
// Fetch the image plane
uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);
for (unsigned int y = y_start; y < y_end; y++)
{
uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
{
// The last iteration of a row may use fewer lanes than the SIMD width
unsigned int max_texels = ASTCENC_SIMD_WIDTH;
unsigned int used_texels = astc::min(x_count - x, max_texels);
// Unaligned load as rows are not always SIMD_WIDTH long
vfloat data_r(blk.data_r + idx);
vfloat data_g(blk.data_g + idx);
vfloat data_b(blk.data_b + idx);
vfloat data_a(blk.data_a + idx);
// Clamp the upper bound to 1.0, scale to 0..255, and convert to integer
vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);
if (needs_swz)
{
// Table of every selectable source, indexed by swizzle selector. The Z
// entry is only filled in when some channel actually selects it.
vint swizzle_table[7];
swizzle_table[ASTCENC_SWZ_0] = vint(0);
swizzle_table[ASTCENC_SWZ_1] = vint(255);
swizzle_table[ASTCENC_SWZ_R] = data_ri;
swizzle_table[ASTCENC_SWZ_G] = data_gi;
swizzle_table[ASTCENC_SWZ_B] = data_bi;
swizzle_table[ASTCENC_SWZ_A] = data_ai;
if (needs_z)
{
// Reconstruct Z from X (red) and Y (alpha): remap both from 0..1 to
// -1..1, compute z = sqrt(max(0, 1 - x^2 - y^2)), and remap back to 0..1
vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
data_z = max(data_z, 0.0f);
data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);
swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
}
data_ri = swizzle_table[swz.r];
data_gi = swizzle_table[swz.g];
data_bi = swizzle_table[swz.b];
data_ai = swizzle_table[swz.a];
}
// Errors are NaN encoded - convert to magenta error color
// Branch is OK here - it is almost never true so predicts well
// NOTE(review): only the red channel is tested for NaN here - presumably
// error texels are NaN in every channel; confirm.
vmask nan_mask = data_r != data_r;
if (any(nan_mask))
{
data_ri = select(data_ri, vint(0xFF), nan_mask);
data_gi = select(data_gi, vint(0x00), nan_mask);
data_bi = select(data_bi, vint(0xFF), nan_mask);
data_ai = select(data_ai, vint(0xFF), nan_mask);
}
// Pack to RGBA8 and store only the lanes that hold in-range texels
vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
vmask store_mask = vint::lane_id() < vint(used_texels);
store_lanes_masked(data8_row, data_rgbai, store_mask);
data8_row += ASTCENC_SIMD_WIDTH * 4;
idx += used_texels;
}
// Skip the block texels of this row that were clipped by the image edge
idx += x_nudge;
}
// Skip the block rows of this plane that were clipped by the image edge
idx += y_nudge;
}
}
else if (img.data_type == ASTCENC_TYPE_F16)
{
// F16 path: one texel per iteration, converted with float_to_float16
for (unsigned int z = z_start; z < z_end; z++)
{
// Fetch the image plane
uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);
for (unsigned int y = y_start; y < y_end; y++)
{
uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x++)
{
vint4 color;
// NaNs are handled inline - no need to special case
if (needs_swz)
{
// Table of every selectable source, indexed by swizzle selector
float data[7];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
data[ASTCENC_SWZ_R] = blk.data_r[idx];
data[ASTCENC_SWZ_G] = blk.data_g[idx];
data[ASTCENC_SWZ_B] = blk.data_b[idx];
data[ASTCENC_SWZ_A] = blk.data_a[idx];
if (needs_z)
{
// Z reconstruction as in the U8 path. Literal indices 0 and 3 are presumably
// the ASTCENC_SWZ_R and ASTCENC_SWZ_A slots - TODO confirm
float xN = (data[0] * 2.0f) - 1.0f;
float yN = (data[3] * 2.0f) - 1.0f;
float zN = 1.0f - xN * xN - yN * yN;
if (zN < 0.0f)
{
zN = 0.0f;
}
data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
}
vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
color = float_to_float16(colorf);
}
else
{
vfloat4 colorf = blk.texel(idx);
color = float_to_float16(colorf);
}
// TODO: Vectorize with store N shorts?
data16_row[0] = static_cast<uint16_t>(color.lane<0>());
data16_row[1] = static_cast<uint16_t>(color.lane<1>());
data16_row[2] = static_cast<uint16_t>(color.lane<2>());
data16_row[3] = static_cast<uint16_t>(color.lane<3>());
data16_row += 4;
idx++;
}
// Skip the block texels of this row that were clipped by the image edge
idx += x_nudge;
}
// Skip the block rows of this plane that were clipped by the image edge
idx += y_nudge;
}
}
else // if (img.data_type == ASTCENC_TYPE_F32)
{
// F32 path: one texel per iteration, stored without conversion
assert(img.data_type == ASTCENC_TYPE_F32);
for (unsigned int z = z_start; z < z_end; z++)
{
// Fetch the image plane
float* data32 = static_cast<float*>(img.data[z]);
for (unsigned int y = y_start; y < y_end; y++)
{
float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);
for (unsigned int x = 0; x < x_count; x++)
{
vfloat4 color = blk.texel(idx);
// NaNs are handled inline - no need to special case
if (needs_swz)
{
// Table of every selectable source, indexed by swizzle selector
float data[7];
data[ASTCENC_SWZ_0] = 0.0f;
data[ASTCENC_SWZ_1] = 1.0f;
data[ASTCENC_SWZ_R] = color.lane<0>();
data[ASTCENC_SWZ_G] = color.lane<1>();
data[ASTCENC_SWZ_B] = color.lane<2>();
data[ASTCENC_SWZ_A] = color.lane<3>();
if (needs_z)
{
// Z reconstruction as in the U8 path. Literal indices 0 and 3 are presumably
// the ASTCENC_SWZ_R and ASTCENC_SWZ_A slots - TODO confirm
float xN = (data[0] * 2.0f) - 1.0f;
float yN = (data[3] * 2.0f) - 1.0f;
float zN = 1.0f - xN * xN - yN * yN;
if (zN < 0.0f)
{
zN = 0.0f;
}
data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
}
color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
}
store(color, data32_row);
data32_row += 4;
idx++;
}
// Skip the block texels of this row that were clipped by the image edge
idx += x_nudge;
}
// Skip the block rows of this plane that were clipped by the image edge
idx += y_nudge;
}
}
}
@@ -0,0 +1,739 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for encoding/decoding Bounded Integer Sequence Encoding.
*/
#include "astcenc_internal.h"
#include <array>
/**
 * @brief Unpacked quint triplets <low,middle,high> for each packed value.
 *
 * The table is indexed by the packed value (0..127). Each entry holds three
 * quint values in the range 0..4. The mapping is not one-to-one: packed values
 * 7, 15, 23 and 31 all unpack to {4, 4, 4}.
 */
// TODO: Bitpack these into a uint16_t?
static const uint8_t quints_of_integer[128][3] {
{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
{0, 1, 0}, {1, 1, 0}, {2, 1, 0}, {3, 1, 0},
{4, 1, 0}, {1, 4, 0}, {4, 4, 1}, {4, 4, 4},
{0, 2, 0}, {1, 2, 0}, {2, 2, 0}, {3, 2, 0},
{4, 2, 0}, {2, 4, 0}, {4, 4, 2}, {4, 4, 4},
{0, 3, 0}, {1, 3, 0}, {2, 3, 0}, {3, 3, 0},
{4, 3, 0}, {3, 4, 0}, {4, 4, 3}, {4, 4, 4},
{0, 0, 1}, {1, 0, 1}, {2, 0, 1}, {3, 0, 1},
{4, 0, 1}, {0, 4, 1}, {4, 0, 4}, {0, 4, 4},
{0, 1, 1}, {1, 1, 1}, {2, 1, 1}, {3, 1, 1},
{4, 1, 1}, {1, 4, 1}, {4, 1, 4}, {1, 4, 4},
{0, 2, 1}, {1, 2, 1}, {2, 2, 1}, {3, 2, 1},
{4, 2, 1}, {2, 4, 1}, {4, 2, 4}, {2, 4, 4},
{0, 3, 1}, {1, 3, 1}, {2, 3, 1}, {3, 3, 1},
{4, 3, 1}, {3, 4, 1}, {4, 3, 4}, {3, 4, 4},
{0, 0, 2}, {1, 0, 2}, {2, 0, 2}, {3, 0, 2},
{4, 0, 2}, {0, 4, 2}, {2, 0, 4}, {3, 0, 4},
{0, 1, 2}, {1, 1, 2}, {2, 1, 2}, {3, 1, 2},
{4, 1, 2}, {1, 4, 2}, {2, 1, 4}, {3, 1, 4},
{0, 2, 2}, {1, 2, 2}, {2, 2, 2}, {3, 2, 2},
{4, 2, 2}, {2, 4, 2}, {2, 2, 4}, {3, 2, 4},
{0, 3, 2}, {1, 3, 2}, {2, 3, 2}, {3, 3, 2},
{4, 3, 2}, {3, 4, 2}, {2, 3, 4}, {3, 3, 4},
{0, 0, 3}, {1, 0, 3}, {2, 0, 3}, {3, 0, 3},
{4, 0, 3}, {0, 4, 3}, {0, 0, 4}, {1, 0, 4},
{0, 1, 3}, {1, 1, 3}, {2, 1, 3}, {3, 1, 3},
{4, 1, 3}, {1, 4, 3}, {0, 1, 4}, {1, 1, 4},
{0, 2, 3}, {1, 2, 3}, {2, 2, 3}, {3, 2, 3},
{4, 2, 3}, {2, 4, 3}, {0, 2, 4}, {1, 2, 4},
{0, 3, 3}, {1, 3, 3}, {2, 3, 3}, {3, 3, 3},
{4, 3, 3}, {3, 4, 3}, {0, 3, 4}, {1, 3, 4}
};
/**
 * @brief Packed quint values for each unpacked value, indexed [hi][mid][lo].
 *
 * The encoder looks this up with the quint of the third value of a three-value block as the
 * outermost index and the quint of the first value as the innermost index. Each index is a
 * single quint in the range 0 to 4.
 */
static const uint8_t integer_of_quints[5][5][5] {
{
{0, 1, 2, 3, 4},
{8, 9, 10, 11, 12},
{16, 17, 18, 19, 20},
{24, 25, 26, 27, 28},
{5, 13, 21, 29, 6}
},
{
{32, 33, 34, 35, 36},
{40, 41, 42, 43, 44},
{48, 49, 50, 51, 52},
{56, 57, 58, 59, 60},
{37, 45, 53, 61, 14}
},
{
{64, 65, 66, 67, 68},
{72, 73, 74, 75, 76},
{80, 81, 82, 83, 84},
{88, 89, 90, 91, 92},
{69, 77, 85, 93, 22}
},
{
{96, 97, 98, 99, 100},
{104, 105, 106, 107, 108},
{112, 113, 114, 115, 116},
{120, 121, 122, 123, 124},
{101, 109, 117, 125, 30}
},
{
{102, 103, 70, 71, 38},
{110, 111, 78, 79, 46},
{118, 119, 86, 87, 54},
{126, 127, 94, 95, 62},
{39, 47, 55, 63, 31}
}
};
/**
 * @brief Unpacked trit quintuplets <low,...,high> for each packed value.
 *
 * The table is indexed by the packed 8-bit trit-block value (256 entries). When decoding, element
 * [0] of the selected row is merged into the first value of a five-value block and element [4]
 * into the fifth.
 */
// TODO: Bitpack these into a uint16_t?
static const uint8_t trits_of_integer[256][5] {
{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
{0, 2, 0, 0, 0}, {1, 2, 0, 0, 0}, {2, 2, 0, 0, 0}, {2, 0, 2, 0, 0},
{0, 2, 2, 0, 0}, {1, 2, 2, 0, 0}, {2, 2, 2, 0, 0}, {2, 0, 2, 0, 0},
{0, 0, 1, 0, 0}, {1, 0, 1, 0, 0}, {2, 0, 1, 0, 0}, {0, 1, 2, 0, 0},
{0, 1, 1, 0, 0}, {1, 1, 1, 0, 0}, {2, 1, 1, 0, 0}, {1, 1, 2, 0, 0},
{0, 2, 1, 0, 0}, {1, 2, 1, 0, 0}, {2, 2, 1, 0, 0}, {2, 1, 2, 0, 0},
{0, 0, 0, 2, 2}, {1, 0, 0, 2, 2}, {2, 0, 0, 2, 2}, {0, 0, 2, 2, 2},
{0, 0, 0, 1, 0}, {1, 0, 0, 1, 0}, {2, 0, 0, 1, 0}, {0, 0, 2, 1, 0},
{0, 1, 0, 1, 0}, {1, 1, 0, 1, 0}, {2, 1, 0, 1, 0}, {1, 0, 2, 1, 0},
{0, 2, 0, 1, 0}, {1, 2, 0, 1, 0}, {2, 2, 0, 1, 0}, {2, 0, 2, 1, 0},
{0, 2, 2, 1, 0}, {1, 2, 2, 1, 0}, {2, 2, 2, 1, 0}, {2, 0, 2, 1, 0},
{0, 0, 1, 1, 0}, {1, 0, 1, 1, 0}, {2, 0, 1, 1, 0}, {0, 1, 2, 1, 0},
{0, 1, 1, 1, 0}, {1, 1, 1, 1, 0}, {2, 1, 1, 1, 0}, {1, 1, 2, 1, 0},
{0, 2, 1, 1, 0}, {1, 2, 1, 1, 0}, {2, 2, 1, 1, 0}, {2, 1, 2, 1, 0},
{0, 1, 0, 2, 2}, {1, 1, 0, 2, 2}, {2, 1, 0, 2, 2}, {1, 0, 2, 2, 2},
{0, 0, 0, 2, 0}, {1, 0, 0, 2, 0}, {2, 0, 0, 2, 0}, {0, 0, 2, 2, 0},
{0, 1, 0, 2, 0}, {1, 1, 0, 2, 0}, {2, 1, 0, 2, 0}, {1, 0, 2, 2, 0},
{0, 2, 0, 2, 0}, {1, 2, 0, 2, 0}, {2, 2, 0, 2, 0}, {2, 0, 2, 2, 0},
{0, 2, 2, 2, 0}, {1, 2, 2, 2, 0}, {2, 2, 2, 2, 0}, {2, 0, 2, 2, 0},
{0, 0, 1, 2, 0}, {1, 0, 1, 2, 0}, {2, 0, 1, 2, 0}, {0, 1, 2, 2, 0},
{0, 1, 1, 2, 0}, {1, 1, 1, 2, 0}, {2, 1, 1, 2, 0}, {1, 1, 2, 2, 0},
{0, 2, 1, 2, 0}, {1, 2, 1, 2, 0}, {2, 2, 1, 2, 0}, {2, 1, 2, 2, 0},
{0, 2, 0, 2, 2}, {1, 2, 0, 2, 2}, {2, 2, 0, 2, 2}, {2, 0, 2, 2, 2},
{0, 0, 0, 0, 2}, {1, 0, 0, 0, 2}, {2, 0, 0, 0, 2}, {0, 0, 2, 0, 2},
{0, 1, 0, 0, 2}, {1, 1, 0, 0, 2}, {2, 1, 0, 0, 2}, {1, 0, 2, 0, 2},
{0, 2, 0, 0, 2}, {1, 2, 0, 0, 2}, {2, 2, 0, 0, 2}, {2, 0, 2, 0, 2},
{0, 2, 2, 0, 2}, {1, 2, 2, 0, 2}, {2, 2, 2, 0, 2}, {2, 0, 2, 0, 2},
{0, 0, 1, 0, 2}, {1, 0, 1, 0, 2}, {2, 0, 1, 0, 2}, {0, 1, 2, 0, 2},
{0, 1, 1, 0, 2}, {1, 1, 1, 0, 2}, {2, 1, 1, 0, 2}, {1, 1, 2, 0, 2},
{0, 2, 1, 0, 2}, {1, 2, 1, 0, 2}, {2, 2, 1, 0, 2}, {2, 1, 2, 0, 2},
{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 0, 2, 2, 2},
{0, 0, 0, 0, 1}, {1, 0, 0, 0, 1}, {2, 0, 0, 0, 1}, {0, 0, 2, 0, 1},
{0, 1, 0, 0, 1}, {1, 1, 0, 0, 1}, {2, 1, 0, 0, 1}, {1, 0, 2, 0, 1},
{0, 2, 0, 0, 1}, {1, 2, 0, 0, 1}, {2, 2, 0, 0, 1}, {2, 0, 2, 0, 1},
{0, 2, 2, 0, 1}, {1, 2, 2, 0, 1}, {2, 2, 2, 0, 1}, {2, 0, 2, 0, 1},
{0, 0, 1, 0, 1}, {1, 0, 1, 0, 1}, {2, 0, 1, 0, 1}, {0, 1, 2, 0, 1},
{0, 1, 1, 0, 1}, {1, 1, 1, 0, 1}, {2, 1, 1, 0, 1}, {1, 1, 2, 0, 1},
{0, 2, 1, 0, 1}, {1, 2, 1, 0, 1}, {2, 2, 1, 0, 1}, {2, 1, 2, 0, 1},
{0, 0, 1, 2, 2}, {1, 0, 1, 2, 2}, {2, 0, 1, 2, 2}, {0, 1, 2, 2, 2},
{0, 0, 0, 1, 1}, {1, 0, 0, 1, 1}, {2, 0, 0, 1, 1}, {0, 0, 2, 1, 1},
{0, 1, 0, 1, 1}, {1, 1, 0, 1, 1}, {2, 1, 0, 1, 1}, {1, 0, 2, 1, 1},
{0, 2, 0, 1, 1}, {1, 2, 0, 1, 1}, {2, 2, 0, 1, 1}, {2, 0, 2, 1, 1},
{0, 2, 2, 1, 1}, {1, 2, 2, 1, 1}, {2, 2, 2, 1, 1}, {2, 0, 2, 1, 1},
{0, 0, 1, 1, 1}, {1, 0, 1, 1, 1}, {2, 0, 1, 1, 1}, {0, 1, 2, 1, 1},
{0, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 1, 1}, {1, 1, 2, 1, 1},
{0, 2, 1, 1, 1}, {1, 2, 1, 1, 1}, {2, 2, 1, 1, 1}, {2, 1, 2, 1, 1},
{0, 1, 1, 2, 2}, {1, 1, 1, 2, 2}, {2, 1, 1, 2, 2}, {1, 1, 2, 2, 2},
{0, 0, 0, 2, 1}, {1, 0, 0, 2, 1}, {2, 0, 0, 2, 1}, {0, 0, 2, 2, 1},
{0, 1, 0, 2, 1}, {1, 1, 0, 2, 1}, {2, 1, 0, 2, 1}, {1, 0, 2, 2, 1},
{0, 2, 0, 2, 1}, {1, 2, 0, 2, 1}, {2, 2, 0, 2, 1}, {2, 0, 2, 2, 1},
{0, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {2, 2, 2, 2, 1}, {2, 0, 2, 2, 1},
{0, 0, 1, 2, 1}, {1, 0, 1, 2, 1}, {2, 0, 1, 2, 1}, {0, 1, 2, 2, 1},
{0, 1, 1, 2, 1}, {1, 1, 1, 2, 1}, {2, 1, 1, 2, 1}, {1, 1, 2, 2, 1},
{0, 2, 1, 2, 1}, {1, 2, 1, 2, 1}, {2, 2, 1, 2, 1}, {2, 1, 2, 2, 1},
{0, 2, 1, 2, 2}, {1, 2, 1, 2, 2}, {2, 2, 1, 2, 2}, {2, 1, 2, 2, 2},
{0, 0, 0, 1, 2}, {1, 0, 0, 1, 2}, {2, 0, 0, 1, 2}, {0, 0, 2, 1, 2},
{0, 1, 0, 1, 2}, {1, 1, 0, 1, 2}, {2, 1, 0, 1, 2}, {1, 0, 2, 1, 2},
{0, 2, 0, 1, 2}, {1, 2, 0, 1, 2}, {2, 2, 0, 1, 2}, {2, 0, 2, 1, 2},
{0, 2, 2, 1, 2}, {1, 2, 2, 1, 2}, {2, 2, 2, 1, 2}, {2, 0, 2, 1, 2},
{0, 0, 1, 1, 2}, {1, 0, 1, 1, 2}, {2, 0, 1, 1, 2}, {0, 1, 2, 1, 2},
{0, 1, 1, 1, 2}, {1, 1, 1, 1, 2}, {2, 1, 1, 1, 2}, {1, 1, 2, 1, 2},
{0, 2, 1, 1, 2}, {1, 2, 1, 1, 2}, {2, 2, 1, 1, 2}, {2, 1, 2, 1, 2},
{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 1, 2, 2, 2}
};
/**
 * @brief Packed trit values for each unpacked value, indexed [hi][][][][lo].
 *
 * The encoder looks this up with the trit of the fifth value of a five-value block as the
 * outermost index and the trit of the first value as the innermost index. Each index is a single
 * trit in the range 0 to 2.
 */
static const uint8_t integer_of_trits[3][3][3][3][3] {
{
{
{
{0, 1, 2},
{4, 5, 6},
{8, 9, 10}
},
{
{16, 17, 18},
{20, 21, 22},
{24, 25, 26}
},
{
{3, 7, 15},
{19, 23, 27},
{12, 13, 14}
}
},
{
{
{32, 33, 34},
{36, 37, 38},
{40, 41, 42}
},
{
{48, 49, 50},
{52, 53, 54},
{56, 57, 58}
},
{
{35, 39, 47},
{51, 55, 59},
{44, 45, 46}
}
},
{
{
{64, 65, 66},
{68, 69, 70},
{72, 73, 74}
},
{
{80, 81, 82},
{84, 85, 86},
{88, 89, 90}
},
{
{67, 71, 79},
{83, 87, 91},
{76, 77, 78}
}
}
},
{
{
{
{128, 129, 130},
{132, 133, 134},
{136, 137, 138}
},
{
{144, 145, 146},
{148, 149, 150},
{152, 153, 154}
},
{
{131, 135, 143},
{147, 151, 155},
{140, 141, 142}
}
},
{
{
{160, 161, 162},
{164, 165, 166},
{168, 169, 170}
},
{
{176, 177, 178},
{180, 181, 182},
{184, 185, 186}
},
{
{163, 167, 175},
{179, 183, 187},
{172, 173, 174}
}
},
{
{
{192, 193, 194},
{196, 197, 198},
{200, 201, 202}
},
{
{208, 209, 210},
{212, 213, 214},
{216, 217, 218}
},
{
{195, 199, 207},
{211, 215, 219},
{204, 205, 206}
}
}
},
{
{
{
{96, 97, 98},
{100, 101, 102},
{104, 105, 106}
},
{
{112, 113, 114},
{116, 117, 118},
{120, 121, 122}
},
{
{99, 103, 111},
{115, 119, 123},
{108, 109, 110}
}
},
{
{
{224, 225, 226},
{228, 229, 230},
{232, 233, 234}
},
{
{240, 241, 242},
{244, 245, 246},
{248, 249, 250}
},
{
{227, 231, 239},
{243, 247, 251},
{236, 237, 238}
}
},
{
{
{28, 29, 30},
{60, 61, 62},
{92, 93, 94}
},
{
{156, 157, 158},
{188, 189, 190},
{220, 221, 222}
},
{
{31, 63, 127},
{159, 191, 255},
{252, 253, 254}
}
}
}
};
/**
 * @brief The number of bits, trits, and quints needed for a quant level.
 *
 * The three bitfields total 8 bits. The trit and quint fields are a single bit wide, so they can
 * only hold 0 or 1 and are used as presence flags by the encoder and decoder.
 */
struct btq_count
{
/** @brief The number of bits. */
uint8_t bits:6;
/** @brief The number of trits (0 or 1). */
uint8_t trits:1;
/** @brief The number of quints (0 or 1). */
uint8_t quints:1;
};
/**
 * @brief The table of bits, trits, and quints needed for a quant encode.
 *
 * Indexed by quant level; each row is { bits, trits, quints } and the trailing comment names the
 * quant level the row belongs to. No row sets both the trit and the quint flag.
 */
static const std::array<btq_count, 21> btq_counts {{
{ 1, 0, 0 }, // QUANT_2
{ 0, 1, 0 }, // QUANT_3
{ 2, 0, 0 }, // QUANT_4
{ 0, 0, 1 }, // QUANT_5
{ 1, 1, 0 }, // QUANT_6
{ 3, 0, 0 }, // QUANT_8
{ 1, 0, 1 }, // QUANT_10
{ 2, 1, 0 }, // QUANT_12
{ 4, 0, 0 }, // QUANT_16
{ 2, 0, 1 }, // QUANT_20
{ 3, 1, 0 }, // QUANT_24
{ 5, 0, 0 }, // QUANT_32
{ 3, 0, 1 }, // QUANT_40
{ 4, 1, 0 }, // QUANT_48
{ 6, 0, 0 }, // QUANT_64
{ 4, 0, 1 }, // QUANT_80
{ 5, 1, 0 }, // QUANT_96
{ 7, 0, 0 }, // QUANT_128
{ 5, 0, 1 }, // QUANT_160
{ 6, 1, 0 }, // QUANT_192
{ 8, 0, 0 } // QUANT_256
}};
/**
 * @brief The sequence scale and divisor needed to compute sizing.
 *
 * The length of a quantized sequence in bits is the rounded-up division:
 *
 *     (scale * <sequence_len> + real_divisor - 1) / real_divisor
 *
 * where real_divisor is (2 * divisor) + 1. The stored divisor field is only two bits wide, so the
 * values 0, 1, and 2 used by the table stand for real divisors of 1, 3, and 5.
 */
struct ise_size
{
/** @brief The scaling parameter. */
uint8_t scale:6;
/** @brief The divisor parameter, stored as (real_divisor - 1) / 2. */
uint8_t divisor:2;
};
/**
 * @brief The table of scales and divisors needed for quant sizing.
 *
 * Indexed by quant level; each row is { scale, divisor } where the divisor is stored in its
 * compact form (0, 1, 2 meaning divide by 1, 3, 5). For example the QUANT_3 row { 8, 2 } sizes a
 * sequence of n values as (8 * n) / 5 bits, rounded up.
 */
static const std::array<ise_size, 21> ise_sizes {{
{ 1, 0 }, // QUANT_2
{ 8, 2 }, // QUANT_3
{ 2, 0 }, // QUANT_4
{ 7, 1 }, // QUANT_5
{ 13, 2 }, // QUANT_6
{ 3, 0 }, // QUANT_8
{ 10, 1 }, // QUANT_10
{ 18, 2 }, // QUANT_12
{ 4, 0 }, // QUANT_16
{ 13, 1 }, // QUANT_20
{ 23, 2 }, // QUANT_24
{ 5, 0 }, // QUANT_32
{ 16, 1 }, // QUANT_40
{ 28, 2 }, // QUANT_48
{ 6, 0 }, // QUANT_64
{ 19, 1 }, // QUANT_80
{ 33, 2 }, // QUANT_96
{ 7, 0 }, // QUANT_128
{ 22, 1 }, // QUANT_160
{ 38, 2 }, // QUANT_192
{ 8, 0 } // QUANT_256
}};
/* See header for documentation. */
unsigned int get_ise_sequence_bitcount(
unsigned int character_count,
quant_method quant_level
) {
// Cope with out-of bounds values - input might be invalid
if (static_cast<size_t>(quant_level) >= ise_sizes.size())
{
// Arbitrary large number that's more than an ASTC block can hold
return 1024;
}
auto& entry = ise_sizes[quant_level];
unsigned int divisor = (entry.divisor << 1) + 1;
return (entry.scale * character_count + divisor - 1) / divisor;
}
/**
 * @brief Write up to 8 bits at an arbitrary bit offset.
 *
 * The field is at most 8 bits wide but may start part way through a byte, so it can straddle two
 * adjacent bytes. Both bytes are always read and written back, even when the field fits entirely
 * in the first one; bits outside the field are preserved.
 *
 * @param         value       The value to write; bits above @c bitcount are ignored.
 * @param         bitcount    The number of bits to write, starting from LSB.
 * @param         bitoffset   The bit offset to store at; whole bytes in the offset advance @c ptr.
 * @param[in,out] ptr         The data pointer to write to.
 */
static inline void write_bits(
	unsigned int value,
	unsigned int bitcount,
	unsigned int bitoffset,
	uint8_t ptr[2]
) {
	// Split the offset into a byte step and a shift within the first byte
	uint8_t* dst = ptr + (bitoffset >> 3);
	unsigned int shift = bitoffset & 7;

	// Build the field and the mask of bits to keep, both in their shifted position
	unsigned int field_mask = (1 << bitcount) - 1;
	unsigned int field = (value & field_mask) << shift;
	unsigned int keep_mask = ~(field_mask << shift);

	// Low byte takes bits 0-7 of the shifted field, high byte takes bits 8-15
	dst[0] = (dst[0] & keep_mask) | field;
	dst[1] = (dst[1] & (keep_mask >> 8)) | (field >> 8);
}
/**
 * @brief Read a bit field from a two byte window.
 *
 * This function reads a packed N-bit field from memory. The field can start at an arbitrary bit
 * offset and span two adjacent bytes, but must lie entirely within those two bytes. Both bytes are
 * always read, even when the field fits in the first one.
 *
 * @param     bitcount    The number of bits to read.
 * @param     bitoffset   The bit offset to read from; whole bytes in the offset advance @c ptr.
 * @param[in] ptr         The data pointer to read from.
 *
 * @return The read value.
 */
static inline unsigned int read_bits(
	unsigned int bitcount,
	unsigned int bitoffset,
	const uint8_t* ptr
) {
	// Split the offset into a byte step and a shift within the first byte
	const uint8_t* src = ptr + (bitoffset >> 3);
	unsigned int shift = bitoffset & 7;

	// Assemble the 16-bit little-endian window holding the field
	unsigned int window = src[0] | (src[1] << 8);

	// Drop the bits below the field, then those above it
	unsigned int field_mask = (1 << bitcount) - 1;
	return (window >> shift) & field_mask;
}
/*
 * See header for documentation.
 *
 * Implementation notes:
 *
 * Each input value carries its plain bits in the low "bits" bits and its trit or quint digit in
 * the bits above them. The plain bits of every value are written individually; the digits of a
 * group of values are first combined into one packed value T via a lookup table, and slices of T
 * are then interleaved after the plain bits of each value in the group:
 *
 *   * Trits:  groups of 5 values share an 8-bit T, sliced as 2, 2, 1, 2, 1 bits.
 *   * Quints: groups of 3 values share a 7-bit T, sliced as 3, 2, 2 bits.
 *
 * A final short group is encoded as if the missing values had a digit of zero, and only the
 * fields of the values actually present are written.
 *
 * All output goes through write_bits(), which is a read-modify-write of two bytes, so bits in
 * output_data that are outside the written fields are preserved.
 */
void encode_ise(
quant_method quant_level,
unsigned int character_count,
const uint8_t* input_data,
uint8_t* output_data,
unsigned int bit_offset
) {
promise(character_count > 0);
// Look up how this quant level splits into plain bits plus an optional trit or quint
unsigned int bits = btq_counts[quant_level].bits;
unsigned int trits = btq_counts[quant_level].trits;
unsigned int quints = btq_counts[quant_level].quints;
// Mask selecting the plain bits of an input value
unsigned int mask = (1 << bits) - 1;
// Write out trits and bits
if (trits)
{
unsigned int i = 0;
unsigned int full_trit_blocks = character_count / 5;
for (unsigned int j = 0; j < full_trit_blocks; j++)
{
// Extract the trit of each of the five values and pack them into T
unsigned int i4 = input_data[i + 4] >> bits;
unsigned int i3 = input_data[i + 3] >> bits;
unsigned int i2 = input_data[i + 2] >> bits;
unsigned int i1 = input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
// The max size of a trit bit count is 6, so we can always safely
// pack a single MX value with the following 1 or 2 T bits.
uint8_t pack;
// Element 0 + T0 + T1
pack = (input_data[i++] & mask) | (((T >> 0) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 1 + T2 + T3
pack = (input_data[i++] & mask) | (((T >> 2) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 2 + T4
pack = (input_data[i++] & mask) | (((T >> 4) & 0x1) << bits);
write_bits(pack, bits + 1, bit_offset, output_data);
bit_offset += bits + 1;
// Element 3 + T5 + T6
pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 4 + T7
pack = (input_data[i++] & mask) | (((T >> 7) & 0x1) << bits);
write_bits(pack, bits + 1, bit_offset, output_data);
bit_offset += bits + 1;
}
// Loop tail for a partial block
if (i != character_count)
{
// i4 cannot be present - we know the block is partial
// i0 must be present - we know the block isn't empty
// Values past the end of the input are encoded as a zero trit
unsigned int i4 = 0;
unsigned int i3 = i + 3 >= character_count ? 0 : input_data[i + 3] >> bits;
unsigned int i2 = i + 2 >= character_count ? 0 : input_data[i + 2] >> bits;
unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
for (unsigned int j = 0; i < character_count; i++, j++)
{
// Truncated table as this iteration is always partial
static const uint8_t tbits[4] { 2, 2, 1, 2 };
static const uint8_t tshift[4] { 0, 2, 4, 5 };
uint8_t pack = (input_data[i] & mask) |
(((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
write_bits(pack, bits + tbits[j], bit_offset, output_data);
bit_offset += bits + tbits[j];
}
}
}
// Write out quints and bits
else if (quints)
{
unsigned int i = 0;
unsigned int full_quint_blocks = character_count / 3;
for (unsigned int j = 0; j < full_quint_blocks; j++)
{
// Extract the quint of each of the three values and pack them into T
unsigned int i2 = input_data[i + 2] >> bits;
unsigned int i1 = input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_quints[i2][i1][i0];
// The max size of a quint bit count is 5, so we can always safely
// pack a single M value with the following 2 or 3 T bits.
uint8_t pack;
// Element 0 + T0 + T1 + T2
pack = (input_data[i++] & mask) | (((T >> 0) & 0x7) << bits);
write_bits(pack, bits + 3, bit_offset, output_data);
bit_offset += bits + 3;
// Element 1 + T3 + T4
pack = (input_data[i++] & mask) | (((T >> 3) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
// Element 2 + T5 + T6
pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
write_bits(pack, bits + 2, bit_offset, output_data);
bit_offset += bits + 2;
}
// Loop tail for a partial block
if (i != character_count)
{
// i2 cannot be present - we know the block is partial
// i0 must be present - we know the block isn't empty
// Values past the end of the input are encoded as a zero quint
unsigned int i2 = 0;
unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
unsigned int i0 = input_data[i + 0] >> bits;
uint8_t T = integer_of_quints[i2][i1][i0];
for (unsigned int j = 0; i < character_count; i++, j++)
{
// Truncated table as this iteration is always partial
static const uint8_t tbits[2] { 3, 2 };
static const uint8_t tshift[2] { 0, 3 };
uint8_t pack = (input_data[i] & mask) |
(((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
write_bits(pack, bits + tbits[j], bit_offset, output_data);
bit_offset += bits + tbits[j];
}
}
}
// Write out just bits
else
{
for (unsigned int i = 0; i < character_count; i++)
{
write_bits(input_data[i], bits, bit_offset, output_data);
bit_offset += bits;
}
}
}
/*
 * See header for documentation.
 *
 * Implementation notes:
 *
 * Decoding runs in two passes. The first pass walks the bit stream, storing the plain bits of each
 * value and accumulating the interleaved slices of each packed trit or quint block. The second
 * pass expands every packed block through a lookup table and merges the resulting digits in above
 * the plain bits of the values in that block.
 */
void decode_ise(
	quant_method quant_level,
	unsigned int character_count,
	const uint8_t* input_data,
	uint8_t* output_data,
	unsigned int bit_offset
) {
	promise(character_count > 0);

	// Note: due to how the trit/quint-block unpacking is done in this function, we may write more
	// temporary results than the number of outputs. The maximum actual number of results is 64,
	// but we keep 4 additional entries of padding.
	//
	// The unpacking loops OR digits into every entry of a block, including padding entries past
	// character_count that the first pass never stored to, so the array must start zeroed to
	// avoid reading indeterminate values.
	uint8_t results[68] { 0 };
	uint8_t tq_blocks[22] { 0 }; // Trit-blocks or quint-blocks, must be zeroed

	unsigned int bits = btq_counts[quant_level].bits;
	unsigned int trits = btq_counts[quant_level].trits;
	unsigned int quints = btq_counts[quant_level].quints;

	// lcounter is the position within the current block, hcounter is the block index
	unsigned int lcounter = 0;
	unsigned int hcounter = 0;

	// Collect bits for each element, as well as bits for any trit-blocks and quint-blocks.
	for (unsigned int i = 0; i < character_count; i++)
	{
		results[i] = static_cast<uint8_t>(read_bits(bits, bit_offset, input_data));
		bit_offset += bits;

		if (trits)
		{
			// An 8-bit trit block is spread over five values as 2, 2, 1, 2, 1 bits
			static const uint8_t bits_to_read[5]  { 2, 2, 1, 2, 1 };
			static const uint8_t block_shift[5]   { 0, 2, 4, 5, 7 };
			static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 };
			static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 };
			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
			bit_offset += bits_to_read[lcounter];
			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
			hcounter += hcounter_incr[lcounter];
			lcounter = next_lcounter[lcounter];
		}

		if (quints)
		{
			// A 7-bit quint block is spread over three values as 3, 2, 2 bits
			static const uint8_t bits_to_read[3]  { 3, 2, 2 };
			static const uint8_t block_shift[3]   { 0, 3, 5 };
			static const uint8_t next_lcounter[3] { 1, 2, 0 };
			static const uint8_t hcounter_incr[3] { 0, 0, 1 };
			unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data);
			bit_offset += bits_to_read[lcounter];
			tq_blocks[hcounter] |= tdata << block_shift[lcounter];
			hcounter += hcounter_incr[lcounter];
			lcounter = next_lcounter[lcounter];
		}
	}

	// Unpack trit-blocks or quint-blocks as needed
	if (trits)
	{
		// Round up so a final partial block is unpacked too
		unsigned int trit_blocks = (character_count + 4) / 5;
		promise(trit_blocks > 0);
		for (unsigned int i = 0; i < trit_blocks; i++)
		{
			const uint8_t *tritptr = trits_of_integer[tq_blocks[i]];
			results[5 * i    ] |= tritptr[0] << bits;
			results[5 * i + 1] |= tritptr[1] << bits;
			results[5 * i + 2] |= tritptr[2] << bits;
			results[5 * i + 3] |= tritptr[3] << bits;
			results[5 * i + 4] |= tritptr[4] << bits;
		}
	}

	if (quints)
	{
		// Round up so a final partial block is unpacked too
		unsigned int quint_blocks = (character_count + 2) / 3;
		promise(quint_blocks > 0);
		for (unsigned int i = 0; i < quint_blocks; i++)
		{
			const uint8_t *quintptr = quints_of_integer[tq_blocks[i]];
			results[3 * i    ] |= quintptr[0] << bits;
			results[3 * i + 1] |= quintptr[1] << bits;
			results[3 * i + 2] |= quintptr[2] << bits;
		}
	}

	// Only the requested number of values is copied out; padding entries are discarded
	for (unsigned int i = 0; i < character_count; i++)
	{
		output_data[i] = results[i];
	}
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,346 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions and data declarations for the outer context.
*
* The outer context includes thread-pool management, which is slower to
* compile due to increased use of C++ stdlib. The inner context used in the
* majority of the codec library does not include this.
*/
#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
#define ASTCENC_INTERNAL_ENTRY_INCLUDED
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include "astcenc_internal.h"
/* ============================================================================
Parallel execution control
============================================================================ */
/**
 * @brief A simple counter-based manager for parallel task execution.
 *
 * The task processing execution consists of:
 *
 * * A single-threaded init stage.
 * * A multi-threaded processing stage.
 * * A condition variable so threads can wait for processing completion.
 * * A single-threaded term stage.
 *
 * The init stage will be executed by the first thread to arrive in the critical section, there is
 * no main thread in the thread pool.
 *
 * The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
 * basis. Threads may therefore each execute different numbers of tasks, depending on their
 * processing complexity. The task queue and the task tickets are just counters; the caller must map
 * these integers to an actual processing partition in a specific problem domain.
 *
 * The exit wait condition is needed to ensure processing has finished before a worker thread can
 * progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
 * because there are no new tasks to assign to it while other worker threads are still processing.
 * Calling @c wait() will ensure that all other workers have finished before the thread can proceed.
 *
 * The basic usage model:
 *
 * // --------- From single-threaded code ---------
 *
 * // Reset the tracker state
 * manager->reset()
 *
 * // --------- From multi-threaded code ---------
 *
 * // Run the stage init; only first thread actually runs the lambda
 * manager->init(<lambda>)
 *
 * do
 * {
 * // Request a task assignment
 * uint task_count;
 * uint base_index = manager->get_task_assignment(<granule>, task_count);
 *
 * // Process any tasks we were given (task_count <= granule size)
 * if (task_count)
 * {
 * // Run the user task processing code for N tasks here
 * ...
 *
 * // Flag these tasks as complete
 * manager->complete_task_assignment(task_count);
 * }
 * } while (task_count);
 *
 * // Wait for all threads to complete tasks before progressing
 * manager->wait()
 *
 * // Run the stage term; only first thread actually runs the lambda
 * manager->term(<lambda>)
 */
class ParallelManager
{
private:
/** @brief Lock used for critical section and condition synchronization. */
std::mutex m_lock;
/** @brief True if the current operation is cancelled. */
std::atomic<bool> m_is_cancelled;
/** @brief True if the stage init() step has been executed. */
bool m_init_done;
/** @brief True if the stage term() step has been executed. */
bool m_term_done;
/** @brief Condition variable for tracking stage processing completion. */
std::condition_variable m_complete;
/** @brief Number of tasks started, but not necessarily finished. */
std::atomic<unsigned int> m_start_count;
/** @brief Number of tasks finished; only updated while holding @c m_lock. */
unsigned int m_done_count;
/** @brief Number of tasks that need to be processed. */
unsigned int m_task_count;
/** @brief Progress callback (optional). */
astcenc_progress_callback m_callback;
/** @brief Lock used for callback synchronization. */
std::mutex m_callback_lock;
/** @brief Minimum progress, in percent, before making a callback. */
float m_callback_min_diff;
/** @brief Last progress callback value, in percent. */
float m_callback_last_value;
public:
/** @brief Create a new ParallelManager. */
ParallelManager()
{
reset();
}
/**
 * @brief Reset the tracker for a new processing batch.
 *
 * This must be called from single-threaded code before starting the multi-threaded processing
 * operations.
 */
void reset()
{
m_init_done = false;
m_term_done = false;
m_is_cancelled = false;
m_start_count = 0;
m_done_count = 0;
m_task_count = 0;
m_callback = nullptr;
m_callback_last_value = 0.0f;
m_callback_min_diff = 1.0f;
}
/**
 * @brief Cancel the current operation and stop new tasks being assigned.
 *
 * This only sets the cancellation flag; the counters are left untouched until the next
 * @c reset(). It does not signal the condition variable itself, so threads already blocked in
 * @c wait() are only woken when a later @c complete_task_assignment() call notifies them.
 *
 * Note, all in-flight tasks in a worker will still complete normally.
 */
void cancel()
{
m_is_cancelled = true;
}
/**
 * @brief Trigger the pipeline stage init step.
 *
 * This can be called from multi-threaded code. The first thread to hit this will process the
 * initialization. Other threads will block and wait for it to complete.
 *
 * @param init_func   Callable which executes the stage initialization. It must return the
 *                    total number of tasks in the stage.
 */
void init(std::function<unsigned int(void)> init_func)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_init_done)
{
m_task_count = init_func();
m_init_done = true;
}
}
/**
 * @brief Trigger the pipeline stage init step, with a known task count and progress callback.
 *
 * This can be called from multi-threaded code. The first thread to hit this will process the
 * initialization. Other threads will block and wait for it to complete.
 *
 * @param task_count   Total number of tasks needing processing.
 * @param callback     Function pointer for progress status callbacks.
 */
void init(unsigned int task_count, astcenc_progress_callback callback)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_init_done)
{
m_callback = callback;
m_task_count = task_count;
m_init_done = true;
// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
m_callback_min_diff = astc::max(min_diff, 1.0f);
}
}
/**
 * @brief Request a task assignment.
 *
 * Assign up to @c granule tasks to the caller for processing. The start counter is advanced by
 * @c granule on every call, including calls that end up assigning nothing.
 *
 * @param      granule   Maximum number of tasks that can be assigned.
 * @param[out] count     Actual number of tasks assigned, or zero if no tasks were assigned
 *                       because the operation is cancelled or all tasks are already taken.
 *
 * @return Task index of the first assigned task; assigned tasks increment from this.
 */
unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
{
unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
if (m_is_cancelled || base >= m_task_count)
{
count = 0;
return 0;
}
// The final assignment may be shorter than a full granule
count = astc::min(m_task_count - base, granule);
return base;
}
/**
 * @brief Complete a task assignment.
 *
 * Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this
 * completes the processing of the stage, or if the operation has been cancelled. If a progress
 * callback is set it may also be invoked from the calling thread.
 *
 * @param count   The number of completed tasks.
 */
void complete_task_assignment(unsigned int count)
{
// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
// update here and the wait() for other threads
unsigned int local_count;
float local_last_value;
{
std::unique_lock<std::mutex> lck(m_lock);
m_done_count += count;
// Snapshot the values needed for the lockless progress test below
local_count = m_done_count;
local_last_value = m_callback_last_value;
// Ensure the progress bar hits 100%
if (m_callback && m_done_count == m_task_count)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
m_callback(100.0f);
m_callback_last_value = 100.0f;
}
// Notify if nothing left to do
if (m_is_cancelled || m_done_count == m_task_count)
{
// Release the lock before notifying so woken threads can acquire it immediately
lck.unlock();
m_complete.notify_all();
}
}
// Process progress callback if we have one
if (m_callback)
{
// Initial lockless test - have we progressed enough to emit?
float num = static_cast<float>(local_count);
float den = static_cast<float>(m_task_count);
float this_value = (num / den) * 100.0f;
bool report_test = (this_value - local_last_value) > m_callback_min_diff;
// Recheck under lock, because another thread might report first
if (report_test)
{
std::unique_lock<std::mutex> cblck(m_callback_lock);
bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
if (report_retest)
{
m_callback(this_value);
m_callback_last_value = this_value;
}
}
}
}
/**
 * @brief Wait for stage processing to complete.
 *
 * Blocks until every task has been marked complete, or the operation has been cancelled.
 */
void wait()
{
std::unique_lock<std::mutex> lck(m_lock);
m_complete.wait(lck, [this]{ return m_is_cancelled || m_done_count == m_task_count; });
}
/**
 * @brief Trigger the pipeline stage term step.
 *
 * This can be called from multi-threaded code. The first thread to hit this will process the
 * work pool termination. Caller must have called @c wait() prior to calling this function to
 * ensure that processing is complete.
 *
 * @param term_func   Callable which executes the stage termination.
 */
void term(std::function<void(void)> term_func)
{
std::lock_guard<std::mutex> lck(m_lock);
if (!m_term_done)
{
term_func();
m_term_done = true;
}
}
};
/**
 * @brief The astcenc compression context.
 *
 * Bundles the inner codec context with one ParallelManager per multi-threaded pipeline stage. The
 * two compression-side managers are compiled out when ASTCENC_DECOMPRESS_ONLY is defined.
 */
struct astcenc_context
{
/** @brief The context internal state. */
astcenc_contexti context;
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/** @brief The parallel manager for averages computation. */
ParallelManager manage_avg;
/** @brief The parallel manager for compression. */
ParallelManager manage_compress;
#endif
/** @brief The parallel manager for decompression. */
ParallelManager manage_decompress;
};
#endif
+48
View File
@@ -0,0 +1,48 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#include "astcenc_mathlib.h"
/**
 * @brief 64-bit rotate left.
 *
 * The rotation is reduced modulo 64 before use. This keeps both shifts strictly below the width
 * of the type, so a rotation of zero returns @c val unchanged instead of shifting right by 64,
 * which is undefined behavior.
 *
 * @param val     The value to rotate.
 * @param count   The rotation, in bits.
 *
 * @return The rotated value.
 */
static inline uint64_t rotl(uint64_t val, int count)
{
	count &= 63;
	return (val << count) | (val >> ((64 - count) & 63));
}
/* See header for documentation. */
void astc::rand_init(uint64_t state[2])
{
	// Fixed seed words, so a freshly initialized generator always starts from the same state
	static const uint64_t seed_words[2] {
		0xfaf9e171cea1ec6bULL,
		0xf1b318cc06af5d71ULL
	};

	state[0] = seed_words[0];
	state[1] = seed_words[1];
}
/* See header for documentation. */
uint64_t astc::rand(uint64_t state[2])
{
	const uint64_t word0 = state[0];
	const uint64_t word1 = state[1];

	// The returned value is the sum of the two state words before they are advanced
	const uint64_t result = word0 + word1;

	// Advance the state by mixing the two words with rotates, a shift, and XORs
	const uint64_t mixed = word1 ^ word0;
	state[0] = rotl(word0, 24) ^ mixed ^ (mixed << 16);
	state[1] = rotl(mixed, 37);

	return result;
}
+505
View File
@@ -0,0 +1,505 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/*
* This module implements a variety of mathematical data types and library
* functions used by the codec.
*/
#ifndef ASTC_MATHLIB_H_INCLUDED
#define ASTC_MATHLIB_H_INCLUDED
#include <cassert>
#include <cstdint>
#include <cmath>
// Each instruction set macro below is only auto-detected from the compiler's predefined macros
// when the build has not already defined it, so a build system can override any of them.

// POPCNT support: 1 if available, 0 otherwise
#ifndef ASTCENC_POPCNT
#if defined(__POPCNT__)
#define ASTCENC_POPCNT 1
#else
#define ASTCENC_POPCNT 0
#endif
#endif
// F16C support: 1 if available, 0 otherwise
#ifndef ASTCENC_F16C
#if defined(__F16C__)
#define ASTCENC_F16C 1
#else
#define ASTCENC_F16C 0
#endif
#endif
// SSE level: 42 for SSE4.2, 41 for SSE4.1, 20 for SSE2, 0 for none
#ifndef ASTCENC_SSE
#if defined(__SSE4_2__)
#define ASTCENC_SSE 42
#elif defined(__SSE4_1__)
#define ASTCENC_SSE 41
#elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
#define ASTCENC_SSE 20
#else
#define ASTCENC_SSE 0
#endif
#endif
// AVX level: 2 for AVX2, 1 for AVX, 0 for none
// Note: ASTCENC_X86_GATHERS is only defined by the auto-detected AVX paths; it is left undefined
// when AVX is not detected or when the build supplies ASTCENC_AVX itself
#ifndef ASTCENC_AVX
#if defined(__AVX2__)
#define ASTCENC_AVX 2
#define ASTCENC_X86_GATHERS 1
#elif defined(__AVX__)
#define ASTCENC_AVX 1
#define ASTCENC_X86_GATHERS 1
#else
#define ASTCENC_AVX 0
#endif
#endif
// NEON support: 1 when targeting 64-bit Arm, 0 otherwise
#ifndef ASTCENC_NEON
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define ASTCENC_NEON 1
#else
#define ASTCENC_NEON 0
#endif
#endif
// SVE vector width in 32-bit lanes: 8 for a fixed 256-bit build, 4 for any other SVE build,
// 0 for none
#ifndef ASTCENC_SVE
#if defined(__ARM_FEATURE_SVE)
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
#define ASTCENC_SVE 8
// Auto-detected SVE can only assume vector width of 4 is available, but
// must also allow for hardware being longer and so all use of intrinsics
// must explicitly use predicate masks to limit to 4-wide.
#else
#define ASTCENC_SVE 4
#endif
#else
#define ASTCENC_SVE 0
#endif
#endif
// Force vector-sized SIMD alignment
#if ASTCENC_AVX || ASTCENC_SVE == 8
#define ASTCENC_VECALIGN 32
#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
#define ASTCENC_VECALIGN 16
// Use default alignment for non-SIMD builds
#else
#define ASTCENC_VECALIGN 0
#endif
// C++11 states that alignas(0) should be ignored but GCC doesn't do
// this on some versions, so workaround and avoid emitting alignas(0)
#if ASTCENC_VECALIGN > 0
#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
#else
#define ASTCENC_ALIGNAS
#endif
// The x86 intrinsics header is only needed when an x86 instruction set extension is enabled
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
#include <immintrin.h>
#endif
/* ============================================================================
Fast math library; note that many of the higher-order functions in this set
use approximations which are less accurate, but faster, than <cmath> standard
library equivalents.
Note: Many of these are not necessarily faster than simple C versions when
used on a single scalar value, but are included for testing purposes as most
have an option based on SSE intrinsics and therefore provide an obvious route
to future vectorization.
============================================================================ */
// Union for manipulation of float bit patterns. All three members overlay the
// same 32 bits of storage, so a value written through one member can be
// inspected through another.
typedef union
{
// The bits viewed as an unsigned 32-bit integer.
uint32_t u;
// The bits viewed as a signed 32-bit integer.
int32_t s;
// The bits viewed as a single-precision float.
float f;
} if32;
// These are namespaced to avoid colliding with C standard library functions.
namespace astc
{
// Pi as a single-precision constant.
static const float PI = 3.14159265358979323846f;
// Pi / 2 as a single-precision constant.
static const float PI_OVER_TWO = 1.57079632679489661923f;
/**
 * @brief Absolute value of a single-precision float.
 *
 * This is a thin wrapper around @c std::fabs.
 *
 * @param v   The value to make absolute.
 *
 * @return The absolute value of @c v.
 */
static inline float fabs(float v)
{
	const float magnitude = std::fabs(v);
	return magnitude;
}
/**
 * @brief Test if a float value is a NaN.
 *
 * Relies on NaN being the only float value that does not compare equal to
 * itself.
 *
 * @param v   The value to test.
 *
 * @return @c true if @c v is a NaN, @c false otherwise.
 */
static inline bool isnan(float v)
{
	const bool equals_itself = (v == v);
	return !equals_itself;
}
/**
 * @brief Return the minimum of two values.
 *
 * Only a strict less-than comparison is used. For floats this means that if
 * either argument is a NaN the comparison is false and @c q is returned.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 *
 * @return @c p if it is strictly smaller than @c q, otherwise @c q.
 */
template<typename T>
static inline T min(T p, T q)
{
	if (p < q)
	{
		return p;
	}

	return q;
}
/**
 * @brief Return the minimum of three values.
 *
 * Implemented as the two-value @c min of @c p and @c q, followed by the
 * two-value @c min of that result and @c r; NaN handling follows from the
 * two-value form.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 * @param r   The third value to compare.
 *
 * @return The smallest value.
 */
template<typename T>
static inline T min(T p, T q, T r)
{
	const T smallest_pq = min(p, q);
	return min(smallest_pq, r);
}
/**
 * @brief Return the minimum of four values.
 *
 * Implemented as the two-value @c min of the pairwise minima of (@c p, @c q)
 * and (@c r, @c s); NaN handling follows from the two-value form.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 * @param r   The third value to compare.
 * @param s   The fourth value to compare.
 *
 * @return The smallest value.
 */
template<typename T>
static inline T min(T p, T q, T r, T s)
{
	const T smallest_pq = min(p, q);
	const T smallest_rs = min(r, s);
	return min(smallest_pq, smallest_rs);
}
/**
 * @brief Return the maximum of two values.
 *
 * Only a strict greater-than comparison is used. For floats this means that
 * if either argument is a NaN the comparison is false and @c q is returned.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 *
 * @return @c p if it is strictly larger than @c q, otherwise @c q.
 */
template<typename T>
static inline T max(T p, T q)
{
	if (p > q)
	{
		return p;
	}

	return q;
}
/**
 * @brief Return the maximum of three values.
 *
 * Implemented as the two-value @c max of @c p and @c q, followed by the
 * two-value @c max of that result and @c r; NaN handling follows from the
 * two-value form.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 * @param r   The third value to compare.
 *
 * @return The largest value.
 */
template<typename T>
static inline T max(T p, T q, T r)
{
	const T largest_pq = max(p, q);
	return max(largest_pq, r);
}
/**
 * @brief Return the maximum of four values.
 *
 * Implemented as the two-value @c max of the pairwise maxima of (@c p, @c q)
 * and (@c r, @c s); NaN handling follows from the two-value form.
 *
 * @param p   The first value to compare.
 * @param q   The second value to compare.
 * @param r   The third value to compare.
 * @param s   The fourth value to compare.
 *
 * @return The largest value.
 */
template<typename T>
static inline T max(T p, T q, T r, T s)
{
	const T largest_pq = max(p, q);
	const T largest_rs = max(r, s);
	return max(largest_pq, largest_rs);
}
/**
 * @brief Clamp a value between @c mn and @c mx.
 *
 * For floats, a NaN input is turned into @c mn.
 *
 * @param v      The value to clamp.
 * @param mn     The min value (inclusive).
 * @param mx     The max value (inclusive).
 *
 * @return The clamped value.
 */
template<typename T>
inline T clamp(T v, T mn, T mx)
{
	// The order of the two tests is load-bearing. Any comparison against a
	// NaN is false, so a NaN input fails both tests and lands on "mn".
	return (v > mx) ? mx
	     : (v > mn) ? v
	     : mn;
}
/**
 * @brief Clamp a float value to the range [0.0f, 1.0f].
 *
 * Delegates to @c astc::clamp, so a NaN input is turned into 0.0f.
 *
 * @param v   The value to clamp.
 *
 * @return The clamped value.
 */
static inline float clamp1f(float v)
{
	const float lower = 0.0f;
	const float upper = 1.0f;
	return astc::clamp(v, lower, upper);
}
/**
 * @brief Clamp a float value to the range [0.0f, 255.0f].
 *
 * Delegates to @c astc::clamp, so a NaN input is turned into 0.0f.
 *
 * @param v   The value to clamp.
 *
 * @return The clamped value.
 */
static inline float clamp255f(float v)
{
	const float lower = 0.0f;
	const float upper = 255.0f;
	return astc::clamp(v, lower, upper);
}
/**
 * @brief Round a single-precision float down to an integral value.
 *
 * This is a thin wrapper around @c std::floor; the result is still a float.
 *
 * @param v   The value to round.
 *
 * @return The largest integral value not greater than @c v.
 */
static inline float flt_rd(float v)
{
	const float floored = std::floor(v);
	return floored;
}
/**
 * @brief Convert a single-precision float to an integer, rounding to nearest.
 *
 * Implemented by adding 0.5 and converting to @c int. The conversion
 * truncates toward zero, so the result is round-to-nearest only for
 * non-negative inputs.
 *
 * @param v   The value to round.
 *
 * @return The rounded value.
 */
static inline int flt2int_rtn(float v)
{
	const int rounded = static_cast<int>(v + 0.5f);
	return rounded;
}
/**
 * @brief Convert a single-precision float to an integer, rounding down.
 *
 * Implemented as a plain conversion to @c int, which truncates toward zero;
 * this matches rounding down only for non-negative inputs.
 *
 * @param v   The value to round.
 *
 * @return The converted value.
 */
static inline int flt2int_rd(float v)
{
	const int truncated = static_cast<int>(v);
	return truncated;
}
/**
 * @brief SP float bit-interpreted as an integer.
 *
 * The bits are copied with @c std::memcpy rather than read back through a
 * union, because reading a union member other than the one last written is
 * undefined behavior in C++.
 *
 * @param v   The value to bitcast.
 *
 * @return The converted value.
 */
static inline int float_as_int(float v)
{
	static_assert(sizeof(int) == sizeof(float),
	              "float_as_int requires int and float to have the same size");

	int bits;
	std::memcpy(&bits, &v, sizeof(bits));
	return bits;
}
/**
 * @brief Integer bit-interpreted as an SP float.
 *
 * The bits are copied with @c std::memcpy rather than read back through a
 * union, because reading a union member other than the one last written is
 * undefined behavior in C++.
 *
 * @param v   The value to bitcast.
 *
 * @return The converted value.
 */
static inline float int_as_float(int v)
{
	static_assert(sizeof(int) == sizeof(float),
	              "int_as_float requires int and float to have the same size");

	float value;
	std::memcpy(&value, &v, sizeof(value));
	return value;
}
/**
 * @brief Reciprocal square root, 1.0 / sqrt(val).
 *
 * This implementation is not an approximation; it divides 1.0f by the result
 * of @c std::sqrt.
 *
 * @param v   The input value.
 *
 * @return The reciprocal square root of @c v.
 */
static inline float rsqrt(float v)
{
	const float root = std::sqrt(v);
	return 1.0f / root;
}
/**
 * @brief Square root of a single-precision float.
 *
 * This implementation is not an approximation; it is a thin wrapper around
 * @c std::sqrt.
 *
 * @param v   The input value.
 *
 * @return The square root of @c v.
 */
static inline float sqrt(float v)
{
	const float root = std::sqrt(v);
	return root;
}
/**
* @brief Extract mantissa and exponent of a float value.
*
* @param v The input value.
* @param[out] expo The output exponent.
*
* @return The mantissa.
*/
static inline float frexp(float v, int* expo)
{
if32 p;
p.f = v;
*expo = ((p.u >> 23) & 0xFF) - 126;
p.u = (p.u & 0x807fffff) | 0x3f000000;
return p.f;
}
/**
 * @brief Initialize the seed structure for a random number generator.
 *
 * Important note: For the purposes of ASTC we want sets of random numbers for
 * use in the codec, but we want the same seed value across instances and
 * threads to ensure that image output is stable across compressor runs and
 * across platforms. Every PRNG created by this call will therefore return the
 * same sequence of values ...
 *
 * @param[out] state   The state structure to initialize; two 64-bit words.
 */
void rand_init(uint64_t state[2]);
/**
 * @brief Return the next random number from the generator.
 *
 * This RNG is an implementation of the "xoroshiro128+ 1.0" PRNG, based on the
 * public-domain implementation given by David Blackman & Sebastiano Vigna at
 * http://vigna.di.unimi.it/xorshift/xoroshiro128plus.c
 *
 * @param[in,out] state   The state structure to use/update.
 *
 * @return The next 64-bit value in the sequence.
 */
uint64_t rand(uint64_t state[2]);
}
/* ============================================================================
Softfloat library with fp32 and fp16 conversion functionality.
============================================================================ */
// The software conversions are only declared when neither F16C nor NEON is
// enabled.
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
/* Narrowing conversion: native float to FP16 bit pattern. */
uint16_t float_to_sf16(float val);
/* Widening conversion: FP16 bit pattern to native float. */
float sf16_to_float(uint16_t val);
#endif
/*********************************
Vector library
*********************************/
#include "astcenc_vecmathlib.h"
/*********************************
Declaration of line types
*********************************/
// Parametric line, 2D: The line is given by line = a + b * t.
struct line2
{
// The "a" term of line = a + b * t.
vfloat4 a;
// The "b" term of line = a + b * t.
vfloat4 b;
};
// Parametric line, 3D; same a + b * t layout as line2.
struct line3
{
vfloat4 a;
vfloat4 b;
};
// Parametric line, 4D; same a + b * t layout as line2.
// NOTE(review): the 4D label is inferred from the line2/line3 naming - confirm.
struct line4
{
vfloat4 a;
vfloat4 b;
};
// Companion types to line2 / line3 / line4, each holding two vectors.
// NOTE(review): presumably "amod" and "bs" are values precomputed from a
// line's "a" and "b" members; their derivation is not visible in this header
// - confirm against the code that populates them.
struct processed_line2
{
vfloat4 amod;
vfloat4 bs;
};
struct processed_line3
{
vfloat4 amod;
vfloat4 bs;
};
struct processed_line4
{
vfloat4 amod;
vfloat4 bs;
};
#endif
@@ -0,0 +1,411 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Soft-float library for IEEE-754.
*/
// Include the math library first: it supplies the default values of
// ASTCENC_F16C and ASTCENC_NEON when the build has not set them, and the guard
// below must be evaluated against those values rather than against undefined
// macros (which the preprocessor treats as zero).
#include "astcenc_mathlib.h"
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
/* sized soft-float types. These are mapped to the sized integer
types of C99, instead of C's floating-point types; this is because
the library needs to maintain exact, bit-level control on all
operations on these data types. */
/* 16-bit soft-float bit pattern. */
typedef uint16_t sf16;
/* 32-bit soft-float bit pattern. */
typedef uint32_t sf32;
/******************************************
helper functions and their lookup tables
******************************************/
/* count leading zeros functions. Only used when the input is nonzero. */
/* The first three branches are intentionally empty: they match the targets
where clz32() uses an instruction or builtin, so no lookup table is needed. */
#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
#elif defined(__arm__) && defined(__ARMCC_VERSION)
#elif defined(__arm__) && defined(__GNUC__)
#else
/* table used for the slow default versions: entry N is the number of leading
zero bits in the 8-bit value N (8 for N == 0). */
static const uint8_t clz_table[256] =
{
8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
#endif
/*
32-bit count-leading-zeros function: use the Assembly instruction whenever possible.

The result is only consistent across the branches below for nonzero input:
the x86 branch ORs in bit 0 before the scan and so yields 31 for zero, while
the table-driven fallback yields 32 for zero.
*/
static uint32_t clz32(uint32_t inp)
{
#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
/* x86 with GCC-style inline asm: BSR gives the index of the highest set bit. */
uint32_t bsr;
__asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1));
return 31 - bsr;
#else
#if defined(__arm__) && defined(__ARMCC_VERSION)
return __clz(inp); /* armcc builtin */
#else
#if defined(__arm__) && defined(__GNUC__)
/* 32-bit Arm with GCC-style inline asm. */
uint32_t lz;
__asm__("clz %0, %1": "=r"(lz):"r"(inp));
return lz;
#else
/* slow default version: shift the highest non-zero byte down into the low
8 bits, tracking how many leading zero bits sit above it, then finish with
the per-byte lookup table. */
uint32_t summa = 24;
if (inp >= UINT32_C(0x10000))
{
inp >>= 16;
summa -= 16;
}
if (inp >= UINT32_C(0x100))
{
inp >>= 8;
summa -= 8;
}
return summa + clz_table[inp];
#endif
#endif
#endif
}
/* the five rounding modes that IEEE-754r defines.
The numeric values are significant: sf32_to_sf16() adds the mode to a
per-exponent base index to select a case, so they must stay 0..4. */
typedef enum
{
SF_UP = 0, /* round towards positive infinity */
SF_DOWN = 1, /* round towards negative infinity */
SF_TOZERO = 2, /* round towards zero */
SF_NEARESTEVEN = 3, /* round toward nearest value; if mid-between, round to even value */
SF_NEARESTAWAY = 4 /* round toward nearest value; if mid-between, round away from zero */
} roundmode;
/* Right-shift 'inp' by 'shamt' bits, rounding to nearest with ties going to
   the even result. */
static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
{
	const uint32_t unit = UINT32_C(1) << shamt;
	const uint32_t half = unit >> 1;

	/* The bit at position 'shamt' becomes the LSB of the result, so it tells
	   us whether the truncated result is odd. The '| 1' makes a zero shift
	   always count as odd, so nothing is subtracted in that case. */
	const bool result_is_odd = ((inp | UINT32_C(1)) & unit) != 0;

	/* Add 0.5 ULP; for an even result back off by one so that an exact tie
	   does not carry into the result. */
	uint32_t biased = inp + half;
	if (!result_is_odd)
	{
		biased -= 1;
	}

	return biased >> shamt;
}
/* Right-shift 'inp' by 'shamt' bits, rounding to nearest with ties going
   upwards. */
static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
{
	/* Half of the weight of the result LSB; zero when shamt is zero. */
	const uint32_t half = (UINT32_C(1) << shamt) >> 1;
	return (inp + half) >> shamt;
}
/* Right-shift 'inp' by 'shamt' bits, rounding upwards whenever any of the
   discarded bits is set. */
static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
{
	/* All-ones mask covering the bits that are shifted out. */
	const uint32_t discard_mask = (UINT32_C(1) << shamt) - 1;
	return (inp + discard_mask) >> shamt;
}
/* convert from FP16 to FP32.
Takes an FP16 bit pattern and returns the FP32 bit pattern of the same value.
Normal numbers, zeros, and infinities are handled with one table lookup;
NaNs are quietened; denormals are renormalized using clz32(). */
static sf32 sf16_to_sf32(sf16 inp)
{
uint32_t inpx = inp;
/*
This table contains, for every FP16 sign/exponent value combination,
the difference between the input FP16 value and the value obtained
by shifting the correct FP32 result right by 13 bits.
This table allows us to handle every case except denormals and NaN
with just 1 table lookup, 2 shifts and 1 add.
Entries for an exponent field of 0 or 31 additionally have the MSB set
to flag that special handling may be needed.
*/
#define WITH_MSB(a) (UINT32_C(a) | (1u << 31))
static const uint32_t tbl[64] =
{
WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
};
/* Index by the top 6 bits of the input: the sign bit and 5 exponent bits. */
uint32_t res = tbl[inpx >> 10];
res += inpx;
/* Normal cases: MSB of 'res' not set. */
if ((res & WITH_MSB(0)) == 0)
{
return res << 13;
}
/* Infinity and Zero: 10 LSB of 'res' not set. The flag in the MSB is
discarded by the shift. */
if ((res & 0x3FF) == 0)
{
return res << 13;
}
/* NaN: the exponent field of 'inp' is non-zero. */
if ((inpx & 0x7C00) != 0)
{
/* All NaNs are quietened. */
return (res << 13) | 0x400000;
}
/* Denormal cases: shift the mantissa up until its leading one reaches bit 31,
then rebuild the exponent from the shift distance. The leading one lands on
bit 23 after the final shift and carries into the exponent field. */
uint32_t sign = (inpx & 0x8000) << 16;
uint32_t mskval = inpx & 0x7FFF;
uint32_t leadingzeroes = clz32(mskval);
mskval <<= leadingzeroes;
return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
}
/* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened.
'inp' is the FP32 bit pattern and 'rmode' the rounding mode; the return value is the FP16 bit pattern.
The conversion is table driven: the sign and exponent of the input select a base case (a multiple of 5), and the rounding mode (0..4) is added to it to
form the final case index. */
static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
{
/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
static const uint8_t tab[512] {
0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
};
/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
size. */
static const uint32_t tabx[60] {
UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
};
uint32_t p;
/* 'inp >> 23' is the 9-bit sign/exponent field; the table gives the base case and 'rmode' selects the variant within it. */
uint32_t idx = rmode + tab[inp >> 23];
uint32_t vlx = tabx[idx];
switch (idx)
{
/*
Positive number which may be Infinity or NaN.
We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
(If we don't do this quieting, then a NaN that is distinguished only by having
its low-order bits set, would be turned into an INF.) */
case 50:
case 51:
case 52:
case 53:
case 54:
case 55:
case 56:
case 57:
case 58:
case 59:
/*
the input value is 0x7F800000 or 0xFF800000 if it is INF.
By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
For NaNs, however, this operation will keep bit 23 with the value 1.
We can then extract bit 23, and logical-OR bit 9 of the result with this
bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
of the mantissa is set.)
*/
p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
/*
positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
If it is, then return 0, else return 1 (the smallest representable nonzero number)
*/
case 0:
/*
-inp will set the MSB if the input number is nonzero.
Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
*/
return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
/*
negative, exponent = 0, round-mode == DOWN, need to check whether number is
actually 0. If it is, return 0x8000 ( float -0.0 )
Else return the smallest negative number ( 0x8001 ) */
case 6:
/*
in this case 'vlx' is 0x80000000. By subtracting the input value from it,
we obtain a value that is 0 if the input value is in fact zero and has
the MSB set if it isn't. We then right-shift the value by 31 places to
get a value that is 0 if the input is -0.0 and 1 otherwise.
*/
return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
/*
for all other cases involving underflow/overflow, we don't need to
do actual tests; we just return 'vlx'.
*/
case 1:
case 2:
case 3:
case 4:
case 5:
case 7:
case 8:
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
case 16:
case 17:
case 18:
case 19:
case 40:
case 41:
case 42:
case 43:
case 44:
case 45:
case 46:
case 47:
case 48:
case 49:
return static_cast<sf16>(vlx);
/*
for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
/* normal number, all rounding modes except round-to-nearest-even: */
case 30:
case 31:
case 32:
case 34:
case 35:
case 36:
case 37:
case 39:
return static_cast<sf16>((inp + vlx) >> 13);
/* normal number, round-to-nearest-even. */
case 33:
case 38:
p = inp + vlx;
p += (inp >> 13) & 1;
return static_cast<sf16>(p >> 13);
/*
the various denormal cases. These are not expected to be common, so their performance is a bit
less important. For each of these cases, we need to extract an exponent and a mantissa
(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
sign of the resulting denormal number.
*/
case 21:
case 22:
case 25:
case 27:
/* denormal, round towards zero. */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
case 20:
case 26:
/* denormal, round away from zero. */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
case 24:
case 29:
/* denormal, round to nearest-away */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
case 23:
case 28:
/* denormal, round to nearest-even. */
p = 126 - ((inp >> 23) & 0xFF);
return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
}
/* Fallback for a case index that none of the labels above matches. */
return 0;
}
/* convert from soft-float to native-float */
float sf16_to_float(uint16_t p)
{
if32 i;
i.u = sf16_to_sf32(p);
return i.f;
}
/* convert from native-float to soft-float */
uint16_t float_to_sf16(float p)
{
if32 i;
i.f = p;
return sf32_to_sf16(i.u, SF_NEARESTEVEN);
}
#endif
@@ -0,0 +1,481 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for generating partition tables on demand.
*/
#include "astcenc_internal.h"
/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */
#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64)
/**
* @brief Generate a canonical representation of a partition pattern.
*
* The returned value stores two bits per texel, for up to 6x6x6 texels, where the two bits store
* the remapped texel index. Remapping ensures that we only match on the partition pattern,
* independent of the partition order generated by the hash.
*
* @param texel_count The number of texels in the block.
* @param partition_of_texel The partition assignments, in hash order.
* @param[out] bit_pattern The output bit pattern representation.
*/
static void generate_canonical_partitioning(
unsigned int texel_count,
const uint8_t* partition_of_texel,
uint64_t bit_pattern[BIT_PATTERN_WORDS]
) {
// Clear the pattern
for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++)
{
bit_pattern[i] = 0;
}
// Store a mapping to reorder the raw partitions so that the partitions are ordered such
// that the lowest texel index in partition N is smaller than the lowest texel index in
// partition N + 1.
int mapped_index[BLOCK_MAX_PARTITIONS];
int map_weight_count = 0;
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
{
mapped_index[i] = -1;
}
for (unsigned int i = 0; i < texel_count; i++)
{
int index = partition_of_texel[i];
if (mapped_index[index] < 0)
{
mapped_index[index] = map_weight_count++;
}
uint64_t xlat_index = mapped_index[index];
bit_pattern[i >> 5] |= xlat_index << (2 * (i & 0x1F));
}
}
/**
 * @brief Compare two canonical patterns to see if they are the same.
 *
 * All @c BIT_PATTERN_WORDS words are compared, whatever value that macro has,
 * so no part of the pattern is ever left out of the comparison.
 *
 * @param part1   The first canonical bit pattern to check.
 * @param part2   The second canonical bit pattern to check.
 *
 * @return @c true if the patterns are the same, @c false otherwise.
 */
static bool compare_canonical_partitionings(
	const uint64_t part1[BIT_PATTERN_WORDS],
	const uint64_t part2[BIT_PATTERN_WORDS]
) {
	// The trip count is a compile-time constant; stop at the first mismatch
	for (unsigned int word = 0; word < BIT_PATTERN_WORDS; word++)
	{
		if (part1[word] != part2[word])
		{
			return false;
		}
	}

	return true;
}
/**
 * @brief Hash function used for procedural partition assignment.
 *
 * A fixed sequence of 32-bit xor-shift, add-shift, and multiply steps; all
 * arithmetic wraps modulo 2^32.
 *
 * @param inp   The hash seed.
 *
 * @return The hashed value.
 */
static uint32_t hash52(
	uint32_t inp
) {
	uint32_t mixed = inp ^ (inp >> 15);

	// (2^4 + 1) * (2^7 + 1) * (2^17 - 1)
	mixed = mixed * 0xEEDE0891;

	mixed = mixed ^ (mixed >> 5);
	mixed = mixed + (mixed << 16);
	mixed = mixed ^ (mixed >> 7);
	mixed = mixed ^ (mixed >> 3);
	mixed = mixed ^ (mixed << 6);
	mixed = mixed ^ (mixed >> 17);
	return mixed;
}
/**
* @brief Select texel assignment for a single coordinate.
*
* The seed is hashed into a 32-bit value from which twelve 4-bit coefficients
* are extracted. Four linear functions of the texel coordinate are built from
* those coefficients and the partition whose function gives the largest value
* is selected, with ties going to the lowest partition index.
*
* @param seed The seed - the partition index from the block.
* @param x The texel X coordinate in the block.
* @param y The texel Y coordinate in the block.
* @param z The texel Z coordinate in the block.
* @param partition_count The total partition count of this encoding.
* @param small_block @c true if the block has fewer than 32 texels.
*
* @return The assigned partition index for this texel.
*/
static uint8_t select_partition(
int seed,
int x,
int y,
int z,
int partition_count,
bool small_block
) {
// For small blocks bias the coordinates to get better distribution
if (small_block)
{
x <<= 1;
y <<= 1;
z <<= 1;
}
// Fold the partition count into the seed. The offset is a multiple of 1024,
// so the low seed bits tested further down are not changed by it.
seed += (partition_count - 1) * 1024;
uint32_t rnum = hash52(seed);
// Slice the hash into twelve 4-bit coefficients. Seeds 9 to 12 reuse bits
// that overlap the earlier fields, and seed 12 wraps around the top of the
// word.
uint8_t seed1 = rnum & 0xF;
uint8_t seed2 = (rnum >> 4) & 0xF;
uint8_t seed3 = (rnum >> 8) & 0xF;
uint8_t seed4 = (rnum >> 12) & 0xF;
uint8_t seed5 = (rnum >> 16) & 0xF;
uint8_t seed6 = (rnum >> 20) & 0xF;
uint8_t seed7 = (rnum >> 24) & 0xF;
uint8_t seed8 = (rnum >> 28) & 0xF;
uint8_t seed9 = (rnum >> 18) & 0xF;
uint8_t seed10 = (rnum >> 22) & 0xF;
uint8_t seed11 = (rnum >> 26) & 0xF;
uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
// Squaring all the seeds in order to bias their distribution towards lower values.
// Each seed is at most 15, so the square (at most 225) still fits in 8 bits.
seed1 *= seed1;
seed2 *= seed2;
seed3 *= seed3;
seed4 *= seed4;
seed5 *= seed5;
seed6 *= seed6;
seed7 *= seed7;
seed8 *= seed8;
seed9 *= seed9;
seed10 *= seed10;
seed11 *= seed11;
seed12 *= seed12;
// Choose the down-shifts applied to the squared seeds. Bit 0 of the seed
// decides which of the two shifts depends on the partition count and which
// depends on bit 1 of the seed.
int sh1, sh2;
if (seed & 1)
{
sh1 = (seed & 2 ? 4 : 5);
sh2 = (partition_count == 3 ? 6 : 5);
}
else
{
sh1 = (partition_count == 3 ? 6 : 5);
sh2 = (seed & 2 ? 4 : 5);
}
// Bit 4 of the seed picks which of the two shifts the Z coefficients use
int sh3 = (seed & 0x10) ? sh1 : sh2;
// Odd-numbered seeds (X coefficients) use sh1, even-numbered seeds
// (Y coefficients) use sh2, seeds 9 to 12 (Z coefficients) use sh3
seed1 >>= sh1;
seed2 >>= sh2;
seed3 >>= sh1;
seed4 >>= sh2;
seed5 >>= sh1;
seed6 >>= sh2;
seed7 >>= sh1;
seed8 >>= sh2;
seed9 >>= sh3;
seed10 >>= sh3;
seed11 >>= sh3;
seed12 >>= sh3;
// One linear function of the coordinate per candidate partition, each offset
// by a different slice of the hash
int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
// Apply the saw; only the low 6 bits of each function are kept
a &= 0x3F;
b &= 0x3F;
c &= 0x3F;
d &= 0x3F;
// Remove some of the components if we are to output < 4 partitions.
if (partition_count <= 3)
{
d = 0;
}
if (partition_count <= 2)
{
c = 0;
}
if (partition_count <= 1)
{
b = 0;
}
// Pick the partition with the largest value; the >= comparisons mean that a
// tie resolves to the lowest partition index
uint8_t partition;
if (a >= b && a >= c && a >= d)
{
partition = 0;
}
else if (b >= c && b >= d)
{
partition = 1;
}
else if (c >= d)
{
partition = 2;
}
else
{
partition = 3;
}
return partition;
}
/**
 * @brief Generate a single partition info structure.
 *
 * Assigns every texel in the block to a partition, records the per-partition
 * texel lists and counts in @c pi, and for 2, 3, or 4 partitions fills in the
 * matching coverage bitmaps held in @c bsd.
 *
 * @param[in,out] bsd                     The block size information; block dimensions and
 *                                        kmeans texels are read, coverage bitmaps are written.
 * @param         partition_count         The partition count of this partitioning.
 * @param         partition_index         The partition index / seed of this partitioning.
 * @param         partition_remap_index   The remapped partition index of this partitioning.
 * @param[out]    pi                      The partition info structure to populate.
 *
 * @return True if this is a useful partition index, False if we can skip it.
 */
static bool generate_one_partition_info_entry(
	block_size_descriptor& bsd,
	unsigned int partition_count,
	unsigned int partition_index,
	unsigned int partition_remap_index,
	partition_info& pi
) {
	int texels_per_block = bsd.texel_count;
	const bool is_small_block = texels_per_block < 32;

	// Walk the block in Z, Y, X order and bin each texel into its partition
	int counts[BLOCK_MAX_PARTITIONS] { 0 };
	int texel = 0;
	for (unsigned int z = 0; z < bsd.zdim; z++)
	{
		for (unsigned int y = 0; y < bsd.ydim; y++)
		{
			for (unsigned int x = 0; x < bsd.xdim; x++)
			{
				const uint8_t part = select_partition(partition_index, x, y, z, partition_count, is_small_block);

				pi.partition_of_texel[texel] = part;
				pi.texels_of_partition[part][counts[part]] = static_cast<uint8_t>(texel);
				counts[part]++;
				texel++;
			}
		}
	}

	// Pad each texel list up to a SIMD multiple by repeating its last entry, so
	// that later vector loads can safely read past the real end of the list
	for (unsigned int part = 0; part < partition_count; part++)
	{
		const size_t used = counts[part];
		const size_t padded = round_up_to_simd_multiple_vla(used);
		for (size_t slot = used; slot < padded; slot++)
		{
			pi.texels_of_partition[part][slot] = pi.texels_of_partition[part][used - 1];
		}
	}

	// The actual procedural partition count is the index of the first empty
	// partition, or 4 if none is empty. The checks run from the highest index
	// down so that the lowest empty index is the one that sticks.
	pi.partition_count = 4;
	if (counts[3] == 0)
	{
		pi.partition_count = 3;
	}
	if (counts[2] == 0)
	{
		pi.partition_count = 2;
	}
	if (counts[1] == 0)
	{
		pi.partition_count = 1;
	}
	if (counts[0] == 0)
	{
		pi.partition_count = 0;
	}

	pi.partition_index = static_cast<uint16_t>(partition_index);

	for (unsigned int part = 0; part < BLOCK_MAX_PARTITIONS; part++)
	{
		pi.partition_texel_count[part] = static_cast<uint8_t>(counts[part]);
	}

	// Coverage bitmaps only exist for 2, 3, and 4 partitions
	uint64_t* bitmaps { nullptr };
	switch (partition_count)
	{
	case 2:
		bitmaps = bsd.coverage_bitmaps_2[partition_remap_index];
		break;
	case 3:
		bitmaps = bsd.coverage_bitmaps_3[partition_remap_index];
		break;
	case 4:
		bitmaps = bsd.coverage_bitmaps_4[partition_remap_index];
		break;
	default:
		break;
	}

	if (bitmaps)
	{
		for (unsigned int part = 0; part < partition_count; part++)
		{
			bitmaps[part] = 0ULL;
		}

		// Bit N of a partition's bitmap is set if kmeans texel N belongs to it
		unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
		for (unsigned int sample = 0; sample < texels_to_process; sample++)
		{
			unsigned int texel_index = bsd.kmeans_texels[sample];
			bitmaps[pi.partition_of_texel[texel_index]] |= 1ULL << sample;
		}
	}

	// Valid partitionings have texels in all of the requested partitions
	return pi.partition_count == partition_count;
}
/**
 * @brief Build the partition table for a single partition count.
 *
 * The selected/all partitioning counters for this partition count are reset, and then every
 * partition index in [0, BLOCK_MAX_PARTITIONINGS) is tested in up to two passes:
 *
 *   - Pass 0 keeps entries that are both useful (as reported by
 *     generate_one_partition_info_entry()) and whose canonical pattern does not match any entry
 *     already kept. These are counted as both "selected" and "all".
 *   - Pass 1 (only run when partitionings cannot be omitted) revisits the indices not kept in
 *     pass 0 and appends those that are not both useful and canonical. These are counted in
 *     "all" only.
 *
 * Kept entries are packed contiguously into @c ptab, and the mapping from partition index to
 * packed index is recorded in @c bsd.partitioning_packed_index.
 *
 * @param[out] bsd                      The block size information to update.
 * @param      can_omit_partitionings   True if only the pass 0 entries are needed, and tables
 *                                      above the cutoff can be left empty.
 * @param      partition_count_cutoff   The largest partition count to build when omitting is allowed.
 * @param      partition_count          The partition count to build; used as (count - 2) to index the
 *                                      packed index table, and the caller in this file passes 2, 3, or 4.
 * @param[out] ptab                     The partition info table to populate for this partition count.
 * @param[out] canonical_patterns       Scratch storage for canonical patterns, using
 *                                      BIT_PATTERN_WORDS words per packed table entry.
 */
static void build_partition_table_for_one_partition_count(
block_size_descriptor& bsd,
bool can_omit_partitionings,
unsigned int partition_count_cutoff,
unsigned int partition_count,
partition_info* ptab,
uint64_t* canonical_patterns
) {
// Next free slot in ptab, which is also the number of entries kept so far
unsigned int next_index = 0;
bsd.partitioning_count_selected[partition_count - 1] = 0;
bsd.partitioning_count_all[partition_count - 1] = 0;
// Skip tables larger than config max partition count if we can omit modes
if (can_omit_partitionings && (partition_count > partition_count_cutoff))
{
return;
}
// Iterate through twice
// - Pass 0: Keep selected partitionings
// - Pass 1: Keep non-selected partitionings (skip if in omit mode)
unsigned int max_iter = can_omit_partitionings ? 1 : 2;
// Tracker for things we built in the first iteration
uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 };
for (unsigned int x = 0; x < max_iter; x++)
{
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++)
{
// Don't include things we built in the first pass
if ((x == 1) && build[i])
{
continue;
}
// Generate speculatively into the next free slot; if the entry is not kept then
// next_index is not advanced and the slot is overwritten by the next candidate
bool keep_useful = generate_one_partition_info_entry(bsd, partition_count, i, next_index, ptab[next_index]);
if ((x == 0) && !keep_useful)
{
continue;
}
// Build the canonical pattern for the candidate and compare it with every kept entry
generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * BIT_PATTERN_WORDS);
bool keep_canonical = true;
for (unsigned int j = 0; j < next_index; j++)
{
bool match = compare_canonical_partitionings(canonical_patterns + next_index * BIT_PATTERN_WORDS, canonical_patterns + j * BIT_PATTERN_WORDS);
if (match)
{
keep_canonical = false;
break;
}
}
if (keep_useful && keep_canonical)
{
// Useful and unique entries are only committed during pass 0
if (x == 0)
{
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
bsd.partitioning_count_selected[partition_count - 1]++;
bsd.partitioning_count_all[partition_count - 1]++;
build[i] = 1;
next_index++;
}
}
else
{
// Remaining entries are only committed during pass 1, and are not counted as selected
if (x == 1)
{
bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index);
bsd.partitioning_count_all[partition_count - 1]++;
next_index++;
}
}
}
}
}
/* See header for documentation. */
void init_partition_tables(
	block_size_descriptor& bsd,
	bool can_omit_partitionings,
	unsigned int partition_count_cutoff
) {
	// Storage layout: the 2, 3, and 4 partition tables come first, each using
	// BLOCK_MAX_PARTITIONINGS slots, followed by the single 1 partition entry
	partition_info* const tables = bsd.partitionings;
	partition_info* const single_partition = tables + 3 * BLOCK_MAX_PARTITIONINGS;

	// There is exactly one 1 partition partitioning
	generate_one_partition_info_entry(bsd, 1, 0, 0, *single_partition);
	bsd.partitioning_count_selected[0] = 1;
	bsd.partitioning_count_all[0] = 1;

	// Scratch space for canonical patterns, reused by each table build
	uint64_t* scratch_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS];

	for (unsigned int count = 2; count <= 4; count++)
	{
		partition_info* table = tables + (count - 2) * BLOCK_MAX_PARTITIONINGS;
		build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, count, table, scratch_patterns);
	}

	delete[] scratch_patterns;
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,166 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Platform-specific function implementations.
*
* This module contains functions for querying the host extended ISA support.
*/
// Include before the defines below to pick up any auto-setup based on compiler
// built-in config, if not being set explicitly by the build system
#include "astcenc_internal.h"
#if (ASTCENC_SSE > 0) || (ASTCENC_AVX > 0) || \
(ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
/** Has detect_cpu_isa() been run? The feature flags below are only meaningful once this is true. */
static bool g_init { false };
/** Does this CPU support SSE 4.1? False until detect_cpu_isa() has run. */
static bool g_cpu_has_sse41 { false };
/** Does this CPU support AVX2? False until detect_cpu_isa() has run. */
static bool g_cpu_has_avx2 { false };
/** Does this CPU support POPCNT? False until detect_cpu_isa() has run. */
static bool g_cpu_has_popcnt { false };
/** Does this CPU support F16C? False until detect_cpu_isa() has run. */
static bool g_cpu_has_f16c { false };
/* ============================================================================
Platform code for Visual Studio
============================================================================ */
#if !defined(__clang__) && defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include <intrin.h>
/**
 * @brief Query CPUID and record which ISA extensions the host CPU reports.
 *
 * Updates the global feature trackers, and then marks them as initialized.
 */
static void detect_cpu_isa()
{
	int regs[4];

	// Leaf 0 returns the highest supported leaf number in EAX
	__cpuid(regs, 0);
	const int max_leaf = regs[0];

	if (max_leaf >= 1)
	{
		__cpuidex(regs, 1, 0);
		const int leaf1_ecx = regs[2];

		// SSE41 = Bank 1, ECX, bit 19
		g_cpu_has_sse41 = (leaf1_ecx & (1 << 19)) != 0;
		// POPCNT = Bank 1, ECX, bit 23
		g_cpu_has_popcnt = (leaf1_ecx & (1 << 23)) != 0;
		// F16C = Bank 1, ECX, bit 29
		g_cpu_has_f16c = (leaf1_ecx & (1 << 29)) != 0;
	}

	if (max_leaf >= 7)
	{
		__cpuidex(regs, 7, 0);
		const int leaf7_ebx = regs[1];

		// AVX2 = Bank 7, EBX, bit 5
		g_cpu_has_avx2 = (leaf7_ebx & (1 << 5)) != 0;
	}

	// The feature flags must be visible before the init flag is raised
	MemoryBarrier();
	g_init = true;
}
/* ============================================================================
Platform code for GCC and Clang
============================================================================ */
#else
#include <cpuid.h>
/**
 * @brief Query CPUID and record which ISA extensions the host CPU reports.
 *
 * Updates the global feature trackers, and then marks them as initialized.
 */
static void detect_cpu_isa()
{
	unsigned int eax;
	unsigned int ebx;
	unsigned int ecx;
	unsigned int edx;

	if (__get_cpuid_count(1, 0, &eax, &ebx, &ecx, &edx))
	{
		// SSE41 = Bank 1, ECX, bit 19
		g_cpu_has_sse41 = (ecx & (1 << 19)) != 0;
		// POPCNT = Bank 1, ECX, bit 23
		g_cpu_has_popcnt = (ecx & (1 << 23)) != 0;
		// F16C = Bank 1, ECX, bit 29
		g_cpu_has_f16c = (ecx & (1 << 29)) != 0;
	}

	// AVX2 stays disabled unless leaf 7 is available and reports it
	g_cpu_has_avx2 = false;
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
	{
		// AVX2 = Bank 7, EBX, bit 5
		g_cpu_has_avx2 = (ebx & (1 << 5)) != 0;
	}

	// The feature flags must be visible before the init flag is raised
	__sync_synchronize();
	g_init = true;
}
#endif
/**
 * @brief Run the CPU ISA detection if it has not been run yet.
 */
static void ensure_cpu_isa_detected()
{
	if (g_init)
	{
		return;
	}

	detect_cpu_isa();
}

/* See header for documentation. */
bool cpu_supports_popcnt()
{
	ensure_cpu_isa_detected();
	return g_cpu_has_popcnt;
}

/* See header for documentation. */
bool cpu_supports_f16c()
{
	ensure_cpu_isa_detected();
	return g_cpu_has_f16c;
}

/* See header for documentation. */
bool cpu_supports_sse41()
{
	ensure_cpu_isa_detected();
	return g_cpu_has_sse41;
}

/* See header for documentation. */
bool cpu_supports_avx2()
{
	ensure_cpu_isa_detected();
	return g_cpu_has_avx2;
}
#endif
@@ -0,0 +1,903 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions and data tables for numeric quantization.
*/
#include "astcenc_internal.h"
#if !defined(ASTCENC_DECOMPRESS_ONLY)
// Not scrambled, starts from QUANT_6
const uint8_t color_unquant_to_uquant_tables[17][512] {
{ // QUANT_6
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 102, 102, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 153, 153, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_8
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 109, 109, 109, 109, 109, 109, 109, 109, 109,
109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
146, 146, 146, 146, 146, 146, 146, 146, 146, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_10
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56,
56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56,
56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 171, 171, 171, 171, 171, 171,
171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171,
171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
227, 227, 227, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_12
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
23, 23, 23, 23, 23, 23, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46,
46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
69, 69, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92,
92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139,
139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163,
163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 186, 186,
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209,
209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 232, 232, 232, 232, 232, 232,
232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
232, 232, 232, 232, 232, 232, 232, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_16
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 68, 68, 68, 68, 68, 68, 68, 68,
68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 85, 85, 85, 85, 85, 85,
85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 102, 102, 102, 102,
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119, 119,
119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
136, 136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
153, 153, 153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
170, 170, 170, 170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
187, 187, 187, 187, 187, 187, 187, 187, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221,
221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238,
238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_20
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 13, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
27, 27, 27, 27, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 54,
54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 67, 67, 67, 67, 67, 67,
67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94,
94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
107, 107, 107, 107, 107, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 148, 148, 148, 148, 148,
148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161,
161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175,
175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188,
188, 188, 188, 188, 188, 188, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201,
201, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 228, 228, 228, 228,
228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 242, 242, 242, 242, 242, 242, 242, 242, 242,
242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_24
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33,
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
44, 44, 44, 44, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 66, 66, 66, 66, 66, 66,
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,
77, 77, 77, 77, 77, 77, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 99, 99, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110,
110, 110, 110, 110, 110, 110, 110, 110, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 145, 145, 145, 145, 145, 145, 145, 145,
145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156,
156, 156, 156, 156, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 178, 178, 178, 178, 178, 178,
178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189,
189, 189, 189, 189, 189, 189, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 211, 211, 211, 211,
211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222,
222, 222, 222, 222, 222, 222, 222, 222, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 244, 244,
244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_32
0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 33, 33, 33, 33, 33, 33,
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 49, 49, 49, 49, 49,
49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 66, 66, 66, 66,
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 82, 82, 82,
82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 115,
115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140,
140, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156,
156, 156, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173,
173, 173, 173, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189,
189, 189, 189, 189, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
206, 206, 206, 206, 206, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222,
222, 222, 222, 222, 222, 222, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 239, 239, 239, 239, 239, 239, 239, 239, 239,
239, 239, 239, 239, 239, 239, 239, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 255, 255, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_40
0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 32, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 32, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
45, 45, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
78, 78, 78, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 97, 97, 97,
97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 110, 110, 110, 110, 110, 110, 110, 110, 110,
110, 110, 110, 110, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 145, 145, 145, 145,
145, 145, 145, 145, 145, 145, 145, 145, 145, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158,
158, 158, 158, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 177, 177, 177,
177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 190, 190, 190, 190, 190, 190, 190, 190, 190,
190, 190, 190, 190, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 210, 210,
210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 223, 223, 223, 223, 223, 223, 223, 223,
223, 223, 223, 223, 223, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 242,
242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 255, 255, 255, 255, 255, 255, 255
},
{ // QUANT_48
0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 32, 32, 32, 32,
32, 32, 32, 32, 32, 32, 32, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 81, 81,
81, 81, 81, 81, 81, 81, 81, 81, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 97, 97,
97, 97, 97, 97, 97, 97, 97, 97, 97, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 113, 113,
113, 113, 113, 113, 113, 113, 113, 113, 113, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 142, 142, 142, 142, 142, 142, 142, 142, 142,
142, 142, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 158, 158, 158, 158, 158, 158, 158, 158, 158,
158, 158, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 169, 169, 169, 169, 169, 169, 169, 169, 169, 169, 169, 174, 174, 174, 174, 174, 174, 174, 174,
174, 174, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 190, 190, 190, 190, 190, 190, 190, 190,
190, 190, 190, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 207, 207, 207, 207, 207, 207, 207,
207, 207, 207, 207, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 223, 223, 223, 223, 223, 223, 223,
223, 223, 223, 223, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 239, 239, 239, 239, 239, 239,
239, 239, 239, 239, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 255, 255, 255, 255, 255, 255
},
{ // QUANT_64
0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16,
16, 16, 16, 16, 16, 20, 20, 20, 20, 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32,
32, 32, 32, 32, 32, 36, 36, 36, 36, 36, 36, 36, 36, 40, 40, 40, 40, 40, 40, 40, 40, 44, 44, 44, 44, 44, 44, 44, 44, 48, 48, 48,
48, 48, 48, 48, 48, 52, 52, 52, 52, 52, 52, 52, 52, 56, 56, 56, 56, 56, 56, 56, 56, 60, 60, 60, 60, 60, 60, 60, 60, 60, 65, 65,
65, 65, 65, 65, 65, 65, 65, 69, 69, 69, 69, 69, 69, 69, 69, 73, 73, 73, 73, 73, 73, 73, 73, 77, 77, 77, 77, 77, 77, 77, 77, 81,
81, 81, 81, 81, 81, 81, 81, 85, 85, 85, 85, 85, 85, 85, 85, 89, 89, 89, 89, 89, 89, 89, 89, 93, 93, 93, 93, 93, 93, 93, 93, 97,
97, 97, 97, 97, 97, 97, 97, 101, 101, 101, 101, 101, 101, 101, 101, 105, 105, 105, 105, 105, 105, 105, 105, 109, 109, 109, 109, 109, 109, 109, 109, 113,
113, 113, 113, 113, 113, 113, 113, 117, 117, 117, 117, 117, 117, 117, 117, 121, 121, 121, 121, 121, 121, 121, 121, 125, 125, 125, 125, 125, 125, 125, 125, 125,
130, 130, 130, 130, 130, 130, 130, 130, 130, 134, 134, 134, 134, 134, 134, 134, 134, 138, 138, 138, 138, 138, 138, 138, 138, 142, 142, 142, 142, 142, 142, 142,
142, 146, 146, 146, 146, 146, 146, 146, 146, 150, 150, 150, 150, 150, 150, 150, 150, 154, 154, 154, 154, 154, 154, 154, 154, 158, 158, 158, 158, 158, 158, 158,
158, 162, 162, 162, 162, 162, 162, 162, 162, 166, 166, 166, 166, 166, 166, 166, 166, 170, 170, 170, 170, 170, 170, 170, 170, 174, 174, 174, 174, 174, 174, 174,
174, 178, 178, 178, 178, 178, 178, 178, 178, 182, 182, 182, 182, 182, 182, 182, 182, 186, 186, 186, 186, 186, 186, 186, 186, 190, 190, 190, 190, 190, 190, 190,
190, 190, 195, 195, 195, 195, 195, 195, 195, 195, 195, 199, 199, 199, 199, 199, 199, 199, 199, 203, 203, 203, 203, 203, 203, 203, 203, 207, 207, 207, 207, 207,
207, 207, 207, 211, 211, 211, 211, 211, 211, 211, 211, 215, 215, 215, 215, 215, 215, 215, 215, 219, 219, 219, 219, 219, 219, 219, 219, 223, 223, 223, 223, 223,
223, 223, 223, 227, 227, 227, 227, 227, 227, 227, 227, 231, 231, 231, 231, 231, 231, 231, 231, 235, 235, 235, 235, 235, 235, 235, 235, 239, 239, 239, 239, 239,
239, 239, 239, 243, 243, 243, 243, 243, 243, 243, 243, 247, 247, 247, 247, 247, 247, 247, 247, 251, 251, 251, 251, 251, 251, 251, 251, 255, 255, 255, 255, 255
},
{ // QUANT_80
0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 9, 9, 9, 9, 9, 9, 9, 13, 13, 13, 13, 13, 13, 13, 16, 16,
16, 16, 16, 16, 19, 19, 19, 19, 19, 19, 22, 22, 22, 22, 22, 22, 25, 25, 25, 25, 25, 25, 25, 29, 29, 29, 29, 29, 29, 29, 32, 32,
32, 32, 32, 32, 35, 35, 35, 35, 35, 35, 38, 38, 38, 38, 38, 38, 38, 42, 42, 42, 42, 42, 42, 42, 45, 45, 45, 45, 45, 45, 48, 48,
48, 48, 48, 48, 51, 51, 51, 51, 51, 51, 54, 54, 54, 54, 54, 54, 54, 58, 58, 58, 58, 58, 58, 58, 61, 61, 61, 61, 61, 61, 64, 64,
64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 71, 71, 71, 71, 71, 71, 71, 74, 74, 74, 74, 74, 74, 77, 77, 77, 77, 77, 77, 80, 80,
80, 80, 80, 80, 83, 83, 83, 83, 83, 83, 83, 87, 87, 87, 87, 87, 87, 87, 90, 90, 90, 90, 90, 90, 93, 93, 93, 93, 93, 93, 96, 96,
96, 96, 96, 96, 96, 100, 100, 100, 100, 100, 100, 100, 103, 103, 103, 103, 103, 103, 106, 106, 106, 106, 106, 106, 109, 109, 109, 109, 109, 109, 112, 112,
112, 112, 112, 112, 112, 116, 116, 116, 116, 116, 116, 116, 119, 119, 119, 119, 119, 119, 122, 122, 122, 122, 122, 122, 125, 125, 125, 125, 125, 125, 125, 125,
130, 130, 130, 130, 130, 130, 130, 130, 133, 133, 133, 133, 133, 133, 136, 136, 136, 136, 136, 136, 139, 139, 139, 139, 139, 139, 139, 143, 143, 143, 143, 143,
143, 143, 146, 146, 146, 146, 146, 146, 149, 149, 149, 149, 149, 149, 152, 152, 152, 152, 152, 152, 155, 155, 155, 155, 155, 155, 155, 159, 159, 159, 159, 159,
159, 159, 162, 162, 162, 162, 162, 162, 165, 165, 165, 165, 165, 165, 168, 168, 168, 168, 168, 168, 168, 172, 172, 172, 172, 172, 172, 172, 175, 175, 175, 175,
175, 175, 178, 178, 178, 178, 178, 178, 181, 181, 181, 181, 181, 181, 184, 184, 184, 184, 184, 184, 184, 188, 188, 188, 188, 188, 188, 188, 191, 191, 191, 191,
191, 191, 194, 194, 194, 194, 194, 194, 197, 197, 197, 197, 197, 197, 197, 201, 201, 201, 201, 201, 201, 201, 204, 204, 204, 204, 204, 204, 207, 207, 207, 207,
207, 207, 210, 210, 210, 210, 210, 210, 213, 213, 213, 213, 213, 213, 213, 217, 217, 217, 217, 217, 217, 217, 220, 220, 220, 220, 220, 220, 223, 223, 223, 223,
223, 223, 226, 226, 226, 226, 226, 226, 226, 230, 230, 230, 230, 230, 230, 230, 233, 233, 233, 233, 233, 233, 236, 236, 236, 236, 236, 236, 239, 239, 239, 239,
239, 239, 242, 242, 242, 242, 242, 242, 242, 246, 246, 246, 246, 246, 246, 246, 249, 249, 249, 249, 249, 249, 252, 252, 252, 252, 252, 252, 255, 255, 255, 255
},
{ // QUANT_96
0, 0, 0, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 13, 13, 13, 13, 13, 13, 16, 16,
16, 16, 16, 18, 18, 18, 18, 18, 21, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 26, 26, 26, 26, 26, 29, 29, 29, 29, 29, 29, 32, 32,
32, 32, 32, 32, 35, 35, 35, 35, 35, 37, 37, 37, 37, 37, 40, 40, 40, 40, 40, 40, 43, 43, 43, 43, 43, 45, 45, 45, 45, 45, 48, 48,
48, 48, 48, 48, 51, 51, 51, 51, 51, 53, 53, 53, 53, 53, 56, 56, 56, 56, 56, 56, 59, 59, 59, 59, 59, 61, 61, 61, 61, 61, 64, 64,
64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 70, 70, 70, 70, 70, 72, 72, 72, 72, 72, 75, 75, 75, 75, 75, 75, 78, 78, 78, 78, 78, 80,
80, 80, 80, 80, 83, 83, 83, 83, 83, 83, 86, 86, 86, 86, 86, 88, 88, 88, 88, 88, 91, 91, 91, 91, 91, 91, 94, 94, 94, 94, 94, 96,
96, 96, 96, 96, 99, 99, 99, 99, 99, 99, 102, 102, 102, 102, 102, 104, 104, 104, 104, 104, 107, 107, 107, 107, 107, 107, 110, 110, 110, 110, 110, 112,
112, 112, 112, 112, 115, 115, 115, 115, 115, 115, 118, 118, 118, 118, 118, 120, 120, 120, 120, 120, 123, 123, 123, 123, 123, 123, 126, 126, 126, 126, 126, 126,
129, 129, 129, 129, 129, 129, 132, 132, 132, 132, 132, 132, 135, 135, 135, 135, 135, 137, 137, 137, 137, 137, 140, 140, 140, 140, 140, 140, 143, 143, 143, 143,
143, 145, 145, 145, 145, 145, 148, 148, 148, 148, 148, 148, 151, 151, 151, 151, 151, 153, 153, 153, 153, 153, 156, 156, 156, 156, 156, 156, 159, 159, 159, 159,
159, 161, 161, 161, 161, 161, 164, 164, 164, 164, 164, 164, 167, 167, 167, 167, 167, 169, 169, 169, 169, 169, 172, 172, 172, 172, 172, 172, 175, 175, 175, 175,
175, 177, 177, 177, 177, 177, 180, 180, 180, 180, 180, 180, 183, 183, 183, 183, 183, 185, 185, 185, 185, 185, 188, 188, 188, 188, 188, 188, 191, 191, 191, 191,
191, 191, 194, 194, 194, 194, 194, 196, 196, 196, 196, 196, 199, 199, 199, 199, 199, 199, 202, 202, 202, 202, 202, 204, 204, 204, 204, 204, 207, 207, 207, 207,
207, 207, 210, 210, 210, 210, 210, 212, 212, 212, 212, 212, 215, 215, 215, 215, 215, 215, 218, 218, 218, 218, 218, 220, 220, 220, 220, 220, 223, 223, 223, 223,
223, 223, 226, 226, 226, 226, 226, 226, 229, 229, 229, 229, 229, 231, 231, 231, 231, 231, 234, 234, 234, 234, 234, 234, 237, 237, 237, 237, 237, 239, 239, 239,
239, 239, 242, 242, 242, 242, 242, 242, 245, 245, 245, 245, 245, 247, 247, 247, 247, 247, 250, 250, 250, 250, 250, 250, 253, 253, 253, 253, 253, 255, 255, 255
},
{ // QUANT_128
0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, 10, 10, 10, 10, 12, 12, 12, 12, 14, 14, 14, 14, 16,
16, 16, 16, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 24, 24, 24, 24, 26, 26, 26, 26, 28, 28, 28, 28, 30, 30, 30, 30, 32,
32, 32, 32, 34, 34, 34, 34, 36, 36, 36, 36, 38, 38, 38, 38, 40, 40, 40, 40, 42, 42, 42, 42, 44, 44, 44, 44, 46, 46, 46, 46, 48,
48, 48, 48, 50, 50, 50, 50, 52, 52, 52, 52, 54, 54, 54, 54, 56, 56, 56, 56, 58, 58, 58, 58, 60, 60, 60, 60, 62, 62, 62, 62, 64,
64, 64, 64, 66, 66, 66, 66, 68, 68, 68, 68, 70, 70, 70, 70, 72, 72, 72, 72, 74, 74, 74, 74, 76, 76, 76, 76, 78, 78, 78, 78, 80,
80, 80, 80, 82, 82, 82, 82, 84, 84, 84, 84, 86, 86, 86, 86, 88, 88, 88, 88, 90, 90, 90, 90, 92, 92, 92, 92, 94, 94, 94, 94, 96,
96, 96, 96, 98, 98, 98, 98, 100, 100, 100, 100, 102, 102, 102, 102, 104, 104, 104, 104, 106, 106, 106, 106, 108, 108, 108, 108, 110, 110, 110, 110, 112,
112, 112, 112, 114, 114, 114, 114, 116, 116, 116, 116, 118, 118, 118, 118, 120, 120, 120, 120, 122, 122, 122, 122, 124, 124, 124, 124, 126, 126, 126, 126, 126,
129, 129, 129, 129, 129, 131, 131, 131, 131, 133, 133, 133, 133, 135, 135, 135, 135, 137, 137, 137, 137, 139, 139, 139, 139, 141, 141, 141, 141, 143, 143, 143,
143, 145, 145, 145, 145, 147, 147, 147, 147, 149, 149, 149, 149, 151, 151, 151, 151, 153, 153, 153, 153, 155, 155, 155, 155, 157, 157, 157, 157, 159, 159, 159,
159, 161, 161, 161, 161, 163, 163, 163, 163, 165, 165, 165, 165, 167, 167, 167, 167, 169, 169, 169, 169, 171, 171, 171, 171, 173, 173, 173, 173, 175, 175, 175,
175, 177, 177, 177, 177, 179, 179, 179, 179, 181, 181, 181, 181, 183, 183, 183, 183, 185, 185, 185, 185, 187, 187, 187, 187, 189, 189, 189, 189, 191, 191, 191,
191, 193, 193, 193, 193, 195, 195, 195, 195, 197, 197, 197, 197, 199, 199, 199, 199, 201, 201, 201, 201, 203, 203, 203, 203, 205, 205, 205, 205, 207, 207, 207,
207, 209, 209, 209, 209, 211, 211, 211, 211, 213, 213, 213, 213, 215, 215, 215, 215, 217, 217, 217, 217, 219, 219, 219, 219, 221, 221, 221, 221, 223, 223, 223,
223, 225, 225, 225, 225, 227, 227, 227, 227, 229, 229, 229, 229, 231, 231, 231, 231, 233, 233, 233, 233, 235, 235, 235, 235, 237, 237, 237, 237, 239, 239, 239,
239, 241, 241, 241, 241, 243, 243, 243, 243, 245, 245, 245, 245, 247, 247, 247, 247, 249, 249, 249, 249, 251, 251, 251, 251, 253, 253, 253, 253, 255, 255, 255
},
{ // QUANT_160
0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 9, 9, 9, 11, 11, 11, 12, 12, 12, 14, 14, 14, 14, 16,
16, 16, 17, 17, 17, 19, 19, 19, 20, 20, 20, 22, 22, 22, 22, 24, 24, 24, 25, 25, 25, 27, 27, 27, 28, 28, 28, 30, 30, 30, 30, 32,
32, 32, 33, 33, 33, 35, 35, 35, 36, 36, 36, 38, 38, 38, 38, 40, 40, 40, 41, 41, 41, 43, 43, 43, 44, 44, 44, 46, 46, 46, 46, 48,
48, 48, 49, 49, 49, 51, 51, 51, 52, 52, 52, 54, 54, 54, 54, 56, 56, 56, 57, 57, 57, 59, 59, 59, 60, 60, 60, 62, 62, 62, 62, 64,
64, 64, 65, 65, 65, 67, 67, 67, 68, 68, 68, 70, 70, 70, 70, 72, 72, 72, 73, 73, 73, 75, 75, 75, 76, 76, 76, 78, 78, 78, 78, 80,
80, 80, 81, 81, 81, 83, 83, 83, 84, 84, 84, 86, 86, 86, 86, 88, 88, 88, 89, 89, 89, 91, 91, 91, 92, 92, 92, 94, 94, 94, 94, 96,
96, 96, 97, 97, 97, 99, 99, 99, 100, 100, 100, 102, 102, 102, 102, 104, 104, 104, 105, 105, 105, 107, 107, 107, 108, 108, 108, 110, 110, 110, 110, 112,
112, 112, 113, 113, 113, 115, 115, 115, 116, 116, 116, 118, 118, 118, 118, 120, 120, 120, 121, 121, 121, 123, 123, 123, 124, 124, 124, 126, 126, 126, 126, 126,
129, 129, 129, 129, 129, 131, 131, 131, 132, 132, 132, 134, 134, 134, 135, 135, 135, 137, 137, 137, 137, 139, 139, 139, 140, 140, 140, 142, 142, 142, 143, 143,
143, 145, 145, 145, 145, 147, 147, 147, 148, 148, 148, 150, 150, 150, 151, 151, 151, 153, 153, 153, 153, 155, 155, 155, 156, 156, 156, 158, 158, 158, 159, 159,
159, 161, 161, 161, 161, 163, 163, 163, 164, 164, 164, 166, 166, 166, 167, 167, 167, 169, 169, 169, 169, 171, 171, 171, 172, 172, 172, 174, 174, 174, 175, 175,
175, 177, 177, 177, 177, 179, 179, 179, 180, 180, 180, 182, 182, 182, 183, 183, 183, 185, 185, 185, 185, 187, 187, 187, 188, 188, 188, 190, 190, 190, 191, 191,
191, 193, 193, 193, 193, 195, 195, 195, 196, 196, 196, 198, 198, 198, 199, 199, 199, 201, 201, 201, 201, 203, 203, 203, 204, 204, 204, 206, 206, 206, 207, 207,
207, 209, 209, 209, 209, 211, 211, 211, 212, 212, 212, 214, 214, 214, 215, 215, 215, 217, 217, 217, 217, 219, 219, 219, 220, 220, 220, 222, 222, 222, 223, 223,
223, 225, 225, 225, 225, 227, 227, 227, 228, 228, 228, 230, 230, 230, 231, 231, 231, 233, 233, 233, 233, 235, 235, 235, 236, 236, 236, 238, 238, 238, 239, 239,
239, 241, 241, 241, 241, 243, 243, 243, 244, 244, 244, 246, 246, 246, 247, 247, 247, 249, 249, 249, 249, 251, 251, 251, 252, 252, 252, 254, 254, 254, 255, 255
},
{ // QUANT_192
0, 0, 1, 1, 2, 2, 2, 4, 4, 4, 5, 5, 6, 6, 6, 8, 8, 8, 9, 9, 10, 10, 10, 12, 12, 12, 13, 13, 14, 14, 14, 16,
16, 16, 17, 17, 18, 18, 18, 20, 20, 20, 21, 21, 22, 22, 22, 24, 24, 24, 25, 25, 26, 26, 26, 28, 28, 28, 29, 29, 30, 30, 30, 32,
32, 32, 33, 33, 34, 34, 34, 36, 36, 36, 37, 37, 38, 38, 38, 40, 40, 40, 41, 41, 42, 42, 42, 44, 44, 44, 45, 45, 46, 46, 46, 48,
48, 48, 49, 49, 50, 50, 50, 52, 52, 52, 53, 53, 54, 54, 54, 56, 56, 56, 57, 57, 58, 58, 58, 60, 60, 60, 61, 61, 62, 62, 62, 64,
64, 64, 65, 65, 66, 66, 66, 68, 68, 68, 69, 69, 70, 70, 70, 72, 72, 72, 73, 73, 74, 74, 74, 76, 76, 76, 77, 77, 78, 78, 78, 80,
80, 80, 81, 81, 82, 82, 82, 84, 84, 84, 85, 85, 86, 86, 86, 88, 88, 88, 89, 89, 90, 90, 90, 92, 92, 92, 93, 93, 94, 94, 94, 96,
96, 96, 97, 97, 98, 98, 98, 100, 100, 100, 101, 101, 102, 102, 102, 104, 104, 104, 105, 105, 106, 106, 106, 108, 108, 108, 109, 109, 110, 110, 110, 112,
112, 112, 113, 113, 114, 114, 114, 116, 116, 116, 117, 117, 118, 118, 118, 120, 120, 120, 121, 121, 122, 122, 122, 124, 124, 124, 125, 125, 126, 126, 126, 126,
129, 129, 129, 129, 130, 130, 131, 131, 131, 133, 133, 133, 134, 134, 135, 135, 135, 137, 137, 137, 138, 138, 139, 139, 139, 141, 141, 141, 142, 142, 143, 143,
143, 145, 145, 145, 146, 146, 147, 147, 147, 149, 149, 149, 150, 150, 151, 151, 151, 153, 153, 153, 154, 154, 155, 155, 155, 157, 157, 157, 158, 158, 159, 159,
159, 161, 161, 161, 162, 162, 163, 163, 163, 165, 165, 165, 166, 166, 167, 167, 167, 169, 169, 169, 170, 170, 171, 171, 171, 173, 173, 173, 174, 174, 175, 175,
175, 177, 177, 177, 178, 178, 179, 179, 179, 181, 181, 181, 182, 182, 183, 183, 183, 185, 185, 185, 186, 186, 187, 187, 187, 189, 189, 189, 190, 190, 191, 191,
191, 193, 193, 193, 194, 194, 195, 195, 195, 197, 197, 197, 198, 198, 199, 199, 199, 201, 201, 201, 202, 202, 203, 203, 203, 205, 205, 205, 206, 206, 207, 207,
207, 209, 209, 209, 210, 210, 211, 211, 211, 213, 213, 213, 214, 214, 215, 215, 215, 217, 217, 217, 218, 218, 219, 219, 219, 221, 221, 221, 222, 222, 223, 223,
223, 225, 225, 225, 226, 226, 227, 227, 227, 229, 229, 229, 230, 230, 231, 231, 231, 233, 233, 233, 234, 234, 235, 235, 235, 237, 237, 237, 238, 238, 239, 239,
239, 241, 241, 241, 242, 242, 243, 243, 243, 245, 245, 245, 246, 246, 247, 247, 247, 249, 249, 249, 250, 250, 251, 251, 251, 253, 253, 253, 254, 254, 255, 255
},
{ // QUANT_256
0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15,
16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31,
32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47,
48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63,
64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87, 88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 95, 95,
96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 103, 104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127,
128, 128, 129, 129, 130, 130, 131, 131, 132, 132, 133, 133, 134, 134, 135, 135, 136, 136, 137, 137, 138, 138, 139, 139, 140, 140, 141, 141, 142, 142, 143, 143,
144, 144, 145, 145, 146, 146, 147, 147, 148, 148, 149, 149, 150, 150, 151, 151, 152, 152, 153, 153, 154, 154, 155, 155, 156, 156, 157, 157, 158, 158, 159, 159,
160, 160, 161, 161, 162, 162, 163, 163, 164, 164, 165, 165, 166, 166, 167, 167, 168, 168, 169, 169, 170, 170, 171, 171, 172, 172, 173, 173, 174, 174, 175, 175,
176, 176, 177, 177, 178, 178, 179, 179, 180, 180, 181, 181, 182, 182, 183, 183, 184, 184, 185, 185, 186, 186, 187, 187, 188, 188, 189, 189, 190, 190, 191, 191,
192, 192, 193, 193, 194, 194, 195, 195, 196, 196, 197, 197, 198, 198, 199, 199, 200, 200, 201, 201, 202, 202, 203, 203, 204, 204, 205, 205, 206, 206, 207, 207,
208, 208, 209, 209, 210, 210, 211, 211, 212, 212, 213, 213, 214, 214, 215, 215, 216, 216, 217, 217, 218, 218, 219, 219, 220, 220, 221, 221, 222, 222, 223, 223,
224, 224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239,
240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255
},
};
// Forward lookup from an unquantized 8-bit color value to its scrambled
// packed-quant index: color_uquant_to_scrambled_pquant_tables[level][value].
// The 17 rows are stored in quant-level order, starting from QUANT_6 and
// ending at QUANT_256 (each row is labelled below); the column is the
// unquantized value 0..255.
// The QUANT_8, QUANT_16, QUANT_32, QUANT_64, QUANT_128 and QUANT_256 rows are
// non-decreasing (QUANT_256 is the identity mapping). The remaining rows are
// scrambled: the stored index does not grow monotonically with the value.
// NOTE(review): each row looks like the nearest-value inverse of the matching
// color_scrambled_pquant_to_uquant_qN array - confirm before relying on it.
const uint8_t color_uquant_to_scrambled_pquant_tables[17][256] {
{ // QUANT_6
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
},
{ // QUANT_8
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
},
{ // QUANT_10
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
},
{ // QUANT_12
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
},
{ // QUANT_16
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15
},
{ // QUANT_20
0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 15, 15,
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1
},
{ // QUANT_24
0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 15, 15, 15, 15,
15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1
},
{ // QUANT_32
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8,
8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10,
10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14,
14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
19, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21,
21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23,
23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25,
25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29,
29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31
},
{ // QUANT_40
0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16,
24, 24, 24, 24, 24, 24, 24, 32, 32, 32, 32, 32, 32, 32, 2, 2,
2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 18, 18, 18, 18, 18, 18,
18, 26, 26, 26, 26, 26, 26, 34, 34, 34, 34, 34, 34, 34, 4, 4,
4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 20, 20, 20, 20, 20,
20, 28, 28, 28, 28, 28, 28, 28, 36, 36, 36, 36, 36, 36, 36, 6,
6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 22, 22, 22, 22, 22,
22, 22, 30, 30, 30, 30, 30, 30, 38, 38, 38, 38, 38, 38, 38, 38,
39, 39, 39, 39, 39, 39, 39, 39, 31, 31, 31, 31, 31, 31, 23, 23,
23, 23, 23, 23, 23, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7,
7, 37, 37, 37, 37, 37, 37, 37, 29, 29, 29, 29, 29, 29, 29, 21,
21, 21, 21, 21, 21, 13, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5,
5, 5, 35, 35, 35, 35, 35, 35, 35, 27, 27, 27, 27, 27, 27, 19,
19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3,
3, 3, 33, 33, 33, 33, 33, 33, 33, 25, 25, 25, 25, 25, 25, 25,
17, 17, 17, 17, 17, 17, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1
},
{ // QUANT_48
0, 0, 0, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 2, 2,
2, 2, 2, 18, 18, 18, 18, 18, 34, 34, 34, 34, 34, 34, 4, 4,
4, 4, 4, 20, 20, 20, 20, 20, 20, 36, 36, 36, 36, 36, 6, 6,
6, 6, 6, 22, 22, 22, 22, 22, 22, 38, 38, 38, 38, 38, 38, 8,
8, 8, 8, 8, 24, 24, 24, 24, 24, 40, 40, 40, 40, 40, 40, 10,
10, 10, 10, 10, 26, 26, 26, 26, 26, 42, 42, 42, 42, 42, 42, 12,
12, 12, 12, 12, 28, 28, 28, 28, 28, 28, 44, 44, 44, 44, 44, 14,
14, 14, 14, 14, 30, 30, 30, 30, 30, 30, 46, 46, 46, 46, 46, 46,
47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 15, 15, 15, 15,
15, 45, 45, 45, 45, 45, 29, 29, 29, 29, 29, 29, 13, 13, 13, 13,
13, 43, 43, 43, 43, 43, 43, 27, 27, 27, 27, 27, 11, 11, 11, 11,
11, 41, 41, 41, 41, 41, 41, 25, 25, 25, 25, 25, 9, 9, 9, 9,
9, 39, 39, 39, 39, 39, 39, 23, 23, 23, 23, 23, 23, 7, 7, 7,
7, 7, 37, 37, 37, 37, 37, 21, 21, 21, 21, 21, 21, 5, 5, 5,
5, 5, 35, 35, 35, 35, 35, 35, 19, 19, 19, 19, 19, 3, 3, 3,
3, 3, 33, 33, 33, 33, 33, 33, 17, 17, 17, 17, 17, 1, 1, 1
},
{ // QUANT_64
0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,
4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8,
8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12,
12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 15, 16,
16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20,
20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24,
24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 28,
28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31,
32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39,
40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43,
44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47,
47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51,
51, 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55,
55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59,
59, 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63
},
{ // QUANT_80
0, 0, 16, 16, 16, 32, 32, 32, 48, 48, 48, 64, 64, 64, 64, 2,
2, 2, 18, 18, 18, 34, 34, 34, 50, 50, 50, 66, 66, 66, 66, 4,
4, 4, 20, 20, 20, 36, 36, 36, 52, 52, 52, 52, 68, 68, 68, 6,
6, 6, 22, 22, 22, 38, 38, 38, 54, 54, 54, 54, 70, 70, 70, 8,
8, 8, 24, 24, 24, 40, 40, 40, 40, 56, 56, 56, 72, 72, 72, 10,
10, 10, 26, 26, 26, 42, 42, 42, 42, 58, 58, 58, 74, 74, 74, 12,
12, 12, 28, 28, 28, 28, 44, 44, 44, 60, 60, 60, 76, 76, 76, 14,
14, 14, 30, 30, 30, 30, 46, 46, 46, 62, 62, 62, 78, 78, 78, 78,
79, 79, 79, 79, 63, 63, 63, 47, 47, 47, 31, 31, 31, 31, 15, 15,
15, 77, 77, 77, 61, 61, 61, 45, 45, 45, 29, 29, 29, 29, 13, 13,
13, 75, 75, 75, 59, 59, 59, 43, 43, 43, 43, 27, 27, 27, 11, 11,
11, 73, 73, 73, 57, 57, 57, 41, 41, 41, 41, 25, 25, 25, 9, 9,
9, 71, 71, 71, 55, 55, 55, 55, 39, 39, 39, 23, 23, 23, 7, 7,
7, 69, 69, 69, 53, 53, 53, 53, 37, 37, 37, 21, 21, 21, 5, 5,
5, 67, 67, 67, 67, 51, 51, 51, 35, 35, 35, 19, 19, 19, 3, 3,
3, 65, 65, 65, 65, 49, 49, 49, 33, 33, 33, 17, 17, 17, 1, 1
},
{ // QUANT_96
0, 32, 32, 32, 64, 64, 64, 2, 2, 34, 34, 34, 66, 66, 66, 4,
4, 36, 36, 36, 68, 68, 68, 6, 6, 38, 38, 38, 70, 70, 70, 8,
8, 8, 40, 40, 72, 72, 72, 10, 10, 10, 42, 42, 74, 74, 74, 12,
12, 12, 44, 44, 76, 76, 76, 14, 14, 14, 46, 46, 78, 78, 78, 16,
16, 16, 48, 48, 48, 80, 80, 80, 18, 18, 50, 50, 50, 82, 82, 82,
20, 20, 52, 52, 52, 84, 84, 84, 22, 22, 54, 54, 54, 86, 86, 86,
24, 24, 56, 56, 56, 88, 88, 88, 26, 26, 58, 58, 58, 90, 90, 90,
28, 28, 60, 60, 60, 92, 92, 92, 30, 30, 62, 62, 62, 94, 94, 94,
95, 95, 95, 63, 63, 63, 31, 31, 93, 93, 93, 61, 61, 61, 29, 29,
91, 91, 91, 59, 59, 59, 27, 27, 89, 89, 89, 57, 57, 57, 25, 25,
87, 87, 87, 55, 55, 55, 23, 23, 85, 85, 85, 53, 53, 53, 21, 21,
83, 83, 83, 51, 51, 51, 19, 19, 81, 81, 81, 49, 49, 49, 17, 17,
17, 79, 79, 79, 47, 47, 15, 15, 15, 77, 77, 77, 45, 45, 13, 13,
13, 75, 75, 75, 43, 43, 11, 11, 11, 73, 73, 73, 41, 41, 9, 9,
9, 71, 71, 71, 39, 39, 39, 7, 7, 69, 69, 69, 37, 37, 37, 5,
5, 67, 67, 67, 35, 35, 35, 3, 3, 65, 65, 65, 33, 33, 33, 1
},
{ // QUANT_128
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16,
16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24,
24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32,
32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40,
40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48,
48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56,
56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 63,
64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71,
72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87,
88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 95, 95,
96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 103,
104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119,
120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127
},
{ // QUANT_160
0, 32, 64, 64, 96, 128, 128, 128, 2, 34, 66, 66, 98, 130, 130, 130,
4, 36, 68, 68, 100, 132, 132, 132, 6, 38, 70, 70, 102, 134, 134, 134,
8, 40, 72, 72, 104, 136, 136, 136, 10, 42, 74, 74, 106, 138, 138, 138,
12, 44, 76, 76, 108, 140, 140, 140, 14, 46, 78, 78, 110, 142, 142, 142,
16, 48, 80, 80, 112, 144, 144, 144, 18, 50, 82, 82, 114, 146, 146, 146,
20, 52, 84, 84, 116, 148, 148, 148, 22, 54, 86, 86, 118, 150, 150, 150,
24, 56, 88, 88, 120, 152, 152, 152, 26, 58, 90, 90, 122, 154, 154, 154,
28, 60, 92, 92, 124, 156, 156, 156, 30, 62, 94, 94, 126, 158, 158, 158,
159, 159, 159, 127, 95, 95, 63, 31, 157, 157, 157, 125, 93, 93, 61, 29,
155, 155, 155, 123, 91, 91, 59, 27, 153, 153, 153, 121, 89, 89, 57, 25,
151, 151, 151, 119, 87, 87, 55, 23, 149, 149, 149, 117, 85, 85, 53, 21,
147, 147, 147, 115, 83, 83, 51, 19, 145, 145, 145, 113, 81, 81, 49, 17,
143, 143, 143, 111, 79, 79, 47, 15, 141, 141, 141, 109, 77, 77, 45, 13,
139, 139, 139, 107, 75, 75, 43, 11, 137, 137, 137, 105, 73, 73, 41, 9,
135, 135, 135, 103, 71, 71, 39, 7, 133, 133, 133, 101, 69, 69, 37, 5,
131, 131, 131, 99, 67, 67, 35, 3, 129, 129, 129, 97, 65, 65, 33, 1
},
{ // QUANT_192
0, 64, 128, 128, 2, 66, 130, 130, 4, 68, 132, 132, 6, 70, 134, 134,
8, 72, 136, 136, 10, 74, 138, 138, 12, 76, 140, 140, 14, 78, 142, 142,
16, 80, 144, 144, 18, 82, 146, 146, 20, 84, 148, 148, 22, 86, 150, 150,
24, 88, 152, 152, 26, 90, 154, 154, 28, 92, 156, 156, 30, 94, 158, 158,
32, 96, 160, 160, 34, 98, 162, 162, 36, 100, 164, 164, 38, 102, 166, 166,
40, 104, 168, 168, 42, 106, 170, 170, 44, 108, 172, 172, 46, 110, 174, 174,
48, 112, 176, 176, 50, 114, 178, 178, 52, 116, 180, 180, 54, 118, 182, 182,
56, 120, 184, 184, 58, 122, 186, 186, 60, 124, 188, 188, 62, 126, 190, 190,
191, 191, 127, 63, 189, 189, 125, 61, 187, 187, 123, 59, 185, 185, 121, 57,
183, 183, 119, 55, 181, 181, 117, 53, 179, 179, 115, 51, 177, 177, 113, 49,
175, 175, 111, 47, 173, 173, 109, 45, 171, 171, 107, 43, 169, 169, 105, 41,
167, 167, 103, 39, 165, 165, 101, 37, 163, 163, 99, 35, 161, 161, 97, 33,
159, 159, 95, 31, 157, 157, 93, 29, 155, 155, 91, 27, 153, 153, 89, 25,
151, 151, 87, 23, 149, 149, 85, 21, 147, 147, 83, 19, 145, 145, 81, 17,
143, 143, 79, 15, 141, 141, 77, 13, 139, 139, 75, 11, 137, 137, 73, 9,
135, 135, 71, 7, 133, 133, 69, 5, 131, 131, 67, 3, 129, 129, 65, 1
},
{ // QUANT_256
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
}
};
#endif
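// Per-quant-level tables mapping a scrambled packed quantized color value to its unquantized
// value. There is one table per quant level, starting from QUANT_6.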
static const uint8_t color_scrambled_pquant_to_uquant_q6[6] {
0, 255, 51, 204, 102, 153
};
static const uint8_t color_scrambled_pquant_to_uquant_q8[8] {
0, 36, 73, 109, 146, 182, 219, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q10[10] {
0, 255, 28, 227, 56, 199, 84, 171, 113, 142
};
static const uint8_t color_scrambled_pquant_to_uquant_q12[12] {
0, 255, 69, 186, 23, 232, 92, 163, 46, 209, 116, 139
};
static const uint8_t color_scrambled_pquant_to_uquant_q16[16] {
0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q20[20] {
0, 255, 67, 188, 13, 242, 80, 175, 27, 228, 94, 161, 40, 215, 107, 148,
54, 201, 121, 134
};
static const uint8_t color_scrambled_pquant_to_uquant_q24[24] {
0, 255, 33, 222, 66, 189, 99, 156, 11, 244, 44, 211, 77, 178, 110, 145,
22, 233, 55, 200, 88, 167, 121, 134
};
static const uint8_t color_scrambled_pquant_to_uquant_q32[32] {
0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123,
132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q40[40] {
0, 255, 32, 223, 65, 190, 97, 158, 6, 249, 39, 216, 71, 184, 104, 151,
13, 242, 45, 210, 78, 177, 110, 145, 19, 236, 52, 203, 84, 171, 117, 138,
26, 229, 58, 197, 91, 164, 123, 132
};
static const uint8_t color_scrambled_pquant_to_uquant_q48[48] {
0, 255, 16, 239, 32, 223, 48, 207, 65, 190, 81, 174, 97, 158, 113, 142,
5, 250, 21, 234, 38, 217, 54, 201, 70, 185, 86, 169, 103, 152, 119, 136,
11, 244, 27, 228, 43, 212, 59, 196, 76, 179, 92, 163, 108, 147, 124, 131
};
static const uint8_t color_scrambled_pquant_to_uquant_q64[64] {
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
65, 69, 73, 77, 81, 85, 89, 93, 97, 101, 105, 109, 113, 117, 121, 125,
130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190,
195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255,
};
static const uint8_t color_scrambled_pquant_to_uquant_q80[80] {
0, 255, 16, 239, 32, 223, 48, 207, 64, 191, 80, 175, 96, 159, 112, 143,
3, 252, 19, 236, 35, 220, 51, 204, 67, 188, 83, 172, 100, 155, 116, 139,
6, 249, 22, 233, 38, 217, 54, 201, 71, 184, 87, 168, 103, 152, 119, 136,
9, 246, 25, 230, 42, 213, 58, 197, 74, 181, 90, 165, 106, 149, 122, 133,
13, 242, 29, 226, 45, 210, 61, 194, 77, 178, 93, 162, 109, 146, 125, 130
};
static const uint8_t color_scrambled_pquant_to_uquant_q96[96] {
0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
2, 253, 10, 245, 18, 237, 26, 229, 35, 220, 43, 212, 51, 204, 59, 196,
67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
5, 250, 13, 242, 21, 234, 29, 226, 37, 218, 45, 210, 53, 202, 61, 194,
70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129
};
static const uint8_t color_scrambled_pquant_to_uquant_q128[128] {
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255
};
static const uint8_t color_scrambled_pquant_to_uquant_q160[160] {
0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
1, 254, 9, 246, 17, 238, 25, 230, 33, 222, 41, 214, 49, 206, 57, 198,
65, 190, 73, 182, 81, 174, 89, 166, 97, 158, 105, 150, 113, 142, 121, 134,
3, 252, 11, 244, 19, 236, 27, 228, 35, 220, 43, 212, 51, 204, 59, 196,
67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
4, 251, 12, 243, 20, 235, 28, 227, 36, 219, 44, 211, 52, 203, 60, 195,
68, 187, 76, 179, 84, 171, 92, 163, 100, 155, 108, 147, 116, 139, 124, 131,
6, 249, 14, 241, 22, 233, 30, 225, 38, 217, 46, 209, 54, 201, 62, 193,
70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129
};
static const uint8_t color_scrambled_pquant_to_uquant_q192[192] {
0, 255, 4, 251, 8, 247, 12, 243, 16, 239, 20, 235, 24, 231, 28, 227,
32, 223, 36, 219, 40, 215, 44, 211, 48, 207, 52, 203, 56, 199, 60, 195,
64, 191, 68, 187, 72, 183, 76, 179, 80, 175, 84, 171, 88, 167, 92, 163,
96, 159, 100, 155, 104, 151, 108, 147, 112, 143, 116, 139, 120, 135, 124, 131,
1, 254, 5, 250, 9, 246, 13, 242, 17, 238, 21, 234, 25, 230, 29, 226,
33, 222, 37, 218, 41, 214, 45, 210, 49, 206, 53, 202, 57, 198, 61, 194,
65, 190, 69, 186, 73, 182, 77, 178, 81, 174, 85, 170, 89, 166, 93, 162,
97, 158, 101, 154, 105, 150, 109, 146, 113, 142, 117, 138, 121, 134, 125, 130,
2, 253, 6, 249, 10, 245, 14, 241, 18, 237, 22, 233, 26, 229, 30, 225,
34, 221, 38, 217, 42, 213, 46, 209, 50, 205, 54, 201, 58, 197, 62, 193,
66, 189, 70, 185, 74, 181, 78, 177, 82, 173, 86, 169, 90, 165, 94, 161,
98, 157, 102, 153, 106, 149, 110, 145, 114, 141, 118, 137, 122, 133, 126, 129
};
static const uint8_t color_scrambled_pquant_to_uquant_q256[256] {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
};
// Pointers to the per-level unpack tables defined above, listed from q6 up to q256.
// Users index this array with (quant level - QUANT_6), so entry 0 is the QUANT_6 table.
// NOTE(review): the entry order presumably has to match the quant_method enum order - confirm.
const uint8_t* color_scrambled_pquant_to_uquant_tables[17] {
color_scrambled_pquant_to_uquant_q6,
color_scrambled_pquant_to_uquant_q8,
color_scrambled_pquant_to_uquant_q10,
color_scrambled_pquant_to_uquant_q12,
color_scrambled_pquant_to_uquant_q16,
color_scrambled_pquant_to_uquant_q20,
color_scrambled_pquant_to_uquant_q24,
color_scrambled_pquant_to_uquant_q32,
color_scrambled_pquant_to_uquant_q40,
color_scrambled_pquant_to_uquant_q48,
color_scrambled_pquant_to_uquant_q64,
color_scrambled_pquant_to_uquant_q80,
color_scrambled_pquant_to_uquant_q96,
color_scrambled_pquant_to_uquant_q128,
color_scrambled_pquant_to_uquant_q160,
color_scrambled_pquant_to_uquant_q192,
color_scrambled_pquant_to_uquant_q256
};
// quant_mode_table[integer_count / 2][bits] gives the quantization level to use for a given
// number of color integers and the number of bits available to store them. An entry of -1 marks
// a combination for which no quantization level is available.
const int8_t quant_mode_table[10][128] {
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
},
{
-1, -1, 0, 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7,
8, 8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, 1, 1,
2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7,
8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13,
14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5,
5, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10,
10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14,
15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11,
12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6,
6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9,
9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13,
13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19,
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7,
8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10,
11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13,
14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19
},
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4,
4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,
6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9,
9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17
}
};
@@ -0,0 +1,544 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for converting between symbolic and physical encodings.
*/
#include "astcenc_internal.h"
#include <cassert>
/**
 * @brief Reverse the order of the low 8 bits of a value.
 *
 * Bits above bit 7 of the input are ignored, so the result is always in the range 0 to 255.
 *
 * @param p   The value to reverse.
 *
 * @return The reversed byte.
 */
static inline int bitrev8(int p)
{
	// Swap neighbouring bits, then neighbouring bit pairs, then the two nibbles
	int swapped_bits = ((p >> 1) & 0x55) | ((p & 0x55) << 1);
	int swapped_pairs = ((swapped_bits >> 2) & 0x33) | ((swapped_bits & 0x33) << 2);
	return ((swapped_pairs >> 4) & 0x0F) | ((swapped_pairs & 0x0F) << 4);
}
/**
 * @brief Read a small bit field starting at an arbitrary bit position.
 *
 * Whole bytes of @c bitoffset are skipped first. The field is then extracted from the 16-bit
 * little-endian window formed by the byte at that position and the byte following it, so the
 * field must fit inside that window: (bitoffset % 8) + bitcount must not exceed 16. Both bytes
 * of the window are always read, even if the field lies entirely inside the first one.
 *
 * @param bitcount    The number of bits to read.
 * @param bitoffset   The bit offset to read from, relative to @c ptr.
 * @param ptr         The data pointer to read from.
 *
 * @return The field value, held in the low @c bitcount bits.
 */
static inline int read_bits(
	int bitcount,
	int bitoffset,
	const uint8_t* ptr
) {
	// Locate the first byte holding the field, and the bit position inside it
	const uint8_t* base = ptr + (bitoffset >> 3);
	int shift = bitoffset & 7;

	// Assemble the two-byte window, then slide the field down to bit 0 and trim it
	int window = (base[1] << 8) | base[0];
	return (window >> shift) & ((1 << bitcount) - 1);
}
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
 * @brief Write a small bit field starting at an arbitrary bit position.
 *
 * Whole bytes of @c bitoffset are skipped first. The field is then merged into the two bytes at
 * that position, leaving all bits outside the field unchanged. The field must fit inside those
 * two bytes: (bitoffset % 8) + bitcount must not exceed 16. Both bytes are always read and
 * written back, even if the field lies entirely inside the first one.
 *
 * @param         value       The value to write; bits above @c bitcount are discarded.
 * @param         bitcount    The number of bits to write, starting from LSB.
 * @param         bitoffset   The bit offset to store at, relative to @c ptr.
 * @param[in,out] ptr         The data pointer to write to.
 */
static inline void write_bits(
	int value,
	int bitcount,
	int bitoffset,
	uint8_t* ptr
) {
	// Locate the first byte receiving the field, and the bit position inside it
	uint8_t* dst = ptr + (bitoffset >> 3);
	int shift = bitoffset & 7;

	// Field bits moved into place, and the mask of surrounding bits that must survive
	int field_mask = (1 << bitcount) - 1;
	int placed = (value & field_mask) << shift;
	int preserve = ~(field_mask << shift);

	// First byte of the two-byte window
	dst[0] &= preserve;
	dst[0] |= placed;

	// Second byte of the two-byte window
	dst[1] &= preserve >> 8;
	dst[1] |= placed >> 8;
}
/* See header for documentation. */
// Packs the symbolic block description scb into the 16 byte physical block pcb.
// Constant color blocks are emitted directly; all other blocks get weights, header fields,
// color endpoint formats, and color endpoint values written in turn.
void symbolic_to_physical(
const block_size_descriptor& bsd,
const symbolic_compressed_block& scb,
uint8_t pcb[16]
) {
// Error blocks have no physical encoding, so must never reach this function
assert(scb.block_type != SYM_BTYPE_ERROR);
// Constant color block using UNORM16 colors
if (scb.block_type == SYM_BTYPE_CONST_U16)
{
// There is currently no attempt to coalesce larger void-extents
// Header bytes: low 9 bits are 0x1FC, bit 9 is clear, and every remaining bit is set
static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++)
{
pcb[i] = cbytes[i];
}
// Color components are stored as 16-bit values, low byte first, from byte 8 onwards
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
}
return;
}
// Constant color block using FP16 colors
if (scb.block_type == SYM_BTYPE_CONST_F16)
{
// There is currently no attempt to coalesce larger void-extents
// Same header as the UNORM16 case, except that bit 9 is also set
static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
for (unsigned int i = 0; i < 8; i++)
{
pcb[i] = cbytes[i];
}
// Color components are stored as 16-bit values, low byte first, from byte 8 onwards
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
{
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
}
return;
}
unsigned int partition_count = scb.partition_count;
// Compress the weights.
// They are encoded as an ordinary integer-sequence, then bit-reversed
uint8_t weightbuf[16] { 0 };
const auto& bm = bsd.get_block_mode(scb.block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int weight_count = di.weight_count;
quant_method weight_quant_method = bm.get_weight_quant_mode();
float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method));
int is_dual_plane = bm.is_dual_plane;
const auto& qat = quant_and_xfer_tables[weight_quant_method];
// Dual-plane blocks store two weights per decimated weight position
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
uint8_t weights[64];
if (is_dual_plane)
{
// Interleave the planes: plane 1 weights at even indices, plane 2 weights at odd indices
for (int i = 0; i < weight_count; i++)
{
// Scale the stored weight by (levels - 1) / 64, round to nearest, then scramble
float uqw = static_cast<float>(scb.weights[i]);
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
int qwi = static_cast<int>(qw + 0.5f);
weights[2 * i] = qat.scramble_map[qwi];
uqw = static_cast<float>(scb.weights[i + WEIGHTS_PLANE2_OFFSET]);
qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
qwi = static_cast<int>(qw + 0.5f);
weights[2 * i + 1] = qat.scramble_map[qwi];
}
}
else
{
for (int i = 0; i < weight_count; i++)
{
// Scale the stored weight by (levels - 1) / 64, round to nearest, then scramble
float uqw = static_cast<float>(scb.weights[i]);
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
int qwi = static_cast<int>(qw + 0.5f);
weights[i] = qat.scramble_map[qwi];
}
}
encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
// Reversing both the byte order and the bits in each byte mirrors the whole 128-bit buffer, so
// bit 0 of the weight stream lands on bit 127 of the block and the stream grows downwards
for (int i = 0; i < 16; i++)
{
pcb[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
}
// Header: 11-bit block mode at bit 0, then (partition count - 1) in 2 bits at bit 11
write_bits(scb.block_mode, 11, 0, pcb);
write_bits(partition_count - 1, 2, 11, pcb);
// Position of the first bit directly below the weight data
int below_weights_pos = 128 - bits_for_weights;
// Encode partition index and color endpoint types for blocks with 2+ partitions
if (partition_count > 1)
{
// Partition index is written in two pieces: low 6 bits at bit 13, the rest at bit 19
write_bits(scb.partition_index, 6, 13, pcb);
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb);
if (scb.color_formats_matched)
{
// All partitions share one format; the two low bits of the 6-bit field are left as zero
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb);
}
else
{
// Check endpoint types for each partition to determine the lowest class present
int low_class = 4;
for (unsigned int i = 0; i < partition_count; i++)
{
int class_of_format = scb.color_formats[i] >> 2;
low_class = astc::min(class_of_format, low_class);
}
// The base class is capped at 2, as a class may be encoded as the base or the base plus one
if (low_class == 3)
{
low_class = 2;
}
// Bits 0-1 hold (base class + 1), which is never zero on this path
int encoded_type = low_class + 1;
int bitpos = 2;
// One bit per partition: set if the partition's class is the base class plus one
for (unsigned int i = 0; i < partition_count; i++)
{
int classbit_of_format = (scb.color_formats[i] >> 2) - low_class;
encoded_type |= classbit_of_format << bitpos;
bitpos++;
}
// Two bits per partition: the low two bits of the partition's format
for (unsigned int i = 0; i < partition_count; i++)
{
int lowbits_of_format = scb.color_formats[i] & 3;
encoded_type |= lowbits_of_format << bitpos;
bitpos += 2;
}
// The field is (3 * partition_count) + 2 bits in total; the low 6 bits sit after the
// partition index and the remaining bits are stored directly below the weights
int encoded_type_lowpart = encoded_type & 0x3F;
int encoded_type_highpart = encoded_type >> 6;
int encoded_type_highpart_size = (3 * partition_count) - 4;
int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb);
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb);
below_weights_pos -= encoded_type_highpart_size;
}
}
else
{
// Single partition: the 4-bit color format follows the partition count directly
write_bits(scb.color_formats[0], 4, 13, pcb);
}
// In dual-plane mode, encode the color component of the second plane of weights
if (is_dual_plane)
{
// Stored in the 2 bits directly below the weights and any extra color format bits
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb);
}
// Encode the color components
uint8_t values_to_encode[32];
int valuecount_to_encode = 0;
// NOTE(review): the pack table is selected with scb.quant_mode while the ISE call below uses
// scb.get_color_quant_mode(); presumably both name the same level - confirm.
const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
for (unsigned int i = 0; i < scb.partition_count; i++)
{
// The endpoint class (format >> 2) sets the integer count: 2, 4, 6, or 8 per partition
int vals = 2 * (scb.color_formats[i] >> 2) + 2;
assert(vals <= 8);
for (int j = 0; j < vals; j++)
{
values_to_encode[j + valuecount_to_encode] = pack_table[scb.color_values[i][j]];
}
valuecount_to_encode += vals;
}
// Color data starts at bit 17 for one partition, else after the 6-bit color format field
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb,
scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
}
#endif
/* See header for documentation. */
// Unpacks the 16 byte physical block pcb into the symbolic description scb.
// Any encoding that fails validation sets scb.block_type to SYM_BTYPE_ERROR and returns early;
// the remaining fields of scb are then only partially populated.
void physical_to_symbolic(
const block_size_descriptor& bsd,
const uint8_t pcb[16],
symbolic_compressed_block& scb
) {
// Holds a fully bit-reversed copy of the block, used for decoding the weights
uint8_t bswapped[16];
scb.block_type = SYM_BTYPE_NONCONST;
// Extract header fields
int block_mode = read_bits(11, 0, pcb);
// The low 9 bits of the block mode being 0x1FC marks a constant color block
if ((block_mode & 0x1FF) == 0x1FC)
{
// Constant color block
// Check what format the data has
// Bit 9 selects FP16 color data when set, and UNORM16 color data when clear
if (block_mode & 0x200)
{
scb.block_type = SYM_BTYPE_CONST_F16;
}
else
{
scb.block_type = SYM_BTYPE_CONST_U16;
}
scb.partition_count = 0;
// Four 16-bit color components, stored low byte first, from byte 8 onwards
for (int i = 0; i < 4; i++)
{
scb.constant_color[i] = pcb[2 * i + 8] | (pcb[2 * i + 9] << 8);
}
// Additionally, check that the void-extent coordinates are valid
if (bsd.zdim == 1)
{
// 2D void-extent
// The two bits at position 10 must both be set
int rsvbits = read_bits(2, 10, pcb);
if (rsvbits != 3)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
// Four 13-bit coordinates, packed back to back from bit 12
// Low values span 3 bytes so need two read_bits calls
int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
int vx_high_s = read_bits(13, 25, pcb);
int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
int vx_high_t = read_bits(13, 51, pcb);
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF &&
vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
// Valid if low < high on every axis, or if every coordinate is all ones
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
}
else
{
// 3D void-extent
// Six 9-bit coordinates, packed back to back from bit 10
int vx_low_s = read_bits(9, 10, pcb);
int vx_high_s = read_bits(9, 19, pcb);
int vx_low_t = read_bits(9, 28, pcb);
int vx_high_t = read_bits(9, 37, pcb);
int vx_low_r = read_bits(9, 46, pcb);
int vx_high_r = read_bits(9, 55, pcb);
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF &&
vx_low_t == 0x1FF && vx_high_t == 0x1FF &&
vx_low_r == 0x1FF && vx_high_r == 0x1FF;
// Valid if low < high on every axis, or if every coordinate is all ones
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_r >= vx_high_r) && !all_ones)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
}
return;
}
// Reject block modes that this block size descriptor marks as bad
unsigned int packed_index = bsd.block_mode_packed_index[block_mode];
if (packed_index == BLOCK_BAD_BLOCK_MODE)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
const auto& bm = bsd.get_block_mode(block_mode);
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
int weight_count = di.weight_count;
// NOTE(review): promise() is presumably a compiler optimization hint - confirm
promise(weight_count > 0);
quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
int is_dual_plane = bm.is_dual_plane;
// Dual-plane blocks store two weights per decimated weight position
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
// Partition count is stored as (count - 1) in the 2 bits at bit 11
int partition_count = read_bits(2, 11, pcb) + 1;
promise(partition_count > 0);
scb.block_mode = static_cast<uint16_t>(block_mode);
scb.partition_count = static_cast<uint8_t>(partition_count);
// Reversing both the byte order and the bits in each byte mirrors the whole 128-bit block, so
// the weight stream that ends at bit 127 of the block starts at bit 0 of bswapped
for (int i = 0; i < 16; i++)
{
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb[15 - i]));
}
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
// Position of the first bit directly below the weight data
int below_weights_pos = 128 - bits_for_weights;
uint8_t indices[64];
const auto& qat = quant_and_xfer_tables[weight_quant_method];
decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
if (is_dual_plane)
{
// Weights are interleaved: plane 1 at even indices, plane 2 at odd indices
for (int i = 0; i < weight_count; i++)
{
scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]];
scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]];
}
}
else
{
for (int i = 0; i < weight_count; i++)
{
scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]];
}
}
// Dual-plane blocks with four partitions are rejected as invalid
if (is_dual_plane && partition_count == 4)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
scb.color_formats_matched = 0;
// Determine the format of each endpoint pair
int color_formats[BLOCK_MAX_PARTITIONS];
int encoded_type_highpart_size = 0;
if (partition_count == 1)
{
// Single partition: the 4-bit color format follows the partition count directly
color_formats[0] = read_bits(4, 13, pcb);
scb.partition_index = 0;
}
else
{
// The format field is (3 * partition_count) + 2 bits in total; the low 6 bits sit after the
// partition index and the remaining bits are stored directly below the weights
encoded_type_highpart_size = (3 * partition_count) - 4;
below_weights_pos -= encoded_type_highpart_size;
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb) |
(read_bits(encoded_type_highpart_size, below_weights_pos, pcb) << 6);
int baseclass = encoded_type & 0x3;
if (baseclass == 0)
{
// All partitions share the single format held in bits 2-5
for (int i = 0; i < partition_count; i++)
{
color_formats[i] = (encoded_type >> 2) & 0xF;
}
// No extra format bits are stored below the weights in this case, so undo the adjustment
below_weights_pos += encoded_type_highpart_size;
scb.color_formats_matched = 1;
encoded_type_highpart_size = 0;
}
else
{
int bitpos = 2;
// The stored value is (base class + 1)
baseclass--;
// One bit per partition: add one to the base class if set
for (int i = 0; i < partition_count; i++)
{
color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2;
bitpos++;
}
// Two bits per partition: the low two bits of the partition's format
for (int i = 0; i < partition_count; i++)
{
color_formats[i] |= (encoded_type >> bitpos) & 3;
bitpos += 2;
}
}
// The partition index is read as the 10 bits at bit 13
scb.partition_index = static_cast<uint16_t>(read_bits(10, 13, pcb));
}
for (int i = 0; i < partition_count; i++)
{
scb.color_formats[i] = static_cast<uint8_t>(color_formats[i]);
}
// Determine number of color endpoint integers
int color_integer_count = 0;
for (int i = 0; i < partition_count; i++)
{
// The endpoint class (format >> 2) sets the integer count: 2, 4, 6, or 8 per partition
int endpoint_class = color_formats[i] >> 2;
color_integer_count += (endpoint_class + 1) * 2;
}
// More than 18 integers is invalid; this also keeps the row index used below (count / 2)
// inside the 10 rows of quant_mode_table
if (color_integer_count > 18)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
// Determine the color endpoint format to use
// Starting bit budget indexed by partition count (entry 0 is an unused placeholder), before the
// weights, the extra color format bits, and the dual-plane selector are subtracted
static const int color_bits_arr[5] { -1, 115 - 4, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS };
int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size;
if (is_dual_plane)
{
color_bits -= 2;
}
// Clamp so that the table lookup below never uses a negative column index
if (color_bits < 0)
{
color_bits = 0;
}
int color_quant_level = quant_mode_table[color_integer_count >> 1][color_bits];
// Levels below QUANT_6, including the -1 table entries, are not valid for color data
if (color_quant_level < QUANT_6)
{
scb.block_type = SYM_BTYPE_ERROR;
return;
}
// Unpack the integer color values and assign to endpoints
scb.quant_mode = static_cast<quant_method>(color_quant_level);
uint8_t values_to_decode[32];
// Color data starts at bit 17 for one partition, else after the 6-bit color format field
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb,
values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
int valuecount_to_decode = 0;
const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
for (int i = 0; i < partition_count; i++)
{
int vals = 2 * (color_formats[i] >> 2) + 2;
for (int j = 0; j < vals; j++)
{
scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
}
valuecount_to_decode += vals;
}
// Fetch component for second-plane in the case of dual plane of weights.
// A value of -1 is stored when the block is not dual-plane
scb.plane2_component = -1;
if (is_dual_plane)
{
// Stored in the 2 bits directly below the weights and any extra color format bits
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb));
}
}
+608
View File
@@ -0,0 +1,608 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2019-2025 Arm Limited
// Copyright 2008 Jose Fonseca
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/*
* This module implements vector support for floats, ints, and vector lane
* control masks. It provides access to both explicit vector width types, and
* flexible N-wide types where N can be determined at compile time.
*
* The design of this module encourages use of vector length agnostic code, via
* the vint, vfloat, and vmask types. These will take on the widest SIMD vector
* width that is available at compile time. The current vector width is
* accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
*
* Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
* These are provided primarily for prototyping and algorithm debug of VLA
* implementations.
*
* Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
* types. These are provided for use by VLA code, but are also expected to be
* used as a fixed-width type and will support a reference C++ fallback for
* use on platforms without SIMD intrinsics.
*
* Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
* types. These are provided for use by VLA code, and are not expected to be
* used as a fixed-width type in normal code. No reference C implementation is
* provided on platforms without underlying SIMD intrinsics.
*
* With the current implementation ISA support is provided for:
*
* * 1-wide for scalar reference
* * 4-wide for Armv8-A NEON
* * 4-wide for x86-64 SSE2
* * 4-wide for x86-64 SSE4.1
* * 8-wide for Armv8-A SVE
* * 8-wide for x86-64 AVX2
*/
#ifndef ASTC_VECMATHLIB_H_INCLUDED
#define ASTC_VECMATHLIB_H_INCLUDED
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
#include <immintrin.h>
#endif
#if ASTCENC_SVE != 0
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>
#endif
#if ASTCENC_NEON != 0
#include <arm_neon.h>
#endif
// Compiler-specific function attributes:
//   ASTCENC_SIMD_INLINE requests forced inlining of a function.
//   ASTCENC_NO_INLINE requests that a function is not inlined.
#if !defined(__clang__) && defined(_MSC_VER)
// MSVC without a Clang front-end; ASTCENC_NO_INLINE expands to nothing here
#define ASTCENC_SIMD_INLINE __forceinline
#define ASTCENC_NO_INLINE
#elif defined(__GNUC__) && !defined(__clang__)
// Compilers that define __GNUC__, other than Clang
#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#else
// Everything else, which includes Clang; nodebug is requested in addition to always_inline
#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
#endif
// Declaration only; no definition is visible in this part of the header.
// NOTE(review): presumably specialized per vector type T in the ISA-specific headers included
// below, loading floats from base at the positions given by the byte-sized indices - confirm.
template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
// Select the vector implementation at compile time. Exactly one branch is taken, tested in this
// order: AVX2 (8-wide), SSE (4-wide), SVE (8-wide), NEON (4-wide), then the "none" 4-wide
// implementation. Every branch defines ASTCENC_SIMD_WIDTH and the same set of width-agnostic
// names: vfloat, vfloatacc, vint, vmask, vtable_16x8/32x8/64x8, loada, load1, vint_from_size.
#if ASTCENC_AVX >= 2
// If we have AVX2 expose 8-wide VLA.
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"
#include "astcenc_vecmathlib_avx2_8.h"
#define ASTCENC_SIMD_WIDTH 8
using vfloat = vfloat8;
// The accumulator type stays 4-wide unless ASTCENC_NO_INVARIANCE is defined.
// NOTE(review): presumably so that accumulated sums match the 4-wide builds; see the
// reassociation note in the final #else branch - confirm.
#if defined(ASTCENC_NO_INVARIANCE)
using vfloatacc = vfloat8;
#else
using vfloatacc = vfloat4;
#endif
using vint = vint8;
using vmask = vmask8;
using vtable_16x8 = vtable8_16x8;
using vtable_32x8 = vtable8_32x8;
using vtable_64x8 = vtable8_64x8;
constexpr auto loada = vfloat8::loada;
constexpr auto load1 = vfloat8::load1;
constexpr auto vint_from_size = vint8_from_size;
#elif ASTCENC_SSE >= 20
// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
#include "astcenc_vecmathlib_sse_4.h"
#include "astcenc_vecmathlib_common_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;
using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;
#elif ASTCENC_SVE == 8
// Check the compiler is configured with fixed-length 256-bit SVE.
#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
#endif
// If we have SVE configured as 8-wide, expose 8-wide VLA.
// The 4-wide types used alongside it come from the NEON implementation.
#include "astcenc_vecmathlib_neon_4.h"
#include "astcenc_vecmathlib_common_4.h"
#include "astcenc_vecmathlib_sve_8.h"
#define ASTCENC_SIMD_WIDTH 8
using vfloat = vfloat8;
// Same accumulator width rule as the AVX2 branch: 4-wide unless ASTCENC_NO_INVARIANCE is defined
#if defined(ASTCENC_NO_INVARIANCE)
using vfloatacc = vfloat8;
#else
using vfloatacc = vfloat4;
#endif
using vint = vint8;
using vmask = vmask8;
using vtable_16x8 = vtable8_16x8;
using vtable_32x8 = vtable8_32x8;
using vtable_64x8 = vtable8_64x8;
constexpr auto loada = vfloat8::loada;
constexpr auto load1 = vfloat8::load1;
constexpr auto vint_from_size = vint8_from_size;
#elif ASTCENC_NEON > 0
// If we have NEON expose 4-wide VLA.
#include "astcenc_vecmathlib_neon_4.h"
#include "astcenc_vecmathlib_common_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;
using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;
#else
// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
// Note: We no longer expose the 1-wide scalar fallback because it is not
// invariant with the 4-wide path due to algorithms that use horizontal
// operations that accumulate a local vector sum before accumulating into
// a running sum.
//
// For 4 items adding into an accumulator using 1-wide vectors the sum is:
//
// result = ((((sum + l0) + l1) + l2) + l3)
//
// ... whereas the accumulator for a 4-wide vector sum is:
//
// result = sum + ((l0 + l2) + (l1 + l3))
//
// In "normal maths" this is the same, but the floating point reassociation
// differences mean that these will not produce the same result.
#include "astcenc_vecmathlib_none_4.h"
#include "astcenc_vecmathlib_common_4.h"
#define ASTCENC_SIMD_WIDTH 4
using vfloat = vfloat4;
using vfloatacc = vfloat4;
using vint = vint4;
using vmask = vmask4;
using vtable_16x8 = vtable4_16x8;
using vtable_32x8 = vtable4_32x8;
using vtable_64x8 = vtable4_64x8;
constexpr auto loada = vfloat4::loada;
constexpr auto load1 = vfloat4::load1;
constexpr auto vint_from_size = vint4_from_size;
#endif
/**
 * @brief Round a count down to the nearest multiple of the SIMD width.
 *
 * The result is the largest multiple of the SIMD width that does not exceed
 * @c count. This relies on the vector width being a power of two.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
{
	// Clearing the low bits only rounds down correctly for power-of-two widths
	size_t keep_mask = static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
	return count & keep_mask;
}
/**
 * @brief Round a count up to the nearest multiple of the SIMD width.
 *
 * The result is the smallest multiple of the SIMD width that is not less than
 * @c count.
 *
 * @param count   The unrounded value.
 *
 * @return The rounded value.
 */
ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
{
	// Bias by (width - 1) so the truncating division rounds upwards
	size_t biased = count + ASTCENC_SIMD_WIDTH - 1;
	return (biased / ASTCENC_SIMD_WIDTH) * ASTCENC_SIMD_WIDTH;
}
/**
 * @brief Flip the sign of each lane of @c a where the matching lane of @c b
 *        has its sign bit set.
 *
 * Implemented as a bitwise XOR of @c a with the sign bit of @c b.
 */
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
{
	// Isolate the sign bit of every lane in b
	vint b_sign = float_as_int(b) & vint(static_cast<int>(0x80000000));

	// XOR toggles the sign of a only in lanes where b is negative
	return int_as_float(float_as_int(a) ^ b_sign);
}
/**
 * @brief Return fast, but approximate, vector atan(x).
 *
 * Max error of this implementation is 0.004883.
 */
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
{
	// Lanes with |x| > 1 are evaluated on 1/x and reflected about +/- pi/2
	vmask use_reciprocal = abs(x) > vfloat(1.0f);
	vfloat signed_half_pi = change_sign(vfloat(astc::PI_OVER_TWO), x);

	vfloat arg = select(x, vfloat(1.0f) / x, use_reciprocal);

	// Rational approximation: arg / (1 + 0.28 * arg^2)
	vfloat approx = arg / (arg * arg * vfloat(0.28f) + vfloat(1.0f));

	return select(approx, signed_half_pi - approx, use_reciprocal);
}
/**
 * @brief Return fast, but approximate, vector atan2(y, x).
 *
 * Built on the approximate @c atan() of |y / x|, then corrected for the
 * sign of @c x and the sign of @c y.
 */
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
{
	vfloat base = atan(abs(y / x));

	// Negative x reflects the angle about pi / 2
	vmask x_negative = x < vfloat::zero();
	vfloat unsigned_angle = select(base, vfloat(astc::PI) - base, x_negative);

	// The result takes its sign from y
	return change_sign(unsigned_angle, y);
}
/**
 * @brief Factory that returns a unit length 4 component vfloat4.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit4()
{
	float val = 0.5f;
	return vfloat4(val);
}
/**
 * @brief Factory that returns a unit length 3 component vfloat4.
 *
 * The fourth lane is set to zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit3()
{
	// 1 / sqrt(3)
	float inv_sqrt3 = 0.577350258827209473f;
	vfloat4 result(inv_sqrt3, inv_sqrt3, inv_sqrt3, 0.0f);
	return result;
}
/**
 * @brief Factory that returns a unit length 2 component vfloat4.
 *
 * The third and fourth lanes are set to zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 unit2()
{
	// 1 / sqrt(2)
	float inv_sqrt2 = 0.707106769084930420f;
	vfloat4 result(inv_sqrt2, inv_sqrt2, 0.0f, 0.0f);
	return result;
}
/**
 * @brief Factory that returns a 3 component vfloat4.
 *
 * The fourth lane is set to zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
{
	vfloat4 result(a, b, c, 0.0f);
	return result;
}
/**
 * @brief Factory that returns a 2 component vfloat4.
 *
 * The third and fourth lanes are set to zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
{
	vfloat4 result(a, b, 0.0f, 0.0f);
	return result;
}
/**
 * @brief Normalize a non-zero length vector to unit length.
 *
 * No zero-length check is made; use @c normalize_safe() if the input may
 * have zero length.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
{
	vfloat4 length_sq = dot(a, a);
	vfloat4 length = sqrt(length_sq);
	return a / length;
}
/**
 * @brief Normalize a vector, returning @c safe if len is zero.
 */
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
{
	vfloat4 length_sq = dot(a, a);

	// A zero squared length cannot be normalized, so use the fallback
	if (length_sq.lane<0>() == 0.0f)
	{
		return safe;
	}

	return a / sqrt(length_sq);
}
// Polynomial evaluation helpers. POLYn(x, c0, ..., cn) expands to the degree-n
// polynomial c0 + c1*x + ... + cn*x^n written in nested (Horner) form, with
// each macro wrapping the next lower degree one.
//
// Note: the arguments are substituted textually without extra parentheses, so
// pass simple identifiers or literals rather than compound expressions.
#define POLY0(x, c0) ( c0)
#define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0)
#define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0)
#define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0)
#define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0)
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
/**
 * @brief Compute an approximate exp2(x) for each lane in the vector.
 *
 * The input is clamped, then split into an integer part, which is turned
 * into a power of two via the float exponent field, and a fractional part,
 * which is evaluated with a polynomial. The two contributions are multiplied.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
{
	vfloat4 clamped = clamp(-126.99999f, 129.0f, x);

	// Split into integer and fractional parts
	vint4 int_part = float_to_int(clamped - 0.5f);
	vfloat4 frac_part = clamped - int_to_float(int_part);

	// Integer contribution: place the biased exponent in the exponent field
	vfloat4 int_scale = int_as_float(lsl<23>(int_part + 127));

	// Fractional contribution: polynomial fit of 2^x in range [-0.5, 0.5)
	vfloat4 frac_scale = POLY5(frac_part,
	                           9.9999994e-1f,
	                           6.9315308e-1f,
	                           2.4015361e-1f,
	                           5.5826318e-2f,
	                           8.9893397e-3f,
	                           1.8775767e-3f);

	return int_scale * frac_scale;
}
/**
 * @brief Compute an approximate log2(x) for each lane in the vector.
 *
 * The float is split into its exponent and mantissa bit fields. The unbiased
 * exponent is used directly, and a polynomial approximates the log2 of the
 * mantissa.
 *
 * Based on 5th degree minimax polynomials, ported from this blog
 * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 */
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
{
	vint4 exponent_mask(0x7F800000);
	vint4 mantissa_mask(0x007FFFFF);
	vint4 one_bits(0x3F800000);

	vint4 bits = float_as_int(x);

	// Unbiased exponent, converted to float
	vfloat4 e = int_to_float(lsr<23>(bits & exponent_mask) - 127);

	// Mantissa bits combined with the exponent of 1.0f
	vfloat4 m = int_as_float((bits & mantissa_mask) | one_bits);

	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
	vfloat4 p = POLY4(m,
	                  2.8882704548164776201f,
	                  -2.52074962577807006663f,
	                  1.48116647521213171641f,
	                  -0.465725644288844778798f,
	                  0.0596515482674574969533f);

	// Increases the polynomial degree, but ensures that log2(1) == 0
	p = p * (m - 1.0f);

	return p + e;
}
/**
 * @brief Compute an approximate pow(x, y) for each lane in the vector.
 *
 * Power function based on the exp2(log2(x) * y) transform. Lanes where
 * @c y is zero return exactly 1.0f.
 */
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
{
	vfloat4 estimate = exp2(log2(x) * y);

	// Override the estimate so that y == 0 returns exactly 1.0f
	vmask4 y_is_zero = y == vfloat4(0.0f);
	return select(estimate, vfloat4(1.0f), y_is_zero);
}
/**
 * @brief Count the leading zeros for each lane in @c a.
 *
 * Valid for all data values of @c a; will return a per-lane value [0, 32].
 */
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
{
	// This function abuses floating point exponents: converting the integer
	// to a float gives a 2^N encoding from which the position of the top set
	// bit can be recovered.

	// Convert to float without risk of rounding up by keeping only top 8 bits.
	// This trick is guaranteed to keep top 8 bits and clear the 9th.
	vint4 top_bits = (~lsr<8>(a)) & a;
	vint4 float_bits = float_as_int(int_to_float(top_bits));

	// Extract and unbias exponent
	vint4 count = vint4(127 + 31) - lsr<23>(float_bits);

	// Clamp result to a valid 32-bit range
	return clamp(0, 32, count);
}
/**
 * @brief Return lanewise 2^a for each lane in @c a.
 *
 * Use of signed int means that this is only valid for values in range
 * [0, 30]; the upper bound is checked with an assert.
 */
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
{
	// 2^30 is the largest power of two a signed 32-bit lane can represent
	assert(all(a < vint4(31)));

	// This function abuses floating point: a float with biased exponent
	// (a + 127) and a zero mantissa has the value 2^a.
	vint4 biased_exp = lsl<23>(a + 127);

	// Reinterpret the bits as a float, and then convert to an int
	return float_to_int(int_as_float(biased_exp));
}
/**
* @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
*
* @param p The per-lane unorm16 input values.
*
* @return The per-lane float16 bit patterns, held in integer lanes.
*/
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
{
// Two special cases override the general path at the end of the function:
// an input of 0xFFFF returns 0x3C00, and inputs below 4 return (p << 8).
vint4 fp16_one = vint4(0x3C00);
vint4 fp16_small = lsl<8>(p);
vmask4 is_one = p == vint4(0xFFFF);
vmask4 is_small = p < vint4(4);
// Manually inline clz() on Visual Studio to avoid release build codegen bug
// see https://github.com/ARM-software/astc-encoder/issues/259
#if !defined(__clang__) && defined(_MSC_VER)
vint4 a = (~lsr<8>(p)) & p;
a = float_as_int(int_to_float(a));
a = vint4(127 + 31) - lsr<23>(a);
vint4 lz = clamp(0, 32, a) - 16;
#else
// Leading zero count relative to a 16-bit value
vint4 lz = clz(p) - 16;
#endif
// Multiply by 2^(lz + 1), keep the low 16 bits, then drop the bottom 6 bits
// to leave a 10-bit field (presumably the float16 mantissa).
p = p * two_to_the_n(lz + 1);
p = p & vint4(0xFFFF);
p = lsr<6>(p);
// Merge (14 - lz) into bit 10 and above (presumably the float16 exponent)
p = p | lsl<10>(vint4(14) - lz);
// Apply the special cases; is_small is applied last so it takes priority
vint4 r = select(p, fp16_one, is_one);
r = select(r, fp16_small, is_small);
return r;
}
/**
 * @brief Convert 16-bit LNS to float16.
 *
 * The low 11 bits of the input are remapped with a three-segment piecewise
 * linear function, then recombined with the upper bits. The result is
 * clamped to a maximum of 0x7BFF.
 */
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
{
	vint4 low_bits = p & 0x7FF;
	vint4 high_bits = lsr<11>(p);

	// Candidate values for the three segments of the piecewise mapping
	vint4 seg_low = low_bits * 3;
	vint4 seg_mid = low_bits * 4 - 512;
	vint4 seg_high = low_bits * 5 - 2048;

	// Pick the segment by range; the lowest range is applied last so it wins
	vint4 mapped = select(seg_high, seg_mid, low_bits < vint4(1536));
	mapped = select(mapped, seg_low, low_bits < vint4(512));

	vint4 packed = lsl<10>(high_bits) | lsr<3>(mapped);
	return min(packed, vint4(0x7BFF));
}
/**
 * @brief Extract mantissa and exponent of a float value.
 *
 * The returned mantissa keeps the sign and mantissa bits of the input, with
 * the exponent field replaced by that of 0.5f.
 *
 * @param      a     The input value.
 * @param[out] exp   The output exponent.
 *
 * @return The mantissa.
 */
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
{
	// Interpret the bits as an integer
	vint4 bits = float_as_int(a);

	// Extract the 8-bit exponent field and remove the bias
	exp = (lsr<23>(bits) & 0xFF) - 126;

	// Keep sign and mantissa bits, and force the exponent field to 0x3F0
	vint4 sign_and_mantissa = bits & static_cast<int>(0x807FFFFF);
	return int_as_float(sign_and_mantissa | 0x3F000000);
}
/**
* @brief Convert float to 16-bit LNS.
*
* @param a The per-lane float input values.
*
* @return The per-lane LNS values, stored as floats. Lanes where the input is
* at least 65536.0f return 65535.0f, and lanes where the input is not greater
* than 2^-26 (which the mask name indicates includes NaN) return zero.
*/
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
{
vint4 exp;
vfloat4 mant = frexp(a, exp);
// Do these early before we start messing about ...
vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
vmask4 mask_infinity = a >= vfloat4(65536.0f);
// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
vmask4 exp_lt_m13 = exp < vint4(-13);
// Small-input path: scaled value with a zero exponent
vfloat4 a1a = a * 33554432.0f;
vint4 expa = vint4::zero();
// Normal path: mantissa rescaled from [0.5, 1) to [0, 2048), exponent biased
vfloat4 a1b = (mant - 0.5f) * 4096;
vint4 expb = exp + 14;
a = select(a1b, a1a, exp_lt_m13);
exp = select(expb, expa, exp_lt_m13);
// Piecewise linear remap of the mantissa term, split at 384 and 1408.
// Note the upper boundary test is inclusive (<=) despite the "lt" name.
vmask4 a_lt_384 = a < vfloat4(384.0f);
vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
vfloat4 a2a = a * (4.0f / 3.0f);
vfloat4 a2b = a + 128.0f;
vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
a = a2c;
a = select(a, a2b, a_lt_1408);
a = select(a, a2a, a_lt_384);
// Combine with the exponent term (2048 per exponent step) plus one
a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
// Apply the range overrides computed from the original input
a = select(a, vfloat4(65535.0f), mask_infinity);
a = select(a, vfloat4::zero(), mask_underflow_nan);
return a;
}
namespace astc
{

/**
 * @brief Compute an approximate scalar pow(x, y) using the vector version.
 *
 * Both inputs are passed through the vfloat4 @c pow() and lane 0 of the
 * result is returned.
 */
static ASTCENC_SIMD_INLINE float pow(float x, float y)
{
	vfloat4 vx(x);
	vfloat4 vy(y);
	vfloat4 result = pow(vx, vy);
	return result.lane<0>();
}

}
#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,421 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2025 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Generic 4x32-bit vector functions.
*
* This module implements generic 4-wide vector functions that are valid for
* all instruction sets, typically implemented using lower level 4-wide
* operations that are ISA-specific.
*/
#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
#ifndef ASTCENC_SIMD_INLINE
#error "Include astcenc_vecmathlib.h, do not include directly"
#endif
#include <cstdio>
#include <limits>
// ============================================================================
// vint4 operators and functions
// ============================================================================
/**
 * @brief Overload: vector by scalar addition.
 *
 * The scalar is added to every lane.
 */
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
{
	vint4 rhs(b);
	return a + rhs;
}
/**
 * @brief Overload: vector by vector incremental addition.
 *
 * @return A reference to the updated @c a.
 */
ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
{
	return a = a + b;
}
/**
 * @brief Overload: vector by scalar subtraction.
 *
 * The scalar is subtracted from every lane.
 */
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
{
	vint4 rhs(b);
	return a - rhs;
}
/**
 * @brief Overload: vector by scalar multiplication.
 *
 * Every lane is multiplied by the scalar.
 */
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
{
	vint4 rhs(b);
	return a * rhs;
}
/**
 * @brief Overload: vector by scalar bitwise or.
 *
 * The scalar is ORed into every lane.
 */
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
{
	vint4 rhs(b);
	return a | rhs;
}
/**
 * @brief Overload: vector by scalar bitwise and.
 *
 * Every lane is ANDed with the scalar.
 */
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
{
	vint4 rhs(b);
	return a & rhs;
}
/**
 * @brief Overload: vector by scalar bitwise xor.
 *
 * Every lane is XORed with the scalar.
 */
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
{
	vint4 rhs(b);
	return a ^ rhs;
}
/**
 * @brief Return the clamped value between min and max.
 *
 * The lower bound is applied first, then the upper bound.
 */
ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
{
	vint4 lower(minv);
	vint4 upper(maxv);
	return min(max(a, lower), upper);
}
/**
 * @brief Return the horizontal sum of RGB vector lanes as a scalar.
 *
 * Only lanes 0, 1 and 2 contribute; lane 3 is ignored.
 */
ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
{
	int r = a.lane<0>();
	int g = a.lane<1>();
	int b = a.lane<2>();
	return r + g + b;
}
/**
 * @brief Return the horizontal minimum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
{
	vint4 reduced = hmin(a);
	return reduced.lane<0>();
}
/**
 * @brief Generate a vint4 from a size_t.
 *
 * The value must fit in an @c int; this is checked with an assert.
 */
ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
{
	assert(a <= std::numeric_limits<int>::max());
	int narrowed = static_cast<int>(a);
	return vint4(narrowed);
}
/**
 * @brief Return the horizontal maximum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
{
	vint4 reduced = hmax(a);
	return reduced.lane<0>();
}
// ============================================================================
// vfloat4 operators and functions
// ============================================================================
/**
 * @brief Overload: vector by vector incremental addition.
 *
 * @return A reference to the updated @c a.
 */
ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
{
	return a = a + b;
}
/**
 * @brief Overload: vector by scalar addition.
 *
 * The scalar is added to every lane.
 */
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
{
	vfloat4 rhs(b);
	return a + rhs;
}
/**
 * @brief Overload: vector by scalar subtraction.
 *
 * The scalar is subtracted from every lane.
 */
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
{
	vfloat4 rhs(b);
	return a - rhs;
}
/**
 * @brief Overload: vector by scalar multiplication.
 *
 * Every lane is multiplied by the scalar.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
{
	vfloat4 rhs(b);
	return a * rhs;
}
/**
 * @brief Overload: scalar by vector multiplication.
 *
 * The scalar is broadcast and used as the left-hand operand.
 */
ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
{
	vfloat4 lhs(a);
	return lhs * b;
}
/**
 * @brief Overload: vector by scalar division.
 *
 * Every lane is divided by the scalar.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
{
	vfloat4 divisor(b);
	return a / divisor;
}
/**
 * @brief Overload: scalar by vector division.
 *
 * The scalar is broadcast and divided by each lane of the vector.
 */
ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
{
	vfloat4 dividend(a);
	return dividend / b;
}
/**
 * @brief Return the min vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
{
	// Keep the scalar as the second operand to preserve the NaN behavior
	vfloat4 rhs(b);
	return min(a, rhs);
}
/**
 * @brief Return the max vector of a vector and a scalar.
 *
 * If either lane value is NaN, @c b will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
{
	// Keep the scalar as the second operand to preserve the NaN behavior
	vfloat4 rhs(b);
	return max(a, rhs);
}
/**
 * @brief Return the clamped value between min and max.
 *
 * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
 * then @c min will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
{
	// Do not reorder - second operand will return if either is NaN
	vfloat4 raised = max(a, minv);
	return min(raised, maxv);
}
/**
 * @brief Return the clamped value between 0.0f and 1.0f.
 *
 * If @c a is NaN then zero will be returned for that lane.
 */
ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
{
	// Do not reorder - second operand will return if either is NaN
	vfloat4 raised = max(a, vfloat4::zero());
	return min(raised, 1.0f);
}
/**
 * @brief Return the horizontal minimum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
{
	vfloat4 reduced = hmin(a);
	return reduced.lane<0>();
}
/**
 * @brief Return the horizontal min of RGB vector lanes as a scalar.
 *
 * Lane 3 is overwritten with a copy of lane 0 so that it cannot affect the
 * four-lane minimum.
 */
ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
{
	float red = a.lane<0>();
	a.set_lane<3>(red);
	return hmin_s(a);
}
/**
 * @brief Return the horizontal maximum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
{
	vfloat4 reduced = hmax(a);
	return reduced.lane<0>();
}
/**
 * @brief Accumulate lane-wise sums for a vector.
 *
 * Each lane of @c a is added to the matching lane of @c accum.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
{
	vfloat4 updated = accum + a;
	accum = updated;
}
/**
 * @brief Accumulate lane-wise sums for a masked vector.
 *
 * Lanes of @c a that are not selected by @c m contribute zero.
 */
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
{
	vfloat4 masked = select(vfloat4::zero(), a, m);
	haccumulate(accum, masked);
}
/**
 * @brief Return the horizontal sum of RGB vector lanes as a scalar.
 *
 * Only lanes 0, 1 and 2 contribute, and they are summed in that order.
 */
ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
{
	float r = a.lane<0>();
	float g = a.lane<1>();
	float b = a.lane<2>();

	// Keep left-to-right association so the rounding is unchanged
	return r + g + b;
}
#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
/**
 * @brief Return the dot product for the full 4 lanes, returning scalar.
 */
ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
{
	return hadd_s(a * b);
}
/**
 * @brief Return the dot product for the full 4 lanes, returning vector.
 *
 * The vector is constructed from the single scalar dot product value.
 */
ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
{
	float sum = hadd_s(a * b);
	return vfloat4(sum);
}
/**
 * @brief Return the dot product for the bottom 3 lanes, returning scalar.
 */
ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
{
	return hadd_rgb_s(a * b);
}
/**
 * @brief Return the dot product for the bottom 3 lanes, returning vector.
 *
 * The result is placed in lanes 0 to 2, and lane 3 is set to zero.
 */
ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
{
	float sum = hadd_rgb_s(a * b);
	vfloat4 result(sum, sum, sum, 0.0f);
	return result;
}
#endif
#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
/**
 * @brief Population bit count.
 *
 * Portable fallback that sums bits in parallel: first into 2-bit fields,
 * then 4-bit fields, then bytes, and finally folds the byte counts into the
 * top byte with a multiply.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
static inline int popcount(uint64_t v)
{
	uint64_t pair_mask = 0x5555555555555555ULL;
	uint64_t nibble_mask = 0x3333333333333333ULL;
	uint64_t byte_mask = 0x0F0F0F0F0F0F0F0FULL;

	// Count of set bits in each 2-bit field
	uint64_t counts = v - ((v >> 1) & pair_mask);

	// Count of set bits in each 4-bit field
	counts = (counts & nibble_mask) + ((counts >> 2) & nibble_mask);

	// Count of set bits in each byte
	counts = (counts + (counts >> 4)) & byte_mask;

	// Sum all byte counts into the most significant byte
	uint64_t total = (counts * 0x0101010101010101ULL) >> 56;
	return static_cast<int>(total);
}
#endif
/**
 * @brief Apply signed bit transfer.
 *
 * Bit 7 of @c input0 is moved into bit 7 of @c input1, with the remaining
 * bits of @c input1 shifted right by one. @c input0 is then shifted right by
 * one, reduced to 6 bits, and sign extended from bit 5. Both values are
 * updated in place.
 *
 * @param input0   The first encoded endpoint.
 * @param input1   The second encoded endpoint.
 */
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
	vint4& input0,
	vint4& input1
) {
	// Transfer the top bit of input0 into input1
	vint4 transferred = input0 & 0x80;
	input1 = lsr<1>(input1) | transferred;

	// Keep the next 6 bits of input0
	input0 = lsr<1>(input0) & 0x3F;

	// Sign extend from bit 5
	vmask4 is_negative = (input0 & 0x20) != vint4::zero();
	input0 = select(input0, input0 - 0x40, is_negative);
}
/**
 * @brief Debug function to print a vector of ints in decimal.
 */
ASTCENC_SIMD_INLINE void print(vint4 a)
{
	// Spill the lanes to an aligned array so they can be read as scalars
	ASTCENC_ALIGNAS int lanes[4];
	storea(a, lanes);

	printf("v4_i32:\n %8d %8d %8d %8d\n",
	       lanes[0], lanes[1], lanes[2], lanes[3]);
}
/**
 * @brief Debug function to print a vector of ints in hexadecimal.
 */
ASTCENC_SIMD_INLINE void printx(vint4 a)
{
	// Spill the lanes to an aligned array so they can be read as scalars
	ASTCENC_ALIGNAS int lanes[4];
	storea(a, lanes);

	// Copy the raw bits into unsigned values to match the %x format
	unsigned int raw[4];
	std::memcpy(raw, lanes, sizeof(int) * 4);

	printf("v4_i32:\n %08x %08x %08x %08x\n",
	       raw[0], raw[1], raw[2], raw[3]);
}
/**
 * @brief Debug function to print a vector of floats.
 */
ASTCENC_SIMD_INLINE void print(vfloat4 a)
{
	// Spill the lanes to an aligned array so they can be read as scalars
	ASTCENC_ALIGNAS float lanes[4];
	storea(a, lanes);

	// Varargs floats are passed as double, so convert explicitly
	double l0 = static_cast<double>(lanes[0]);
	double l1 = static_cast<double>(lanes[1]);
	double l2 = static_cast<double>(lanes[2]);
	double l3 = static_cast<double>(lanes[3]);

	printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
	       l0, l1, l2, l3);
}
/**
 * @brief Debug function to print a vector of masks.
 *
 * The mask is converted to an integer vector of 0 and 1 values via
 * @c select() and printed with the integer print function.
 */
ASTCENC_SIMD_INLINE void print(vmask4 a)
{
	vint4 as_ints = select(vint4(0), vint4(1), a);
	print(as_ints);
}
#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,496 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
#if !defined(ASTCENC_DECOMPRESS_ONLY)
/**
* @brief Functions for angular-sum algorithm for weight alignment.
*
* This algorithm works as follows:
* - we compute a complex number P as (cos s*i, sin s*i) for each weight,
* where i is the input value and s is a scaling factor based on the spacing between the weights.
* - we then add together complex numbers for all the weights.
* - we then compute the length and angle of the resulting sum.
*
* This should produce the following results:
* - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
* - even distribution results in a vector of length 0.
* - all samples identical results in perfect alignment for every scaling.
*
* For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
* should then result in some scalings standing out as having particularly good alignment factors;
* we can use this to produce a set of candidate scale/shift values for various quantization levels;
* we should then actually try them and see what happens.
*/
#include "astcenc_internal.h"
#include "astcenc_vecmathlib.h"
#include <stdio.h>
#include <cassert>
#include <cstring>
#include <cfloat>
// The number of angular step sizes that can be evaluated. This is the inner
// dimension of the sin/cos tables and the length of the per-step working
// arrays used by the functions in this file.
static constexpr unsigned int ANGULAR_STEPS { 32 };
static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
"ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
static_assert(ANGULAR_STEPS >= 32,
"ANGULAR_STEPS must be at least max(steps_for_quant_level)");
// Store a reduced sin/cos table for 64 possible weight values; this causes
// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
static constexpr unsigned int SINCOS_STEPS { 64 };
// The number of weight steps for each weight quantization level, indexed by
// quantization level.
static const uint8_t steps_for_quant_level[12] {
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
};
// Lookup tables indexed as [quantized weight sample][angular step index];
// populated by prepare_angular_tables().
ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
#if defined(ASTCENC_DIAGNOSTICS)
// Latch so the "unable to find full encoding" message is printed only once
static bool print_once { true };
#endif
/* See header for documentation. */
void prepare_angular_tables()
{
	// Angle between adjacent weight samples for a step multiplier of one; the
	// full circle is spread across the (SINCOS_STEPS - 1) sample intervals
	const auto sample_scale = 2.0f * astc::PI / (SINCOS_STEPS - 1.0f);

	for (unsigned int step = 0; step < ANGULAR_STEPS; step++)
	{
		// Table column N holds the values for a step multiplier of N + 1
		float angle_step = static_cast<float>(step + 1);

		for (unsigned int sample = 0; sample < SINCOS_STEPS; sample++)
		{
			const auto angle = sample_scale * angle_step * static_cast<float>(sample);
			sin_table[sample][step] = static_cast<float>(sinf(angle));
			cos_table[sample][step] = static_cast<float>(cosf(angle));
		}
	}
}
/**
* @brief Compute the angular alignment factors and offsets.
*
* Each weight is quantized to a sin/cos table index. For every angular step
* the table entries of all weights are summed, and the angle of the summed
* (cos, sin) vector is converted to a fraction of a full turn.
*
* Both loops process data in whole SIMD vectors, so @c dec_weight_ideal_value
* is read and @c offsets is written in multiples of ASTCENC_SIMD_WIDTH
* elements; both arrays must be sized and aligned accordingly.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_angular_steps The maximum number of steps to be tested.
* @param[out] offsets The output angular offsets array.
*/
static void compute_angular_offsets(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_angular_steps,
float* offsets
) {
promise(weight_count > 0);
promise(max_angular_steps > 0);
ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
// Precompute isample; arrays are always allocated 64 elements long
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
// Ideal weight can be outside [0, 1] range, so clamp to fit table
vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
// Convert a weight to a sincos table index
vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
vint isample = float_to_int_rtn(sample);
storea(isample, isamplev + i);
}
// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
// Scale factor that converts an angle in radians to a fraction of a turn
vfloat mult(1.0f / (2.0f * astc::PI));
for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
{
// Each lane accumulates the vector sum for one angular step
vfloat anglesum_x = vfloat::zero();
vfloat anglesum_y = vfloat::zero();
for (unsigned int j = 0; j < weight_count; j++)
{
int isample = isamplev[j];
anglesum_x += loada(cos_table[isample] + i);
anglesum_y += loada(sin_table[isample] + i);
}
// The angle of the summed vector gives the offset for this step
vfloat angle = atan2(anglesum_y, anglesum_x);
vfloat ofs = angle * mult;
storea(ofs, offsets + i);
}
}
/**
* @brief For a given step size compute the lowest and highest weight.
*
* Compute the lowest and highest weight that results from quantizing using the given stepsize and
* offset, and then compute the resulting error. The cut errors indicate the error that results from
* forcing samples that should have had one weight value one step up or down.
*
* Angular steps are processed one SIMD vector at a time, so all per-step arrays are read and
* written in multiples of ASTCENC_SIMD_WIDTH elements.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_angular_steps The maximum number of steps to be tested.
* @param max_quant_steps The maximum quantization level to be tested.
* @param offsets The angular offsets array.
* @param[out] lowest_weight Per angular step, the lowest weight.
* @param[out] weight_span Per angular step, the span between lowest and highest weight.
* @param[out] error Per angular step, the error.
* @param[out] cut_low_weight_error Per angular step, the low weight cut error.
* @param[out] cut_high_weight_error Per angular step, the high weight cut error.
*/
static void compute_lowest_and_highest_weight(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_angular_steps,
unsigned int max_quant_steps,
const float* offsets,
float* lowest_weight,
int* weight_span,
float* error,
float* cut_low_weight_error,
float* cut_high_weight_error
) {
promise(weight_count > 0);
promise(max_angular_steps > 0);
// Reciprocal step size per lane; lane N of the first vector holds N + 1,
// and the whole vector advances by the SIMD width on each outer iteration
vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
// Compute minimum/maximum weights in the weight array. Our remapping
// is monotonic, so the min/max rounded weights relate to the min/max
// unrounded weights in a straightforward way.
vfloat min_weight(FLT_MAX);
vfloat max_weight(-FLT_MAX);
vint lane_id = vint::lane_id();
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
{
// Mask off tail lanes at or beyond weight_count so they are ignored
vmask active = lane_id < vint(weight_count);
lane_id += vint(ASTCENC_SIMD_WIDTH);
vfloat weights = loada(dec_weight_ideal_value + i);
min_weight = min(min_weight, select(min_weight, weights, active));
max_weight = max(max_weight, select(max_weight, weights, active));
}
// Reduce the per-lane results to a single min and max across all lanes
min_weight = hmin(min_weight);
max_weight = hmax(max_weight);
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
{
vfloat errval = vfloat::zero();
vfloat cut_low_weight_err = vfloat::zero();
vfloat cut_high_weight_err = vfloat::zero();
vfloat offset = loada(offsets + sp);
// We know the min and max weight values, so we can figure out
// the corresponding indices before we enter the loop.
vfloat minidx = round(min_weight * rcp_stepsize - offset);
vfloat maxidx = round(max_weight * rcp_stepsize - offset);
for (unsigned int j = 0; j < weight_count; j++)
{
// Broadcast one weight and test it against every step size in the vector
vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
vfloat svalrte = round(sval);
vfloat diff = sval - svalrte;
errval += diff * diff;
// Accumulate errors for minimum index
vmask mask = svalrte == minidx;
vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
cut_low_weight_err = select(cut_low_weight_err, accum, mask);
// Accumulate errors for maximum index
mask = svalrte == maxidx;
accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
cut_high_weight_err = select(cut_high_weight_err, accum, mask);
}
// Write out min weight and weight span; clamp span to a usable range
vint span = float_to_int(maxidx - minidx + vfloat(1));
span = min(span, vint(max_quant_steps + 3));
span = max(span, vint(2));
storea(minidx, lowest_weight + sp);
storea(span, weight_span + sp);
// The cut_(lowest/highest)_weight_error indicate the error that results from forcing
// samples that should have had the weight value one step (up/down).
// Errors were accumulated in index units; scale by the squared step size
vfloat ssize = 1.0f / rcp_stepsize;
vfloat errscale = ssize * ssize;
storea(errval * errscale, error + sp);
storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
}
}
/**
* @brief The main function for the angular algorithm.
*
* Computes the angular offsets and per-step errors, selects the lowest error angular step for
* each possible weight span, and then converts the selection for each quantization level into a
* low and high weight value.
*
* @param weight_count The number of (decimated) weights.
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
* @param max_quant_level The maximum quantization level to be tested.
* @param[out] low_value Per quantization level, the lowest weight value.
* @param[out] high_value Per quantization level, the highest weight value.
*/
static void compute_angular_endpoints_for_quant_levels(
unsigned int weight_count,
const float* dec_weight_ideal_value,
unsigned int max_quant_level,
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
) {
// The number of angular steps tested matches the number of quant steps
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
compute_angular_offsets(weight_count, dec_weight_ideal_value,
max_angular_steps, angular_offsets);
ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
max_angular_steps, max_quant_steps,
angular_offsets, lowest_weight, weight_span, error,
cut_low_weight_error, cut_high_weight_error);
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
// branches can become selects. This involves some integer to float casts, but the values are
// small enough so they never round the wrong way.
// Indexed by weight span. Spans are clamped to [2, max_quant_steps + 3] when computed, and
// max_quant_steps is at most 32, so the indices used (span, span - 1, span - 2) are in [0, 35].
vfloat4 best_results[36];
// Initialize the array to some safe defaults
promise(max_quant_steps > 0);
for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
{
// Lane<0> = Best error
// Lane<1> = Best scale; -1 indicates no solution found
// Lane<2> = Cut low weight
best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
}
promise(max_angular_steps > 0);
for (unsigned int i = 0; i < max_angular_steps; i++)
{
float i_flt = static_cast<float>(i);
int idx_span = weight_span[i];
float error_cut_low = error[i] + cut_low_weight_error[i];
float error_cut_high = error[i] + cut_high_weight_error[i];
float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
// Each update below replaces the stored record only if the new error is strictly lower
// Check best error against record N
vfloat4 best_result = best_results[idx_span];
vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
best_results[idx_span] = select(best_result, new_result, mask);
// Check best error against record N-1 with either cut low or cut high
best_result = best_results[idx_span - 1];
new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
best_result = select(best_result, new_result, mask);
new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
best_results[idx_span - 1] = select(best_result, new_result, mask);
// Check best error against record N-2 with both cut low and high
best_result = best_results[idx_span - 2];
new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
best_results[idx_span - 2] = select(best_result, new_result, mask);
}
for (unsigned int i = 0; i <= max_quant_level; i++)
{
// Look up the best record for the span that matches this level's step count
unsigned int q = steps_for_quant_level[i];
int bsi = static_cast<int>(best_results[q].lane<1>());
// Did we find anything?
#if defined(ASTCENC_DIAGNOSTICS)
if ((bsi < 0) && print_once)
{
print_once = false;
printf("INFO: Unable to find full encoding within search error limit.\n\n");
}
#endif
// Fall back to angular step 0 if no solution was recorded
bsi = astc::max(0, bsi);
// Lane<2> is 1 when the low weight was cut, which raises the lowest index by one
float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
float hwi = lwi + static_cast<float>(q) - 1.0f;
// Convert indices back to weight values using the step size and offset
float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
low_value[i] = (angular_offsets[bsi] + lwi) * stepsize;
high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
}
}
/**
 * @brief Compute the angular endpoints for the single-plane block modes.
 *
 * Pass one walks the decimation modes and, for each one referenced at the requested weight
 * quantization, computes low/high values for every quant level up to a clamped precision.
 * Pass two copies the matching (decimation mode, quant mode) entry into the per-block-mode
 * arrays, falling back to the [0, 1] range for quant modes above the angular search limit.
 *
 * See header for the full parameter documentation.
 */
void compute_angular_endpoints_1plane(
	bool only_always,
	const block_size_descriptor& bsd,
	const float* dec_weight_ideal_value,
	unsigned int max_weight_quant,
	compression_working_buffers& tmpbuf
) {
	// Per-block-mode results
	float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
	float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;

	// Per-decimation-mode, per-quant-level intermediates
	float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
	float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;

	unsigned int max_decimation_modes = bsd.decimation_mode_count_selected;
	if (only_always)
	{
		max_decimation_modes = bsd.decimation_mode_count_always;
	}

	promise(max_decimation_modes > 0);
	for (unsigned int mode_index = 0; mode_index < max_decimation_modes; mode_index++)
	{
		const decimation_mode& dm = bsd.decimation_modes[mode_index];
		if (dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
		{
			unsigned int weight_count = bsd.get_decimation_info(mode_index).weight_count;

			// Clamp the precision to both the angular search limit and the caller's limit
			unsigned int max_precision = dm.maxprec_1plane;
			max_precision = (max_precision > TUNE_MAX_ANGULAR_QUANT) ? TUNE_MAX_ANGULAR_QUANT : max_precision;
			max_precision = (max_precision > max_weight_quant) ? max_weight_quant : max_precision;

			const float* ideal_weights = dec_weight_ideal_value + mode_index * BLOCK_MAX_WEIGHTS;
			compute_angular_endpoints_for_quant_levels(
			    weight_count, ideal_weights,
			    max_precision, low_values[mode_index], high_values[mode_index]);
		}
	}

	unsigned int max_block_modes = bsd.block_mode_count_1plane_selected;
	if (only_always)
	{
		max_block_modes = bsd.block_mode_count_1plane_always;
	}

	promise(max_block_modes > 0);
	for (unsigned int mode_index = 0; mode_index < max_block_modes; mode_index++)
	{
		const block_mode& bm = bsd.block_modes[mode_index];
		assert(!bm.is_dual_plane);

		unsigned int quant_mode = bm.quant_mode;
		unsigned int decim_mode = bm.decimation_mode;

		// Quant modes beyond the angular search limit just use the full weight range
		if (quant_mode > TUNE_MAX_ANGULAR_QUANT)
		{
			low_value[mode_index] = 0.0f;
			high_value[mode_index] = 1.0f;
			continue;
		}

		low_value[mode_index] = low_values[decim_mode][quant_mode];
		high_value[mode_index] = high_values[decim_mode][quant_mode];
	}
}
/**
 * @brief Compute the angular endpoints for the dual-plane block modes.
 *
 * For each decimation mode referenced at the requested weight quantization the search is run
 * twice, once per weight plane; the second plane's ideal weights live WEIGHTS_PLANE2_OFFSET
 * entries after the first plane's. The results are then scattered to the block modes in the
 * index range [block_mode_count_1plane_selected, block_mode_count_1plane_2plane_selected).
 *
 * See header for the full parameter documentation.
 */
void compute_angular_endpoints_2planes(
	const block_size_descriptor& bsd,
	const float* dec_weight_ideal_value,
	unsigned int max_weight_quant,
	compression_working_buffers& tmpbuf
) {
	// Per-block-mode results for each plane
	float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
	float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
	float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
	float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;

	// Per-decimation-mode, per-quant-level intermediates for each plane
	float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
	float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
	float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
	float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;

	promise(bsd.decimation_mode_count_selected > 0);
	for (unsigned int mode_index = 0; mode_index < bsd.decimation_mode_count_selected; mode_index++)
	{
		const decimation_mode& dm = bsd.decimation_modes[mode_index];
		if (dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
		{
			unsigned int weight_count = bsd.get_decimation_info(mode_index).weight_count;

			// Clamp the precision to both the angular search limit and the caller's limit
			unsigned int max_precision = dm.maxprec_2planes;
			max_precision = (max_precision > TUNE_MAX_ANGULAR_QUANT) ? TUNE_MAX_ANGULAR_QUANT : max_precision;
			max_precision = (max_precision > max_weight_quant) ? max_weight_quant : max_precision;

			const float* plane1_ideal = dec_weight_ideal_value + mode_index * BLOCK_MAX_WEIGHTS;
			const float* plane2_ideal = plane1_ideal + WEIGHTS_PLANE2_OFFSET;

			compute_angular_endpoints_for_quant_levels(
			    weight_count, plane1_ideal,
			    max_precision, low_values1[mode_index], high_values1[mode_index]);

			compute_angular_endpoints_for_quant_levels(
			    weight_count, plane2_ideal,
			    max_precision, low_values2[mode_index], high_values2[mode_index]);
		}
	}

	const unsigned int first_mode = bsd.block_mode_count_1plane_selected;
	const unsigned int last_mode = bsd.block_mode_count_1plane_2plane_selected;
	for (unsigned int mode_index = first_mode; mode_index < last_mode; mode_index++)
	{
		const block_mode& bm = bsd.block_modes[mode_index];
		unsigned int quant_mode = bm.quant_mode;
		unsigned int decim_mode = bm.decimation_mode;

		// Quant modes beyond the angular search limit just use the full weight range
		if (quant_mode > TUNE_MAX_ANGULAR_QUANT)
		{
			low_value1[mode_index] = 0.0f;
			high_value1[mode_index] = 1.0f;
			low_value2[mode_index] = 0.0f;
			high_value2[mode_index] = 1.0f;
			continue;
		}

		low_value1[mode_index] = low_values1[decim_mode][quant_mode];
		high_value1[mode_index] = high_values1[decim_mode][quant_mode];
		low_value2[mode_index] = low_values2[decim_mode][quant_mode];
		high_value2[mode_index] = high_values2[decim_mode][quant_mode];
	}
}
#endif
@@ -0,0 +1,147 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2021 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Data tables for quantization transfer.
*/
#include "astcenc_internal.h"
#define _ 0 // Using _ to indicate an entry that will not be used.
/**
 * @brief Quantization and transfer tables, one entry per weight quantization level.
 *
 * The 12 entries are ordered QUANT_2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32. Each entry is
 * initialized from four lists:
 *
 *  - List 1: the level's weight values in ascending order, spanning 0..64.
 *  - List 2: a permutation of the level's indices. It is the identity for the 2, 3, 4, 5, 8, 16
 *    and 32 step levels and a reordering for the others.
 *  - List 3: the same values as list 1, reordered so that list3[list2[i]] == list1[i].
 *  - List 4: 65 entries indexed by weight value 0..64. Only the indices that appear in list 1
 *    are populated; each populated entry packs the next-higher value of the level in its high
 *    byte and the next-lower value in its low byte, saturating at 64 and 0. All other indices
 *    are written as _ (zero) and are not expected to be read.
 *
 * NOTE(review): the member names and exact semantics of quant_and_transfer_table are declared
 * in another file; the description above is derived from the data only - confirm against the
 * struct definition.
 */
const quant_and_transfer_table quant_and_xfer_tables[12] {
// QUANT_2, range 0..1
{
{0, 64},
{0, 1},
{0, 64},
{0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
0x4000}
},
// QUANT_3, range 0..2
{
{0, 32, 64},
{0, 1, 2},
{0, 32, 64},
{0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,0x4020}
},
// QUANT_4, range 0..3
{
{0, 21, 43, 64},
{0, 1, 2, 3},
{0, 21, 43, 64},
{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,_,_,0x402b}
},
// QUANT_5, range 0..4
{
{0, 16, 32, 48, 64},
{0, 1, 2, 3, 4},
{0, 16, 32, 48, 64},
{0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_,
_,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
_,_,_,_,_,_,_,_,_,_,_,_,0x4030}
},
// QUANT_6, range 0..5
{
{0, 12, 25, 39, 52, 64},
{0, 2, 4, 5, 3, 1},
{0, 64, 12, 52, 25, 39},
{0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_,
0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
_,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
},
// QUANT_8, range 0..7
{
{0, 9, 18, 27, 37, 46, 55, 64},
{0, 1, 2, 3, 4, 5, 6, 7},
{0, 9, 18, 27, 37, 46, 55, 64},
{0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_,
_,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
},
// QUANT_10, range 0..9
{
{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
{0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_,
0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_,
_,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
_,0x4039}
},
// QUANT_12, range 0..11
{
{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
{0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_,
0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_,
0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
},
// QUANT_16, range 0..15
{
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
{0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_,
0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_,
_,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
_,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
},
// QUANT_20, range 0..19
{
{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
{0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_,
0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_,
0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_,
0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
},
// QUANT_24, range 0..23
{
{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
{0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08,
_,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_,
0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_,
0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d,
_,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
0x403b,_,0x403e}
},
// QUANT_32, range 0..31
{
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
{0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_,
0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_,
0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_,
0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_,
0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_,
0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_,
0x403c,_,0x403e}
}
};
+316
View File
@@ -0,0 +1,316 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2020-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Application entry point.
*
* This module contains the first command line entry point veneer, used to
* validate that the host extended ISA availability matches the tool build.
* It is compiled without any extended ISA support so it's guaranteed to be
* executable without any invalid instruction errors.
*/
#include <cstdio>
/**
* @brief The main veneer entry point.
*
* @param argc The number of arguments.
* @param argv The vector of arguments.
*
* @return 0 on success, non-zero otherwise.
*/
int astcenc_main_veneer(
int argc,
char **argv);
// x86-64 builds
#if (ASTCENC_SSE > 20) || (ASTCENC_AVX > 0) || \
(ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
/** Has detect_cpu_isa() finished populating the capability flags below? */
static bool g_init { false };
/** Does this CPU support SSE 4.1? Reads as false until detect_cpu_isa() has run. */
static bool g_cpu_has_sse41 { false };
/** Does this CPU support AVX2? Reads as false until detect_cpu_isa() has run. */
static bool g_cpu_has_avx2 { false };
/** Does this CPU support POPCNT? Reads as false until detect_cpu_isa() has run. */
static bool g_cpu_has_popcnt { false };
/** Does this CPU support F16C? Reads as false until detect_cpu_isa() has run. */
static bool g_cpu_has_f16c { false };
/* ============================================================================
Platform code for Visual Studio
============================================================================ */
#if !defined(__clang__) && defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <intrin.h>
/**
 * @brief Probe CPUID and cache which ISA extensions the host CPU offers.
 *
 * The capability flags are written first and the init flag is raised last, with a memory
 * barrier in between, so the init flag is never visible before the flags it guards.
 */
static void detect_cpu_isa()
{
	int regs[4];

	// Leaf 0 returns the highest supported standard leaf in EAX
	__cpuid(regs, 0);
	const int max_leaf = regs[0];

	if (max_leaf >= 1)
	{
		__cpuidex(regs, 1, 0);
		const int ecx = regs[2];

		// SSE41 = Bank 1, ECX, bit 19
		g_cpu_has_sse41 = (ecx & (1 << 19)) != 0;
		// POPCNT = Bank 1, ECX, bit 23
		g_cpu_has_popcnt = (ecx & (1 << 23)) != 0;
		// F16C = Bank 1, ECX, bit 29
		g_cpu_has_f16c = (ecx & (1 << 29)) != 0;
	}

	if (max_leaf >= 7)
	{
		__cpuidex(regs, 7, 0);
		const int ebx = regs[1];

		// AVX2 = Bank 7, EBX, bit 5
		g_cpu_has_avx2 = (ebx & (1 << 5)) != 0;
	}

	// Ensure state bits are updated before init flag is updated
	MemoryBarrier();
	g_init = true;
}
/* ============================================================================
Platform code for GCC and Clang
============================================================================ */
#else
#include <cpuid.h>
/**
 * @brief Probe CPUID and cache which ISA extensions the host CPU offers.
 *
 * The capability flags are written first and the init flag is raised last, with a full
 * barrier in between, so the init flag is never visible before the flags it guards.
 */
static void detect_cpu_isa()
{
	unsigned int eax = 0;
	unsigned int ebx = 0;
	unsigned int ecx = 0;
	unsigned int edx = 0;

	if (__get_cpuid_count(1, 0, &eax, &ebx, &ecx, &edx))
	{
		// SSE41 = Bank 1, ECX, bit 19
		g_cpu_has_sse41 = (ecx & (1 << 19)) != 0;
		// POPCNT = Bank 1, ECX, bit 23
		g_cpu_has_popcnt = (ecx & (1 << 23)) != 0;
		// F16C = Bank 1, ECX, bit 29
		g_cpu_has_f16c = (ecx & (1 << 29)) != 0;
	}

	// Leaf 7 may be unavailable, in which case AVX2 stays unsupported
	g_cpu_has_avx2 = false;
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
	{
		// AVX2 = Bank 7, EBX, bit 5
		g_cpu_has_avx2 = (ebx & (1 << 5)) != 0;
	}

	// Ensure state bits are updated before init flag is updated
	__sync_synchronize();
	g_init = true;
}
#endif
#if ASTCENC_POPCNT > 0
/**
 * @brief Query whether the host CPU implements the POPCNT extension.
 *
 * The first query of any ISA flag triggers the CPUID probe; later queries use the cached flag.
 *
 * @return @c true if supported, @c false if not.
 */
static bool cpu_supports_popcnt()
{
	if (g_init == false)
	{
		detect_cpu_isa();
	}

	return g_cpu_has_popcnt;
}
#endif
#if ASTCENC_F16C > 0
/**
 * @brief Query whether the host CPU implements the F16C extension.
 *
 * The first query of any ISA flag triggers the CPUID probe; later queries use the cached flag.
 *
 * @return @c true if supported, @c false if not.
 */
static bool cpu_supports_f16c()
{
	if (g_init == false)
	{
		detect_cpu_isa();
	}

	return g_cpu_has_f16c;
}
#endif
#if ASTCENC_SSE >= 41
/**
 * @brief Query whether the host CPU implements the SSE 4.1 extension.
 *
 * The first query of any ISA flag triggers the CPUID probe; later queries use the cached flag.
 *
 * @return @c true if supported, @c false if not.
 */
static bool cpu_supports_sse41()
{
	if (g_init == false)
	{
		detect_cpu_isa();
	}

	return g_cpu_has_sse41;
}
#endif
#if ASTCENC_AVX >= 2
/**
 * @brief Query whether the host CPU implements the AVX2 extension.
 *
 * The first query of any ISA flag triggers the CPUID probe; later queries use the cached flag.
 *
 * @return @c true if supported, @c false if not.
 */
static bool cpu_supports_avx2()
{
	if (g_init == false)
	{
		detect_cpu_isa();
	}

	return g_cpu_has_avx2;
}
#endif
/**
 * @brief Print a string to stderr.
 *
 * The text is emitted through a fixed "%s" format, so despite the parameter name it is not
 * interpreted as a printf-style format string; any '%' characters are printed literally.
 *
 * @param format   The message text to print verbatim.
 */
static inline void print_error(
const char* format
) {
fprintf(stderr, "%s", format);
}
/**
* @brief Validate CPU ISA support meets the requirements of this build of the library.
*
* Each library build is statically compiled for a particular set of CPU ISA features, such as the
* SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
* actually supports everything this build needs.
*
* @return Return @c true if validated, @c false otherwise.
*/
static bool validate_cpu_isa()
{
#if ASTCENC_AVX >= 2
if (!cpu_supports_avx2())
{
print_error("ERROR: Host does not support AVX2 ISA extension\n");
return false;
}
#endif
#if ASTCENC_F16C >= 1
if (!cpu_supports_f16c())
{
print_error("ERROR: Host does not support F16C ISA extension\n");
return false;
}
#endif
#if ASTCENC_SSE >= 41
if (!cpu_supports_sse41())
{
print_error("ERROR: Host does not support SSE4.1 ISA extension\n");
return false;
}
#endif
#if ASTCENC_POPCNT >= 1
if (!cpu_supports_popcnt())
{
print_error("ERROR: Host does not support POPCNT ISA extension\n");
return false;
}
#endif
return true;
}
// Validate Arm SVE availability
#elif ASTCENC_SVE != 0
#include <sys/auxv.h>
/**
 * @brief Query whether the host reports the SVE capability bit in its AT_HWCAP vector.
 *
 * @return @c true if supported, @c false if not.
 */
static bool cpu_supports_sve()
{
	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
}
/**
 * @brief Print a string to stderr.
 *
 * The text is emitted through a fixed "%s" format, so despite the parameter name it is not
 * interpreted as a printf-style format string; any '%' characters are printed literally.
 *
 * @param format   The message text to print verbatim.
 */
static inline void print_error(
const char* format
) {
fprintf(stderr, "%s", format);
}
/**
 * @brief Check that the host supports SVE at all.
 *
 * Only the presence of SVE is checked here. This veneer is compiled without SVE support, so
 * it cannot inspect the SVE vector width; that check happens later.
 *
 * @return @c true if SVE is available, @c false (after printing an error) otherwise.
 */
static bool validate_cpu_isa()
{
	const bool has_sve = cpu_supports_sve();
	if (has_sve)
	{
		return true;
	}

	print_error("ERROR: Host does not support SVE ISA extension\n");
	return false;
}
#else
/**
 * @brief Fallback for cases with no dynamic ISA availability.
 *
 * Selected when neither the x86-64 extended ISA checks nor the Arm SVE check are compiled in,
 * so there is nothing to probe at run time.
 *
 * @return Always @c true.
 */
static bool validate_cpu_isa()
{
return true;
}
#endif
/**
 * @brief Process entry point.
 *
 * Validates the host ISA before any code built with extended ISA support can run, then hands
 * over to the next veneer.
 *
 * @param argc   The number of arguments.
 * @param argv   The vector of arguments.
 *
 * @return 1 if the host fails ISA validation, otherwise the result of the next veneer.
 */
int main(
	int argc,
	char **argv
) {
	const bool host_ok = validate_cpu_isa();
	return host_ok ? astcenc_main_veneer(argc, argv) : 1;
}
+73
View File
@@ -0,0 +1,73 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Application entry point second veneer.
*
* This module contains the second command line entry point veneer, used to
* validate that Arm SVE vector width matches the tool build. When used, it is
* compiled with SVE ISA support but without any vector length override, so it
* will see the native SVE vector length exposed to the application.
*/
#include <cstdio>
#if ASTCENC_SVE != 0
#include <arm_sve.h>
#endif
/**
* @brief The main entry point.
*
* @param argc The number of arguments.
* @param argv The vector of arguments.
*
* @return 0 on success, non-zero otherwise.
*/
int astcenc_main(
int argc,
char **argv);
/**
 * @brief Print a formatted string to stderr.
 *
 * @param format   A printf-style format string; it must be trusted text, never user input.
 * @param args     The values consumed by the conversion specifiers in @c format.
 */
// The pack is named Args, not _Args: identifiers that begin with an underscore followed by an
// uppercase letter are reserved for the implementation.
template<typename ... Args>
static inline void print_error(
	const char* format,
	Args...args
) {
	fprintf(stderr, format, args...);
}
/**
 * @brief Second veneer entry point; checks the SVE vector width before starting the tool.
 *
 * For builds using a fixed SVE width above 128 bits, verifies that the host's native vector
 * length matches the build and fails with an error message if it does not.
 *
 * @param argc   The number of arguments.
 * @param argv   The vector of arguments.
 *
 * @return 1 on an SVE width mismatch, otherwise the result of astcenc_main().
 */
int astcenc_main_veneer(
	int argc,
	char **argv
) {
	// We don't need this check for 128-bit SVE, because that is compiled as
	// VLA code, using predicate masks in the augmented NEON.
#if ASTCENC_SVE > 4
	// svcntw() returns compile-time length if used with -msve-vector-bits
	if (svcntw() != ASTCENC_SVE)
	{
		// ASTCENC_SVE counts 32-bit lanes; unsigned so the argument matches the %u conversion
		unsigned int bits = ASTCENC_SVE * 32;
		print_error("ERROR: Host SVE support is not a %u-bit implementation\n", bits);
		return 1;
	}
#endif

	return astcenc_main(argc, argv);
}
@@ -0,0 +1,413 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for computing image error metrics.
*/
#include <cassert>
#include <cstdio>
#include "astcenccli_internal.h"
/**
 * @brief A four-channel error accumulator, kept in double precision.
 */
class error_accum4
{
public:
	/** @brief The running sum for the red channel. */
	double sum_r = 0.0;

	/** @brief The running sum for the green channel. */
	double sum_g = 0.0;

	/** @brief The running sum for the blue channel. */
	double sum_b = 0.0;

	/** @brief The running sum for the alpha channel. */
	double sum_a = 0.0;
};
/**
 * @brief Add a four-lane increment into an error accumulator.
 *
 * Lanes 0 to 3 of the increment are widened to double and added to the red, green, blue and
 * alpha sums respectively.
 *
 * @param val   The accumulator to increment
 * @param inc   The increment to apply
 *
 * @return The updated accumulator
 */
static error_accum4& operator+=(
	error_accum4 &val,
	vfloat4 inc
) {
	const double inc_r = static_cast<double>(inc.lane<0>());
	const double inc_g = static_cast<double>(inc.lane<1>());
	const double inc_b = static_cast<double>(inc.lane<2>());
	const double inc_a = static_cast<double>(inc.lane<3>());

	val.sum_r = val.sum_r + inc_r;
	val.sum_g = val.sum_g + inc_g;
	val.sum_b = val.sum_b + inc_b;
	val.sum_a = val.sum_a + inc_a;
	return val;
}
/**
 * @brief mPSNR tone-mapping operator for HDR images.
 *
 * Scales the value by 2^fstop, applies a 1/2.2 gamma curve, rescales to 255 and clamps.
 *
 * @param val     The color value to tone map
 * @param fstop   The exposure fstop; should be in range [-125, 125]
 *
 * @return The mapped color value in [0.0f, 255.0f] range
 */
static float mpsnr_operator(
	float val,
	int fstop
) {
	// Build the float 2^fstop by adding fstop to the exponent field of 1.0f. The shift is done
	// on an unsigned value because left-shifting a negative signed integer is undefined
	// behavior before C++20; modular unsigned arithmetic yields the same bit pattern.
	if32 p;
	p.u = 0x3f800000u + (static_cast<unsigned int>(fstop) << 23);  // 0x3f800000 is 1.0f

	val *= p.f;
	val = powf(val, (1.0f / 2.2f));
	val *= 255.0f;
	return astc::clamp(val, 0.0f, 255.0f);
}
/**
 * @brief Sum of squared tone-mapped differences between two values over a range of fstops.
 *
 * Each difference is taken as "val1 - val2" after both values have been tone mapped at the
 * same fstop.
 *
 * @param val1       The first color value
 * @param val2       The second color value
 * @param fstop_lo   The low exposure fstop; should be in range [-125, 125]
 * @param fstop_hi   The high exposure fstop; should be in range [-125, 125]
 *
 * @return The summed squared difference across all fstops in [fstop_lo, fstop_hi]
 */
static float mpsnr_sumdiff(
	float val1,
	float val2,
	int fstop_lo,
	int fstop_hi
) {
	float sq_total = 0.0f;

	int fstop = fstop_lo;
	while (fstop <= fstop_hi)
	{
		float mapped1 = mpsnr_operator(val1, fstop);
		float mapped2 = mpsnr_operator(val2, fstop);
		float delta = mapped1 - mapped2;
		sq_total += delta * delta;
		fstop++;
	}

	return sq_total;
}
/* See header for documentation */
void compute_error_metrics(
bool compute_hdr_metrics,
bool compute_normal_metrics,
int input_components,
const astcenc_image* img1,
const astcenc_image* img2,
int fstop_lo,
int fstop_hi
) {
static const int componentmasks[5] { 0x00, 0x07, 0x0C, 0x07, 0x0F };
int componentmask = componentmasks[input_components];
error_accum4 errorsum;
error_accum4 alpha_scaled_errorsum;
error_accum4 log_errorsum;
error_accum4 mpsnr_errorsum;
double mean_angular_errorsum = 0.0;
double worst_angular_errorsum = 0.0;
unsigned int dim_x = astc::min(img1->dim_x, img2->dim_x);
unsigned int dim_y = astc::min(img1->dim_y, img2->dim_y);
unsigned int dim_z = astc::min(img1->dim_z, img2->dim_z);
if (img1->dim_x != img2->dim_x ||
img1->dim_y != img2->dim_y ||
img1->dim_z != img2->dim_z)
{
printf("WARNING: Only intersection of images will be compared:\n"
" Image 1: %dx%dx%d\n"
" Image 2: %dx%dx%d\n",
img1->dim_x, img1->dim_y, img1->dim_z,
img2->dim_x, img2->dim_y, img2->dim_z);
}
double rgb_peak = 0.0;
unsigned int xsize1 = img1->dim_x;
unsigned int xsize2 = img2->dim_x;
for (unsigned int z = 0; z < dim_z; z++)
{
for (unsigned int y = 0; y < dim_y; y++)
{
for (unsigned int x = 0; x < dim_x; x++)
{
vfloat4 color1;
vfloat4 color2;
if (img1->data_type == ASTCENC_TYPE_U8)
{
uint8_t* data8 = static_cast<uint8_t*>(img1->data[z]);
color1 = vfloat4(
data8[(4 * xsize1 * y) + (4 * x )],
data8[(4 * xsize1 * y) + (4 * x + 1)],
data8[(4 * xsize1 * y) + (4 * x + 2)],
data8[(4 * xsize1 * y) + (4 * x + 3)]);
color1 = color1 / 255.0f;
}
else if (img1->data_type == ASTCENC_TYPE_F16)
{
uint16_t* data16 = static_cast<uint16_t*>(img1->data[z]);
vint4 color1i = vint4(
data16[(4 * xsize1 * y) + (4 * x )],
data16[(4 * xsize1 * y) + (4 * x + 1)],
data16[(4 * xsize1 * y) + (4 * x + 2)],
data16[(4 * xsize1 * y) + (4 * x + 3)]);
color1 = float16_to_float(color1i);
color1 = clamp(0, 65504.0f, color1);
}
else // if (img1->data_type == ASTCENC_TYPE_F32)
{
assert(img1->data_type == ASTCENC_TYPE_F32);
float* data32 = static_cast<float*>(img1->data[z]);
color1 = vfloat4(
data32[(4 * xsize1 * y) + (4 * x )],
data32[(4 * xsize1 * y) + (4 * x + 1)],
data32[(4 * xsize1 * y) + (4 * x + 2)],
data32[(4 * xsize1 * y) + (4 * x + 3)]);
color1 = clamp(0, 65504.0f, color1);
}
if (img2->data_type == ASTCENC_TYPE_U8)
{
uint8_t* data8 = static_cast<uint8_t*>(img2->data[z]);
color2 = vfloat4(
data8[(4 * xsize2 * y) + (4 * x )],
data8[(4 * xsize2 * y) + (4 * x + 1)],
data8[(4 * xsize2 * y) + (4 * x + 2)],
data8[(4 * xsize2 * y) + (4 * x + 3)]);
color2 = color2 / 255.0f;
}
else if (img2->data_type == ASTCENC_TYPE_F16)
{
uint16_t* data16 = static_cast<uint16_t*>(img2->data[z]);
vint4 color2i = vint4(
data16[(4 * xsize2 * y) + (4 * x )],
data16[(4 * xsize2 * y) + (4 * x + 1)],
data16[(4 * xsize2 * y) + (4 * x + 2)],
data16[(4 * xsize2 * y) + (4 * x + 3)]);
color2 = float16_to_float(color2i);
color2 = clamp(0, 65504.0f, color2);
}
else // if (img2->data_type == ASTCENC_TYPE_F32)
{
assert(img2->data_type == ASTCENC_TYPE_F32);
float* data32 = static_cast<float*>(img2->data[z]);
color2 = vfloat4(
data32[(4 * xsize2 * y) + (4 * x )],
data32[(4 * xsize2 * y) + (4 * x + 1)],
data32[(4 * xsize2 * y) + (4 * x + 2)],
data32[(4 * xsize2 * y) + (4 * x + 3)]);
color2 = clamp(0, 65504.0f, color2);
}
rgb_peak = astc::max(static_cast<double>(color1.lane<0>()),
static_cast<double>(color1.lane<1>()),
static_cast<double>(color1.lane<2>()),
rgb_peak);
vfloat4 diffcolor = color1 - color2;
vfloat4 diffcolor_sq = diffcolor * diffcolor;
errorsum += diffcolor_sq;
vfloat4 alpha_scaled_diffcolor = vfloat4(
diffcolor.lane<0>() * color1.lane<3>(),
diffcolor.lane<1>() * color1.lane<3>(),
diffcolor.lane<2>() * color1.lane<3>(),
diffcolor.lane<3>());
vfloat4 alpha_scaled_diffcolor_sq = alpha_scaled_diffcolor * alpha_scaled_diffcolor;
alpha_scaled_errorsum += alpha_scaled_diffcolor_sq;
if (compute_hdr_metrics)
{
vfloat4 log_input_color1 = log2(color1);
vfloat4 log_input_color2 = log2(color2);
vfloat4 log_diffcolor = log_input_color1 - log_input_color2;
log_errorsum += log_diffcolor * log_diffcolor;
vfloat4 mpsnr_error = vfloat4(
mpsnr_sumdiff(color1.lane<0>(), color2.lane<0>(), fstop_lo, fstop_hi),
mpsnr_sumdiff(color1.lane<1>(), color2.lane<1>(), fstop_lo, fstop_hi),
mpsnr_sumdiff(color1.lane<2>(), color2.lane<2>(), fstop_lo, fstop_hi),
mpsnr_sumdiff(color1.lane<3>(), color2.lane<3>(), fstop_lo, fstop_hi));
mpsnr_errorsum += mpsnr_error;
}
if (compute_normal_metrics)
{
// Decode the normal vector
vfloat4 normal1 = (color1 - 0.5f) * 2.0f;
normal1 = normalize_safe(normal1.swz<0, 1, 2>(), unit3());
vfloat4 normal2 = (color2 - 0.5f) * 2.0f;
normal2 = normalize_safe(normal2.swz<0, 1, 2>(), unit3());
// Float error can push this outside of valid range for acos, so clamp to avoid NaN issues
float normal_cos = clamp(-1.0f, 1.0f, dot3(normal1, normal2)).lane<0>();
float rad_to_degrees = 180.0f / astc::PI;
double error_degrees = std::acos(static_cast<double>(normal_cos)) * static_cast<double>(rad_to_degrees);
mean_angular_errorsum += error_degrees / (dim_x * dim_y * dim_z);
worst_angular_errorsum = astc::max(worst_angular_errorsum, error_degrees);
}
}
}
}
double pixels = static_cast<double>(dim_x * dim_y * dim_z);
double samples = 0.0;
double num = 0.0;
double alpha_num = 0.0;
double log_num = 0.0;
double mpsnr_num = 0.0;
if (componentmask & 1)
{
num += errorsum.sum_r;
alpha_num += alpha_scaled_errorsum.sum_r;
log_num += log_errorsum.sum_r;
mpsnr_num += mpsnr_errorsum.sum_r;
samples += pixels;
}
if (componentmask & 2)
{
num += errorsum.sum_g;
alpha_num += alpha_scaled_errorsum.sum_g;
log_num += log_errorsum.sum_g;
mpsnr_num += mpsnr_errorsum.sum_g;
samples += pixels;
}
if (componentmask & 4)
{
num += errorsum.sum_b;
alpha_num += alpha_scaled_errorsum.sum_b;
log_num += log_errorsum.sum_b;
mpsnr_num += mpsnr_errorsum.sum_b;
samples += pixels;
}
if (componentmask & 8)
{
num += errorsum.sum_a;
alpha_num += alpha_scaled_errorsum.sum_a;
samples += pixels;
}
double denom = samples;
double stopcount = static_cast<double>(fstop_hi - fstop_lo + 1);
double mpsnr_denom = pixels * 3.0 * stopcount * 255.0 * 255.0;
double psnr;
if (num == 0.0)
{
psnr = 999.0;
}
else
{
psnr = 10.0 * log10(denom / num);
}
double rgb_psnr = psnr;
printf("Quality metrics\n");
printf("===============\n\n");
if (componentmask & 8)
{
printf(" PSNR (LDR-RGBA): %9.4f dB\n", psnr);
double alpha_psnr;
if (alpha_num == 0.0)
{
alpha_psnr = 999.0;
}
else
{
alpha_psnr = 10.0 * log10(denom / alpha_num);
}
printf(" Alpha-weighted PSNR: %9.4f dB\n", alpha_psnr);
double rgb_num = errorsum.sum_r + errorsum.sum_g + errorsum.sum_b;
if (rgb_num == 0.0)
{
rgb_psnr = 999.0;
}
else
{
rgb_psnr = 10.0 * log10(pixels * 3.0 / rgb_num);
}
printf(" PSNR (LDR-RGB): %9.4f dB\n", rgb_psnr);
}
else
{
printf(" PSNR (LDR-RGB): %9.4f dB\n", psnr);
}
if (compute_hdr_metrics)
{
printf(" PSNR (RGB norm to peak): %9.4f dB (peak %f)\n",
rgb_psnr + 20.0 * log10(rgb_peak), rgb_peak);
double mpsnr;
if (mpsnr_num == 0.0)
{
mpsnr = 999.0;
}
else
{
mpsnr = 10.0 * log10(mpsnr_denom / mpsnr_num);
}
printf(" mPSNR (RGB): %9.4f dB (fstops %+d to %+d)\n",
mpsnr, fstop_lo, fstop_hi);
double logrmse = sqrt(log_num / pixels);
printf(" LogRMSE (RGB): %9.4f\n", logrmse);
}
if (compute_normal_metrics)
{
printf(" Mean Angular Error: %9.4f degrees\n", mean_angular_errorsum);
printf(" Worst Angular Error: %9.4f degrees\n", worst_angular_errorsum);
}
printf("\n");
}
+377
View File
@@ -0,0 +1,377 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for creating in-memory ASTC image structures.
*/
#include <cassert>
#include <cstring>
#include "astcenccli_internal.h"
/**
 * @brief Allocate an uncompressed image with one RGBA data slice per Z layer.
 *
 * The slice contents are left uninitialized. Release the image with free_image().
 *
 * See header for the full documentation.
 *
 * @param bitness   The component size in bits: 8 (U8), 16 (F16) or 32 (F32).
 * @param dim_x     The image width in texels.
 * @param dim_y     The image height in texels.
 * @param dim_z     The image depth in texels, which is also the number of slices.
 *
 * @return The newly allocated image.
 */
astcenc_image *alloc_image(
	unsigned int bitness,
	unsigned int dim_x,
	unsigned int dim_y,
	unsigned int dim_z
) {
	astcenc_image *img = new astcenc_image;
	img->dim_x = dim_x;
	img->dim_y = dim_y;
	img->dim_z = dim_z;

	void** data = new void*[dim_z];
	img->data = data;

	// Four components per texel. Widen before multiplying so the element count cannot wrap
	// 32-bit arithmetic and produce an undersized allocation.
	size_t slice_elems = static_cast<size_t>(dim_x) * dim_y * 4;

	if (bitness == 8)
	{
		img->data_type = ASTCENC_TYPE_U8;
		for (unsigned int z = 0; z < dim_z; z++)
		{
			data[z] = new uint8_t[slice_elems];
		}
	}
	else if (bitness == 16)
	{
		img->data_type = ASTCENC_TYPE_F16;
		for (unsigned int z = 0; z < dim_z; z++)
		{
			data[z] = new uint16_t[slice_elems];
		}
	}
	else // if (bitness == 32)
	{
		assert(bitness == 32);
		img->data_type = ASTCENC_TYPE_F32;
		for (unsigned int z = 0; z < dim_z; z++)
		{
			data[z] = new float[slice_elems];
		}
	}

	return img;
}
/**
 * @brief Free an image allocated by alloc_image().
 *
 * See header for the full documentation.
 *
 * @param img   The image to free; a null pointer is accepted and ignored.
 */
void free_image(astcenc_image * img)
{
	if (img == nullptr)
	{
		return;
	}

	for (unsigned int z = 0; z < img->dim_z; z++)
	{
		void* slice = img->data[z];

		// delete[] must be applied through the element type used by the matching new[] in
		// alloc_image(); deleting through an unrelated pointer type is undefined behavior.
		if (img->data_type == ASTCENC_TYPE_U8)
		{
			delete[] static_cast<uint8_t*>(slice);
		}
		else if (img->data_type == ASTCENC_TYPE_F16)
		{
			delete[] static_cast<uint16_t*>(slice);
		}
		else // if (img->data_type == ASTCENC_TYPE_F32)
		{
			assert(img->data_type == ASTCENC_TYPE_F32);
			delete[] static_cast<float*>(slice);
		}
	}

	delete[] img->data;
	delete img;
}
/* See header for documentation. */
int determine_image_components(const astcenc_image * img)
{
	const unsigned int width = img->dim_x;
	const unsigned int height = img->dim_y;
	const unsigned int depth = img->dim_z;

	// Walk every texel looking for evidence of color (R, G, B differ) and
	// of a non-opaque alpha value
	bool all_grey = true;
	bool any_transparency = false;

	if (img->data_type == ASTCENC_TYPE_U8)
	{
		for (unsigned int slice = 0; slice < depth; slice++)
		{
			const uint8_t* texels = static_cast<uint8_t*>(img->data[slice]);

			for (unsigned int row = 0; row < height; row++)
			{
				for (unsigned int col = 0; col < width; col++)
				{
					const unsigned int base = (4 * width * row) + (4 * col);
					const int red = texels[base];
					const int green = texels[base + 1];
					const int blue = texels[base + 2];
					const int alpha = texels[base + 3];

					if (red != green || red != blue)
					{
						all_grey = false;
					}

					if (alpha != 0xFF)
					{
						any_transparency = true;
					}
				}
			}
		}
	}
	else if (img->data_type == ASTCENC_TYPE_F16)
	{
		for (unsigned int slice = 0; slice < depth; slice++)
		{
			const uint16_t* texels = static_cast<uint16_t*>(img->data[slice]);

			for (unsigned int row = 0; row < height; row++)
			{
				for (unsigned int col = 0; col < width; col++)
				{
					const unsigned int base = (4 * width * row) + (4 * col);
					const int red = texels[base];
					const int green = texels[base + 1];
					const int blue = texels[base + 2];
					const int alpha = texels[base + 3];

					if (red != green || red != blue)
					{
						all_grey = false;
					}

					// 0x3C00 is the FP16 bit pattern for 1.0; any other
					// pattern counts as a non-opaque alpha
					if (alpha != 0x3C00)
					{
						any_transparency = true;
					}
				}
			}
		}
	}
	else // if (img->data_type == ASTCENC_TYPE_F32)
	{
		assert(img->data_type == ASTCENC_TYPE_F32);

		for (unsigned int slice = 0; slice < depth; slice++)
		{
			const float* texels = static_cast<float*>(img->data[slice]);

			for (unsigned int row = 0; row < height; row++)
			{
				for (unsigned int col = 0; col < width; col++)
				{
					const unsigned int base = (4 * width * row) + (4 * col);
					const float red = texels[base];
					const float green = texels[base + 1];
					const float blue = texels[base + 2];
					const float alpha = texels[base + 3];

					if (!((red == green) && (red == blue)))
					{
						all_grey = false;
					}

					if (alpha != 1.0f)
					{
						any_transparency = true;
					}
				}
			}
		}
	}

	// 1 = L, 2 = LA, 3 = RGB, 4 = RGBA
	int image_components = all_grey ? 1 : 3;
	if (any_transparency)
	{
		image_components++;
	}

	return image_components;
}
/* See header for documentation. */
astcenc_image* astc_img_from_floatx4_array(
	const float* data,
	unsigned int dim_x,
	unsigned int dim_y,
	bool y_flip
) {
	// Float input is stored as a single-slice FP16 image
	astcenc_image* img = alloc_image(16, dim_x, dim_y, 1);
	uint16_t* plane = static_cast<uint16_t*>(img->data[0]);

	for (unsigned int row = 0; row < dim_y; row++)
	{
		// Read from the mirrored row when flipping vertically
		const unsigned int src_row = y_flip ? (dim_y - row - 1) : row;
		const float* src_texels = data + 4 * dim_x * src_row;

		for (unsigned int col = 0; col < dim_x; col++)
		{
			const float* texel = src_texels + 4 * col;
			const vint4 halfs = float_to_float16(vfloat4(
				texel[0],
				texel[1],
				texel[2],
				texel[3]
			));

			// Each lane carries one FP16 bit pattern in its low 16 bits
			const unsigned int dst_base = (4 * dim_x * row) + (4 * col);
			plane[dst_base    ] = static_cast<uint16_t>(halfs.lane<0>());
			plane[dst_base + 1] = static_cast<uint16_t>(halfs.lane<1>());
			plane[dst_base + 2] = static_cast<uint16_t>(halfs.lane<2>());
			plane[dst_base + 3] = static_cast<uint16_t>(halfs.lane<3>());
		}
	}

	return img;
}
/* See header for documentation. */
astcenc_image* astc_img_from_unorm8x4_array(
	const uint8_t* data,
	unsigned int dim_x,
	unsigned int dim_y,
	bool y_flip
) {
	// Byte input is stored as a single-slice 8-bit image
	astcenc_image* img = alloc_image(8, dim_x, dim_y, 1);
	uint8_t* plane = static_cast<uint8_t*>(img->data[0]);

	for (unsigned int row = 0; row < dim_y; row++)
	{
		// Read from the mirrored row when flipping vertically
		const unsigned int src_row = y_flip ? (dim_y - row - 1) : row;
		const uint8_t* src_texels = data + 4 * dim_x * src_row;

		for (unsigned int col = 0; col < dim_x; col++)
		{
			const unsigned int dst_base = (4 * dim_x * row) + (4 * col);
			for (unsigned int chan = 0; chan < 4; chan++)
			{
				plane[dst_base + chan] = src_texels[4 * col + chan];
			}
		}
	}

	return img;
}
// initialize a flattened array of float values from an ASTC codec image
// The returned array is allocated with new[] and must be deleted with delete[].
/* See header for documentation. */
float* floatx4_array_from_astc_img(
const astcenc_image* img,
bool y_flip,
unsigned int z_index
) {
unsigned int dim_x = img->dim_x;
unsigned int dim_y = img->dim_y;
float *buf = new float[4 * dim_x * dim_y];
assert(z_index < img->dim_z);
if (img->data_type == ASTCENC_TYPE_U8)
{
uint8_t* data8 = static_cast<uint8_t*>(img->data[z_index]);
for (unsigned int y = 0; y < dim_y; y++)
{
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
float* dst = buf + y * dim_x * 4;
for (unsigned int x = 0; x < dim_x; x++)
{
dst[4 * x ] = data8[(4 * dim_x * ymod) + (4 * x )] * (1.0f / 255.0f);
dst[4 * x + 1] = data8[(4 * dim_x * ymod) + (4 * x + 1)] * (1.0f / 255.0f);
dst[4 * x + 2] = data8[(4 * dim_x * ymod) + (4 * x + 2)] * (1.0f / 255.0f);
dst[4 * x + 3] = data8[(4 * dim_x * ymod) + (4 * x + 3)] * (1.0f / 255.0f);
}
}
}
else if (img->data_type == ASTCENC_TYPE_F16)
{
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_index]);
for (unsigned int y = 0; y < dim_y; y++)
{
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
float *dst = buf + y * dim_x * 4;
for (unsigned int x = 0; x < dim_x; x++)
{
vint4 colori(
data16[(4 * dim_x * ymod) + (4 * x )],
data16[(4 * dim_x * ymod) + (4 * x + 1)],
data16[(4 * dim_x * ymod) + (4 * x + 2)],
data16[(4 * dim_x * ymod) + (4 * x + 3)]
);
vfloat4 color = float16_to_float(colori);
store(color, dst + 4 * x);
}
}
}
else // if (img->data_type == ASTCENC_TYPE_F32)
{
assert(img->data_type == ASTCENC_TYPE_F32);
float* data32 = static_cast<float*>(img->data[z_index]);
for (unsigned int y = 0; y < dim_y; y++)
{
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
float *dst = buf + y * dim_x * 4;
for (unsigned int x = 0; x < dim_x; x++)
{
dst[4 * x ] = data32[(4 * dim_x * ymod) + (4 * x )];
dst[4 * x + 1] = data32[(4 * dim_x * ymod) + (4 * x + 1)];
dst[4 * x + 2] = data32[(4 * dim_x * ymod) + (4 * x + 2)];
dst[4 * x + 3] = data32[(4 * dim_x * ymod) + (4 * x + 3)];
}
}
}
return buf;
}
/* See header for documentation. */
uint8_t* unorm8x4_array_from_astc_img(
const astcenc_image* img,
bool y_flip
) {
unsigned int dim_x = img->dim_x;
unsigned int dim_y = img->dim_y;
uint8_t* buf = new uint8_t[4 * dim_x * dim_y];
if (img->data_type == ASTCENC_TYPE_U8)
{
uint8_t* data8 = static_cast<uint8_t*>(img->data[0]);
for (unsigned int y = 0; y < dim_y; y++)
{
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
uint8_t* dst = buf + y * dim_x * 4;
for (unsigned int x = 0; x < dim_x; x++)
{
dst[4 * x ] = data8[(4 * dim_x * ymod) + (4 * x )];
dst[4 * x + 1] = data8[(4 * dim_x * ymod) + (4 * x + 1)];
dst[4 * x + 2] = data8[(4 * dim_x * ymod) + (4 * x + 2)];
dst[4 * x + 3] = data8[(4 * dim_x * ymod) + (4 * x + 3)];
}
}
}
else if (img->data_type == ASTCENC_TYPE_F16)
{
uint16_t* data16 = static_cast<uint16_t*>(img->data[0]);
for (unsigned int y = 0; y < dim_y; y++)
{
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
uint8_t* dst = buf + y * dim_x * 4;
for (unsigned int x = 0; x < dim_x; x++)
{
vint4 colori(
data16[(4 * dim_x * ymod) + (4 * x )],
data16[(4 * dim_x * ymod) + (4 * x + 1)],
data16[(4 * dim_x * ymod) + (4 * x + 2)],
data16[(4 * dim_x * ymod) + (4 * x + 3)]
);
vfloat4 color = float16_to_float(colori);
color = clamp(0.0f, 1.0f, color) * 255.0f;
colori = float_to_int_rtn(color);
pack_and_store_low_bytes(colori, dst + 4 * x);
}
}
}
else // if (img->data_type == ASTCENC_TYPE_F32)
{
assert(img->data_type == ASTCENC_TYPE_F32);
float* data32 = static_cast<float*>(img->data[0]);
for (unsigned int y = 0; y < dim_y; y++)
{
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
uint8_t* dst = buf + y * dim_x * 4;
for (unsigned int x = 0; x < dim_x; x++)
{
dst[4 * x ] = static_cast<uint8_t>(astc::flt2int_rtn(astc::clamp1f(data32[(4 * dim_x * ymod) + (4 * x )]) * 255.0f));
dst[4 * x + 1] = static_cast<uint8_t>(astc::flt2int_rtn(astc::clamp1f(data32[(4 * dim_x * ymod) + (4 * x + 1)]) * 255.0f));
dst[4 * x + 2] = static_cast<uint8_t>(astc::flt2int_rtn(astc::clamp1f(data32[(4 * dim_x * ymod) + (4 * x + 2)]) * 255.0f));
dst[4 * x + 3] = static_cast<uint8_t>(astc::flt2int_rtn(astc::clamp1f(data32[(4 * dim_x * ymod) + (4 * x + 3)]) * 255.0f));
}
}
}
return buf;
}
@@ -0,0 +1,174 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions for building the implementation of stb_image and tinyexr.
*/
#include <cstdlib>
#include <cstdio>
#include <fstream>
#include <vector>
#include "astcenccli_internal.h"
// Configure the STB image load and image write library builds.
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STBI_NO_GIF
#define STBI_NO_PIC
#define STBI_NO_PNM
#define STBI_NO_PNG
#define STBI_NO_PSD
// Configure the TinyEXR library build.
#define TINYEXR_IMPLEMENTATION
// Configure the Wuffs library build.
#define WUFFS_IMPLEMENTATION
#define WUFFS_CONFIG__MODULES
#define WUFFS_CONFIG__MODULE__ADLER32
#define WUFFS_CONFIG__MODULE__BASE
#define WUFFS_CONFIG__MODULE__CRC32
#define WUFFS_CONFIG__MODULE__DEFLATE
#define WUFFS_CONFIG__MODULE__PNG
#define WUFFS_CONFIG__MODULE__ZLIB
#include "wuffs-v0.3.c"
// For both libraries force asserts (which can be triggered by corrupt input
// images) to be handled at runtime in release builds to avoid security issues.
#define STBI_ASSERT(x) astcenc_runtime_assert(x)
#define TEXR_ASSERT(x) astcenc_runtime_assert(x)
/**
* @brief Trap image load failures and convert into a runtime error.
*/
static void astcenc_runtime_assert(bool condition)
{
	// Nothing to do while the library's invariant holds
	if (condition)
	{
		return;
	}

	// A failed invariant is reported as a corrupt image and ends the process
	print_error("ERROR: Corrupt input image\n");
	exit(1);
}
#include "ThirdParty/stb_image.h"
#include "ThirdParty/stb_image_write.h"
#include "ThirdParty/tinyexr.h"
/**
* @brief Load an image using Wuffs to provide the loader.
*
* @param filename The name of the file to load.
* @param y_flip Should the image be vertically flipped?
* @param[out] is_hdr Is this an HDR image load?
* @param[out] component_count The number of components in the data.
*
* @return The loaded image data in a canonical 4 channel format, or @c nullptr on error.
*/
astcenc_image* load_png_with_wuffs(
	const char* filename,
	bool y_flip,
	bool& is_hdr,
	unsigned int& component_count
) {
	// This loader always produces 8-bit RGBA data
	is_hdr = false;
	component_count = 4;

	// Open positioned at the end so the initial offset gives the file size
	std::ifstream file(filename, std::ios::binary | std::ios::ate);
	if (!file)
	{
		print_error("ERROR: Failed to load image %s (can't fopen)\n", filename);
		return nullptr;
	}

	// tellg() reports -1 on failure, which must not be used as a buffer size
	std::streamsize size = file.tellg();
	if (size < 0)
	{
		print_error("ERROR: Failed to load image %s (can't read)\n", filename);
		return nullptr;
	}

	file.seekg(0, std::ios::beg);

	std::vector<uint8_t> buffer(static_cast<size_t>(size));
	if (!file.read(reinterpret_cast<char*>(buffer.data()), size))
	{
		print_error("ERROR: Failed to load image %s (can't read)\n", filename);
		return nullptr;
	}

	wuffs_png__decoder *dec = wuffs_png__decoder__alloc();
	if (!dec)
	{
		return nullptr;
	}

	// Heap resources owned by this function; every exit path releases them
	uint8_t* workbuf = nullptr;
	uint8_t* pixbuf = nullptr;

	auto fail = [&]() -> astcenc_image*
	{
		free(pixbuf);
		free(workbuf);
		free(dec);
		return nullptr;
	};

	wuffs_base__image_config ic;
	wuffs_base__io_buffer src = wuffs_base__ptr_u8__reader(buffer.data(), buffer.size(), true);
	wuffs_base__status status = wuffs_png__decoder__decode_image_config(dec, &ic, &src);
	if (status.repr)
	{
		return fail();
	}

	uint32_t dim_x = wuffs_base__pixel_config__width(&ic.pixcfg);
	uint32_t dim_y = wuffs_base__pixel_config__height(&ic.pixcfg);

	// Multiply in 64 bits; the product of two 32-bit sizes cannot overflow it
	const uint64_t num_pixels64 = static_cast<uint64_t>(dim_x) * dim_y;

	// Reject images whose RGBA8 byte count does not fit in size_t, or does
	// not fit the 32-bit unsigned arithmetic used by the image helpers
	if (num_pixels64 > (SIZE_MAX / 4) || num_pixels64 > (UINT32_MAX / 4))
	{
		return fail();
	}

	const size_t num_pixels = static_cast<size_t>(num_pixels64);

	// Override the image's native pixel format to be RGBA_NONPREMUL
	wuffs_base__pixel_config__set(
	    &ic.pixcfg,
	    WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL,
	    WUFFS_BASE__PIXEL_SUBSAMPLING__NONE,
	    dim_x, dim_y);

	// Configure the work buffer; check the 64-bit length before narrowing
	const uint64_t workbuf_len64 = wuffs_png__decoder__workbuf_len(dec).max_incl;
	if (workbuf_len64 > SIZE_MAX)
	{
		return fail();
	}

	const size_t workbuf_len = static_cast<size_t>(workbuf_len64);
	workbuf = static_cast<uint8_t*>(malloc(workbuf_len));

	// malloc(0) may legitimately return a null pointer
	if (!workbuf && workbuf_len != 0)
	{
		return fail();
	}

	wuffs_base__slice_u8 workbuf_slice = wuffs_base__make_slice_u8(workbuf, workbuf_len);

	pixbuf = static_cast<uint8_t*>(malloc(num_pixels * 4));
	if (!pixbuf)
	{
		return fail();
	}

	wuffs_base__slice_u8 pixbuf_slice = wuffs_base__make_slice_u8(pixbuf, num_pixels * 4);

	wuffs_base__pixel_buffer pb;
	status = wuffs_base__pixel_buffer__set_from_slice(&pb, &ic.pixcfg, pixbuf_slice);
	if (status.repr)
	{
		return fail();
	}

	// Decode the pixels
	status = wuffs_png__decoder__decode_frame(dec, &pb, &src, WUFFS_BASE__PIXEL_BLEND__SRC, workbuf_slice, nullptr);
	if (status.repr)
	{
		return fail();
	}

	astcenc_image* img = astc_img_from_unorm8x4_array(pixbuf, dim_x, dim_y, y_flip);

	free(pixbuf);
	free(workbuf);
	free(dec);

	return img;
}
File diff suppressed because it is too large Load Diff
+422
View File
@@ -0,0 +1,422 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2023 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Functions and data declarations.
*/
#ifndef ASTCENCCLI_INTERNAL_INCLUDED
#define ASTCENCCLI_INTERNAL_INCLUDED
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include "astcenc.h"
#include "astcenc_mathlib.h"
/**
* @brief The payload stored in a compressed ASTC image.
*
* Plain aggregate with no constructor or destructor; the fields hold whatever
* the code that fills the structure writes into them.
*
* NOTE(review): ownership of @c data is not visible in this header; confirm
* which code path allocates and frees the buffer.
*/
struct astc_compressed_image
{
/** @brief The block width in texels. */
unsigned int block_x;
/** @brief The block height in texels. */
unsigned int block_y;
/** @brief The block depth in texels. */
unsigned int block_z;
/** @brief The image width in texels. */
unsigned int dim_x;
/** @brief The image height in texels. */
unsigned int dim_y;
/** @brief The image depth in texels. */
unsigned int dim_z;
/** @brief The binary data payload. */
uint8_t* data;
/** @brief The binary data length in bytes. */
size_t data_len;
};
/**
* @brief Config options that have been read from command line.
*
* Plain aggregate; this header defines no default values for any field.
*
* NOTE(review): the fstop fields presumably feed the @c fstop_lo and
* @c fstop_hi parameters of compute_error_metrics(), which are documented as
* HDR only; confirm against the command line handling code.
*/
struct cli_config_options
{
/** @brief The number of threads to use for processing. */
unsigned int thread_count;
/** @brief The number of repeats to execute for benchmarking. */
unsigned int repeat_count;
/** @brief The number of image slices to load for a 3D image. */
unsigned int array_size;
/** @brief @c true if running in silent mode with minimal output. */
bool silentmode;
/** @brief @c true if the images should be y-flipped. */
bool y_flip;
/** @brief @c true if diagnostic images should be stored. */
bool diagnostic_images;
/** @brief The low exposure fstop for error computation. */
int low_fstop;
/** @brief The high exposure fstop for error computation. */
int high_fstop;
/** @brief The pre-encode swizzle. */
astcenc_swizzle swz_encode;
/** @brief The post-decode swizzle. */
astcenc_swizzle swz_decode;
};
/**
* @brief Print a string to stderr.
*/
static inline void print_error(
	const char* format
) {
	// Emit the text verbatim; routing it through "%s" keeps any '%'
	// characters in the message from being interpreted as conversions
	std::fprintf(stderr, "%s", format);
}
/**
* @brief Print a formatted string to stderr.
*/
// Note: the parameter pack is named Args because identifiers that start with
// an underscore followed by an uppercase letter are reserved.
template<typename ... Args>
static inline void print_error(
	const char* format,
	Args... args
) {
	// The format string is used as-is, so it must be a trusted printf-style
	// format whose conversions match the supplied arguments
	fprintf(stderr, format, args...);
}
/**
* @brief Load uncompressed image.
*
* @param filename The file path on disk.
* @param y_flip Should this image be Y flipped?
* @param[out] is_hdr Is the loaded image HDR?
* @param[out] component_count The number of components in the loaded image.
*
* @return The astc image file, or nullptr on error.
*/
astcenc_image* load_ncimage(
const char* filename,
bool y_flip,
bool& is_hdr,
unsigned int& component_count);
/**
* @brief Load uncompressed PNG image.
*
* @param filename The file path on disk.
* @param y_flip Should this image be Y flipped?
* @param[out] is_hdr Is the loaded image HDR?
* @param[out] component_count The number of components in the loaded image.
*
* @return The astc image file, or nullptr on error.
*/
astcenc_image* load_png_with_wuffs(
const char* filename,
bool y_flip,
bool& is_hdr,
unsigned int& component_count);
/**
* @brief Save an uncompressed image.
*
* @param img The source data for the image.
* @param filename The name of the file to save.
* @param y_flip Should the image be vertically flipped?
*
* @return @c true if the image saved OK, @c false on error.
*/
bool store_ncimage(
const astcenc_image* img,
const char* filename,
int y_flip);
/**
* @brief Check if the output file type requires a specific bitness.
*
* @param filename The file name, containing the extension to check.
*
* @return Valid values are:
* * -1 - error - unknown file type.
* * 0 - no enforced bitness.
* * 8 - enforced 8-bit UNORM.
* * 16 - enforced 16-bit FP16.
*/
int get_output_filename_enforced_bitness(
const char* filename);
/**
* @brief Allocate a new image in a canonical format.
*
* Allocated images must be freed with a @c free_image() call.
*
* @param bitness The number of bits per component (8, 16, or 32).
* @param dim_x The width of the image, in texels.
* @param dim_y The height of the image, in texels.
* @param dim_z The depth of the image, in texels.
*
* @return The allocated image, or @c nullptr on error.
*/
astcenc_image* alloc_image(
unsigned int bitness,
unsigned int dim_x,
unsigned int dim_y,
unsigned int dim_z);
/**
* @brief Free an image.
*
* @param img The image to free.
*/
void free_image(
astcenc_image* img);
/**
* @brief Determine the number of active components in an image.
*
* @param img The image to analyze.
*
* @return The number of active components in the image.
*/
int determine_image_components(
const astcenc_image* img);
/**
* @brief Load a compressed .astc image.
*
* @param filename The file to load.
* @param img The image to populate with loaded data.
*
* @return Non-zero on error, zero on success.
*/
int load_cimage(
const char* filename,
astc_compressed_image& img);
/**
* @brief Store a compressed .astc image.
*
* @param img The image to store.
* @param filename The file to save.
*
* @return Non-zero on error, zero on success.
*/
int store_cimage(
const astc_compressed_image& img,
const char* filename);
/**
* @brief Load a compressed .ktx image.
*
* @param filename The file to load.
* @param is_srgb Is this an sRGB encoded file?
* @param img The image to populate with loaded data.
*
* @return Non-zero on error, zero on success.
*/
bool load_ktx_compressed_image(
const char* filename,
bool& is_srgb,
astc_compressed_image& img) ;
/**
* @brief Store a compressed .ktx image.
*
* @param img The image to store.
* @param filename The file to store.
* @param is_srgb Is this an sRGB encoded file?
*
* @return Non-zero on error, zero on success.
*/
bool store_ktx_compressed_image(
const astc_compressed_image& img,
const char* filename,
bool is_srgb);
/**
* @brief Create an image from a 2D float data array.
*
* @param data The raw input data.
* @param dim_x The width of the image, in texels.
* @param dim_y The height of the image, in texels.
* @param y_flip Should this image be vertically flipped?
*
* @return The populated image.
*/
astcenc_image* astc_img_from_floatx4_array(
const float* data,
unsigned int dim_x,
unsigned int dim_y,
bool y_flip);
/**
* @brief Create an image from a 2D byte data array.
*
* @param data The raw input data.
* @param dim_x The width of the image, in texels.
* @param dim_y The height of the image, in texels.
* @param y_flip Should this image be vertically flipped?
*
* @return The populated image.
*/
astcenc_image* astc_img_from_unorm8x4_array(
const uint8_t* data,
unsigned int dim_x,
unsigned int dim_y,
bool y_flip);
/**
* @brief Create a flattened RGBA FLOAT32 data array for a single slice from an image structure.
*
* The returned data array is allocated with @c new[] and must be freed with a @c delete[] call.
*
* @param img The input image.
* @param y_flip Should the data in the array be Y flipped?
* @param z_index The slice index to convert.
*
* @return The data array.
*/
float* floatx4_array_from_astc_img(
const astcenc_image* img,
bool y_flip,
unsigned int z_index);
/**
* @brief Create a flattened RGBA UNORM8 data array from an image structure.
*
* The returned data array is allocated with @c new[] and must be freed with a @c delete[] call.
*
* @param img The input image.
* @param y_flip Should the data in the array be Y flipped?
*
* @return The data array.
*/
uint8_t* unorm8x4_array_from_astc_img(
const astcenc_image* img,
bool y_flip);
/* ============================================================================
Functions for printing build info and help messages
============================================================================ */
/**
* @brief Print the tool copyright and version header to stdout.
*/
void astcenc_print_header();
/**
* @brief Print the tool copyright, version, and short-form help to stdout.
*/
void astcenc_print_shorthelp();
/**
* @brief Print the tool copyright, version, and long-form help to stdout.
*/
void astcenc_print_longhelp();
/**
* @brief Compute error metrics comparing two images.
*
* @param compute_hdr_metrics True if HDR metrics should be computed.
* @param compute_normal_metrics True if normal map metrics should be computed.
* @param input_components The number of input color components.
* @param img1 The original image.
* @param img2 The compressed image.
* @param fstop_lo The low exposure fstop (HDR only).
* @param fstop_hi The high exposure fstop (HDR only).
*/
void compute_error_metrics(
bool compute_hdr_metrics,
bool compute_normal_metrics,
int input_components,
const astcenc_image* img1,
const astcenc_image* img2,
int fstop_lo,
int fstop_hi);
/**
* @brief Get the current time.
*
* @return The current time in seconds since arbitrary epoch.
*/
double get_time();
/**
* @brief Get the number of CPU cores.
*
* @return The number of online or onlineable CPU cores in the system.
*/
int get_cpu_count();
/**
* @brief Launch N worker threads and wait for them to complete.
*
* All threads run the same thread function, and have the same thread payload, but are given a
* unique thread ID (0 .. N-1) as a parameter to the run function to allow thread-specific behavior.
*
* @param operation The name of the operation for this async task.
* @param thread_count The number of threads to spawn.
* @param func The function to execute. Must have the signature:
* void (int thread_count, int thread_id, void* payload)
* @param payload Pointer to an opaque thread payload object.
*/
void launch_threads(
const char* operation,
int thread_count,
void (*func)(int, int, void*),
void *payload);
/**
* @brief Set the current thread name to a string value.
*
* For portability strings should be no longer than 16 characters.
*
* @param name The thread name.
*/
void set_thread_name(
const char* name);
/**
* @brief The main entry point.
*
* @param argc The number of arguments.
* @param argv The vector of arguments.
*
* @return 0 on success, non-zero otherwise.
*/
int astcenc_main(
int argc,
char **argv);
#endif
@@ -0,0 +1,309 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------
/**
* @brief Platform-specific function implementations.
*
* This module contains functions with strongly OS-dependent implementations:
*
* * CPU count queries
* * Threading
* * Time
*
* In addition to the basic thread abstraction (which is native pthreads on
* all platforms, except Windows where it is an emulation of pthreads), a
* utility function to create N threads and wait for them to complete a batch
* task has also been provided.
*/
#include "astcenccli_internal.h"
/* ============================================================================
Platform code for Windows using the Win32 APIs.
============================================================================ */
#if defined(_WIN32) && !defined(__CYGWIN__)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <Processthreadsapi.h>
#include <algorithm>
#include <cstring>
/** @brief Alias pthread_t to one of the internal Windows types. */
typedef HANDLE pthread_t;
/** @brief Alias pthread_attr_t to one of the internal Windows types. */
typedef int pthread_attr_t;
/**
* @brief Proxy Windows @c CreateThread underneath a pthreads-like wrapper.
*/
static int pthread_create(
	pthread_t* thread,
	const pthread_attr_t* attribs,
	void* (*threadfunc)(void*),
	void* thread_arg
) {
	// Thread attributes are accepted for signature compatibility only
	static_cast<void>(attribs);

	*thread = CreateThread(
		nullptr,
		0,
		reinterpret_cast<LPTHREAD_START_ROUTINE>(threadfunc),
		thread_arg,
		0,
		nullptr);

	// Follow the pthreads convention: 0 on success, non-zero on error
	return (*thread == NULL) ? 1 : 0;
}
/**
* @brief Manually set CPU group and thread affinity.
*
* This is needed on Windows 10 or older to benefit from large core count
* systems with more than 64 logical CPUs. The assignment is skipped on systems
* with a single processor group, as it is not necessary.
*/
static void set_group_affinity(
	pthread_t thread,
	int thread_index
) {
	// Skip thread assignment for hardware with a single CPU group; a count of
	// zero means the query itself failed, so there is nothing to assign either
	int group_count = GetActiveProcessorGroupCount();
	if (group_count <= 1)
	{
		return;
	}

	// Guard the modulo below against a failed CPU count query
	int cpu_count = get_cpu_count();
	if (cpu_count <= 0)
	{
		return;
	}

	// Ensure we have a valid assign if user creates more threads than cores
	int assign_index = thread_index % cpu_count;

	int assign_group { 0 };
	int assign_group_cpu_count { 0 };

	// Determine which core group and core in the group to use for this thread
	int group_cpu_count_sum { 0 };
	for (int group = 0; group < group_count; group++)
	{
		int group_cpu_count = static_cast<int>(GetMaximumProcessorCount(static_cast<WORD>(group)));
		group_cpu_count_sum += group_cpu_count;

		if (assign_index < group_cpu_count_sum)
		{
			assign_group = group;
			assign_group_cpu_count = group_cpu_count;
			break;
		}
	}

	// No group was selected, so there is no valid mask to apply
	if (assign_group_cpu_count <= 0)
	{
		return;
	}

	// Build the mask in the full KAFFINITY width. Shifting a plain int is
	// undefined for 32 or more CPUs, and a shift by the full mask width is
	// undefined too, so a completely full group gets an all-ones mask.
	const int mask_bits = static_cast<int>(sizeof(KAFFINITY) * 8);
	KAFFINITY mask = ~static_cast<KAFFINITY>(0);
	if (assign_group_cpu_count < mask_bits)
	{
		mask = (static_cast<KAFFINITY>(1) << assign_group_cpu_count) - 1;
	}

	// Set the affinity to the assigned group, and all supported cores
	GROUP_AFFINITY affinity {};
	affinity.Mask = mask;
	affinity.Group = static_cast<WORD>(assign_group);
	SetThreadGroupAffinity(thread, &affinity, nullptr);
}
/**
* @brief Proxy Windows @c WaitForSingleObject underneath a pthreads-like wrapper.
*/
static int pthread_join(
	pthread_t thread,
	void** value
) {
	// The thread return value is not supported by this wrapper
	static_cast<void>(value);

	DWORD wait_result = WaitForSingleObject(thread, INFINITE);

	// Joining releases the thread's resources; without this the HANDLE
	// returned by CreateThread is leaked for every thread that is launched
	CloseHandle(thread);

	// Follow the pthreads convention: 0 on success, non-zero on error
	return (wait_result == WAIT_OBJECT_0) ? 0 : 1;
}
/* See header for documentation */
int get_cpu_count()
{
	// Count logical processors across every processor group
	DWORD cpu_count = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);

	// The query returns zero on failure; report a single core instead so
	// that callers never see an unusable count
	if (cpu_count == 0)
	{
		return 1;
	}

	return static_cast<int>(cpu_count);
}
/* See header for documentation */
double get_time()
{
FILETIME tv;
GetSystemTimePreciseAsFileTime(&tv);
unsigned long long ticks = tv.dwHighDateTime;
ticks = (ticks << 32) | tv.dwLowDateTime;
return static_cast<double>(ticks) / 1.0e7;
}
/* See header for documentation */
void set_thread_name(
	const char* name
) {
	// Fixed 16 element buffer, zero filled so the result is always terminated
	wchar_t wide_name [16] { 0 };

	// Keep at most 15 characters, leaving room for the terminator
	const size_t copy_len = std::min<size_t>(std::strlen(name), 15);

	// We know we only have basic 7-bit ASCII so just widen
	for (size_t pos = 0; pos < copy_len; pos++)
	{
		wide_name[pos] = static_cast<wchar_t>(name[pos]);
	}

	SetThreadDescription(GetCurrentThread(), wide_name);
}
/* ============================================================================
Platform code for a platform using POSIX APIs.
============================================================================ */
#else
#include <pthread.h>
#include <sys/time.h>
#include <unistd.h>
/* See header for documentation */
int get_cpu_count()
{
	// sysconf() returns -1 when the value cannot be determined; report a
	// single core instead so that callers never see a negative count
	long cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
	if (cpu_count < 1)
	{
		return 1;
	}

	return static_cast<int>(cpu_count);
}
/* See header for documentation */
double get_time()
{
	timeval now;
	gettimeofday(&now, nullptr);

	// Combine whole seconds with the microsecond remainder
	const double whole_seconds = static_cast<double>(now.tv_sec);
	const double micro_seconds = static_cast<double>(now.tv_usec);
	return whole_seconds + micro_seconds * 1.0e-6;
}
/* See header for documentation */
void set_thread_name(
	const char* name
) {
	// Thread naming is not standardized: macOS names only the calling thread,
	// Linux takes an explicit thread handle, and elsewhere this is a no-op
#if defined(__APPLE__)
	pthread_setname_np(name);
#elif defined(__linux__)
	pthread_setname_np(pthread_self(), name);
#else
	static_cast<void>(name);
#endif
}
#endif
/**
* @brief Worker thread helper payload for launch_threads.
*
* launch_threads() fills in one descriptor per worker and passes its address
* to launch_threads_helper() as the opaque thread argument, which then calls
* @c func with the stored count, index, and payload.
*/
struct launch_desc
{
/** @brief The native thread handle. */
pthread_t thread_handle;
/** @brief The total number of threads in the thread pool. */
int thread_count;
/** @brief The thread index in the thread pool. */
int thread_id;
/** @brief The user thread function to execute. */
void (*func)(int, int, void*);
/** @brief The user thread payload. */
void* payload;
};
/**
* @brief Helper function to translate thread entry points.
*
* Convert a (void*) thread entry to an (int, void*) thread entry, where the
* integer contains the thread ID in the thread pool.
*
* @param p The thread launch helper payload.
*/
static void* launch_threads_helper(
void *p
) {
launch_desc* ltd = reinterpret_cast<launch_desc*>(p);
ltd->func(ltd->thread_count, ltd->thread_id, ltd->payload);
return nullptr;
}
/* See header for documentation */
void launch_threads(
const char* operation,
int thread_count,
void (*func)(int, int, void*),
void *payload
) {
// Directly execute single threaded workloads on this thread
if (thread_count <= 1)
{
func(1, 0, payload);
return;
}
// Otherwise spawn worker threads
launch_desc *thread_descs = new launch_desc[thread_count];
int actual_thread_count { 0 };
for (int i = 0; i < thread_count; i++)
{
thread_descs[actual_thread_count].thread_count = thread_count;
thread_descs[actual_thread_count].thread_id = actual_thread_count;
thread_descs[actual_thread_count].payload = payload;
thread_descs[actual_thread_count].func = func;
// Handle pthread_create failing by simply using fewer threads
int error = pthread_create(
&(thread_descs[actual_thread_count].thread_handle),
nullptr,
launch_threads_helper,
reinterpret_cast<void*>(thread_descs + actual_thread_count));
// Track how many threads we actually created
if (!error)
{
// Windows needs explicit thread assignment to handle large core count systems
#if defined(_WIN32) && !defined(__CYGWIN__)
set_group_affinity(
thread_descs[actual_thread_count].thread_handle,
actual_thread_count);
#endif
actual_thread_count++;
}
}
// If we did not create thread_count threads then emit a warning
if (actual_thread_count != thread_count)
{
int log_count = actual_thread_count == 0 ? 1 : actual_thread_count;
const char* log_s = log_count == 1 ? "" : "s";
printf("WARNING: %s using %d thread%s due to thread creation error\n\n",
operation, log_count, log_s);
}
// If we managed to spawn any threads wait for them to complete
if (actual_thread_count != 0)
{
for (int i = 0; i < actual_thread_count; i++)
{
pthread_join(thread_descs[i].thread_handle, nullptr);
}
}
// Else fall back to using this thread
else
{
func(1, 0, payload);
}
delete[] thread_descs;
}

Some files were not shown because too many files have changed in this diff Show More