Add ktx
@@ -0,0 +1,7 @@
|
||||
# Copyright 2024 The Khronos Group Inc.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
---
|
||||
# Disable clang-format in this directory
|
||||
DisableFormat: true
|
||||
SortIncludes: false
|
||||
...
|
||||
@@ -0,0 +1,12 @@
|
||||
<!-- Copyright 2025 Mark Callow -->
|
||||
<!-- SPDX-License-Identifier: Apache-2.0 -->
|
||||
|
||||
SDL_gesture.h
|
||||
-------------
|
||||
|
||||
The Gesture API was removed from SDL3. As a migration path they provided an equivalent single-header library `SDL_gesture.h` that can be dropped into an SDL3-based project.
|
||||
|
||||
They do not make formal releases of this code; they say "just grab the latest and drop it into your project!"
|
||||
|
||||
This file originates from the fork https://github.com/MarkCallow/SDL_gesture.git, whose upstream is
|
||||
https://github.com/libsdl-org/SDL_gesture. It includes modifications for robustness to prevent production of spurious GESTURE\_MULTIGESTURE events.
|
||||
@@ -0,0 +1,966 @@
|
||||
/*
|
||||
Simple DirectMedia Layer
|
||||
Copyright (C) 1997-2022 Sam Lantinga <slouken@libsdl.org>
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not
|
||||
claim that you wrote the original software. If you use this software
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/* Touch gestures were removed from SDL3, so this is the SDL2 implementation copied in here, and tweaked a little. */
|
||||
|
||||
#ifndef INCL_SDL_GESTURE_H
|
||||
#define INCL_SDL_GESTURE_H
|
||||
|
||||
#if !defined(SDL_MAJOR_VERSION)
|
||||
#error Please include SDL.h before including this header.
|
||||
#elif SDL_MAJOR_VERSION < 2
|
||||
#error This header requires SDL2 or later.
|
||||
#elif SDL_MAJOR_VERSION == 2
|
||||
/* building against SDL2? Just use the built-in SDL2 implementation. */
|
||||
#define Gesture_Init() (0)
|
||||
#define Gesture_Quit()
|
||||
#define Gesture_ID SDL_GestureID
|
||||
#define Gesture_LoadDollarTemplates SDL_LoadDollarTemplates
|
||||
#define Gesture_RecordGesture SDL_RecordGesture
|
||||
#define Gesture_SaveAllDollarTemplates SDL_SaveAllDollarTemplates
|
||||
#define Gesture_SaveDollarTemplate SDL_SaveDollarTemplate
|
||||
#define GESTURE_DOLLARGESTURE SDL_DOLLARGESTURE
|
||||
#define GESTURE_DOLLARRECORD SDL_DOLLARRECORD
|
||||
#define GESTURE_MULTIGESTURE SDL_MULTIGESTURE
|
||||
#define Gesture_MultiGestureEvent SDL_MultiGestureEvent
|
||||
#define Gesture_DollarGestureEvent SDL_DollarGestureEvent
|
||||
#else
|
||||
|
||||
#ifdef __cplusplus
#include <cassert>
#include <cmath>
#else
#include <assert.h>
#include <math.h>
#endif
|
||||
|
||||
/* Set up for C function definitions, even when using C++ */
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef Sint64 Gesture_ID;
|
||||
|
||||
/* events... */
|
||||
|
||||
/* generally you shouldn't hardcode event type numbers--and doubly so in
|
||||
the reserved range!--but these match SDL2 and SDL3 promises to preserve
|
||||
these values to help sdl2-compat. */
|
||||
#define GESTURE_DOLLARGESTURE 0x800
|
||||
#define GESTURE_DOLLARRECORD 0x801
|
||||
#define GESTURE_MULTIGESTURE 0x802
|
||||
|
||||
typedef struct Gesture_MultiGestureEvent
|
||||
{
|
||||
Uint32 type;
|
||||
Uint32 reserved;
|
||||
Uint64 timestamp;
|
||||
SDL_TouchID touchID;
|
||||
float dTheta;
|
||||
float dDist;
|
||||
float x;
|
||||
float y;
|
||||
Uint16 numFingers;
|
||||
Uint16 padding;
|
||||
} Gesture_MultiGestureEvent;
|
||||
|
||||
typedef struct Gesture_DollarGestureEvent
|
||||
{
|
||||
Uint32 type;
|
||||
Uint32 reserved;
|
||||
Uint64 timestamp;
|
||||
SDL_TouchID touchID;
|
||||
Gesture_ID gestureId;
|
||||
Uint32 numFingers;
|
||||
float error;
|
||||
float x;
|
||||
float y;
|
||||
} Gesture_DollarGestureEvent;
|
||||
|
||||
|
||||
/* Function prototypes */
|
||||
|
||||
/**
|
||||
* Call this once, AFTER SDL_Init, to set up the Gesture API.
|
||||
*
|
||||
* \returns 0 on success, -1 on error. Call SDL_GetError() for specifics.
|
||||
*/
|
||||
extern int SDLCALL Gesture_Init(void);
|
||||
|
||||
/**
|
||||
* Call this once, BEFORE SDL_Quit, to clean up the Gesture API.
|
||||
*/
|
||||
extern void SDLCALL Gesture_Quit(void);
|
||||
|
||||
/**
|
||||
* Begin recording a gesture on a specified touch device or all touch devices.
|
||||
*
|
||||
* If the parameter `touchID` is -1 (i.e., all devices), this function will
|
||||
* always return 1, regardless of whether there actually are any devices.
|
||||
*
|
||||
* \param touchID the touch device id, or -1 for all touch devices
|
||||
* \returns 1 on success or 0 if the specified device could not be found.
|
||||
*/
|
||||
extern int SDLCALL Gesture_RecordGesture(SDL_TouchID touchID);
|
||||
|
||||
/**
|
||||
* Save all currently loaded Dollar Gesture templates.
|
||||
*
|
||||
* \param dst a SDL_IOStream to save to
|
||||
* \returns the number of saved templates on success or 0 on failure; call
|
||||
* SDL_GetError() for more information.
|
||||
*
|
||||
* \since This function is available since SDL 2.0.0.
|
||||
*
|
||||
* \sa Gesture_LoadDollarTemplates
|
||||
* \sa Gesture_SaveDollarTemplate
|
||||
*/
|
||||
extern int SDLCALL Gesture_SaveAllDollarTemplates(SDL_IOStream *dst);
|
||||
|
||||
/**
|
||||
* Save a currently loaded Dollar Gesture template.
|
||||
*
|
||||
* \param gestureId a gesture id
|
||||
* \param dst a SDL_IOStream to save to
|
||||
* \returns 1 on success or 0 on failure; call SDL_GetError() for more
|
||||
* information.
|
||||
*
|
||||
* \since This function is available since SDL 2.0.0.
|
||||
*
|
||||
* \sa Gesture_LoadDollarTemplates
|
||||
* \sa Gesture_SaveAllDollarTemplates
|
||||
*/
|
||||
extern int SDLCALL Gesture_SaveDollarTemplate(Gesture_ID gestureId, SDL_IOStream *dst);
|
||||
|
||||
/**
|
||||
* Load Dollar Gesture templates from a file.
|
||||
*
|
||||
* \param touchID a touch id
|
||||
* \param src a SDL_IOStream to load from
|
||||
* \returns the number of loaded templates on success or a negative error code
|
||||
* (or 0) on failure; call SDL_GetError() for more information.
|
||||
*
|
||||
* \since This function is available since SDL 2.0.0.
|
||||
*
|
||||
* \sa Gesture_SaveAllDollarTemplates
|
||||
* \sa Gesture_SaveDollarTemplate
|
||||
*/
|
||||
extern int SDLCALL Gesture_LoadDollarTemplates(SDL_TouchID touchID, SDL_IOStream *src);
|
||||
|
||||
/* Ends C function definitions when using C++ */
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(SDL_GESTURE_IMPLEMENTATION)
|
||||
|
||||
#define GESTURE_MAX_DOLLAR_PATH_SIZE 1024
|
||||
#define GESTURE_DOLLARNPOINTS 64
|
||||
#define GESTURE_DOLLARSIZE 256
|
||||
#define GESTURE_PHI 0.618033989
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float length;
|
||||
int numPoints;
|
||||
SDL_FPoint p[GESTURE_MAX_DOLLAR_PATH_SIZE];
|
||||
} GestureDollarPath;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
SDL_FPoint path[GESTURE_DOLLARNPOINTS];
|
||||
Sint64 hash;
|
||||
} GestureDollarTemplate;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
SDL_TouchID touchID;
|
||||
SDL_FPoint centroid;
|
||||
GestureDollarPath dollarPath;
|
||||
int numDownFingers;
|
||||
int numDollarTemplates;
|
||||
GestureDollarTemplate *dollarTemplate;
|
||||
bool recording;
|
||||
} GestureTouch;
|
||||
|
||||
static GestureTouch *GestureTouches = NULL;
|
||||
static int GestureNumTouches = 0;
|
||||
static bool GestureRecordAll = false;
|
||||
|
||||
static void GestureProcessEvent(const SDL_Event *event);
|
||||
|
||||
/* Event-watch callback registered by Gesture_Init(): forwards every event to
   the gesture recognizer. The user-data pointer is unused (it is registered
   as NULL). Always returns true.

   The parameter is named (and explicitly ignored) because an unnamed
   parameter in a function definition is not valid C before C23. */
static bool SDLCALL GestureEventWatch(void *userdata, SDL_Event *event)
{
    (void)userdata;
    GestureProcessEvent(event);
    return true;
}
|
||||
|
||||
int Gesture_Init(void)
|
||||
{
|
||||
Gesture_Quit();
|
||||
SDL_AddEventWatch(GestureEventWatch, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static GestureTouch *GestureAddTouch(const SDL_TouchID touchID)
|
||||
{
|
||||
GestureTouch *gestureTouch = (GestureTouch *)SDL_realloc(GestureTouches, (GestureNumTouches + 1) * sizeof(GestureTouch));
|
||||
if (gestureTouch == NULL) {
|
||||
SDL_OutOfMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
GestureTouches = gestureTouch;
|
||||
SDL_zero(GestureTouches[GestureNumTouches]);
|
||||
GestureTouches[GestureNumTouches].touchID = touchID;
|
||||
return &GestureTouches[GestureNumTouches++];
|
||||
}
|
||||
|
||||
#if 0
|
||||
static int GestureDelTouch(const SDL_TouchID touchID)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < GestureNumTouches; i++) {
|
||||
if (GestureTouches[i].touchID == touchID) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i == GestureNumTouches) {
|
||||
/* not found */
|
||||
return -1;
|
||||
}
|
||||
|
||||
SDL_free(GestureTouches[i].dollarTemplate);
|
||||
SDL_zero(GestureTouches[i]);
|
||||
|
||||
GestureNumTouches--;
|
||||
if (i != GestureNumTouches) {
|
||||
SDL_copyp(&GestureTouches[i], &GestureTouches[GestureNumTouches]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static GestureTouch *GestureGetTouch(const SDL_TouchID touchID)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < GestureNumTouches; i++) {
|
||||
/* printf("%i ?= %i\n",GestureTouches[i].touchID,touchID); */
|
||||
if (GestureTouches[i].touchID == touchID) {
|
||||
return &GestureTouches[i];
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Start recording a dollar gesture on one touch device, or on all of them.

   touchID: a device id, or -1 (i.e. (SDL_TouchID)-1) for "all devices".
            0 is also accepted as "all devices", since SDL3 documents 0 as
            an invalid touch id.
   Returns 1 on success, 0 if a specific device was requested but is unknown.
   The "all devices" form always returns 1, even when no device exists. */
int Gesture_RecordGesture(SDL_TouchID touchID)
{
    SDL_TouchID *devices;
    int i;

    devices = SDL_GetTouchDevices(NULL);
    if (devices) {
        /* make sure we know about all the devices SDL3 knows about, since we aren't connected as tightly as we were in SDL2. */
        for (i = 0; devices[i]; i++) {
            if (!GestureGetTouch(devices[i])) {
                GestureAddTouch(devices[i]); /* best effort; failure leaves the device unknown */
            }
        }
        SDL_free(devices);
    }

    /* The previous test (`touchID != 0`) was inverted: every real device id
       armed recording on all devices and the per-device path was only
       reachable for the invalid id 0. */
    if (touchID == 0 || touchID == (SDL_TouchID)-1) {
        GestureRecordAll = true; /* !!! FIXME: this is never set back to false anywhere, that's probably a bug. */
        for (i = 0; i < GestureNumTouches; i++) {
            GestureTouches[i].recording = true;
        }
    } else {
        GestureTouch *touch = GestureGetTouch(touchID);
        if (!touch) {
            return 0; /* bogus touchid */
        }
        touch->recording = true;
    }

    return 1;
}
|
||||
|
||||
void Gesture_Quit(void)
|
||||
{
|
||||
SDL_RemoveEventWatch(GestureEventWatch, NULL);
|
||||
SDL_free(GestureTouches);
|
||||
GestureTouches = NULL;
|
||||
GestureNumTouches = 0;
|
||||
GestureRecordAll = false;
|
||||
}
|
||||
|
||||
/* Hash the GESTURE_DOLLARNPOINTS points of a normalized path.
   Starts from 5381 and, for each coordinate (x then y), multiplies the
   running hash by 33 and adds the coordinate converted to unsigned long. */
static unsigned long GestureHashDollar(SDL_FPoint *points)
{
    unsigned long acc = 5381;
    const SDL_FPoint *pt = points;
    const SDL_FPoint *const end = points + GESTURE_DOLLARNPOINTS;

    for (; pt != end; pt++) {
        acc = acc * 33 + (unsigned long)pt->x;
        acc = acc * 33 + (unsigned long)pt->y;
    }
    return acc;
}
|
||||
|
||||
/* Write one template's path to `dst` as little-endian floats.
   The hash is not stored; it is recomputed when the template is loaded.
   Returns 1 on success, 0 if `dst` is NULL or the write came up short. */
static int GestureSaveTemplate(GestureDollarTemplate *templ, SDL_IOStream *dst)
{
    const size_t bytes = sizeof(templ->path[0]) * GESTURE_DOLLARNPOINTS;
    const SDL_FPoint *out = templ->path;
#if SDL_BYTEORDER != SDL_LIL_ENDIAN
    GestureDollarTemplate swapped;
    int k;
#endif

    if (!dst) {
        return 0;
    }

#if SDL_BYTEORDER != SDL_LIL_ENDIAN
    /* Byte-swap a private copy so the caller's template stays untouched. */
    swapped = *templ;
    for (k = 0; k < GESTURE_DOLLARNPOINTS; k++) {
        swapped.path[k].x = SDL_SwapFloatLE(swapped.path[k].x);
        swapped.path[k].y = SDL_SwapFloatLE(swapped.path[k].y);
    }
    out = swapped.path;
#endif

    return (SDL_WriteIO(dst, out, bytes) != bytes) ? 0 : 1;
}
|
||||
|
||||
SDL_DECLSPEC int SDLCALL
|
||||
Gesture_SaveAllDollarTemplates(SDL_IOStream *dst)
|
||||
{
|
||||
int i, j, rtrn = 0;
|
||||
for (i = 0; i < GestureNumTouches; i++) {
|
||||
GestureTouch *touch = &GestureTouches[i];
|
||||
for (j = 0; j < touch->numDollarTemplates; j++) {
|
||||
rtrn += GestureSaveTemplate(&touch->dollarTemplate[j], dst);
|
||||
}
|
||||
}
|
||||
return rtrn;
|
||||
}
|
||||
|
||||
SDL_DECLSPEC int SDLCALL
|
||||
Gesture_SaveDollarTemplate(Gesture_ID gestureId, SDL_IOStream *dst)
|
||||
{
|
||||
int i, j;
|
||||
for (i = 0; i < GestureNumTouches; i++) {
|
||||
GestureTouch *touch = &GestureTouches[i];
|
||||
for (j = 0; j < touch->numDollarTemplates; j++) {
|
||||
if (touch->dollarTemplate[j].hash == gestureId) {
|
||||
return GestureSaveTemplate(&touch->dollarTemplate[j], dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
return SDL_SetError("Unknown gestureId");
|
||||
}
|
||||
|
||||
/* path is an already sampled set of points
|
||||
Returns the index of the gesture on success, or -1 */
|
||||
static int GestureAddDollar_one(GestureTouch *inTouch, SDL_FPoint *path)
|
||||
{
|
||||
GestureDollarTemplate *dollarTemplate;
|
||||
GestureDollarTemplate *templ;
|
||||
int index;
|
||||
|
||||
index = inTouch->numDollarTemplates;
|
||||
dollarTemplate = (GestureDollarTemplate *)SDL_realloc(inTouch->dollarTemplate, (index + 1) * sizeof(GestureDollarTemplate));
|
||||
if (dollarTemplate == NULL) {
|
||||
return SDL_OutOfMemory();
|
||||
}
|
||||
inTouch->dollarTemplate = dollarTemplate;
|
||||
|
||||
templ = &inTouch->dollarTemplate[index];
|
||||
SDL_memcpy(templ->path, path, GESTURE_DOLLARNPOINTS * sizeof(SDL_FPoint));
|
||||
templ->hash = GestureHashDollar(templ->path);
|
||||
inTouch->numDollarTemplates++;
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
/* Add a template to one touch device, or to every known device.

   inTouch: target device, or NULL to add the template to all devices.
   path:    an already sampled set of GESTURE_DOLLARNPOINTS points.
   Returns the template index on success (for the NULL form, the index on
   the last device), or -1 on failure with the SDL error set. When adding to
   all devices, a failure part-way leaves the earlier devices updated. */
static int GestureAddDollar(GestureTouch *inTouch, SDL_FPoint *path)
{
    int index = -1;
    int i = 0;

    if (inTouch == NULL) {
        if (GestureNumTouches == 0) {
            /* SDL3's SDL_SetError() returns bool false (0 as an int), which
               callers testing `>= 0` would treat as success; return the
               negative failure value explicitly. */
            SDL_SetError("no gesture touch devices registered");
            return -1;
        }
        for (i = 0; i < GestureNumTouches; i++) {
            inTouch = &GestureTouches[i];
            index = GestureAddDollar_one(inTouch, path);
            if (index < 0) {
                return -1;
            }
        }
        /* Use the index of the last one added. */
        return index;
    }
    return GestureAddDollar_one(inTouch, path);
}
|
||||
|
||||
SDL_DECLSPEC int SDLCALL
|
||||
Gesture_LoadDollarTemplates(SDL_TouchID touchID, SDL_IOStream *src)
|
||||
{
|
||||
int i, loaded = 0;
|
||||
GestureTouch *touch = NULL;
|
||||
if (src == NULL) {
|
||||
return 0;
|
||||
}
|
||||
/* In SDL2 this test was `touchID >= 0` leading to warnings from gcc
|
||||
because SDL_TouchId is now Uint64. In SDL2 it was Sint64. The
|
||||
documentation does not say what < 0 means here but the only defined
|
||||
negative touchID was SDL_MOUSE_TOUCHID (-1). In SDL3 SDL_PEN_TOUCHID (-2)
|
||||
has been added hence this test. Given the lack of documentation
|
||||
it is impossible to say if this updated test is correct. */
|
||||
if (touchID < SDL_PEN_TOUCHID) {
|
||||
for (i = 0; i < GestureNumTouches; i++) {
|
||||
if (GestureTouches[i].touchID == touchID) {
|
||||
touch = &GestureTouches[i];
|
||||
}
|
||||
}
|
||||
if (touch == NULL) {
|
||||
return SDL_SetError("given touch id not found");
|
||||
}
|
||||
}
|
||||
|
||||
while (1) {
|
||||
GestureDollarTemplate templ;
|
||||
const size_t bytes = sizeof(templ.path[0]) * GESTURE_DOLLARNPOINTS;
|
||||
|
||||
if (SDL_ReadIO(src, templ.path, bytes) < bytes) {
|
||||
if (loaded == 0) {
|
||||
return SDL_SetError("could not read any dollar gesture from rwops");
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
#if SDL_BYTEORDER != SDL_LIL_ENDIAN
|
||||
for (i = 0; i < GESTURE_DOLLARNPOINTS; i++) {
|
||||
SDL_FPoint *p = &templ.path[i];
|
||||
p->x = SDL_SwapFloatLE(p->x);
|
||||
p->y = SDL_SwapFloatLE(p->y);
|
||||
}
|
||||
#endif
|
||||
|
||||
// See comment at line 436.
|
||||
if (touchID < SDL_PEN_TOUCHID) {
|
||||
/* printf("Adding loaded gesture to 1 touch\n"); */
|
||||
if (GestureAddDollar(touch, templ.path) >= 0) {
|
||||
loaded++;
|
||||
}
|
||||
} else {
|
||||
/* printf("Adding to: %i touches\n",GestureNumTouches); */
|
||||
for (i = 0; i < GestureNumTouches; i++) {
|
||||
touch = &GestureTouches[i];
|
||||
/* printf("Adding loaded gesture to + touches\n"); */
|
||||
/* TODO: What if this fails? */
|
||||
GestureAddDollar(touch, templ.path);
|
||||
}
|
||||
loaded++;
|
||||
}
|
||||
}
|
||||
|
||||
return loaded;
|
||||
}
|
||||
|
||||
/* Mean point-to-point distance between `points` rotated by `ang` radians
   about the origin and `templ`. Both arrays hold GESTURE_DOLLARNPOINTS
   points. */
static float GestureDollarDifference(SDL_FPoint *points, SDL_FPoint *templ, float ang)
{
    const float c = SDL_cosf(ang);
    const float s = SDL_sinf(ang);
    float total = 0;
    int k;

    for (k = 0; k < GESTURE_DOLLARNPOINTS; k++) {
        SDL_FPoint rot;
        float ex, ey;

        rot.x = points[k].x * c - points[k].y * s;
        rot.y = points[k].x * s + points[k].y * c;
        ex = rot.x - templ[k].x;
        ey = rot.y - templ[k].y;
        total += SDL_sqrtf(ex * ex + ey * ey);
    }
    return total / GESTURE_DOLLARNPOINTS;
}
|
||||
|
||||
/* Search rotation angles in [-pi/4, pi/4] for the one that minimizes
   GestureDollarDifference(points, templ, angle), narrowing the bracket with
   the GESTURE_PHI ratio until it is no wider than pi/90.
   Returns the smaller of the two differences at the final probe angles. */
static float GestureBestDollarDifference(SDL_FPoint *points, SDL_FPoint *templ)
{
    /*------------BEGIN DOLLAR BLACKBOX------------------
     -TRANSLATED DIRECTLY FROM PSEUDO-CODE AVAILABLE AT-
     -"http://depts.washington.edu/aimgroup/proj/dollar/"
     */
    const double tolerance = SDL_PI_D / 90;
    double lo = -SDL_PI_D / 4;
    double hi = SDL_PI_D / 4;
    float probeA = (float)(GESTURE_PHI * lo + (1 - GESTURE_PHI) * hi);
    float errA = GestureDollarDifference(points, templ, probeA);
    float probeB = (float)((1 - GESTURE_PHI) * lo + GESTURE_PHI * hi);
    float errB = GestureDollarDifference(points, templ, probeB);

    for (;;) {
        if (!(SDL_fabs(lo - hi) > tolerance)) {
            break;
        }
        if (errA < errB) {
            /* minimum lies towards `lo`: pull the upper bound in */
            hi = probeB;
            probeB = probeA;
            errB = errA;
            probeA = (float)(GESTURE_PHI * lo + (1 - GESTURE_PHI) * hi);
            errA = GestureDollarDifference(points, templ, probeA);
        } else {
            /* minimum lies towards `hi`: pull the lower bound in */
            lo = probeA;
            probeA = probeB;
            errA = errB;
            probeB = (float)((1 - GESTURE_PHI) * lo + GESTURE_PHI * hi);
            errB = GestureDollarDifference(points, templ, probeB);
        }
    }

    return (errA < errB) ? errA : errB;
}
|
||||
|
||||
/* `path` contains raw points, plus (possibly) the calculated length */
|
||||
static int GestureDollarNormalize(const GestureDollarPath *path, SDL_FPoint *points, bool is_recording)
|
||||
{
|
||||
int i;
|
||||
float interval;
|
||||
float dist;
|
||||
int numPoints = 0;
|
||||
SDL_FPoint centroid;
|
||||
float xmin, xmax, ymin, ymax;
|
||||
float ang;
|
||||
float w, h;
|
||||
float length = path->length;
|
||||
|
||||
/* Calculate length if it hasn't already been done */
|
||||
if (length <= 0) {
|
||||
for (i = 1; i < path->numPoints; i++) {
|
||||
const float dx = path->p[i].x - path->p[i - 1].x;
|
||||
const float dy = path->p[i].y - path->p[i - 1].y;
|
||||
length += SDL_sqrtf(dx * dx + dy * dy);
|
||||
}
|
||||
}
|
||||
|
||||
/* Resample */
|
||||
interval = length / (GESTURE_DOLLARNPOINTS - 1);
|
||||
dist = interval;
|
||||
|
||||
centroid.x = 0;
|
||||
centroid.y = 0;
|
||||
|
||||
/* printf("(%f,%f)\n",path->p[path->numPoints-1].x,path->p[path->numPoints-1].y); */
|
||||
for (i = 1; i < path->numPoints; i++) {
|
||||
const float d = SDL_sqrtf((path->p[i - 1].x - path->p[i].x) * (path->p[i - 1].x - path->p[i].x) + (path->p[i - 1].y - path->p[i].y) * (path->p[i - 1].y - path->p[i].y));
|
||||
/* printf("d = %f dist = %f/%f\n",d,dist,interval); */
|
||||
while (dist + d > interval) {
|
||||
points[numPoints].x = path->p[i - 1].x +
|
||||
((interval - dist) / d) * (path->p[i].x - path->p[i - 1].x);
|
||||
points[numPoints].y = path->p[i - 1].y +
|
||||
((interval - dist) / d) * (path->p[i].y - path->p[i - 1].y);
|
||||
centroid.x += points[numPoints].x;
|
||||
centroid.y += points[numPoints].y;
|
||||
numPoints++;
|
||||
|
||||
dist -= interval;
|
||||
}
|
||||
dist += d;
|
||||
}
|
||||
if (numPoints < GESTURE_DOLLARNPOINTS - 1) {
|
||||
if (is_recording) {
|
||||
SDL_SetError("ERROR: NumPoints = %i", numPoints);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
/* copy the last point */
|
||||
points[GESTURE_DOLLARNPOINTS - 1] = path->p[path->numPoints - 1];
|
||||
numPoints = GESTURE_DOLLARNPOINTS;
|
||||
|
||||
centroid.x /= numPoints;
|
||||
centroid.y /= numPoints;
|
||||
|
||||
/* printf("Centroid (%f,%f)",centroid.x,centroid.y); */
|
||||
/* Rotate Points so point 0 is left of centroid and solve for the bounding box */
|
||||
xmin = centroid.x;
|
||||
xmax = centroid.x;
|
||||
ymin = centroid.y;
|
||||
ymax = centroid.y;
|
||||
|
||||
ang = SDL_atan2f(centroid.y - points[0].y, centroid.x - points[0].x);
|
||||
|
||||
for (i = 0; i < numPoints; i++) {
|
||||
const float px = points[i].x;
|
||||
const float py = points[i].y;
|
||||
points[i].x = (px - centroid.x) * SDL_cosf(ang) - (py - centroid.y) * SDL_sinf(ang) + centroid.x;
|
||||
points[i].y = (px - centroid.x) * SDL_sinf(ang) + (py - centroid.y) * SDL_cosf(ang) + centroid.y;
|
||||
|
||||
if (points[i].x < xmin) {
|
||||
xmin = points[i].x;
|
||||
}
|
||||
if (points[i].x > xmax) {
|
||||
xmax = points[i].x;
|
||||
}
|
||||
if (points[i].y < ymin) {
|
||||
ymin = points[i].y;
|
||||
}
|
||||
if (points[i].y > ymax) {
|
||||
ymax = points[i].y;
|
||||
}
|
||||
}
|
||||
|
||||
/* Scale points to GESTURE_DOLLARSIZE, and translate to the origin */
|
||||
w = xmax - xmin;
|
||||
h = ymax - ymin;
|
||||
|
||||
for (i = 0; i < numPoints; i++) {
|
||||
points[i].x = (points[i].x - centroid.x) * GESTURE_DOLLARSIZE / w;
|
||||
points[i].y = (points[i].y - centroid.y) * GESTURE_DOLLARSIZE / h;
|
||||
}
|
||||
return numPoints;
|
||||
}
|
||||
|
||||
/* Match a raw stroke against every template stored for `touch`.

   path:      the raw stroke; it is normalized into a zero-filled local
              buffer (the normalizer's result is not checked).
   bestTempl: receives the index of the closest template, or -1 when no
              template scores below the initial threshold of 10000.
   Returns the difference of the closest template, or 10000 when none
   matched. */
static float GestureDollarRecognize(const GestureDollarPath *path, int *bestTempl, GestureTouch *touch)
{
    SDL_FPoint normalized[GESTURE_DOLLARNPOINTS];
    float lowest = 10000;
    int t = 0;

    SDL_memset(normalized, 0, sizeof(normalized));
    GestureDollarNormalize(path, normalized, false);

    *bestTempl = -1;
    while (t < touch->numDollarTemplates) {
        const float score = GestureBestDollarDifference(normalized, touch->dollarTemplate[t].path);
        if (score < lowest) {
            lowest = score;
            *bestTempl = t;
        }
        t++;
    }
    return lowest;
}
|
||||
|
||||
/* Push a GESTURE_MULTIGESTURE event for `touch`, if that event type is
   enabled. The event carries the touch's centroid, the given rotation and
   distance deltas, and the current number of fingers down. */
static void GestureSendMulti(GestureTouch *touch, float dTheta, float dDist)
{
    if (SDL_EventEnabled(GESTURE_MULTIGESTURE)) {
        SDL_Event event;
        Gesture_MultiGestureEvent mgesture;

        /* Zero first so `reserved` and `padding` are not stack garbage. */
        SDL_zero(mgesture);
        mgesture.type = GESTURE_MULTIGESTURE;
        mgesture.timestamp = 0;
        mgesture.touchID = touch->touchID;
        mgesture.x = touch->centroid.x;
        mgesture.y = touch->centroid.y;
        mgesture.dTheta = dTheta;
        mgesture.dDist = dDist;
        mgesture.numFingers = (Uint16)touch->numDownFingers;

        /* SDL_PushEvent() takes an SDL_Event*. Hand it a real, full-sized
           SDL_Event instead of a cast pointer to this smaller struct, so
           SDL never reads beyond the object it was given. */
        SDL_zero(event);
        SDL_memcpy(&event, &mgesture, sizeof(mgesture));
        SDL_PushEvent(&event);
    }
}
|
||||
|
||||
/* Push a GESTURE_DOLLARGESTURE event for `touch`, if that event type is
   enabled, reporting the recognized template id and its match error. */
static void GestureSendDollar(GestureTouch *touch, Gesture_ID gestureId, float error)
{
    if (SDL_EventEnabled(GESTURE_DOLLARGESTURE)) {
        SDL_Event event;
        Gesture_DollarGestureEvent dgesture;

        /* Zero first so `reserved` is not stack garbage. */
        SDL_zero(dgesture);
        dgesture.type = GESTURE_DOLLARGESTURE;
        dgesture.timestamp = 0;
        dgesture.touchID = touch->touchID;
        dgesture.x = touch->centroid.x;
        dgesture.y = touch->centroid.y;
        dgesture.gestureId = gestureId;
        dgesture.error = error;
        /* A finger came up to trigger this event. */
        dgesture.numFingers = touch->numDownFingers + 1;

        /* Push a full-sized SDL_Event rather than a cast pointer to the
           smaller gesture struct, so SDL never reads past the object. */
        SDL_zero(event);
        SDL_memcpy(&event, &dgesture, sizeof(dgesture));
        SDL_PushEvent(&event);
    }
}
|
||||
|
||||
/* Push a GESTURE_DOLLARRECORD event for `touch`, if that event type is
   enabled. `gestureId` is the id of the template that was just recorded
   (callers pass -1 when recording failed). Only type, touchID and gestureId
   are meaningful; every other field is sent as zero. */
static void GestureSendDollarRecord(GestureTouch *touch, Gesture_ID gestureId)
{
    if (SDL_EventEnabled(GESTURE_DOLLARRECORD)) {
        SDL_Event event;
        Gesture_DollarGestureEvent dgesture;

        /* Zero first: x, y, error, numFingers and reserved are never
           assigned here and used to be sent uninitialized. */
        SDL_zero(dgesture);
        dgesture.type = GESTURE_DOLLARRECORD;
        dgesture.timestamp = 0;
        dgesture.touchID = touch->touchID;
        dgesture.gestureId = gestureId;

        /* Push a full-sized SDL_Event rather than a cast pointer to the
           smaller gesture struct, so SDL never reads past the object. */
        SDL_zero(event);
        SDL_memcpy(&event, &dgesture, sizeof(dgesture));
        SDL_PushEvent(&event);
    }
}
|
||||
|
||||
#if !defined(GESTURE_LOG_UP_DOWN_EVENTS)
|
||||
#define GESTURE_LOG_UP_DOWN_EVENTS 0
|
||||
#endif
|
||||
#if !defined(GESTURE_LOG_MOTION_EVENTS)
|
||||
#define GESTURE_LOG_MOTION_EVENTS 0
|
||||
#endif
|
||||
|
||||
static void GestureProcessEvent(const SDL_Event *event)
|
||||
{
|
||||
float x, y;
|
||||
int index;
|
||||
int i;
|
||||
float pathDx, pathDy;
|
||||
SDL_FPoint lastP;
|
||||
SDL_FPoint lastCentroid;
|
||||
float lDist;
|
||||
float Dist;
|
||||
float dtheta;
|
||||
float dDist;
|
||||
|
||||
if (event->type == SDL_EVENT_FINGER_MOTION || event->type == SDL_EVENT_FINGER_DOWN || event->type == SDL_EVENT_FINGER_UP) {
|
||||
GestureTouch *inTouch = GestureGetTouch(event->tfinger.touchID);
|
||||
if (inTouch == NULL) { /* we maybe didn't see this one before. */
|
||||
inTouch = GestureAddTouch(event->tfinger.touchID);
|
||||
if (!inTouch) {
|
||||
return; /* oh well. */
|
||||
}
|
||||
}
|
||||
int numDownFingersReported;
|
||||
SDL_Finger** fingers = SDL_GetTouchFingers(event->tfinger.touchID, &numDownFingersReported);
|
||||
|
||||
x = event->tfinger.x;
|
||||
y = event->tfinger.y;
|
||||
|
||||
/* Finger Up */
|
||||
if (event->type == SDL_EVENT_FINGER_UP) {
|
||||
#if GESTURE_LOG_UP_DOWN_EVENTS
|
||||
SDL_Log("GPE: Finger: %#" SDL_PRIx64 " UP. Device: %#" SDL_PRIx64 ", fingers: %i, x: %f, y: %f, press: %f",
|
||||
event->tfinger.fingerID, event->tfinger.touchID, numDownFingersReported,
|
||||
event->tfinger.x, event->tfinger.y, event->tfinger.pressure);
|
||||
#endif
|
||||
SDL_FPoint path[GESTURE_DOLLARNPOINTS];
|
||||
|
||||
#if SDL_PLATFORM_MACOS
|
||||
/* Workaround issue https://github.com/libsdl-org/SDL/issues/13428,
|
||||
Extra SDL_EVENT_FINGER_{UP,DOWN} with mouse button press, by
|
||||
ignoring events with fingerID of SDL_BUTTON_LEFT.
|
||||
|
||||
N.B. If SDL_HINT_MOUSE_TOUCH_EVENTS is set to 0 no touch
|
||||
events are received from the trackpad. */
|
||||
if (event->tfinger.fingerID == SDL_BUTTON_LEFT) return;
|
||||
#endif
|
||||
/* Using the number of fingers returned by SDL_GetTouchFingers
|
||||
is much more robust than counting finger up and down events.
|
||||
With counting it is easy for the counted number to be higher
|
||||
than the actual number. Unfortunately it has not been possible
|
||||
to identify a sequence of actions that reliably reproduces
|
||||
this but asserts have shown it happens often. Perhaps
|
||||
sometimes a single UP or DOWN event is received for multiple
|
||||
fingers.
|
||||
|
||||
Using the reported number is independent of how many events
|
||||
are actually received. But, and this is a big one, in the
|
||||
case of FINGER_UP SDL_GetTouchFingers reports the number of
|
||||
fingers down *before* the up event.
|
||||
|
||||
N.B. In the case of a left button press on macOS,
|
||||
SDL_GetTouchFingers reports 1 for the event that is not
|
||||
ignored.
|
||||
*/
|
||||
inTouch->numDownFingers = numDownFingersReported - 1;
|
||||
assert(inTouch->numDownFingers >= 0);
|
||||
#if (GESTURE_LOG_UP_DOWN_EVENTS)
|
||||
SDL_Log("GPE FINGER_UP, numDownFingers now = %i", inTouch->numDownFingers);
|
||||
#endif
|
||||
|
||||
if (inTouch->recording) {
|
||||
inTouch->recording = false;
|
||||
GestureDollarNormalize(&inTouch->dollarPath, path, true);
|
||||
/* PrintPath(path); */
|
||||
if (GestureRecordAll) {
|
||||
index = GestureAddDollar(NULL, path);
|
||||
for (i = 0; i < GestureNumTouches; i++) {
|
||||
GestureTouches[i].recording = false;
|
||||
}
|
||||
} else {
|
||||
index = GestureAddDollar(inTouch, path);
|
||||
}
|
||||
|
||||
if (index >= 0) {
|
||||
GestureSendDollarRecord(inTouch, inTouch->dollarTemplate[index].hash);
|
||||
} else {
|
||||
GestureSendDollarRecord(inTouch, -1);
|
||||
}
|
||||
} else {
|
||||
int bestTempl = -1;
|
||||
const float error = GestureDollarRecognize(&inTouch->dollarPath, &bestTempl, inTouch);
|
||||
if (bestTempl >= 0) {
|
||||
/* Send Event */
|
||||
const Gesture_ID gestureId = inTouch->dollarTemplate[bestTempl].hash;
|
||||
GestureSendDollar(inTouch, gestureId, error);
|
||||
/* printf ("%s\n",);("Dollar error: %f\n",error); */
|
||||
}
|
||||
}
|
||||
|
||||
/* inTouch->gestureLast[j] = inTouch->gestureLast[inTouch->numDownFingers]; */
|
||||
if (inTouch->numDownFingers > 0) {
|
||||
inTouch->centroid.x = (inTouch->centroid.x * (inTouch->numDownFingers + 1) - x) / inTouch->numDownFingers;
|
||||
inTouch->centroid.y = (inTouch->centroid.y * (inTouch->numDownFingers + 1) - y) / inTouch->numDownFingers;
|
||||
} else {
|
||||
inTouch->centroid.x = inTouch->centroid.y = 0.0f;
|
||||
}
|
||||
} else if (event->type == SDL_EVENT_FINGER_MOTION) {
|
||||
/* There is one FINGER_MOTION event per down finger. x,y gives
|
||||
the position of the finger whose id is in the event. */
|
||||
const float dx = event->tfinger.dx;
|
||||
const float dy = event->tfinger.dy;
|
||||
GestureDollarPath *path = &inTouch->dollarPath;
|
||||
|
||||
#if GESTURE_LOG_MOTION_EVENTS
|
||||
SDL_Log("GPE: Finger: %#" SDL_PRIx64 " MOTION: device: %#" SDL_PRIx64 ", timestamp = %"
|
||||
SDL_PRIu64 ", fingers: %i, x: %f, y: %f, press: %f, numDownFingers: %i",
|
||||
event->tfinger.fingerID, event->tfinger.touchID, event->tfinger.timestamp,
|
||||
numDownFingersReported, event->tfinger.x, event->tfinger.y, event->tfinger.pressure,
|
||||
inTouch->numDownFingers);
|
||||
#endif
|
||||
assert(numDownFingersReported > 0);
|
||||
#if SDL_PLATFORM_MACOS
|
||||
/* Workaround issue https://github.com/libsdl-org/SDL/issues/13428.
|
||||
See comment at line 753 for more details. */
|
||||
if (event->tfinger.fingerID == SDL_BUTTON_LEFT) return;
|
||||
/* SDL_GetTouchFingers reports 2 fingers down in the motion event
|
||||
for the other finger during button press. Fix up the number of
|
||||
fingers. */
|
||||
uint32_t reportedNumFingers = numDownFingersReported;
|
||||
for (uint32_t i = 0; i < reportedNumFingers; i++) {
|
||||
if (fingers[i]->id == SDL_BUTTON_LEFT) {
|
||||
numDownFingersReported--;
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* See comment at line 762. One case where the count reliably
|
||||
differs from reported is on iOS. When touching, dragging and
|
||||
releasing 2 fingers, iOS sends a BUTTON_DOWN and BUTTON_UP
|
||||
for one of the fingers. When the finger corresponding to the
|
||||
button is raised, it sends the BUTTON_UP followed by the
|
||||
FINGER_UP but FINGER_MOTION events can come before the
|
||||
FINGER_UP and those events have only one finger down. */
|
||||
inTouch->numDownFingers = numDownFingersReported;
|
||||
if (path->numPoints < GESTURE_MAX_DOLLAR_PATH_SIZE) {
|
||||
path->p[path->numPoints].x = inTouch->centroid.x;
|
||||
path->p[path->numPoints].y = inTouch->centroid.y;
|
||||
pathDx = (path->p[path->numPoints].x - path->p[path->numPoints - 1].x);
|
||||
pathDy = (path->p[path->numPoints].y - path->p[path->numPoints - 1].y);
|
||||
path->length += (float)SDL_sqrt(pathDx * pathDx + pathDy * pathDy);
|
||||
path->numPoints++;
|
||||
}
|
||||
|
||||
lastP.x = x - dx;
|
||||
lastP.y = y - dy;
|
||||
lastCentroid = inTouch->centroid;
|
||||
|
||||
inTouch->centroid.x += dx / inTouch->numDownFingers;
|
||||
inTouch->centroid.y += dy / inTouch->numDownFingers;
|
||||
/* printf("Centroid : (%f,%f)\n",inTouch->centroid.x,inTouch->centroid.y); */
|
||||
if (inTouch->numDownFingers > 1) {
|
||||
SDL_FPoint lv; /* Vector from centroid to last x,y position */
|
||||
SDL_FPoint v; /* Vector from centroid to current x,y position */
|
||||
/* lv = inTouch->gestureLast[j].cv; */
|
||||
lv.x = lastP.x - lastCentroid.x;
|
||||
lv.y = lastP.y - lastCentroid.y;
|
||||
lDist = SDL_sqrtf(lv.x * lv.x + lv.y * lv.y);
|
||||
/* printf("lDist = %f\n",lDist); */
|
||||
v.x = x - inTouch->centroid.x;
|
||||
v.y = y - inTouch->centroid.y;
|
||||
/* inTouch->gestureLast[j].cv = v; */
|
||||
Dist = SDL_sqrtf(v.x * v.x + v.y * v.y);
|
||||
/* SDL_cosf(dTheta) = (v . lv)/(|v| * |lv|) */
|
||||
|
||||
/* Normalize Vectors to simplify angle calculation */
|
||||
lv.x /= lDist;
|
||||
lv.y /= lDist;
|
||||
v.x /= Dist;
|
||||
v.y /= Dist;
|
||||
dtheta = SDL_atan2f(lv.x * v.y - lv.y * v.x, lv.x * v.x + lv.y * v.y);
|
||||
|
||||
dDist = (Dist - lDist);
|
||||
if (lDist == 0) {
|
||||
/* To avoid impossible values */
|
||||
dDist = 0;
|
||||
dtheta = 0;
|
||||
}
|
||||
|
||||
/* inTouch->gestureLast[j].dDist = dDist;
|
||||
inTouch->gestureLast[j].dtheta = dtheta;
|
||||
|
||||
printf("dDist = %f, dTheta = %f\n",dDist,dtheta);
|
||||
gdtheta = gdtheta*.9 + dtheta*.1;
|
||||
gdDist = gdDist*.9 + dDist*.1
|
||||
knob.r += dDist/numDownFingers;
|
||||
knob.ang += dtheta;
|
||||
printf("thetaSum = %f, distSum = %f\n",gdtheta,gdDist);
|
||||
printf("id: %i dTheta = %f, dDist = %f\n",j,dtheta,dDist); */
|
||||
GestureSendMulti(inTouch, dtheta, dDist);
|
||||
} else {
|
||||
/* inTouch->gestureLast[j].dDist = 0;
|
||||
inTouch->gestureLast[j].dtheta = 0;
|
||||
inTouch->gestureLast[j].cv.x = 0;
|
||||
inTouch->gestureLast[j].cv.y = 0; */
|
||||
}
|
||||
/* inTouch->gestureLast[j].f.p.x = x;
|
||||
inTouch->gestureLast[j].f.p.y = y;
|
||||
break;
|
||||
pressure? */
|
||||
} else if (event->type == SDL_EVENT_FINGER_DOWN) {
|
||||
#if (GESTURE_LOG_UP_DOWN_EVENTS)
|
||||
SDL_Log("GPE: Finger: %#" SDL_PRIx64 " DOWN. Device: %#" SDL_PRIx64 ", fingers: %i, x: %f, y: %f, press: %f",
|
||||
event->tfinger.fingerID, event->tfinger.touchID, numDownFingersReported,
|
||||
event->tfinger.x, event->tfinger.y, event->tfinger.pressure);
|
||||
#endif
|
||||
#if SDL_PLATFORM_MACOS
|
||||
/* See comment starting at line 753. */
|
||||
if (event->tfinger.fingerID == SDL_BUTTON_LEFT) return;
|
||||
#endif
|
||||
/* Using the number of fingers returned by SDL_GetTouchFingers
|
||||
is much more robust than counting finger up and down events.
|
||||
With counting it is easy for the counted number to be higher
|
||||
than the actual number. Unfortunately it has not been possible
|
||||
to identify a sequence of actions that reliably reproduces
|
||||
this. Using the reported number is independent of how many
|
||||
events are actually received. */
|
||||
inTouch->numDownFingers = numDownFingersReported;
|
||||
inTouch->centroid.x = inTouch->centroid.y = 0.0;
|
||||
for (i = 0; i < numDownFingersReported; i++) {
|
||||
inTouch->centroid.x += fingers[i]->x;
|
||||
inTouch->centroid.y += fingers[i]->y;
|
||||
}
|
||||
inTouch->centroid.x /= numDownFingersReported;
|
||||
inTouch->centroid.y /= numDownFingersReported;
|
||||
//printf("Finger Down: (%f,%f). Centroid: (%f,%f\n",x,y,
|
||||
// inTouch->centroid.x,inTouch->centroid.y);
|
||||
|
||||
inTouch->dollarPath.length = 0;
|
||||
inTouch->dollarPath.p[0].x = x;
|
||||
inTouch->dollarPath.p[0].y = y;
|
||||
inTouch->dollarPath.numPoints = 1;
|
||||
}
|
||||
SDL_free(fingers);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* defined(SDL_GESTURE_IMPLEMENTATION) */
|
||||
#endif /* SDL version > 2 */
|
||||
#endif /* INCL_SDL_GESTURE_H */
|
||||
|
||||
/* vi: set sts=4 ts=4 sw=4 expandtab: */
|
||||
@@ -0,0 +1,21 @@
|
||||
# Text type files use auto line endings
|
||||
* text=auto
|
||||
|
||||
# Explicitly declare text file types for this repo
|
||||
*.c text
|
||||
*.cpp text
|
||||
*.h text
|
||||
*.md text
|
||||
Jenkinsfile text
|
||||
|
||||
# VS solutions always use Windows line endings
|
||||
*.sln text eol=crlf
|
||||
*.vcxproj text eol=crlf
|
||||
|
||||
# Bash scripts always use *nix (LF) line endings
|
||||
*.sh text eol=lf
|
||||
|
||||
# Denote all files that are truly binary and should not be modified.
|
||||
*.png binary
|
||||
*.hdr binary
|
||||
*.exe binary
|
||||
@@ -0,0 +1,385 @@
|
||||
name: post-weekly-release
|
||||
run-name: Build, test, generate signed artifacts and optionally prepare release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- '*'
|
||||
schedule:
|
||||
- cron: '17 2 * * 1'
|
||||
|
||||
jobs:
|
||||
|
||||
coverity:
|
||||
if: ${{ (!startsWith(github.event.ref, 'refs/tags/')) && (github.repository_owner == 'Arm-software') }}
|
||||
name: Run Coverity static analysis
|
||||
runs-on: [self-hosted-ubuntu-latest-x64]
|
||||
steps:
|
||||
- name: Clean workspace
|
||||
uses: AutoModality/action-clean@v1
|
||||
|
||||
- name: Git checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- name: Coverity preparation
|
||||
run: |
|
||||
export PATH=$PATH:/usr/local/cov-analysis/bin
|
||||
mkdir build_cov
|
||||
cd build_cov
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_AVX2=ON ..
|
||||
cov-configure --config ${GITHUB_WORKSPACE}/coverity.conf --template --compiler cc --comptype gcc
|
||||
cov-configure --config ${GITHUB_WORKSPACE}/coverity.conf --template --compiler c++ --comptype g++
|
||||
|
||||
- name: Coverity build
|
||||
run: |
|
||||
export PATH=$PATH:/usr/local/cov-analysis/bin
|
||||
cd build_cov
|
||||
cov-build --config ${GITHUB_WORKSPACE}/coverity.conf --dir ${GITHUB_WORKSPACE}/intermediate make install
|
||||
|
||||
- name: Coverity analyze
|
||||
run: |
|
||||
export PATH=$PATH:/usr/local/cov-analysis/bin
|
||||
cd build_cov
|
||||
cov-analyze --dir ${GITHUB_WORKSPACE}/intermediate
|
||||
|
||||
- name: Coverity upload
|
||||
env:
|
||||
COVERITY_KEY: ${{ secrets.COVERITY_KEY }}
|
||||
run: |
|
||||
export PATH=$PATH:/usr/local/cov-analysis/bin
|
||||
echo "${COVERITY_KEY}" > coverity.key
|
||||
chmod 400 coverity.key
|
||||
cd build_cov
|
||||
cov-commit-defects \
|
||||
--dir ${GITHUB_WORKSPACE}/intermediate \
|
||||
--stream astcenc-master \
|
||||
--url https://coverity.cambridge.arm.com \
|
||||
--auth-key-file ../coverity.key \
|
||||
--strip-path ${GITHUB_WORKSPACE}
|
||||
|
||||
build-ubuntu-arm64:
|
||||
name: Ubuntu arm64
|
||||
runs-on: ubuntu-24.04-arm
|
||||
steps:
|
||||
- name: Git checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- name: Update apt packages
|
||||
run: sudo apt-get update
|
||||
|
||||
- name: Install ImageMagick
|
||||
run: sudo apt-get install imagemagick
|
||||
|
||||
- name: Build release
|
||||
run: |
|
||||
export CXX=clang++
|
||||
mkdir build_rel
|
||||
cd build_rel
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_NEON=ON -DASTCENC_ISA_SVE_128=ON -DASTCENC_ISA_SVE_256=ON -DASTCENC_PACKAGE=arm64 ..
|
||||
make install package -j4
|
||||
|
||||
- name: Upload binaries
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: astcenc-linux-arm64
|
||||
path: |
|
||||
build_rel/*.zip
|
||||
build_rel/*.zip.sha256
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Get Python modules
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install numpy Pillow
|
||||
|
||||
- name: Run system tests
|
||||
# Disable SVE testing for now
|
||||
run: |
|
||||
python ./Test/astc_test_functional.py --encoder neon
|
||||
python ./Test/astc_test_image.py --encoder neon --test-set Small
|
||||
|
||||
build-ubuntu-x64:
|
||||
name: Ubuntu x64
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Git checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- name: Build release
|
||||
run: |
|
||||
export CXX=clang++
|
||||
mkdir build_rel
|
||||
cd build_rel
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON -DASTCENC_PACKAGE=x64 ..
|
||||
make install package -j4
|
||||
|
||||
- name: Upload binaries
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: astcenc-linux-x86_64
|
||||
path: |
|
||||
build_rel/*.zip
|
||||
build_rel/*.zip.sha256
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Get Python modules
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install numpy Pillow
|
||||
|
||||
- name: Run system tests
|
||||
run: |
|
||||
python ./Test/astc_test_functional.py
|
||||
python ./Test/astc_test_image.py --encoder all-x86 --test-set Small
|
||||
|
||||
build-macos-universal:
|
||||
name: macOS universal
|
||||
runs-on: macos-14
|
||||
steps:
|
||||
- name: Git checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- name: Build release
|
||||
run: |
|
||||
mkdir build_rel
|
||||
cd build_rel
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_PACKAGE=universal ..
|
||||
make install package -j4
|
||||
|
||||
- name: Upload binaries
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: astcenc-macos-universal
|
||||
path: |
|
||||
build_rel/*.zip
|
||||
build_rel/*.zip.sha256
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Get Python modules
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install numpy Pillow
|
||||
|
||||
- name: Run system tests
|
||||
run: |
|
||||
python ./Test/astc_test_image.py --test-set Small --encoder universal
|
||||
|
||||
build-windows-multi:
|
||||
name: Windows multi
|
||||
runs-on: windows-2022
|
||||
steps:
|
||||
- name: Git checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: 'true'
|
||||
|
||||
- name: Setup Visual Studio x86_64
|
||||
uses: ilammy/msvc-dev-cmd@v1
|
||||
|
||||
- name: Build release x64
|
||||
run: |
|
||||
mkdir build_rel
|
||||
cd build_rel
|
||||
cmake -G "Visual Studio 17 2022" -T ClangCL -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON -DASTCENC_PACKAGE=x64 ..
|
||||
msbuild astcencoder.sln -property:Configuration=Release
|
||||
msbuild PACKAGE.vcxproj -property:Configuration=Release
|
||||
msbuild INSTALL.vcxproj -property:Configuration=Release
|
||||
shell: cmd
|
||||
|
||||
- name: Setup Visual Studio arm64
|
||||
uses: ilammy/msvc-dev-cmd@v1
|
||||
with:
|
||||
arch: x86_arm64
|
||||
|
||||
- name: Build release arm64
|
||||
run: |
|
||||
mkdir build_rel_arm64
|
||||
cd build_rel_arm64
|
||||
cmake -G "Visual Studio 17 2022" -A ARM64 -T ClangCL -DCMAKE_INSTALL_PREFIX=../ -DASTCENC_ISA_NEON=ON -DASTCENC_PACKAGE=arm64 ..
|
||||
msbuild astcencoder.sln -property:Configuration=Release
|
||||
msbuild PACKAGE.vcxproj -property:Configuration=Release
|
||||
msbuild INSTALL.vcxproj -property:Configuration=Release
|
||||
shell: cmd
|
||||
|
||||
- name: Upload binaries
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: astcenc-windows-multi-cl
|
||||
path: |
|
||||
build_rel/*.zip
|
||||
build_rel/*.zip.sha256
|
||||
build_rel_arm64/*.zip
|
||||
build_rel_arm64/*.zip.sha256
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.10'
|
||||
|
||||
- name: Get Python modules
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install numpy Pillow
|
||||
shell: cmd
|
||||
|
||||
- name: Run system tests
|
||||
run: |
|
||||
python ./Test/astc_test_image.py --test-set Small
|
||||
shell: cmd
|
||||
|
||||
sign-binaries:
|
||||
if: github.repository_owner == 'Arm-software'
|
||||
name: Sign Windows and macOS
|
||||
runs-on: [self-hosted-ubuntu-latest-x64]
|
||||
needs: [build-macos-universal, build-windows-multi]
|
||||
steps:
|
||||
- name: Clean workspace
|
||||
uses: AutoModality/action-clean@v1
|
||||
|
||||
- name: Checkout signing code
|
||||
env:
|
||||
SIGNING_REPO_URL: ${{ secrets.SIGNING_REPO_URL }}
|
||||
run: |
|
||||
git clone --depth 1 ${SIGNING_REPO_URL}
|
||||
|
||||
- name: Install code sign v2 client
|
||||
env:
|
||||
ARTIFACTORY_USER: ${{ secrets.ARTIFACTORY_USER }}
|
||||
ARTIFACTORY_APIKEY: ${{ secrets.ARTIFACTORY_APIKEY }}
|
||||
ARTIFACTORY_FQDN: ${{ secrets.ARTIFACTORY_FQDN }}
|
||||
run: |
|
||||
python3.11 -m venv cs
|
||||
. ./cs/bin/activate
|
||||
pip install -i https://${ARTIFACTORY_USER}:${ARTIFACTORY_APIKEY}@${ARTIFACTORY_FQDN}/artifactory/api/pypi/dsgcore.pypi/simple code-signer-client
|
||||
|
||||
- name: Download macOS binaries
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: astcenc-macos-universal
|
||||
path: mac
|
||||
|
||||
- name: Download Windows binaries
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: astcenc-windows-multi-cl
|
||||
path: windows
|
||||
|
||||
- name: Sign macOS binaries
|
||||
env:
|
||||
CODESIGNER_USER: ${{ secrets.CODESIGNER_USER }}
|
||||
run: |
|
||||
. ./cs/bin/activate
|
||||
cd mac
|
||||
python3 ${GITHUB_WORKSPACE}/signing/macos-client-wrapper.py ${CODESIGNER_USER} *.zip
|
||||
|
||||
- name: Sign Windows binaries
|
||||
env:
|
||||
ARTIFACTORY_APIKEY: ${{ secrets.ARTIFACTORY_APIKEY }}
|
||||
run: |
|
||||
. ./cs/bin/activate
|
||||
cd windows
|
||||
for FILENAME in */*; do mv ${FILENAME} .; done
|
||||
for ZIPFILE in *.zip; do python3 ../signing/windows-client-wrapper.py -b ${GITHUB_RUN_NUMBER} -t ${ARTIFACTORY_APIKEY} ${ZIPFILE}; done
|
||||
|
||||
- name: Upload signed binaries
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: signed-binaries
|
||||
path: |
|
||||
windows/*
|
||||
mac/*
|
||||
|
||||
- name: Tidy intermediate artifacts
|
||||
uses: geekyeggo/delete-artifact@v5
|
||||
with:
|
||||
name: |
|
||||
astcenc-windows-multi-cl
|
||||
astcenc-macos-universal
|
||||
|
||||
prepare-release:
|
||||
if: ${{ (startsWith(github.event.ref, 'refs/tags/')) && (github.repository_owner == 'Arm-software') }}
|
||||
name: Prepare release
|
||||
runs-on: ubuntu-22.04
|
||||
needs: [sign-binaries, build-ubuntu-x64]
|
||||
steps:
|
||||
- name: Git checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Download signed binaries
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: signed-binaries
|
||||
path: prepare-release
|
||||
|
||||
- name: Download Linux x86_64 binaries
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: astcenc-linux-x86_64
|
||||
path: prepare-release
|
||||
|
||||
- name: Download Linux arm64 binaries
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: astcenc-linux-arm64
|
||||
path: prepare-release
|
||||
|
||||
- name: Flatten file structure
|
||||
run: |
|
||||
cd prepare-release
|
||||
for FILENAME in */*; do mv ${FILENAME} .; done
|
||||
rmdir windows
|
||||
rmdir mac
|
||||
|
||||
- name: Create checksum file
|
||||
run: |
|
||||
cd prepare-release
|
||||
cat *.sha256 > release-sha256.txt
|
||||
rm *.sha256
|
||||
|
||||
- name: Create release body
|
||||
run: |
|
||||
export STATUS_DATE=$(date "+%B %Y")
|
||||
GITHUB_REF=${{ github.ref }} ; export RELEASE_VERSION=${GITHUB_REF##*/}
|
||||
export SHA_CHECKSUMS=$(cat prepare-release/release-sha256.txt)
|
||||
envsubst < .github/workflows/release_body_template.md > prepare-release/release_body.md
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: comnoco/create-release-action@v2
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ github.ref }}
|
||||
release_name: ${{ github.ref }}
|
||||
body_path: prepare-release/release_body.md
|
||||
draft: true
|
||||
|
||||
- name: Attach artifacts
|
||||
uses: AButler/upload-release-assets@v3.0
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
release-id: ${{ steps.create_release.outputs.id }}
|
||||
files: "prepare-release/astcenc-*-*-*.zip;prepare-release/release-sha256.txt"
|
||||
@@ -0,0 +1,13 @@
|
||||
**Status:** ${STATUS_DATE}
|
||||
|
||||
The ${RELEASE_VERSION} release is a minor/major maintenance release.
|
||||
|
||||
* **General:**
|
||||
* **Bug fix:** Text here
|
||||
* **Feature:** Text here
|
||||
|
||||
## Binary release sha256 checksums
|
||||
|
||||
```
|
||||
${SHA_CHECKSUMS}
|
||||
```
|
||||
@@ -0,0 +1,47 @@
|
||||
# Editor and engineering scratch files
|
||||
.cache
|
||||
.vs
|
||||
.vscode
|
||||
.DS_Store
|
||||
*.log
|
||||
*.diff
|
||||
*.user
|
||||
*.o
|
||||
*.a
|
||||
__pycache__
|
||||
Scratch
|
||||
Proto
|
||||
|
||||
# Precompiled reference binaries for comparison tests
|
||||
bin
|
||||
lib
|
||||
Binaries
|
||||
|
||||
# Build artifacts
|
||||
astcenc
|
||||
build*
|
||||
|
||||
# General build artifacts
|
||||
Test/DocOut
|
||||
|
||||
# Test images we download from other sources
|
||||
Test/Images/Kodak*/**/*.png
|
||||
Test/Images/Scratch*
|
||||
|
||||
# Test output
|
||||
TestOutput
|
||||
/*.xlsx
|
||||
/*.jpg
|
||||
/*.json
|
||||
/*.log
|
||||
/*.txt
|
||||
/*.hdr
|
||||
/*.png
|
||||
/*.exr
|
||||
/*.astc
|
||||
astc_reference-main*
|
||||
Docs/Profiling.md
|
||||
Source/astcenccli_version.h
|
||||
|
||||
# Do not ignore workflows
|
||||
!.github/workflows/
|
||||
@@ -0,0 +1,12 @@
|
||||
; DO NOT EDIT (unless you know what you are doing)
|
||||
;
|
||||
; This subdirectory is a git "subrepo", and this file is maintained by the
|
||||
; git-subrepo command. See https://github.com/git-commands/git-subrepo#readme
|
||||
;
|
||||
[subrepo]
|
||||
remote = https://github.com/ARM-software/astc-encoder.git
|
||||
branch = 5.3.0
|
||||
commit = 30aabb3f42406df45a910d8496f9bee17eeba9bb
|
||||
parent = f9c73388a58de9b83f260f11008b043d8f7c0954
|
||||
method = merge
|
||||
cmdver = 0.4.9
|
||||
@@ -0,0 +1,532 @@
|
||||
[MASTER]
|
||||
|
||||
# Control the amount of potential inferred values when inferring a single
|
||||
# object. This can help the performance when dealing with large functions or
|
||||
# complex, nested conditions.
|
||||
limit-inference-results=100
|
||||
|
||||
# List of plugins (as comma separated values of python module names) to load,
|
||||
# usually to register additional checkers.
|
||||
load-plugins=pylint.extensions.docparams
|
||||
|
||||
# Pickle collected data for later comparisons.
|
||||
persistent=yes
|
||||
|
||||
# When enabled, pylint would attempt to guess common misconfiguration and emit
|
||||
# user-friendly hints instead of false-positive error messages.
|
||||
suggestion-mode=yes
|
||||
|
||||
# Allow loading of arbitrary C extensions. Extensions are imported into the
|
||||
# active Python interpreter and may run arbitrary code.
|
||||
unsafe-load-any-extension=no
|
||||
|
||||
# Ignore specific directories we don't author ourselves
|
||||
ignore=Test/DocSource
|
||||
|
||||
[MESSAGES CONTROL]
|
||||
|
||||
# Only show warnings with the listed confidence levels. Leave empty to show
|
||||
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
|
||||
confidence=
|
||||
|
||||
# Disable the message, report, category or checker with the given id(s). You
|
||||
# can either give multiple identifiers separated by comma (,) or put this
|
||||
# option multiple times (only on the command line, not in the configuration
|
||||
# file where it should appear only once). You can also use "--disable=all" to
|
||||
# disable everything first and then reenable specific checks. For example, if
|
||||
# you want to run only the similarities checker, you can use "--disable=all
|
||||
# --enable=similarities". If you want to run only the classes checker, but have
|
||||
# no Warning level messages displayed, use "--disable=all --enable=classes
|
||||
# --disable=W".
|
||||
disable=print-statement,
|
||||
parameter-unpacking,
|
||||
unpacking-in-except,
|
||||
old-raise-syntax,
|
||||
backtick,
|
||||
long-suffix,
|
||||
old-ne-operator,
|
||||
old-octal-literal,
|
||||
import-star-module-level,
|
||||
non-ascii-bytes-literal,
|
||||
raw-checker-failed,
|
||||
bad-inline-option,
|
||||
locally-disabled,
|
||||
file-ignored,
|
||||
suppressed-message,
|
||||
useless-suppression,
|
||||
deprecated-pragma,
|
||||
use-symbolic-message-instead,
|
||||
apply-builtin,
|
||||
basestring-builtin,
|
||||
buffer-builtin,
|
||||
cmp-builtin,
|
||||
coerce-builtin,
|
||||
execfile-builtin,
|
||||
file-builtin,
|
||||
long-builtin,
|
||||
raw_input-builtin,
|
||||
reduce-builtin,
|
||||
standarderror-builtin,
|
||||
unicode-builtin,
|
||||
xrange-builtin,
|
||||
coerce-method,
|
||||
delslice-method,
|
||||
getslice-method,
|
||||
setslice-method,
|
||||
no-absolute-import,
|
||||
old-division,
|
||||
dict-iter-method,
|
||||
dict-view-method,
|
||||
next-method-called,
|
||||
metaclass-assignment,
|
||||
indexing-exception,
|
||||
raising-string,
|
||||
reload-builtin,
|
||||
oct-method,
|
||||
hex-method,
|
||||
nonzero-method,
|
||||
cmp-method,
|
||||
input-builtin,
|
||||
round-builtin,
|
||||
intern-builtin,
|
||||
unichr-builtin,
|
||||
map-builtin-not-iterating,
|
||||
zip-builtin-not-iterating,
|
||||
range-builtin-not-iterating,
|
||||
filter-builtin-not-iterating,
|
||||
using-cmp-argument,
|
||||
eq-without-hash,
|
||||
div-method,
|
||||
idiv-method,
|
||||
rdiv-method,
|
||||
exception-message-attribute,
|
||||
invalid-str-codec,
|
||||
sys-max-int,
|
||||
bad-python3-import,
|
||||
deprecated-string-function,
|
||||
deprecated-str-translate-call,
|
||||
deprecated-itertools-function,
|
||||
deprecated-types-field,
|
||||
next-method-defined,
|
||||
dict-items-not-iterating,
|
||||
dict-keys-not-iterating,
|
||||
dict-values-not-iterating,
|
||||
deprecated-operator-function,
|
||||
deprecated-urllib-function,
|
||||
xreadlines-attribute,
|
||||
deprecated-sys-function,
|
||||
exception-escape,
|
||||
comprehension-escape
|
||||
|
||||
# Enable the message, report, category or checker with the given id(s). You can
|
||||
# either give multiple identifiers separated by comma (,) or put this option
|
||||
# multiple times (only on the command line, not in the configuration file where
|
||||
# it should appear only once). See also the "--disable" option for examples.
|
||||
enable=c-extension-no-member
|
||||
|
||||
|
||||
[REPORTS]
|
||||
|
||||
# Python expression which should return a score less than or equal to 10. You
|
||||
# have access to the variables 'error', 'warning', 'refactor', and 'convention'
|
||||
# which contain the number of messages in each category, as well as 'statement'
|
||||
# which is the total number of statements analyzed. This score is used by the
|
||||
# global evaluation report (RP0004).
|
||||
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
|
||||
|
||||
# Set the output format. Available formats are text, parseable, colorized, json
|
||||
# and msvs (visual studio). You can also give a reporter class, e.g.
|
||||
# mypackage.mymodule.MyReporterClass.
|
||||
output-format=text
|
||||
|
||||
# Tells whether to display a full report or only the messages.
|
||||
reports=no
|
||||
|
||||
# Activate the evaluation score.
|
||||
score=yes
|
||||
|
||||
|
||||
[REFACTORING]
|
||||
|
||||
# Maximum number of nested blocks for function / method body
|
||||
max-nested-blocks=5
|
||||
|
||||
# Complete names of functions that never return.
|
||||
never-returning-functions=sys.exit
|
||||
|
||||
|
||||
[BASIC]
|
||||
|
||||
# Naming style matching correct argument names.
|
||||
argument-naming-style=camelCase
|
||||
|
||||
# Regular expression matching correct argument names. Overrides argument-
|
||||
# naming-style.
|
||||
#argument-rgx=
|
||||
|
||||
# Naming style matching correct attribute names.
|
||||
attr-naming-style=camelCase
|
||||
|
||||
# Regular expression matching correct attribute names. Overrides attr-naming-
|
||||
# style.
|
||||
#attr-rgx=
|
||||
|
||||
# Bad variable names which should always be refused, separated by a comma.
|
||||
bad-names=foo,
|
||||
bar,
|
||||
baz,
|
||||
toto,
|
||||
tutu,
|
||||
tata
|
||||
|
||||
# Naming style matching correct class attribute names.
|
||||
class-attribute-naming-style=any
|
||||
|
||||
# Regular expression matching correct class attribute names. Overrides class-
|
||||
# attribute-naming-style.
|
||||
#class-attribute-rgx=
|
||||
|
||||
# Naming style matching correct class names.
|
||||
class-naming-style=PascalCase
|
||||
|
||||
# Regular expression matching correct class names. Overrides class-naming-
|
||||
# style.
|
||||
#class-rgx=
|
||||
|
||||
# Naming style matching correct constant names.
|
||||
const-naming-style=UPPER_CASE
|
||||
|
||||
# Regular expression matching correct constant names. Overrides const-naming-
|
||||
# style.
|
||||
#const-rgx=
|
||||
|
||||
# Minimum line length for functions/classes that require docstrings, shorter
|
||||
# ones are exempt.
|
||||
docstring-min-length=-1
|
||||
|
||||
# Naming style matching correct function names.
|
||||
function-naming-style=snake_case
|
||||
|
||||
# Regular expression matching correct function names. Overrides function-
|
||||
# naming-style.
|
||||
#function-rgx=
|
||||
|
||||
# Good variable names which should always be accepted, separated by a comma.
|
||||
good-names=i,j,k,x,y,z,w,r,g,b,a,ex,Run,_
|
||||
|
||||
# Include a hint for the correct naming format with invalid-name.
|
||||
include-naming-hint=no
|
||||
|
||||
# Naming style matching correct inline iteration names.
|
||||
inlinevar-naming-style=any
|
||||
|
||||
# Regular expression matching correct inline iteration names. Overrides
|
||||
# inlinevar-naming-style.
|
||||
#inlinevar-rgx=
|
||||
|
||||
# Naming style matching correct method names.
|
||||
method-naming-style=snake_case
|
||||
|
||||
# Regular expression matching correct method names. Overrides method-naming-
|
||||
# style.
|
||||
#method-rgx=
|
||||
|
||||
# Naming style matching correct module names.
|
||||
module-naming-style=snake_case
|
||||
|
||||
# Regular expression matching correct module names. Overrides module-naming-
|
||||
# style.
|
||||
#module-rgx=
|
||||
|
||||
# Colon-delimited sets of names that determine each other's naming style when
|
||||
# the name regexes allow several styles.
|
||||
name-group=
|
||||
|
||||
# Regular expression which should only match function or class names that do
|
||||
# not require a docstring.
|
||||
no-docstring-rgx=^_
|
||||
|
||||
# List of decorators that produce properties, such as abc.abstractproperty. Add
|
||||
# to this list to register other decorators that produce valid properties.
|
||||
# These decorators are taken in consideration only for invalid-name.
|
||||
property-classes=abc.abstractproperty
|
||||
|
||||
# Naming style matching correct variable names.
|
||||
variable-naming-style=camelCase
|
||||
|
||||
[FORMAT]
|
||||
|
||||
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
|
||||
expected-line-ending-format=
|
||||
|
||||
# Regexp for a line that is allowed to be longer than the limit.
|
||||
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
|
||||
|
||||
# Number of spaces of indent required inside a hanging or continued line.
|
||||
indent-after-paren=4
|
||||
|
||||
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
|
||||
# tab).
|
||||
indent-string=' '
|
||||
|
||||
# Maximum number of characters on a single line.
|
||||
max-line-length=79
|
||||
|
||||
# Maximum number of lines in a module.
|
||||
max-module-lines=1000
|
||||
|
||||
# Allow the body of a class to be on the same line as the declaration if body
|
||||
# contains single statement.
|
||||
single-line-class-stmt=no
|
||||
|
||||
# Allow the body of an if to be on the same line as the test if there is no
|
||||
# else.
|
||||
single-line-if-stmt=no
|
||||
|
||||
|
||||
[LOGGING]
|
||||
|
||||
# Format style used to check logging format string. `old` means using %
|
||||
# formatting, `new` is for `{}` formatting, and `fstr` is for f-strings.
|
||||
logging-format-style=old
|
||||
|
||||
# Logging modules to check that the string format arguments are in logging
|
||||
# function parameter format.
|
||||
logging-modules=logging
|
||||
|
||||
|
||||
[MISCELLANEOUS]
|
||||
|
||||
# List of note tags to take in consideration, separated by a comma.
|
||||
notes=FIXME,XXX,TODO
|
||||
|
||||
|
||||
[SIMILARITIES]
|
||||
|
||||
# Ignore comments when computing similarities.
|
||||
ignore-comments=yes
|
||||
|
||||
# Ignore docstrings when computing similarities.
|
||||
ignore-docstrings=yes
|
||||
|
||||
# Ignore imports when computing similarities.
|
||||
ignore-imports=no
|
||||
|
||||
# Minimum lines number of a similarity.
|
||||
min-similarity-lines=4
|
||||
|
||||
|
||||
[SPELLING]
|
||||
|
||||
# Limits count of emitted suggestions for spelling mistakes.
|
||||
max-spelling-suggestions=4
|
||||
|
||||
# Spelling dictionary name. Available dictionaries: none. To make it work,
|
||||
# install the python-enchant package.
|
||||
spelling-dict=
|
||||
|
||||
# List of comma separated words that should not be checked.
|
||||
spelling-ignore-words=
|
||||
|
||||
# A path to a file that contains the private dictionary; one word per line.
|
||||
spelling-private-dict-file=
|
||||
|
||||
# Tells whether to store unknown words to the private dictionary (see the
|
||||
# --spelling-private-dict-file option) instead of raising a message.
|
||||
spelling-store-unknown-words=no
|
||||
|
||||
|
||||
[STRING]
|
||||
|
||||
# This flag controls whether the implicit-str-concat-in-sequence should
|
||||
# generate a warning on implicit string concatenation in sequences defined over
|
||||
# several lines.
|
||||
check-str-concat-over-line-jumps=no
|
||||
|
||||
|
||||
[TYPECHECK]
|
||||
|
||||
# List of decorators that produce context managers, such as
|
||||
# contextlib.contextmanager. Add to this list to register other decorators that
|
||||
# produce valid context managers.
|
||||
contextmanager-decorators=contextlib.contextmanager
|
||||
|
||||
# List of members which are set dynamically and missed by pylint inference
|
||||
# system, and so shouldn't trigger E1101 when accessed. Python regular
|
||||
# expressions are accepted.
|
||||
generated-members=
|
||||
|
||||
# Tells whether missing members accessed in mixin class should be ignored. A
|
||||
# mixin class is detected if its name ends with "mixin" (case insensitive).
|
||||
ignore-mixin-members=yes
|
||||
|
||||
# Tells whether to warn about missing members when the owner of the attribute
|
||||
# is inferred to be None.
|
||||
ignore-none=yes
|
||||
|
||||
# This flag controls whether pylint should warn about no-member and similar
|
||||
# checks whenever an opaque object is returned when inferring. The inference
|
||||
# can return multiple potential results while evaluating a Python object, but
|
||||
# some branches might not be evaluated, which results in partial inference. In
|
||||
# that case, it might be useful to still emit no-member and other checks for
|
||||
# the rest of the inferred objects.
|
||||
ignore-on-opaque-inference=yes
|
||||
|
||||
# List of class names for which member attributes should not be checked (useful
|
||||
# for classes with dynamically set attributes). This supports the use of
|
||||
# qualified names.
|
||||
ignored-classes=optparse.Values,thread._local,_thread._local
|
||||
|
||||
# List of module names for which member attributes should not be checked
|
||||
# (useful for modules/projects where namespaces are manipulated during runtime
|
||||
# and thus existing member attributes cannot be deduced by static analysis). It
|
||||
# supports qualified module names, as well as Unix pattern matching.
|
||||
ignored-modules=signal
|
||||
|
||||
# Show a hint with possible names when a member name was not found. The aspect
|
||||
# of finding the hint is based on edit distance.
|
||||
missing-member-hint=yes
|
||||
|
||||
# The minimum edit distance a name should have in order to be considered a
|
||||
# similar match for a missing member name.
|
||||
missing-member-hint-distance=1
|
||||
|
||||
# The total number of similar names that should be taken in consideration when
|
||||
# showing a hint for a missing member.
|
||||
missing-member-max-choices=1
|
||||
|
||||
# List of decorators that change the signature of a decorated function.
|
||||
signature-mutators=
|
||||
|
||||
|
||||
[VARIABLES]
|
||||
|
||||
# List of additional names supposed to be defined in builtins. Remember that
|
||||
# you should avoid defining new builtins when possible.
|
||||
additional-builtins=
|
||||
|
||||
# Tells whether unused global variables should be treated as a violation.
|
||||
allow-global-unused-variables=yes
|
||||
|
||||
# List of strings which can identify a callback function by name. A callback
|
||||
# name must start or end with one of those strings.
|
||||
callbacks=cb_,_cb
|
||||
|
||||
# A regular expression matching the name of dummy variables (i.e. expected to
|
||||
# not be used).
|
||||
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
|
||||
|
||||
# Argument names that match this expression will be ignored. Default to name
|
||||
# with leading underscore.
|
||||
ignored-argument-names=_.*|^ignored_|^unused_
|
||||
|
||||
# Tells whether we should check for unused import in __init__ files.
|
||||
init-import=no
|
||||
|
||||
# List of qualified module names which can have objects that can redefine
|
||||
# builtins.
|
||||
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
|
||||
|
||||
|
||||
[CLASSES]
|
||||
|
||||
# List of method names used to declare (i.e. assign) instance attributes.
|
||||
defining-attr-methods=__init__,
|
||||
__new__,
|
||||
setUp,
|
||||
__post_init__
|
||||
|
||||
# List of member names, which should be excluded from the protected access
|
||||
# warning.
|
||||
exclude-protected=_asdict,
|
||||
_fields,
|
||||
_replace,
|
||||
_source,
|
||||
_make
|
||||
|
||||
# List of valid names for the first argument in a class method.
|
||||
valid-classmethod-first-arg=cls
|
||||
|
||||
# List of valid names for the first argument in a metaclass class method.
|
||||
valid-metaclass-classmethod-first-arg=cls
|
||||
|
||||
|
||||
[DESIGN]
|
||||
|
||||
# Maximum number of arguments for function / method.
|
||||
max-args=7
|
||||
|
||||
# Maximum number of attributes for a class (see R0902).
|
||||
max-attributes=16
|
||||
|
||||
# Maximum number of boolean expressions in an if statement (see R0916).
|
||||
max-bool-expr=5
|
||||
|
||||
# Maximum number of branch for function / method body.
|
||||
max-branches=12
|
||||
|
||||
# Maximum number of locals for function / method body.
|
||||
max-locals=16
|
||||
|
||||
# Maximum number of parents for a class (see R0901).
|
||||
max-parents=7
|
||||
|
||||
# Maximum number of public methods for a class (see R0904).
|
||||
max-public-methods=20
|
||||
|
||||
# Maximum number of return / yield for function / method body.
|
||||
max-returns=6
|
||||
|
||||
# Maximum number of statements in function / method body.
|
||||
max-statements=50
|
||||
|
||||
# Minimum number of public methods for a class (see R0903).
|
||||
min-public-methods=0
|
||||
|
||||
|
||||
[IMPORTS]
|
||||
|
||||
# List of modules that can be imported at any level, not just the top level
|
||||
# one.
|
||||
allow-any-import-level=
|
||||
|
||||
# Allow wildcard imports from modules that define __all__.
|
||||
allow-wildcard-with-all=no
|
||||
|
||||
# Analyse import fallback blocks. This can be used to support both Python 2 and
|
||||
# 3 compatible code, which means that the block might have code that exists
|
||||
# only in one or another interpreter, leading to false positives when analysed.
|
||||
analyse-fallback-blocks=no
|
||||
|
||||
# Deprecated modules which should not be used, separated by a comma.
|
||||
deprecated-modules=optparse,tkinter.tix
|
||||
|
||||
# Create a graph of external dependencies in the given file (report RP0402 must
|
||||
# not be disabled).
|
||||
ext-import-graph=
|
||||
|
||||
# Create a graph of every (i.e. internal and external) dependencies in the
|
||||
# given file (report RP0402 must not be disabled).
|
||||
import-graph=
|
||||
|
||||
# Create a graph of internal dependencies in the given file (report RP0402 must
|
||||
# not be disabled).
|
||||
int-import-graph=
|
||||
|
||||
# Force import order to recognize a module as part of the standard
|
||||
# compatibility libraries.
|
||||
known-standard-library=
|
||||
|
||||
# Force import order to recognize a module as part of a third party library.
|
||||
known-third-party=enchant
|
||||
|
||||
# Couples of modules and preferred modules, separated by a comma.
|
||||
preferred-modules=
|
||||
|
||||
|
||||
[EXCEPTIONS]
|
||||
|
||||
# Exceptions that will emit a warning when being caught. Defaults to
|
||||
# "BaseException, Exception".
|
||||
overgeneral-exceptions=BaseException,
|
||||
Exception
|
||||
@@ -0,0 +1,315 @@
|
||||
# Building ASTC Encoder
|
||||
|
||||
This page provides instructions for building `astcenc` from the sources in
|
||||
this repository.
|
||||
|
||||
Builds must use CMake 3.15 or higher as the build system generator. The
|
||||
examples on this page show how to use it to generate build systems for NMake
|
||||
(Windows) and Make (Linux and macOS), but CMake supports other build system
|
||||
backends.
|
||||
|
||||
## Windows
|
||||
|
||||
Builds for Windows are tested with CMake 3.17, and Visual Studio 2019 or newer.
|
||||
|
||||
### Configuring the build
|
||||
|
||||
To use CMake you must first configure the build. Create a build directory in
|
||||
the root of the `astcenc` checkout, and then run `cmake` inside that directory
|
||||
to generate the build system.
|
||||
|
||||
```shell
|
||||
# Create a build directory
|
||||
mkdir build
|
||||
cd build
|
||||
|
||||
# Configure your build of choice, for example:
|
||||
|
||||
# x86-64 using a Visual Studio solution
|
||||
cmake -G "Visual Studio 16 2019" -T ClangCL -DCMAKE_INSTALL_PREFIX=..\ ^
|
||||
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
|
||||
|
||||
# x86-64 using NMake
|
||||
cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=..\ ^
|
||||
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
|
||||
```
|
||||
|
||||
A single CMake configure can build multiple binaries for a single target CPU
|
||||
architecture, for example building x64 for both SSE2 and AVX2. Each binary name
|
||||
will include the build variant as a postfix. It is possible to build any set of
|
||||
the supported SIMD variants by enabling only the ones you require.
|
||||
|
||||
Using the Visual Studio Clang-CL LLVM toolchain (`-T ClangCL`) is optional but
|
||||
produces significantly faster binaries than the default toolchain. The C++ LLVM
|
||||
toolchain component must be installed via the Visual Studio installer.
|
||||
|
||||
### Building
|
||||
|
||||
Once you have configured the build you can use NMake to compile the project
|
||||
from your build dir, and install to your target install directory.
|
||||
|
||||
```shell
|
||||
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
|
||||
cd build
|
||||
nmake install
|
||||
```
|
||||
|
||||
## macOS and Linux using Make
|
||||
|
||||
Builds for macOS and Linux are tested with CMake 3.17, and clang++ 9.0 or
|
||||
newer.
|
||||
|
||||
> Compiling using g++ is supported, but clang++ builds are faster by ~15%.
|
||||
|
||||
### Configuring the build
|
||||
|
||||
To use CMake you must first configure the build. Create a build directory
|
||||
in the root of the astcenc checkout, and then run `cmake` inside that directory
|
||||
to generate the build system.
|
||||
|
||||
```shell
|
||||
# Select your compiler (clang++ recommended, but g++ works)
|
||||
export CXX=clang++
|
||||
|
||||
# Create a build directory
|
||||
mkdir build
|
||||
cd build
|
||||
|
||||
# Configure your build of choice, for example:
|
||||
|
||||
# Arm aarch64
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
|
||||
-DASTCENC_ISA_NEON=ON ..
|
||||
|
||||
# x86-64
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ \
|
||||
-DASTCENC_ISA_AVX2=ON -DASTCENC_ISA_SSE41=ON -DASTCENC_ISA_SSE2=ON ..
|
||||
|
||||
# macOS universal binary build
|
||||
cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=../ ..
|
||||
```
|
||||
|
||||
A single CMake configure can build multiple binaries for a single target CPU
|
||||
architecture, for example building x64 for both SSE2 and AVX2. Each binary name
|
||||
will include the build variant as a postfix. It is possible to build any set of
|
||||
the supported SIMD variants by enabling only the ones you require.
|
||||
|
||||
For macOS, we additionally support the ability to build a universal binary.
|
||||
This build includes SSE4.1 (`x86_64`), AVX2 (`x86_64h`), and NEON (`arm64`)
|
||||
build slices in a single output binary. The OS will select the correct variant
|
||||
to run for the machine being used. This is the default build target for a macOS
|
||||
build, but single-target binaries can still be built by setting
|
||||
`-DASTCENC_UNIVERSAL_BINARY=OFF` and then manually selecting the specific ISA
|
||||
variants that are required.
|
||||
|
||||
### Building
|
||||
|
||||
Once you have configured the build you can use Make to compile the project from
|
||||
your build dir, and install to your target install directory.
|
||||
|
||||
```shell
|
||||
# Run a build and install build outputs in `${CMAKE_INSTALL_PREFIX}/bin/`
|
||||
# for executable binaries and `${CMAKE_INSTALL_PREFIX}/lib/` for libraries
|
||||
cd build
|
||||
make install -j16
|
||||
```
|
||||
|
||||
## macOS using Xcode
|
||||
|
||||
Builds for macOS are tested with CMake 3.17, and Xcode 14.0 or
|
||||
newer.
|
||||
|
||||
### Configuring the build
|
||||
|
||||
To use CMake you must first configure the build. Create a build directory
|
||||
in the root of the astcenc checkout, and then run `cmake` inside that directory
|
||||
to generate the build system.
|
||||
|
||||
```shell
|
||||
# Create a build directory
|
||||
mkdir build
|
||||
cd build
|
||||
|
||||
# Configure a universal build
|
||||
cmake -G Xcode -DCMAKE_INSTALL_PREFIX=../ ..
|
||||
```
|
||||
|
||||
### Building
|
||||
|
||||
Once you have configured the build you can use CMake to compile the project
|
||||
from your build dir, and install to your target install directory.
|
||||
|
||||
```shell
|
||||
cmake --build . --config Release
|
||||
|
||||
# Optionally install the binaries to the installation directory
|
||||
cmake --install . --config Release
|
||||
```
|
||||
|
||||
## Advanced build options
|
||||
|
||||
For codec developers and power users there are a number of useful features in
|
||||
the build system.
|
||||
|
||||
### Build Types
|
||||
|
||||
We support and test the following `CMAKE_BUILD_TYPE` options.
|
||||
|
||||
| Value | Description |
|
||||
| ---------------- | -------------------------------------------------------- |
|
||||
| Release | Optimized release build |
|
||||
| RelWithDebInfo | Optimized release build with debug info |
|
||||
| Debug | Unoptimized debug build with debug info |
|
||||
|
||||
Note that optimized release builds are compiled with link-time optimization,
|
||||
which can make profiling more challenging ...
|
||||
|
||||
### Shared Libraries
|
||||
|
||||
We support building the core library as a shared object by setting the CMake
|
||||
option `-DASTCENC_SHAREDLIB=ON` at configure time. For macOS build targets the
|
||||
shared library supports the same universal build configuration as the command
|
||||
line utility.
|
||||
|
||||
Note that the command line tool is always statically linked; the shared objects
|
||||
are an extra build output that is not currently used by the command line tool.
|
||||
|
||||
### Constrained block size builds
|
||||
|
||||
All normal builds will support all ASTC block sizes, including the worst case
|
||||
6x6x6 3D block size (216 texels per block). Compressor memory footprint and
|
||||
performance can be improved by limiting the block sizes supported in the build
|
||||
by adding `-DASTCENC_BLOCK_MAX_TEXELS=<texel_count>` to the CMake command line
|
||||
when configuring. Legal block sizes that are unavailable in a restricted build
|
||||
will return the error `ASTCENC_ERR_NOT_IMPLEMENTED` during context creation.
|
||||
|
||||
### Non-invariant builds
|
||||
|
||||
All normal builds are designed to be invariant, so any build from the same git
|
||||
revision will produce bit-identical results for all compilers and CPU
|
||||
architectures. To achieve this we sacrifice some performance, so if this is
|
||||
not required you can specify `-DASTCENC_INVARIANCE=OFF` to enable additional
|
||||
optimizations. This has most benefit for AVX2 builds where we are able to
|
||||
enable use of the FMA instruction set extensions.
|
||||
|
||||
### No intrinsics builds
|
||||
|
||||
All normal builds will use SIMD accelerated code paths using intrinsics, as all
|
||||
supported target architectures (x86 and arm64) guarantee SIMD availability. For
|
||||
development purposes it is possible to build an intrinsic-free build which uses
|
||||
no explicit SIMD acceleration (the compiler may still auto-vectorize).
|
||||
|
||||
To enable this binary variant add `-DASTCENC_ISA_NONE=ON` to the CMake command
|
||||
line when configuring. It is NOT recommended to use this for production; it is
|
||||
significantly slower than the vectorized SIMD builds.
|
||||
|
||||
### No x86 gather instruction builds
|
||||
|
||||
On many x86 microarchitectures the native AVX gather instructions are slower
|
||||
than simply performing manual scalar loads and combining the results. Gathers
|
||||
are enabled by default, but can be disabled by setting the CMake option
|
||||
`-DASTCENC_X86_GATHERS=OFF` on the command line when configuring.
|
||||
|
||||
Note that we have seen mixed results when compiling the scalar fallback path,
|
||||
so we would recommend testing which option works best for the compiler and
|
||||
microarchitecture pairing that you are targeting.
|
||||
|
||||
### Test builds
|
||||
|
||||
We support building unit tests. These use the `googletest` framework, which is
|
||||
pulled in through a git submodule. On first use, you must fetch the submodule
|
||||
dependency:
|
||||
|
||||
```shell
|
||||
git submodule init
|
||||
git submodule update
|
||||
```
|
||||
|
||||
To build unit tests add `-DASTCENC_UNITTEST=ON` to the CMake command line when
|
||||
configuring.
|
||||
|
||||
To run unit tests use the CMake `ctest` utility from your build directory after
|
||||
you have built the tests.
|
||||
|
||||
```shell
|
||||
cd build
|
||||
ctest --verbose
|
||||
```
|
||||
|
||||
### Sanitizer builds
|
||||
|
||||
We support building with sanitizers on Linux and macOS when using Clang.
|
||||
|
||||
To build binaries with ASAN checking enabled add `-DASTCENC_ASAN=ON` to the
|
||||
CMake command line when configuring.
|
||||
|
||||
To build binaries with UBSAN checking enabled add `-DASTCENC_UBSAN=ON` to the
|
||||
CMake command line when configuring.
|
||||
|
||||
### Android builds
|
||||
|
||||
Builds of the command line utility for Android are not officially supported, but can be a useful
|
||||
development build for testing on e.g. different Arm CPU microarchitectures.
|
||||
|
||||
The build script below shows one possible route to building the command line tool for Android. Once
|
||||
built the application can be pushed to e.g. `/data/local/tmp` and executed from an Android shell
|
||||
terminal over `adb`.
|
||||
|
||||
```shell
|
||||
ANDROID_ABI=arm64-v8a
|
||||
ANDROID_NDK=/work/tools/android/ndk/22.1.7171670
|
||||
|
||||
BUILD_TYPE=RelWithDebInfo
|
||||
|
||||
BUILD_DIR=build
|
||||
|
||||
mkdir -p ${BUILD_DIR}
|
||||
cd ${BUILD_DIR}
|
||||
|
||||
cmake \
|
||||
-DCMAKE_INSTALL_PREFIX=./ \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=${ANDROID_ABI} \
|
||||
-DANDROID_ARM_NEON=ON \
|
||||
-DANDROID_PLATFORM=android-21 \
|
||||
-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=clang \
|
||||
-DANDROID_TOOLCHAIN=clang \
|
||||
-DANDROID_STL=c++_static \
|
||||
-DARCH=aarch64 \
|
||||
-DASTCENC_ISA_NEON=ON \
|
||||
..
|
||||
|
||||
make -j16
|
||||
```
|
||||
|
||||
## Packaging a release bundle
|
||||
|
||||
We support building a release bundle of all enabled binary configurations in
|
||||
the current CMake configuration using the `package` build target.
|
||||
|
||||
Configure CMake with:
|
||||
|
||||
* `-DASTCENC_PACKAGE=<arch>` to set the package architecture/variant name used
|
||||
to name the package archive (not set by default).
|
||||
|
||||
```shell
|
||||
# Run a build and package build outputs in `./astcenc-<ver>-<os>-<arch>.<fmt>`
|
||||
cd build
|
||||
make package -j16
|
||||
```
|
||||
|
||||
Windows packages will use the `.zip` format, other packages will use the
|
||||
`.tar.gz` format.
|
||||
|
||||
## Integrating as a library into another project
|
||||
|
||||
The core codec of `astcenc` is built as a library, and so can be easily
|
||||
integrated into other projects using CMake. An example of the CMake integration
|
||||
and the codec API usage can be found in the `./Utils/Example` directory in the
|
||||
repository. See the [Example Readme](../Utils/Example/README.md) for more
|
||||
details.
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2019-2024, Arm Limited and contributors. All rights reserved._
|
||||
@@ -0,0 +1,328 @@
|
||||
# 2.x series change log
|
||||
|
||||
This page summarizes the major functional and performance changes in each
|
||||
release of the 2.x series.
|
||||
|
||||
All performance data on this page is measured on an Intel Core i5-9600K
|
||||
clocked at 4.2 GHz, running astcenc using 6 threads.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.5
|
||||
|
||||
**Status:** Released, March 2021
|
||||
|
||||
The 2.5 release is the last major release in the 2.x series. After this release
|
||||
a `2.x` branch will provide stable long-term support, and the `main` branch
|
||||
will switch to focusing on more radical changes for the 3.x series.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
stable across versions, and this release is not compatible with earlier 2.x
|
||||
releases. Please update and rebuild your client-side code using the updated
|
||||
`astcenc.h` header.
|
||||
|
||||
**General:**
|
||||
* **Feature:** The `ISA_INVARIANCE` build option is no longer supported, as
|
||||
there is no longer any performance benefit from the variant paths. All
|
||||
builds are now using the equivalent of the `ISA_INVARIANCE=ON` setting, and
|
||||
all builds (except Armv7) are now believed to be invariant across operating
|
||||
systems, compilers, CPU architectures, and SIMD instruction sets.
|
||||
* **Feature:** Armv8 32-bit builds with NEON are now supported, with
|
||||
out-of-the-box support for Arm Linux soft-float and hard-float ABIs. There
|
||||
are no pre-built binaries for these targets; support is included for
|
||||
library users targeting older 32-bit Android and iOS devices.
|
||||
* **Feature:** A compressor mode for encoding HDR textures that have been
|
||||
encoded into LDR RGBM wrapper format is now supported. Note that this
|
||||
encoding has some strong recommendations for how the RGBM encoding is
|
||||
implemented to avoid block artifacts in the compressed image.
|
||||
* **Core API:**
|
||||
* **API Change:** The core API has been changed to be a pure C API, making it
|
||||
easier to wrap the codec in a stable shared library ABI. Some entry points
|
||||
that used to accept references now expect pointers.
|
||||
* **API Change:** The decompression functionality in the core API has been
|
||||
changed to allow use of multiple threads. The design pattern matches the
|
||||
compression functionality, requiring the caller to create the threads,
|
||||
synchronize them between images, and to call the new
|
||||
`astcenc_decompress_reset()` function between images.
|
||||
* **API Feature:** Defines to support exporting public API entry point
|
||||
symbols from a shared object are provided, but not exposed off-the-shelf by
|
||||
the CMake provided by the project.
|
||||
* **API Feature:** New `astcenc_get_block_info()` function added to the core
|
||||
API to allow users to perform high level analysis of compressed data. This
|
||||
API is not implemented in decompressor-only builds.
|
||||
* **API Feature:** Codec configuration structure has been extended to expose
|
||||
the new RGBM compression mode. See the API header for details.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.4
|
||||
|
||||
**Status:** Released, February 2021
|
||||
|
||||
The 2.4 release is the fifth release in the 2.x series. It is primarily a bug
|
||||
fix release for HDR image handling, which impacts all earlier 2.x series
|
||||
releases.
|
||||
|
||||
**General:**
|
||||
* **Feature:** When using the `-a` option, or the equivalent config option
|
||||
for the API, any 2D blocks that are entirely zero alpha after the alpha
|
||||
filter radius is taken into account are replaced by transparent black
|
||||
constant color blocks. This is an RDO-like technique to improve compression
|
||||
ratios of any additional application packaging compression that is applied.
|
||||
**Command Line:**
|
||||
* **Bug fix:** The command line wrapper now correctly loads HDR images that
|
||||
have a non-square aspect ratio.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.3
|
||||
|
||||
**Status:** Released, January 2021
|
||||
|
||||
The 2.3 release is the fourth release in the 2.x series. It includes a number
|
||||
of performance improvements and new features.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
stable across versions, and this release is not compatible with 2.2. Please
|
||||
recompile your client-side code using the updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** Decompressor-only builds of the codec are supported again.
|
||||
While this is primarily a feature for library users who want to shrink
|
||||
binary size, a variant command line tool `astcdec` can be built by
|
||||
specifying `DECOMPRESSOR=ON` on the CMake configure command line.
|
||||
* **Feature:** Diagnostic builds of the codec can now be built. These builds
|
||||
generate a JSON file containing a trace of the compressor execution.
|
||||
Diagnostic builds are only suitable for codec development; they are slower
|
||||
and JSON generation cannot be disabled. Build by setting `DIAGNOSTICS=ON`
|
||||
on the CMake configure command line.
|
||||
* **Feature:** Code compatibility improved with older versions of GCC,
|
||||
earliest compiler now tested is GCC 7.5 (was GCC 9.3).
|
||||
* **Feature:** Code compatibility improved with newer versions of LLVM,
|
||||
latest compiler now tested is Clang 12.0 (was Clang 9.0).
|
||||
* **Feature:** Code compatibility improved with the Visual Studio 2019 LLVM
|
||||
toolset (`clang-cl`). Using the LLVM toolset gives 25% performance
|
||||
improvements and is recommended.
|
||||
* **Command Line:**
|
||||
* **Feature:** Quality level now accepts either a preset (`-fast`, etc) or a
|
||||
float value between 0 and 100, allowing more control over the compression
|
||||
quality vs performance trade-off. The presets are not evenly spaced in the
|
||||
float range; they have been spaced to give the best distribution of points
|
||||
between the fast and thorough presets.
|
||||
* `-fastest`: 0.0
|
||||
* `-fast`: 10.0
|
||||
* `-medium`: 60.0
|
||||
* `-thorough`: 98.0
|
||||
* `-exhaustive`: 100.0
|
||||
* **Core API:**
|
||||
* **API Change:** Quality level preset enum replaced with a float value
|
||||
between 0 (`-fastest`) and 100 (`-exhaustive`). See above for more info.
|
||||
|
||||
### Performance
|
||||
|
||||
This release includes a number of optimizations to improve performance.
|
||||
|
||||
* New compressor algorithm for handling encoding candidates and refinement.
|
||||
* Vectorized implementation of `compute_error_of_weight_set()`.
|
||||
* Unrolled implementation of `encode_ise()`.
|
||||
* Many other small improvements!
|
||||
|
||||
The most significant change is the change to the compressor path, which now
|
||||
uses an adaptive approach to candidate trials and block refinement.
|
||||
|
||||
In earlier releases the quality level will determine the number of encoding
|
||||
candidates and the number of iterative refinement passes that are used for each
|
||||
major encoding trial. This is a fixed behavior; it will always try the full N
|
||||
candidates and M refinement iterations specified by the quality level for each
|
||||
encoding trial.
|
||||
|
||||
The new approach implements two optimizations for this:
|
||||
|
||||
* Compression will complete when a block candidate hits the specified target
|
||||
quality, after its M refinement iterations have been applied. Later block
|
||||
candidates are simply abandoned.
|
||||
* Block candidates will predict how much refinement can improve them, and
|
||||
abandon refinement if they are unlikely to improve upon the best known
|
||||
encoding already in-hand.
|
||||
|
||||
This pair of optimizations provides significant performance improvement to the
|
||||
high quality modes which use the most block candidates and refinement
|
||||
iterations. A minor loss of image quality is expected, as the blocks we no
|
||||
longer test or refine may have been better coding choices.
|
||||
|
||||
**Absolute performance vs 2.2 release:**
|
||||
|
||||

|
||||
|
||||
**Relative performance vs 2.2 release:**
|
||||
|
||||

|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.2
|
||||
|
||||
**Status:** Released, January 2021
|
||||
|
||||
The 2.2 release is the third release in the 2.x series. It includes a number
|
||||
of performance improvements and new features.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
stable across versions, and this release is not compatible with 2.1. Please
|
||||
recompile your client-side code using the updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** New Arm aarch64 NEON accelerated vector library support.
|
||||
* **Improvement:** New CMake build system for all platforms.
|
||||
* **Improvement:** SSE4.2 feature profile changed to SSE4.1, which more
|
||||
accurately reflects the feature set used.
|
||||
* **Binary releases:**
|
||||
* **Improvement:** Linux binaries changed to use Clang 9.0, which gives
|
||||
up to 15% performance improvement.
|
||||
* **Improvement:** Windows binaries are now code signed.
|
||||
* **Improvement:** macOS binaries for Apple silicon platforms now provided.
|
||||
* **Improvement:** macOS binaries are now code signed and notarized.
|
||||
* **Command Line:**
|
||||
* **Feature:** New image preprocess `-pp-normalize` option added. This forces
|
||||
normal vectors to be unit length, which is useful when compressing source
|
||||
textures that use normal length to encode an NDF, which is incompatible
|
||||
with ASTC's two channel encoding.
|
||||
* **Feature:** New image preprocess `-pp-premultiply` option added. This
|
||||
scales RGB values by the alpha value. This can be useful to minimize
|
||||
cross-channel color bleed caused by GPU post-multiply filtering/blending.
|
||||
* **Improvements:** Command line tool cleanly traps and reports errors for
|
||||
corrupt input images rather than relying on standard library `assert()`
|
||||
calls in release builds.
|
||||
* **Core API:**
|
||||
* **API Change:** Images using region-based metrics no longer need to include
|
||||
padding; all input images should be tightly packed and `dim_pad` is removed
|
||||
from the `astcenc_image` structure. This makes it easier to directly use
|
||||
images loaded from other libraries.
|
||||
* **API Change:** Image `data` is no longer a 3D array accessed using
|
||||
`data[z][y][x]` indexing, it's an array of 2D slices. This makes it easier
|
||||
to directly use images loaded from other libraries.
|
||||
* **API Change:** New `ASTCENC_FLG_SELF_DECOMPRESS_ONLY` flag added to the
|
||||
codec config. Using this flag enables additional optimizations that
|
||||
aggressively exploit implementation- and configuration-specific behavior
|
||||
to gain performance. When using this flag the codec can only reliably
|
||||
decompress images that were compressed in the same context session. Images
|
||||
produced via other means may fail to decompress correctly, even if they are
|
||||
otherwise valid ASTC files.
|
||||
|
||||
### Performance
|
||||
|
||||
There is one major set of optimizations in this release, related to the new
|
||||
`ASTCENC_FLG_SELF_DECOMPRESS_ONLY` mode. These allow the compressor to only
|
||||
create data tables it knows that it is going to use, based on its current set
|
||||
of heuristics, rather than needing the full set the format allows.
|
||||
|
||||
The first benefit of these changes is a reduced context creation time, which
|
||||
can be reduced by up to 250ms on our test machine. This is a significant
|
||||
percentage of the command line utility runtime for a small image when using a
|
||||
quick search preset. Compressing the whole Kodak test suite using the command
|
||||
line utility and the `-fastest` preset is ~30% faster with this release, which
|
||||
is mostly due to faster startup.
|
||||
|
||||
The reduction in the data table size in this mode also improves the core codec
|
||||
speed. Our test sets show an average of 12% improvement in the codec for
|
||||
`-fastest` mode, and an average of 3% for `-medium` mode.
|
||||
|
||||
Key for performance charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Absolute performance vs 2.1 release:**
|
||||
|
||||

|
||||
|
||||
**Relative performance vs 2.1 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.1
|
||||
|
||||
**Status:** Released, November 2020
|
||||
|
||||
The 2.1 release is the second release in the 2.x series. It includes a number
|
||||
of performance optimizations and new features.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
stable across versions, and this release is not compatible with 2.0. Please
|
||||
recompile your client-side code using the updated `astcenc.h` header.
|
||||
|
||||
### Features:
|
||||
|
||||
* **Command line:**
|
||||
* **Bug fix:** The meaning of the `-tH\cH\dH` and `-th\ch\dh` compression
|
||||
modes was inverted. They now match the documentation; use `-*H` for HDR
|
||||
RGBA, and `-*h` for HDR RGB with LDR alpha.
|
||||
* **Feature:** A new `-fastest` quality preset is now available. This is
|
||||
designed for fast "roughing out" of new content, and sacrifices significant
|
||||
image quality compared to `-fast`. We do not recommend its use for
|
||||
production builds.
|
||||
* **Feature:** A new `-candidatelimit` compression tuning option is now
|
||||
available. This is a power-user control to determine how many candidates
|
||||
are returned for each block mode encoding trial. This feature is used
|
||||
automatically by the search presets; see `-help` for details.
|
||||
* **Improvement:** The compression test modes (`-tl\ts\th\tH`) now emit a
|
||||
MTex/s performance metric, in addition to coding time.
|
||||
* **Core API:**
|
||||
* **Feature:** A new quality preset `ASTCENC_PRE_FASTEST` is available. See
|
||||
`-fastest` above for details.
|
||||
* **Feature:** A new tuning option `tune_candidate_limit` is available in
|
||||
the config structure. See `-candidatelimit` above for details.
|
||||
* **Feature:** Image input/output can now use `ASTCENC_TYPE_F32` data types.
|
||||
* **Stability:**
|
||||
* **Feature:** The SSE2, SSE4.2, and AVX2 variants now produce identical
|
||||
compressed output when run on the same CPU when compiled with the
|
||||
preprocessor define `ASTCENC_ISA_INVARIANCE=1`. For Make builds this can
|
||||
be set on the command line by setting `ISA_INV=1`. ISA invariance is off
|
||||
by default; it reduces performance by 1-3%.
|
||||
|
||||
### Performance
|
||||
|
||||
Key for performance charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Absolute performance vs 2.0 release:**
|
||||
|
||||

|
||||
|
||||
**Relative performance vs 2.0 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 2.0
|
||||
|
||||
**Status:** Released, August 2020
|
||||
|
||||
The 2.0 release is the first release in the 2.x series. It includes a number of
|
||||
major changes over the earlier 1.7 series, and is not command-line compatible.
|
||||
|
||||
### Features:
|
||||
|
||||
* The core codec can be built as a library, exposed via a new codec API.
|
||||
* The core codec supports accelerated SIMD paths for SSE2, SSE4.2, and AVX2.
|
||||
* The command line syntax has a clearer mapping to Khronos feature profiles.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for performance charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Absolute performance vs 1.7 release:**
|
||||
|
||||

|
||||
|
||||
**Relative performance vs 1.7 release:**
|
||||
|
||||

|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
|
||||
@@ -0,0 +1,308 @@
|
||||
# 3.x series change log
|
||||
|
||||
This page summarizes the major functional and performance changes in each
|
||||
release of the 3.x series.
|
||||
|
||||
All performance data on this page is measured on an Intel Core i5-9600K
|
||||
clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 3.7
|
||||
|
||||
**Status:** April 2022
|
||||
|
||||
The 3.7 release contains another round of performance optimizations, including
|
||||
significant improvements to the command line front-end (faster PNG loader) and
|
||||
the arm64 build of the codec (faster NEON implementation).
|
||||
|
||||
* **General:**
|
||||
* **Feature:** The command line tool PNG loader has been switched to use
|
||||
the Wuffs library, which is robust and significantly faster than the
|
||||
current stb_image implementation.
|
||||
* **Feature:** Support for non-invariant builds returns. Opt-in to slightly
|
||||
faster, but not bit-exact, builds by setting `-DNO_INVARIANCE=ON` for the
|
||||
CMake configuration. This improves performance by around 2%.
|
||||
* **Optimization:** Changed SIMD `select()` so that it matches the default
|
||||
NEON behavior (bitwise select), rather than the default x86-64 behavior
|
||||
(lane select on MSB). Specialization `select_msb()` added for the one case
|
||||
we want to select on a sign-bit, where NEON needs a different
|
||||
implementation. This provides a significant (>25%) performance uplift on
|
||||
NEON implementations.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 3.6 release:**
|
||||
|
||||

|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 3.6
|
||||
|
||||
**Status:** April 2022
|
||||
|
||||
The 3.6 release contains another round of performance optimizations.
|
||||
|
||||
There are no interface changes in this release, but in general the API is not
|
||||
designed to be binary compatible across versions. We always recommend
|
||||
rebuilding your client-side code using the updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** Data tables are now optimized for contexts without the
|
||||
`SELF_DECOMPRESS_ONLY` flag set. The flag therefore no longer improves
|
||||
compression performance, but still reduces context creation time and
|
||||
context data table memory footprint.
|
||||
* **Feature:** Image quality for 4x4 `-fastest` configuration has been
|
||||
improved.
|
||||
* **Optimization:** Decimation modes are reliably excluded from processing
|
||||
when they are only partially selected in the compressor configuration (e.g.
|
||||
if used for single plane, but not dual plane modes). This is a significant
|
||||
performance optimization for all quality levels.
|
||||
* **Optimization:** Fast-path block load function variant added for 2D LDR
|
||||
images with no swizzle. This is a moderate performance optimization for the
|
||||
fast and fastest quality levels.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 3.5 release:**
|
||||
|
||||

|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 3.5
|
||||
|
||||
**Status:** March 2022
|
||||
|
||||
The 3.5 release contains another round of performance optimizations.
|
||||
|
||||
There are no interface changes in this release, but in general the API is not
|
||||
designed to be binary compatible across versions. We always recommend
|
||||
rebuilding your client-side code using the updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** Compressor configurations using `SELF_DECOMPRESS_ONLY` mode
|
||||
store compacted partition tables, which significantly improves both
|
||||
context create time and runtime performance.
|
||||
* **Feature:** Bilinear infill for decimated weight grids supports a new
|
||||
variant for half-decimated grids which are only decimated in one axis.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 3.4 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 3.4
|
||||
|
||||
**Status:** February 2022
|
||||
|
||||
The 3.4 release introduces another round of optimizations, removing a number
|
||||
of power-user configuration options to simplify the core compressor data path.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
binary compatible across versions, and this release is not compatible with
|
||||
earlier releases. Please update and rebuild your client-side code using the
|
||||
updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** Many memory allocations have been moved off the stack into
|
||||
dynamically allocated working memory. This significantly reduces the peak
|
||||
stack usage, allowing the compressor to run in systems with 128KB stack
|
||||
limits.
|
||||
* **Feature:** Builds now support `-DBLOCK_MAX_TEXELS=<count>` to allow a
|
||||
compressor to support a subset of block sizes. This can reduce binary size
|
||||
and runtime memory footprint, and improve performance.
|
||||
* **Feature:** The `-v` and `-va` options to set a per-texel error weight
|
||||
function are no longer supported.
|
||||
* **Feature:** The `-b` option to set a per-texel error weight boost for
|
||||
block border texels is no longer supported.
|
||||
* **Feature:** The `-a` option to set a per-texel error weight based on texel
|
||||
alpha value is no longer supported as an error weighting tool, but is still
|
||||
supported for providing sprite-sheet RDO.
|
||||
* **Feature:** The `-mask` option to set an error metric for mask map
|
||||
textures is still supported, but is currently a no-op in the compressor.
|
||||
* **Feature:** The `-perceptual` option to set a perceptual error metric is
|
||||
still supported, but is currently a no-op in the compressor for mask map
|
||||
and normal map textures.
|
||||
* **Bug-fix:** Corrected decompression of error blocks in some cases, so now
|
||||
returning the expected error color (magenta for LDR, NaN for HDR). Note
|
||||
that astcenc determines the error color to use based on the output image
|
||||
data type not the decoder profile.
|
||||
* **Binary releases:**
|
||||
* **Improvement:** Windows binaries changed to use ClangCL 12.0, which gives
|
||||
up to 10% performance improvement.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 3.3 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 3.3
|
||||
|
||||
**Status:** November 2021
|
||||
|
||||
The 3.3 release improves image quality for normal maps, and two component
|
||||
textures. Normal maps are expected to compress 25% slower than the 3.2
|
||||
release, although it should be noted that they are still faster to compress
|
||||
in 3.3 than when using the 2.5 series. This release also fixes one reported
|
||||
stability issue.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** Normal map image quality has been improved.
|
||||
* **Feature:** Two component image quality has been improved, provided
|
||||
that unused components are correctly zero-weighted using e.g. `-cw` on the
|
||||
command line.
|
||||
* **Bug-fix:** Improved stability when trying to compress complex blocks that
|
||||
could not beat even the starting quality threshold. These will now always
|
||||
compress into constant color blocks.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 3.2
|
||||
|
||||
**Status:** August 2021
|
||||
|
||||
The 3.2 release is a bugfix release; no significant image quality or
|
||||
performance differences are expected.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Improved stability when new contexts were created while other
|
||||
contexts were compressing or decompressing an image.
|
||||
* **Bug-fix:** Improved stability when decompressing blocks with invalid
|
||||
block encodings.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 3.1
|
||||
|
||||
**Status:** July 2021
|
||||
|
||||
The 3.1 release gives another performance boost, typically between 5 and 20%
|
||||
faster than the 3.0 release, as well as further incremental improvements to
|
||||
image quality. A number of build system improvements make astcenc easier and
|
||||
faster to integrate into other projects as a library, including support for
|
||||
building universal binaries on macOS. Full change list is shown below.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
binary compatible across versions, and this release is not compatible with
|
||||
earlier releases. Please update and rebuild your client-side code using the
|
||||
updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** RGB color data now supports `-perceptual` operation. The
|
||||
current implementation is simple, weighting color channel errors by their
|
||||
contribution to perceived luminance. This mimics the behavior of the human
|
||||
visual system, which is most sensitive to green, then red, then blue.
|
||||
* **Feature:** Codec supports a new low weight search mode, which is a
|
||||
simpler weight assignment for encodings with a low number of weights in the
|
||||
weight grid. The weight threshold can be overridden using the new
|
||||
`-lowweightmodelimit` command line option.
|
||||
* **Feature:** All platform builds now support building a native binary.
|
||||
Native binaries automatically select the SIMD level based on the default
|
||||
configuration of the compiler in use. Native binaries built on one machine
|
||||
may use different SIMD options than native binaries built on another.
|
||||
* **Feature:** macOS platform builds now support building universal binaries
|
||||
containing both `x86_64` and `arm64` target support.
|
||||
* **Feature:** Building the command line can be disabled when using as a
|
||||
library in another project. Set `-DCLI=OFF` during the CMake configure
|
||||
step.
|
||||
* **Feature:** A standalone minimal example of the core codec API usage has
|
||||
been added in the `./Utils/Example/` directory.
|
||||
* **Core API:**
|
||||
* **Feature:** Config flag `ASTCENC_FLG_USE_PERCEPTUAL` works for color data.
|
||||
* **Feature:** Config option `tune_low_weight_count_limit` added.
|
||||
* **Feature:** New heuristic added which prunes dual weight plane searches if
|
||||
they are unlikely to help. This heuristic is not user controllable.
|
||||
* **Feature:** Image quality has been improved. In general we see significant
|
||||
improvements (up to 0.2dB) for high bitrate encodings (4x4, 5x4), and a
|
||||
smaller improvement (up to 0.1dB) for lower bitrate encodings.
|
||||
* **Bug fix:** Arm "none" SIMD builds could be invariant with other builds.
|
||||
This fix has also been back-ported to the 2.x LTS branch.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 3.0 release:**
|
||||
|
||||

|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 3.0
|
||||
|
||||
**Status:** June 2021
|
||||
|
||||
The 3.0 release is the first in a series of updates to the compressor that are
|
||||
making more radical changes than we felt we could make with the 2.x series.
|
||||
The primary goals of the 3.x series are to keep the image quality ~static or
|
||||
better compared to the 2.5 release, but continue to improve performance.
|
||||
|
||||
Reminder for users of the library interface - the API is not designed to be
|
||||
binary compatible across versions, and this release is not compatible with
|
||||
earlier releases. Please update and rebuild your client-side code using the
|
||||
updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** The code has been significantly cleaned up, with improved
|
||||
comments, API documentation, function naming, and variable naming.
|
||||
* **Core API:**
|
||||
* **API Change:** The core APIs for `astcenc_compress_image()` and for
|
||||
`astcenc_decompress_image()` now accept swizzle structures by `const`
|
||||
pointer, instead of pass-by-value.
|
||||
* **API Change:** Calling the `astcenc_compress_reset()` and the
|
||||
`astcenc_decompress_reset()` functions between images is no longer required
|
||||
if the context was created for use by a single thread.
|
||||
* **Feature:** New heuristics have been added for controlling when to search
|
||||
beyond 2 partitions and 1 plane, and when to search beyond 3 partitions and
|
||||
1 plane. The previous `tune_partition_early_out_limit` config option has
|
||||
been removed, and replaced with two new options
|
||||
`tune_2_partition_early_out_limit_factor` and
|
||||
`tune_3_partition_early_out_limit_factor`. See command line help for more
|
||||
detailed documentation.
|
||||
* **Feature:** New heuristics have been added for controlling when to use
|
||||
dual weight planes. The previous `tune_two_plane_early_out_limit` has been
|
||||
renamed to `tune_2_plane_early_out_limit_correlation`. See command line help
|
||||
for more detailed documentation.
|
||||
* **Feature:** Support for using dual weight planes has been restricted to
|
||||
single partition blocks; it rarely helps blocks with 2 or more partitions
|
||||
and takes considerable compression search time.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 2.5 release:**
|
||||
|
||||

|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2021-2022, Arm Limited and contributors. All rights reserved._
|
||||
@@ -0,0 +1,416 @@
|
||||
# 4.x series change log
|
||||
|
||||
This page summarizes the major functional and performance changes in each
|
||||
release of the 4.x series.
|
||||
|
||||
All performance data on this page is measured on an Intel Core i5-9600K
|
||||
clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.8.0
|
||||
|
||||
**Status:** May 2024
|
||||
|
||||
The 4.8.0 release is a minor maintenance release.
|
||||
|
||||
* **General:**
|
||||
* **Bug fix:** Native builds on macOS will now correctly build for arm64 when
|
||||
run outside of Rosetta on an Apple silicon device.
|
||||
* **Bug fix:** Multiple small improvements to remove use of undefined
|
||||
language behavior, to improve support for deployment using Emscripten.
|
||||
* **Feature:** Builds using Clang can now build with undefined behavior
|
||||
sanitizer by setting `-DASTCENC_UBSAN=ON` on the CMake configure line.
|
||||
* **Feature:** Updated to Wuffs library 0.3.4, which ignores tRNS alpha
|
||||
chunks for type 4 (LA) and 6 (RGBA) PNGs, to improve compatibility with
|
||||
libpng.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.7.0
|
||||
|
||||
**Status:** January 2024
|
||||
|
||||
The 4.7.0 release is a major maintenance release, fixing rounding behavior in
|
||||
the decompressor to match the Khronos specification. This fix includes the
|
||||
addition of explicit support for optimizing for `decode_unorm8` rounding.
|
||||
|
||||
Reminder - the codec library API is not designed to be binary compatible across
|
||||
versions. We always recommend rebuilding your client-side code using the
|
||||
updated `astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Bug fix:** sRGB LDR decompression now uses the correct endpoint expansion
|
||||
method to create the 16-bit RGB endpoint colors, and removes the previous
|
||||
correction code from the interpolation function. This bug could result in
|
||||
LSB bit flips relative to the standard specification.
|
||||
* **Bug fix:** Decompressing to an 8-bit per component output image now
|
||||
matches the `decode_unorm8` extension rounding rules. This bug could result
|
||||
in LSB bit flips relative to the standard specification.
|
||||
* **Bug fix:** Code now avoids using `alignas()` in the reference C
|
||||
implementation, as the default `alignas(16)` is narrower than the
|
||||
native minimum alignment requirement on some CPUs.
|
||||
* **Feature:** Library configuration supports a new flag,
|
||||
`ASTCENC_FLG_USE_DECODE_UNORM8`. This flag indicates that the image will be
|
||||
used with the `decode_unorm8` decode mode. When set during compression
|
||||
this allows the compressor to use the correct rounding when determining the
|
||||
best encoding.
|
||||
* **Feature:** Command line tool supports a new option, `-decode_unorm8`.
|
||||
This option indicates that the image will be used with the `decode_unorm8`
|
||||
decode mode. This option will automatically be set for decompression
|
||||
(`-d*`) and trial (`-t*`) tool operation if the decompressed output image
|
||||
is stored to an 8-bit per component file format. This option must be set
|
||||
manually for compression (`-c*`) tool operation, as the desired decode mode
|
||||
cannot be reliably determined.
|
||||
* **Feature:** Library configuration supports a new optional progress
|
||||
reporting callback to be specified. This is called during compression to
|
||||
allow interactive tooling use cases to display incremental progress. The
|
||||
command line tool uses this feature to show compression progress unless
|
||||
`-silent` is used.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.6.1
|
||||
|
||||
**Status:** November 2023
|
||||
|
||||
The 4.6.1 release is a minor maintenance release to fix a scaling bug on
|
||||
large core count Windows systems.
|
||||
|
||||
* **General:**
|
||||
* **Optimization:** Windows builds of the `astcenc` command line tool can now
|
||||
use more than 64 cores on large core count systems. This change doubled
|
||||
command line performance for `-exhaustive` compression when testing on a
|
||||
96 core/192 thread system.
|
||||
* **Feature:** Windows Arm64 native builds of the `astcenc` command line tool
|
||||
are now included in the prebuilt release binaries.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.6.0
|
||||
|
||||
**Status:** November 2023
|
||||
|
||||
The 4.6.0 release retunes the compressor heuristics to give improvements to
|
||||
performance for trivial losses to image quality. It also includes some minor
|
||||
bug fixes and code quality improvements.
|
||||
|
||||
Reminder - the codec library API is not designed to be binary compatible across
|
||||
versions. We always recommend rebuilding your client-side code using the updated
|
||||
`astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Fixed context allocation for contexts allocated with the
|
||||
`ASTCENC_FLG_DECOMPRESS_ONLY` flag.
|
||||
* **Bug-fix:** Reduced use of `reinterpret_cast` in the core codec to
|
||||
avoid strict aliasing violations.
|
||||
* **Optimization:** `-medium` search quality no longer tests 4 partition
|
||||
encodings for block sizes between 25 and 83 texels (inclusive). This
|
||||
improves performance for a tiny drop in image quality.
|
||||
* **Optimization:** `-thorough` and higher search qualities no longer test the
|
||||
mode0 first search for block sizes between 25 and 83 texels (inclusive).
|
||||
This improves performance for a tiny drop in image quality.
|
||||
* **Optimization:** `TUNE_MAX_PARTITIONING_CANDIDATES` reduced from 32 to 8
|
||||
to reduce the size of stack allocated data structures. This causes a tiny
|
||||
drop in image quality for the `-verythorough` and `-exhaustive` presets.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.5.0
|
||||
|
||||
**Status:** June 2023
|
||||
|
||||
The 4.5.0 release is a maintenance release with small image quality
|
||||
improvements, and a number of build system quality of life improvements.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Improved handling of compiler arguments in CMake, including
|
||||
consistent use of MSVC-style command line arguments for ClangCL.
|
||||
* **Bug-fix:** Invariant Clang builds now use `-ffp-model=precise` with
|
||||
`-ffp-contract=off` which is needed to restore invariance due to recent
|
||||
changes in compiler defaults.
|
||||
* **Change:** macOS binary releases are now distributed as a single universal
|
||||
binary for all platforms.
|
||||
* **Change:** Windows binary releases are now compiled with VS2022.
|
||||
* **Change:** Invariant MSVC builds for VS2022 now use `/fp:precise` instead
|
||||
of `/fp:strict`, which is now possible because precise no longer implies
|
||||
contraction. This should improve performance for MSVC builds.
|
||||
* **Change:** Non-invariant Clang builds now use `-ffp-model=precise` with
|
||||
`-ffp-contract=on`. This should improve performance on older Clang
|
||||
versions which defaulted to no contraction.
|
||||
* **Change:** Non-invariant MSVC builds for VS2022 now use `/fp:precise`
|
||||
with `/fp:contract`. This should improve performance for MSVC builds.
|
||||
* **Change:** CMake config variables now use an `ASTCENC_` prefix to add a
|
||||
namespace and group options when the library is used in a larger project.
|
||||
* **Change:** CMake config `ASTCENC_UNIVERSAL_BUILD` for building macOS
|
||||
universal binaries has been improved to include the `x86_64h` slice for
|
||||
AVX2 builds. Universal builds are now on by default for macOS, and always
|
||||
include NEON (arm64), SSE4.1 (x86_64), and AVX2 (x86_64h) variants.
|
||||
* **Change:** CMake config `ASTCENC_NO_INVARIANCE` has been inverted to
|
||||
remove the negated option, and is now `ASTCENC_INVARIANCE` with a default
|
||||
of `ON`. Disabling this option can substantially improve performance, but
|
||||
images can differ across platforms and compilers.
|
||||
* **Optimization:** Color quantization and packing for LDR RGB and RGBA has
|
||||
been vectorized to improve performance.
|
||||
* **Change:** Color quantization for LDR RGB and RGBA endpoints will now try
|
||||
multiple quantization packing methods, and pick the one with the lowest
|
||||
endpoint encoding error. This gives a minor image quality improvement, for
|
||||
no significant performance impact when combined with the vectorization
|
||||
optimizations.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.4.0
|
||||
|
||||
**Status:** March 2023
|
||||
|
||||
The 4.4.0 release is a minor release with image quality improvements, a small
|
||||
performance boost, and a few new quality-of-life features.
|
||||
|
||||
* **General:**
|
||||
* **Change:** Core library no longer checks availability of required
|
||||
instruction set extensions, such as SSE4.1 or AVX2. Checking compatibility
|
||||
is now the responsibility of the caller. See `astcenccli_entry.cpp` for
|
||||
an example of code performing this check.
|
||||
* **Change:** Core library can be built as a shared object by setting the
|
||||
`-DSHAREDLIB=ON` CMake option, resulting in e.g. `libastcenc-avx2-shared.so`.
|
||||
Note that the command line tool is always statically linked.
|
||||
* **Change:** Decompressed 3D images will now write one output file per
|
||||
slice, if the target format is a 2D image format.
|
||||
* **Change:** Command line errors print to stderr instead of stdout.
|
||||
* **Change:** Color encoding uses new quantization tables, that now factor
|
||||
in floating-point rounding if a distance tie is found when using the
|
||||
integer quant256 value. This improves image quality for 4x4 and 5x5 block
|
||||
sizes.
|
||||
* **Optimization:** Partition selection uses a simplified line calculation
|
||||
with a faster approximation. This improves performance for all block sizes.
|
||||
* **Bug-fix:** Fixed missing symbol error in decompressor-only builds.
|
||||
* **Bug-fix:** Fixed infinity handling in debug trace JSON files.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 4.3 release:**
|
||||
|
||||

|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.3.1
|
||||
|
||||
**Status:** January 2023
|
||||
|
||||
The 4.3.1 release is a minor maintenance release. No performance or image
|
||||
quality changes are expected.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Fixed typo in `-2/3/4partitioncandidatelimit` CLI options.
|
||||
* **Bug-fix:** Fixed handling for `-3/4partitionindexlimit` CLI options.
|
||||
* **Bug-fix:** Updated to `stb_image.h` v2.28, which includes multiple fixes
|
||||
and improvements for image loading.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.3.0
|
||||
|
||||
**Status:** January 2023
|
||||
|
||||
The 4.3.0 release is an optimization release. There are minor performance
|
||||
and image quality improvements in this release.
|
||||
|
||||
Reminder - the codec library API is not designed to be binary compatible across
|
||||
versions. We always recommend rebuilding your client-side code using the updated
|
||||
`astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Use lower case `windows.h` include for MinGW compatibility.
|
||||
* **Change:** The `-mask` command line option, `ASTCENC_FLG_MAP_MASK` in the
|
||||
library API, has been removed.
|
||||
* **Optimization:** Always skip blue-contraction for `QUANT_256` encodings.
|
||||
This gives a small image quality improvement for the 4x4 block size.
|
||||
* **Optimization:** Always skip RGBO vector calculation for LDR encodings.
|
||||
* **Optimization:** Defer color packing and scrambling to physical layer.
|
||||
* **Optimization:** Remove folded `decimation_info` lookup tables. This
|
||||
significantly reduces compressor memory footprint and improves context
|
||||
creation time. Impact increases with the active block size.
|
||||
* **Optimization:** Increased trial and refinement pruning by using stricter
|
||||
target errors when determining whether to skip iterations.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 4.2 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.2.0
|
||||
|
||||
**Status:** November 2022
|
||||
|
||||
The 4.2.0 release is an optimization release. There are significant performance
|
||||
improvements, minor image quality improvements, and library interface changes in
|
||||
this release.
|
||||
|
||||
Reminder - the codec library API is not designed to be binary compatible across
|
||||
versions. We always recommend rebuilding your client-side code using the updated
|
||||
`astcenc.h` header.
|
||||
|
||||
* **General:**
|
||||
* **Bug-fix:** Compression for RGB and RGBA base+offset encodings no
|
||||
longer generates endpoints with the incorrect blue-contract behavior.
|
||||
* **Bug-fix:** Lowest channel correlation calculation now correctly ignores
|
||||
constant color channels for the purposes of filtering 2 plane encodings.
|
||||
On average this improves both performance and image quality.
|
||||
* **Bug-fix:** ISA compatibility now checked in `config_init()` as well as
|
||||
in `context_alloc()`.
|
||||
* **Change:** Removed the low-weight count optimization, as more recent
|
||||
changes had significantly reduced its performance benefit. Option removed
|
||||
from both command line and configuration structure.
|
||||
* **Feature:** The `-exhaustive` mode now runs full trials on more
|
||||
partitioning candidates and block candidates. This improves image quality
|
||||
by 0.1 to 0.25 dB, but slows down compression by 3x. The `-verythorough`
|
||||
and `-thorough` modes also test more candidates.
|
||||
* **Feature:** A new preset, `-verythorough`, has been introduced to provide
|
||||
a standard performance point between `-thorough` and the re-tuned
|
||||
`-exhaustive` mode. This new mode is faster and higher quality than the
|
||||
`-exhaustive` preset in the 4.1 release.
|
||||
* **Feature:** The compressor can now independently vary the number of
|
||||
partitionings considered for error estimation for 2/3/4 partitions. This
|
||||
allows heuristics to put more effort into 2 partitions, and less in to
|
||||
3/4 partitions.
|
||||
* **Feature:** The compressor can now run trials on a variable number of
|
||||
candidate partitionings, allowing high quality modes to explore more of the
|
||||
search space at the expense of slower compression. The number of trials is
|
||||
independently configurable for 2/3/4 partition cases.
|
||||
* **Optimization:** Introduce early-out threshold for 2/3/4 partition
|
||||
searches based on the results after 1 of 2 trials. This significantly
|
||||
improves performance for `-medium` and `-thorough` searches, for a minor
|
||||
loss in image quality.
|
||||
* **Optimization:** Reduce early-out threshold for 3/4 partition searches
|
||||
based on 2/3 partition results. This significantly improves performance,
|
||||
especially for `-thorough` searches, for a minor loss in image quality.
|
||||
* **Optimization:** Use direct vector compare to create a SIMD mask instead
|
||||
of a scalar compare that is broadcast to a vector mask.
|
||||
* **Optimization:** Remove obsolete partition validity masks from the
|
||||
partition selection algorithm.
|
||||
* **Optimization:** Removed obsolete channel scaling from partition
|
||||
`avgs_and_dirs()` calculation.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 4.0 and 4.1 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.1.0
|
||||
|
||||
**Status:** August 2022
|
||||
|
||||
The 4.1.0 release is a maintenance release. There is no performance or image
|
||||
quality change in this release.
|
||||
|
||||
* **General:**
|
||||
* **Change:** Command line decompressor no longer uses the legacy
|
||||
`GL_LUMINANCE` or `GL_LUMINANCE_ALPHA` format enums when writing KTX
|
||||
output files. Luminance textures now use the `GL_RED` format and
|
||||
luminance_alpha textures now use the `GL_RG` format.
|
||||
* **Change:** Command line tool gains a new `-dimage` option to generate
|
||||
diagnostic images showing aspects of the compression encoding. The output
|
||||
file name with its extension stripped is used as the stem of the diagnostic
|
||||
image file names.
|
||||
* **Bug-fix:** Library decompressor builds for SSE no longer use masked store
|
||||
`maskmovdqu` instructions, as they can generate faults on masked lanes.
|
||||
* **Bug-fix:** Command line decompressor now correctly uses sized type enums
|
||||
for the internal format when writing output KTX files.
|
||||
* **Bug-fix:** Command line compressor now correctly loads 16 and 32-bit per
|
||||
component input KTX files.
|
||||
* **Bug-fix:** Fixed GCC9 compiler warnings on Arm aarch64.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 4.0.0
|
||||
|
||||
**Status:** July 2022
|
||||
|
||||
The 4.0.0 release introduces some major performance enhancements, and a number
|
||||
of larger changes to the heuristics used in the codec to find a more effective
|
||||
cost:quality trade off.
|
||||
|
||||
* **General:**
|
||||
* **Change:** The `-array` option for specifying the number of image planes
|
||||
for ASTC 3D volumetric block compression has been renamed to `-zdim`.
|
||||
* **Change:** The build root package directory is now `bin` instead of
|
||||
`astcenc`, allowing the CMake install step to write binaries into
|
||||
`/usr/local/bin` if the user wishes to do so.
|
||||
* **Feature:** A new `-ssw` option for specifying the shader sampling swizzle
|
||||
has been added as a convenience alternative to the `-cw` option. This is
|
||||
needed to correct error weighting during compression if not all components
|
||||
are read in the shader. For example, to extract and compress two components
|
||||
from an RGBA input image, weighting the two components equally when
|
||||
sampling through .ra in the shader, use `-esw ggga -ssw ra`. In this
|
||||
example `-ssw ra` is equivalent to the alternative `-cw 1 0 0 1` encoding.
|
||||
* **Feature:** The `-a` alpha weighting option has been re-enabled in the
|
||||
backend, and now again applies alpha scaling to the RGB error metrics when
|
||||
encoding. This is based on the maximum alpha in each block, not the
|
||||
individual texel alpha values used in the earlier implementation.
|
||||
* **Feature:** The command line tool now has `-repeats <count>` for testing,
|
||||
which will iterate around compression and decompression `count` times.
|
||||
Reported performance metrics also now separate compression and
|
||||
decompression scores.
|
||||
* **Feature:** The core codec is now warning clean up to /W4 for both MSVC
|
||||
`cl.exe` and `clangcl.exe` compilers.
|
||||
* **Feature:** The core codec now supports arm64 for both MSVC `cl.exe` and
|
||||
`clangcl.exe` compilers.
|
||||
* **Feature:** `NO_INVARIANCE` builds will enable the `-ffp-contract=fast`
|
||||
option for all targets when using Clang or GCC. In addition AVX2 targets
|
||||
will also set the `-mfma` option. This reduces image quality by up to 0.2dB
|
||||
(normally much less), but improves performance by up to 5-20%.
|
||||
* **Optimization:** Angular endpoint min/max weight selection is restricted
|
||||
to weight `QUANT_11` or lower. Higher quantization levels assume default
|
||||
0-1 range, which is less accurate but much faster.
|
||||
* **Optimization:** Maximum weight quantization for later trials is selected
|
||||
based on the weight quantization of the best encoding from the 1 plane 1
|
||||
partition trial. This significantly reduces the search space for the later
|
||||
trials with more planes or partitions.
|
||||
* **Optimization:** Small data tables now use in-register SIMD permutes
|
||||
rather than gathers (AVX2) or unrolled scalar lookups (SSE/NEON). This can
|
||||
be a significant optimization for paths that are load unit limited.
|
||||
* **Optimization:** Decompressed image block writes in the decompressor now
|
||||
use a vectorized approach to writing each row of texels in the block,
|
||||
including the ability to exploit masked stores if the target supports them.
|
||||
* **Optimization:** Weight scrambling has been moved into the physical layer;
|
||||
the rest of the codec now uses linear order weights.
|
||||
* **Optimization:** Weight packing has been moved into the physical layer;
|
||||
the rest of the codec now uses unpacked weights in the 0-64 range.
|
||||
* **Optimization:** Consistently vectorize the creation of unquantized weight
|
||||
grids when they are needed.
|
||||
* **Optimization:** Remove redundant per-decimation mode copies of endpoint
|
||||
and weight structures, which were really read-only duplicates.
|
||||
* **Optimization:** Early-out the same endpoint mode color calculation if it
|
||||
cannot be applied.
|
||||
* **Optimization:** Numerous type size reductions applied to arrays to reduce
|
||||
both context working buffer size usage and stack usage.
|
||||
|
||||
### Performance:
|
||||
|
||||
Key for charts:
|
||||
|
||||
* Color = block size (see legend).
|
||||
* Letter = image format (N = normal map, G = grayscale, L = LDR, H = HDR).
|
||||
|
||||
**Relative performance vs 3.7 release:**
|
||||
|
||||

|
||||
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2022-2024, Arm Limited and contributors. All rights reserved._
|
||||
@@ -0,0 +1,105 @@
|
||||
# 5.x series change log
|
||||
|
||||
This page summarizes the major functional and performance changes in each
|
||||
release of the 5.x series.
|
||||
|
||||
All performance data on this page is measured on an Intel Core i5-9600K
|
||||
clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 5.3.0
|
||||
|
||||
**Status:** March 2025
|
||||
|
||||
The 5.3.0 release is a minor maintenance release.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** Reference C builds (`ASTCENC_ISA_NONE`) now support compiling
|
||||
for big-endian CPUs. Compile with `-DASTCENC_BIG_ENDIAN=ON` when compiling
|
||||
for a big-endian target; it is not auto-detected.
|
||||
* **Improvement:** Builds using GCC now specify `-flto=auto` to allow
|
||||
parallel link steps, and remove the log warnings about not setting a CPU
|
||||
count parameter value.
|
||||
* **Bug fix:** Builds using MSVC `cl.exe` that do not specify an explicit
|
||||
ISA using the preprocessor configuration defines will now correctly
|
||||
default to the SSE2 backend on x86-64 and the NEON backend on Arm64. Previously they would have defaulted to the reference C implementation,
|
||||
which is around 3.25 times slower.
|
||||
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 5.2.0
|
||||
|
||||
**Status:** February 2025
|
||||
|
||||
The 5.2.0 release is a minor maintenance release.
|
||||
|
||||
This release includes changes to the public interface in the `astcenc.h`
|
||||
header. We always recommend rebuilding your client-side code using the
|
||||
header from the same release to avoid compatibility issues.
|
||||
|
||||
* **General:**
|
||||
* **Change:** Changed sRGB alpha channel endpoint expansion to match the
|
||||
revised Khronos Data Format Specification (v1.4.0), which reverts an
|
||||
unintended specification change. Compared to previous releases, this change
|
||||
can cause LSB bit differences in the alpha channel of compressed images.
|
||||
* **Feature:** Arm64 builds for Linux added to the GitHub Actions builds, and
|
||||
Arm64 binaries for NEON, 128-bit SVE and 256-bit SVE added to release
|
||||
builds.
|
||||
* **Feature:** Added a new codec API, `astcenc_compress_cancel()`, which can
|
||||
be used to cancel an in-flight compression. This is designed to help make
|
||||
it easier to integrate the codec into an interactive user interface that
|
||||
can respond to user events with low latency.
|
||||
* **Bug fix:** Removed incorrect `static` variable qualifier, which could
|
||||
result in an incorrect `tune_mse_overshoot` heuristic threshold being used
|
||||
if a user ran multiple concurrent compressions with different settings.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 5.1.0
|
||||
|
||||
**Status:** November 2024
|
||||
|
||||
The 5.1.0 release is an optimization release, giving moderate performance
|
||||
improvements on all platforms. There are no image quality differences.
|
||||
|
||||
* **General:**
|
||||
* **Feature:** Added a new CMake build option to control use of native
|
||||
gathers, as they can be slower than scalar loads on some common x86
|
||||
microarchitectures. Build with `-DASTCENC_X86_GATHERS=OFF` to disable use
|
||||
of native gathers in AVX2 builds.
|
||||
* **Optimization:** Added new `gather()` abstraction for gathers using byte
|
||||
indices, allowing implementations without gather hardware to skip the
|
||||
byte-to-int index conversion.
|
||||
* **Optimization:** Optimized `compute_lowest_and_highest_weight()` to
|
||||
pre-compute min/max outside of the main loop.
|
||||
* **Optimization:** Added improved intrinsics sequence for SSE and AVX2
|
||||
integer `hmin()` and `hmax()`.
|
||||
* **Optimization:** Added improved intrinsics sequence for `vint4(uint8_t*)`
|
||||
on systems implementing Arm SVE.
|
||||
|
||||
<!-- ---------------------------------------------------------------------- -->
|
||||
## 5.0.0
|
||||
|
||||
**Status:** November 2024
|
||||
|
||||
The 5.0.0 release is the first stable release in the 5.x series. The main new
|
||||
feature is support for the Arm Scalable Vector Extensions (SVE) SIMD instruction
|
||||
set.
|
||||
|
||||
* **General:**
|
||||
* **Bug fix:** Fixed incorrect return type in "None" vector library
|
||||
reference implementation.
|
||||
* **Bug fix:** Fixed sincos table index under/overflow.
|
||||
* **Feature:** Changed `ASTCENC_ISA_NATIVE` builds to use `-march=native` and
|
||||
`-mcpu=native`.
|
||||
* **Feature:** Added backend for Arm SVE fixed-width 256-bit builds. These
|
||||
can only run on hardware implementing 256-bit SVE.
|
||||
* **Feature:** Added backend for Arm SVE 128-bit builds. These are portable
|
||||
builds and can run on hardware implementing any SVE vector length, but the
|
||||
explicit SVE use is augmented NEON and will only use the bottom 128-bits of
|
||||
each SVE vector.
|
||||
* **Feature:** Optimized NEON mask `any()` and `all()` functions.
|
||||
* **Feature:** Migrated build and test to GitHub Actions pipelines.
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2022-2025, Arm Limited and contributors. All rights reserved._
|
||||
|
After Width: | Height: | Size: 111 KiB |
|
After Width: | Height: | Size: 148 KiB |
|
After Width: | Height: | Size: 141 KiB |
|
After Width: | Height: | Size: 149 KiB |
|
After Width: | Height: | Size: 134 KiB |
|
After Width: | Height: | Size: 112 KiB |
|
After Width: | Height: | Size: 120 KiB |
|
After Width: | Height: | Size: 120 KiB |
|
After Width: | Height: | Size: 123 KiB |
|
After Width: | Height: | Size: 116 KiB |
|
After Width: | Height: | Size: 110 KiB |
|
After Width: | Height: | Size: 125 KiB |
|
After Width: | Height: | Size: 127 KiB |
|
After Width: | Height: | Size: 120 KiB |
|
After Width: | Height: | Size: 124 KiB |
|
After Width: | Height: | Size: 121 KiB |
|
After Width: | Height: | Size: 126 KiB |
|
After Width: | Height: | Size: 116 KiB |
|
After Width: | Height: | Size: 108 KiB |
@@ -0,0 +1,235 @@
|
||||
# Effective ASTC Encoding
|
||||
|
||||
Most texture compression schemes encode a single color format at a single
|
||||
bitrate, so there are relatively few configuration options available to content
|
||||
creators beyond selecting which compressed format to use.
|
||||
|
||||
ASTC on the other hand is an extremely flexible container format which can
|
||||
compress multiple color formats at multiple bit rates. Inevitably this
|
||||
flexibility gives rise to questions about how to best use ASTC to encode a
|
||||
specific color format, or what the equivalent settings are to get a close
|
||||
match to another compression format.
|
||||
|
||||
This page aims to give some guidelines, but note that they are only guidelines
|
||||
and are not exhaustive so please deviate from them as needed.
|
||||
|
||||
## Traditional format reference
|
||||
|
||||
The most commonly used non-ASTC compressed formats, their color format, and
|
||||
their compressed bitrate are shown in the table below.
|
||||
|
||||
| Name | Color Format | Bits/Pixel | Notes |
|
||||
| -------- | ------------ | ---------- | ---------------- |
|
||||
| BC1 | RGB+A | 4 | RGB565 + 1-bit A |
|
||||
| BC3 | RGB+A | 8 | BC1 RGB + BC4 A |
|
||||
| BC3nm | G+R | 8 | BC1 G + BC4 R |
|
||||
| BC4 | R | 4 | L8 |
|
||||
| BC5      | R+G          | 8          | BC4 R + BC4 G    |
|
||||
| BC6H | RGB (HDR) | 8 | |
|
||||
| BC7 | RGB / RGBA | 8 | |
|
||||
| EAC_R11 | R | 4 | R11 |
|
||||
| EAC_RG11 | RG | 8 | RG11 |
|
||||
| ETC1 | RGB | 4 | RGB565 |
|
||||
| ETC2 | RGB+A | 4 | RGB565 + 1-bit A |
|
||||
| ETC2+EAC | RGB+A | 8 | RGB565 + EAC A |
|
||||
| PVRTC | RGBA | 2 or 4 | |
|
||||
|
||||
**Note:** BC2 (RGB+A) is not included in the table because it's rarely used in
|
||||
practice due to poor quality alpha encoding; BC3 is nearly always used instead.
|
||||
|
||||
**Note:** Color representations shown with a `+` symbol indicate non-correlated
|
||||
compression groups; e.g. an `RGB + A` format compresses `RGB` and `A`
|
||||
independently and does not assume the two signals are correlated. This can be
|
||||
a strength (it improves quality when compressing non-correlated signals), but
|
||||
also a weakness (it reduces quality when compressing correlated signals).
|
||||
|
||||
# ASTC Format Mapping
|
||||
|
||||
The main question which arises with the mapping of another format on to ASTC
|
||||
is how to handle cases where the input isn't a 4 component RGBA input. ASTC is
|
||||
a container format which always decompresses in to a 4 component RGBA result.
|
||||
However, the internal compressed representation is very flexible and can store
|
||||
1-4 components as needed on a per-block basis.
|
||||
|
||||
To get the best quality for a given bitrate, or the lowest bitrate for a given
|
||||
quality, it is important that as few components as possible are stored in the
|
||||
internal representation to avoid wasting coding space.
|
||||
|
||||
Specific optimizations in the ASTC coding scheme exist for:
|
||||
|
||||
* Encoding the RGB components as a single luminance component, so only a single
|
||||
value needs to be stored in the coding instead of three.
|
||||
* Encoding the A component as a constant 1.0 value, so the coding doesn't
|
||||
actually need to store a per-pixel alpha value at all.
|
||||
|
||||
... so mapping your inputs given to the compressor to hit these paths is
|
||||
really important if you want to get the best output quality for your chosen
|
||||
bitrate.
|
||||
|
||||
## Encoding 1-4 component data
|
||||
|
||||
The table below shows the recommended component usage for data with different
|
||||
numbers of color components present in the data.
|
||||
|
||||
The coding swizzle should be applied when compressing an image. This can be
|
||||
handled by the compressor when reading an uncompressed input image by
|
||||
specifying the swizzle using the `-esw` command line option.
|
||||
|
||||
The sampling swizzle is what you should use in your shader programs to read
|
||||
the data from the compressed texture, assuming no additional API-level
|
||||
component swizzling is specified by the application.
|
||||
|
||||
| Input components | ASTC Endpoint | Coding Swizzle | Sampling Swizzle |
|
||||
| -------------- | ------------- | -------------- | ------------------ |
|
||||
| 1 | L + 1 | `rrr1` | `.g` <sup>1</sup> |
|
||||
| 2 | L + A | `rrrg` | `.ga` <sup>1</sup> |
|
||||
| 3 | RGB + 1 | `rgb1` | `.rgb` |
|
||||
| 4 | RGB + A | `rgba` | `.rgba` |
|
||||
|
||||
**1:** Sampling from `g` is preferred to sampling from `r` because it allows a
|
||||
single shader to be compatible with ASTC, BC1, or ETC formats. BC1 and ETC1
|
||||
store color endpoints as RGB565 data, so the `g` component will have higher
|
||||
precision. For ASTC it doesn't actually make any difference; the same single
|
||||
component luminance will be returned for all three of the `.rgb` components.
|
||||
|
||||
## Equivalence with other formats
|
||||
|
||||
Based on these component encoding requirements we can now derive the ASTC
|
||||
coding equivalents for most of the other texture compression formats in common
|
||||
use today.
|
||||
|
||||
| Format   | ASTC Coding Swizzle | ASTC Sampling Swizzle | Notes            |
|
||||
| -------- | ------------------- | --------------------- | ---------------- |
|
||||
| BC1 | `rgba` <sup>1</sup> | `.rgba` | |
|
||||
| BC3 | `rgba` | `.rgba` | |
|
||||
| BC3nm | `gggr` | `.ag` | |
|
||||
| BC4 | `rrr1` | `.r` | |
|
||||
| BC5 | `rrrg` | `.ra` <sup>2</sup> | |
|
||||
| BC6H | `rgb1` | `.rgb` <sup>3</sup> | HDR profile only |
|
||||
| BC7 | `rgba` | `.rgba` | |
|
||||
| EAC_R11 | `rrr1` | `.r` | |
|
||||
| EAC_RG11 | `rrrg` | `.ra` <sup>2</sup> | |
|
||||
| ETC1 | `rgb1` | `.rgb` | |
|
||||
| ETC2 | `rgba` <sup>1</sup> | `.rgba` | |
|
||||
| ETC2+EAC | `rgba` | `.rgba` | |
|
||||
| ETC2+EAC | `rgba` | `.rgba` | |
|
||||
|
||||
**1:** ASTC has no equivalent of the 1-bit punch-through alpha encoding
|
||||
supported by BC1 or ETC2; if alpha is present it will be a full alpha
|
||||
component.
|
||||
|
||||
**2:** ASTC relies on using the L+A color endpoint type for coding efficiency
|
||||
for two component data. It therefore has no direct equivalent of a two-plane
|
||||
format sampled through the `.rg` components such as BC5 or EAC_RG11. This can
|
||||
be emulated by setting texture component swizzles in the runtime API - e.g. via
|
||||
`glTexParameteri()` for OpenGL ES - although it has been noted that API
|
||||
controlled swizzles are not available in WebGL.
|
||||
|
||||
**3:** ASTC can only store unsigned values, and has no equivalent of the BC6
|
||||
signed endpoint mode.
|
||||
|
||||
# Other Considerations
|
||||
|
||||
This section outlines some of the other things to consider when encoding
|
||||
textures using ASTC.
|
||||
|
||||
## Decode mode extensions
|
||||
|
||||
ASTC is specified to decompress into a 16-bit per component RGBA output by
|
||||
default, with the exception of the sRGB format which uses an 8-bit value for the
|
||||
RGB components.
|
||||
|
||||
Decompressing in to a 16-bit per component output format is often higher than
|
||||
many use cases require, especially for LDR textures which originally came from
|
||||
an 8-bit per component source image. Most implementations of ASTC support the
|
||||
decode mode extensions, which allow an application to opt-in to a lower
|
||||
precision decompressed format (RGBA8 for LDR, RGB9E5 for HDR). Using these
|
||||
extensions can improve GPU texture cache efficiency, and even improve texture
|
||||
filtering throughput, for use cases that do not need the higher precision.
|
||||
|
||||
The ASTC format uses different data rounding rules when the decode mode
|
||||
extensions are used. To ensure that the compressor chooses the best encodings
|
||||
for the RGBA8 rounding rules, you can specify `-decode_unorm8` when compressing
|
||||
textures that will be decompressed into the RGBA8 intermediate. This gives a
|
||||
small image quality boost.
|
||||
|
||||
**Note:** This mode is automatically enabled if you use the `astcenc`
|
||||
decompressor to write an 8-bit per component output image.
|
||||
|
||||
## Encoding non-correlated components
|
||||
|
||||
Most other texture compression formats have a static component assignment in
|
||||
terms of the expected data correlation. For example, ETC2+EAC assumes that RGB
|
||||
are always correlated and that alpha is non-correlated. ASTC can automatically
|
||||
encode data as either fully correlated across all 4 components, or with any one
|
||||
component assigned to a separate non-correlated partition to the other three.
|
||||
|
||||
The non-correlated component can be changed on a block-by-block basis, so the
|
||||
compressor can dynamically adjust the coding based on the data present in the
|
||||
image. This means that there is no need for non-correlated data to be stored
|
||||
in a specific component in the input image.
|
||||
|
||||
It is however worth noting that the alpha component is treated differently to
|
||||
the RGB color components in some circumstances:
|
||||
|
||||
* When coding for sRGB the alpha component will always be stored in linear
|
||||
space.
|
||||
* When coding for HDR the alpha component can optionally be kept as LDR data.
|
||||
|
||||
## Encoding normal maps
|
||||
|
||||
The best way to store normal maps using ASTC is similar to the scheme used by
|
||||
BC5; store the X and Y components of a unit-length normal. The Z component of
|
||||
the normal can be reconstructed in shader code based on the knowledge that the
|
||||
vector is unit length.
|
||||
|
||||
To encode this we need to store only two input components in the compressed
|
||||
data, and therefore use the `rrrg` coding swizzle to align the data with the
|
||||
ASTC luminance+alpha endpoint. We can sample this in shader code using the
|
||||
`.ga` sampling swizzle, and reconstruct the Z value with:
|
||||
|
||||
vec3 nml;
|
||||
nml.xy = texture(...).ga; // Load normals (range 0 to 1)
|
||||
nml.xy = nml.xy * 2.0 - 1.0; // Unpack normals (range -1 to +1)
|
||||
nml.z = sqrt(1 - dot(nml.xy, nml.xy)); // Compute Z, given unit length
|
||||
|
||||
The encoding swizzle and appropriate component weighting is enabled by using
|
||||
the `-normal` command line option. If you wish to use a different pair of
|
||||
components you can specify a custom swizzle after setting the `-normal`
|
||||
parameter. For example, to match BC5n component ordering use
|
||||
`-normal -esw gggr` for compression and `-normal -dsw arz1` for decompression.
|
||||
|
||||
## Encoding sRGB data
|
||||
|
||||
The ASTC LDR profile can compress sRGB encoded color, which is a more
|
||||
efficient use of bits than storing linear encoded color because the gamma
|
||||
corrected value distribution more closely matches human perception of
|
||||
luminance.
|
||||
|
||||
For color data it is nearly always a perceptual quality win to use sRGB input
|
||||
source textures that are then compressed using the ASTC sRGB compression mode
|
||||
(compress using the `-cs` command line option rather than the `-cl` command
|
||||
line option). Note that sRGB gamma correction is only applied to the RGB
|
||||
components during decode; the alpha component is always treated as linear
|
||||
encoded data.
|
||||
|
||||
*Important:* The uncompressed input texture provided on the command line must
|
||||
be stored in the sRGB color space for `-cs` to function correctly.
|
||||
|
||||
## Encoding HDR data
|
||||
|
||||
HDR data can be encoded just like LDR data, but with some caveats around
|
||||
handling the alpha component.
|
||||
|
||||
For many use cases the alpha component is an actual alpha opacity component and
|
||||
is therefore used for storing an LDR value between 0 and 1. For these cases use
|
||||
the `-ch` compressor option which will treat the RGB components as HDR, but the
|
||||
A component as LDR.
|
||||
|
||||
For other use cases the alpha component is simply a fourth data component which
|
||||
is also storing an HDR value. For these cases use the `-cH` compressor option
|
||||
which will treat all components as HDR data.
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2019-2024, Arm Limited and contributors. All rights reserved._
|
||||
@@ -0,0 +1,71 @@
|
||||
# The .astc File Format
|
||||
|
||||
The default file format for compressed textures generated by `astcenc`, as well
|
||||
as from many other ASTC compressors, is the `.astc` format. This is a very
|
||||
simple format consisting of a small header followed immediately by the binary
|
||||
payload for a single image surface.
|
||||
|
||||
Header
|
||||
======
|
||||
|
||||
The header is a fixed 16 byte structure, defined as storing only bytes to avoid
|
||||
any endianness issues or padding overhead.
|
||||
|
||||
```
|
||||
struct astc_header
|
||||
{
|
||||
uint8_t magic[4];
|
||||
uint8_t block_x;
|
||||
uint8_t block_y;
|
||||
uint8_t block_z;
|
||||
uint8_t dim_x[3];
|
||||
uint8_t dim_y[3];
|
||||
uint8_t dim_z[3];
|
||||
};
|
||||
```
|
||||
|
||||
Magic number
|
||||
------------
|
||||
|
||||
The 4 byte magic number at the start of the file acts as a format identifier.
|
||||
|
||||
```
|
||||
magic[0] = 0x13;
|
||||
magic[1] = 0xAB;
|
||||
magic[2] = 0xA1;
|
||||
magic[3] = 0x5C;
|
||||
```
|
||||
|
||||
Block size
|
||||
----------
|
||||
|
||||
The `block_*` fields store the ASTC block dimensions in texels. For 2D images
|
||||
the Z dimension must be set to 1.
|
||||
|
||||
Image dimensions
|
||||
----------------
|
||||
|
||||
The `dim_*` fields store the image dimensions in texels. For 2D images the
|
||||
Z dimension must be set to 1.
|
||||
|
||||
Note that the image is not required to be an exact multiple of the compressed
|
||||
block size; the compressed data may include padding that is discarded during
|
||||
decompression.
|
||||
|
||||
Each dimension is a 24 bit unsigned value that is reconstructed from the stored
|
||||
byte values as:
|
||||
|
||||
```
|
||||
decoded_dim = dim[0] + (dim[1] << 8) + (dim[2] << 16);
|
||||
```
|
||||
|
||||
Binary payload
|
||||
==============
|
||||
|
||||
The binary payload is a byte stream that immediately follows the header. It
|
||||
contains 16 bytes per compressed block. The number of compressed blocks is
|
||||
determined from the header information.
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
|
||||
@@ -0,0 +1,488 @@
|
||||
# ASTC Format Overview
|
||||
|
||||
Adaptive Scalable Texture Compression (ASTC) is an advanced lossy texture
|
||||
compression technology developed by Arm and AMD. It has been adopted as an
|
||||
official Khronos extension to the OpenGL and OpenGL ES APIs, and as a standard
|
||||
optional feature for the Vulkan API.
|
||||
|
||||
ASTC offers a number of advantages over earlier texture compression formats:
|
||||
|
||||
* **Format flexibility:** ASTC supports compressing between 1 and 4 channels of
|
||||
data, including support for one non-correlated channel such as RGB+A
|
||||
(correlated RGB, non-correlated alpha).
|
||||
* **Bit rate flexibility:** ASTC supports compressing images with a fine
|
||||
grained choice of bit rates between 0.89 and 8 bits per texel (bpt). The bit
|
||||
rate choice is independent of the color format choice.
|
||||
* **Advanced format support:** ASTC supports compressing images in either low
|
||||
dynamic range (LDR), LDR sRGB, or high dynamic range (HDR) color spaces, as
|
||||
well as support for compressing 3D volumetric textures.
|
||||
* **Improved image quality:** Despite the high degree of format flexibility,
|
||||
ASTC manages to beat nearly all legacy texture compression formats -- such as
|
||||
ETC2, PVRTC, and the BC formats -- on image quality at equivalent bit
|
||||
rates.
|
||||
|
||||
This article explores the ASTC format, and how it manages to generate the
|
||||
flexibility and quality improvements that it achieves.
|
||||
|
||||
|
||||
Why ASTC?
|
||||
=========
|
||||
|
||||
Before the creation of ASTC, the format and bit rate coverage of the available
|
||||
formats was very sparse:
|
||||
|
||||

|
||||
|
||||
In reality the situation is even worse than this diagram shows, as many of
|
||||
these formats are proprietary or simply not available on some operating
|
||||
systems, so any single platform will have very limited compression choices.
|
||||
|
||||
For developers this situation makes developing content which is portable across
|
||||
multiple platforms a tricky proposition. It's almost certain that differently
|
||||
compressed assets will be needed for different platforms. Each asset pack would
|
||||
likely then need to use different levels of compression, and may even have to
|
||||
fall back to no compression for some assets on some platforms, which leaves
|
||||
either some image quality or some memory bandwidth efficiency untapped.
|
||||
|
||||
It was clear a better way was needed, so the Khronos Group asked members to
|
||||
submit proposals for a new compression algorithm to be adopted in the same
|
||||
manner that the earlier ETC algorithm was adopted for OpenGL ES. ASTC was the
|
||||
result of this, and has been adopted as an official algorithm for OpenGL,
|
||||
OpenGL ES, and Vulkan.
|
||||
|
||||
|
||||
Format overview
|
||||
===============
|
||||
|
||||
Given the fragmentation issues with the existing compression formats, it should
|
||||
be no surprise that the high level design objectives for ASTC were to have
|
||||
something which could be used across the whole range of art assets found in
|
||||
modern content, and which allows artists to have more control over the quality
|
||||
to bit rate tradeoff.
|
||||
|
||||
There are quite a few technical components which make up the ASTC format, so
|
||||
before we dive into detail it will be useful to give an overview of how ASTC
|
||||
works at a higher level.
|
||||
|
||||
|
||||
Block compression
|
||||
-----------------
|
||||
|
||||
Compression formats for real-time graphics need the ability to quickly and
|
||||
efficiently make random samples into a texture. This places two technical
|
||||
requirements on any compression format:
|
||||
|
||||
* It must be possible to compute the address of data in memory given only a
|
||||
sample coordinate.
|
||||
* It must be possible to decompress random samples without decompressing too
|
||||
much surrounding data.
|
||||
|
||||
The standard solution for this used by all contemporary real-time formats,
|
||||
including ASTC, is to divide the image into fixed-size blocks of texels, each
|
||||
of which is compressed into a fixed number of output bits. This feature makes
|
||||
it possible to access texels quickly, in any order, and with a well-bounded
|
||||
decompression cost.
|
||||
|
||||
The 2D block footprints in ASTC range from 4x4 texels up to 12x12 texels, which
|
||||
all compress into 128-bit output blocks. By dividing 128 bits by the number of
|
||||
texels in the footprint, we derive the format bit rates which range from 8 bpt
|
||||
(`128/(4*4)`) down to 0.89 bpt (`128/(12*12)`).
|
||||
|
||||
|
||||
Color encoding
|
||||
--------------
|
||||
|
||||
ASTC uses gradients to assign the color values of each texel. Each compressed
|
||||
block stores the end-point colors for a gradient, and an interpolation weight
|
||||
for each texel which defines the texel's location along that gradient. During
|
||||
decompression the color value for each texel is generated by interpolating
|
||||
between the two end-point colors, based on the per-texel weight.
|
||||
|
||||

|
||||
|
||||
In many cases a block will contain a complex distribution of colors, for
|
||||
example a red ball sitting on green grass. In these scenarios a single color
|
||||
gradient will not be able to accurately represent all of the texels' values. To
|
||||
support this ASTC allows a block to define up to four distinct color gradients,
|
||||
known as partitions, and can assign each texel to a single partition. For our
|
||||
example we require two partitions, one for our ball texels and one for our
|
||||
grass texels.
|
||||
|
||||

|
||||
|
||||
Now that you know the high level operation of the format, we can dive into more
|
||||
detail.
|
||||
|
||||
|
||||
Integer encoding
|
||||
================
|
||||
|
||||
Initially the idea of fractional bits per texel sounds implausible, or even
|
||||
impossible, because we're so used to storing numbers as a whole number of bits.
|
||||
However, it's not quite as strange as it sounds. ASTC uses an encoding
|
||||
technique called Bounded Integer Sequence Encoding (BISE), which makes heavy
|
||||
use of storing numbers with a fractional number of bits to pack information
|
||||
more efficiently.
|
||||
|
||||
|
||||
Storing alphabets
|
||||
-----------------
|
||||
|
||||
Even though color and weight values per texel are notionally floating-point
|
||||
values, we have far too few bits available to directly store the actual values,
|
||||
so they must be quantized during compression to reduce the storage size. For
|
||||
example, if we have a floating-point weight for each texel in the range 0.0 to
|
||||
1.0 we could choose to quantize it to five values - 0.0, 0.25, 0.5, 0.75, and
|
||||
1.0 - which we can then represent in storage using the integer values 0 to 4.
|
||||
|
||||
In the general case we need to be able to efficiently store characters of an
|
||||
alphabet containing N symbols if we choose to quantize to N levels. An N symbol
|
||||
alphabet contains `log2(N)` bits of information per character. If we have an
|
||||
alphabet of 5 possible symbols then each character contains ~2.32 bits of
|
||||
information, but simple binary storage would require us to round up to 3 bits.
|
||||
This wastes 22.6% of our storage capacity. The chart below shows the percentage
|
||||
of our bit-space wasted when using simple binary encoding to store an arbitrary
|
||||
N symbol alphabet:
|
||||
|
||||

|
||||
|
||||
... which shows for most alphabet sizes we waste a lot of our storage capacity
|
||||
when using an integer number of bits per character. Efficiency is of critical
|
||||
importance to a compression format, so this is something we needed to be able
|
||||
to improve.
|
||||
|
||||
**Note:** We could have chosen to round-up the quantization level to the next
|
||||
power of two, and at least use the bits we're spending. However, this forces
|
||||
the encoder to spend bits which could be used elsewhere for a bigger benefit,
|
||||
so it will reduce image quality and is a sub-optimal solution.
|
||||
|
||||
|
||||
Quints
|
||||
------
|
||||
|
||||
Instead of rounding up a 5 symbol alphabet - called a "quint" in BISE - to
|
||||
three bits, we could choose to instead pack three quint characters together.
|
||||
Three characters in a 5-symbol alphabet have 5<sup>3</sup> (125) combinations,
|
||||
and contain 6.97 bits of information. We can store this in 7 bits and have a
|
||||
storage waste of only 0.5%.
|
||||
|
||||
|
||||
Trits
|
||||
-----
|
||||
|
||||
We can similarly construct a 3-symbol alphabet - called a "trit" in BISE - and
|
||||
pack trit characters in groups of five. Each character group has 3<sup>5</sup>
|
||||
(243) combinations, and contains 7.92 bits of information. We can store this in
|
||||
8 bits and have a storage waste of only 1%.
|
||||
|
||||
|
||||
BISE
|
||||
----
|
||||
|
||||
The BISE encoding used by ASTC allows storage of character sequences using
|
||||
arbitrary alphabets of up to 256 symbols, encoding each alphabet size in the
|
||||
most space-efficient choice of bits, trits, and quints.
|
||||
|
||||
* Alphabets with up to (2<sup>n</sup> - 1) symbols can be encoded using n bits
|
||||
per character.
|
||||
* Alphabets with up to (3 * 2<sup>n</sup> - 1) symbols can be encoded using n bits
|
||||
(m) and a trit (t) per character, and reconstructed using the equation
|
||||
(t * 2<sup>n</sup> + m).
|
||||
* Alphabets with up to (5 * 2<sup>n</sup> - 1) symbols can be encoded using n
|
||||
bits (m) and a quint (q) per character, and reconstructed using the equation
|
||||
(q * 2<sup>n</sup> + m).
|
||||
|
||||
When the number of characters in a sequence is not a multiple of three or five
|
||||
we need to avoid wasting storage at the end of the sequence, so we add another
|
||||
constraint on the encoding. If the last few values in the sequence to encode
|
||||
are zero, the last few bits in the encoded bit string must also be zero.
|
||||
Ideally, the number of non-zero bits should be easily calculated and not depend
|
||||
on the magnitudes of the previous encoded values. This is a little tricky to
|
||||
arrange during compression, but it is possible. This means that we do not need
|
||||
to store any padding after the end of the bit sequence, as we can safely assume
|
||||
that they are zero bits.
|
||||
|
||||
With this constraint in place - and by some smart packing of the bits, trits, and
|
||||
quints - BISE encodes a string of S characters in an N symbol alphabet using a
|
||||
fixed number of bits:
|
||||
|
||||
* S values up to (2<sup>n</sup> - 1) uses (nS) bits.
|
||||
* S values up to (3 * 2<sup>n</sup> - 1) uses (nS + ceil(8S / 5)) bits.
|
||||
* S values up to (5 * 2<sup>n</sup> - 1) uses (nS + ceil(7S / 3)) bits.
|
||||
|
||||
... and the compressor will choose the one of these which produces the smallest
|
||||
storage for the alphabet size being stored; some will use binary, some will use
|
||||
bits and a trit, and some will use bits and a quint. If we compare the storage
|
||||
efficiency of BISE against simple binary for the range of possible alphabet
|
||||
sizes we might want to encode we can see that it is much more efficient.
|
||||
|
||||

|
||||
|
||||
|
||||
Block sizes
|
||||
===========
|
||||
|
||||
ASTC always compresses blocks of texels into 128-bit outputs, but allows the
|
||||
developer to select from a range of block sizes to enable a fine-grained
|
||||
tradeoff between image quality and size.
|
||||
|
||||
| Block footprint | Bits/texel | | Block footprint | Bits/texel |
|
||||
| --------------- | ---------- | --- | --------------- | ---------- |
|
||||
| 4x4 | 8.00 | | 10x5 | 2.56 |
|
||||
| 5x4 | 6.40 | | 10x6 | 2.13 |
|
||||
| 5x5 | 5.12 | | 8x8 | 2.00 |
|
||||
| 6x5 | 4.27 | | 10x8 | 1.60 |
|
||||
| 6x6 | 3.56 | | 10x10 | 1.28 |
|
||||
| 8x5 | 3.20 | | 12x10 | 1.07 |
|
||||
| 8x6 | 2.67 | | 12x12 | 0.89 |
|
||||
|
||||
|
||||
|
||||
Color endpoints
|
||||
===============
|
||||
|
||||
The color data for a block is encoded as a gradient between two color
|
||||
endpoints, with each texel selecting a position along that gradient which is
|
||||
then interpolated during decompression. ASTC supports 16 color endpoint
|
||||
encoding schemes, known as "endpoint modes". Options for endpoint modes
|
||||
include:
|
||||
|
||||
* Varying the number of color channels: e.g. luminance, luminance + alpha, rgb,
|
||||
and rgba.
|
||||
* Varying the encoding method: e.g. direct, base+offset, base+scale,
|
||||
quantization level.
|
||||
* Varying the data range: e.g. low dynamic range, or high dynamic range
|
||||
|
||||
The endpoint modes, and the endpoint color BISE quantization level, can be
|
||||
chosen on a per-block basis.
|
||||
|
||||
|
||||
Color partitions
|
||||
================
|
||||
|
||||
Colors within a block are often complex, and cannot be accurately captured by a
|
||||
single color gradient, as discussed earlier with our example of a red ball
|
||||
lying on green grass. ASTC allows up to four color gradients - known as
|
||||
"partitions" - to be assigned to a single block. Each texel is then assigned to
|
||||
a single partition for the purposes of decompression.
|
||||
|
||||
Rather than directly storing the partition assignment for each texel, which
|
||||
would need a lot of decompressor hardware to store it for all block sizes, we
|
||||
generate it procedurally. Each block only needs to store the partition index -
|
||||
which is the seed for the procedural generator - and the per texel assignment
|
||||
can then be generated on-the-fly during decompression. The image below shows
|
||||
the generated texel assignments for two (top), three (middle), and four
|
||||
(bottom) partitions for the 8x8 block size.
|
||||
|
||||

|
||||
|
||||
The number of partitions and the partition index can be chosen on a per-block
|
||||
basis, and a different color endpoint mode can be chosen per partition.
|
||||
|
||||
**Note:** ASTC uses a 10-bit seed to drive the partition assignments. The hash
|
||||
used will introduce horizontal bias in a third of the partitions, vertical bias
|
||||
in a third, and no bias in the rest. As they are procedurally generated not all
|
||||
of the partitions are useful, in particular with the smaller block sizes.
|
||||
|
||||
* Many partitions are duplicates.
|
||||
* Many partitions are degenerate (an N partition hash results in at least one
|
||||
partition assignment that contains no texels).
|
||||
|
||||
|
||||
Texel weights
|
||||
=============
|
||||
|
||||
Each texel requires a weight, which defines the relative contribution of each
|
||||
color endpoint when interpolating the color gradient.
|
||||
|
||||
For smaller block sizes we can choose to store the weight directly, with one
|
||||
weight per texel, but for the larger block sizes we simply do not have enough
|
||||
bits of storage to do this. To work around this ASTC allows the weight grid to
|
||||
be stored at a lower resolution than the texel grid. The per-texel weights are
|
||||
interpolated from the stored weight grid during decompression using a bilinear
|
||||
interpolation.
|
||||
|
||||
The number of texel weights, and the weight value BISE quantization level, can
|
||||
be chosen on a per-block basis.
|
||||
|
||||
|
||||
Dual-plane weights
|
||||
------------------
|
||||
|
||||
Using a single weight for all color channels works well when there is good
|
||||
correlation across the channels, but this is not always the case. Common
|
||||
examples where we would expect to get low correlation at least some of the time
|
||||
are textures storing RGBA data - alpha masks are not usually closely
|
||||
correlated with the color value - or normal data - the X and Y normal values
|
||||
often change independently.
|
||||
|
||||
ASTC allows a dual-plane mode, which uses two separate weight grids for each
|
||||
texel. A single channel can be assigned to a second plane of weights, while
|
||||
the other three use the first plane of weights.
|
||||
|
||||
The use of dual-plane mode can be chosen on a per-block basis, but its use
|
||||
prevents the use of four color partitions as we do not have enough bits to
|
||||
concurrently store both an extra plane of weights and an extra set of color
|
||||
endpoints.
|
||||
|
||||
|
||||
End results
|
||||
===========
|
||||
|
||||
So, if we pull all of this together what do we end up with?
|
||||
|
||||
|
||||
Adaptive
|
||||
--------
|
||||
|
||||
The first word in the name of ASTC is "adaptive", and it should now hopefully
|
||||
be clear why. Each block always compresses into 128 bits of storage, but the
|
||||
developer can choose from a wide range of texel block sizes and the compressor
|
||||
gets a huge amount of latitude to determine how those 128 bits are used.
|
||||
|
||||
The compressor can trade off the number of bits assigned to colors (number of
|
||||
partitions, endpoint mode, and stored quantization level) and weights (number
|
||||
of weights per block, use of dual-plane, and stored quantization level) on a
|
||||
per-block basis to get the best image quality possible.
|
||||
|
||||

|
||||
|
||||
|
||||
Format support
|
||||
--------------
|
||||
|
||||
The compression scheme used by ASTC effectively compresses arbitrary sequences
|
||||
of floating point numbers, with a flexible number of channels, across any of
|
||||
the supported block sizes. There is no real notion of "color format" in the
|
||||
format itself at all, beyond the color endpoint mode selection, although a
|
||||
sensible compressor will want to use some format-specific heuristics to drive
|
||||
an efficient state-space search.
|
||||
|
||||
The orthogonal encoding design allows ASTC to provide almost complete coverage
|
||||
of our desirable format matrix from earlier, across a wide range of bit rates:
|
||||
|
||||

|
||||
|
||||
The only significant omission is the absence of a dedicated two channel
|
||||
encoding for HDR textures. We simply ran out of entries in the space we had for
|
||||
encoding color endpoint modes, and this one didn't make the cut.
|
||||
|
||||
The flexibility allowed by ASTC ticks the requirement that almost any asset can
|
||||
be compressed to some degree, at an appropriate bitrate for its quality needs.
|
||||
This is a powerful enabler for a compression format, because it puts control in
|
||||
the hands of content creators and not arbitrary format restrictions.
|
||||
|
||||
|
||||
Image quality
|
||||
-------------
|
||||
|
||||
The normal expectation would be that this level of format flexibility would
|
||||
come at a cost of image quality; it has to cost something, right? Luckily this
|
||||
isn't true. The high packing efficiency allowed by BISE encoding, and the
|
||||
ability to dynamically choose where to spend encoding space on a per-block
|
||||
basis, means that an ASTC compressor is not forced to spend bits on things that
|
||||
don't help image quality.
|
||||
|
||||
This gives some significant improvements in image quality compared to the older
|
||||
texture formats, even though ASTC also handles a much wider range of options.
|
||||
|
||||
* ASTC at 2 bpt outperforms PVRTC at 2 bpt by ~2.0dB.
|
||||
* ASTC at 3.56 bpt outperforms PVRTC and BC1 at 4 bpt by ~1.5dB, and ETC2 by
|
||||
~0.7dB, despite a 10% bit rate disadvantage.
|
||||
* ASTC at 8 bpt for LDR formats is comparable in quality to BC7 at 8 bpt.
|
||||
* ASTC at 8 bpt for HDR formats is comparable in quality to BC6H at 8 bpt.
|
||||
|
||||
Differences as small as 0.25dB are visible to the human eye, and remember that
|
||||
dB uses a logarithmic scale, so these are significant image quality
|
||||
improvements.
|
||||
|
||||
|
||||
3D compression
|
||||
--------------
|
||||
|
||||
One of the nice bonus features of ASTC is that the techniques which underpin
|
||||
the format generalize to compressing volumetric texture data without needing
|
||||
very much additional decompression hardware.
|
||||
|
||||
ASTC is therefore also able to optionally support compression of 3D textures,
|
||||
which is a unique feature not found in any earlier format, at the following
|
||||
bit rates:
|
||||
|
||||
| Block footprint | Bits/texel | | Block footprint | Bits/texel |
|
||||
| --------------- | ---------- | --- | --------------- | ---------- |
|
||||
| 3x3x3 | 4.74 | | 5x5x4 | 1.28 |
|
||||
| 4x3x3 | 3.56 | | 5x5x5 | 1.02 |
|
||||
| 4x4x3 | 2.67 | | 6x5x5 | 0.85 |
|
||||
| 4x4x4 | 2.00 | | 6x6x5 | 0.71 |
|
||||
| 5x4x4 | 1.60 | | 6x6x6 | 0.59 |
|
||||
|
||||
|
||||
Availability
|
||||
============
|
||||
|
||||
The ASTC functionality is specified as a set of feature profiles, allowing
|
||||
GPU hardware manufacturers to select which parts of the standard they
|
||||
implement. There are four commonly seen profiles:
|
||||
|
||||
* "LDR":
|
||||
* 2D blocks.
|
||||
* LDR and sRGB color space.
|
||||
* [KHR_texture_compression_astc_ldr][astc_ldr]: KHR OpenGL ES extension.
|
||||
* "LDR + Sliced 3D":
|
||||
* 2D blocks and sliced 3D blocks.
|
||||
* LDR and sRGB color space.
|
||||
* [KHR_texture_compression_astc_sliced_3d][astc_3d]: KHR OpenGL ES extension.
|
||||
* "HDR":
|
||||
* 2D and sliced 3D blocks.
|
||||
* LDR, sRGB, and HDR color spaces.
|
||||
* [KHR_texture_compression_astc_hdr][astc_ldr]: KHR OpenGL ES extension.
|
||||
* "Full":
|
||||
* 2D, sliced 3D, and volumetric 3D blocks.
|
||||
* LDR, sRGB, and HDR color spaces.
|
||||
* [OES_texture_compression_astc][astc_full]: OES OpenGL ES extension.
|
||||
|
||||
The LDR profile is mandatory in OpenGL ES 3.2 and a standardized optional
|
||||
feature for Vulkan, and therefore widely supported on contemporary mobile
|
||||
devices. The 2D HDR profile is not mandatory, but is widely supported.
|
||||
|
||||
3D texturing
|
||||
------------
|
||||
|
||||
The APIs expose 3D textures in two flavors.
|
||||
|
||||
The sliced 3D texture support builds a 3D texture from an array of 2D image
|
||||
slices that have each been individually compressed using 2D ASTC compression.
|
||||
This is required for the HDR profile, so is also widely supported.
|
||||
|
||||
The volumetric 3D texture support uses the native 3D block sizes provided by
|
||||
ASTC to implement true volumetric compression. This enables a wider choice of
|
||||
low bitrate options than the 2D blocks, which is particularly important for 3D
|
||||
textures of any non-trivial size. Volumetric formats are not widely supported,
|
||||
but are supported on all of the Arm Mali GPUs that support ASTC.
|
||||
|
||||
ASTC decode mode
|
||||
----------------
|
||||
|
||||
ASTC is specified to decompress texels into fp16 intermediate values, except
|
||||
for sRGB which always decompresses into 8-bit UNORM intermediates. For many use
|
||||
cases this gives more dynamic range and precision than required. This can cause
|
||||
a reduction in both texture cache efficiency and texture filtering performance
|
||||
due to the larger decompressed data size.
|
||||
|
||||
A pair of extensions exist, and are widely supported on recent mobile GPUs,
|
||||
which allow applications to reduce the intermediate precision to either UNORM8
|
||||
(recommended for LDR textures) or RGB9e5 (recommended for HDR textures).
|
||||
|
||||
* [EXT_texture_compression_astc_decode_mode][astc_decode]: Allow UNORM8
|
||||
intermediates
|
||||
* [EXT_texture_compression_astc_decode_mode_rgb9e5][astc_decode]: Allow RGB9e5
|
||||
intermediates
|
||||
|
||||
[astc_ldr]: https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_texture_compression_astc_hdr.txt
|
||||
[astc_3d]: https://www.khronos.org/registry/OpenGL/extensions/KHR/KHR_texture_compression_astc_sliced_3d.txt
|
||||
[astc_full]: https://www.khronos.org/registry/OpenGL/extensions/OES/OES_texture_compression_astc.txt
|
||||
[astc_decode]: https://www.khronos.org/registry/OpenGL/extensions/EXT/EXT_texture_compression_astc_decode_mode.txt
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
|
||||
|
After Width: | Height: | Size: 115 KiB |
|
After Width: | Height: | Size: 23 KiB |
|
After Width: | Height: | Size: 29 KiB |
|
After Width: | Height: | Size: 122 KiB |
|
After Width: | Height: | Size: 76 KiB |
|
After Width: | Height: | Size: 55 KiB |
|
After Width: | Height: | Size: 79 KiB |
|
After Width: | Height: | Size: 47 KiB |
@@ -0,0 +1,79 @@
|
||||
# Terminology for the ASTC Encoder
|
||||
|
||||
Like most software, the `astcenc` code base has a set of naming conventions
|
||||
for variables which are used to ensure both accuracy and reasonable brevity.
|
||||
|
||||
:construction: These conventions are being used for new patches, so new code
|
||||
will conform to this, but older code is still being cleaned up to follow
|
||||
these conventions.
|
||||
|
||||
## Counts
|
||||
|
||||
For counts of things prefer `<x>_count` rather than `<x>s`. For example:
|
||||
|
||||
* `plane_count`
|
||||
* `weight_count`
|
||||
* `texel_count`
|
||||
|
||||
Where possible aim for descriptive loop variables, as these are more literate
|
||||
than simple `i` or `j` variables. For example:
|
||||
|
||||
* `plane_index`
|
||||
* `weight_index`
|
||||
* `texel_index`
|
||||
|
||||
## Ideal, Unpacked Quantized, vs Packed Quantized
|
||||
|
||||
Variables that are quantized, such as endpoint colors and weights, have
|
||||
multiple states depending on how they are being used.
|
||||
|
||||
**Ideal values** represent arbitrary numeric values that can take any value.
|
||||
These are often used during compression to work out the best value before
|
||||
any quantization is applied. For example, integer weights in the 0-64 range can
|
||||
take any of the 65 values available.
|
||||
|
||||
**Quant uvalues** represent the unpacked numeric value after any quantization
|
||||
rounding has been applied. These are often used during compression to work out
|
||||
the error for the quantized value compared to the ideal value. For example,
|
||||
`QUANT_3` weights in the 0-64 range can only take one of `[0, 32, 64]`.
|
||||
|
||||
**Quant pvalues** represent the packed numeric value in the quantized alphabet.
|
||||
This is what ends up encoded in the ASTC data, although note that the encoded
|
||||
ordering is scrambled to simplify hardware. For example, `QUANT_3` weights
|
||||
originally in the 0-64 range can only take one of `[0, 1, 2]`.
|
||||
|
||||
For example:
|
||||
|
||||
* `weights_ideal_value`
|
||||
* `weights_quant_uvalue`
|
||||
* `weights_quant_pvalue`
|
||||
|
||||
## Full vs Decimated interpolation weights
|
||||
|
||||
Weight grids have multiple states depending on how they are being used.
|
||||
|
||||
**full_weights** represent per texel weight grids, storing one weight per texel.
|
||||
|
||||
**decimated_weights** represent reduced weight grids, which can store fewer
|
||||
weights and which are bilinear interpolated to generate the full weight grid.
|
||||
|
||||
Full weights have no variable prefix, but decimated weights are stored with
|
||||
a `dec_` prefix.
|
||||
|
||||
* `dec_weights_ideal_value`
|
||||
* `dec_weights_quant_uvalue`
|
||||
* `dec_weights_quant_pvalue`
|
||||
|
||||
## Weight vs Significance
|
||||
|
||||
The original encoder used "weight" for multiple purposes - texel significance
|
||||
(weight the error), color channel significance (weight the error), and endpoint
|
||||
interpolation weights. This gets very confusing in functions using all three!
|
||||
|
||||
We are slowly refactoring the code to only use "weight" to mean the endpoint
|
||||
interpolation weights. The error weighting factors used for other purposes are
|
||||
being updated to use the term "significance".
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2020-2022, Arm Limited and contributors. All rights reserved._
|
||||
@@ -0,0 +1,120 @@
|
||||
# Testing astcenc
|
||||
|
||||
The repository contains a small suite of tests which can be used to sanity
|
||||
check source code changes to the compressor. It must be noted that this test
|
||||
suite is relatively limited in scope and does not cover every feature or
|
||||
bitrate of the standard.
|
||||
|
||||
# Required software
|
||||
|
||||
Running the tests requires Python 3.7 to be installed on the host machine, and
|
||||
an `astcenc-avx2` release build to have been previously compiled and installed
|
||||
into a directory called `astcenc` in the root of the git checkout. This
|
||||
can be achieved by configuring the CMake build using the install prefix
|
||||
`-DCMAKE_INSTALL_PREFIX=../` and then running a build with the `install` build
|
||||
target.
|
||||
|
||||
# Running C++ unit tests
|
||||
|
||||
We support a small (but growing) number of C++ unit tests, which are written
|
||||
using the `googletest` framework and integrated in the CMake "CTest" test
|
||||
framework.
|
||||
|
||||
To build unit tests pull the `googletest` git submodule and add
|
||||
`-DASTCENC_UNITTEST=ON` to the CMake command line when configuring.
|
||||
|
||||
To run unit tests use the CMake `ctest` utility from your build directory after
|
||||
you have built the tests.
|
||||
|
||||
```shell
|
||||
cd build
|
||||
ctest --verbose
|
||||
```
|
||||
|
||||
# Running command line tests
|
||||
|
||||
To run the command line tests, which aim to get coverage of the command line
|
||||
options and core codec stability without testing the compression quality
|
||||
itself, run the command line:
|
||||
|
||||
python3 -m unittest discover -s Test -p astc_test*.py -v
|
||||
|
||||
# Running image tests
|
||||
|
||||
To run the image test suite run the following command from the root directory
|
||||
of the repository:
|
||||
|
||||
python3 ./Test/astc_test_image.py
|
||||
|
||||
This will run through a series of image compression tests, comparing the image
|
||||
PSNR against a set of reference results from the last stable baseline. The test
|
||||
will fail if any reduction in PSNR above a set threshold is detected. Note that
|
||||
performance information is reported, but regressions will not flag a failure.
|
||||
|
||||
For debug purposes, all decompressed test output images and result CSV files
|
||||
are stored in the `TestOutput` directory, using the same test set structure as
|
||||
the `Test/Images` folder.
|
||||
|
||||
## Test selection
|
||||
|
||||
The runner supports a number of options to filter down what is run, enabling
|
||||
developers to focus local testing on the parts of the code they are working on.
|
||||
|
||||
* `--encoder` selects which encoder to run. By default the `avx2` encoder is
|
||||
selected. Note that some out-of-tree reference encoders (older encoders, and
|
||||
some third-party encoders) are supported for comparison purposes. These will
|
||||
not work without the binaries being manually provided; they are not
|
||||
distributed here.
|
||||
* `--test-set` selects which image set to run. By default the `Small` image
|
||||
test set is selected, which aims to provide basic coverage of many different
|
||||
color formats and color profiles.
|
||||
* `--block-size` selects which block size to run. By default a range of
|
||||
block sizes (2D and 3D) are used.
|
||||
* `--color-profile` selects which color profiles from the standard should be
|
||||
used (LDR, LDR sRGB, or HDR) to select images. By default all are selected.
|
||||
* `--color-format` selects which color formats should be used (L, XY, RGB,
|
||||
RGBA) to select images. By default all are selected.
|
||||
|
||||
## Performance tests
|
||||
|
||||
To provide less noisy performance results the test suite supports compressing
|
||||
each image multiple times and returning the best measured performance. To
|
||||
enable this mode use the following options:
|
||||
|
||||
* `--repeats <M>` : Run M test compression passes which are timed.
|
||||
|
||||
**Note:** The reference CSV contains performance results measured on an Intel
|
||||
Core i5 9600K running at 4.3GHz, running each test 5 times.
|
||||
|
||||
## Updating reference data
|
||||
|
||||
The reference PSNR and performance scores are stored in CSVs committed to the
|
||||
repository. This data is created by running the tests using the last stable
|
||||
release on a standard test machine we use for performance testing builds.
|
||||
|
||||
It can be useful for developers to rebuild the reference results for their
|
||||
local machine, in particular for measuring performance improvements. To build
|
||||
new reference CSVs, download the current reference `astcenc` binary (1.7) from
|
||||
GitHub for your host OS and place it in to the `./Binaries/1.7/` directory.
|
||||
Once this is done, run the command:
|
||||
|
||||
python3 ./Test/astc_test_image.py --encoder 1.7 --test-set all --repeats 5
|
||||
|
||||
... to regenerate the reference CSV files.
|
||||
|
||||
**WARNING:** This can take some hours to complete, and it is best done when the
|
||||
test suite gets exclusive use of the machine to avoid other processing slowing
|
||||
down the compression and disturbing the performance data. It is recommended to
|
||||
shutdown or disable any background applications that are running.
|
||||
|
||||
## Valgrind memcheck
|
||||
|
||||
It is always worth running the Valgrind memcheck tool to validate that we have
|
||||
not introduced any obvious memory errors. Build a release build with symbol
|
||||
information with `-DCMAKE_BUILD_TYPE=RelWithDebInfo` and then run:
|
||||
|
||||
valgrind --tool=memcheck --track-origins=yes <command>
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2019-2022, Arm Limited and contributors. All rights reserved._
|
||||
@@ -0,0 +1,250 @@
|
||||
# About
|
||||
|
||||
The Arm® Adaptive Scalable Texture Compression (ASTC) Encoder, `astcenc`, is
|
||||
a command-line tool for compressing and decompressing images using the ASTC
|
||||
texture compression standard.
|
||||
|
||||
## The ASTC format
|
||||
|
||||
The ASTC compressed data format, developed by Arm® and AMD, has been adopted as
|
||||
an official extension to the OpenGL®, OpenGL ES, and Vulkan® graphics APIs. It
|
||||
provides a major step forward in terms of both the image quality at a given
|
||||
bitrate, and the format and bitrate flexibility available to content creators.
|
||||
This allows more assets to use compression, often at a reduced bitrate compared
|
||||
to other formats, reducing memory storage and bandwidth requirements.
|
||||
|
||||
Read the [ASTC Format Overview][1] for a quick introduction to the format, or
|
||||
read the full [Khronos Data Format Specification][2] for all the details.
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under the Apache 2.0 license. By downloading any
|
||||
component from this repository you acknowledge that you accept terms specified
|
||||
in the [LICENSE.txt](LICENSE.txt) file.
|
||||
|
||||
# Encoder feature support
|
||||
|
||||
The encoder supports compression of low dynamic range (BMP, JPEG, PNG, TGA) and
|
||||
high dynamic range (EXR, HDR) images, as well as a subset of image data wrapped
|
||||
in the DDS and KTX container formats, into ASTC or KTX format output images.
|
||||
|
||||
The decoder supports decompression of ASTC or KTX format input images into low
|
||||
dynamic range (BMP, PNG, TGA), high dynamic range (EXR, HDR), or DDS and KTX
|
||||
wrapped output images.
|
||||
|
||||
The encoder allows control over the compression time/quality tradeoff with
|
||||
`exhaustive`, `verythorough`, `thorough`, `medium`, `fast`, and `fastest`
|
||||
encoding quality presets.
|
||||
|
||||
The encoder allows compression time and quality analysis by reporting the
|
||||
compression time, and the Peak Signal-to-Noise Ratio (PSNR) between the input
|
||||
image and the compressed output.
|
||||
|
||||
## ASTC format support
|
||||
|
||||
The `astcenc` compressor supports generation of images for all three profiles
|
||||
allowed by the ASTC specification:
|
||||
|
||||
* 2D Low Dynamic Range (LDR profile)
|
||||
* 2D LDR and High Dynamic Range (HDR profile)
|
||||
* 2D and 3D, LDR and HDR (Full profile)
|
||||
|
||||
It also supports all of the ASTC block sizes and compression modes, allowing
|
||||
content creators to use the full spectrum of quality-to-bitrate options ranging
|
||||
from 0.89 bits/pixel up to 8 bits/pixel.
|
||||
|
||||
# Prebuilt binaries
|
||||
|
||||
Release build binaries for the `astcenc` stable releases are provided in the
|
||||
[GitHub Releases page][3].
|
||||
|
||||
* Change log: [5.x series](./Docs/ChangeLog-5x.md)
|
||||
|
||||
Binaries are provided for 64-bit builds on Windows, macOS, and Linux.
|
||||
|
||||
## Windows and Linux
|
||||
|
||||
For Windows and Linux the builds of the astcenc are provided as multiple
|
||||
binaries, each tuned for a specific SIMD instruction set.
|
||||
|
||||
For x86-64 we provide, in order of increasing performance:
|
||||
|
||||
* `astcenc-sse2` - uses SSE2
|
||||
* `astcenc-sse4.1` - uses SSE4.1 and POPCNT
|
||||
* `astcenc-avx2` - uses AVX2, SSE4.2, POPCNT, and F16C
|
||||
|
||||
The x86-64 SSE2 builds will work on all x86-64 machines, but it is the slowest
|
||||
of the three. The other two require extended CPU instruction set support which
|
||||
is not universally available, but each step gains ~15% more performance.
|
||||
|
||||
For Arm we provide, in order of increasing performance:
|
||||
|
||||
* `astcenc-sve_256` - uses 256-bit SVE
|
||||
* `astcenc-sve_128` - uses 128-bit SVE
|
||||
* `astcenc-neon` - uses NEON
|
||||
|
||||
Note: The Arm Scalable Vector Extensions (SVE) allow CPUs to have a variable
|
||||
vector length. The astcenc implementation is not written in a length-agnostic
|
||||
style and requires the binary to match the vector length on the host CPU.
|
||||
|
||||
## macOS
|
||||
|
||||
For macOS devices we provide a single universal binary `astcenc`, which allows
|
||||
the OS to automatically use the correct binary variant for the current host
|
||||
machine. Support is provided for three architecture slices:
|
||||
|
||||
* `x86_64` - uses the `astcenc-sse4.1` build defined above.
|
||||
* `x86_64h` - uses the `astcenc-avx2` build defined above.
|
||||
* `arm64` - uses the `astcenc-neon` build defined above.
|
||||
|
||||
## Repository branches
|
||||
|
||||
The `main` branch is an active development branch for the compressor. It aims
|
||||
to be a stable branch for the latest major release series, but as it is used
|
||||
for ongoing development expect it to have some volatility. We recommend using
|
||||
the latest stable release tag for production development.
|
||||
|
||||
The `4.x` branch is a stable branch for the older 4.x release series. It is no
|
||||
longer under active development, but is a supported branch that continues to
|
||||
get back-ported bug fixes.
|
||||
|
||||
The `1.x`, `2.x`, and `3.x` branches are stable branches for older releases.
|
||||
They are no longer under active development or getting bug fixes.
|
||||
|
||||
Any other branches you might find are development branches for new features or
|
||||
optimizations, so might be interesting to play with but should be considered
|
||||
transient and unstable.
|
||||
|
||||
|
||||
# Getting started
|
||||
|
||||
Open a terminal, change to the appropriate directory for your system, and run
|
||||
the astcenc encoder program, like this on Linux or macOS:
|
||||
|
||||
./astcenc
|
||||
|
||||
... or like this on Windows:
|
||||
|
||||
astcenc
|
||||
|
||||
Invoking `astcenc -help` gives an extensive help message, including usage
|
||||
instructions and details of all available command line options. A summary of
|
||||
the main encoder options is shown below.
|
||||
|
||||
## Compressing an image
|
||||
|
||||
Compress an image using the `-cl` \ `-cs` \ `-ch` \ `-cH` modes. For example:
|
||||
|
||||
astcenc -cl example.png example.astc 6x6 -medium
|
||||
|
||||
This compresses `example.png` using the LDR color profile and a 6x6 block
|
||||
footprint (3.56 bits/pixel). The `-medium` quality preset gives a reasonable
|
||||
image quality for a relatively fast compression speed, so is a good starting
|
||||
point for compression. The output is stored to a linear color space compressed
|
||||
image, `example.astc`.
|
||||
|
||||
The modes available are:
|
||||
|
||||
* `-cl` : use the linear LDR color profile.
|
||||
* `-cs` : use the sRGB LDR color profile.
|
||||
* `-ch` : use the HDR color profile, tuned for HDR RGB and LDR A.
|
||||
* `-cH` : use the HDR color profile, tuned for HDR RGBA.
|
||||
|
||||
If you intend to use the resulting image with the decode mode extensions to
|
||||
limit the decompressed precision to UNORM8, it is recommended that you also
|
||||
specify the `-decode_unorm8` flag. This will ensure that the compressor uses
|
||||
the correct rounding rules when choosing encodings.
|
||||
|
||||
## Decompressing an image
|
||||
|
||||
Decompress an image using the `-dl` \ `-ds` \ `-dh` \ `-dH` modes. For example:
|
||||
|
||||
astcenc -dh example.astc example.tga
|
||||
|
||||
This decompresses `example.astc` using the full HDR feature profile, storing
|
||||
the decompressed output to `example.tga`.
|
||||
|
||||
The modes available mirror the options used for compression, but use a `d`
|
||||
prefix. Note that for decompression there is no difference between the two HDR
|
||||
modes, they are both provided simply to maintain symmetry across operations.
|
||||
|
||||
## Measuring image quality
|
||||
|
||||
Review the compression quality using the `-tl` \ `-ts` \ `-th` \ `-tH` modes.
|
||||
For example:
|
||||
|
||||
astcenc -tl example.png example.tga 5x5 -thorough
|
||||
|
||||
This is equivalent to using the LDR color profile and a 5x5 block size
|
||||
to compress the image, using the `-thorough` quality preset, and then
|
||||
immediately decompressing the image and saving the result. This can be used
|
||||
to enable a visual inspection of the compressed image quality. In addition
|
||||
this mode also prints out some image quality metrics to the console.
|
||||
|
||||
The modes available mirror the options used for compression, but use a `t`
|
||||
prefix.
|
||||
|
||||
## Experimenting
|
||||
|
||||
Efficient real-time graphics benefits from minimizing compressed texture size,
|
||||
as it reduces memory footprint, reduces memory bandwidth, saves energy, and can
|
||||
improve texture cache efficiency. However, like any lossy compression format
|
||||
there will come a point where the compressed image quality is unacceptable
|
||||
because there are simply not enough bits to represent the output with the
|
||||
precision needed. We recommend experimenting with the block footprint to find
|
||||
the optimum balance between size and quality, as the finely adjustable
|
||||
compression ratio is one of the major strengths of the ASTC format.
|
||||
|
||||
The compression speed can be controlled from `-fastest`, through `-fast`,
|
||||
`-medium` and `-thorough`, up to `-exhaustive`. In general, the more time the
|
||||
encoder has to spend looking for good encodings the better the results, but it
|
||||
does result in increasingly small improvements for the amount of time required.
|
||||
|
||||
There are many other command line options for tuning the encoder parameters
|
||||
which can be used to fine tune the compression algorithm. See the command line
|
||||
help message for more details.
|
||||
|
||||
# Documentation
|
||||
|
||||
The [ASTC Format Overview](./Docs/FormatOverview.md) page provides a high level
|
||||
introduction to the ASTC texture format, how it encodes data, and why it is
|
||||
both flexible and efficient.
|
||||
|
||||
The [Effective ASTC Encoding](./Docs/Encoding.md) page looks at some of the
|
||||
guidelines that should be followed when compressing data using `astcenc`.
|
||||
It covers:
|
||||
|
||||
* How to efficiently encode data with fewer than 4 channels.
|
||||
* How to efficiently encode normal maps, sRGB data, and HDR data.
|
||||
* Coding equivalents to other compression formats.
|
||||
|
||||
The [ASTC Developer Guide][5] document (external link) provides a more detailed
|
||||
guide for developers using the `astcenc` compressor.
|
||||
|
||||
The [.astc File Format](./Docs/FileFormat.md) page provides a light-weight
|
||||
specification for the `.astc` file format and how to read or write it.
|
||||
|
||||
The [Building ASTC Encoder](./Docs/Building.md) page provides instructions on
|
||||
how to build `astcenc` from the sources in this repository.
|
||||
|
||||
The [Testing ASTC Encoder](./Docs/Testing.md) page provides instructions on
|
||||
how to test any modifications to the source code in this repository.
|
||||
|
||||
# Support
|
||||
|
||||
If you have issues with the `astcenc` encoder, or questions about the ASTC
|
||||
texture format itself, please raise them in the GitHub issue tracker.
|
||||
|
||||
If you have any questions about Arm GPUs, application development for Arm GPUs,
|
||||
or general mobile graphics development or technology please submit them on the
|
||||
[Arm Community graphics forums][4].
|
||||
|
||||
- - -
|
||||
|
||||
_Copyright © 2013-2025, Arm Limited and contributors. All rights reserved._
|
||||
|
||||
[1]: ./Docs/FormatOverview.md
|
||||
[2]: https://www.khronos.org/registry/DataFormat/specs/1.4/dataformat.1.4.html#ASTC
|
||||
[3]: https://github.com/ARM-software/astc-encoder/releases
|
||||
[4]: https://community.arm.com/support-forums/f/graphics-gaming-and-vr-forum/
|
||||
[5]: https://developer.arm.com/documentation/102162/latest/?lang=en
|
||||
@@ -0,0 +1,126 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020-2025 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
# of the License at:
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
# Overwrite the LTO flags to force fat LTO; worth 3-4% performance
|
||||
# See https://gitlab.kitware.com/cmake/cmake/-/issues/16808
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND ${ASTCENC_CLI})
|
||||
set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto")
|
||||
endif()
|
||||
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND ${ASTCENC_CLI})
|
||||
set(CMAKE_CXX_COMPILE_OPTIONS_IPO "-flto=auto")
|
||||
endif()
|
||||
|
||||
if(${ASTCENC_DECOMPRESSOR})
|
||||
set(ASTCENC_CODEC dec)
|
||||
else()
|
||||
set(ASTCENC_CODEC enc)
|
||||
endif()
|
||||
|
||||
set(ASTCENC_ARTIFACTS native none sve_256 sve_128 neon avx2 sse4.1 sse2)
|
||||
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_SVE_128} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
|
||||
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
|
||||
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
|
||||
|
||||
foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
|
||||
list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
|
||||
list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
|
||||
if(${ASTCENC_CONFIG})
|
||||
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
|
||||
|
||||
if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
|
||||
# Not supported on macOS
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
|
||||
# Not supported on macOS
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
|
||||
set(CMAKE_OSX_ARCHITECTURES arm64)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64h)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "none")
|
||||
# Using "none" uses implicit architecture
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "native")
|
||||
# Using "native" uses implicit architecture
|
||||
else()
|
||||
message(FATAL_ERROR "'${ASTCENC_ISA_SIMD}' is unknown ISA")
|
||||
endif()
|
||||
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
if(${ASTCENC_CLI} AND ${ASTCENC_UNIVERSAL_BUILD})
|
||||
add_custom_target(
|
||||
astc${ASTCENC_CODEC}
|
||||
ALL
|
||||
COMMAND
|
||||
lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC} -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon>
|
||||
VERBATIM)
|
||||
|
||||
add_dependencies(
|
||||
astc${ASTCENC_CODEC}
|
||||
astc${ASTCENC_CODEC}-sse4.1
|
||||
astc${ASTCENC_CODEC}-avx2
|
||||
astc${ASTCENC_CODEC}-neon)
|
||||
|
||||
install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1>/astc${ASTCENC_CODEC}
|
||||
DESTINATION bin)
|
||||
endif()
|
||||
|
||||
if(${ASTCENC_SHAREDLIB} AND ${ASTCENC_UNIVERSAL_BUILD})
|
||||
add_custom_target(
|
||||
astc${ASTCENC_CODEC}-shared
|
||||
ALL
|
||||
COMMAND
|
||||
lipo -create -output $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib -arch x86_64 $<TARGET_FILE:astc${ASTCENC_CODEC}-sse4.1-shared> -arch x86_64h $<TARGET_FILE:astc${ASTCENC_CODEC}-avx2-shared> -arch arm64 $<TARGET_FILE:astc${ASTCENC_CODEC}-neon-shared>
|
||||
VERBATIM)
|
||||
|
||||
add_dependencies(
|
||||
astc${ASTCENC_CODEC}-shared
|
||||
astc${ASTCENC_CODEC}-sse4.1-shared
|
||||
astc${ASTCENC_CODEC}-avx2-shared
|
||||
astc${ASTCENC_CODEC}-neon-shared)
|
||||
|
||||
install(PROGRAMS $<TARGET_FILE_DIR:astc${ASTCENC_CODEC}-sse4.1-shared>/libastc${ASTCENC_CODEC}-shared.dylib
|
||||
DESTINATION lib)
|
||||
endif()
|
||||
|
||||
# - - - - - - - - - - - - - - - - - -
|
||||
# Unit testing
|
||||
if(${ASTCENC_UNITTEST})
|
||||
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64;arm64)
|
||||
add_subdirectory(GoogleTest)
|
||||
|
||||
# Workaround GoogleTest CRT selection issue
|
||||
# See https://github.com/google/googletest/issues/4067
|
||||
set_property(
|
||||
TARGET gtest
|
||||
PROPERTY
|
||||
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
|
||||
|
||||
set_property(
|
||||
TARGET gtest_main
|
||||
PROPERTY
|
||||
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
|
||||
|
||||
enable_testing()
|
||||
add_subdirectory(UnitTest)
|
||||
endif()
|
||||
@@ -0,0 +1,106 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2021 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Fuzz target for physical_to_symbolic().
|
||||
*
|
||||
* This function is the first entrypoint for decompressing a 16 byte block of
|
||||
* input ASTC data from disk. The 16 bytes can contain arbitrary data; they
|
||||
* are read from an external source, but the block size used must be a valid
|
||||
* ASTC block footprint.
|
||||
*/
|
||||
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#include <fuzzer/FuzzedDataProvider.h>
|
||||
#include <array>
|
||||
#include <vector>
|
||||
|
||||
/**
 * @brief One block footprint to fuzz, given as its size along each axis.
 */
struct BlockSizes
{
    int x;  // Block size along X
    int y;  // Block size along Y
    int z;  // Block size along Z; the 2D entries in testSz use 1
};
|
||||
|
||||
std::array<BlockSizes, 3> testSz {{
|
||||
{ 4, 4, 1}, // Highest bitrate
|
||||
{12, 12, 1}, // Largest 2D block
|
||||
{6, 6, 6} // Largest 3D block
|
||||
}};
|
||||
|
||||
std::array<block_size_descriptor, 3> testBSD;
|
||||
|
||||
/**
|
||||
* @brief Utility function to create all of the block size descriptors needed.
|
||||
*
|
||||
* This is triggered once via a static initializer.
|
||||
*
|
||||
* Triggering once is important so that we only create a single BSD per block
|
||||
* size we need, rather than one per fuzzer iteration (it's expensive). This
|
||||
* improves fuzzer throughput by ~ 1000x!
|
||||
*
|
||||
* Triggering via a static initializer, rather than a lazy init in the fuzzer
|
||||
* function, is important because is means that the BSD is allocated before
|
||||
* fuzzing starts. This means that leaksanitizer will ignore the fact that we
|
||||
* "leak" the dynamic allocations inside the BSD (we never call term()).
|
||||
*/
|
||||
bool bsd_initializer()
|
||||
{
|
||||
for (int i = 0; i < testSz.size(); i++)
|
||||
{
|
||||
init_block_size_descriptor(
|
||||
testSz[i].x,
|
||||
testSz[i].y,
|
||||
testSz[i].z,
|
||||
false,
|
||||
4,
|
||||
1.0f,
|
||||
testBSD[i]);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
|
||||
{
|
||||
// Preinitialize the block size descriptors we need
|
||||
static bool init = bsd_initializer();
|
||||
|
||||
// Must have 4 (select block size) and 16 (payload) bytes
|
||||
if (size < 4 + 16)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
FuzzedDataProvider stream(data, size);
|
||||
|
||||
// Select a block size to test
|
||||
int i = stream.ConsumeIntegralInRange<int>(0, testSz.size() - 1);
|
||||
|
||||
// Populate the physical block
|
||||
uint8_t pcb[16];
|
||||
std::vector<uint8_t> buffer = stream.ConsumeBytes<uint8_t>(16);
|
||||
std::memcpy(pcb, buffer.data(), 16);
|
||||
|
||||
// Call the function under test
|
||||
symbolic_compressed_block scb;
|
||||
physical_to_symbolic(testBSD[i], pcb, scb);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,51 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020-2024 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
# of the License at:
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
set(ASTCENC_ARTIFACTS native none sve_256 sve_128 neon avx2 sse4.1 sse2)
|
||||
set(ASTCENC_CONFIGS ${ASTCENC_ISA_NATIVE} ${ASTCENC_ISA_NONE} ${ASTCENC_ISA_SVE_256} ${ASTCENC_ISA_SVE_128} ${ASTCENC_ISA_NEON} ${ASTCENC_ISA_AVX2} ${ASTCENC_ISA_SSE41} ${ASTCENC_ISA_SSE2})
|
||||
list(LENGTH ASTCENC_ARTIFACTS ASTCENC_ARTIFACTS_LEN)
|
||||
math(EXPR ASTCENC_ARTIFACTS_LEN "${ASTCENC_ARTIFACTS_LEN} - 1")
|
||||
|
||||
foreach(INDEX RANGE ${ASTCENC_ARTIFACTS_LEN})
|
||||
list(GET ASTCENC_ARTIFACTS ${INDEX} ASTCENC_ARTIFACT)
|
||||
list(GET ASTCENC_CONFIGS ${INDEX} ASTCENC_CONFIG)
|
||||
if(${ASTCENC_CONFIG})
|
||||
set(ASTCENC_ISA_SIMD ${ASTCENC_ARTIFACT})
|
||||
|
||||
if(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
|
||||
# Not supported on macOS
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
|
||||
# Not supported on macOS
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
|
||||
set(CMAKE_OSX_ARCHITECTURES arm64)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
|
||||
set(CMAKE_OSX_ARCHITECTURES x86_64h)
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "none")
|
||||
# Using "none" uses implicit architecture
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "native")
|
||||
# Using "native" uses implicit architecture
|
||||
else()
|
||||
message(FATAL_ERROR "'${ASTCENC_ISA_SIMD}' is unknown ISA")
|
||||
endif()
|
||||
|
||||
include(cmake_core.cmake)
|
||||
endif()
|
||||
endforeach()
|
||||
@@ -0,0 +1,198 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ----------------------------------------------------------------------------
|
||||
# Copyright 2020-2025 Arm Limited
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
# use this file except in compliance with the License. You may obtain a copy
|
||||
# of the License at:
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
include(../cmake_compiler.cmake)
|
||||
|
||||
set(ASTCENC_TEST test-unit-${ASTCENC_ISA_SIMD})
|
||||
|
||||
add_executable(${ASTCENC_TEST})
|
||||
|
||||
set_property(TARGET ${ASTCENC_TEST}
|
||||
PROPERTY
|
||||
CXX_STANDARD 17)
|
||||
|
||||
# Enable LTO under the conditions where the codec library will use LTO.
|
||||
# The library link will fail if the settings don't match
|
||||
if(${ASTCENC_CLI})
|
||||
set_property(TARGET ${ASTCENC_TEST}
|
||||
PROPERTY
|
||||
INTERPROCEDURAL_OPTIMIZATION_RELEASE True)
|
||||
endif()
|
||||
|
||||
# Use a static runtime on MSVC builds (ignored on non-MSVC compilers)
|
||||
set_property(TARGET ${ASTCENC_TEST}
|
||||
PROPERTY
|
||||
MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
|
||||
|
||||
|
||||
target_sources(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
test_simd.cpp
|
||||
test_softfloat.cpp
|
||||
test_decode.cpp)
|
||||
|
||||
target_include_directories(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
${gtest_SOURCE_DIR}/include)
|
||||
|
||||
target_link_libraries(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
astcenc-${ASTCENC_ISA_SIMD}-static)
|
||||
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
# Use pthreads on Linux/macOS
|
||||
$<$<PLATFORM_ID:Linux,Darwin>:-pthread>
|
||||
|
||||
# MSVC compiler defines
|
||||
$<${is_msvc_fe}:/EHsc>
|
||||
$<$<AND:$<BOOL:${ASTCENC_WERROR}>,${is_msvc_fe}>:/WX>
|
||||
$<${is_msvccl}:/wd4324>
|
||||
|
||||
# G++ and Clang++ compiler defines
|
||||
$<${is_gnu_fe}:-Wall>
|
||||
$<${is_gnu_fe}:-Wextra>
|
||||
$<${is_gnu_fe}:-Wpedantic>
|
||||
$<$<AND:$<BOOL:${ASTCENC_WERROR}>,${is_gnu_fe}>:-Werror>
|
||||
$<${is_gnu_fe}:-Wshadow>
|
||||
$<${is_gnu_fe}:-Wdouble-promotion>
|
||||
$<${is_clang}:-Wdocumentation>
|
||||
|
||||
# Hide noise thrown up by Clang 10 and clang-cl
|
||||
$<${is_gnu_fe}:-Wno-unknown-warning-option>
|
||||
$<${is_gnu_fe}:-Wno-c++98-compat-pedantic>
|
||||
$<${is_gnu_fe}:-Wno-c++98-c++11-compat-pedantic>
|
||||
$<${is_gnu_fe}:-Wno-float-equal>
|
||||
$<${is_gnu_fe}:-Wno-overriding-option>
|
||||
$<${is_gnu_fe}:-Wno-unsafe-buffer-usage>
|
||||
$<${is_clang}:-Wno-switch-default>
|
||||
|
||||
# Ignore things that the googletest build triggers
|
||||
$<${is_gnu_fe}:-Wno-unknown-warning-option>
|
||||
$<${is_gnu_fe}:-Wno-double-promotion>
|
||||
$<${is_gnu_fe}:-Wno-undef>
|
||||
$<${is_gnu_fe}:-Wno-reserved-identifier>
|
||||
$<${is_gnu_fe}:-Wno-global-constructors>)
|
||||
|
||||
# Set up configuration for SIMD ISA builds
|
||||
if(${ASTCENC_ISA_SIMD} MATCHES "none")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SVE=0
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
if(${ASTCENC_BIG_ENDIAN})
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_BIG_ENDIAN=1)
|
||||
endif()
|
||||
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "neon")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=1
|
||||
ASTCENC_SVE=0
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_256")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=1
|
||||
ASTCENC_SVE=8
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
# Enable SVE
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
-march=armv8-a+sve -msve-vector-bits=256)
|
||||
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sve_128")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=1
|
||||
ASTCENC_SVE=4
|
||||
ASTCENC_SSE=0
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
# Enable SVE
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
-march=armv8-a+sve)
|
||||
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse2")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SVE=0
|
||||
ASTCENC_SSE=20
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=0
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
$<$<CXX_COMPILER_ID:${GNU_LIKE}>:-msse2>)
|
||||
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "sse4.1")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SVE=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=0
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=0)
|
||||
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-msse4.1 -mpopcnt>)
|
||||
|
||||
elseif(${ASTCENC_ISA_SIMD} MATCHES "avx2")
|
||||
target_compile_definitions(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
ASTCENC_NEON=0
|
||||
ASTCENC_SVE=0
|
||||
ASTCENC_SSE=41
|
||||
ASTCENC_AVX=2
|
||||
ASTCENC_POPCNT=1
|
||||
ASTCENC_F16C=1)
|
||||
|
||||
target_compile_options(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mavx2 -mpopcnt -mf16c>
|
||||
$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
|
||||
|
||||
endif()
|
||||
|
||||
target_link_libraries(${ASTCENC_TEST}
|
||||
PRIVATE
|
||||
gtest_main)
|
||||
|
||||
add_test(NAME ${ASTCENC_TEST}
|
||||
COMMAND ${ASTCENC_TEST})
|
||||
@@ -0,0 +1,80 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Unit tests for the image decompression functionality.
|
||||
*/
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "../astcenc.h"
|
||||
|
||||
namespace astcenc
|
||||
{
|
||||
|
||||
/** @brief Test harness for exploring issue #447. */
|
||||
TEST(decode, decode12x12)
|
||||
{
|
||||
astcenc_error status;
|
||||
astcenc_config config;
|
||||
astcenc_context* context;
|
||||
|
||||
static const astcenc_swizzle swizzle {
|
||||
ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A
|
||||
};
|
||||
|
||||
uint8_t data[16] {
|
||||
#if 0
|
||||
0x84,0x00,0x38,0xC8,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0xB3,0x4D,0x78
|
||||
#else
|
||||
0x29,0x00,0x1A,0x97,0x01,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0xCF,0x97,0x86
|
||||
#endif
|
||||
};
|
||||
|
||||
uint8_t output[12*12*4];
|
||||
astcenc_config_init(ASTCENC_PRF_LDR, 12, 12, 1, ASTCENC_PRE_MEDIUM, 0, &config);
|
||||
|
||||
status = astcenc_context_alloc(&config, 1, &context);
|
||||
EXPECT_EQ(status, ASTCENC_SUCCESS);
|
||||
|
||||
astcenc_image image;
|
||||
image.dim_x = 12;
|
||||
image.dim_y = 12;
|
||||
image.dim_z = 1;
|
||||
image.data_type = ASTCENC_TYPE_U8;
|
||||
uint8_t* slices = output;
|
||||
image.data = reinterpret_cast<void**>(&slices);
|
||||
|
||||
status = astcenc_decompress_image(context, data, 16, &image, &swizzle, 0);
|
||||
EXPECT_EQ(status, ASTCENC_SUCCESS);
|
||||
#if 0
|
||||
for (int y = 0; y < 12; y++)
|
||||
{
|
||||
for (int x = 0; x < 12; x++)
|
||||
{
|
||||
uint8_t* pixel = output + (12 * 4 * y) + (4 * x);
|
||||
printf("[%2dx%2d] = %03d, %03d, %03d, %03d\n", x, y, pixel[0], pixel[1], pixel[2], pixel[3]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2021 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Unit tests for the software half-float library.
|
||||
*/
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "../astcenc_internal.h"
|
||||
|
||||
namespace astcenc
|
||||
{
|
||||
|
||||
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
|
||||
|
||||
/** @brief Test normal numbers. */
|
||||
TEST(softfloat, FP16NormalNumbers)
|
||||
{
|
||||
float result = sf16_to_float((15 << 10) + 1);
|
||||
EXPECT_NEAR(result, 1.00098f, 0.00005f);
|
||||
}
|
||||
|
||||
/** @brief Test denormal numbers. */
|
||||
TEST(softfloat, FP16DenormalNumbers)
|
||||
{
|
||||
float result = sf16_to_float((0 << 10) + 1);
|
||||
EXPECT_NEAR(result, 5.96046e-08f, 0.00005f);
|
||||
}
|
||||
|
||||
/** @brief Test zero. */
|
||||
TEST(softfloat, FP16Zero)
|
||||
{
|
||||
float result = sf16_to_float(0x0000);
|
||||
EXPECT_EQ(result, 0.0f);
|
||||
}
|
||||
|
||||
/** @brief Test infinity. */
|
||||
TEST(softfloat, FP16Infinity)
|
||||
{
|
||||
float result = sf16_to_float((31 << 10) + 0);
|
||||
EXPECT_TRUE(std::isinf(result));
|
||||
}
|
||||
|
||||
/** @brief Test NaN. */
|
||||
TEST(softfloat, FP16NaN)
|
||||
{
|
||||
float result = sf16_to_float(0xFFFF);
|
||||
EXPECT_TRUE(std::isnan(result));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
@@ -0,0 +1,874 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2025 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief The core astcenc codec library interface.
|
||||
*
|
||||
* This interface is the entry point to the core astcenc codec. It aims to be easy to use for
|
||||
* non-experts, but also to allow experts to have fine control over the compressor heuristics if
|
||||
* needed. The core codec only handles compression and decompression, transferring all inputs and
|
||||
* outputs via memory buffers. To catch obvious input/output buffer sizing issues, which can cause
|
||||
* security and stability problems, all transfer buffers are explicitly sized.
|
||||
*
|
||||
* While the aim is that we keep this interface mostly stable, it should be viewed as a mutable
|
||||
* interface tied to a specific source version. We are not trying to maintain backwards
|
||||
* compatibility across codec versions.
|
||||
*
|
||||
* The API state management is based around an explicit context object, which is the context for all
|
||||
* allocated memory resources needed to compress and decompress a single image. A context can be
|
||||
* used to sequentially compress multiple images using the same configuration, allowing setup
|
||||
* overheads to be amortized over multiple images, which is particularly important when images are
|
||||
* small.
|
||||
*
|
||||
* Multi-threading can be used in two ways.
|
||||
*
|
||||
* * An application wishing to process multiple images in parallel can allocate multiple
|
||||
* contexts and assign each context to a thread.
|
||||
* * An application wishing to process a single image using multiple threads can configure
|
||||
* contexts for multi-threaded use, and invoke astcenc_compress/decompress() once per thread
|
||||
* for faster processing. The caller is responsible for creating the worker threads, and
|
||||
* synchronizing between images.
|
||||
*
|
||||
* Extended instruction set support
|
||||
* ================================
|
||||
*
|
||||
* This library supports use of extended instruction sets, such as SSE4.1 and AVX2. These are
|
||||
* enabled at compile time when building the library. There is no runtime checking in the core
|
||||
* library that the instruction sets used are actually available. Checking compatibility is the
|
||||
* responsibility of the calling code.
|
||||
*
|
||||
* Threading
|
||||
* =========
|
||||
*
|
||||
* In pseudo-code, the usage for manual user threading looks like this:
|
||||
*
|
||||
* // Configure the compressor run
|
||||
* astcenc_config my_config;
|
||||
* astcenc_config_init(..., &my_config);
|
||||
*
|
||||
* // Power users can tweak <my_config> settings here ...
|
||||
*
|
||||
* // Allocate working state given config and thread_count
|
||||
* astcenc_context* my_context;
|
||||
* astcenc_context_alloc(&my_config, thread_count, &my_context);
|
||||
*
|
||||
* // Compress each image using these config settings
|
||||
* foreach image:
|
||||
* // For each thread in the thread pool
|
||||
* for i in range(0, thread_count):
|
||||
* astcenc_compress_image(my_context, &my_input, my_output, i);
|
||||
*
|
||||
* astcenc_compress_reset(my_context);
|
||||
*
|
||||
* // Clean up
|
||||
* astcenc_context_free(my_context);
|
||||
*
|
||||
* Images
|
||||
* ======
|
||||
*
|
||||
* The codec supports compressing single images, which can be either 2D images or volumetric 3D
|
||||
* images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
|
||||
* texture arrays, or sliced 3D textures.
|
||||
*
|
||||
* Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
|
||||
* half-float, or 32-bit float, as indicated by the data_type field.
|
||||
*
|
||||
* Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
|
||||
*
|
||||
* Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
|
||||
* within an image slice is always tightly packed without padding. Addressing looks like this:
|
||||
*
|
||||
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 ] // Red
|
||||
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1] // Green
|
||||
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 2] // Blue
|
||||
* data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 3] // Alpha
|
||||
*
|
||||
* Common compressor usage
|
||||
* =======================
|
||||
*
|
||||
* One of the most important things for coding image quality is to align the input data component
|
||||
* count with the ASTC color endpoint mode. This avoids wasting bits encoding components you don't
|
||||
* actually need in the endpoint colors.
|
||||
*
|
||||
* | Input data | Encoding swizzle | Sampling swizzle |
|
||||
* | ------------ | ---------------- | ---------------- |
|
||||
* | 1 component | RRR1 | .[rgb] |
|
||||
* | 2 components | RRRG | .[rgb]a |
|
||||
* | 3 components | RGB1 | .rgb |
|
||||
* | 4 components | RGBA | .rgba |
|
||||
*
|
||||
* The 1 and 2 component modes recommend sampling from "g" to recover the luminance value as this
|
||||
* provides the best compatibility with other texture formats where the green component may be stored at
|
||||
* higher precision than the others, such as RGB565. For ASTC any of the RGB components can be used;
|
||||
* the luminance endpoint component will be returned for all three.
|
||||
*
|
||||
* When using the normal map compression mode ASTC will store normals as a two component X+Y map.
|
||||
* Input images must contain unit-length normalized vectors and should be passed in using a two component
|
||||
* swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers prefer
|
||||
* to use GGGR for compatibility with BC5n which will work just as well. The Z component can be
|
||||
* recovered programmatically in shader code, using knowledge that the vector is unit length and
|
||||
* that Z must be positive for a tangent-space normal map.
|
||||
*
|
||||
* Decompress-only usage
|
||||
* =====================
|
||||
*
|
||||
* For some use cases it is useful to have a cut-down context and/or library which supports
|
||||
* decompression but not compression.
|
||||
*
|
||||
* A context can be made decompress-only using the ASTCENC_FLG_DECOMPRESS_ONLY flag when the context
|
||||
* is allocated. These contexts have lower dynamic memory footprint than a full context.
|
||||
*
|
||||
* The entire library can be made decompress-only by building the files with the define
|
||||
* ASTCENC_DECOMPRESS_ONLY set. In this build the context will be smaller, and the library will
|
||||
* exclude the functionality which is only needed for compression. This reduces the binary size by
|
||||
* ~180KB. For these builds contexts must be created with the ASTCENC_FLG_DECOMPRESS_ONLY flag.
|
||||
*
|
||||
* Note that context structures returned by a library built as decompress-only are incompatible with
|
||||
* a library built with compression included, and vice versa, as they have different sizes and
|
||||
* memory layout.
|
||||
*
|
||||
* Self-decompress-only usage
|
||||
* ==========================
|
||||
*
|
||||
* ASTC is a complex format with a large search space. The parts of this search space that are
|
||||
* searched are determined by heuristics that are, in part, tied to the quality level used when
|
||||
* creating the context.
|
||||
*
|
||||
* A normal context is capable of decompressing any ASTC texture, including those generated by other
|
||||
* compressors with unknown heuristics. This is the most flexible implementation, but forces the
|
||||
* data tables used by the codec to include entries that are not needed during compression. This
|
||||
* can slow down context creation by a significant amount, especially for the faster compression
|
||||
* modes where few data table entries are actually used. To optimize this use case the context can
|
||||
* be created with the ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will
|
||||
* only be asked to decompress images that it compressed itself, allowing the data tables to
|
||||
* exclude entries that are not needed by the current compression configuration. This reduces the
|
||||
* size of the context data tables in memory and improves context creation performance. Note that,
|
||||
* as of the 3.6 release, this flag no longer affects compression performance.
|
||||
*
|
||||
* Using this flag while attempting to decompress a valid image which was created by another
|
||||
* compressor, or even another astcenc compressor version or configuration, may result in blocks
|
||||
* returning as solid magenta or NaN value error blocks.
|
||||
*/
|
||||
|
||||
#ifndef ASTCENC_INCLUDED
|
||||
#define ASTCENC_INCLUDED
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
#if defined(ASTCENC_DYNAMIC_LIBRARY)
|
||||
#if defined(_MSC_VER)
|
||||
#define ASTCENC_PUBLIC extern "C" __declspec(dllexport)
|
||||
#else
|
||||
#define ASTCENC_PUBLIC extern "C" __attribute__ ((visibility ("default")))
|
||||
#endif
|
||||
#else
|
||||
#define ASTCENC_PUBLIC
|
||||
#endif
|
||||
|
||||
/* ============================================================================
|
||||
Data declarations
|
||||
============================================================================ */
|
||||
|
||||
/**
|
||||
* @brief An opaque structure; see astcenc_internal.h for definition.
|
||||
*/
|
||||
struct astcenc_context;
|
||||
|
||||
/**
|
||||
* @brief A codec API error code.
|
||||
*/
|
||||
enum astcenc_error {
|
||||
/** @brief The call was successful. */
|
||||
ASTCENC_SUCCESS = 0,
|
||||
/** @brief The call failed due to low memory, or undersized I/O buffers. */
|
||||
ASTCENC_ERR_OUT_OF_MEM,
|
||||
/** @brief The call failed due to the build using fast math. */
|
||||
ASTCENC_ERR_BAD_CPU_FLOAT,
|
||||
/** @brief The call failed due to an out-of-spec parameter. */
|
||||
ASTCENC_ERR_BAD_PARAM,
|
||||
/** @brief The call failed due to an out-of-spec block size. */
|
||||
ASTCENC_ERR_BAD_BLOCK_SIZE,
|
||||
/** @brief The call failed due to an out-of-spec color profile. */
|
||||
ASTCENC_ERR_BAD_PROFILE,
|
||||
/** @brief The call failed due to an out-of-spec quality value. */
|
||||
ASTCENC_ERR_BAD_QUALITY,
|
||||
/** @brief The call failed due to an out-of-spec component swizzle. */
|
||||
ASTCENC_ERR_BAD_SWIZZLE,
|
||||
/** @brief The call failed due to an out-of-spec flag set. */
|
||||
ASTCENC_ERR_BAD_FLAGS,
|
||||
/** @brief The call failed due to the context not supporting the operation. */
|
||||
ASTCENC_ERR_BAD_CONTEXT,
|
||||
/** @brief The call failed due to unimplemented functionality. */
|
||||
ASTCENC_ERR_NOT_IMPLEMENTED,
|
||||
/** @brief The call failed due to an out-of-spec decode mode flag set. */
|
||||
ASTCENC_ERR_BAD_DECODE_MODE,
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
/** @brief The call failed due to an issue with diagnostic tracing. */
|
||||
ASTCENC_ERR_DTRACE_FAILURE,
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A codec color profile.
|
||||
*/
|
||||
enum astcenc_profile {
|
||||
/** @brief The LDR sRGB color profile. */
|
||||
ASTCENC_PRF_LDR_SRGB = 0,
|
||||
/** @brief The LDR linear color profile. */
|
||||
ASTCENC_PRF_LDR,
|
||||
/** @brief The HDR RGB with LDR alpha color profile. */
|
||||
ASTCENC_PRF_HDR_RGB_LDR_A,
|
||||
/** @brief The HDR RGBA color profile. */
|
||||
ASTCENC_PRF_HDR
|
||||
};
|
||||
|
||||
/** @brief The fastest, lowest quality, search preset. */
|
||||
static const float ASTCENC_PRE_FASTEST = 0.0f;
|
||||
|
||||
/** @brief The fast search preset. */
|
||||
static const float ASTCENC_PRE_FAST = 10.0f;
|
||||
|
||||
/** @brief The medium quality search preset. */
|
||||
static const float ASTCENC_PRE_MEDIUM = 60.0f;
|
||||
|
||||
/** @brief The thorough quality search preset. */
|
||||
static const float ASTCENC_PRE_THOROUGH = 98.0f;
|
||||
|
||||
/** @brief The very thorough quality search preset. */
|
||||
static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
|
||||
|
||||
/** @brief The exhaustive, highest quality, search preset. */
|
||||
static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
|
||||
|
||||
/**
|
||||
* @brief A codec component swizzle selector.
|
||||
*/
|
||||
enum astcenc_swz
|
||||
{
|
||||
/** @brief Select the red component. */
|
||||
ASTCENC_SWZ_R = 0,
|
||||
/** @brief Select the green component. */
|
||||
ASTCENC_SWZ_G = 1,
|
||||
/** @brief Select the blue component. */
|
||||
ASTCENC_SWZ_B = 2,
|
||||
/** @brief Select the alpha component. */
|
||||
ASTCENC_SWZ_A = 3,
|
||||
/** @brief Use a constant zero component. */
|
||||
ASTCENC_SWZ_0 = 4,
|
||||
/** @brief Use a constant one component. */
|
||||
ASTCENC_SWZ_1 = 5,
|
||||
/** @brief Use a reconstructed normal vector Z component. */
|
||||
ASTCENC_SWZ_Z = 6
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A texel component swizzle.
|
||||
*/
|
||||
struct astcenc_swizzle
|
||||
{
|
||||
/** @brief The red component selector. */
|
||||
astcenc_swz r;
|
||||
/** @brief The green component selector. */
|
||||
astcenc_swz g;
|
||||
/** @brief The blue component selector. */
|
||||
astcenc_swz b;
|
||||
/** @brief The alpha component selector. */
|
||||
astcenc_swz a;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A texel component data format.
|
||||
*/
|
||||
enum astcenc_type
|
||||
{
|
||||
/** @brief Unorm 8-bit data per component. */
|
||||
ASTCENC_TYPE_U8 = 0,
|
||||
/** @brief 16-bit float per component. */
|
||||
ASTCENC_TYPE_F16 = 1,
|
||||
/** @brief 32-bit float per component. */
|
||||
ASTCENC_TYPE_F32 = 2
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Function pointer type for compression progress reporting callback.
|
||||
*/
|
||||
extern "C" typedef void (*astcenc_progress_callback)(float);
|
||||
|
||||
/**
|
||||
* @brief Enable normal map compression.
|
||||
*
|
||||
* Input data will be treated as a two component normal map, storing X and Y, and the codec will
|
||||
* optimize for angular error rather than simple linear PSNR. In this mode the input swizzle should
|
||||
* be e.g. rrrg (the default ordering for ASTC normals on the command line) or gggr (the ordering
|
||||
* used by BC5n).
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
|
||||
|
||||
/**
|
||||
* @brief Enable compression heuristics that assume use of decode_unorm8 decode mode.
|
||||
*
|
||||
* The decode_unorm8 decode mode rounds differently to the decode_fp16 decode mode, so enabling this
|
||||
* flag during compression will allow the compressor to use the correct rounding when selecting
|
||||
* encodings. This will improve the compressed image quality if your application is using the
|
||||
* decode_unorm8 decode mode, but will reduce image quality if using decode_fp16.
|
||||
*
|
||||
* Note that LDR_SRGB images will always use decode_unorm8 for the RGB channels, irrespective of
|
||||
* this setting.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_USE_DECODE_UNORM8 = 1 << 1;
|
||||
|
||||
/**
|
||||
* @brief Enable alpha weighting.
|
||||
*
|
||||
* The input alpha value is used for transparency, so errors in the RGB components are weighted by
|
||||
* the transparency level. This allows the codec to more accurately encode the alpha value in areas
|
||||
* where the color value is less significant.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_USE_ALPHA_WEIGHT = 1 << 2;
|
||||
|
||||
/**
|
||||
* @brief Enable perceptual error metrics.
|
||||
*
|
||||
* This mode enables perceptual compression mode, which will optimize for perceptual error rather
|
||||
* than best PSNR. Only some input modes support perceptual error metrics.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_USE_PERCEPTUAL = 1 << 3;
|
||||
|
||||
/**
|
||||
* @brief Create a decompression-only context.
|
||||
*
|
||||
* This mode disables support for compression. This enables context allocation to skip some
|
||||
* transient buffer allocation, resulting in lower memory usage.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY = 1 << 4;
|
||||
|
||||
/**
|
||||
* @brief Create a self-decompression context.
|
||||
*
|
||||
* This mode configures the compressor so that it is only guaranteed to be able to decompress images
|
||||
* that were actually created using the current context. This is the common case for compression use
|
||||
* cases, and setting this flag enables additional optimizations, but does mean that the context
|
||||
* cannot reliably decompress arbitrary ASTC images.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
|
||||
|
||||
/**
|
||||
* @brief Enable RGBM map compression.
|
||||
*
|
||||
* Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
|
||||
* format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
|
||||
* compression function, this flag is only used to control the use of RGBM-specific heuristics and
|
||||
* error metrics.
|
||||
*
|
||||
* IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
|
||||
* M values can round to zero due to quantization and result in black or white pixels. It is highly
|
||||
* recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
|
||||
* 16 or 32). Applying this threshold reduces the number of very dark colors that can be
|
||||
* represented, but is still higher precision than 8-bit LDR.
|
||||
*
|
||||
* When this flag is set the value of @c rgbm_m_scale in the config must be set to the RGBM scale
|
||||
* factor used during reconstruction. This defaults to 5 when in RGBM mode.
|
||||
*
|
||||
* It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
|
||||
* scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
|
||||
* matching the default scale factor.
|
||||
*/
|
||||
static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6;
|
||||
|
||||
/**
|
||||
* @brief The bit mask of all valid flags.
|
||||
*/
|
||||
static const unsigned int ASTCENC_ALL_FLAGS =
|
||||
ASTCENC_FLG_MAP_NORMAL |
|
||||
ASTCENC_FLG_MAP_RGBM |
|
||||
ASTCENC_FLG_USE_ALPHA_WEIGHT |
|
||||
ASTCENC_FLG_USE_PERCEPTUAL |
|
||||
ASTCENC_FLG_USE_DECODE_UNORM8 |
|
||||
ASTCENC_FLG_DECOMPRESS_ONLY |
|
||||
ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
|
||||
|
||||
/**
|
||||
* @brief The config structure.
|
||||
*
|
||||
* This structure will initially be populated by a call to astcenc_config_init, but power users may
|
||||
* modify it before calling astcenc_context_alloc. See astcenccli_toplevel_help.cpp for full user
|
||||
* documentation of the power-user settings.
|
||||
*
|
||||
* Note for any settings which are associated with a specific color component, the value in the
|
||||
* config applies to the component that exists after any compression data swizzle is applied.
|
||||
*/
|
||||
struct astcenc_config
|
||||
{
|
||||
/** @brief The color profile. */
|
||||
astcenc_profile profile;
|
||||
|
||||
/** @brief The set of set flags. */
|
||||
unsigned int flags;
|
||||
|
||||
/** @brief The ASTC block size X dimension. */
|
||||
unsigned int block_x;
|
||||
|
||||
/** @brief The ASTC block size Y dimension. */
|
||||
unsigned int block_y;
|
||||
|
||||
/** @brief The ASTC block size Z dimension. */
|
||||
unsigned int block_z;
|
||||
|
||||
/** @brief The red component weight scale for error weighting (-cw). */
|
||||
float cw_r_weight;
|
||||
|
||||
/** @brief The green component weight scale for error weighting (-cw). */
|
||||
float cw_g_weight;
|
||||
|
||||
/** @brief The blue component weight scale for error weighting (-cw). */
|
||||
float cw_b_weight;
|
||||
|
||||
/** @brief The alpha component weight scale for error weighting (-cw). */
|
||||
float cw_a_weight;
|
||||
|
||||
/**
|
||||
* @brief The radius for any alpha-weight scaling (-a).
|
||||
*
|
||||
* It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
|
||||
* will be sampled using linear texture filtering to minimize color bleed out of transparent
|
||||
* texels that are adjacent to non-transparent texels.
|
||||
*/
|
||||
unsigned int a_scale_radius;
|
||||
|
||||
/** @brief The RGBM scale factor for the shared multiplier (-rgbm). */
|
||||
float rgbm_m_scale;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partitions searched (-partitioncountlimit).
|
||||
*
|
||||
* Valid values are between 1 and 4.
|
||||
*/
|
||||
unsigned int tune_partition_count_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partition indices searched for 2 partitions (-2partitionindexlimit).
|
||||
*
|
||||
* Valid values are between 1 and 1024.
|
||||
*/
|
||||
unsigned int tune_2partition_index_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partition indices searched for 3 partitions (-3partitionindexlimit).
|
||||
*
|
||||
* Valid values are between 1 and 1024.
|
||||
*/
|
||||
unsigned int tune_3partition_index_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum number of partition indices searched for 4 partitions (-4partitionindexlimit).
|
||||
*
|
||||
* Valid values are between 1 and 1024.
|
||||
*/
|
||||
unsigned int tune_4partition_index_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum centile for block modes searched (-blockmodelimit).
|
||||
*
|
||||
* Valid values are between 1 and 100.
|
||||
*/
|
||||
unsigned int tune_block_mode_limit;
|
||||
|
||||
/**
|
||||
* @brief The maximum iterative refinements applied (-refinementlimit).
|
||||
*
|
||||
* Valid values are between 1 and N; there is no technical upper limit
|
||||
* but little benefit is expected after N=4.
|
||||
*/
|
||||
unsigned int tune_refinement_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial candidates per mode search (-candidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial partitionings per search (-2partitioncandidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_2partitioning_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial partitionings per search (-3partitioncandidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_3partitioning_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The number of trial partitionings per search (-4partitioncandidatelimit).
|
||||
*
|
||||
* Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
|
||||
*/
|
||||
unsigned int tune_4partitioning_candidate_limit;
|
||||
|
||||
/**
|
||||
* @brief The dB threshold for stopping block search (-dblimit).
|
||||
*
|
||||
* This option is ineffective for HDR textures.
|
||||
*/
|
||||
float tune_db_limit;
|
||||
|
||||
/**
|
||||
* @brief The amount of MSE overshoot needed to early-out trials.
|
||||
*
|
||||
* The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
|
||||
* the high probability block modes. This can short-cut compression for simple blocks.
|
||||
*
|
||||
* The second early-out is for refinement trials, where we can exit refinement once quality is
|
||||
* reached.
|
||||
*/
|
||||
float tune_mse_overshoot;
|
||||
|
||||
/**
|
||||
* @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
|
||||
*
|
||||
* This option is further scaled for normal maps, so it skips less often.
|
||||
*/
|
||||
float tune_2partition_early_out_limit_factor;
|
||||
|
||||
/**
|
||||
* @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
|
||||
*
|
||||
* This option is further scaled for normal maps, so it skips less often.
|
||||
*/
|
||||
float tune_3partition_early_out_limit_factor;
|
||||
|
||||
/**
|
||||
* @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
|
||||
*
|
||||
* This option is ineffective for normal maps.
|
||||
*/
|
||||
float tune_2plane_early_out_limit_correlation;
|
||||
|
||||
/**
|
||||
* @brief The config enable for the mode0 fast-path search.
|
||||
*
|
||||
* If this is set to TUNE_MIN_TEXELS_MODE0 or higher then the early-out fast mode0
|
||||
* search is enabled. This option is ineffective for 3D block sizes.
|
||||
*/
|
||||
float tune_search_mode0_enable;
|
||||
|
||||
/**
|
||||
* @brief The progress callback, can be @c nullptr.
|
||||
*
|
||||
* If this is specified the codec will periodically report progress for
|
||||
* compression as a percentage between 0 and 100. The callback is called from one
|
||||
* of the compressor threads, so doing significant work in the callback will
|
||||
* reduce compression performance.
|
||||
*/
|
||||
astcenc_progress_callback progress_callback;
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
/**
|
||||
* @brief The path to save the diagnostic trace data to.
|
||||
*
|
||||
* This option is not part of the public API, and requires special builds
|
||||
* of the library.
|
||||
*/
|
||||
const char* trace_file_path;
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief An uncompressed 2D or 3D image.
|
||||
*
|
||||
* 3D images are passed in as an array of 2D slices. Each slice has identical
|
||||
* size and color format.
|
||||
*/
|
||||
struct astcenc_image
|
||||
{
|
||||
/** @brief The X dimension of the image, in texels. */
|
||||
unsigned int dim_x;
|
||||
|
||||
/** @brief The Y dimension of the image, in texels. */
|
||||
unsigned int dim_y;
|
||||
|
||||
/** @brief The Z dimension of the image, in texels. */
|
||||
unsigned int dim_z;
|
||||
|
||||
/** @brief The data type per component. */
|
||||
astcenc_type data_type;
|
||||
|
||||
/** @brief The array of 2D slices, of length @c dim_z. */
|
||||
void** data;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A block encoding metadata query result.
|
||||
*
|
||||
* If the block is an error block or a constant color block, all fields other than
|
||||
* the profile, block dimensions, and error/constant indicator will be zero.
|
||||
*/
|
||||
struct astcenc_block_info
|
||||
{
|
||||
/** @brief The block encoding color profile. */
|
||||
astcenc_profile profile;
|
||||
|
||||
/** @brief The number of texels in the X dimension. */
|
||||
unsigned int block_x;
|
||||
|
||||
/** @brief The number of texels in the Y dimension. */
|
||||
unsigned int block_y;
|
||||
|
||||
/** @brief The number of texel in the Z dimension. */
|
||||
unsigned int block_z;
|
||||
|
||||
/** @brief The number of texels in the block. */
|
||||
unsigned int texel_count;
|
||||
|
||||
/** @brief True if this block is an error block. */
|
||||
bool is_error_block;
|
||||
|
||||
/** @brief True if this block is a constant color block. */
|
||||
bool is_constant_block;
|
||||
|
||||
/** @brief True if this block is an HDR block. */
|
||||
bool is_hdr_block;
|
||||
|
||||
/** @brief True if this block uses two weight planes. */
|
||||
bool is_dual_plane_block;
|
||||
|
||||
/** @brief The number of partitions if not constant color. */
|
||||
unsigned int partition_count;
|
||||
|
||||
/** @brief The partition index if 2 - 4 partitions used. */
|
||||
unsigned int partition_index;
|
||||
|
||||
/** @brief The component index of the second plane if dual plane. */
|
||||
unsigned int dual_plane_component;
|
||||
|
||||
/** @brief The color endpoint encoding mode for each partition. */
|
||||
unsigned int color_endpoint_modes[4];
|
||||
|
||||
/** @brief The number of color endpoint quantization levels. */
|
||||
unsigned int color_level_count;
|
||||
|
||||
/** @brief The number of weight quantization levels. */
|
||||
unsigned int weight_level_count;
|
||||
|
||||
/** @brief The number of weights in the X dimension. */
|
||||
unsigned int weight_x;
|
||||
|
||||
/** @brief The number of weights in the Y dimension. */
|
||||
unsigned int weight_y;
|
||||
|
||||
/** @brief The number of weights in the Z dimension. */
|
||||
unsigned int weight_z;
|
||||
|
||||
/** @brief The unpacked color endpoints for each partition. */
|
||||
float color_endpoints[4][2][4];
|
||||
|
||||
/** @brief The per-texel interpolation weights for the block. */
|
||||
float weight_values_plane1[216];
|
||||
|
||||
/** @brief The per-texel interpolation weights for the block. */
|
||||
float weight_values_plane2[216];
|
||||
|
||||
/** @brief The per-texel partition assignments for the block. */
|
||||
uint8_t partition_assignment[216];
|
||||
};
|
||||
|
||||
/**
|
||||
* Populate a codec config based on default settings.
|
||||
*
|
||||
* Power users can edit the returned config struct to fine tune before allocating the context.
|
||||
*
|
||||
* @param profile Color profile.
|
||||
* @param block_x ASTC block size X dimension.
|
||||
* @param block_y ASTC block size Y dimension.
|
||||
* @param block_z ASTC block size Z dimension.
|
||||
* @param quality Search quality preset / effort level. Either an
|
||||
* @c ASTCENC_PRE_* value, or a effort level between 0
|
||||
* and 100. Performance is not linear between 0 and 100.
|
||||
|
||||
* @param flags A valid set of @c ASTCENC_FLG_* flag bits.
|
||||
* @param[out] config Output config struct to populate.
|
||||
*
|
||||
* @return @c ASTCENC_SUCCESS on success, or an error if the inputs are invalid
|
||||
* either individually, or in combination.
|
||||
*/
|
||||
ASTCENC_PUBLIC astcenc_error astcenc_config_init(
|
||||
astcenc_profile profile,
|
||||
unsigned int block_x,
|
||||
unsigned int block_y,
|
||||
unsigned int block_z,
|
||||
float quality,
|
||||
unsigned int flags,
|
||||
astcenc_config* config);
|
||||
|
||||
/**
|
||||
* @brief Allocate a new codec context based on a config.
|
||||
*
|
||||
* This function allocates all of the memory resources and threads needed by the codec. This can be
|
||||
* slow, so it is recommended that contexts are reused to serially compress or decompress multiple
|
||||
* images to amortize setup cost.
|
||||
*
|
||||
* Contexts can be allocated to support only decompression using the @c ASTCENC_FLG_DECOMPRESS_ONLY
|
||||
* flag when creating the configuration. The compression functions will fail if invoked. For a
|
||||
* decompress-only library build the @c ASTCENC_FLG_DECOMPRESS_ONLY flag must be set when creating
|
||||
* any context.
|
||||
*
|
||||
* @param[in] config Codec config.
|
||||
* @param thread_count Thread count to configure for.
|
||||
* @param[out] context Location to store an opaque context pointer.
|
||||
*
|
||||
* @return @c ASTCENC_SUCCESS on success, or an error if context creation failed.
|
||||
*/
|
||||
ASTCENC_PUBLIC astcenc_error astcenc_context_alloc(
|
||||
const astcenc_config* config,
|
||||
unsigned int thread_count,
|
||||
astcenc_context** context);
|
||||
|
||||
/**
|
||||
* @brief Compress an image.
|
||||
*
|
||||
* A single context can only compress or decompress a single image at a time.
|
||||
*
|
||||
* For a context configured for multi-threading, any set of the N threads can call this function.
|
||||
* Work will be dynamically scheduled across the threads available. Each thread must have a unique
|
||||
* @c thread_index.
|
||||
*
|
||||
* @param context Codec context.
|
||||
* @param[in,out] image An input image, in 2D slices.
|
||||
* @param swizzle Compression data swizzle, applied before compression.
|
||||
* @param[out] data_out Pointer to output data array.
|
||||
* @param data_len Length of the output data array.
|
||||
* @param thread_index Thread index [0..N-1] of calling thread.
|
||||
*
|
||||
* @return @c ASTCENC_SUCCESS on success, or an error if compression failed.
|
||||
*/
|
||||
ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
|
||||
astcenc_context* context,
|
||||
astcenc_image* image,
|
||||
const astcenc_swizzle* swizzle,
|
||||
uint8_t* data_out,
|
||||
size_t data_len,
|
||||
unsigned int thread_index);
|
||||
|
||||
/**
|
||||
* @brief Reset the codec state for a new compression.
|
||||
*
|
||||
* The caller is responsible for synchronizing threads in the worker thread pool. This function must
|
||||
* only be called when all threads have exited the @c astcenc_compress_image() function for image N,
|
||||
* but before any thread enters it for image N + 1.
|
||||
*
|
||||
* Calling this is not required (but won't hurt), if the context is created for single threaded use.
|
||||
*
|
||||
* @param context Codec context.
|
||||
*
|
||||
* @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
|
||||
*/
|
||||
ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
|
||||
astcenc_context* context);
|
||||
|
||||
/**
|
||||
* @brief Cancel any pending compression operation.
|
||||
*
|
||||
* The caller must behave as if the compression completed normally, even though the data will be
|
||||
* undefined. They are still responsible for synchronizing threads in the worker thread pool, and
|
||||
* must call reset before starting another compression.
|
||||
*
|
||||
* @param context Codec context.
|
||||
*
|
||||
* @return @c ASTCENC_SUCCESS on success, or an error if cancellation failed.
|
||||
*/
|
||||
ASTCENC_PUBLIC astcenc_error astcenc_compress_cancel(
|
||||
astcenc_context* context);
|
||||
|
||||
/**
|
||||
* @brief Decompress an image.
|
||||
*
|
||||
* @param context Codec context.
|
||||
* @param[in] data Pointer to compressed data.
|
||||
* @param data_len Length of the compressed data, in bytes.
|
||||
* @param[in,out] image_out Output image.
|
||||
* @param swizzle Decompression data swizzle, applied after decompression.
|
||||
* @param thread_index Thread index [0..N-1] of calling thread.
|
||||
*
|
||||
* @return @c ASTCENC_SUCCESS on success, or an error if decompression failed.
|
||||
*/
|
||||
ASTCENC_PUBLIC astcenc_error astcenc_decompress_image(
|
||||
astcenc_context* context,
|
||||
const uint8_t* data,
|
||||
size_t data_len,
|
||||
astcenc_image* image_out,
|
||||
const astcenc_swizzle* swizzle,
|
||||
unsigned int thread_index);
|
||||
|
||||
/**
|
||||
* @brief Reset the codec state for a new decompression.
|
||||
*
|
||||
* The caller is responsible for synchronizing threads in the worker thread pool. This function must
|
||||
* only be called when all threads have exited the @c astcenc_decompress_image() function for image
|
||||
* N, but before any thread enters it for image N + 1.
|
||||
*
|
||||
* Calling this is not required (but won't hurt), if the context is created for single threaded use.
|
||||
*
|
||||
* @param context Codec context.
|
||||
*
|
||||
* @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
|
||||
*/
|
||||
ASTCENC_PUBLIC astcenc_error astcenc_decompress_reset(
|
||||
astcenc_context* context);
|
||||
|
||||
/**
|
||||
* Free the compressor context.
|
||||
*
|
||||
* @param context The codec context.
|
||||
*/
|
||||
ASTCENC_PUBLIC void astcenc_context_free(
|
||||
astcenc_context* context);
|
||||
|
||||
/**
|
||||
* @brief Provide a high level summary of a block's encoding.
|
||||
*
|
||||
* This feature is primarily useful for codec developers but may be useful for developers building
|
||||
* advanced content packaging pipelines.
|
||||
*
|
||||
* @param context Codec context.
|
||||
* @param data One block of compressed ASTC data.
|
||||
* @param info The output info structure to populate.
|
||||
*
|
||||
* @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
|
||||
* function will return success even if the block itself was an error block encoding, as the
|
||||
* decode was correctly handled.
|
||||
*/
|
||||
ASTCENC_PUBLIC astcenc_error astcenc_get_block_info(
|
||||
astcenc_context* context,
|
||||
const uint8_t data[16],
|
||||
astcenc_block_info* info);
|
||||
|
||||
/**
|
||||
* @brief Get a printable string for specific status code.
|
||||
*
|
||||
* @param status The status value.
|
||||
*
|
||||
* @return A human readable nul-terminated string.
|
||||
*/
|
||||
ASTCENC_PUBLIC const char* astcenc_get_error_string(
|
||||
astcenc_error status);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,948 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2025 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions for finding dominant direction of a set of colors.
|
||||
*/
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
/**
|
||||
* @brief Compute the average RGB color of each partition.
|
||||
*
|
||||
* The algorithm here uses a vectorized sequential scan and per-partition
|
||||
* color accumulators, using select() to mask texel lanes in other partitions.
|
||||
*
|
||||
* We only accumulate sums for N-1 partitions during the scan; the value for
|
||||
* the last partition can be computed given that we know the block-wide average
|
||||
* already.
|
||||
*
|
||||
* Because of this we could reduce the loop iteration count so it "just" spans
|
||||
* the max texel index needed for the N-1 partitions, which could need fewer
|
||||
* iterations than the full block texel count. However, this makes the loop
|
||||
* count erratic and causes more branch mispredictions so is a net loss.
|
||||
*
|
||||
* @param pi The partitioning to use.
|
||||
* @param blk The block data to process.
|
||||
* @param[out] averages The output averages. Unused partition indices will
|
||||
* not be initialized, and lane<3> will be zero.
|
||||
*/
|
||||
static void compute_partition_averages_rgb(
|
||||
const partition_info& pi,
|
||||
const image_block& blk,
|
||||
vfloat4 averages[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
unsigned int partition_count = pi.partition_count;
|
||||
size_t texel_count = blk.texel_count;
|
||||
promise(texel_count > 0);
|
||||
|
||||
// For 1 partition just use the precomputed mean
|
||||
if (partition_count == 1)
|
||||
{
|
||||
averages[0] = blk.data_mean.swz<0, 1, 2>();
|
||||
}
|
||||
// For 2 partitions scan results for partition 0, compute partition 1
|
||||
else if (partition_count == 2)
|
||||
{
|
||||
vfloatacc pp_avg_rgb[3] {};
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint texel_partition(pi.partition_of_texel + i);
|
||||
|
||||
vmask lane_mask = lane_id < vint_from_size(texel_count);
|
||||
lane_id += vint(ASTCENC_SIMD_WIDTH);
|
||||
|
||||
vmask p0_mask = lane_mask & (texel_partition == vint(0));
|
||||
|
||||
vfloat data_r = loada(blk.data_r + i);
|
||||
haccumulate(pp_avg_rgb[0], data_r, p0_mask);
|
||||
|
||||
vfloat data_g = loada(blk.data_g + i);
|
||||
haccumulate(pp_avg_rgb[1], data_g, p0_mask);
|
||||
|
||||
vfloat data_b = loada(blk.data_b + i);
|
||||
haccumulate(pp_avg_rgb[2], data_b, p0_mask);
|
||||
}
|
||||
|
||||
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
|
||||
|
||||
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
|
||||
hadd_s(pp_avg_rgb[1]),
|
||||
hadd_s(pp_avg_rgb[2]));
|
||||
|
||||
vfloat4 p1_total = block_total - p0_total;
|
||||
|
||||
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
|
||||
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
|
||||
}
|
||||
// For 3 partitions scan results for partition 0/1, compute partition 2
|
||||
else if (partition_count == 3)
|
||||
{
|
||||
vfloatacc pp_avg_rgb[2][3] {};
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint texel_partition(pi.partition_of_texel + i);
|
||||
|
||||
vmask lane_mask = lane_id < vint_from_size(texel_count);
|
||||
lane_id += vint(ASTCENC_SIMD_WIDTH);
|
||||
|
||||
vmask p0_mask = lane_mask & (texel_partition == vint(0));
|
||||
vmask p1_mask = lane_mask & (texel_partition == vint(1));
|
||||
|
||||
vfloat data_r = loada(blk.data_r + i);
|
||||
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
|
||||
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
|
||||
|
||||
vfloat data_g = loada(blk.data_g + i);
|
||||
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
|
||||
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
|
||||
|
||||
vfloat data_b = loada(blk.data_b + i);
|
||||
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
|
||||
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
|
||||
}
|
||||
|
||||
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
|
||||
|
||||
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
|
||||
hadd_s(pp_avg_rgb[0][1]),
|
||||
hadd_s(pp_avg_rgb[0][2]));
|
||||
|
||||
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
|
||||
hadd_s(pp_avg_rgb[1][1]),
|
||||
hadd_s(pp_avg_rgb[1][2]));
|
||||
|
||||
vfloat4 p2_total = block_total - p0_total - p1_total;
|
||||
|
||||
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
|
||||
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
|
||||
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For 4 partitions scan results for partition 0/1/2, compute partition 3
|
||||
vfloatacc pp_avg_rgb[3][3] {};
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint texel_partition(pi.partition_of_texel + i);
|
||||
|
||||
vmask lane_mask = lane_id < vint_from_size(texel_count);
|
||||
lane_id += vint(ASTCENC_SIMD_WIDTH);
|
||||
|
||||
vmask p0_mask = lane_mask & (texel_partition == vint(0));
|
||||
vmask p1_mask = lane_mask & (texel_partition == vint(1));
|
||||
vmask p2_mask = lane_mask & (texel_partition == vint(2));
|
||||
|
||||
vfloat data_r = loada(blk.data_r + i);
|
||||
haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
|
||||
haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
|
||||
haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
|
||||
|
||||
vfloat data_g = loada(blk.data_g + i);
|
||||
haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
|
||||
haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
|
||||
haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
|
||||
|
||||
vfloat data_b = loada(blk.data_b + i);
|
||||
haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
|
||||
haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
|
||||
haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
|
||||
}
|
||||
|
||||
vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
|
||||
|
||||
vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
|
||||
hadd_s(pp_avg_rgb[0][1]),
|
||||
hadd_s(pp_avg_rgb[0][2]));
|
||||
|
||||
vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
|
||||
hadd_s(pp_avg_rgb[1][1]),
|
||||
hadd_s(pp_avg_rgb[1][2]));
|
||||
|
||||
vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
|
||||
hadd_s(pp_avg_rgb[2][1]),
|
||||
hadd_s(pp_avg_rgb[2][2]));
|
||||
|
||||
vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
|
||||
|
||||
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
|
||||
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
|
||||
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
|
||||
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute the average RGBA color of each partition.
|
||||
*
|
||||
* The algorithm here uses a vectorized sequential scan and per-partition
|
||||
* color accumulators, using select() to mask texel lanes in other partitions.
|
||||
*
|
||||
* We only accumulate sums for N-1 partitions during the scan; the value for
|
||||
* the last partition can be computed given that we know the block-wide average
|
||||
* already.
|
||||
*
|
||||
* Because of this we could reduce the loop iteration count so it "just" spans
|
||||
* the max texel index needed for the N-1 partitions, which could need fewer
|
||||
* iterations than the full block texel count. However, this makes the loop
|
||||
* count erratic and causes more branch mispredictions so is a net loss.
|
||||
*
|
||||
* @param pi The partitioning to use.
|
||||
* @param blk The block data to process.
|
||||
* @param[out] averages The output averages. Unused partition indices will
|
||||
* not be initialized.
|
||||
*/
|
||||
static void compute_partition_averages_rgba(
|
||||
const partition_info& pi,
|
||||
const image_block& blk,
|
||||
vfloat4 averages[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
unsigned int partition_count = pi.partition_count;
|
||||
size_t texel_count = blk.texel_count;
|
||||
promise(texel_count > 0);
|
||||
|
||||
// For 1 partition just use the precomputed mean
|
||||
if (partition_count == 1)
|
||||
{
|
||||
averages[0] = blk.data_mean;
|
||||
}
|
||||
// For 2 partitions scan results for partition 0, compute partition 1
|
||||
else if (partition_count == 2)
|
||||
{
|
||||
vfloat4 pp_avg_rgba[4] {};
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint texel_partition(pi.partition_of_texel + i);
|
||||
|
||||
vmask lane_mask = lane_id < vint_from_size(texel_count);
|
||||
lane_id += vint(ASTCENC_SIMD_WIDTH);
|
||||
|
||||
vmask p0_mask = lane_mask & (texel_partition == vint(0));
|
||||
|
||||
vfloat data_r = loada(blk.data_r + i);
|
||||
haccumulate(pp_avg_rgba[0], data_r, p0_mask);
|
||||
|
||||
vfloat data_g = loada(blk.data_g + i);
|
||||
haccumulate(pp_avg_rgba[1], data_g, p0_mask);
|
||||
|
||||
vfloat data_b = loada(blk.data_b + i);
|
||||
haccumulate(pp_avg_rgba[2], data_b, p0_mask);
|
||||
|
||||
vfloat data_a = loada(blk.data_a + i);
|
||||
haccumulate(pp_avg_rgba[3], data_a, p0_mask);
|
||||
}
|
||||
|
||||
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
|
||||
|
||||
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
|
||||
hadd_s(pp_avg_rgba[1]),
|
||||
hadd_s(pp_avg_rgba[2]),
|
||||
hadd_s(pp_avg_rgba[3]));
|
||||
|
||||
vfloat4 p1_total = block_total - p0_total;
|
||||
|
||||
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
|
||||
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
|
||||
}
|
||||
// For 3 partitions scan results for partition 0/1, compute partition 2
|
||||
else if (partition_count == 3)
|
||||
{
|
||||
vfloat4 pp_avg_rgba[2][4] {};
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint texel_partition(pi.partition_of_texel + i);
|
||||
|
||||
vmask lane_mask = lane_id < vint_from_size(texel_count);
|
||||
lane_id += vint(ASTCENC_SIMD_WIDTH);
|
||||
|
||||
vmask p0_mask = lane_mask & (texel_partition == vint(0));
|
||||
vmask p1_mask = lane_mask & (texel_partition == vint(1));
|
||||
|
||||
vfloat data_r = loada(blk.data_r + i);
|
||||
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
|
||||
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
|
||||
|
||||
vfloat data_g = loada(blk.data_g + i);
|
||||
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
|
||||
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
|
||||
|
||||
vfloat data_b = loada(blk.data_b + i);
|
||||
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
|
||||
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
|
||||
|
||||
vfloat data_a = loada(blk.data_a + i);
|
||||
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
|
||||
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
|
||||
}
|
||||
|
||||
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
|
||||
|
||||
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
|
||||
hadd_s(pp_avg_rgba[0][1]),
|
||||
hadd_s(pp_avg_rgba[0][2]),
|
||||
hadd_s(pp_avg_rgba[0][3]));
|
||||
|
||||
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
|
||||
hadd_s(pp_avg_rgba[1][1]),
|
||||
hadd_s(pp_avg_rgba[1][2]),
|
||||
hadd_s(pp_avg_rgba[1][3]));
|
||||
|
||||
vfloat4 p2_total = block_total - p0_total - p1_total;
|
||||
|
||||
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
|
||||
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
|
||||
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For 4 partitions scan results for partition 0/1/2, compute partition 3
|
||||
vfloat4 pp_avg_rgba[3][4] {};
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint texel_partition(pi.partition_of_texel + i);
|
||||
|
||||
vmask lane_mask = lane_id < vint_from_size(texel_count);
|
||||
lane_id += vint(ASTCENC_SIMD_WIDTH);
|
||||
|
||||
vmask p0_mask = lane_mask & (texel_partition == vint(0));
|
||||
vmask p1_mask = lane_mask & (texel_partition == vint(1));
|
||||
vmask p2_mask = lane_mask & (texel_partition == vint(2));
|
||||
|
||||
vfloat data_r = loada(blk.data_r + i);
|
||||
haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
|
||||
haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
|
||||
haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
|
||||
|
||||
vfloat data_g = loada(blk.data_g + i);
|
||||
haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
|
||||
haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
|
||||
haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
|
||||
|
||||
vfloat data_b = loada(blk.data_b + i);
|
||||
haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
|
||||
haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
|
||||
haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
|
||||
|
||||
vfloat data_a = loada(blk.data_a + i);
|
||||
haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
|
||||
haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
|
||||
haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
|
||||
}
|
||||
|
||||
vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
|
||||
|
||||
vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
|
||||
hadd_s(pp_avg_rgba[0][1]),
|
||||
hadd_s(pp_avg_rgba[0][2]),
|
||||
hadd_s(pp_avg_rgba[0][3]));
|
||||
|
||||
vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
|
||||
hadd_s(pp_avg_rgba[1][1]),
|
||||
hadd_s(pp_avg_rgba[1][2]),
|
||||
hadd_s(pp_avg_rgba[1][3]));
|
||||
|
||||
vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
|
||||
hadd_s(pp_avg_rgba[2][1]),
|
||||
hadd_s(pp_avg_rgba[2][2]),
|
||||
hadd_s(pp_avg_rgba[2][3]));
|
||||
|
||||
vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
|
||||
|
||||
averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
|
||||
averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
|
||||
averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
|
||||
averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_avgs_and_dirs_4_comp(
|
||||
const partition_info& pi,
|
||||
const image_block& blk,
|
||||
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
size_t partition_count = pi.partition_count;
|
||||
promise(partition_count > 0);
|
||||
|
||||
// Pre-compute partition_averages
|
||||
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
|
||||
compute_partition_averages_rgba(pi, blk, partition_averages);
|
||||
|
||||
for (size_t partition = 0; partition < partition_count; partition++)
|
||||
{
|
||||
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
|
||||
size_t texel_count = pi.partition_texel_count[partition];
|
||||
promise(texel_count > 0);
|
||||
|
||||
vfloat4 average = partition_averages[partition];
|
||||
pm[partition].avg = average;
|
||||
|
||||
vfloat4 sum_xp = vfloat4::zero();
|
||||
vfloat4 sum_yp = vfloat4::zero();
|
||||
vfloat4 sum_zp = vfloat4::zero();
|
||||
vfloat4 sum_wp = vfloat4::zero();
|
||||
|
||||
for (size_t i = 0; i < texel_count; i++)
|
||||
{
|
||||
unsigned int iwt = texel_indexes[i];
|
||||
vfloat4 texel_datum = blk.texel(iwt);
|
||||
texel_datum = texel_datum - average;
|
||||
|
||||
vfloat4 zero = vfloat4::zero();
|
||||
|
||||
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
|
||||
sum_xp += select(zero, texel_datum, tdm0);
|
||||
|
||||
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
|
||||
sum_yp += select(zero, texel_datum, tdm1);
|
||||
|
||||
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
|
||||
sum_zp += select(zero, texel_datum, tdm2);
|
||||
|
||||
vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
|
||||
sum_wp += select(zero, texel_datum, tdm3);
|
||||
}
|
||||
|
||||
vfloat4 prod_xp = dot(sum_xp, sum_xp);
|
||||
vfloat4 prod_yp = dot(sum_yp, sum_yp);
|
||||
vfloat4 prod_zp = dot(sum_zp, sum_zp);
|
||||
vfloat4 prod_wp = dot(sum_wp, sum_wp);
|
||||
|
||||
vfloat4 best_vector = sum_xp;
|
||||
vfloat4 best_sum = prod_xp;
|
||||
|
||||
vmask4 mask = prod_yp > best_sum;
|
||||
best_vector = select(best_vector, sum_yp, mask);
|
||||
best_sum = select(best_sum, prod_yp, mask);
|
||||
|
||||
mask = prod_zp > best_sum;
|
||||
best_vector = select(best_vector, sum_zp, mask);
|
||||
best_sum = select(best_sum, prod_zp, mask);
|
||||
|
||||
mask = prod_wp > best_sum;
|
||||
best_vector = select(best_vector, sum_wp, mask);
|
||||
|
||||
pm[partition].dir = best_vector;
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_avgs_and_dirs_3_comp(
|
||||
const partition_info& pi,
|
||||
const image_block& blk,
|
||||
unsigned int omitted_component,
|
||||
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
// Pre-compute partition_averages
|
||||
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
|
||||
compute_partition_averages_rgba(pi, blk, partition_averages);
|
||||
|
||||
const float* data_vr = blk.data_r;
|
||||
const float* data_vg = blk.data_g;
|
||||
const float* data_vb = blk.data_b;
|
||||
|
||||
// TODO: Data-driven permute would be useful to avoid this ...
|
||||
if (omitted_component == 0)
|
||||
{
|
||||
partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
|
||||
partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
|
||||
partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
|
||||
partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
|
||||
|
||||
data_vr = blk.data_g;
|
||||
data_vg = blk.data_b;
|
||||
data_vb = blk.data_a;
|
||||
}
|
||||
else if (omitted_component == 1)
|
||||
{
|
||||
partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
|
||||
partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
|
||||
partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
|
||||
partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
|
||||
|
||||
data_vg = blk.data_b;
|
||||
data_vb = blk.data_a;
|
||||
}
|
||||
else if (omitted_component == 2)
|
||||
{
|
||||
partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
|
||||
partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
|
||||
partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
|
||||
partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
|
||||
|
||||
data_vb = blk.data_a;
|
||||
}
|
||||
else
|
||||
{
|
||||
partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
|
||||
partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
|
||||
partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
|
||||
partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
|
||||
}
|
||||
|
||||
size_t partition_count = pi.partition_count;
|
||||
promise(partition_count > 0);
|
||||
|
||||
for (size_t partition = 0; partition < partition_count; partition++)
|
||||
{
|
||||
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
|
||||
size_t texel_count = pi.partition_texel_count[partition];
|
||||
promise(texel_count > 0);
|
||||
|
||||
vfloat4 average = partition_averages[partition];
|
||||
pm[partition].avg = average;
|
||||
|
||||
vfloat4 sum_xp = vfloat4::zero();
|
||||
vfloat4 sum_yp = vfloat4::zero();
|
||||
vfloat4 sum_zp = vfloat4::zero();
|
||||
|
||||
for (size_t i = 0; i < texel_count; i++)
|
||||
{
|
||||
unsigned int iwt = texel_indexes[i];
|
||||
|
||||
vfloat4 texel_datum = vfloat3(data_vr[iwt],
|
||||
data_vg[iwt],
|
||||
data_vb[iwt]);
|
||||
texel_datum = texel_datum - average;
|
||||
|
||||
vfloat4 zero = vfloat4::zero();
|
||||
|
||||
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
|
||||
sum_xp += select(zero, texel_datum, tdm0);
|
||||
|
||||
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
|
||||
sum_yp += select(zero, texel_datum, tdm1);
|
||||
|
||||
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
|
||||
sum_zp += select(zero, texel_datum, tdm2);
|
||||
}
|
||||
|
||||
vfloat4 prod_xp = dot(sum_xp, sum_xp);
|
||||
vfloat4 prod_yp = dot(sum_yp, sum_yp);
|
||||
vfloat4 prod_zp = dot(sum_zp, sum_zp);
|
||||
|
||||
vfloat4 best_vector = sum_xp;
|
||||
vfloat4 best_sum = prod_xp;
|
||||
|
||||
vmask4 mask = prod_yp > best_sum;
|
||||
best_vector = select(best_vector, sum_yp, mask);
|
||||
best_sum = select(best_sum, prod_yp, mask);
|
||||
|
||||
mask = prod_zp > best_sum;
|
||||
best_vector = select(best_vector, sum_zp, mask);
|
||||
|
||||
pm[partition].dir = best_vector;
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_avgs_and_dirs_3_comp_rgb(
|
||||
const partition_info& pi,
|
||||
const image_block& blk,
|
||||
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
size_t partition_count = pi.partition_count;
|
||||
promise(partition_count > 0);
|
||||
|
||||
// Pre-compute partition_averages
|
||||
vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
|
||||
compute_partition_averages_rgb(pi, blk, partition_averages);
|
||||
|
||||
for (size_t partition = 0; partition < partition_count; partition++)
|
||||
{
|
||||
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
|
||||
size_t texel_count = pi.partition_texel_count[partition];
|
||||
promise(texel_count > 0);
|
||||
|
||||
vfloat4 average = partition_averages[partition];
|
||||
pm[partition].avg = average;
|
||||
|
||||
vfloat4 sum_xp = vfloat4::zero();
|
||||
vfloat4 sum_yp = vfloat4::zero();
|
||||
vfloat4 sum_zp = vfloat4::zero();
|
||||
|
||||
for (size_t i = 0; i < texel_count; i++)
|
||||
{
|
||||
unsigned int iwt = texel_indexes[i];
|
||||
|
||||
vfloat4 texel_datum = blk.texel3(iwt);
|
||||
texel_datum = texel_datum - average;
|
||||
|
||||
vfloat4 zero = vfloat4::zero();
|
||||
|
||||
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
|
||||
sum_xp += select(zero, texel_datum, tdm0);
|
||||
|
||||
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
|
||||
sum_yp += select(zero, texel_datum, tdm1);
|
||||
|
||||
vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
|
||||
sum_zp += select(zero, texel_datum, tdm2);
|
||||
}
|
||||
|
||||
vfloat4 prod_xp = dot(sum_xp, sum_xp);
|
||||
vfloat4 prod_yp = dot(sum_yp, sum_yp);
|
||||
vfloat4 prod_zp = dot(sum_zp, sum_zp);
|
||||
|
||||
vfloat4 best_vector = sum_xp;
|
||||
vfloat4 best_sum = prod_xp;
|
||||
|
||||
vmask4 mask = prod_yp > best_sum;
|
||||
best_vector = select(best_vector, sum_yp, mask);
|
||||
best_sum = select(best_sum, prod_yp, mask);
|
||||
|
||||
mask = prod_zp > best_sum;
|
||||
best_vector = select(best_vector, sum_zp, mask);
|
||||
|
||||
pm[partition].dir = best_vector;
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_avgs_and_dirs_2_comp(
|
||||
const partition_info& pt,
|
||||
const image_block& blk,
|
||||
unsigned int component1,
|
||||
unsigned int component2,
|
||||
partition_metrics pm[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
vfloat4 average;
|
||||
|
||||
const float* data_vr = nullptr;
|
||||
const float* data_vg = nullptr;
|
||||
|
||||
if (component1 == 0 && component2 == 1)
|
||||
{
|
||||
average = blk.data_mean.swz<0, 1>();
|
||||
|
||||
data_vr = blk.data_r;
|
||||
data_vg = blk.data_g;
|
||||
}
|
||||
else if (component1 == 0 && component2 == 2)
|
||||
{
|
||||
average = blk.data_mean.swz<0, 2>();
|
||||
|
||||
data_vr = blk.data_r;
|
||||
data_vg = blk.data_b;
|
||||
}
|
||||
else // (component1 == 1 && component2 == 2)
|
||||
{
|
||||
assert(component1 == 1 && component2 == 2);
|
||||
|
||||
average = blk.data_mean.swz<1, 2>();
|
||||
|
||||
data_vr = blk.data_g;
|
||||
data_vg = blk.data_b;
|
||||
}
|
||||
|
||||
size_t partition_count = pt.partition_count;
|
||||
promise(partition_count > 0);
|
||||
|
||||
for (size_t partition = 0; partition < partition_count; partition++)
|
||||
{
|
||||
const uint8_t *texel_indexes = pt.texels_of_partition[partition];
|
||||
size_t texel_count = pt.partition_texel_count[partition];
|
||||
promise(texel_count > 0);
|
||||
|
||||
// Only compute a partition mean if more than one partition
|
||||
if (partition_count > 1)
|
||||
{
|
||||
average = vfloat4::zero();
|
||||
for (size_t i = 0; i < texel_count; i++)
|
||||
{
|
||||
unsigned int iwt = texel_indexes[i];
|
||||
average += vfloat2(data_vr[iwt], data_vg[iwt]);
|
||||
}
|
||||
|
||||
average = average / static_cast<float>(texel_count);
|
||||
}
|
||||
|
||||
pm[partition].avg = average;
|
||||
|
||||
vfloat4 sum_xp = vfloat4::zero();
|
||||
vfloat4 sum_yp = vfloat4::zero();
|
||||
|
||||
for (size_t i = 0; i < texel_count; i++)
|
||||
{
|
||||
unsigned int iwt = texel_indexes[i];
|
||||
vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
|
||||
texel_datum = texel_datum - average;
|
||||
|
||||
vfloat4 zero = vfloat4::zero();
|
||||
|
||||
vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
|
||||
sum_xp += select(zero, texel_datum, tdm0);
|
||||
|
||||
vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
|
||||
sum_yp += select(zero, texel_datum, tdm1);
|
||||
}
|
||||
|
||||
vfloat4 prod_xp = dot(sum_xp, sum_xp);
|
||||
vfloat4 prod_yp = dot(sum_yp, sum_yp);
|
||||
|
||||
vfloat4 best_vector = sum_xp;
|
||||
vfloat4 best_sum = prod_xp;
|
||||
|
||||
vmask4 mask = prod_yp > best_sum;
|
||||
best_vector = select(best_vector, sum_yp, mask);
|
||||
|
||||
pm[partition].dir = best_vector;
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_error_squared_rgba(
|
||||
const partition_info& pi,
|
||||
const image_block& blk,
|
||||
const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
|
||||
const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
|
||||
float line_lengths[BLOCK_MAX_PARTITIONS],
|
||||
float& uncor_error,
|
||||
float& samec_error
|
||||
) {
|
||||
size_t partition_count = pi.partition_count;
|
||||
promise(partition_count > 0);
|
||||
|
||||
vfloatacc uncor_errorsumv = vfloatacc::zero();
|
||||
vfloatacc samec_errorsumv = vfloatacc::zero();
|
||||
|
||||
for (size_t partition = 0; partition < partition_count; partition++)
|
||||
{
|
||||
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
|
||||
|
||||
processed_line4 l_uncor = uncor_plines[partition];
|
||||
processed_line4 l_samec = samec_plines[partition];
|
||||
|
||||
size_t texel_count = pi.partition_texel_count[partition];
|
||||
promise(texel_count > 0);
|
||||
|
||||
// Vectorize some useful scalar inputs
|
||||
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
|
||||
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
|
||||
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
|
||||
vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
|
||||
|
||||
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
|
||||
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
|
||||
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
|
||||
vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
|
||||
|
||||
vfloat l_samec_bs0(l_samec.bs.lane<0>());
|
||||
vfloat l_samec_bs1(l_samec.bs.lane<1>());
|
||||
vfloat l_samec_bs2(l_samec.bs.lane<2>());
|
||||
vfloat l_samec_bs3(l_samec.bs.lane<3>());
|
||||
|
||||
assert(all(l_samec.amod == vfloat4(0.0f)));
|
||||
|
||||
vfloat uncor_loparamv(1e10f);
|
||||
vfloat uncor_hiparamv(-1e10f);
|
||||
|
||||
vfloat ew_r(blk.channel_weight.lane<0>());
|
||||
vfloat ew_g(blk.channel_weight.lane<1>());
|
||||
vfloat ew_b(blk.channel_weight.lane<2>());
|
||||
vfloat ew_a(blk.channel_weight.lane<3>());
|
||||
|
||||
// This implementation over-shoots, but this is safe as we initialize the texel_indexes
|
||||
// array to extend the last value. This means min/max are not impacted, but we need to mask
|
||||
// out the dummy values when we compute the line weighting.
|
||||
vint lane_ids = vint::lane_id();
|
||||
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vmask mask = lane_ids < vint_from_size(texel_count);
|
||||
const uint8_t* texel_idxs = texel_indexes + i;
|
||||
|
||||
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
|
||||
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
|
||||
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
|
||||
vfloat data_a = gatherf_byte_inds<vfloat>(blk.data_a, texel_idxs);
|
||||
|
||||
vfloat uncor_param = (data_r * l_uncor_bs0)
|
||||
+ (data_g * l_uncor_bs1)
|
||||
+ (data_b * l_uncor_bs2)
|
||||
+ (data_a * l_uncor_bs3);
|
||||
|
||||
uncor_loparamv = min(uncor_param, uncor_loparamv);
|
||||
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
|
||||
|
||||
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
|
||||
+ (uncor_param * l_uncor_bs0);
|
||||
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
|
||||
+ (uncor_param * l_uncor_bs1);
|
||||
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
|
||||
+ (uncor_param * l_uncor_bs2);
|
||||
vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
|
||||
+ (uncor_param * l_uncor_bs3);
|
||||
|
||||
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
|
||||
+ (ew_g * uncor_dist1 * uncor_dist1)
|
||||
+ (ew_b * uncor_dist2 * uncor_dist2)
|
||||
+ (ew_a * uncor_dist3 * uncor_dist3);
|
||||
|
||||
haccumulate(uncor_errorsumv, uncor_err, mask);
|
||||
|
||||
// Process samechroma data
|
||||
vfloat samec_param = (data_r * l_samec_bs0)
|
||||
+ (data_g * l_samec_bs1)
|
||||
+ (data_b * l_samec_bs2)
|
||||
+ (data_a * l_samec_bs3);
|
||||
|
||||
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
|
||||
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
|
||||
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
|
||||
vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
|
||||
|
||||
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
|
||||
+ (ew_g * samec_dist1 * samec_dist1)
|
||||
+ (ew_b * samec_dist2 * samec_dist2)
|
||||
+ (ew_a * samec_dist3 * samec_dist3);
|
||||
|
||||
haccumulate(samec_errorsumv, samec_err, mask);
|
||||
|
||||
lane_ids += vint(ASTCENC_SIMD_WIDTH);
|
||||
}
|
||||
|
||||
// Turn very small numbers and NaNs into a small number
|
||||
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
|
||||
line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
|
||||
}
|
||||
|
||||
uncor_error = hadd_s(uncor_errorsumv);
|
||||
samec_error = hadd_s(samec_errorsumv);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_error_squared_rgb(
|
||||
const partition_info& pi,
|
||||
const image_block& blk,
|
||||
partition_lines3 plines[BLOCK_MAX_PARTITIONS],
|
||||
float& uncor_error,
|
||||
float& samec_error
|
||||
) {
|
||||
size_t partition_count = pi.partition_count;
|
||||
promise(partition_count > 0);
|
||||
|
||||
vfloatacc uncor_errorsumv = vfloatacc::zero();
|
||||
vfloatacc samec_errorsumv = vfloatacc::zero();
|
||||
|
||||
for (size_t partition = 0; partition < partition_count; partition++)
|
||||
{
|
||||
partition_lines3& pl = plines[partition];
|
||||
const uint8_t *texel_indexes = pi.texels_of_partition[partition];
|
||||
size_t texel_count = pi.partition_texel_count[partition];
|
||||
promise(texel_count > 0);
|
||||
|
||||
processed_line3 l_uncor = pl.uncor_pline;
|
||||
processed_line3 l_samec = pl.samec_pline;
|
||||
|
||||
// Vectorize some useful scalar inputs
|
||||
vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
|
||||
vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
|
||||
vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
|
||||
|
||||
vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
|
||||
vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
|
||||
vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
|
||||
|
||||
vfloat l_samec_bs0(l_samec.bs.lane<0>());
|
||||
vfloat l_samec_bs1(l_samec.bs.lane<1>());
|
||||
vfloat l_samec_bs2(l_samec.bs.lane<2>());
|
||||
|
||||
assert(all(l_samec.amod == vfloat4(0.0f)));
|
||||
|
||||
vfloat uncor_loparamv(1e10f);
|
||||
vfloat uncor_hiparamv(-1e10f);
|
||||
|
||||
vfloat ew_r(blk.channel_weight.lane<0>());
|
||||
vfloat ew_g(blk.channel_weight.lane<1>());
|
||||
vfloat ew_b(blk.channel_weight.lane<2>());
|
||||
|
||||
// This implementation over-shoots, but this is safe as we initialize the weights array
|
||||
// to extend the last value. This means min/max are not impacted, but we need to mask
|
||||
// out the dummy values when we compute the line weighting.
|
||||
vint lane_ids = vint::lane_id();
|
||||
for (size_t i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vmask mask = lane_ids < vint_from_size(texel_count);
|
||||
const uint8_t* texel_idxs = texel_indexes + i;
|
||||
|
||||
vfloat data_r = gatherf_byte_inds<vfloat>(blk.data_r, texel_idxs);
|
||||
vfloat data_g = gatherf_byte_inds<vfloat>(blk.data_g, texel_idxs);
|
||||
vfloat data_b = gatherf_byte_inds<vfloat>(blk.data_b, texel_idxs);
|
||||
|
||||
vfloat uncor_param = (data_r * l_uncor_bs0)
|
||||
+ (data_g * l_uncor_bs1)
|
||||
+ (data_b * l_uncor_bs2);
|
||||
|
||||
uncor_loparamv = min(uncor_param, uncor_loparamv);
|
||||
uncor_hiparamv = max(uncor_param, uncor_hiparamv);
|
||||
|
||||
vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
|
||||
+ (uncor_param * l_uncor_bs0);
|
||||
vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
|
||||
+ (uncor_param * l_uncor_bs1);
|
||||
vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
|
||||
+ (uncor_param * l_uncor_bs2);
|
||||
|
||||
vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
|
||||
+ (ew_g * uncor_dist1 * uncor_dist1)
|
||||
+ (ew_b * uncor_dist2 * uncor_dist2);
|
||||
|
||||
haccumulate(uncor_errorsumv, uncor_err, mask);
|
||||
|
||||
// Process samechroma data
|
||||
vfloat samec_param = (data_r * l_samec_bs0)
|
||||
+ (data_g * l_samec_bs1)
|
||||
+ (data_b * l_samec_bs2);
|
||||
|
||||
vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
|
||||
vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
|
||||
vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
|
||||
|
||||
vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
|
||||
+ (ew_g * samec_dist1 * samec_dist1)
|
||||
+ (ew_b * samec_dist2 * samec_dist2);
|
||||
|
||||
haccumulate(samec_errorsumv, samec_err, mask);
|
||||
|
||||
lane_ids += vint(ASTCENC_SIMD_WIDTH);
|
||||
}
|
||||
|
||||
// Turn very small numbers and NaNs into a small number
|
||||
float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
|
||||
pl.line_length = astc::max(uncor_linelen, 1e-7f);
|
||||
}
|
||||
|
||||
uncor_error = hadd_s(uncor_errorsumv);
|
||||
samec_error = hadd_s(samec_errorsumv);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,941 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include <utility>
|
||||
|
||||
/**
|
||||
* @brief Functions for color unquantization.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/**
|
||||
* @brief Un-blue-contract a color.
|
||||
*
|
||||
* This function reverses any applied blue contraction.
|
||||
*
|
||||
* @param input The input color that has been blue-contracted.
|
||||
*
|
||||
* @return The uncontracted color.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 uncontract_color(
	vint4 input
) {
	// Every lane averaged with the blue lane (arithmetic shift right by one)
	vint4 blended = asr<1>(input + input.lane<2>());

	// Only lanes 0 and 1 take the blended value; lanes 2 and 3 pass through
	vmask4 first_two_lanes(true, true, false, false);
	return select(input, blended, first_two_lanes);
}
|
||||
|
||||
void rgba_delta_unpack(
	vint4 input0,
	vint4 input1,
	vint4& output0,
	vint4& output1
) {
	// Move the transferred bits between the base color and the deltas
	bit_transfer_signed(input1, input0);

	// The sign of the RGB delta sum, taken before the deltas are applied,
	// signals whether the endpoints need blue-uncontraction and swapping
	int rgb_sum = hadd_rgb_s(input1);

	vint4 ep0 = input0;
	vint4 ep1 = input1 + input0;

	if (rgb_sum < 0)
	{
		vint4 new_ep1 = uncontract_color(ep0);
		ep0 = uncontract_color(ep1);
		ep1 = new_ep1;
	}

	output0 = clamp(0, 255, ep0);
	output1 = clamp(0, 255, ep1);
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGB color that uses delta encoding.
|
||||
*
|
||||
* Output alpha set to 255.
|
||||
*
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param input1 The packed endpoint 1 color deltas.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgb_delta_unpack(
	vint4 input0,
	vint4 input1,
	vint4& output0,
	vint4& output1
) {
	// Decode as RGBA, then overwrite lane 3 of both endpoints with 255
	rgba_delta_unpack(input0, input1, output0, output1);

	output1.set_lane<3>(255);
	output0.set_lane<3>(255);
}
|
||||
|
||||
void rgba_unpack(
	vint4 input0,
	vint4 input1,
	vint4& output0,
	vint4& output1
) {
	// Endpoints already in order: pass them through untouched
	if (hadd_rgb_s(input0) <= hadd_rgb_s(input1))
	{
		output0 = input0;
		output1 = input1;
		return;
	}

	// Otherwise undo the blue contraction on both and exchange them
	output0 = uncontract_color(input1);
	output1 = uncontract_color(input0);
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGB color that uses direct encoding.
|
||||
*
|
||||
* Output alpha set to 255.
|
||||
*
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param input1 The packed endpoint 1 color.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgb_unpack(
	vint4 input0,
	vint4 input1,
	vint4& output0,
	vint4& output1
) {
	// Decode as RGBA, then overwrite lane 3 of both endpoints with 255
	rgba_unpack(input0, input1, output0, output1);

	output1.set_lane<3>(255);
	output0.set_lane<3>(255);
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGBA color that uses scaled encoding.
|
||||
*
|
||||
* Note only the RGB channels use the scaled encoding, alpha uses direct.
|
||||
*
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param alpha1 The packed endpoint 1 alpha value.
|
||||
* @param scale The packed quantized scale.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgb_scale_alpha_unpack(
	vint4 input0,
	uint8_t alpha1,
	uint8_t scale,
	vint4& output0,
	vint4& output1
) {
	// Endpoint 0 is the input scaled by scale/256, keeping the input's own
	// lane 3 value
	vint4 scaled = asr<8>(input0 * scale);
	scaled.set_lane<3>(input0.lane<3>());

	// Endpoint 1 is the unscaled input with lane 3 replaced by alpha1
	vint4 direct = input0;
	direct.set_lane<3>(alpha1);

	output0 = scaled;
	output1 = direct;
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR RGB color that uses scaled encoding.
|
||||
*
|
||||
* Output alpha is 255.
|
||||
*
|
||||
* @param input0 The packed endpoint 0 color.
|
||||
* @param scale The packed scale.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void rgb_scale_unpack(
	vint4 input0,
	int scale,
	vint4& output0,
	vint4& output1
) {
	// Endpoint 0 is the input scaled by scale/256
	vint4 scaled = asr<8>(input0 * scale);
	scaled.set_lane<3>(255);

	// Endpoint 1 is the unscaled input
	vint4 direct = input0;
	direct.set_lane<3>(255);

	output0 = scaled;
	output1 = direct;
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR L color that uses direct encoding.
|
||||
*
|
||||
* Output alpha is 255.
|
||||
*
|
||||
* @param input The packed endpoints.
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void luminance_unpack(
|
||||
const uint8_t input[2],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int lum0 = input[0];
|
||||
int lum1 = input[1];
|
||||
output0 = vint4(lum0, lum0, lum0, 255);
|
||||
output1 = vint4(lum1, lum1, lum1, 255);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR L color that uses delta encoding.
|
||||
*
|
||||
* Output alpha is 255.
|
||||
*
|
||||
* @param input The packed endpoints (L0, L1).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void luminance_delta_unpack(
|
||||
const uint8_t input[2],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
int l0 = (v0 >> 2) | (v1 & 0xC0);
|
||||
int l1 = l0 + (v1 & 0x3F);
|
||||
|
||||
l1 = astc::min(l1, 255);
|
||||
|
||||
output0 = vint4(l0, l0, l0, 255);
|
||||
output1 = vint4(l1, l1, l1, 255);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR LA color that uses direct encoding.
|
||||
*
|
||||
* @param input The packed endpoints (L0, L1, A0, A1).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void luminance_alpha_unpack(
|
||||
const uint8_t input[4],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int lum0 = input[0];
|
||||
int lum1 = input[1];
|
||||
int alpha0 = input[2];
|
||||
int alpha1 = input[3];
|
||||
output0 = vint4(lum0, lum0, lum0, alpha0);
|
||||
output1 = vint4(lum1, lum1, lum1, alpha1);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an LDR LA color that uses delta encoding.
|
||||
*
|
||||
* @param input The packed endpoints (L0, L1, A0, A1).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void luminance_alpha_delta_unpack(
|
||||
const uint8_t input[4],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int lum0 = input[0];
|
||||
int lum1 = input[1];
|
||||
int alpha0 = input[2];
|
||||
int alpha1 = input[3];
|
||||
|
||||
lum0 |= (lum1 & 0x80) << 1;
|
||||
alpha0 |= (alpha1 & 0x80) << 1;
|
||||
lum1 &= 0x7F;
|
||||
alpha1 &= 0x7F;
|
||||
|
||||
if (lum1 & 0x40)
|
||||
{
|
||||
lum1 -= 0x80;
|
||||
}
|
||||
|
||||
if (alpha1 & 0x40)
|
||||
{
|
||||
alpha1 -= 0x80;
|
||||
}
|
||||
|
||||
lum0 >>= 1;
|
||||
lum1 >>= 1;
|
||||
alpha0 >>= 1;
|
||||
alpha1 >>= 1;
|
||||
lum1 += lum0;
|
||||
alpha1 += alpha0;
|
||||
|
||||
lum1 = astc::clamp(lum1, 0, 255);
|
||||
alpha1 = astc::clamp(alpha1, 0, 255);
|
||||
|
||||
output0 = vint4(lum0, lum0, lum0, alpha0);
|
||||
output1 = vint4(lum1, lum1, lum1, alpha1);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an HDR RGB + offset encoding.
|
||||
*
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_rgbo_unpack(
|
||||
const uint8_t input[4],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
int v2 = input[2];
|
||||
int v3 = input[3];
|
||||
|
||||
int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3);
|
||||
|
||||
int majcomp;
|
||||
int mode;
|
||||
if ((modeval & 0xC) != 0xC)
|
||||
{
|
||||
majcomp = modeval >> 2;
|
||||
mode = modeval & 3;
|
||||
}
|
||||
else if (modeval != 0xF)
|
||||
{
|
||||
majcomp = modeval & 3;
|
||||
mode = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
majcomp = 0;
|
||||
mode = 5;
|
||||
}
|
||||
|
||||
int red = v0 & 0x3F;
|
||||
int green = v1 & 0x1F;
|
||||
int blue = v2 & 0x1F;
|
||||
int scale = v3 & 0x1F;
|
||||
|
||||
int bit0 = (v1 >> 6) & 1;
|
||||
int bit1 = (v1 >> 5) & 1;
|
||||
int bit2 = (v2 >> 6) & 1;
|
||||
int bit3 = (v2 >> 5) & 1;
|
||||
int bit4 = (v3 >> 7) & 1;
|
||||
int bit5 = (v3 >> 6) & 1;
|
||||
int bit6 = (v3 >> 5) & 1;
|
||||
|
||||
int ohcomp = 1 << mode;
|
||||
|
||||
if (ohcomp & 0x30)
|
||||
green |= bit0 << 6;
|
||||
if (ohcomp & 0x3A)
|
||||
green |= bit1 << 5;
|
||||
if (ohcomp & 0x30)
|
||||
blue |= bit2 << 6;
|
||||
if (ohcomp & 0x3A)
|
||||
blue |= bit3 << 5;
|
||||
|
||||
if (ohcomp & 0x3D)
|
||||
scale |= bit6 << 5;
|
||||
if (ohcomp & 0x2D)
|
||||
scale |= bit5 << 6;
|
||||
if (ohcomp & 0x04)
|
||||
scale |= bit4 << 7;
|
||||
|
||||
if (ohcomp & 0x3B)
|
||||
red |= bit4 << 6;
|
||||
if (ohcomp & 0x04)
|
||||
red |= bit3 << 6;
|
||||
|
||||
if (ohcomp & 0x10)
|
||||
red |= bit5 << 7;
|
||||
if (ohcomp & 0x0F)
|
||||
red |= bit2 << 7;
|
||||
|
||||
if (ohcomp & 0x05)
|
||||
red |= bit1 << 8;
|
||||
if (ohcomp & 0x0A)
|
||||
red |= bit0 << 8;
|
||||
|
||||
if (ohcomp & 0x05)
|
||||
red |= bit0 << 9;
|
||||
if (ohcomp & 0x02)
|
||||
red |= bit6 << 9;
|
||||
|
||||
if (ohcomp & 0x01)
|
||||
red |= bit3 << 10;
|
||||
if (ohcomp & 0x02)
|
||||
red |= bit5 << 10;
|
||||
|
||||
// expand to 12 bits.
|
||||
static const int shamts[6] { 1, 1, 2, 3, 4, 5 };
|
||||
int shamt = shamts[mode];
|
||||
red <<= shamt;
|
||||
green <<= shamt;
|
||||
blue <<= shamt;
|
||||
scale <<= shamt;
|
||||
|
||||
// on modes 0 to 4, the values stored for "green" and "blue" are differentials,
|
||||
// not absolute values.
|
||||
if (mode != 5)
|
||||
{
|
||||
green = red - green;
|
||||
blue = red - blue;
|
||||
}
|
||||
|
||||
// switch around components.
|
||||
int temp;
|
||||
switch (majcomp)
|
||||
{
|
||||
case 1:
|
||||
temp = red;
|
||||
red = green;
|
||||
green = temp;
|
||||
break;
|
||||
case 2:
|
||||
temp = red;
|
||||
red = blue;
|
||||
blue = temp;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
int red0 = red - scale;
|
||||
int green0 = green - scale;
|
||||
int blue0 = blue - scale;
|
||||
|
||||
// clamp to [0,0xFFF].
|
||||
if (red < 0)
|
||||
red = 0;
|
||||
if (green < 0)
|
||||
green = 0;
|
||||
if (blue < 0)
|
||||
blue = 0;
|
||||
|
||||
if (red0 < 0)
|
||||
red0 = 0;
|
||||
if (green0 < 0)
|
||||
green0 = 0;
|
||||
if (blue0 < 0)
|
||||
blue0 = 0;
|
||||
|
||||
output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
|
||||
output1 = vint4(red << 4, green << 4, blue << 4, 0x7800);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an HDR RGB direct encoding.
|
||||
*
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_rgb_unpack(
|
||||
const uint8_t input[6],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
int v2 = input[2];
|
||||
int v3 = input[3];
|
||||
int v4 = input[4];
|
||||
int v5 = input[5];
|
||||
|
||||
// extract all the fixed-placement bitfields
|
||||
int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2);
|
||||
|
||||
int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1);
|
||||
|
||||
if (majcomp == 3)
|
||||
{
|
||||
output0 = vint4(v0 << 8, v2 << 8, (v4 & 0x7F) << 9, 0x7800);
|
||||
output1 = vint4(v1 << 8, v3 << 8, (v5 & 0x7F) << 9, 0x7800);
|
||||
return;
|
||||
}
|
||||
|
||||
int a = v0 | ((v1 & 0x40) << 2);
|
||||
int b0 = v2 & 0x3f;
|
||||
int b1 = v3 & 0x3f;
|
||||
int c = v1 & 0x3f;
|
||||
int d0 = v4 & 0x7f;
|
||||
int d1 = v5 & 0x7f;
|
||||
|
||||
// get hold of the number of bits in 'd0' and 'd1'
|
||||
static const int dbits_tab[8] { 7, 6, 7, 6, 5, 6, 5, 6 };
|
||||
int dbits = dbits_tab[modeval];
|
||||
|
||||
// extract six variable-placement bits
|
||||
int bit0 = (v2 >> 6) & 1;
|
||||
int bit1 = (v3 >> 6) & 1;
|
||||
int bit2 = (v4 >> 6) & 1;
|
||||
int bit3 = (v5 >> 6) & 1;
|
||||
int bit4 = (v4 >> 5) & 1;
|
||||
int bit5 = (v5 >> 5) & 1;
|
||||
|
||||
// and prepend the variable-placement bits depending on mode.
|
||||
int ohmod = 1 << modeval; // one-hot-mode
|
||||
if (ohmod & 0xA4)
|
||||
a |= bit0 << 9;
|
||||
if (ohmod & 0x8)
|
||||
a |= bit2 << 9;
|
||||
if (ohmod & 0x50)
|
||||
a |= bit4 << 9;
|
||||
|
||||
if (ohmod & 0x50)
|
||||
a |= bit5 << 10;
|
||||
if (ohmod & 0xA0)
|
||||
a |= bit1 << 10;
|
||||
|
||||
if (ohmod & 0xC0)
|
||||
a |= bit2 << 11;
|
||||
|
||||
if (ohmod & 0x4)
|
||||
c |= bit1 << 6;
|
||||
if (ohmod & 0xE8)
|
||||
c |= bit3 << 6;
|
||||
|
||||
if (ohmod & 0x20)
|
||||
c |= bit2 << 7;
|
||||
|
||||
if (ohmod & 0x5B)
|
||||
{
|
||||
b0 |= bit0 << 6;
|
||||
b1 |= bit1 << 6;
|
||||
}
|
||||
|
||||
if (ohmod & 0x12)
|
||||
{
|
||||
b0 |= bit2 << 7;
|
||||
b1 |= bit3 << 7;
|
||||
}
|
||||
|
||||
if (ohmod & 0xAF)
|
||||
{
|
||||
d0 |= bit4 << 5;
|
||||
d1 |= bit5 << 5;
|
||||
}
|
||||
|
||||
if (ohmod & 0x5)
|
||||
{
|
||||
d0 |= bit2 << 6;
|
||||
d1 |= bit3 << 6;
|
||||
}
|
||||
|
||||
// sign-extend 'd0' and 'd1'
|
||||
// note: this code assumes that signed right-shift actually sign-fills, not zero-fills.
|
||||
int32_t d0x = d0;
|
||||
int32_t d1x = d1;
|
||||
int sx_shamt = 32 - dbits;
|
||||
d0x <<= sx_shamt;
|
||||
d0x >>= sx_shamt;
|
||||
d1x <<= sx_shamt;
|
||||
d1x >>= sx_shamt;
|
||||
d0 = d0x;
|
||||
d1 = d1x;
|
||||
|
||||
// expand all values to 12 bits, with left-shift as needed.
|
||||
int val_shamt = (modeval >> 1) ^ 3;
|
||||
a <<= val_shamt;
|
||||
b0 <<= val_shamt;
|
||||
b1 <<= val_shamt;
|
||||
c <<= val_shamt;
|
||||
d0 <<= val_shamt;
|
||||
d1 <<= val_shamt;
|
||||
|
||||
// then compute the actual color values.
|
||||
int red1 = a;
|
||||
int green1 = a - b0;
|
||||
int blue1 = a - b1;
|
||||
int red0 = a - c;
|
||||
int green0 = a - b0 - c - d0;
|
||||
int blue0 = a - b1 - c - d1;
|
||||
|
||||
// clamp the color components to [0,2^12 - 1]
|
||||
red0 = astc::clamp(red0, 0, 4095);
|
||||
green0 = astc::clamp(green0, 0, 4095);
|
||||
blue0 = astc::clamp(blue0, 0, 4095);
|
||||
|
||||
red1 = astc::clamp(red1, 0, 4095);
|
||||
green1 = astc::clamp(green1, 0, 4095);
|
||||
blue1 = astc::clamp(blue1, 0, 4095);
|
||||
|
||||
// switch around the color components
|
||||
int temp0, temp1;
|
||||
switch (majcomp)
|
||||
{
|
||||
case 1: // switch around red and green
|
||||
temp0 = red0;
|
||||
temp1 = red1;
|
||||
red0 = green0;
|
||||
red1 = green1;
|
||||
green0 = temp0;
|
||||
green1 = temp1;
|
||||
break;
|
||||
case 2: // switch around red and blue
|
||||
temp0 = red0;
|
||||
temp1 = red1;
|
||||
red0 = blue0;
|
||||
red1 = blue1;
|
||||
blue0 = temp0;
|
||||
blue1 = temp1;
|
||||
break;
|
||||
case 0: // no switch
|
||||
break;
|
||||
}
|
||||
|
||||
output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800);
|
||||
output1 = vint4(red1 << 4, green1 << 4, blue1 << 4, 0x7800);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an HDR RGB + LDR A direct encoding.
|
||||
*
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_rgb_ldr_alpha_unpack(
|
||||
const uint8_t input[8],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
hdr_rgb_unpack(input, output0, output1);
|
||||
|
||||
int v6 = input[6];
|
||||
int v7 = input[7];
|
||||
output0.set_lane<3>(v6);
|
||||
output1.set_lane<3>(v7);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an HDR L (small range) direct encoding.
|
||||
*
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_luminance_small_range_unpack(
|
||||
const uint8_t input[2],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
|
||||
int y0, y1;
|
||||
if (v0 & 0x80)
|
||||
{
|
||||
y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
|
||||
y1 = (v1 & 0x1F) << 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
|
||||
y1 = (v1 & 0xF) << 1;
|
||||
}
|
||||
|
||||
y1 += y0;
|
||||
if (y1 > 0xFFF)
|
||||
{
|
||||
y1 = 0xFFF;
|
||||
}
|
||||
|
||||
output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
|
||||
output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an HDR L (large range) direct encoding.
|
||||
*
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_luminance_large_range_unpack(
|
||||
const uint8_t input[2],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
int v0 = input[0];
|
||||
int v1 = input[1];
|
||||
|
||||
int y0, y1;
|
||||
if (v1 >= v0)
|
||||
{
|
||||
y0 = v0 << 4;
|
||||
y1 = v1 << 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
y0 = (v1 << 4) + 8;
|
||||
y1 = (v0 << 4) - 8;
|
||||
}
|
||||
|
||||
output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800);
|
||||
output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800);
|
||||
}
|
||||
|
||||
/**
 * @brief Unpack an HDR A direct encoding.
 *
 * The top bit of each input byte forms a two-bit mode selector. Selector 3 stores both alpha
 * values directly in the low 7 bits of each byte. The other selectors store a base value in
 * input[0] (extended with high bits taken from input[1]) plus a signed offset in the remaining
 * low bits of input[1]; the second value is base + offset, clamped to the range [0, 0xFFF].
 * Both results are finally shifted left by 4.
 *
 * @param      input     The packed endpoints (packed and modal).
 * @param[out] output0   The unpacked endpoint 0 alpha.
 * @param[out] output1   The unpacked endpoint 1 alpha.
 */
static void hdr_alpha_unpack(
	const uint8_t input[2],
	int& output0,
	int& output1
) {
	int v6 = input[0];
	int v7 = input[1];

	// Bit 7 of each byte contributes one bit of the mode selector
	int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
	v6 &= 0x7F;
	v7 &= 0x7F;

	if (selector == 3)
	{
		// Direct mode: two independent 7-bit values
		output0 = v6 << 5;
		output1 = v7 << 5;
	}
	else
	{
		// Move the high bits of v7 into the top of the base value
		v6 |= (v7 << (selector + 1)) & 0x780;

		// Keep the offset bits of v7 and sign-extend them
		v7 &= (0x3f >> selector);
		v7 ^= 32 >> selector;
		v7 -= 32 >> selector;

		// Scale both values to the common range. The offset in v7 may be
		// negative at this point, and left-shifting a negative signed value is
		// undefined behavior before C++20, so scale by multiplication instead.
		int scale = 1 << (4 - selector);
		v6 *= scale;
		v7 *= scale;
		v7 += v6;

		if (v7 < 0)
		{
			v7 = 0;
		}
		else if (v7 > 0xFFF)
		{
			v7 = 0xFFF;
		}

		output0 = v6;
		output1 = v7;
	}

	// Both values are non-negative here, so these shifts are well defined
	output0 <<= 4;
	output1 <<= 4;
}
|
||||
|
||||
/**
|
||||
* @brief Unpack an HDR RGBA direct encoding.
|
||||
*
|
||||
* @param input The packed endpoints (packed and modal).
|
||||
* @param[out] output0 The unpacked endpoint 0 color.
|
||||
* @param[out] output1 The unpacked endpoint 1 color.
|
||||
*/
|
||||
static void hdr_rgb_hdr_alpha_unpack(
|
||||
const uint8_t input[8],
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
hdr_rgb_unpack(input, output0, output1);
|
||||
|
||||
int alpha0, alpha1;
|
||||
hdr_alpha_unpack(input + 6, alpha0, alpha1);
|
||||
|
||||
output0.set_lane<3>(alpha0);
|
||||
output1.set_lane<3>(alpha1);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void unpack_color_endpoints(
|
||||
astcenc_profile decode_mode,
|
||||
int format,
|
||||
const uint8_t* input,
|
||||
bool& rgb_hdr,
|
||||
bool& alpha_hdr,
|
||||
vint4& output0,
|
||||
vint4& output1
|
||||
) {
|
||||
// Assume no NaNs and LDR endpoints unless set later
|
||||
rgb_hdr = false;
|
||||
alpha_hdr = false;
|
||||
|
||||
bool alpha_hdr_default = false;
|
||||
|
||||
switch (format)
|
||||
{
|
||||
case FMT_LUMINANCE:
|
||||
luminance_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_LUMINANCE_DELTA:
|
||||
luminance_delta_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_HDR_LUMINANCE_SMALL_RANGE:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr_default = true;
|
||||
hdr_luminance_small_range_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_HDR_LUMINANCE_LARGE_RANGE:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr_default = true;
|
||||
hdr_luminance_large_range_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_LUMINANCE_ALPHA:
|
||||
luminance_alpha_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_LUMINANCE_ALPHA_DELTA:
|
||||
luminance_alpha_delta_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_RGB_SCALE:
|
||||
{
|
||||
vint4 input0q(input[0], input[1], input[2], 0);
|
||||
uint8_t scale = input[3];
|
||||
rgb_scale_unpack(input0q, scale, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
case FMT_RGB_SCALE_ALPHA:
|
||||
{
|
||||
vint4 input0q(input[0], input[1], input[2], input[4]);
|
||||
uint8_t alpha1q = input[5];
|
||||
uint8_t scaleq = input[3];
|
||||
rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
case FMT_HDR_RGB_SCALE:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr_default = true;
|
||||
hdr_rgbo_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_RGB:
|
||||
{
|
||||
vint4 input0q(input[0], input[2], input[4], 0);
|
||||
vint4 input1q(input[1], input[3], input[5], 0);
|
||||
rgb_unpack(input0q, input1q, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
case FMT_RGB_DELTA:
|
||||
{
|
||||
vint4 input0q(input[0], input[2], input[4], 0);
|
||||
vint4 input1q(input[1], input[3], input[5], 0);
|
||||
rgb_delta_unpack(input0q, input1q, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
case FMT_HDR_RGB:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr_default = true;
|
||||
hdr_rgb_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_RGBA:
|
||||
{
|
||||
vint4 input0q(input[0], input[2], input[4], input[6]);
|
||||
vint4 input1q(input[1], input[3], input[5], input[7]);
|
||||
rgba_unpack(input0q, input1q, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
case FMT_RGBA_DELTA:
|
||||
{
|
||||
vint4 input0q(input[0], input[2], input[4], input[6]);
|
||||
vint4 input1q(input[1], input[3], input[5], input[7]);
|
||||
rgba_delta_unpack(input0q, input1q, output0, output1);
|
||||
}
|
||||
break;
|
||||
|
||||
case FMT_HDR_RGB_LDR_ALPHA:
|
||||
rgb_hdr = true;
|
||||
hdr_rgb_ldr_alpha_unpack(input, output0, output1);
|
||||
break;
|
||||
|
||||
case FMT_HDR_RGBA:
|
||||
rgb_hdr = true;
|
||||
alpha_hdr = true;
|
||||
hdr_rgb_hdr_alpha_unpack(input, output0, output1);
|
||||
break;
|
||||
}
|
||||
|
||||
// Assign a correct default alpha
|
||||
if (alpha_hdr_default)
|
||||
{
|
||||
if (decode_mode == ASTCENC_PRF_HDR)
|
||||
{
|
||||
output0.set_lane<3>(0x7800);
|
||||
output1.set_lane<3>(0x7800);
|
||||
alpha_hdr = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
output0.set_lane<3>(0x00FF);
|
||||
output1.set_lane<3>(0x00FF);
|
||||
alpha_hdr = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle endpoint errors and expansion
|
||||
|
||||
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
|
||||
if (decode_mode == ASTCENC_PRF_LDR)
|
||||
{
|
||||
// Error color - HDR endpoint in an LDR encoding
|
||||
if (rgb_hdr || alpha_hdr)
|
||||
{
|
||||
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
rgb_hdr = false;
|
||||
alpha_hdr = false;
|
||||
}
|
||||
|
||||
output0 = output0 * 257;
|
||||
output1 = output1 * 257;
|
||||
}
|
||||
// sRGB LDR 8-bit endpoints are expanded to 16 bit by:
|
||||
// - RGB = shift left by 8 bits and OR with 0x80
|
||||
// - A = replication
|
||||
else if (decode_mode == ASTCENC_PRF_LDR_SRGB)
|
||||
{
|
||||
// Error color - HDR endpoint in an LDR encoding
|
||||
if (rgb_hdr || alpha_hdr)
|
||||
{
|
||||
output0 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
output1 = vint4(0xFF, 0x00, 0xFF, 0xFF);
|
||||
rgb_hdr = false;
|
||||
alpha_hdr = false;
|
||||
}
|
||||
|
||||
output0 = lsl<8>(output0) | vint4(0x80);
|
||||
output1 = lsl<8>(output1) | vint4(0x80);
|
||||
}
|
||||
// An HDR profile decode, but may be using linear LDR endpoints
|
||||
// Linear LDR 8-bit endpoints are expanded to 16-bit by replication
|
||||
// HDR endpoints are already 16-bit
|
||||
else
|
||||
{
|
||||
vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr);
|
||||
vint4 output_scale = select(vint4(257), vint4(1), hdr_lanes);
|
||||
output0 = output0 * output_scale;
|
||||
output1 = output1 * output_scale;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,472 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
|
||||
/**
|
||||
* @brief Functions to calculate variance per component in a NxN footprint.
|
||||
*
|
||||
* We need N to be parametric, so the routine below uses summed area tables in order to execute in
|
||||
* O(1) time independent of how big N is.
|
||||
*
|
||||
* The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
|
||||
* perform a binary reduction, and then distributes the results. This method means that there is no
|
||||
* serial dependency between a given element and the next one, and also significantly improves
|
||||
* numerical stability allowing us to use floats rather than doubles.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
/**
|
||||
* @brief Generate a prefix-sum array using the Brent-Kung algorithm.
|
||||
*
|
||||
* This will take an input array of the form:
|
||||
* v0, v1, v2, ...
|
||||
* ... and modify in-place to turn it into a prefix-sum array of the form:
|
||||
* v0, v0+v1, v0+v1+v2, ...
|
||||
*
|
||||
* @param d The array to prefix-sum.
|
||||
* @param items The number of items in the array.
|
||||
* @param stride The item spacing in the array; i.e. dense arrays should use 1.
|
||||
*/
|
||||
static void brent_kung_prefix_sum(
|
||||
vfloat4* d,
|
||||
size_t items,
|
||||
int stride
|
||||
) {
|
||||
if (items < 2)
|
||||
return;
|
||||
|
||||
size_t lc_stride = 2;
|
||||
size_t log2_stride = 1;
|
||||
|
||||
// The reduction-tree loop
|
||||
do {
|
||||
size_t step = lc_stride >> 1;
|
||||
size_t start = lc_stride - 1;
|
||||
size_t iters = items >> log2_stride;
|
||||
|
||||
vfloat4 *da = d + (start * stride);
|
||||
ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
|
||||
size_t ofs_stride = stride << log2_stride;
|
||||
|
||||
while (iters)
|
||||
{
|
||||
*da = *da + da[ofs];
|
||||
da += ofs_stride;
|
||||
iters--;
|
||||
}
|
||||
|
||||
log2_stride += 1;
|
||||
lc_stride <<= 1;
|
||||
} while (lc_stride <= items);
|
||||
|
||||
// The expansion-tree loop
|
||||
do {
|
||||
log2_stride -= 1;
|
||||
lc_stride >>= 1;
|
||||
|
||||
size_t step = lc_stride >> 1;
|
||||
size_t start = step + lc_stride - 1;
|
||||
size_t iters = (items - step) >> log2_stride;
|
||||
|
||||
vfloat4 *da = d + (start * stride);
|
||||
ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
|
||||
size_t ofs_stride = stride << log2_stride;
|
||||
|
||||
while (iters)
|
||||
{
|
||||
*da = *da + da[ofs];
|
||||
da += ofs_stride;
|
||||
iters--;
|
||||
}
|
||||
} while (lc_stride > 2);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_pixel_region_variance(
|
||||
astcenc_contexti& ctx,
|
||||
const pixel_region_args& arg
|
||||
) {
|
||||
// Unpack the memory structure into local variables
|
||||
const astcenc_image* img = arg.img;
|
||||
astcenc_swizzle swz = arg.swz;
|
||||
bool have_z = arg.have_z;
|
||||
|
||||
int size_x = arg.size_x;
|
||||
int size_y = arg.size_y;
|
||||
int size_z = arg.size_z;
|
||||
|
||||
int offset_x = arg.offset_x;
|
||||
int offset_y = arg.offset_y;
|
||||
int offset_z = arg.offset_z;
|
||||
|
||||
int alpha_kernel_radius = arg.alpha_kernel_radius;
|
||||
|
||||
float* input_alpha_averages = ctx.input_alpha_averages;
|
||||
vfloat4* work_memory = arg.work_memory;
|
||||
|
||||
// Compute memory sizes and dimensions that we need
|
||||
int kernel_radius = alpha_kernel_radius;
|
||||
int kerneldim = 2 * kernel_radius + 1;
|
||||
int kernel_radius_xy = kernel_radius;
|
||||
int kernel_radius_z = have_z ? kernel_radius : 0;
|
||||
|
||||
int padsize_x = size_x + kerneldim;
|
||||
int padsize_y = size_y + kerneldim;
|
||||
int padsize_z = size_z + (have_z ? kerneldim : 0);
|
||||
int sizeprod = padsize_x * padsize_y * padsize_z;
|
||||
|
||||
int zd_start = have_z ? 1 : 0;
|
||||
|
||||
vfloat4 *varbuf1 = work_memory;
|
||||
vfloat4 *varbuf2 = work_memory + sizeprod;
|
||||
|
||||
// Scaling factors to apply to Y and Z for accesses into the work buffers
|
||||
int yst = padsize_x;
|
||||
int zst = padsize_x * padsize_y;
|
||||
|
||||
// Scaling factors to apply to Y and Z for accesses into result buffers
|
||||
int ydt = img->dim_x;
|
||||
int zdt = img->dim_x * img->dim_y;
|
||||
|
||||
// Macros to act as accessor functions for the work-memory
|
||||
#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
|
||||
#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
|
||||
|
||||
// Load N and N^2 values into the work buffers
|
||||
if (img->data_type == ASTCENC_TYPE_U8)
|
||||
{
|
||||
// Swizzle data structure 4 = ZERO, 5 = ONE
|
||||
uint8_t data[6];
|
||||
data[ASTCENC_SWZ_0] = 0;
|
||||
data[ASTCENC_SWZ_1] = 255;
|
||||
|
||||
for (int z = zd_start; z < padsize_z; z++)
|
||||
{
|
||||
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
|
||||
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
|
||||
uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
|
||||
|
||||
for (int y = 1; y < padsize_y; y++)
|
||||
{
|
||||
int y_src = (y - 1) + offset_y - kernel_radius_xy;
|
||||
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
|
||||
|
||||
for (int x = 1; x < padsize_x; x++)
|
||||
{
|
||||
int x_src = (x - 1) + offset_x - kernel_radius_xy;
|
||||
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
|
||||
|
||||
data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )];
|
||||
data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
|
||||
data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
|
||||
data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
|
||||
|
||||
uint8_t r = data[swz.r];
|
||||
uint8_t g = data[swz.g];
|
||||
uint8_t b = data[swz.b];
|
||||
uint8_t a = data[swz.a];
|
||||
|
||||
vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
|
||||
g * (1.0f / 255.0f),
|
||||
b * (1.0f / 255.0f),
|
||||
a * (1.0f / 255.0f));
|
||||
|
||||
VARBUF1(z, y, x) = d;
|
||||
VARBUF2(z, y, x) = d * d;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (img->data_type == ASTCENC_TYPE_F16)
|
||||
{
|
||||
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
|
||||
uint16_t data[6];
|
||||
data[ASTCENC_SWZ_0] = 0;
|
||||
data[ASTCENC_SWZ_1] = 0x3C00;
|
||||
|
||||
for (int z = zd_start; z < padsize_z; z++)
|
||||
{
|
||||
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
|
||||
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
|
||||
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
|
||||
|
||||
for (int y = 1; y < padsize_y; y++)
|
||||
{
|
||||
int y_src = (y - 1) + offset_y - kernel_radius_xy;
|
||||
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
|
||||
|
||||
for (int x = 1; x < padsize_x; x++)
|
||||
{
|
||||
int x_src = (x - 1) + offset_x - kernel_radius_xy;
|
||||
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
|
||||
|
||||
data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )];
|
||||
data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
|
||||
data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
|
||||
data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
|
||||
|
||||
vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
|
||||
vfloat4 d = float16_to_float(di);
|
||||
|
||||
VARBUF1(z, y, x) = d;
|
||||
VARBUF2(z, y, x) = d * d;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if (img->data_type == ASTCENC_TYPE_F32)
|
||||
{
|
||||
assert(img->data_type == ASTCENC_TYPE_F32);
|
||||
|
||||
// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
|
||||
float data[6];
|
||||
data[ASTCENC_SWZ_0] = 0.0f;
|
||||
data[ASTCENC_SWZ_1] = 1.0f;
|
||||
|
||||
for (int z = zd_start; z < padsize_z; z++)
|
||||
{
|
||||
int z_src = (z - zd_start) + offset_z - kernel_radius_z;
|
||||
z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
|
||||
float* data32 = static_cast<float*>(img->data[z_src]);
|
||||
|
||||
for (int y = 1; y < padsize_y; y++)
|
||||
{
|
||||
int y_src = (y - 1) + offset_y - kernel_radius_xy;
|
||||
y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
|
||||
|
||||
for (int x = 1; x < padsize_x; x++)
|
||||
{
|
||||
int x_src = (x - 1) + offset_x - kernel_radius_xy;
|
||||
x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
|
||||
|
||||
data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )];
|
||||
data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
|
||||
data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
|
||||
data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
|
||||
|
||||
float r = data[swz.r];
|
||||
float g = data[swz.g];
|
||||
float b = data[swz.b];
|
||||
float a = data[swz.a];
|
||||
|
||||
vfloat4 d(r, g, b, a);
|
||||
|
||||
VARBUF1(z, y, x) = d;
|
||||
VARBUF2(z, y, x) = d * d;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pad with an extra layer of 0s; this forms the edge of the SAT tables
|
||||
vfloat4 vbz = vfloat4::zero();
|
||||
for (int z = 0; z < padsize_z; z++)
|
||||
{
|
||||
for (int y = 0; y < padsize_y; y++)
|
||||
{
|
||||
VARBUF1(z, y, 0) = vbz;
|
||||
VARBUF2(z, y, 0) = vbz;
|
||||
}
|
||||
|
||||
for (int x = 0; x < padsize_x; x++)
|
||||
{
|
||||
VARBUF1(z, 0, x) = vbz;
|
||||
VARBUF2(z, 0, x) = vbz;
|
||||
}
|
||||
}
|
||||
|
||||
if (have_z)
|
||||
{
|
||||
for (int y = 0; y < padsize_y; y++)
|
||||
{
|
||||
for (int x = 0; x < padsize_x; x++)
|
||||
{
|
||||
VARBUF1(0, y, x) = vbz;
|
||||
VARBUF2(0, y, x) = vbz;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Generate summed-area tables for N and N^2; this is done in-place, using
|
||||
// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
|
||||
for (int z = zd_start; z < padsize_z; z++)
|
||||
{
|
||||
for (int y = 1; y < padsize_y; y++)
|
||||
{
|
||||
brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
|
||||
brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
|
||||
}
|
||||
}
|
||||
|
||||
for (int z = zd_start; z < padsize_z; z++)
|
||||
{
|
||||
for (int x = 1; x < padsize_x; x++)
|
||||
{
|
||||
brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
|
||||
brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
|
||||
}
|
||||
}
|
||||
|
||||
if (have_z)
|
||||
{
|
||||
for (int y = 1; y < padsize_y; y++)
|
||||
{
|
||||
for (int x = 1; x < padsize_x; x++)
|
||||
{
|
||||
brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
|
||||
brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute a few constants used in the variance-calculation.
|
||||
float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
|
||||
float alpha_rsamples;
|
||||
|
||||
if (have_z)
|
||||
{
|
||||
alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
|
||||
}
|
||||
else
|
||||
{
|
||||
alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
|
||||
}
|
||||
|
||||
// Use the summed-area tables to compute variance for each neighborhood
|
||||
if (have_z)
|
||||
{
|
||||
for (int z = 0; z < size_z; z++)
|
||||
{
|
||||
int z_src = z + kernel_radius_z;
|
||||
int z_dst = z + offset_z;
|
||||
int z_low = z_src - alpha_kernel_radius;
|
||||
int z_high = z_src + alpha_kernel_radius + 1;
|
||||
|
||||
for (int y = 0; y < size_y; y++)
|
||||
{
|
||||
int y_src = y + kernel_radius_xy;
|
||||
int y_dst = y + offset_y;
|
||||
int y_low = y_src - alpha_kernel_radius;
|
||||
int y_high = y_src + alpha_kernel_radius + 1;
|
||||
|
||||
for (int x = 0; x < size_x; x++)
|
||||
{
|
||||
int x_src = x + kernel_radius_xy;
|
||||
int x_dst = x + offset_x;
|
||||
int x_low = x_src - alpha_kernel_radius;
|
||||
int x_high = x_src + alpha_kernel_radius + 1;
|
||||
|
||||
// Summed-area table lookups for alpha average
|
||||
float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
|
||||
- VARBUF1(z_high, y_low, x_high).lane<3>()
|
||||
- VARBUF1(z_high, y_high, x_low).lane<3>()
|
||||
+ VARBUF1(z_high, y_high, x_high).lane<3>()) -
|
||||
( VARBUF1(z_low, y_low, x_low).lane<3>()
|
||||
- VARBUF1(z_low, y_low, x_high).lane<3>()
|
||||
- VARBUF1(z_low, y_high, x_low).lane<3>()
|
||||
+ VARBUF1(z_low, y_high, x_high).lane<3>());
|
||||
|
||||
int out_index = z_dst * zdt + y_dst * ydt + x_dst;
|
||||
input_alpha_averages[out_index] = (vasum * alpha_rsamples);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int y = 0; y < size_y; y++)
|
||||
{
|
||||
int y_src = y + kernel_radius_xy;
|
||||
int y_dst = y + offset_y;
|
||||
int y_low = y_src - alpha_kernel_radius;
|
||||
int y_high = y_src + alpha_kernel_radius + 1;
|
||||
|
||||
for (int x = 0; x < size_x; x++)
|
||||
{
|
||||
int x_src = x + kernel_radius_xy;
|
||||
int x_dst = x + offset_x;
|
||||
int x_low = x_src - alpha_kernel_radius;
|
||||
int x_high = x_src + alpha_kernel_radius + 1;
|
||||
|
||||
// Summed-area table lookups for alpha average
|
||||
float vasum = VARBUF1(0, y_low, x_low).lane<3>()
|
||||
- VARBUF1(0, y_low, x_high).lane<3>()
|
||||
- VARBUF1(0, y_high, x_low).lane<3>()
|
||||
+ VARBUF1(0, y_high, x_high).lane<3>();
|
||||
|
||||
int out_index = y_dst * ydt + x_dst;
|
||||
input_alpha_averages[out_index] = (vasum * alpha_rsamples);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
unsigned int init_compute_averages(
|
||||
const astcenc_image& img,
|
||||
unsigned int alpha_kernel_radius,
|
||||
const astcenc_swizzle& swz,
|
||||
avg_args& ag
|
||||
) {
|
||||
unsigned int size_x = img.dim_x;
|
||||
unsigned int size_y = img.dim_y;
|
||||
unsigned int size_z = img.dim_z;
|
||||
|
||||
// Compute maximum block size and from that the working memory buffer size
|
||||
unsigned int kernel_radius = alpha_kernel_radius;
|
||||
unsigned int kerneldim = 2 * kernel_radius + 1;
|
||||
|
||||
bool have_z = (size_z > 1);
|
||||
unsigned int max_blk_size_xy = have_z ? 16 : 32;
|
||||
unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);
|
||||
|
||||
unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
|
||||
unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);
|
||||
|
||||
// Perform block-wise averages calculations across the image
|
||||
// Initialize fields which are not populated until later
|
||||
ag.arg.size_x = 0;
|
||||
ag.arg.size_y = 0;
|
||||
ag.arg.size_z = 0;
|
||||
ag.arg.offset_x = 0;
|
||||
ag.arg.offset_y = 0;
|
||||
ag.arg.offset_z = 0;
|
||||
ag.arg.work_memory = nullptr;
|
||||
|
||||
ag.arg.img = &img;
|
||||
ag.arg.swz = swz;
|
||||
ag.arg.have_z = have_z;
|
||||
ag.arg.alpha_kernel_radius = alpha_kernel_radius;
|
||||
|
||||
ag.img_size_x = size_x;
|
||||
ag.img_size_y = size_y;
|
||||
ag.img_size_z = size_z;
|
||||
ag.blk_size_xy = max_blk_size_xy;
|
||||
ag.blk_size_z = max_blk_size_z;
|
||||
ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
|
||||
|
||||
// The parallel task count
|
||||
unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
|
||||
unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
|
||||
return z_tasks * y_tasks;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,622 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions to decompress a symbolic block.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
/**
|
||||
* @brief Compute the integer linear interpolation of two color endpoints.
|
||||
*
|
||||
* @param u8_mask The mask for lanes using decode_unorm8 rather than decode_f16.
|
||||
* @param color0 The endpoint0 color.
|
||||
* @param color1 The endpoint1 color.
|
||||
* @param weights The interpolation weight (between 0 and 64).
|
||||
*
|
||||
* @return The interpolated color.
|
||||
*/
|
||||
static vint4 lerp_color_int(
|
||||
vmask4 u8_mask,
|
||||
vint4 color0,
|
||||
vint4 color1,
|
||||
vint4 weights
|
||||
) {
|
||||
vint4 weight1 = weights;
|
||||
vint4 weight0 = vint4(64) - weight1;
|
||||
|
||||
vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
|
||||
color = asr<6>(color);
|
||||
|
||||
// For decode_unorm8 values force the codec to bit replicate. This allows the
|
||||
// rest of the codec to assume the full 0xFFFF range for everything and ignore
|
||||
// the decode_mode setting
|
||||
vint4 color_u8 = asr<8>(color) * vint4(257);
|
||||
color = select(color, color_u8, u8_mask);
|
||||
|
||||
return color;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Convert integer color value into a float value for the decoder.
|
||||
*
|
||||
* @param data The integer color value post-interpolation.
|
||||
* @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16).
|
||||
*
|
||||
* @return The float color value.
|
||||
*/
|
||||
static inline vfloat4 decode_texel(
|
||||
vint4 data,
|
||||
vmask4 lns_mask
|
||||
) {
|
||||
vint4 color_lns = vint4::zero();
|
||||
vint4 color_unorm = vint4::zero();
|
||||
|
||||
if (any(lns_mask))
|
||||
{
|
||||
color_lns = lns_to_sf16(data);
|
||||
}
|
||||
|
||||
if (!all(lns_mask))
|
||||
{
|
||||
color_unorm = unorm16_to_sf16(data);
|
||||
}
|
||||
|
||||
// Pick components and then convert to FP16
|
||||
vint4 datai = select(color_unorm, color_lns, lns_mask);
|
||||
return float16_to_float(datai);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void unpack_weights(
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
const decimation_info& di,
|
||||
bool is_dual_plane,
|
||||
int weights_plane1[BLOCK_MAX_TEXELS],
|
||||
int weights_plane2[BLOCK_MAX_TEXELS]
|
||||
) {
|
||||
// Safe to overshoot as all arrays are allocated to full size
|
||||
if (!is_dual_plane)
|
||||
{
|
||||
// Build full 64-entry weight lookup table
|
||||
vtable_64x8 table;
|
||||
vtable_prepare(table, scb.weights);
|
||||
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint summed_value(8);
|
||||
vint weight_count(di.texel_weight_count + i);
|
||||
int max_weight_count = hmax_s(weight_count);
|
||||
|
||||
promise(max_weight_count > 0);
|
||||
for (int j = 0; j < max_weight_count; j++)
|
||||
{
|
||||
vint texel_weights(di.texel_weights_tr[j] + i);
|
||||
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
|
||||
|
||||
summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
|
||||
}
|
||||
|
||||
store(lsr<4>(summed_value), weights_plane1 + i);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Build a 32-entry weight lookup table per plane
|
||||
// Plane 1
|
||||
vtable_32x8 tab_plane1;
|
||||
vtable_prepare(tab_plane1, scb.weights);
|
||||
|
||||
// Plane 2
|
||||
vtable_32x8 tab_plane2;
|
||||
vtable_prepare(tab_plane2, scb.weights + 32);
|
||||
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vint sum_plane1(8);
|
||||
vint sum_plane2(8);
|
||||
|
||||
vint weight_count(di.texel_weight_count + i);
|
||||
int max_weight_count = hmax_s(weight_count);
|
||||
|
||||
promise(max_weight_count > 0);
|
||||
for (int j = 0; j < max_weight_count; j++)
|
||||
{
|
||||
vint texel_weights(di.texel_weights_tr[j] + i);
|
||||
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
|
||||
|
||||
sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
|
||||
sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
|
||||
}
|
||||
|
||||
store(lsr<4>(sum_plane1), weights_plane1 + i);
|
||||
store(lsr<4>(sum_plane2), weights_plane2 + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return an FP32 NaN value for use in error colors.
|
||||
*
|
||||
* This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
|
||||
*
|
||||
* @return The float color value.
|
||||
*/
|
||||
static float error_color_nan()
|
||||
{
|
||||
if32 v;
|
||||
v.u = 0xFFFFE000U;
|
||||
return v.f;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void decompress_symbolic_block(
|
||||
astcenc_profile decode_mode,
|
||||
const block_size_descriptor& bsd,
|
||||
int xpos,
|
||||
int ypos,
|
||||
int zpos,
|
||||
const symbolic_compressed_block& scb,
|
||||
image_block& blk
|
||||
) {
|
||||
blk.xpos = xpos;
|
||||
blk.ypos = ypos;
|
||||
blk.zpos = zpos;
|
||||
|
||||
blk.data_min = vfloat4::zero();
|
||||
blk.data_mean = vfloat4::zero();
|
||||
blk.data_max = vfloat4::zero();
|
||||
blk.grayscale = false;
|
||||
|
||||
// If we detected an error-block, blow up immediately.
|
||||
if (scb.block_type == SYM_BTYPE_ERROR)
|
||||
{
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i++)
|
||||
{
|
||||
blk.data_r[i] = error_color_nan();
|
||||
blk.data_g[i] = error_color_nan();
|
||||
blk.data_b[i] = error_color_nan();
|
||||
blk.data_a[i] = error_color_nan();
|
||||
blk.rgb_lns[i] = 0;
|
||||
blk.alpha_lns[i] = 0;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
|
||||
(scb.block_type == SYM_BTYPE_CONST_U16))
|
||||
{
|
||||
vfloat4 color;
|
||||
uint8_t use_lns = 0;
|
||||
|
||||
// UNORM16 constant color block
|
||||
if (scb.block_type == SYM_BTYPE_CONST_U16)
|
||||
{
|
||||
vint4 colori(scb.constant_color);
|
||||
|
||||
// Determine the UNORM8 rounding on the decode
|
||||
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
|
||||
|
||||
// The real decoder would just use the top 8 bits, but we rescale
|
||||
// in to a 16-bit value that rounds correctly.
|
||||
vint4 colori_u8 = asr<8>(colori) * 257;
|
||||
colori = select(colori, colori_u8, u8_mask);
|
||||
|
||||
vint4 colorf16 = unorm16_to_sf16(colori);
|
||||
color = float16_to_float(colorf16);
|
||||
}
|
||||
// FLOAT16 constant color block
|
||||
else
|
||||
{
|
||||
switch (decode_mode)
|
||||
{
|
||||
case ASTCENC_PRF_LDR_SRGB:
|
||||
case ASTCENC_PRF_LDR:
|
||||
color = vfloat4(error_color_nan());
|
||||
break;
|
||||
case ASTCENC_PRF_HDR_RGB_LDR_A:
|
||||
case ASTCENC_PRF_HDR:
|
||||
// Constant-color block; unpack from FP16 to FP32.
|
||||
color = float16_to_float(vint4(scb.constant_color));
|
||||
use_lns = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < bsd.texel_count; i++)
|
||||
{
|
||||
blk.data_r[i] = color.lane<0>();
|
||||
blk.data_g[i] = color.lane<1>();
|
||||
blk.data_b[i] = color.lane<2>();
|
||||
blk.data_a[i] = color.lane<3>();
|
||||
blk.rgb_lns[i] = use_lns;
|
||||
blk.alpha_lns[i] = use_lns;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Get the appropriate partition-table entry
|
||||
int partition_count = scb.partition_count;
|
||||
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
|
||||
|
||||
// Get the appropriate block descriptors
|
||||
const auto& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
bool is_dual_plane = static_cast<bool>(bm.is_dual_plane);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
int plane2_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights);
|
||||
|
||||
// Now that we have endpoint colors and weights, we can unpack texel colors
|
||||
int plane2_component = scb.plane2_component;
|
||||
vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(decode_mode, blk);
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
// Decode the color endpoints for this partition
|
||||
vint4 ep0;
|
||||
vint4 ep1;
|
||||
bool rgb_lns;
|
||||
bool a_lns;
|
||||
|
||||
unpack_color_endpoints(decode_mode,
|
||||
scb.color_formats[i],
|
||||
scb.color_values[i],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
|
||||
|
||||
int texel_count = pi.partition_texel_count[i];
|
||||
for (int j = 0; j < texel_count; j++)
|
||||
{
|
||||
int tix = pi.texels_of_partition[i][j];
|
||||
vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
|
||||
vint4 color = lerp_color_int(u8_mask, ep0, ep1, weight);
|
||||
vfloat4 colorf = decode_texel(color, lns_mask);
|
||||
|
||||
blk.data_r[tix] = colorf.lane<0>();
|
||||
blk.data_g[tix] = colorf.lane<1>();
|
||||
blk.data_b[tix] = colorf.lane<2>();
|
||||
blk.data_a[tix] = colorf.lane<3>();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
|
||||
/* See header for documentation. */
|
||||
float compute_symbolic_block_difference_2plane(
|
||||
const astcenc_config& config,
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
const image_block& blk
|
||||
) {
|
||||
// If we detected an error-block, blow up immediately.
|
||||
if (scb.block_type == SYM_BTYPE_ERROR)
|
||||
{
|
||||
return ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
assert(scb.block_mode >= 0);
|
||||
assert(scb.partition_count == 1);
|
||||
assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
|
||||
|
||||
// Get the appropriate block descriptor
|
||||
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
int plane2_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights);
|
||||
|
||||
vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
|
||||
|
||||
vfloat4 summa = vfloat4::zero();
|
||||
|
||||
// Decode the color endpoints for this partition
|
||||
vint4 ep0;
|
||||
vint4 ep1;
|
||||
bool rgb_lns;
|
||||
bool a_lns;
|
||||
|
||||
unpack_color_endpoints(config.profile,
|
||||
scb.color_formats[0],
|
||||
scb.color_values[0],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
unsigned int texel_count = bsd.texel_count;
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
|
||||
vint4 colori = lerp_color_int(u8_mask, ep0, ep1, weight);
|
||||
|
||||
vfloat4 color = int_to_float(colori);
|
||||
vfloat4 oldColor = blk.texel(i);
|
||||
|
||||
// Compare error using a perceptual decode metric for RGBM textures
|
||||
if (config.flags & ASTCENC_FLG_MAP_RGBM)
|
||||
{
|
||||
// Fail encodings that result in zero weight M pixels. Note that this can cause
|
||||
// "interesting" artifacts if we reject all useful encodings - we typically get max
|
||||
// brightness encodings instead which look just as bad. We recommend users apply a
|
||||
// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
|
||||
// getting small M values post-quantization, but we can't prove it would never
|
||||
// happen, especially at low bit rates ...
|
||||
if (color.lane<3>() == 0.0f)
|
||||
{
|
||||
return -ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
// Compute error based on decoded RGBM color
|
||||
color = vfloat4(
|
||||
color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
1.0f
|
||||
);
|
||||
|
||||
oldColor = vfloat4(
|
||||
oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
1.0f
|
||||
);
|
||||
}
|
||||
|
||||
vfloat4 error = oldColor - color;
|
||||
error = min(abs(error), 1e15f);
|
||||
error = error * error;
|
||||
|
||||
summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
|
||||
}
|
||||
|
||||
return summa.lane<0>();
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
float compute_symbolic_block_difference_1plane(
|
||||
const astcenc_config& config,
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
const image_block& blk
|
||||
) {
|
||||
assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
|
||||
|
||||
// If we detected an error-block, blow up immediately.
|
||||
if (scb.block_type == SYM_BTYPE_ERROR)
|
||||
{
|
||||
return ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
assert(scb.block_mode >= 0);
|
||||
|
||||
// Get the appropriate partition-table entry
|
||||
unsigned int partition_count = scb.partition_count;
|
||||
const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
|
||||
|
||||
// Get the appropriate block descriptor
|
||||
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
vfloat4 summa = vfloat4::zero();
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
// Decode the color endpoints for this partition
|
||||
vint4 ep0;
|
||||
vint4 ep1;
|
||||
bool rgb_lns;
|
||||
bool a_lns;
|
||||
|
||||
unpack_color_endpoints(config.profile,
|
||||
scb.color_formats[i],
|
||||
scb.color_values[i],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
unsigned int texel_count = pi.partition_texel_count[i];
|
||||
for (unsigned int j = 0; j < texel_count; j++)
|
||||
{
|
||||
unsigned int tix = pi.texels_of_partition[i][j];
|
||||
vint4 colori = lerp_color_int(u8_mask, ep0, ep1,
|
||||
vint4(plane1_weights[tix]));
|
||||
|
||||
vfloat4 color = int_to_float(colori);
|
||||
vfloat4 oldColor = blk.texel(tix);
|
||||
|
||||
// Compare error using a perceptual decode metric for RGBM textures
|
||||
if (config.flags & ASTCENC_FLG_MAP_RGBM)
|
||||
{
|
||||
// Fail encodings that result in zero weight M pixels. Note that this can cause
|
||||
// "interesting" artifacts if we reject all useful encodings - we typically get max
|
||||
// brightness encodings instead which look just as bad. We recommend users apply a
|
||||
// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
|
||||
// getting small M values post-quantization, but we can't prove it would never
|
||||
// happen, especially at low bit rates ...
|
||||
if (color.lane<3>() == 0.0f)
|
||||
{
|
||||
return -ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
// Compute error based on decoded RGBM color
|
||||
color = vfloat4(
|
||||
color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
|
||||
1.0f
|
||||
);
|
||||
|
||||
oldColor = vfloat4(
|
||||
oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
|
||||
1.0f
|
||||
);
|
||||
}
|
||||
|
||||
vfloat4 error = oldColor - color;
|
||||
error = min(abs(error), 1e15f);
|
||||
error = error * error;
|
||||
|
||||
summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
|
||||
}
|
||||
}
|
||||
|
||||
return summa.lane<0>();
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
float compute_symbolic_block_difference_1plane_1partition(
|
||||
const astcenc_config& config,
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
const image_block& blk
|
||||
) {
|
||||
// If we detected an error-block, blow up immediately.
|
||||
if (scb.block_type == SYM_BTYPE_ERROR)
|
||||
{
|
||||
return ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
assert(scb.block_mode >= 0);
|
||||
assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
|
||||
|
||||
// Get the appropriate block descriptor
|
||||
const block_mode& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
// Unquantize and undecimate the weights
|
||||
ASTCENC_ALIGNAS int plane1_weights[BLOCK_MAX_TEXELS];
|
||||
unpack_weights(bsd, scb, di, false, plane1_weights, nullptr);
|
||||
|
||||
// Decode the color endpoints for this partition
|
||||
vint4 ep0;
|
||||
vint4 ep1;
|
||||
bool rgb_lns;
|
||||
bool a_lns;
|
||||
|
||||
unpack_color_endpoints(config.profile,
|
||||
scb.color_formats[0],
|
||||
scb.color_values[0],
|
||||
rgb_lns, a_lns,
|
||||
ep0, ep1);
|
||||
|
||||
vmask4 u8_mask = get_u8_component_mask(config.profile, blk);
|
||||
|
||||
// Unpack and compute error for each texel in the partition
|
||||
vfloatacc summav = vfloatacc::zero();
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
|
||||
unsigned int texel_count = bsd.texel_count;
|
||||
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
// Compute EP1 contribution
|
||||
vint weight1 = vint::loada(plane1_weights + i);
|
||||
vint ep1_r = vint(ep1.lane<0>()) * weight1;
|
||||
vint ep1_g = vint(ep1.lane<1>()) * weight1;
|
||||
vint ep1_b = vint(ep1.lane<2>()) * weight1;
|
||||
vint ep1_a = vint(ep1.lane<3>()) * weight1;
|
||||
|
||||
// Compute EP0 contribution
|
||||
vint weight0 = vint(64) - weight1;
|
||||
vint ep0_r = vint(ep0.lane<0>()) * weight0;
|
||||
vint ep0_g = vint(ep0.lane<1>()) * weight0;
|
||||
vint ep0_b = vint(ep0.lane<2>()) * weight0;
|
||||
vint ep0_a = vint(ep0.lane<3>()) * weight0;
|
||||
|
||||
// Combine contributions
|
||||
vint colori_r = asr<6>(ep0_r + ep1_r + vint(32));
|
||||
vint colori_g = asr<6>(ep0_g + ep1_g + vint(32));
|
||||
vint colori_b = asr<6>(ep0_b + ep1_b + vint(32));
|
||||
vint colori_a = asr<6>(ep0_a + ep1_a + vint(32));
|
||||
|
||||
// If using a U8 decode mode bit replicate top 8 bits
|
||||
// so rest of codec can assume 0xFFFF max range everywhere
|
||||
vint colori_r8 = asr<8>(colori_r) * vint(257);
|
||||
colori_r = select(colori_r, colori_r8, vmask(u8_mask.lane<0>()));
|
||||
|
||||
vint colori_g8 = asr<8>(colori_g) * vint(257);
|
||||
colori_g = select(colori_g, colori_g8, vmask(u8_mask.lane<1>()));
|
||||
|
||||
vint colori_b8 = asr<8>(colori_b) * vint(257);
|
||||
colori_b = select(colori_b, colori_b8, vmask(u8_mask.lane<2>()));
|
||||
|
||||
vint colori_a8 = asr<8>(colori_a) * vint(257);
|
||||
colori_a = select(colori_a, colori_a8, vmask(u8_mask.lane<3>()));
|
||||
|
||||
// Compute color diff
|
||||
vfloat color_r = int_to_float(colori_r);
|
||||
vfloat color_g = int_to_float(colori_g);
|
||||
vfloat color_b = int_to_float(colori_b);
|
||||
vfloat color_a = int_to_float(colori_a);
|
||||
|
||||
vfloat color_orig_r = loada(blk.data_r + i);
|
||||
vfloat color_orig_g = loada(blk.data_g + i);
|
||||
vfloat color_orig_b = loada(blk.data_b + i);
|
||||
vfloat color_orig_a = loada(blk.data_a + i);
|
||||
|
||||
vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
|
||||
vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
|
||||
vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
|
||||
vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
|
||||
|
||||
// Compute squared error metric
|
||||
color_error_r = color_error_r * color_error_r;
|
||||
color_error_g = color_error_g * color_error_g;
|
||||
color_error_b = color_error_b * color_error_b;
|
||||
color_error_a = color_error_a * color_error_a;
|
||||
|
||||
vfloat metric = color_error_r * blk.channel_weight.lane<0>()
|
||||
+ color_error_g * blk.channel_weight.lane<1>()
|
||||
+ color_error_b * blk.channel_weight.lane<2>()
|
||||
+ color_error_a * blk.channel_weight.lane<3>();
|
||||
|
||||
// Mask off bad lanes
|
||||
vmask mask = lane_id < vint(texel_count);
|
||||
lane_id += vint(ASTCENC_SIMD_WIDTH);
|
||||
haccumulate(summav, metric, mask);
|
||||
}
|
||||
|
||||
return hadd_s(summav);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,245 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2021-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions for the diagnostic trace logger.
|
||||
*/
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdarg>
|
||||
#include <cstdio>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
|
||||
#include "astcenc_diagnostic_trace.h"
|
||||
|
||||
/** @brief The global trace logger. */
|
||||
static TraceLog* g_TraceLog = nullptr;
|
||||
|
||||
/** @brief The JSON indentation level. */
|
||||
static const size_t g_trace_indent = 2;
|
||||
|
||||
TraceLog::TraceLog(
	const char* file_name
) :
	m_file(file_name, std::ofstream::out | std::ofstream::binary)
{
	// Only one trace log may be registered at a time
	assert(g_TraceLog == nullptr);

	// Register before creating the root node; the node constructor writes through g_TraceLog
	g_TraceLog = this;
	m_root = new TraceNode("root");
}
|
||||
|
||||
/* See header for documentation. */
|
||||
TraceNode* TraceLog::get_current_leaf()
{
	// The newest node sits at the back of the stack; an empty stack has no current leaf
	return m_stack.empty() ? nullptr : m_stack.back();
}
|
||||
|
||||
/* See header for documentation. */
|
||||
size_t TraceLog::get_depth()
|
||||
{
|
||||
return m_stack.size();
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
TraceLog::~TraceLog()
{
	assert(this == g_TraceLog);

	// Destroy the root while the log is still registered; the node destructor writes its
	// closing brackets through g_TraceLog
	delete m_root;

	// Unregister so that a new trace log can be created later
	g_TraceLog = nullptr;
}
|
||||
|
||||
/* See header for documentation. */
|
||||
TraceNode::TraceNode(
	const char* format,
	...
) {
	// Expand the printf-style template into the node name
	char name[256];

	va_list args;
	va_start(args, format);
	vsnprintf(name, sizeof(name), format, args);
	va_end(args);

	// Guarantee there is a nul terminator
	name[sizeof(name) - 1] = 0;

	// Capture the parent and nesting depth before this node becomes the current leaf
	TraceNode* parent = g_TraceLog->get_current_leaf();
	const size_t depth = g_TraceLog->get_depth();
	g_TraceLog->m_stack.push_back(this);

	auto& out = g_TraceLog->m_file;

	if (parent)
	{
		// A separator is needed if the parent already holds an attribute or child node
		const bool needs_comma = parent->m_attrib_count != 0;
		parent->m_attrib_count++;

		if (needs_comma)
		{
			out << ',';
		}
	}

	// Every node except the outermost one starts on a new line
	if (depth)
	{
		out << '\n';
	}

	// The node header sits at an even indent level, its content list one level deeper
	const std::string header_pad((depth * 2) * g_trace_indent, ' ');
	const std::string content_pad((depth * 2 + 1) * g_trace_indent, ' ');

	out << header_pad << "[ \"node\", \"" << name << "\",\n";
	out << content_pad << "[";
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void TraceNode::add_attrib(
|
||||
std::string type,
|
||||
std::string key,
|
||||
std::string value
|
||||
) {
|
||||
(void)type;
|
||||
|
||||
size_t depth = g_TraceLog->get_depth();
|
||||
size_t indent = (depth * 2) * g_trace_indent;
|
||||
auto& out = g_TraceLog->m_file;
|
||||
bool comma = m_attrib_count;
|
||||
m_attrib_count++;
|
||||
|
||||
if (comma)
|
||||
{
|
||||
out << ',';
|
||||
}
|
||||
|
||||
out << '\n';
|
||||
out << std::string(indent, ' ') << "[ "
|
||||
<< "\"" << key << "\", "
|
||||
<< value << " ]";
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
TraceNode::~TraceNode()
{
	// Pop first so that the depth below is the depth this node was opened at
	g_TraceLog->m_stack.pop_back();

	const size_t depth = g_TraceLog->get_depth();
	auto& out = g_TraceLog->m_file;

	// Same indentation as used when the node header and content list were opened
	const std::string header_pad((depth * 2) * g_trace_indent, ' ');
	const std::string content_pad((depth * 2 + 1) * g_trace_indent, ' ');

	// A non-empty content list closes on its own indented line
	if (m_attrib_count)
	{
		out << "\n" << content_pad;
	}

	// Close the content list, then the node itself
	out << "]\n";
	out << header_pad << "]";
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void trace_add_data(
|
||||
const char* key,
|
||||
const char* format,
|
||||
...
|
||||
) {
|
||||
constexpr size_t bufsz = 256;
|
||||
char buffer[bufsz];
|
||||
|
||||
va_list args;
|
||||
va_start (args, format);
|
||||
vsnprintf (buffer, bufsz, format, args);
|
||||
va_end (args);
|
||||
|
||||
// Guarantee there is a nul terminator
|
||||
buffer[bufsz - 1] = 0;
|
||||
|
||||
std::string value = "\"" + std::string(buffer) + "\"";
|
||||
|
||||
TraceNode* node = g_TraceLog->get_current_leaf();
|
||||
node->add_attrib("str", key, value);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void trace_add_data(
|
||||
const char* key,
|
||||
float value
|
||||
) {
|
||||
// Turn infinities into parseable values
|
||||
if (std::isinf(value))
|
||||
{
|
||||
if (value > 0.0f)
|
||||
{
|
||||
value = std::numeric_limits<float>::max();
|
||||
}
|
||||
else
|
||||
{
|
||||
value = -std::numeric_limits<float>::max();
|
||||
}
|
||||
}
|
||||
|
||||
char buffer[256];
|
||||
sprintf(buffer, "%.20g", (double)value);
|
||||
TraceNode* node = g_TraceLog->get_current_leaf();
|
||||
node->add_attrib("float", key, buffer);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void trace_add_data(
|
||||
const char* key,
|
||||
int value
|
||||
) {
|
||||
TraceNode* node = g_TraceLog->get_current_leaf();
|
||||
node->add_attrib("int", key, std::to_string(value));
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void trace_add_data(
|
||||
const char* key,
|
||||
unsigned int value
|
||||
) {
|
||||
TraceNode* node = g_TraceLog->get_current_leaf();
|
||||
node->add_attrib("int", key, std::to_string(value));
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,219 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2021-2022 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief This module provides a set of diagnostic tracing utilities.
|
||||
*
|
||||
* Overview
|
||||
* ========
|
||||
*
|
||||
* The built-in diagnostic trace tool generates a hierarchical JSON tree structure. The tree
|
||||
* hierarchy contains three levels:
|
||||
*
|
||||
* - block
|
||||
* - pass
|
||||
* - candidate
|
||||
*
|
||||
* One block node exists for each compressed block in the image. One pass node exists for each major
|
||||
* pass (N partition, M planes, O components) applied to a block. One candidate node exists for each
|
||||
* encoding candidate trialed for a pass.
|
||||
*
|
||||
* Each node contains both its child hierarchy and a number of attributes which explain the behavior.
|
||||
* For example, the block node contains the block coordinates in the image, the pass explains the
|
||||
* pass configuration, and the candidate will explain the candidate encoding such as weight
|
||||
* decimation, refinement error, etc.
|
||||
*
|
||||
* Trace Nodes are designed as scope-managed C++ objects with stack-like push/pop behavior.
|
||||
* Constructing a trace node on the stack will automatically add it to the current node as a child,
|
||||
* and then make it the current node. Destroying the current node will pop the stack and set the
|
||||
* parent to the current node. This provides a robust mechanism for ensuring reliable nesting in the
|
||||
* tree structure.
|
||||
*
|
||||
* A set of utility macros are provided to add attribute annotations to the current trace node.
|
||||
*
|
||||
* Usage
|
||||
* =====
|
||||
*
|
||||
* Create Trace Nodes on the stack using the @c TRACE_NODE() macro. This will compile-out completely
|
||||
* in builds with diagnostics disabled.
|
||||
*
|
||||
* Add annotations to the current trace node using the @c trace_add_data() macro. This will
|
||||
* similarly compile out completely in builds with diagnostics disabled.
|
||||
*
|
||||
* If you need to add additional code to support diagnostics-only behavior wrap
|
||||
* it in preprocessor guards:
|
||||
*
|
||||
* #if defined(ASTCENC_DIAGNOSTICS)
|
||||
* #endif
|
||||
*/
|
||||
|
||||
#ifndef ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
|
||||
#define ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* @brief Class representing a single node in the trace hierarchy.
|
||||
*/
|
||||
class TraceNode
{
public:
	/**
	 * @brief Open a new node in the trace.
	 *
	 * The new node is pushed onto the top of the trace log stack, which makes it a child of the
	 * node that was current, and it then becomes the current node itself.
	 *
	 * @param format   The printf-style template for the node name.
	 * @param ...      The values substituted into the template.
	 */
	TraceNode(const char* format, ...);

	/**
	 * @brief Append one attribute to this node.
	 *
	 * The @c value is written exactly as given; a caller that needs a quoted value must add
	 * the quotes itself.
	 *
	 * @param type    The type of the attribute.
	 * @param key     The key of the attribute.
	 * @param value   The value of the attribute.
	 */
	void add_attrib(std::string type, std::string key, std::string value);

	/**
	 * @brief Close this node.
	 *
	 * The node is popped from the top of the stack, so its parent becomes the current node
	 * again. Nodes must be destroyed in strict stack order; destroying a node that is not the
	 * current node is invalid.
	 */
	~TraceNode();

	/**
	 * @brief The count of attributes and child nodes written into this node so far.
	 */
	unsigned int m_attrib_count = 0;
};
|
||||
|
||||
/**
|
||||
* @brief Class representing the trace log file being written.
|
||||
*/
|
||||
class TraceLog
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @brief Create a new trace log.
|
||||
*
|
||||
* The trace log is global; there can be only one at a time.
|
||||
*
|
||||
* @param file_name The name of the file to write.
|
||||
*/
|
||||
TraceLog(const char* file_name);
|
||||
|
||||
/**
|
||||
* @brief Detroy the trace log.
|
||||
*
|
||||
* Trace logs MUST be cleanly destroyed to ensure the file gets written.
|
||||
*/
|
||||
~TraceLog();
|
||||
|
||||
/**
|
||||
* @brief Get the current child node.
|
||||
*
|
||||
* @return The current leaf node.
|
||||
*/
|
||||
TraceNode* get_current_leaf();
|
||||
|
||||
/**
|
||||
* @brief Get the stack depth of the current child node.
|
||||
*
|
||||
* @return The current leaf node stack depth.
|
||||
*/
|
||||
size_t get_depth();
|
||||
|
||||
/**
|
||||
* @brief The file stream to write to.
|
||||
*/
|
||||
std::ofstream m_file;
|
||||
|
||||
/**
|
||||
* @brief The stack of nodes (newest at the back).
|
||||
*/
|
||||
std::vector<TraceNode*> m_stack;
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief The root node in the JSON file.
|
||||
*/
|
||||
TraceNode* m_root;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Utility macro to create a trace node on the stack.
|
||||
*
|
||||
* @param name The variable name to use.
|
||||
* @param ... The name template and format parameters.
|
||||
*/
|
||||
#define TRACE_NODE(name, ...) TraceNode name(__VA_ARGS__);
|
||||
|
||||
/**
|
||||
* @brief Add a string annotation to the current node.
|
||||
*
|
||||
* @param key The name of the attribute.
|
||||
* @param format The format template for the attribute value.
|
||||
* @param ... The format parameters.
|
||||
*/
|
||||
void trace_add_data(const char* key, const char* format, ...);
|
||||
|
||||
/**
|
||||
* @brief Add a float annotation to the current node.
|
||||
*
|
||||
* @param key The name of the attribute.
|
||||
* @param value The value of the attribute.
|
||||
*/
|
||||
void trace_add_data(const char* key, float value);
|
||||
|
||||
/**
|
||||
* @brief Add an integer annotation to the current node.
|
||||
*
|
||||
* @param key The name of the attribute.
|
||||
* @param value The value of the attribute.
|
||||
*/
|
||||
void trace_add_data(const char* key, int value);
|
||||
|
||||
/**
|
||||
* @brief Add an unsigned integer annotation to the current node.
|
||||
*
|
||||
* @param key The name of the attribute.
|
||||
* @param value The value of the attribute.
|
||||
*/
|
||||
void trace_add_data(const char* key, unsigned int value);
|
||||
|
||||
#else
|
||||
|
||||
#define TRACE_NODE(name, ...)
|
||||
|
||||
#define trace_add_data(...)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,781 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2025 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
|
||||
/**
|
||||
* @brief Functions for finding best partition for a block.
|
||||
*
|
||||
* The partition search operates in two stages. The first pass uses kmeans clustering to group
|
||||
* texels into an ideal partitioning for the requested partition count, and then compares that
|
||||
* against the 1024 partitionings generated by the ASTC partition hash function. The generated
|
||||
* partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
|
||||
* clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
|
||||
* partitionings that actually generate fewer than the requested partition count, but only the top
|
||||
* N candidates are actually put through a more detailed search. N is determined by the compressor
|
||||
* quality preset.
|
||||
*
|
||||
* For the detailed search, each candidate is checked against two possible encoding methods:
|
||||
*
|
||||
* - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
|
||||
* - The best partitioning assuming same chroma colors (RGB + scale endpoints).
|
||||
*
|
||||
* This is implemented by computing the mean color and dominant direction for each
|
||||
* partition. This defines two lines, both of which go through the mean color value.
|
||||
*
|
||||
* - One line has a direction defined by the dominant direction; this is used to assess the error
|
||||
* from using an uncorrelated color representation.
|
||||
* - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
|
||||
* (RGB + scale) color representation.
|
||||
*
|
||||
* The best candidate is selected by computing the squared-errors that result from using these
|
||||
* lines for endpoint selection.
|
||||
*/
|
||||
|
||||
#include <limits>
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/**
|
||||
* @brief Pick some initial kmeans cluster centers.
|
||||
*
|
||||
* @param blk The image block color data to compress.
|
||||
* @param texel_count The number of texels in the block.
|
||||
* @param partition_count The number of partitions in the block.
|
||||
* @param[out] cluster_centers The initial partition cluster center colors.
|
||||
*/
|
||||
static void kmeans_init(
|
||||
const image_block& blk,
|
||||
unsigned int texel_count,
|
||||
unsigned int partition_count,
|
||||
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
|
||||
) {
|
||||
promise(texel_count > 0);
|
||||
promise(partition_count > 0);
|
||||
|
||||
unsigned int clusters_selected = 0;
|
||||
float distances[BLOCK_MAX_TEXELS];
|
||||
|
||||
// Pick a random sample as first cluster center; 145897 from random.org
|
||||
unsigned int sample = 145897 % texel_count;
|
||||
vfloat4 center_color = blk.texel(sample);
|
||||
cluster_centers[clusters_selected] = center_color;
|
||||
clusters_selected++;
|
||||
|
||||
// Compute the distance to the first cluster center
|
||||
float distance_sum = 0.0f;
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
vfloat4 color = blk.texel(i);
|
||||
vfloat4 diff = color - center_color;
|
||||
float distance = dot_s(diff * diff, blk.channel_weight);
|
||||
distance_sum += distance;
|
||||
distances[i] = distance;
|
||||
}
|
||||
|
||||
// More numbers from random.org for weighted-random center selection
|
||||
const float cluster_cutoffs[9] {
|
||||
0.626220f, 0.932770f, 0.275454f,
|
||||
0.318558f, 0.240113f, 0.009190f,
|
||||
0.347661f, 0.731960f, 0.156391f
|
||||
};
|
||||
|
||||
unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
|
||||
|
||||
// Pick the remaining samples as needed
|
||||
while (true)
|
||||
{
|
||||
// Pick the next center in a weighted-random fashion.
|
||||
float summa = 0.0f;
|
||||
float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
|
||||
for (sample = 0; sample < texel_count; sample++)
|
||||
{
|
||||
summa += distances[sample];
|
||||
if (summa >= distance_cutoff)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Clamp to a valid range and store the selected cluster center
|
||||
sample = astc::min(sample, texel_count - 1);
|
||||
|
||||
center_color = blk.texel(sample);
|
||||
cluster_centers[clusters_selected++] = center_color;
|
||||
if (clusters_selected >= partition_count)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
// Compute the distance to the new cluster center, keep the min dist
|
||||
distance_sum = 0.0f;
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
vfloat4 color = blk.texel(i);
|
||||
vfloat4 diff = color - center_color;
|
||||
float distance = dot_s(diff * diff, blk.channel_weight);
|
||||
distance = astc::min(distance, distances[i]);
|
||||
distance_sum += distance;
|
||||
distances[i] = distance;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Assign texels to clusters, based on a set of chosen center points.
|
||||
*
|
||||
* @param blk The image block color data to compress.
|
||||
* @param texel_count The number of texels in the block.
|
||||
* @param partition_count The number of partitions in the block.
|
||||
* @param cluster_centers The partition cluster center colors.
|
||||
* @param[out] partition_of_texel The partition assigned for each texel.
|
||||
*/
|
||||
static void kmeans_assign(
|
||||
const image_block& blk,
|
||||
unsigned int texel_count,
|
||||
unsigned int partition_count,
|
||||
const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
|
||||
uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
|
||||
) {
|
||||
promise(texel_count > 0);
|
||||
promise(partition_count > 0);
|
||||
|
||||
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
|
||||
|
||||
// Find the best partition for every texel
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
float best_distance = std::numeric_limits<float>::max();
|
||||
unsigned int best_partition = 0;
|
||||
|
||||
vfloat4 color = blk.texel(i);
|
||||
for (unsigned int j = 0; j < partition_count; j++)
|
||||
{
|
||||
vfloat4 diff = color - cluster_centers[j];
|
||||
float distance = dot_s(diff * diff, blk.channel_weight);
|
||||
if (distance < best_distance)
|
||||
{
|
||||
best_distance = distance;
|
||||
best_partition = j;
|
||||
}
|
||||
}
|
||||
|
||||
partition_of_texel[i] = static_cast<uint8_t>(best_partition);
|
||||
partition_texel_count[best_partition]++;
|
||||
}
|
||||
|
||||
// It is possible to get a situation where a partition ends up without any texels. In this case,
|
||||
// assign texel N to partition N. This is silly, but ensures that every partition retains at
|
||||
// least one texel. Reassigning a texel in this manner may cause another partition to go empty,
|
||||
// so if we actually did a reassignment, run the whole loop over again.
|
||||
bool problem_case;
|
||||
do
|
||||
{
|
||||
problem_case = false;
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
if (partition_texel_count[i] == 0)
|
||||
{
|
||||
partition_texel_count[partition_of_texel[i]]--;
|
||||
partition_texel_count[i]++;
|
||||
partition_of_texel[i] = static_cast<uint8_t>(i);
|
||||
problem_case = true;
|
||||
}
|
||||
}
|
||||
} while (problem_case);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute new cluster centers based on their center of gravity.
|
||||
*
|
||||
* @param blk The image block color data to compress.
|
||||
* @param texel_count The number of texels in the block.
|
||||
* @param partition_count The number of partitions in the block.
|
||||
* @param[out] cluster_centers The new cluster center colors.
|
||||
* @param partition_of_texel The partition assigned for each texel.
|
||||
*/
|
||||
static void kmeans_update(
|
||||
const image_block& blk,
|
||||
unsigned int texel_count,
|
||||
unsigned int partition_count,
|
||||
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
|
||||
const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
|
||||
) {
|
||||
promise(texel_count > 0);
|
||||
promise(partition_count > 0);
|
||||
|
||||
vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
|
||||
vfloat4::zero(),
|
||||
vfloat4::zero(),
|
||||
vfloat4::zero(),
|
||||
vfloat4::zero()
|
||||
};
|
||||
|
||||
uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
|
||||
|
||||
// Find the center of gravity in each cluster
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
uint8_t partition = partition_of_texel[i];
|
||||
color_sum[partition] += blk.texel(i);
|
||||
partition_texel_count[partition]++;
|
||||
}
|
||||
|
||||
// Set the center of gravity to be the new cluster center
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
|
||||
cluster_centers[i] = color_sum[i] * scale;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute bit-mismatch for partitioning in 2-partition mode.
|
||||
*
|
||||
* @param a The texel assignment bitvector for the block.
|
||||
* @param b The texel assignment bitvector for the partition table.
|
||||
*
|
||||
* @return The number of bit mismatches.
|
||||
*/
|
||||
static inline uint8_t partition_mismatch2(
|
||||
const uint64_t a[2],
|
||||
const uint64_t b[2]
|
||||
) {
|
||||
int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
|
||||
int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
|
||||
|
||||
// Divide by 2 because XOR always counts errors twice, once when missing
|
||||
// in the expected position, and again when present in the wrong partition
|
||||
return static_cast<uint8_t>(astc::min(v1, v2) / 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute bit-mismatch for partitioning in 3-partition mode.
|
||||
*
|
||||
* @param a The texel assignment bitvector for the block.
|
||||
* @param b The texel assignment bitvector for the partition table.
|
||||
*
|
||||
* @return The number of bit mismatches.
|
||||
*/
|
||||
static inline uint8_t partition_mismatch3(
|
||||
const uint64_t a[3],
|
||||
const uint64_t b[3]
|
||||
) {
|
||||
int p00 = popcount(a[0] ^ b[0]);
|
||||
int p01 = popcount(a[0] ^ b[1]);
|
||||
int p02 = popcount(a[0] ^ b[2]);
|
||||
|
||||
int p10 = popcount(a[1] ^ b[0]);
|
||||
int p11 = popcount(a[1] ^ b[1]);
|
||||
int p12 = popcount(a[1] ^ b[2]);
|
||||
|
||||
int p20 = popcount(a[2] ^ b[0]);
|
||||
int p21 = popcount(a[2] ^ b[1]);
|
||||
int p22 = popcount(a[2] ^ b[2]);
|
||||
|
||||
int s0 = p11 + p22;
|
||||
int s1 = p12 + p21;
|
||||
int v0 = astc::min(s0, s1) + p00;
|
||||
|
||||
int s2 = p10 + p22;
|
||||
int s3 = p12 + p20;
|
||||
int v1 = astc::min(s2, s3) + p01;
|
||||
|
||||
int s4 = p10 + p21;
|
||||
int s5 = p11 + p20;
|
||||
int v2 = astc::min(s4, s5) + p02;
|
||||
|
||||
// Divide by 2 because XOR always counts errors twice, once when missing
|
||||
// in the expected position, and again when present in the wrong partition
|
||||
return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute bit-mismatch for partitioning in 4-partition mode.
|
||||
*
|
||||
* @param a The texel assignment bitvector for the block.
|
||||
* @param b The texel assignment bitvector for the partition table.
|
||||
*
|
||||
* @return The number of bit mismatches.
|
||||
*/
|
||||
static inline uint8_t partition_mismatch4(
|
||||
const uint64_t a[4],
|
||||
const uint64_t b[4]
|
||||
) {
|
||||
int p00 = popcount(a[0] ^ b[0]);
|
||||
int p01 = popcount(a[0] ^ b[1]);
|
||||
int p02 = popcount(a[0] ^ b[2]);
|
||||
int p03 = popcount(a[0] ^ b[3]);
|
||||
|
||||
int p10 = popcount(a[1] ^ b[0]);
|
||||
int p11 = popcount(a[1] ^ b[1]);
|
||||
int p12 = popcount(a[1] ^ b[2]);
|
||||
int p13 = popcount(a[1] ^ b[3]);
|
||||
|
||||
int p20 = popcount(a[2] ^ b[0]);
|
||||
int p21 = popcount(a[2] ^ b[1]);
|
||||
int p22 = popcount(a[2] ^ b[2]);
|
||||
int p23 = popcount(a[2] ^ b[3]);
|
||||
|
||||
int p30 = popcount(a[3] ^ b[0]);
|
||||
int p31 = popcount(a[3] ^ b[1]);
|
||||
int p32 = popcount(a[3] ^ b[2]);
|
||||
int p33 = popcount(a[3] ^ b[3]);
|
||||
|
||||
int mx23 = astc::min(p22 + p33, p23 + p32);
|
||||
int mx13 = astc::min(p21 + p33, p23 + p31);
|
||||
int mx12 = astc::min(p21 + p32, p22 + p31);
|
||||
int mx03 = astc::min(p20 + p33, p23 + p30);
|
||||
int mx02 = astc::min(p20 + p32, p22 + p30);
|
||||
int mx01 = astc::min(p21 + p30, p20 + p31);
|
||||
|
||||
int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
|
||||
int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
|
||||
int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
|
||||
int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
|
||||
|
||||
// Divide by 2 because XOR always counts errors twice, once when missing
|
||||
// in the expected position, and again when present in the wrong partition
|
||||
return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
|
||||
}
|
||||
|
||||
// Function pointer type for the partition_mismatch2/3/4 helpers; the return type is uint8_t to
// match their signatures, so the alias can actually hold a pointer to any of them.
using mismatch_dispatch = uint8_t (*)(const uint64_t*, const uint64_t*);
|
||||
|
||||
/**
|
||||
* @brief Count the partition table mismatches vs the data clustering.
|
||||
*
|
||||
* @param bsd The block size information.
|
||||
* @param partition_count The number of partitions in the block.
|
||||
* @param bitmaps The block texel partition assignment patterns.
|
||||
* @param[out] mismatch_counts The array storing per partitioning mismatch counts.
|
||||
*/
|
||||
static void count_partition_mismatch_bits(
|
||||
const block_size_descriptor& bsd,
|
||||
unsigned int partition_count,
|
||||
const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
|
||||
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
|
||||
) {
|
||||
unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
|
||||
promise(active_count > 0);
|
||||
|
||||
if (partition_count == 2)
|
||||
{
|
||||
for (unsigned int i = 0; i < active_count; i++)
|
||||
{
|
||||
mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
|
||||
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
|
||||
assert(mismatch_counts[i] < bsd.texel_count);
|
||||
}
|
||||
}
|
||||
else if (partition_count == 3)
|
||||
{
|
||||
for (unsigned int i = 0; i < active_count; i++)
|
||||
{
|
||||
mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
|
||||
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
|
||||
assert(mismatch_counts[i] < bsd.texel_count);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned int i = 0; i < active_count; i++)
|
||||
{
|
||||
mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
|
||||
assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
|
||||
assert(mismatch_counts[i] < bsd.texel_count);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Use counting sort on the mismatch array to sort partition candidates.
|
||||
*
|
||||
* @param partitioning_count The number of packed partitionings.
|
||||
* @param mismatch_count Partitioning mismatch counts, in index order.
|
||||
* @param[out] partition_ordering Partition index values, in mismatch order.
|
||||
*
|
||||
* @return The number of active partitions in this selection.
|
||||
*/
|
||||
static unsigned int get_partition_ordering_by_mismatch_bits(
|
||||
unsigned int texel_count,
|
||||
unsigned int partitioning_count,
|
||||
const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
|
||||
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
) {
|
||||
promise(partitioning_count > 0);
|
||||
uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
|
||||
|
||||
// Create the histogram of mismatch counts
|
||||
for (unsigned int i = 0; i < partitioning_count; i++)
|
||||
{
|
||||
mscount[mismatch_count[i]]++;
|
||||
}
|
||||
|
||||
// Create a running sum from the histogram array
|
||||
// Indices store previous values only; i.e. exclude self after sum
|
||||
uint16_t sum = 0;
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
uint16_t cnt = mscount[i];
|
||||
mscount[i] = sum;
|
||||
sum += cnt;
|
||||
}
|
||||
|
||||
// Use the running sum as the index, incrementing after read to allow
|
||||
// sequential entries with the same count
|
||||
for (unsigned int i = 0; i < partitioning_count; i++)
|
||||
{
|
||||
unsigned int idx = mscount[mismatch_count[i]]++;
|
||||
partition_ordering[idx] = static_cast<uint16_t>(i);
|
||||
}
|
||||
|
||||
return partitioning_count;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Use k-means clustering to compute a partition ordering for a block..
|
||||
*
|
||||
* @param bsd The block size information.
|
||||
* @param blk The image block color data to compress.
|
||||
* @param partition_count The desired number of partitions in the block.
|
||||
* @param[out] partition_ordering The list of recommended partition indices, in priority order.
|
||||
*
|
||||
* @return The number of active partitionings in this selection.
|
||||
*/
|
||||
static unsigned int compute_kmeans_partition_ordering(
|
||||
const block_size_descriptor& bsd,
|
||||
const image_block& blk,
|
||||
unsigned int partition_count,
|
||||
uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
|
||||
) {
|
||||
vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
|
||||
uint8_t texel_partitions[BLOCK_MAX_TEXELS];
|
||||
|
||||
// Use three passes of k-means clustering to partition the block data
|
||||
for (unsigned int i = 0; i < 3; i++)
|
||||
{
|
||||
if (i == 0)
|
||||
{
|
||||
kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
|
||||
}
|
||||
else
|
||||
{
|
||||
kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
|
||||
}
|
||||
|
||||
kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
|
||||
}
|
||||
|
||||
// Construct the block bitmaps of texel assignments to each partition
|
||||
uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
|
||||
unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
|
||||
promise(texels_to_process > 0);
|
||||
for (unsigned int i = 0; i < texels_to_process; i++)
|
||||
{
|
||||
unsigned int idx = bsd.kmeans_texels[i];
|
||||
bitmaps[texel_partitions[idx]] |= 1ULL << i;
|
||||
}
|
||||
|
||||
// Count the mismatch between the block and the format's partition tables
|
||||
uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
|
||||
count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
|
||||
|
||||
// Sort the partitions based on the number of mismatched bits
|
||||
return get_partition_ordering_by_mismatch_bits(
|
||||
texels_to_process,
|
||||
bsd.partitioning_count_selected[partition_count - 1],
|
||||
mismatch_counts, partition_ordering);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Insert a partitioning into an order list of results, sorted by error.
|
||||
*
|
||||
* @param max_values The max number of entries in the best result arrays.
|
||||
* @param this_error The error of the new entry.
|
||||
* @param this_partition The partition ID of the new entry.
|
||||
* @param[out] best_errors The array of best error values.
|
||||
* @param[out] best_partitions The array of best partition values.
|
||||
*/
|
||||
static void insert_result(
|
||||
unsigned int max_values,
|
||||
float this_error,
|
||||
unsigned int this_partition,
|
||||
float* best_errors,
|
||||
unsigned int* best_partitions)
|
||||
{
|
||||
promise(max_values > 0);
|
||||
|
||||
// Don't bother searching if the current worst error beats the new error
|
||||
if (this_error >= best_errors[max_values - 1])
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Else insert into the list in error-order
|
||||
for (unsigned int i = 0; i < max_values; i++)
|
||||
{
|
||||
// Existing result is better - move on ...
|
||||
if (this_error > best_errors[i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Move existing results down one
|
||||
for (unsigned int j = max_values - 1; j > i; j--)
|
||||
{
|
||||
best_errors[j] = best_errors[j - 1];
|
||||
best_partitions[j] = best_partitions[j - 1];
|
||||
}
|
||||
|
||||
// Insert new result
|
||||
best_errors[i] = this_error;
|
||||
best_partitions[i] = this_partition;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
unsigned int find_best_partition_candidates(
|
||||
const block_size_descriptor& bsd,
|
||||
const image_block& blk,
|
||||
unsigned int partition_count,
|
||||
unsigned int partition_search_limit,
|
||||
unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
|
||||
unsigned int requested_candidates
|
||||
) {
|
||||
// Constant used to estimate quantization error for a given partitioning; the optimal value for
|
||||
// this depends on bitrate. These values have been determined empirically.
|
||||
unsigned int texels_per_block = bsd.texel_count;
|
||||
float weight_imprecision_estim = 0.055f;
|
||||
if (texels_per_block <= 20)
|
||||
{
|
||||
weight_imprecision_estim = 0.03f;
|
||||
}
|
||||
else if (texels_per_block <= 31)
|
||||
{
|
||||
weight_imprecision_estim = 0.04f;
|
||||
}
|
||||
else if (texels_per_block <= 41)
|
||||
{
|
||||
weight_imprecision_estim = 0.05f;
|
||||
}
|
||||
|
||||
promise(partition_count > 0);
|
||||
promise(partition_search_limit > 0);
|
||||
|
||||
weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
|
||||
|
||||
uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
|
||||
unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
|
||||
partition_search_limit = astc::min(partition_search_limit, sequence_len);
|
||||
requested_candidates = astc::min(partition_search_limit, requested_candidates);
|
||||
|
||||
bool uses_alpha = !blk.is_constant_channel(3);
|
||||
|
||||
// Partitioning errors assuming uncorrelated-chrominance endpoints
|
||||
float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
|
||||
// Partitioning errors assuming same-chrominance endpoints
|
||||
float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
|
||||
for (unsigned int i = 0; i < requested_candidates; i++)
|
||||
{
|
||||
uncor_best_errors[i] = ERROR_CALC_DEFAULT;
|
||||
samec_best_errors[i] = ERROR_CALC_DEFAULT;
|
||||
}
|
||||
|
||||
if (uses_alpha)
|
||||
{
|
||||
for (unsigned int i = 0; i < partition_search_limit; i++)
|
||||
{
|
||||
unsigned int partition = partition_sequence[i];
|
||||
const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
|
||||
|
||||
// Compute weighting to give to each component in each partition
|
||||
partition_metrics pms[BLOCK_MAX_PARTITIONS];
|
||||
|
||||
compute_avgs_and_dirs_4_comp(pi, blk, pms);
|
||||
|
||||
line4 uncor_lines[BLOCK_MAX_PARTITIONS];
|
||||
line4 samec_lines[BLOCK_MAX_PARTITIONS];
|
||||
|
||||
processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
|
||||
processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
|
||||
|
||||
float line_lengths[BLOCK_MAX_PARTITIONS];
|
||||
|
||||
for (unsigned int j = 0; j < partition_count; j++)
|
||||
{
|
||||
partition_metrics& pm = pms[j];
|
||||
|
||||
uncor_lines[j].a = pm.avg;
|
||||
uncor_lines[j].b = normalize_safe(pm.dir, unit4());
|
||||
|
||||
uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
|
||||
uncor_plines[j].bs = uncor_lines[j].b;
|
||||
|
||||
samec_lines[j].a = vfloat4::zero();
|
||||
samec_lines[j].b = normalize_safe(pm.avg, unit4());
|
||||
|
||||
samec_plines[j].amod = vfloat4::zero();
|
||||
samec_plines[j].bs = samec_lines[j].b;
|
||||
}
|
||||
|
||||
float uncor_error = 0.0f;
|
||||
float samec_error = 0.0f;
|
||||
|
||||
compute_error_squared_rgba(pi,
|
||||
blk,
|
||||
uncor_plines,
|
||||
samec_plines,
|
||||
line_lengths,
|
||||
uncor_error,
|
||||
samec_error);
|
||||
|
||||
// Compute an estimate of error introduced by weight quantization imprecision.
|
||||
// This error is computed as follows, for each partition
|
||||
// 1: compute the principal-axis vector (full length) in error-space
|
||||
// 2: convert the principal-axis vector to regular RGB-space
|
||||
// 3: scale the vector by a constant that estimates average quantization error
|
||||
// 4: for each texel, square the vector, then do a dot-product with the texel's
|
||||
// error weight; sum up the results across all texels.
|
||||
// 4(optimized): square the vector once, then do a dot-product with the average
|
||||
// texel error, then multiply by the number of texels.
|
||||
|
||||
for (unsigned int j = 0; j < partition_count; j++)
|
||||
{
|
||||
float tpp = static_cast<float>(pi.partition_texel_count[j]);
|
||||
vfloat4 error_weights(tpp * weight_imprecision_estim);
|
||||
|
||||
vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
|
||||
vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
|
||||
|
||||
uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
|
||||
samec_error += dot_s(samec_vector * samec_vector, error_weights);
|
||||
}
|
||||
|
||||
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
|
||||
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (unsigned int i = 0; i < partition_search_limit; i++)
|
||||
{
|
||||
unsigned int partition = partition_sequence[i];
|
||||
const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
|
||||
|
||||
// Compute weighting to give to each component in each partition
|
||||
partition_metrics pms[BLOCK_MAX_PARTITIONS];
|
||||
compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
|
||||
|
||||
partition_lines3 plines[BLOCK_MAX_PARTITIONS];
|
||||
|
||||
for (unsigned int j = 0; j < partition_count; j++)
|
||||
{
|
||||
partition_metrics& pm = pms[j];
|
||||
partition_lines3& pl = plines[j];
|
||||
|
||||
pl.uncor_line.a = pm.avg;
|
||||
pl.uncor_line.b = normalize_safe(pm.dir, unit3());
|
||||
|
||||
pl.samec_line.a = vfloat4::zero();
|
||||
pl.samec_line.b = normalize_safe(pm.avg, unit3());
|
||||
|
||||
pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
|
||||
pl.uncor_pline.bs = pl.uncor_line.b;
|
||||
|
||||
pl.samec_pline.amod = vfloat4::zero();
|
||||
pl.samec_pline.bs = pl.samec_line.b;
|
||||
}
|
||||
|
||||
float uncor_error = 0.0f;
|
||||
float samec_error = 0.0f;
|
||||
|
||||
compute_error_squared_rgb(pi,
|
||||
blk,
|
||||
plines,
|
||||
uncor_error,
|
||||
samec_error);
|
||||
|
||||
// Compute an estimate of error introduced by weight quantization imprecision.
|
||||
// This error is computed as follows, for each partition
|
||||
// 1: compute the principal-axis vector (full length) in error-space
|
||||
// 2: convert the principal-axis vector to regular RGB-space
|
||||
// 3: scale the vector by a constant that estimates average quantization error
|
||||
// 4: for each texel, square the vector, then do a dot-product with the texel's
|
||||
// error weight; sum up the results across all texels.
|
||||
// 4(optimized): square the vector once, then do a dot-product with the average
|
||||
// texel error, then multiply by the number of texels.
|
||||
|
||||
for (unsigned int j = 0; j < partition_count; j++)
|
||||
{
|
||||
partition_lines3& pl = plines[j];
|
||||
|
||||
float tpp = static_cast<float>(pi.partition_texel_count[j]);
|
||||
vfloat4 error_weights(tpp * weight_imprecision_estim);
|
||||
|
||||
vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
|
||||
vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
|
||||
|
||||
uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
|
||||
samec_error += dot3_s(samec_vector * samec_vector, error_weights);
|
||||
}
|
||||
|
||||
insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
|
||||
insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
|
||||
for (unsigned int i = 0; i < requested_candidates; i++)
|
||||
{
|
||||
interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
|
||||
interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
|
||||
}
|
||||
|
||||
uint64_t bitmasks[1024/64] { 0 };
|
||||
unsigned int emitted = 0;
|
||||
|
||||
// Deduplicate the first "requested" entries
|
||||
for (unsigned int i = 0; i < requested_candidates * 2; i++)
|
||||
{
|
||||
unsigned int partition = interleave[i];
|
||||
|
||||
unsigned int word = partition / 64;
|
||||
unsigned int bit = partition % 64;
|
||||
|
||||
bool written = bitmasks[word] & (1ull << bit);
|
||||
|
||||
if (!written)
|
||||
{
|
||||
best_partitions[emitted] = partition;
|
||||
bitmasks[word] |= 1ull << bit;
|
||||
emitted++;
|
||||
|
||||
if (emitted == requested_candidates)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return emitted;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,558 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions for creating in-memory ASTC image structures.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/**
|
||||
* @brief Loader pipeline function type for data fetch from memory.
|
||||
*/
|
||||
// Arguments are the data pointer and the index offset to the start of the pixel; the result is
// the loaded texel as a 4-component vector.
using pixel_loader = vfloat4(*)(const void*, int);
|
||||
|
||||
/**
|
||||
* @brief Loader pipeline function type for swizzling data in a vector.
|
||||
*/
|
||||
// Arguments are the source RGBA vector and the swizzle to use; the result is the swizzled vector.
using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&);
|
||||
|
||||
/**
|
||||
* @brief Loader pipeline function type for converting data in a vector to LNS.
|
||||
*/
|
||||
// Arguments are the RGBA data to encode and the mask of channels that need LNS encoding; the
// result is the encoded vector.
using pixel_converter = vfloat4(*)(vfloat4, vmask4);
|
||||
|
||||
/**
|
||||
* @brief Load a 8-bit UNORM texel from a data array.
|
||||
*
|
||||
* @param data The data pointer.
|
||||
* @param base_offset The index offset to the start of the pixel.
|
||||
*/
|
||||
static vfloat4 load_texel_u8(
|
||||
const void* data,
|
||||
int base_offset
|
||||
) {
|
||||
const uint8_t* data8 = static_cast<const uint8_t*>(data);
|
||||
return int_to_float(vint4(data8 + base_offset)) / 255.0f;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Load a 16-bit fp16 texel from a data array.
|
||||
*
|
||||
* @param data The data pointer.
|
||||
* @param base_offset The index offset to the start of the pixel.
|
||||
*/
|
||||
static vfloat4 load_texel_f16(
|
||||
const void* data,
|
||||
int base_offset
|
||||
) {
|
||||
const uint16_t* data16 = static_cast<const uint16_t*>(data);
|
||||
int r = data16[base_offset ];
|
||||
int g = data16[base_offset + 1];
|
||||
int b = data16[base_offset + 2];
|
||||
int a = data16[base_offset + 3];
|
||||
return float16_to_float(vint4(r, g, b, a));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Load a 32-bit float texel from a data array.
|
||||
*
|
||||
* @param data The data pointer.
|
||||
* @param base_offset The index offset to the start of the pixel.
|
||||
*/
|
||||
static vfloat4 load_texel_f32(
|
||||
const void* data,
|
||||
int base_offset
|
||||
) {
|
||||
const float* data32 = static_cast<const float*>(data);
|
||||
return vfloat4(data32 + base_offset);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Dummy no-op swizzle function.
|
||||
*
|
||||
* @param data The source RGBA vector to swizzle.
|
||||
* @param swz The swizzle to use.
|
||||
*/
|
||||
static vfloat4 swz_texel_skip(
|
||||
vfloat4 data,
|
||||
const astcenc_swizzle& swz
|
||||
) {
|
||||
(void)swz;
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Swizzle a texel into a new arrangement.
|
||||
*
|
||||
* @param data The source RGBA vector to swizzle.
|
||||
* @param swz The swizzle to use.
|
||||
*/
|
||||
static vfloat4 swz_texel(
|
||||
vfloat4 data,
|
||||
const astcenc_swizzle& swz
|
||||
) {
|
||||
ASTCENC_ALIGNAS float datas[6];
|
||||
|
||||
storea(data, datas);
|
||||
datas[ASTCENC_SWZ_0] = 0.0f;
|
||||
datas[ASTCENC_SWZ_1] = 1.0f;
|
||||
|
||||
return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Encode a texel that is entirely LDR linear.
|
||||
*
|
||||
* @param data The RGBA data to encode.
|
||||
* @param lns_mask The mask for the HDR channels than need LNS encoding.
|
||||
*/
|
||||
static vfloat4 encode_texel_unorm(
|
||||
vfloat4 data,
|
||||
vmask4 lns_mask
|
||||
) {
|
||||
(void)lns_mask;
|
||||
return data * 65535.0f;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Encode a texel that includes at least some HDR LNS texels.
|
||||
*
|
||||
* @param data The RGBA data to encode.
|
||||
* @param lns_mask The mask for the HDR channels than need LNS encoding.
|
||||
*/
|
||||
static vfloat4 encode_texel_lns(
|
||||
vfloat4 data,
|
||||
vmask4 lns_mask
|
||||
) {
|
||||
vfloat4 datav_unorm = data * 65535.0f;
|
||||
vfloat4 datav_lns = float_to_lns(data);
|
||||
return select(datav_unorm, datav_lns, lns_mask);
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void load_image_block(
	astcenc_profile decode_mode,
	const astcenc_image& img,
	image_block& blk,
	const block_size_descriptor& bsd,
	unsigned int xpos,
	unsigned int ypos,
	unsigned int zpos,
	const astcenc_swizzle& swz
) {
	const unsigned int img_w = img.dim_x;
	const unsigned int img_h = img.dim_y;
	const unsigned int img_d = img.dim_z;

	blk.xpos = xpos;
	blk.ypos = ypos;
	blk.zpos = zpos;

	// Running block statistics, updated once per loaded texel
	vfloat4 blk_min(1e38f);
	vfloat4 blk_max(-1e38f);
	vfloat4 blk_mean(0.0f);
	vfloat4 mean_weight(1.0f / static_cast<float>(bsd.texel_count));
	vmask4 is_gray(true);

	// This works because we impose the same choice everywhere during encode
	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
	                  (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
	vmask4 lns_mask = use_lns != vint4::zero();

	// Stage 1 of the load pipeline: fetch a texel in the image's data type
	pixel_loader loader = load_texel_u8;
	if (img.data_type == ASTCENC_TYPE_F16)
	{
		loader = load_texel_f16;
	}
	else if (img.data_type == ASTCENC_TYPE_F32)
	{
		loader = load_texel_f32;
	}

	// Stage 2: apply the swizzle, skipped entirely for the identity swizzle
	const bool identity_swz = (swz.r == ASTCENC_SWZ_R) && (swz.g == ASTCENC_SWZ_G) &&
	                          (swz.b == ASTCENC_SWZ_B) && (swz.a == ASTCENC_SWZ_A);

	pixel_swizzler swizzler = swz_texel;
	if (identity_swz)
	{
		swizzler = swz_texel_skip;
	}

	// Stage 3: convert to the stored encoding; LNS only if a channel needs it
	pixel_converter converter = encode_texel_lns;
	if (!any(lns_mask))
	{
		converter = encode_texel_unorm;
	}

	int texel_idx = 0;

	for (unsigned int z = 0; z < bsd.zdim; z++)
	{
		// Coordinates past the image edge are clamped to the last valid texel
		unsigned int src_z = astc::min(zpos + z, img_d - 1);
		void* plane = img.data[src_z];

		for (unsigned int y = 0; y < bsd.ydim; y++)
		{
			unsigned int src_y = astc::min(ypos + y, img_h - 1);

			for (unsigned int x = 0; x < bsd.xdim; x++)
			{
				unsigned int src_x = astc::min(xpos + x, img_w - 1);

				vfloat4 texel = loader(plane, (4 * img_w * src_y) + (4 * src_x));
				texel = swizzler(texel, swz);
				texel = converter(texel, lns_mask);

				// Compute block metadata
				blk_min = min(blk_min, texel);
				blk_mean += texel * mean_weight;
				blk_max = max(blk_max, texel);

				// Stays set only while R == G and R == B for every texel so far
				is_gray = is_gray & (texel.swz<0,0,0,0>() == texel.swz<1,1,2,2>());

				blk.data_r[texel_idx] = texel.lane<0>();
				blk.data_g[texel_idx] = texel.lane<1>();
				blk.data_b[texel_idx] = texel.lane<2>();
				blk.data_a[texel_idx] = texel.lane<3>();

				blk.rgb_lns[texel_idx] = rgb_lns;
				blk.alpha_lns[texel_idx] = a_lns;

				texel_idx++;
			}
		}
	}

	// Reverse the encoding so we store origin block in the original format
	vfloat4 origin_enc = blk.texel(0);
	vfloat4 origin_unorm = origin_enc / 65535.0f;
	vfloat4 origin_lns = vfloat4::zero();

	if (rgb_lns || a_lns)
	{
		origin_lns = float16_to_float(lns_to_sf16(float_to_int(origin_enc)));
	}

	blk.origin_texel = select(origin_unorm, origin_lns, lns_mask);

	// Store block metadata
	blk.data_min = blk_min;
	blk.data_mean = blk_mean;
	blk.data_max = blk_max;
	blk.grayscale = all(is_gray);
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void load_image_block_fast_ldr(
	astcenc_profile decode_mode,
	const astcenc_image& img,
	image_block& blk,
	const block_size_descriptor& bsd,
	unsigned int xpos,
	unsigned int ypos,
	unsigned int zpos,
	const astcenc_swizzle& swz
) {
	// This path never consults the profile or the swizzle
	(void)decode_mode;
	(void)swz;

	const unsigned int img_w = img.dim_x;
	const unsigned int img_h = img.dim_y;

	blk.xpos = xpos;
	blk.ypos = ypos;
	blk.zpos = zpos;

	// Running block statistics; the mean is accumulated as a plain sum and
	// divided by the texel count once at the end
	vfloat4 blk_min(1e38f);
	vfloat4 blk_max(-1e38f);
	vfloat4 blk_sum = vfloat4::zero();
	vmask4 is_gray(true);

	// Texels are read as 8-bit data from the first image plane only
	const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);

	const unsigned int x_end = xpos + bsd.xdim;
	const unsigned int y_end = ypos + bsd.ydim;

	int texel_idx = 0;

	for (unsigned int y = ypos; y < y_end; y++)
	{
		// Coordinates past the image edge are clamped to the last valid texel
		unsigned int src_y = astc::min(y, img_h - 1);

		for (unsigned int x = xpos; x < x_end; x++)
		{
			unsigned int src_x = astc::min(x, img_w - 1);

			vint4 texel_u8 = vint4(plane + (4 * img_w * src_y) + (4 * src_x));
			vfloat4 texel = int_to_float(texel_u8) * (65535.0f / 255.0f);

			// Compute block metadata
			blk_min = min(blk_min, texel);
			blk_sum += texel;
			blk_max = max(blk_max, texel);

			// Stays set only while R == G and R == B for every texel so far
			is_gray = is_gray & (texel.swz<0,0,0,0>() == texel.swz<1,1,2,2>());

			blk.data_r[texel_idx] = texel.lane<0>();
			blk.data_g[texel_idx] = texel.lane<1>();
			blk.data_b[texel_idx] = texel.lane<2>();
			blk.data_a[texel_idx] = texel.lane<3>();

			texel_idx++;
		}
	}

	// Reverse the encoding so we store origin block in the original format
	blk.origin_texel = blk.texel(0) / 65535.0f;

	// Store block metadata; only entry 0 of the LNS flag arrays is written here
	blk.rgb_lns[0] = 0;
	blk.alpha_lns[0] = 0;
	blk.data_min = blk_min;
	blk.data_mean = blk_sum / static_cast<float>(bsd.texel_count);
	blk.data_max = blk_max;
	blk.grayscale = all(is_gray);
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void store_image_block(
	astcenc_image& img,
	const image_block& blk,
	const block_size_descriptor& bsd,
	unsigned int xpos,
	unsigned int ypos,
	unsigned int zpos,
	const astcenc_swizzle& swz
) {
	// Clip the block footprint against the image. The "skip" values are the
	// number of block texels to step over so that the block texel index stays
	// in sync when part of a row, or some whole rows, fall outside the image.
	unsigned int img_w = img.dim_x;
	unsigned int col_first = xpos;
	unsigned int col_last = astc::min(img_w, xpos + bsd.xdim);
	unsigned int col_count = col_last - col_first;
	unsigned int col_skip = bsd.xdim - col_count;

	unsigned int img_h = img.dim_y;
	unsigned int row_first = ypos;
	unsigned int row_last = astc::min(img_h, ypos + bsd.ydim);
	unsigned int row_count = row_last - row_first;
	unsigned int row_skip = (bsd.ydim - row_count) * bsd.xdim;

	unsigned int img_d = img.dim_z;
	unsigned int plane_first = zpos;
	unsigned int plane_last = astc::min(img_d, zpos + bsd.zdim);

	// True if any non-identity swizzle
	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);

	// True if any swizzle uses Z reconstruct
	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
	               (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);

	int texel_idx = 0;
	if (img.data_type == ASTCENC_TYPE_U8)
	{
		for (unsigned int z = plane_first; z < plane_last; z++)
		{
			// Fetch the image plane
			uint8_t* plane8 = static_cast<uint8_t*>(img.data[z]);

			for (unsigned int y = row_first; y < row_last; y++)
			{
				uint8_t* out8 = plane8 + (4 * img_w * y) + (4 * col_first);

				for (unsigned int x = 0; x < col_count; x += ASTCENC_SIMD_WIDTH)
				{
					unsigned int max_texels = ASTCENC_SIMD_WIDTH;
					unsigned int used_texels = astc::min(col_count - x, max_texels);

					// Unaligned load as rows are not always SIMD_WIDTH long
					vfloat src_r(blk.data_r + texel_idx);
					vfloat src_g(blk.data_g + texel_idx);
					vfloat src_b(blk.data_b + texel_idx);
					vfloat src_a(blk.data_a + texel_idx);

					// Clamp to 1.0, scale to 0-255 and convert to integer
					vint out_r = float_to_int_rtn(min(src_r, 1.0f) * 255.0f);
					vint out_g = float_to_int_rtn(min(src_g, 1.0f) * 255.0f);
					vint out_b = float_to_int_rtn(min(src_b, 1.0f) * 255.0f);
					vint out_a = float_to_int_rtn(min(src_a, 1.0f) * 255.0f);

					if (needs_swz)
					{
						// One slot per swizzle selector; the Z slot is only
						// filled in when some channel actually selects it
						vint swizzle_table[7];
						swizzle_table[ASTCENC_SWZ_0] = vint(0);
						swizzle_table[ASTCENC_SWZ_1] = vint(255);
						swizzle_table[ASTCENC_SWZ_R] = out_r;
						swizzle_table[ASTCENC_SWZ_G] = out_g;
						swizzle_table[ASTCENC_SWZ_B] = out_b;
						swizzle_table[ASTCENC_SWZ_A] = out_a;

						if (needs_z)
						{
							// Rebuild Z from the R and A channels: remap both
							// from [0, 1] to [-1, 1], solve x^2 + y^2 + z^2 = 1
							// for z, then map the result back to [0, 1]
							vfloat norm_x = (src_r * vfloat(2.0f)) - vfloat(1.0f);
							vfloat norm_y = (src_a * vfloat(2.0f)) - vfloat(1.0f);
							vfloat norm_z = vfloat(1.0f) - (norm_x * norm_x) - (norm_y * norm_y);
							norm_z = max(norm_z, 0.0f);
							norm_z = (sqrt(norm_z) * vfloat(0.5f)) + vfloat(0.5f);

							swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(norm_z, 1.0f) * 255.0f);
						}

						out_r = swizzle_table[swz.r];
						out_g = swizzle_table[swz.g];
						out_b = swizzle_table[swz.b];
						out_a = swizzle_table[swz.a];
					}

					// Errors are NaN encoded - convert to magenta error color
					// Branch is OK here - it is almost never true so predicts well
					vmask nan_mask = src_r != src_r;
					if (any(nan_mask))
					{
						out_r = select(out_r, vint(0xFF), nan_mask);
						out_g = select(out_g, vint(0x00), nan_mask);
						out_b = select(out_b, vint(0xFF), nan_mask);
						out_a = select(out_a, vint(0xFF), nan_mask);
					}

					// Only the lanes that map to in-image texels are written
					vint packed = interleave_rgba8(out_r, out_g, out_b, out_a);
					vmask store_mask = vint::lane_id() < vint(used_texels);
					store_lanes_masked(out8, packed, store_mask);

					out8 += ASTCENC_SIMD_WIDTH * 4;
					texel_idx += used_texels;
				}
				texel_idx += col_skip;
			}
			texel_idx += row_skip;
		}
	}
	else if (img.data_type == ASTCENC_TYPE_F16)
	{
		for (unsigned int z = plane_first; z < plane_last; z++)
		{
			// Fetch the image plane
			uint16_t* plane16 = static_cast<uint16_t*>(img.data[z]);

			for (unsigned int y = row_first; y < row_last; y++)
			{
				uint16_t* out16 = plane16 + (4 * img_w * y) + (4 * col_first);

				for (unsigned int x = 0; x < col_count; x++)
				{
					vint4 color;

					// NaNs are handled inline - no need to special case
					if (needs_swz)
					{
						float data[7];
						data[ASTCENC_SWZ_0] = 0.0f;
						data[ASTCENC_SWZ_1] = 1.0f;
						data[ASTCENC_SWZ_R] = blk.data_r[texel_idx];
						data[ASTCENC_SWZ_G] = blk.data_g[texel_idx];
						data[ASTCENC_SWZ_B] = blk.data_b[texel_idx];
						data[ASTCENC_SWZ_A] = blk.data_a[texel_idx];

						if (needs_z)
						{
							// Scalar form of the Z reconstruction; slots 0 and 3
							// are presumably the R and A slots, matching the
							// 8-bit path - TODO confirm against the enum values
							float xN = (data[0] * 2.0f) - 1.0f;
							float yN = (data[3] * 2.0f) - 1.0f;
							float zN = 1.0f - xN * xN - yN * yN;
							if (zN < 0.0f)
							{
								zN = 0.0f;
							}
							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
						}

						vfloat4 swizzled(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
						color = float_to_float16(swizzled);
					}
					else
					{
						vfloat4 unswizzled = blk.texel(texel_idx);
						color = float_to_float16(unswizzled);
					}

					// TODO: Vectorize with store N shorts?
					out16[0] = static_cast<uint16_t>(color.lane<0>());
					out16[1] = static_cast<uint16_t>(color.lane<1>());
					out16[2] = static_cast<uint16_t>(color.lane<2>());
					out16[3] = static_cast<uint16_t>(color.lane<3>());
					out16 += 4;
					texel_idx++;
				}
				texel_idx += col_skip;
			}
			texel_idx += row_skip;
		}
	}
	else // if (img.data_type == ASTCENC_TYPE_F32)
	{
		assert(img.data_type == ASTCENC_TYPE_F32);

		for (unsigned int z = plane_first; z < plane_last; z++)
		{
			// Fetch the image plane
			float* plane32 = static_cast<float*>(img.data[z]);

			for (unsigned int y = row_first; y < row_last; y++)
			{
				float* out32 = plane32 + (4 * img_w * y) + (4 * col_first);

				for (unsigned int x = 0; x < col_count; x++)
				{
					vfloat4 color = blk.texel(texel_idx);

					// NaNs are handled inline - no need to special case
					if (needs_swz)
					{
						float data[7];
						data[ASTCENC_SWZ_0] = 0.0f;
						data[ASTCENC_SWZ_1] = 1.0f;
						data[ASTCENC_SWZ_R] = color.lane<0>();
						data[ASTCENC_SWZ_G] = color.lane<1>();
						data[ASTCENC_SWZ_B] = color.lane<2>();
						data[ASTCENC_SWZ_A] = color.lane<3>();

						if (needs_z)
						{
							// Scalar form of the Z reconstruction; slots 0 and 3
							// are presumably the R and A slots, matching the
							// 8-bit path - TODO confirm against the enum values
							float xN = (data[0] * 2.0f) - 1.0f;
							float yN = (data[3] * 2.0f) - 1.0f;
							float zN = 1.0f - xN * xN - yN * yN;
							if (zN < 0.0f)
							{
								zN = 0.0f;
							}
							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
						}

						color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
					}

					store(color, out32);
					out32 += 4;
					texel_idx++;
				}
				texel_idx += col_skip;
			}
			texel_idx += row_skip;
		}
	}
}
|
||||
@@ -0,0 +1,739 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions for encoding/decoding Bounded Integer Sequence Encoding.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#include <array>
|
||||
|
||||
/**
 * @brief Unpacked quint triplets <low,middle,high> for each packed value.
 *
 * The table has one entry for each of the 128 packed values, and every entry
 * holds three quints in the range 0-4. There are more packed values than
 * distinct triplets, so some triplets appear more than once, e.g. {4, 4, 4}.
 */
|
||||
// TODO: Bitpack these into a uint16_t?
|
||||
static const uint8_t quints_of_integer[128][3] {
|
||||
{0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0},
|
||||
{4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4},
|
||||
{0, 1, 0}, {1, 1, 0}, {2, 1, 0}, {3, 1, 0},
|
||||
{4, 1, 0}, {1, 4, 0}, {4, 4, 1}, {4, 4, 4},
|
||||
{0, 2, 0}, {1, 2, 0}, {2, 2, 0}, {3, 2, 0},
|
||||
{4, 2, 0}, {2, 4, 0}, {4, 4, 2}, {4, 4, 4},
|
||||
{0, 3, 0}, {1, 3, 0}, {2, 3, 0}, {3, 3, 0},
|
||||
{4, 3, 0}, {3, 4, 0}, {4, 4, 3}, {4, 4, 4},
|
||||
{0, 0, 1}, {1, 0, 1}, {2, 0, 1}, {3, 0, 1},
|
||||
{4, 0, 1}, {0, 4, 1}, {4, 0, 4}, {0, 4, 4},
|
||||
{0, 1, 1}, {1, 1, 1}, {2, 1, 1}, {3, 1, 1},
|
||||
{4, 1, 1}, {1, 4, 1}, {4, 1, 4}, {1, 4, 4},
|
||||
{0, 2, 1}, {1, 2, 1}, {2, 2, 1}, {3, 2, 1},
|
||||
{4, 2, 1}, {2, 4, 1}, {4, 2, 4}, {2, 4, 4},
|
||||
{0, 3, 1}, {1, 3, 1}, {2, 3, 1}, {3, 3, 1},
|
||||
{4, 3, 1}, {3, 4, 1}, {4, 3, 4}, {3, 4, 4},
|
||||
{0, 0, 2}, {1, 0, 2}, {2, 0, 2}, {3, 0, 2},
|
||||
{4, 0, 2}, {0, 4, 2}, {2, 0, 4}, {3, 0, 4},
|
||||
{0, 1, 2}, {1, 1, 2}, {2, 1, 2}, {3, 1, 2},
|
||||
{4, 1, 2}, {1, 4, 2}, {2, 1, 4}, {3, 1, 4},
|
||||
{0, 2, 2}, {1, 2, 2}, {2, 2, 2}, {3, 2, 2},
|
||||
{4, 2, 2}, {2, 4, 2}, {2, 2, 4}, {3, 2, 4},
|
||||
{0, 3, 2}, {1, 3, 2}, {2, 3, 2}, {3, 3, 2},
|
||||
{4, 3, 2}, {3, 4, 2}, {2, 3, 4}, {3, 3, 4},
|
||||
{0, 0, 3}, {1, 0, 3}, {2, 0, 3}, {3, 0, 3},
|
||||
{4, 0, 3}, {0, 4, 3}, {0, 0, 4}, {1, 0, 4},
|
||||
{0, 1, 3}, {1, 1, 3}, {2, 1, 3}, {3, 1, 3},
|
||||
{4, 1, 3}, {1, 4, 3}, {0, 1, 4}, {1, 1, 4},
|
||||
{0, 2, 3}, {1, 2, 3}, {2, 2, 3}, {3, 2, 3},
|
||||
{4, 2, 3}, {2, 4, 3}, {0, 2, 4}, {1, 2, 4},
|
||||
{0, 3, 3}, {1, 3, 3}, {2, 3, 3}, {3, 3, 3},
|
||||
{4, 3, 3}, {3, 4, 3}, {0, 3, 4}, {1, 3, 4}
|
||||
};
|
||||
|
||||
/**
 * @brief Packed quint values for each unpacked value, indexed [hi][mid][lo].
 *
 * Each index is a quint in the range 0-4. This is the reverse lookup of
 * quints_of_integer: the stored value is a packed value that quints_of_integer
 * expands back to the same <lo,mid,hi> triplet.
 */
|
||||
static const uint8_t integer_of_quints[5][5][5] {
|
||||
{
|
||||
{0, 1, 2, 3, 4},
|
||||
{8, 9, 10, 11, 12},
|
||||
{16, 17, 18, 19, 20},
|
||||
{24, 25, 26, 27, 28},
|
||||
{5, 13, 21, 29, 6}
|
||||
},
|
||||
{
|
||||
{32, 33, 34, 35, 36},
|
||||
{40, 41, 42, 43, 44},
|
||||
{48, 49, 50, 51, 52},
|
||||
{56, 57, 58, 59, 60},
|
||||
{37, 45, 53, 61, 14}
|
||||
},
|
||||
{
|
||||
{64, 65, 66, 67, 68},
|
||||
{72, 73, 74, 75, 76},
|
||||
{80, 81, 82, 83, 84},
|
||||
{88, 89, 90, 91, 92},
|
||||
{69, 77, 85, 93, 22}
|
||||
},
|
||||
{
|
||||
{96, 97, 98, 99, 100},
|
||||
{104, 105, 106, 107, 108},
|
||||
{112, 113, 114, 115, 116},
|
||||
{120, 121, 122, 123, 124},
|
||||
{101, 109, 117, 125, 30}
|
||||
},
|
||||
{
|
||||
{102, 103, 70, 71, 38},
|
||||
{110, 111, 78, 79, 46},
|
||||
{118, 119, 86, 87, 54},
|
||||
{126, 127, 94, 95, 62},
|
||||
{39, 47, 55, 63, 31}
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * @brief Unpacked trit quintuplets <low,...,high> for each packed value.
 *
 * The table has one entry for each of the 256 packed values, and every entry
 * holds five trits in the range 0-2. There are more packed values than
 * distinct quintuplets, so some quintuplets appear more than once.
 */
|
||||
// TODO: Bitpack these into a uint16_t?
|
||||
static const uint8_t trits_of_integer[256][5] {
|
||||
{0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0},
|
||||
{0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0},
|
||||
{0, 2, 0, 0, 0}, {1, 2, 0, 0, 0}, {2, 2, 0, 0, 0}, {2, 0, 2, 0, 0},
|
||||
{0, 2, 2, 0, 0}, {1, 2, 2, 0, 0}, {2, 2, 2, 0, 0}, {2, 0, 2, 0, 0},
|
||||
{0, 0, 1, 0, 0}, {1, 0, 1, 0, 0}, {2, 0, 1, 0, 0}, {0, 1, 2, 0, 0},
|
||||
{0, 1, 1, 0, 0}, {1, 1, 1, 0, 0}, {2, 1, 1, 0, 0}, {1, 1, 2, 0, 0},
|
||||
{0, 2, 1, 0, 0}, {1, 2, 1, 0, 0}, {2, 2, 1, 0, 0}, {2, 1, 2, 0, 0},
|
||||
{0, 0, 0, 2, 2}, {1, 0, 0, 2, 2}, {2, 0, 0, 2, 2}, {0, 0, 2, 2, 2},
|
||||
{0, 0, 0, 1, 0}, {1, 0, 0, 1, 0}, {2, 0, 0, 1, 0}, {0, 0, 2, 1, 0},
|
||||
{0, 1, 0, 1, 0}, {1, 1, 0, 1, 0}, {2, 1, 0, 1, 0}, {1, 0, 2, 1, 0},
|
||||
{0, 2, 0, 1, 0}, {1, 2, 0, 1, 0}, {2, 2, 0, 1, 0}, {2, 0, 2, 1, 0},
|
||||
{0, 2, 2, 1, 0}, {1, 2, 2, 1, 0}, {2, 2, 2, 1, 0}, {2, 0, 2, 1, 0},
|
||||
{0, 0, 1, 1, 0}, {1, 0, 1, 1, 0}, {2, 0, 1, 1, 0}, {0, 1, 2, 1, 0},
|
||||
{0, 1, 1, 1, 0}, {1, 1, 1, 1, 0}, {2, 1, 1, 1, 0}, {1, 1, 2, 1, 0},
|
||||
{0, 2, 1, 1, 0}, {1, 2, 1, 1, 0}, {2, 2, 1, 1, 0}, {2, 1, 2, 1, 0},
|
||||
{0, 1, 0, 2, 2}, {1, 1, 0, 2, 2}, {2, 1, 0, 2, 2}, {1, 0, 2, 2, 2},
|
||||
{0, 0, 0, 2, 0}, {1, 0, 0, 2, 0}, {2, 0, 0, 2, 0}, {0, 0, 2, 2, 0},
|
||||
{0, 1, 0, 2, 0}, {1, 1, 0, 2, 0}, {2, 1, 0, 2, 0}, {1, 0, 2, 2, 0},
|
||||
{0, 2, 0, 2, 0}, {1, 2, 0, 2, 0}, {2, 2, 0, 2, 0}, {2, 0, 2, 2, 0},
|
||||
{0, 2, 2, 2, 0}, {1, 2, 2, 2, 0}, {2, 2, 2, 2, 0}, {2, 0, 2, 2, 0},
|
||||
{0, 0, 1, 2, 0}, {1, 0, 1, 2, 0}, {2, 0, 1, 2, 0}, {0, 1, 2, 2, 0},
|
||||
{0, 1, 1, 2, 0}, {1, 1, 1, 2, 0}, {2, 1, 1, 2, 0}, {1, 1, 2, 2, 0},
|
||||
{0, 2, 1, 2, 0}, {1, 2, 1, 2, 0}, {2, 2, 1, 2, 0}, {2, 1, 2, 2, 0},
|
||||
{0, 2, 0, 2, 2}, {1, 2, 0, 2, 2}, {2, 2, 0, 2, 2}, {2, 0, 2, 2, 2},
|
||||
{0, 0, 0, 0, 2}, {1, 0, 0, 0, 2}, {2, 0, 0, 0, 2}, {0, 0, 2, 0, 2},
|
||||
{0, 1, 0, 0, 2}, {1, 1, 0, 0, 2}, {2, 1, 0, 0, 2}, {1, 0, 2, 0, 2},
|
||||
{0, 2, 0, 0, 2}, {1, 2, 0, 0, 2}, {2, 2, 0, 0, 2}, {2, 0, 2, 0, 2},
|
||||
{0, 2, 2, 0, 2}, {1, 2, 2, 0, 2}, {2, 2, 2, 0, 2}, {2, 0, 2, 0, 2},
|
||||
{0, 0, 1, 0, 2}, {1, 0, 1, 0, 2}, {2, 0, 1, 0, 2}, {0, 1, 2, 0, 2},
|
||||
{0, 1, 1, 0, 2}, {1, 1, 1, 0, 2}, {2, 1, 1, 0, 2}, {1, 1, 2, 0, 2},
|
||||
{0, 2, 1, 0, 2}, {1, 2, 1, 0, 2}, {2, 2, 1, 0, 2}, {2, 1, 2, 0, 2},
|
||||
{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 0, 2, 2, 2},
|
||||
{0, 0, 0, 0, 1}, {1, 0, 0, 0, 1}, {2, 0, 0, 0, 1}, {0, 0, 2, 0, 1},
|
||||
{0, 1, 0, 0, 1}, {1, 1, 0, 0, 1}, {2, 1, 0, 0, 1}, {1, 0, 2, 0, 1},
|
||||
{0, 2, 0, 0, 1}, {1, 2, 0, 0, 1}, {2, 2, 0, 0, 1}, {2, 0, 2, 0, 1},
|
||||
{0, 2, 2, 0, 1}, {1, 2, 2, 0, 1}, {2, 2, 2, 0, 1}, {2, 0, 2, 0, 1},
|
||||
{0, 0, 1, 0, 1}, {1, 0, 1, 0, 1}, {2, 0, 1, 0, 1}, {0, 1, 2, 0, 1},
|
||||
{0, 1, 1, 0, 1}, {1, 1, 1, 0, 1}, {2, 1, 1, 0, 1}, {1, 1, 2, 0, 1},
|
||||
{0, 2, 1, 0, 1}, {1, 2, 1, 0, 1}, {2, 2, 1, 0, 1}, {2, 1, 2, 0, 1},
|
||||
{0, 0, 1, 2, 2}, {1, 0, 1, 2, 2}, {2, 0, 1, 2, 2}, {0, 1, 2, 2, 2},
|
||||
{0, 0, 0, 1, 1}, {1, 0, 0, 1, 1}, {2, 0, 0, 1, 1}, {0, 0, 2, 1, 1},
|
||||
{0, 1, 0, 1, 1}, {1, 1, 0, 1, 1}, {2, 1, 0, 1, 1}, {1, 0, 2, 1, 1},
|
||||
{0, 2, 0, 1, 1}, {1, 2, 0, 1, 1}, {2, 2, 0, 1, 1}, {2, 0, 2, 1, 1},
|
||||
{0, 2, 2, 1, 1}, {1, 2, 2, 1, 1}, {2, 2, 2, 1, 1}, {2, 0, 2, 1, 1},
|
||||
{0, 0, 1, 1, 1}, {1, 0, 1, 1, 1}, {2, 0, 1, 1, 1}, {0, 1, 2, 1, 1},
|
||||
{0, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 1, 1}, {1, 1, 2, 1, 1},
|
||||
{0, 2, 1, 1, 1}, {1, 2, 1, 1, 1}, {2, 2, 1, 1, 1}, {2, 1, 2, 1, 1},
|
||||
{0, 1, 1, 2, 2}, {1, 1, 1, 2, 2}, {2, 1, 1, 2, 2}, {1, 1, 2, 2, 2},
|
||||
{0, 0, 0, 2, 1}, {1, 0, 0, 2, 1}, {2, 0, 0, 2, 1}, {0, 0, 2, 2, 1},
|
||||
{0, 1, 0, 2, 1}, {1, 1, 0, 2, 1}, {2, 1, 0, 2, 1}, {1, 0, 2, 2, 1},
|
||||
{0, 2, 0, 2, 1}, {1, 2, 0, 2, 1}, {2, 2, 0, 2, 1}, {2, 0, 2, 2, 1},
|
||||
{0, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {2, 2, 2, 2, 1}, {2, 0, 2, 2, 1},
|
||||
{0, 0, 1, 2, 1}, {1, 0, 1, 2, 1}, {2, 0, 1, 2, 1}, {0, 1, 2, 2, 1},
|
||||
{0, 1, 1, 2, 1}, {1, 1, 1, 2, 1}, {2, 1, 1, 2, 1}, {1, 1, 2, 2, 1},
|
||||
{0, 2, 1, 2, 1}, {1, 2, 1, 2, 1}, {2, 2, 1, 2, 1}, {2, 1, 2, 2, 1},
|
||||
{0, 2, 1, 2, 2}, {1, 2, 1, 2, 2}, {2, 2, 1, 2, 2}, {2, 1, 2, 2, 2},
|
||||
{0, 0, 0, 1, 2}, {1, 0, 0, 1, 2}, {2, 0, 0, 1, 2}, {0, 0, 2, 1, 2},
|
||||
{0, 1, 0, 1, 2}, {1, 1, 0, 1, 2}, {2, 1, 0, 1, 2}, {1, 0, 2, 1, 2},
|
||||
{0, 2, 0, 1, 2}, {1, 2, 0, 1, 2}, {2, 2, 0, 1, 2}, {2, 0, 2, 1, 2},
|
||||
{0, 2, 2, 1, 2}, {1, 2, 2, 1, 2}, {2, 2, 2, 1, 2}, {2, 0, 2, 1, 2},
|
||||
{0, 0, 1, 1, 2}, {1, 0, 1, 1, 2}, {2, 0, 1, 1, 2}, {0, 1, 2, 1, 2},
|
||||
{0, 1, 1, 1, 2}, {1, 1, 1, 1, 2}, {2, 1, 1, 1, 2}, {1, 1, 2, 1, 2},
|
||||
{0, 2, 1, 1, 2}, {1, 2, 1, 1, 2}, {2, 2, 1, 1, 2}, {2, 1, 2, 1, 2},
|
||||
{0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 1, 2, 2, 2}
|
||||
};
|
||||
|
||||
/**
 * @brief Packed trit values for each unpacked value, indexed [hi][][][][lo].
 *
 * Each index is a trit in the range 0-2. This is the reverse lookup of
 * trits_of_integer: the stored value is a packed value that trits_of_integer
 * expands back to the same <lo,...,hi> quintuplet.
 */
|
||||
static const uint8_t integer_of_trits[3][3][3][3][3] {
|
||||
{
|
||||
{
|
||||
{
|
||||
{0, 1, 2},
|
||||
{4, 5, 6},
|
||||
{8, 9, 10}
|
||||
},
|
||||
{
|
||||
{16, 17, 18},
|
||||
{20, 21, 22},
|
||||
{24, 25, 26}
|
||||
},
|
||||
{
|
||||
{3, 7, 15},
|
||||
{19, 23, 27},
|
||||
{12, 13, 14}
|
||||
}
|
||||
},
|
||||
{
|
||||
{
|
||||
{32, 33, 34},
|
||||
{36, 37, 38},
|
||||
{40, 41, 42}
|
||||
},
|
||||
{
|
||||
{48, 49, 50},
|
||||
{52, 53, 54},
|
||||
{56, 57, 58}
|
||||
},
|
||||
{
|
||||
{35, 39, 47},
|
||||
{51, 55, 59},
|
||||
{44, 45, 46}
|
||||
}
|
||||
},
|
||||
{
|
||||
{
|
||||
{64, 65, 66},
|
||||
{68, 69, 70},
|
||||
{72, 73, 74}
|
||||
},
|
||||
{
|
||||
{80, 81, 82},
|
||||
{84, 85, 86},
|
||||
{88, 89, 90}
|
||||
},
|
||||
{
|
||||
{67, 71, 79},
|
||||
{83, 87, 91},
|
||||
{76, 77, 78}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
{
|
||||
{
|
||||
{128, 129, 130},
|
||||
{132, 133, 134},
|
||||
{136, 137, 138}
|
||||
},
|
||||
{
|
||||
{144, 145, 146},
|
||||
{148, 149, 150},
|
||||
{152, 153, 154}
|
||||
},
|
||||
{
|
||||
{131, 135, 143},
|
||||
{147, 151, 155},
|
||||
{140, 141, 142}
|
||||
}
|
||||
},
|
||||
{
|
||||
{
|
||||
{160, 161, 162},
|
||||
{164, 165, 166},
|
||||
{168, 169, 170}
|
||||
},
|
||||
{
|
||||
{176, 177, 178},
|
||||
{180, 181, 182},
|
||||
{184, 185, 186}
|
||||
},
|
||||
{
|
||||
{163, 167, 175},
|
||||
{179, 183, 187},
|
||||
{172, 173, 174}
|
||||
}
|
||||
},
|
||||
{
|
||||
{
|
||||
{192, 193, 194},
|
||||
{196, 197, 198},
|
||||
{200, 201, 202}
|
||||
},
|
||||
{
|
||||
{208, 209, 210},
|
||||
{212, 213, 214},
|
||||
{216, 217, 218}
|
||||
},
|
||||
{
|
||||
{195, 199, 207},
|
||||
{211, 215, 219},
|
||||
{204, 205, 206}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
{
|
||||
{
|
||||
{96, 97, 98},
|
||||
{100, 101, 102},
|
||||
{104, 105, 106}
|
||||
},
|
||||
{
|
||||
{112, 113, 114},
|
||||
{116, 117, 118},
|
||||
{120, 121, 122}
|
||||
},
|
||||
{
|
||||
{99, 103, 111},
|
||||
{115, 119, 123},
|
||||
{108, 109, 110}
|
||||
}
|
||||
},
|
||||
{
|
||||
{
|
||||
{224, 225, 226},
|
||||
{228, 229, 230},
|
||||
{232, 233, 234}
|
||||
},
|
||||
{
|
||||
{240, 241, 242},
|
||||
{244, 245, 246},
|
||||
{248, 249, 250}
|
||||
},
|
||||
{
|
||||
{227, 231, 239},
|
||||
{243, 247, 251},
|
||||
{236, 237, 238}
|
||||
}
|
||||
},
|
||||
{
|
||||
{
|
||||
{28, 29, 30},
|
||||
{60, 61, 62},
|
||||
{92, 93, 94}
|
||||
},
|
||||
{
|
||||
{156, 157, 158},
|
||||
{188, 189, 190},
|
||||
{220, 221, 222}
|
||||
},
|
||||
{
|
||||
{31, 63, 127},
|
||||
{159, 191, 255},
|
||||
{252, 253, 254}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * @brief The number of bits, trits, and quints needed for a quant level.
 *
 * All three fields are bit-fields of uint8_t with widths of 6, 1 and 1 bits.
 * The trit and quint fields are a single bit wide, so they can only hold 0 or 1.
 */
|
||||
struct btq_count
|
||||
{
|
||||
/** @brief The number of bits. */
|
||||
uint8_t bits:6;
|
||||
|
||||
/** @brief The number of trits. */
|
||||
uint8_t trits:1;
|
||||
|
||||
/** @brief The number of quints. */
|
||||
uint8_t quints:1;
|
||||
};
|
||||
|
||||
/**
 * @brief The table of bits, trits, and quints needed for a quant encode.
 *
 * One entry per quant level, in the order given by the per-row comments
 * (QUANT_2 first, QUANT_256 last). Each entry is { bits, trits, quints }.
 */
|
||||
static const std::array<btq_count, 21> btq_counts {{
|
||||
{ 1, 0, 0 }, // QUANT_2
|
||||
{ 0, 1, 0 }, // QUANT_3
|
||||
{ 2, 0, 0 }, // QUANT_4
|
||||
{ 0, 0, 1 }, // QUANT_5
|
||||
{ 1, 1, 0 }, // QUANT_6
|
||||
{ 3, 0, 0 }, // QUANT_8
|
||||
{ 1, 0, 1 }, // QUANT_10
|
||||
{ 2, 1, 0 }, // QUANT_12
|
||||
{ 4, 0, 0 }, // QUANT_16
|
||||
{ 2, 0, 1 }, // QUANT_20
|
||||
{ 3, 1, 0 }, // QUANT_24
|
||||
{ 5, 0, 0 }, // QUANT_32
|
||||
{ 3, 0, 1 }, // QUANT_40
|
||||
{ 4, 1, 0 }, // QUANT_48
|
||||
{ 6, 0, 0 }, // QUANT_64
|
||||
{ 4, 0, 1 }, // QUANT_80
|
||||
{ 5, 1, 0 }, // QUANT_96
|
||||
{ 7, 0, 0 }, // QUANT_128
|
||||
{ 5, 0, 1 }, // QUANT_160
|
||||
{ 6, 1, 0 }, // QUANT_192
|
||||
{ 8, 0, 0 } // QUANT_256
|
||||
}};
|
||||
|
||||
/**
 * @brief The sequence scale and divisor needed to compute sizing.
 *
 * The divisor is stored in encoded form; the real divisor is (2 * divisor + 1).
 * The length of a quantized sequence in bits is the following, rounded up:
 *     (scale * <sequence_len>) / (2 * divisor + 1)
 */
|
||||
struct ise_size
|
||||
{
|
||||
/** @brief The scaling parameter. */
|
||||
uint8_t scale:6;
|
||||
|
||||
/** @brief The divisor parameter, stored in encoded form. */
|
||||
uint8_t divisor:2;
|
||||
};
|
||||
|
||||
/**
 * @brief The table of scales and divisors needed for quant sizing.
 *
 * One entry per quant level, in the order given by the per-row comments
 * (QUANT_2 first, QUANT_256 last). Each entry is { scale, divisor }, where the
 * stored divisor values 0, 1 and 2 stand for real divisors of 1, 3 and 5.
 */
|
||||
static const std::array<ise_size, 21> ise_sizes {{
|
||||
{ 1, 0 }, // QUANT_2
|
||||
{ 8, 2 }, // QUANT_3
|
||||
{ 2, 0 }, // QUANT_4
|
||||
{ 7, 1 }, // QUANT_5
|
||||
{ 13, 2 }, // QUANT_6
|
||||
{ 3, 0 }, // QUANT_8
|
||||
{ 10, 1 }, // QUANT_10
|
||||
{ 18, 2 }, // QUANT_12
|
||||
{ 4, 0 }, // QUANT_16
|
||||
{ 13, 1 }, // QUANT_20
|
||||
{ 23, 2 }, // QUANT_24
|
||||
{ 5, 0 }, // QUANT_32
|
||||
{ 16, 1 }, // QUANT_40
|
||||
{ 28, 2 }, // QUANT_48
|
||||
{ 6, 0 }, // QUANT_64
|
||||
{ 19, 1 }, // QUANT_80
|
||||
{ 33, 2 }, // QUANT_96
|
||||
{ 7, 0 }, // QUANT_128
|
||||
{ 22, 1 }, // QUANT_160
|
||||
{ 38, 2 }, // QUANT_192
|
||||
{ 8, 0 } // QUANT_256
|
||||
}};
|
||||
|
||||
/* See header for documentation. */
|
||||
unsigned int get_ise_sequence_bitcount(
|
||||
unsigned int character_count,
|
||||
quant_method quant_level
|
||||
) {
|
||||
// Cope with out-of bounds values - input might be invalid
|
||||
if (static_cast<size_t>(quant_level) >= ise_sizes.size())
|
||||
{
|
||||
// Arbitrary large number that's more than an ASTC block can hold
|
||||
return 1024;
|
||||
}
|
||||
|
||||
auto& entry = ise_sizes[quant_level];
|
||||
unsigned int divisor = (entry.divisor << 1) + 1;
|
||||
return (entry.scale * character_count + divisor - 1) / divisor;
|
||||
}
|
||||
|
||||
/**
 * @brief Write up to 8 bits at an arbitrary bit offset.
 *
 * The field is merged into the existing data without disturbing the bits around it. Whole bytes
 * of the offset select the first byte to touch and the remaining 0 to 7 bits select the position
 * inside it, so the field may span two adjacent bytes. Both of those bytes are always read and
 * written back, even when the field fits entirely in the first one.
 *
 * @param         value       The value to write; bits above @c bitcount are discarded.
 * @param         bitcount    The number of bits to write, starting from LSB.
 * @param         bitoffset   The bit offset to store at, relative to @c ptr.
 * @param[in,out] ptr         The data pointer to write to.
 */
static inline void write_bits(
	unsigned int value,
	unsigned int bitcount,
	unsigned int bitoffset,
	uint8_t ptr[2]
) {
	// Split the offset into a byte step and a shift within that byte
	uint8_t* dst = ptr + (bitoffset >> 3);
	unsigned int shift = bitoffset & 7;

	// Position the field, and build the mask of bits that must be preserved
	unsigned int field_mask = (1 << bitcount) - 1;
	unsigned int field = (value & field_mask) << shift;
	unsigned int keep_mask = ~(field_mask << shift);

	// Low byte, then whatever spilled over into the next byte
	dst[0] &= keep_mask;
	dst[0] |= field;
	dst[1] &= keep_mask >> 8;
	dst[1] |= field >> 8;
}
|
||||
|
||||
/**
 * @brief Read a packed bit field from two adjacent bytes.
 *
 * Whole bytes of the offset select the first byte to read and the remaining 0 to 7 bits select
 * the position inside it. The field must lie within that byte and the one after it, but can
 * start at an arbitrary bit offset and span both bytes. Both bytes are always read.
 *
 * @param     bitcount    The number of bits to read.
 * @param     bitoffset   The bit offset to read from, relative to @c ptr.
 * @param[in] ptr         The data pointer to read from.
 *
 * @return The read value.
 */
static inline unsigned int read_bits(
	unsigned int bitcount,
	unsigned int bitoffset,
	const uint8_t* ptr
) {
	// Split the offset into a byte step and a shift within that byte
	const uint8_t* src = ptr + (bitoffset >> 3);
	unsigned int shift = bitoffset & 7;

	// Assemble a 16-bit little-endian window, then extract the field from it
	unsigned int field_mask = (1 << bitcount) - 1;
	unsigned int window = src[0] | (src[1] << 8);
	return (window >> shift) & field_mask;
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void encode_ise(
|
||||
quant_method quant_level,
|
||||
unsigned int character_count,
|
||||
const uint8_t* input_data,
|
||||
uint8_t* output_data,
|
||||
unsigned int bit_offset
|
||||
) {
|
||||
promise(character_count > 0);
|
||||
|
||||
unsigned int bits = btq_counts[quant_level].bits;
|
||||
unsigned int trits = btq_counts[quant_level].trits;
|
||||
unsigned int quints = btq_counts[quant_level].quints;
|
||||
unsigned int mask = (1 << bits) - 1;
|
||||
|
||||
// Write out trits and bits
|
||||
if (trits)
|
||||
{
|
||||
unsigned int i = 0;
|
||||
unsigned int full_trit_blocks = character_count / 5;
|
||||
|
||||
for (unsigned int j = 0; j < full_trit_blocks; j++)
|
||||
{
|
||||
unsigned int i4 = input_data[i + 4] >> bits;
|
||||
unsigned int i3 = input_data[i + 3] >> bits;
|
||||
unsigned int i2 = input_data[i + 2] >> bits;
|
||||
unsigned int i1 = input_data[i + 1] >> bits;
|
||||
unsigned int i0 = input_data[i + 0] >> bits;
|
||||
|
||||
uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
|
||||
|
||||
// The max size of a trit bit count is 6, so we can always safely
|
||||
// pack a single MX value with the following 1 or 2 T bits.
|
||||
uint8_t pack;
|
||||
|
||||
// Element 0 + T0 + T1
|
||||
pack = (input_data[i++] & mask) | (((T >> 0) & 0x3) << bits);
|
||||
write_bits(pack, bits + 2, bit_offset, output_data);
|
||||
bit_offset += bits + 2;
|
||||
|
||||
// Element 1 + T2 + T3
|
||||
pack = (input_data[i++] & mask) | (((T >> 2) & 0x3) << bits);
|
||||
write_bits(pack, bits + 2, bit_offset, output_data);
|
||||
bit_offset += bits + 2;
|
||||
|
||||
// Element 2 + T4
|
||||
pack = (input_data[i++] & mask) | (((T >> 4) & 0x1) << bits);
|
||||
write_bits(pack, bits + 1, bit_offset, output_data);
|
||||
bit_offset += bits + 1;
|
||||
|
||||
// Element 3 + T5 + T6
|
||||
pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
|
||||
write_bits(pack, bits + 2, bit_offset, output_data);
|
||||
bit_offset += bits + 2;
|
||||
|
||||
// Element 4 + T7
|
||||
pack = (input_data[i++] & mask) | (((T >> 7) & 0x1) << bits);
|
||||
write_bits(pack, bits + 1, bit_offset, output_data);
|
||||
bit_offset += bits + 1;
|
||||
}
|
||||
|
||||
// Loop tail for a partial block
|
||||
if (i != character_count)
|
||||
{
|
||||
// i4 cannot be present - we know the block is partial
|
||||
// i0 must be present - we know the block isn't empty
|
||||
unsigned int i4 = 0;
|
||||
unsigned int i3 = i + 3 >= character_count ? 0 : input_data[i + 3] >> bits;
|
||||
unsigned int i2 = i + 2 >= character_count ? 0 : input_data[i + 2] >> bits;
|
||||
unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
|
||||
unsigned int i0 = input_data[i + 0] >> bits;
|
||||
|
||||
uint8_t T = integer_of_trits[i4][i3][i2][i1][i0];
|
||||
|
||||
for (unsigned int j = 0; i < character_count; i++, j++)
|
||||
{
|
||||
// Truncated table as this iteration is always partital
|
||||
static const uint8_t tbits[4] { 2, 2, 1, 2 };
|
||||
static const uint8_t tshift[4] { 0, 2, 4, 5 };
|
||||
|
||||
uint8_t pack = (input_data[i] & mask) |
|
||||
(((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
|
||||
|
||||
write_bits(pack, bits + tbits[j], bit_offset, output_data);
|
||||
bit_offset += bits + tbits[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
// Write out quints and bits
|
||||
else if (quints)
|
||||
{
|
||||
unsigned int i = 0;
|
||||
unsigned int full_quint_blocks = character_count / 3;
|
||||
|
||||
for (unsigned int j = 0; j < full_quint_blocks; j++)
|
||||
{
|
||||
unsigned int i2 = input_data[i + 2] >> bits;
|
||||
unsigned int i1 = input_data[i + 1] >> bits;
|
||||
unsigned int i0 = input_data[i + 0] >> bits;
|
||||
|
||||
uint8_t T = integer_of_quints[i2][i1][i0];
|
||||
|
||||
// The max size of a quint bit count is 5, so we can always safely
|
||||
// pack a single M value with the following 2 or 3 T bits.
|
||||
uint8_t pack;
|
||||
|
||||
// Element 0
|
||||
pack = (input_data[i++] & mask) | (((T >> 0) & 0x7) << bits);
|
||||
write_bits(pack, bits + 3, bit_offset, output_data);
|
||||
bit_offset += bits + 3;
|
||||
|
||||
// Element 1
|
||||
pack = (input_data[i++] & mask) | (((T >> 3) & 0x3) << bits);
|
||||
write_bits(pack, bits + 2, bit_offset, output_data);
|
||||
bit_offset += bits + 2;
|
||||
|
||||
// Element 2
|
||||
pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits);
|
||||
write_bits(pack, bits + 2, bit_offset, output_data);
|
||||
bit_offset += bits + 2;
|
||||
}
|
||||
|
||||
// Loop tail for a partial block
|
||||
if (i != character_count)
|
||||
{
|
||||
// i2 cannot be present - we know the block is partial
|
||||
// i0 must be present - we know the block isn't empty
|
||||
unsigned int i2 = 0;
|
||||
unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits;
|
||||
unsigned int i0 = input_data[i + 0] >> bits;
|
||||
|
||||
uint8_t T = integer_of_quints[i2][i1][i0];
|
||||
|
||||
for (unsigned int j = 0; i < character_count; i++, j++)
|
||||
{
|
||||
// Truncated table as this iteration is always partial
|
||||
static const uint8_t tbits[2] { 3, 2 };
|
||||
static const uint8_t tshift[2] { 0, 3 };
|
||||
|
||||
uint8_t pack = (input_data[i] & mask) |
|
||||
(((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits);
|
||||
|
||||
write_bits(pack, bits + tbits[j], bit_offset, output_data);
|
||||
bit_offset += bits + tbits[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
// Write out just bits
|
||||
else
|
||||
{
|
||||
for (unsigned int i = 0; i < character_count; i++)
|
||||
{
|
||||
write_bits(input_data[i], bits, bit_offset, output_data);
|
||||
bit_offset += bits;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void decode_ise(
	quant_method quant_level,
	unsigned int character_count,
	const uint8_t* input_data,
	uint8_t* output_data,
	unsigned int bit_offset
) {
	promise(character_count > 0);

	// Unpacking always expands whole trit-blocks (5 values) or quint-blocks (3 values), so the
	// scratch array can receive more entries than character_count. It holds the maximum of 64
	// values plus 4 entries of padding to absorb that overshoot.
	uint8_t unpacked[68];

	// Packed trit-blocks or quint-blocks; bits are ORed in piecewise, so this must start zeroed
	uint8_t packed_blocks[22] { 0 };

	const unsigned int bits = btq_counts[quant_level].bits;
	const unsigned int trits = btq_counts[quant_level].trits;
	const unsigned int quints = btq_counts[quant_level].quints;

	// Position inside the current trit/quint block, and the index of that block
	unsigned int slot = 0;
	unsigned int block = 0;

	// Collect the plain bits of every element, interleaved with the pieces of the packed
	// trit-block or quint-block that follow each element in the stream
	for (unsigned int i = 0; i < character_count; i++)
	{
		unpacked[i] = static_cast<uint8_t>(read_bits(bits, bit_offset, input_data));
		bit_offset += bits;

		if (trits)
		{
			// Width and destination shift of the block piece that follows each of the 5 elements
			static const uint8_t piece_width[5] { 2, 2, 1, 2, 1 };
			static const uint8_t piece_shift[5] { 0, 2, 4, 5, 7 };

			unsigned int piece = read_bits(piece_width[slot], bit_offset, input_data);
			bit_offset += piece_width[slot];
			packed_blocks[block] |= piece << piece_shift[slot];

			// After the fifth element the block is complete; move on to the next one
			if (slot == 4)
			{
				block++;
				slot = 0;
			}
			else
			{
				slot++;
			}
		}

		if (quints)
		{
			// Width and destination shift of the block piece that follows each of the 3 elements
			static const uint8_t piece_width[3] { 3, 2, 2 };
			static const uint8_t piece_shift[3] { 0, 3, 5 };

			unsigned int piece = read_bits(piece_width[slot], bit_offset, input_data);
			bit_offset += piece_width[slot];
			packed_blocks[block] |= piece << piece_shift[slot];

			// After the third element the block is complete; move on to the next one
			if (slot == 2)
			{
				block++;
				slot = 0;
			}
			else
			{
				slot++;
			}
		}
	}

	// Expand each packed trit-block into 5 trits, placed above the plain bits of each element
	if (trits)
	{
		unsigned int trit_blocks = (character_count + 4) / 5;
		promise(trit_blocks > 0);
		for (unsigned int b = 0; b < trit_blocks; b++)
		{
			const uint8_t *tritptr = trits_of_integer[packed_blocks[b]];
			for (unsigned int j = 0; j < 5; j++)
			{
				unpacked[5 * b + j] |= tritptr[j] << bits;
			}
		}
	}

	// Expand each packed quint-block into 3 quints, placed above the plain bits of each element
	if (quints)
	{
		unsigned int quint_blocks = (character_count + 2) / 3;
		promise(quint_blocks > 0);
		for (unsigned int b = 0; b < quint_blocks; b++)
		{
			const uint8_t *quintptr = quints_of_integer[packed_blocks[b]];
			for (unsigned int j = 0; j < 3; j++)
			{
				unpacked[3 * b + j] |= quintptr[j] << bits;
			}
		}
	}

	// Only the requested number of values is handed back; the padding entries are discarded
	for (unsigned int i = 0; i < character_count; i++)
	{
		output_data[i] = unpacked[i];
	}
}
|
||||
@@ -0,0 +1,346 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2025 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions and data declarations for the outer context.
|
||||
*
|
||||
* The outer context includes thread-pool management, which is slower to
|
||||
* compile due to increased use of C++ stdlib. The inner context used in the
|
||||
* majority of the codec library does not include this.
|
||||
*/
|
||||
|
||||
#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
|
||||
#define ASTCENC_INTERNAL_ENTRY_INCLUDED
|
||||
|
||||
#include <atomic>
|
||||
#include <condition_variable>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/* ============================================================================
|
||||
Parallel execution control
|
||||
============================================================================ */
|
||||
|
||||
/**
|
||||
* @brief A simple counter-based manager for parallel task execution.
|
||||
*
|
||||
* The task processing execution consists of:
|
||||
*
|
||||
* * A single-threaded init stage.
|
||||
* * A multi-threaded processing stage.
|
||||
* * A condition variable so threads can wait for processing completion.
|
||||
*
|
||||
* The init stage will be executed by the first thread to arrive in the critical section, there is
|
||||
* no main thread in the thread pool.
|
||||
*
|
||||
* The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
|
||||
* basis. Threads may each therefore execute different numbers of tasks, depending on their
|
||||
* processing complexity. The task queue and the task tickets are just counters; the caller must map
|
||||
* these integers to an actual processing partition in a specific problem domain.
|
||||
*
|
||||
* The exit wait condition is needed to ensure processing has finished before a worker thread can
|
||||
* progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
|
||||
* because there are no new tasks to assign to it while other worker threads are still processing.
|
||||
* Calling @c wait() will ensure that all other workers have finished before the thread can proceed.
|
||||
*
|
||||
* The basic usage model:
|
||||
*
|
||||
* // --------- From single-threaded code ---------
|
||||
*
|
||||
* // Reset the tracker state
|
||||
* manager->reset()
|
||||
*
|
||||
* // --------- From multi-threaded code ---------
|
||||
*
|
||||
* // Run the stage init; only first thread actually runs the lambda
|
||||
* manager->init(<lambda>)
|
||||
*
|
||||
* do
|
||||
* {
|
||||
* // Request a task assignment
|
||||
* uint task_count;
|
||||
* uint base_index = manager->get_task_assignment(<granule>, task_count);
|
||||
*
|
||||
* // Process any tasks we were given (task_count <= granule size)
|
||||
* if (task_count)
|
||||
* {
|
||||
* // Run the user task processing code for N tasks here
|
||||
* ...
|
||||
*
|
||||
* // Flag these tasks as complete
|
||||
* manager->complete_task_assignment(task_count);
|
||||
* }
|
||||
* } while (task_count);
|
||||
*
|
||||
* // Wait for all threads to complete tasks before progressing
|
||||
* manager->wait()
|
||||
*
|
||||
* // Run the stage term; only first thread actually runs the lambda
|
||||
* manager->term(<lambda>)
|
||||
*/
|
||||
class ParallelManager
|
||||
{
|
||||
private:
|
||||
/** @brief Lock used for critical section and condition synchronization. */
|
||||
std::mutex m_lock;
|
||||
|
||||
/** @brief True if the current operation is cancelled. */
|
||||
std::atomic<bool> m_is_cancelled;
|
||||
|
||||
/** @brief True if the stage init() step has been executed. */
|
||||
bool m_init_done;
|
||||
|
||||
/** @brief True if the stage term() step has been executed. */
|
||||
bool m_term_done;
|
||||
|
||||
/** @brief Condition variable for tracking stage processing completion. */
|
||||
std::condition_variable m_complete;
|
||||
|
||||
/** @brief Number of tasks started, but not necessarily finished. */
|
||||
std::atomic<unsigned int> m_start_count;
|
||||
|
||||
/** @brief Number of tasks finished. */
|
||||
unsigned int m_done_count;
|
||||
|
||||
/** @brief Number of tasks that need to be processed. */
|
||||
unsigned int m_task_count;
|
||||
|
||||
/** @brief Progress callback (optional). */
|
||||
astcenc_progress_callback m_callback;
|
||||
|
||||
/** @brief Lock used for callback synchronization. */
|
||||
std::mutex m_callback_lock;
|
||||
|
||||
/** @brief Minimum progress before making a callback. */
|
||||
float m_callback_min_diff;
|
||||
|
||||
/** @brief Last progress callback value. */
|
||||
float m_callback_last_value;
|
||||
|
||||
public:
|
||||
/** @brief Create a new ParallelManager. */
|
||||
ParallelManager()
|
||||
{
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Reset the tracker for a new processing batch.
|
||||
*
|
||||
* This must be called from single-threaded code before starting the multi-threaded processing
|
||||
* operations.
|
||||
*/
|
||||
void reset()
|
||||
{
|
||||
m_init_done = false;
|
||||
m_term_done = false;
|
||||
m_is_cancelled = false;
|
||||
m_start_count = 0;
|
||||
m_done_count = 0;
|
||||
m_task_count = 0;
|
||||
m_callback = nullptr;
|
||||
m_callback_last_value = 0.0f;
|
||||
m_callback_min_diff = 1.0f;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Clear the tracker and stop new tasks being assigned.
|
||||
*
|
||||
* Note, all in-flight tasks in a worker will still complete normally.
|
||||
*/
|
||||
void cancel()
|
||||
{
|
||||
m_is_cancelled = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Trigger the pipeline stage init step.
|
||||
*
|
||||
* This can be called from multi-threaded code. The first thread to hit this will process the
|
||||
* initialization. Other threads will block and wait for it to complete.
|
||||
*
|
||||
* @param init_func Callable which executes the stage initialization. It must return the
|
||||
* total number of tasks in the stage.
|
||||
*/
|
||||
void init(std::function<unsigned int(void)> init_func)
|
||||
{
|
||||
std::lock_guard<std::mutex> lck(m_lock);
|
||||
if (!m_init_done)
|
||||
{
|
||||
m_task_count = init_func();
|
||||
m_init_done = true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Trigger the pipeline stage init step.
|
||||
*
|
||||
* This can be called from multi-threaded code. The first thread to hit this will process the
|
||||
* initialization. Other threads will block and wait for it to complete.
|
||||
*
|
||||
* @param task_count Total number of tasks needing processing.
|
||||
* @param callback Function pointer for progress status callbacks.
|
||||
*/
|
||||
void init(unsigned int task_count, astcenc_progress_callback callback)
|
||||
{
|
||||
std::lock_guard<std::mutex> lck(m_lock);
|
||||
if (!m_init_done)
|
||||
{
|
||||
m_callback = callback;
|
||||
m_task_count = task_count;
|
||||
m_init_done = true;
|
||||
|
||||
// Report every 1% or 4096 blocks, whichever is larger, to avoid callback overhead
|
||||
float min_diff = (4096.0f / static_cast<float>(task_count)) * 100.0f;
|
||||
m_callback_min_diff = astc::max(min_diff, 1.0f);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Request a task assignment.
|
||||
*
|
||||
* Assign up to @c granule tasks to the caller for processing.
|
||||
*
|
||||
* @param granule Maximum number of tasks that can be assigned.
|
||||
* @param[out] count Actual number of tasks assigned, or zero if no tasks were assigned.
|
||||
*
|
||||
* @return Task index of the first assigned task; assigned tasks increment from this.
|
||||
*/
|
||||
unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
|
||||
{
|
||||
unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed);
|
||||
if (m_is_cancelled || base >= m_task_count)
|
||||
{
|
||||
count = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
count = astc::min(m_task_count - base, granule);
|
||||
return base;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Complete a task assignment.
|
||||
*
|
||||
* Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this
|
||||
* completes the processing of the stage.
|
||||
*
|
||||
* @param count The number of completed tasks.
|
||||
*/
|
||||
void complete_task_assignment(unsigned int count)
|
||||
{
|
||||
// Note: m_done_count cannot use an atomic without the mutex; this has a race between the
|
||||
// update here and the wait() for other threads
|
||||
unsigned int local_count;
|
||||
float local_last_value;
|
||||
{
|
||||
std::unique_lock<std::mutex> lck(m_lock);
|
||||
m_done_count += count;
|
||||
local_count = m_done_count;
|
||||
local_last_value = m_callback_last_value;
|
||||
|
||||
// Ensure the progress bar hits 100%
|
||||
if (m_callback && m_done_count == m_task_count)
|
||||
{
|
||||
std::unique_lock<std::mutex> cblck(m_callback_lock);
|
||||
m_callback(100.0f);
|
||||
m_callback_last_value = 100.0f;
|
||||
}
|
||||
|
||||
// Notify if nothing left to do
|
||||
if (m_is_cancelled || m_done_count == m_task_count)
|
||||
{
|
||||
lck.unlock();
|
||||
m_complete.notify_all();
|
||||
}
|
||||
}
|
||||
|
||||
// Process progress callback if we have one
|
||||
if (m_callback)
|
||||
{
|
||||
// Initial lockless test - have we progressed enough to emit?
|
||||
float num = static_cast<float>(local_count);
|
||||
float den = static_cast<float>(m_task_count);
|
||||
float this_value = (num / den) * 100.0f;
|
||||
bool report_test = (this_value - local_last_value) > m_callback_min_diff;
|
||||
|
||||
// Recheck under lock, because another thread might report first
|
||||
if (report_test)
|
||||
{
|
||||
std::unique_lock<std::mutex> cblck(m_callback_lock);
|
||||
bool report_retest = (this_value - m_callback_last_value) > m_callback_min_diff;
|
||||
if (report_retest)
|
||||
{
|
||||
m_callback(this_value);
|
||||
m_callback_last_value = this_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Wait for stage processing to complete.
|
||||
*/
|
||||
void wait()
|
||||
{
|
||||
std::unique_lock<std::mutex> lck(m_lock);
|
||||
m_complete.wait(lck, [this]{ return m_is_cancelled || m_done_count == m_task_count; });
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Trigger the pipeline stage term step.
|
||||
*
|
||||
* This can be called from multi-threaded code. The first thread to hit this will process the
|
||||
* work pool termination. Caller must have called @c wait() prior to calling this function to
|
||||
* ensure that processing is complete.
|
||||
*
|
||||
* @param term_func Callable which executes the stage termination.
|
||||
*/
|
||||
void term(std::function<void(void)> term_func)
|
||||
{
|
||||
std::lock_guard<std::mutex> lck(m_lock);
|
||||
if (!m_term_done)
|
||||
{
|
||||
term_func();
|
||||
m_term_done = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief The astcenc compression context.
|
||||
*/
|
||||
struct astcenc_context
|
||||
{
|
||||
/** @brief The context internal state. */
|
||||
astcenc_contexti context;
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
/** @brief The parallel manager for averages computation. */
|
||||
ParallelManager manage_avg;
|
||||
|
||||
/** @brief The parallel manager for compression. */
|
||||
ParallelManager manage_compress;
|
||||
#endif
|
||||
|
||||
/** @brief The parallel manager for decompression. */
|
||||
ParallelManager manage_decompress;
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,48 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#include "astcenc_mathlib.h"
|
||||
|
||||
/**
|
||||
* @brief 64-bit rotate left.
|
||||
*
|
||||
* @param val The value to rotate.
|
||||
* @param count The rotation, in bits.
|
||||
*/
|
||||
static inline uint64_t rotl(uint64_t val, int count)
{
	// Reduce the rotation modulo 64 so that neither shift can reach the full word width;
	// shifting a 64-bit value by 64 is undefined, which the naive form hits for count == 0.
	unsigned int left = static_cast<unsigned int>(count) & 63u;
	unsigned int right = (64u - left) & 63u;
	return (val << left) | (val >> right);
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void astc::rand_init(uint64_t state[2])
{
	// Fixed seed words: every generator starts from exactly the same state
	static const uint64_t seed[2] {
		0xfaf9e171cea1ec6bULL,
		0xf1b318cc06af5d71ULL
	};

	for (int i = 0; i < 2; i++)
	{
		state[i] = seed[i];
	}
}
|
||||
|
||||
/* See header for documentation. */
|
||||
uint64_t astc::rand(uint64_t state[2])
|
||||
{
|
||||
uint64_t s0 = state[0];
|
||||
uint64_t s1 = state[1];
|
||||
uint64_t res = s0 + s1;
|
||||
s1 ^= s0;
|
||||
state[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16);
|
||||
state[1] = rotl(s1, 37);
|
||||
return res;
|
||||
}
|
||||
@@ -0,0 +1,505 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2025 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/*
|
||||
* This module implements a variety of mathematical data types and library
|
||||
* functions used by the codec.
|
||||
*/
|
||||
|
||||
#ifndef ASTC_MATHLIB_H_INCLUDED
|
||||
#define ASTC_MATHLIB_H_INCLUDED
|
||||
|
||||
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
|
||||
|
||||
// Each ISA feature macro below may be predefined by the build; when it is not, it is derived
// from the compiler's own target feature macros.

#if !defined(ASTCENC_POPCNT)
	#if defined(__POPCNT__)
		#define ASTCENC_POPCNT 1
	#else
		#define ASTCENC_POPCNT 0
	#endif
#endif

#if !defined(ASTCENC_F16C)
	#if defined(__F16C__)
		#define ASTCENC_F16C 1
	#else
		#define ASTCENC_F16C 0
	#endif
#endif

// SSE level is encoded as major * 10 + minor, with 0 meaning unavailable
#if !defined(ASTCENC_SSE)
	#if defined(__SSE4_2__)
		#define ASTCENC_SSE 42
	#elif defined(__SSE4_1__)
		#define ASTCENC_SSE 41
	#elif defined(__SSE2__) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
		#define ASTCENC_SSE 20
	#else
		#define ASTCENC_SSE 0
	#endif
#endif

// Note: ASTCENC_X86_GATHERS is only defined here when AVX or AVX2 is auto-detected
#if !defined(ASTCENC_AVX)
	#if defined(__AVX2__)
		#define ASTCENC_AVX 2
		#define ASTCENC_X86_GATHERS 1
	#elif defined(__AVX__)
		#define ASTCENC_AVX 1
		#define ASTCENC_X86_GATHERS 1
	#else
		#define ASTCENC_AVX 0
	#endif
#endif

#if !defined(ASTCENC_NEON)
	#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
		#define ASTCENC_NEON 1
	#else
		#define ASTCENC_NEON 0
	#endif
#endif

#if !defined(ASTCENC_SVE)
	#if defined(__ARM_FEATURE_SVE)
		#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
			#define ASTCENC_SVE 8
		#else
			// Auto-detected SVE can only assume vector width of 4 is available, but
			// must also allow for hardware being longer and so all use of intrinsics
			// must explicitly use predicate masks to limit to 4-wide.
			#define ASTCENC_SVE 4
		#endif
	#else
		#define ASTCENC_SVE 0
	#endif
#endif

// Force vector-sized SIMD alignment; non-SIMD builds use the default alignment (0)
#if ASTCENC_AVX || ASTCENC_SVE == 8
	#define ASTCENC_VECALIGN 32
#elif ASTCENC_SSE || ASTCENC_NEON || ASTCENC_SVE == 4
	#define ASTCENC_VECALIGN 16
#else
	#define ASTCENC_VECALIGN 0
#endif

// C++11 states that alignas(0) should be ignored but GCC doesn't do
// this on some versions, so workaround and avoid emitting alignas(0)
#if ASTCENC_VECALIGN > 0
	#define ASTCENC_ALIGNAS alignas(ASTCENC_VECALIGN)
#else
	#define ASTCENC_ALIGNAS
#endif

// Any enabled x86 extension needs the x86 intrinsics header
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0
	#include <immintrin.h>
#endif
|
||||
|
||||
/* ============================================================================
|
||||
Fast math library; note that many of the higher-order functions in this set
|
||||
use approximations which are less accurate, but faster, than <cmath> standard
|
||||
library equivalents.
|
||||
|
||||
Note: Many of these are not necessarily faster than simple C versions when
|
||||
used on a single scalar value, but are included for testing purposes as most
|
||||
have an option based on SSE intrinsics and therefore provide an obvious route
|
||||
to future vectorization.
|
||||
============================================================================ */
|
||||
|
||||
// Union for manipulation of float bit patterns
|
||||
typedef union
{
	uint32_t u;  // The 32 bits viewed as an unsigned integer
	int32_t s;   // The 32 bits viewed as a signed integer
	float f;     // The 32 bits viewed as a single-precision float
} if32;
|
||||
|
||||
// These are namespaced to avoid colliding with C standard library functions.
|
||||
namespace astc
|
||||
{
|
||||
|
||||
static const float PI = 3.14159265358979323846f;
|
||||
static const float PI_OVER_TWO = 1.57079632679489661923f;
|
||||
|
||||
/**
|
||||
* @brief SP float absolute value.
|
||||
*
|
||||
* @param v The value to make absolute.
|
||||
*
|
||||
* @return The absolute value.
|
||||
*/
|
||||
static inline float fabs(float v)
|
||||
{
|
||||
return std::fabs(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Test if a float value is a nan.
|
||||
*
|
||||
* @param v The value test.
|
||||
*
|
||||
* @return Zero is not a NaN, non-zero otherwise.
|
||||
*/
|
||||
static inline bool isnan(float v)
|
||||
{
|
||||
return v != v;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the minimum of two values.
|
||||
*
|
||||
* For floats, NaNs are turned into @c q.
|
||||
*
|
||||
* @param p The first value to compare.
|
||||
* @param q The second value to compare.
|
||||
*
|
||||
* @return The smallest value.
|
||||
*/
|
||||
template<typename T>
|
||||
static inline T min(T p, T q)
|
||||
{
|
||||
return p < q ? p : q;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the minimum of three values.
|
||||
*
|
||||
* For floats, NaNs are turned into @c r.
|
||||
*
|
||||
* @param p The first value to compare.
|
||||
* @param q The second value to compare.
|
||||
* @param r The third value to compare.
|
||||
*
|
||||
* @return The smallest value.
|
||||
*/
|
||||
template<typename T>
|
||||
static inline T min(T p, T q, T r)
|
||||
{
|
||||
return min(min(p, q), r);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the minimum of four values.
|
||||
*
|
||||
* For floats, NaNs are turned into @c s.
|
||||
*
|
||||
* @param p The first value to compare.
|
||||
* @param q The second value to compare.
|
||||
* @param r The third value to compare.
|
||||
* @param s The fourth value to compare.
|
||||
*
|
||||
* @return The smallest value.
|
||||
*/
|
||||
template<typename T>
|
||||
static inline T min(T p, T q, T r, T s)
|
||||
{
|
||||
return min(min(p, q), min(r, s));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the maximum of two values.
|
||||
*
|
||||
* For floats, NaNs are turned into @c q.
|
||||
*
|
||||
* @param p The first value to compare.
|
||||
* @param q The second value to compare.
|
||||
*
|
||||
* @return The largest value.
|
||||
*/
|
||||
template<typename T>
|
||||
static inline T max(T p, T q)
|
||||
{
|
||||
return p > q ? p : q;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the maximum of three values.
|
||||
*
|
||||
* For floats, NaNs are turned into @c r.
|
||||
*
|
||||
* @param p The first value to compare.
|
||||
* @param q The second value to compare.
|
||||
* @param r The third value to compare.
|
||||
*
|
||||
* @return The largest value.
|
||||
*/
|
||||
template<typename T>
|
||||
static inline T max(T p, T q, T r)
|
||||
{
|
||||
return max(max(p, q), r);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the maximum of four values.
|
||||
*
|
||||
* For floats, NaNs are turned into @c s.
|
||||
*
|
||||
* @param p The first value to compare.
|
||||
* @param q The second value to compare.
|
||||
* @param r The third value to compare.
|
||||
* @param s The fourth value to compare.
|
||||
*
|
||||
* @return The largest value.
|
||||
*/
|
||||
template<typename T>
|
||||
static inline T max(T p, T q, T r, T s)
|
||||
{
|
||||
return max(max(p, q), max(r, s));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Clamp a value value between @c mn and @c mx.
|
||||
*
|
||||
* For floats, NaNs are turned into @c mn.
|
||||
*
|
||||
* @param v The value to clamp.
|
||||
* @param mn The min value (inclusive).
|
||||
* @param mx The max value (inclusive).
|
||||
*
|
||||
* @return The clamped value.
|
||||
*/
|
||||
template<typename T>
|
||||
inline T clamp(T v, T mn, T mx)
|
||||
{
|
||||
// Do not reorder; correct NaN handling relies on the fact that comparison
|
||||
// with NaN returns false and will fall-though to the "min" value.
|
||||
if (v > mx) return mx;
|
||||
if (v > mn) return v;
|
||||
return mn;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Clamp a float value between 0.0f and 1.0f.
|
||||
*
|
||||
* NaNs are turned into 0.0f.
|
||||
*
|
||||
* @param v The value to clamp.
|
||||
*
|
||||
* @return The clamped value.
|
||||
*/
|
||||
static inline float clamp1f(float v)
|
||||
{
|
||||
return astc::clamp(v, 0.0f, 1.0f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Clamp a float value between 0.0f and 255.0f.
|
||||
*
|
||||
* NaNs are turned into 0.0f.
|
||||
*
|
||||
* @param v The value to clamp.
|
||||
*
|
||||
* @return The clamped value.
|
||||
*/
|
||||
static inline float clamp255f(float v)
|
||||
{
|
||||
return astc::clamp(v, 0.0f, 255.0f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief SP float round-down.
|
||||
*
|
||||
* @param v The value to round.
|
||||
*
|
||||
* @return The rounded value.
|
||||
*/
|
||||
static inline float flt_rd(float v)
|
||||
{
|
||||
return std::floor(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief SP float round-to-nearest and convert to integer.
|
||||
*
|
||||
* @param v The value to round.
|
||||
*
|
||||
* @return The rounded value.
|
||||
*/
|
||||
static inline int flt2int_rtn(float v)
|
||||
{
|
||||
|
||||
return static_cast<int>(v + 0.5f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief SP float round down and convert to integer.
|
||||
*
|
||||
* @param v The value to round.
|
||||
*
|
||||
* @return The rounded value.
|
||||
*/
|
||||
static inline int flt2int_rd(float v)
|
||||
{
|
||||
return static_cast<int>(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief SP float bit-interpreted as an integer.
|
||||
*
|
||||
* @param v The value to bitcast.
|
||||
*
|
||||
* @return The converted value.
|
||||
*/
|
||||
static inline int float_as_int(float v)
|
||||
{
|
||||
union { int a; float b; } u;
|
||||
u.b = v;
|
||||
return u.a;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Integer bit-interpreted as an SP float.
|
||||
*
|
||||
* @param v The value to bitcast.
|
||||
*
|
||||
* @return The converted value.
|
||||
*/
|
||||
static inline float int_as_float(int v)
|
||||
{
|
||||
union { int a; float b; } u;
|
||||
u.a = v;
|
||||
return u.b;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Fast approximation of 1.0 / sqrt(val).
|
||||
*
|
||||
* @param v The input value.
|
||||
*
|
||||
* @return The approximated result.
|
||||
*/
|
||||
static inline float rsqrt(float v)
|
||||
{
|
||||
return 1.0f / std::sqrt(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Fast approximation of sqrt(val).
|
||||
*
|
||||
* @param v The input value.
|
||||
*
|
||||
* @return The approximated result.
|
||||
*/
|
||||
static inline float sqrt(float v)
|
||||
{
|
||||
return std::sqrt(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Extract mantissa and exponent of a float value.
|
||||
*
|
||||
* @param v The input value.
|
||||
* @param[out] expo The output exponent.
|
||||
*
|
||||
* @return The mantissa.
|
||||
*/
|
||||
static inline float frexp(float v, int* expo)
|
||||
{
|
||||
if32 p;
|
||||
p.f = v;
|
||||
*expo = ((p.u >> 23) & 0xFF) - 126;
|
||||
p.u = (p.u & 0x807fffff) | 0x3f000000;
|
||||
return p.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Initialize the seed structure for a random number generator.
|
||||
*
|
||||
* Important note: For the purposes of ASTC we want sets of random numbers to
|
||||
* use the codec, but we want the same seed value across instances and threads
|
||||
* to ensure that image output is stable across compressor runs and across
|
||||
* platforms. Every PRNG created by this call will therefore return the same
|
||||
* sequence of values ...
|
||||
*
|
||||
* @param state The state structure to initialize.
|
||||
*/
|
||||
void rand_init(uint64_t state[2]);
|
||||
|
||||
/**
|
||||
* @brief Return the next random number from the generator.
|
||||
*
|
||||
* This RNG is an implementation of the "xoroshiro128+ 1.0" PRNG, based on the
|
||||
* public-domain implementation given by David Blackman & Sebastiano Vigna at
|
||||
* http://vigna.di.unimi.it/xorshift/xoroshiro128plus.c
|
||||
*
|
||||
* @param state The state structure to use/update.
|
||||
*/
|
||||
uint64_t rand(uint64_t state[2]);
|
||||
|
||||
}
|
||||
|
||||
/* ============================================================================
|
||||
Softfloat library with fp32 and fp16 conversion functionality.
|
||||
============================================================================ */
|
||||
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
|
||||
/* narrowing float->float conversions */
|
||||
uint16_t float_to_sf16(float val);
|
||||
float sf16_to_float(uint16_t val);
|
||||
#endif
|
||||
|
||||
/*********************************
|
||||
Vector library
|
||||
*********************************/
|
||||
#include "astcenc_vecmathlib.h"
|
||||
|
||||
/*********************************
|
||||
Declaration of line types
|
||||
*********************************/
|
||||
// parametric line, 2D: The line is given by line = a + b * t.
|
||||
|
||||
struct line2
|
||||
{
|
||||
vfloat4 a;
|
||||
vfloat4 b;
|
||||
};
|
||||
|
||||
// parametric line, 3D
|
||||
struct line3
|
||||
{
|
||||
vfloat4 a;
|
||||
vfloat4 b;
|
||||
};
|
||||
|
||||
struct line4
|
||||
{
|
||||
vfloat4 a;
|
||||
vfloat4 b;
|
||||
};
|
||||
|
||||
|
||||
struct processed_line2
|
||||
{
|
||||
vfloat4 amod;
|
||||
vfloat4 bs;
|
||||
};
|
||||
|
||||
struct processed_line3
|
||||
{
|
||||
vfloat4 amod;
|
||||
vfloat4 bs;
|
||||
};
|
||||
|
||||
struct processed_line4
|
||||
{
|
||||
vfloat4 amod;
|
||||
vfloat4 bs;
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,411 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Soft-float library for IEEE-754.
|
||||
*/
|
||||
#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
|
||||
|
||||
#include "astcenc_mathlib.h"
|
||||
|
||||
/* sized soft-float types. These are mapped to the sized integer
|
||||
types of C99, instead of C's floating-point types; this is because
|
||||
the library needs to maintain exact, bit-level control on all
|
||||
operations on these data types. */
|
||||
typedef uint16_t sf16;
|
||||
typedef uint32_t sf32;
|
||||
|
||||
/******************************************
|
||||
helper functions and their lookup tables
|
||||
******************************************/
|
||||
/* Count-leading-zeros support. clz32() is only used when the input is nonzero. */

/* Lookup table for the portable fallback path: clz_table[b] is the number of
   leading zero bits in the 8-bit value b (clz_table[0] is 8). The table is
   only compiled when none of the instruction-based paths in clz32() is
   selected. */
#if !(defined(__GNUC__) && (defined(__i386) || defined(__amd64))) && \
    !(defined(__arm__) && defined(__ARMCC_VERSION)) && \
    !(defined(__arm__) && defined(__GNUC__))
static const uint8_t clz_table[256] =
{
    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
#endif

/* 32-bit count-leading-zeros. A machine instruction is used where one of the
   recognised compiler/target combinations is detected; otherwise a byte-wise
   table lookup is used. Callers only pass nonzero values. */
static uint32_t clz32(uint32_t inp)
{
#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
    /* x86: BSR gives the index of the highest set bit. Bit 0 is OR-ed in so
       the operand handed to the instruction is never zero. */
    uint32_t top_bit;
    __asm__("bsrl %1, %0": "=r"(top_bit):"r"(inp | 1));
    return 31 - top_bit;
#elif defined(__arm__) && defined(__ARMCC_VERSION)
    return __clz(inp); /* armcc builtin */
#elif defined(__arm__) && defined(__GNUC__)
    uint32_t zeros;
    __asm__("clz %0, %1": "=r"(zeros):"r"(inp));
    return zeros;
#else
    /* Slow default: narrow the value down to its most significant nonzero
       byte, tracking how many leading bits were skipped on the way. */
    uint32_t window = inp;
    uint32_t base = 24;

    if (window >= UINT32_C(0x10000))
    {
        window >>= 16;
        base -= 16;
    }

    if (window >= UINT32_C(0x100))
    {
        window >>= 8;
        base -= 8;
    }

    return base + clz_table[window];
#endif
}
|
||||
|
||||
/* Rounding modes understood by the FP32 -> FP16 conversion. The numeric
   value of each mode is added to a per-exponent base index inside
   sf32_to_sf16(), so the values 0..4 must not be changed. */
typedef enum
{
    /* round towards positive infinity */
    SF_UP = 0,
    /* round towards negative infinity */
    SF_DOWN = 1,
    /* round towards zero */
    SF_TOZERO = 2,
    /* round toward nearest value; if mid-between, round to even value */
    SF_NEARESTEVEN = 3,
    /* round toward nearest value; if mid-between, round away from zero */
    SF_NEARESTAWAY = 4
} roundmode;
|
||||
|
||||
|
||||
/* Right-shift 'inp' by 'shamt' bits, rounding to nearest with ties going to
   the even result. */
static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt)
{
    /* Weight of the lowest bit that survives the shift. */
    const uint32_t unit = UINT32_C(1) << shamt;

    /* Nonzero when the surviving LSB is set (result would be odd). The '| 1'
       forces this to be nonzero when shamt is 0, so nothing is subtracted. */
    const uint32_t odd = (inp | UINT32_C(1)) & unit;

    /* Add half a unit; when the result would be even, take one epsilon back
       off so that an exact tie falls down instead of up. */
    uint32_t biased = inp + (unit >> 1);
    if (odd == 0)
    {
        biased -= 1;
    }

    return biased >> shamt;
}
|
||||
|
||||
/* Right-shift 'inp' by 'shamt' bits, rounding to nearest with ties going
   away from zero (half a unit is added before the shift). */
static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt)
{
    const uint32_t half_unit = (UINT32_C(1) << shamt) >> 1;
    return (inp + half_unit) >> shamt;
}
|
||||
|
||||
/* Right-shift 'inp' by 'shamt' bits, rounding up: any nonzero bit that is
   shifted out bumps the result by one. */
static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt)
{
    const uint32_t unit = UINT32_C(1) << shamt;
    const uint32_t biased = inp + unit - 1;
    return biased >> shamt;
}
|
||||
|
||||
/* convert from FP16 to FP32. */
|
||||
static sf32 sf16_to_sf32(sf16 inp)
|
||||
{
|
||||
uint32_t inpx = inp;
|
||||
|
||||
/*
|
||||
This table contains, for every FP16 sign/exponent value combination,
|
||||
the difference between the input FP16 value and the value obtained
|
||||
by shifting the correct FP32 result right by 13 bits.
|
||||
This table allows us to handle every case except denormals and NaN
|
||||
with just 1 table lookup, 2 shifts and 1 add.
|
||||
*/
|
||||
|
||||
#define WITH_MSB(a) (UINT32_C(a) | (1u << 31))
|
||||
static const uint32_t tbl[64] =
|
||||
{
|
||||
WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
|
||||
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
|
||||
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000,
|
||||
0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000),
|
||||
WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
|
||||
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
|
||||
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000,
|
||||
0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000)
|
||||
};
|
||||
|
||||
uint32_t res = tbl[inpx >> 10];
|
||||
res += inpx;
|
||||
|
||||
/* Normal cases: MSB of 'res' not set. */
|
||||
if ((res & WITH_MSB(0)) == 0)
|
||||
{
|
||||
return res << 13;
|
||||
}
|
||||
|
||||
/* Infinity and Zero: 10 LSB of 'res' not set. */
|
||||
if ((res & 0x3FF) == 0)
|
||||
{
|
||||
return res << 13;
|
||||
}
|
||||
|
||||
/* NaN: the exponent field of 'inp' is non-zero. */
|
||||
if ((inpx & 0x7C00) != 0)
|
||||
{
|
||||
/* All NaNs are quietened. */
|
||||
return (res << 13) | 0x400000;
|
||||
}
|
||||
|
||||
/* Denormal cases */
|
||||
uint32_t sign = (inpx & 0x8000) << 16;
|
||||
uint32_t mskval = inpx & 0x7FFF;
|
||||
uint32_t leadingzeroes = clz32(mskval);
|
||||
mskval <<= leadingzeroes;
|
||||
return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign;
|
||||
}
|
||||
|
||||
/* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */
|
||||
static sf16 sf32_to_sf16(sf32 inp, roundmode rmode)
|
||||
{
|
||||
/* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */
|
||||
static const uint8_t tab[512] {
|
||||
0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
|
||||
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40,
|
||||
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
|
||||
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
|
||||
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
|
||||
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
|
||||
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
|
||||
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
|
||||
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50,
|
||||
|
||||
5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
|
||||
25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
|
||||
35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45,
|
||||
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
|
||||
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
|
||||
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
|
||||
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
|
||||
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
|
||||
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
|
||||
45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
|
||||
};
|
||||
|
||||
/* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code
|
||||
size. */
|
||||
static const uint32_t tabx[60] {
|
||||
UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
|
||||
UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
|
||||
UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
|
||||
UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
|
||||
UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
|
||||
UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
|
||||
UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
|
||||
UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
|
||||
UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
|
||||
};
|
||||
|
||||
uint32_t p;
|
||||
uint32_t idx = rmode + tab[inp >> 23];
|
||||
uint32_t vlx = tabx[idx];
|
||||
switch (idx)
|
||||
{
|
||||
/*
|
||||
Positive number which may be Infinity or NaN.
|
||||
We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
|
||||
(If we don't do this quieting, then a NaN that is distinguished only by having
|
||||
its low-order bits set, would be turned into an INF. */
|
||||
case 50:
|
||||
case 51:
|
||||
case 52:
|
||||
case 53:
|
||||
case 54:
|
||||
case 55:
|
||||
case 56:
|
||||
case 57:
|
||||
case 58:
|
||||
case 59:
|
||||
/*
|
||||
the input value is 0x7F800000 or 0xFF800000 if it is INF.
|
||||
By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
|
||||
For NaNs, however, this operation will keep bit 23 with the value 1.
|
||||
We can then extract bit 23, and logical-OR bit 9 of the result with this
|
||||
bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
|
||||
of the mantissa is set.)
|
||||
*/
|
||||
p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
|
||||
return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
|
||||
/*
|
||||
positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
|
||||
If it is, then return 0, else return 1 (the smallest representable nonzero number)
|
||||
*/
|
||||
case 0:
|
||||
/*
|
||||
-inp will set the MSB if the input number is nonzero.
|
||||
Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
|
||||
*/
|
||||
return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
|
||||
|
||||
/*
|
||||
negative, exponent = , round-mode == DOWN, need to check whether number is
|
||||
actually 0. If it is, return 0x8000 ( float -0.0 )
|
||||
Else return the smallest negative number ( 0x8001 ) */
|
||||
case 6:
|
||||
/*
|
||||
in this case 'vlx' is 0x80000000. By subtracting the input value from it,
|
||||
we obtain a value that is 0 if the input value is in fact zero and has
|
||||
the MSB set if it isn't. We then right-shift the value by 31 places to
|
||||
get a value that is 0 if the input is -0.0 and 1 otherwise.
|
||||
*/
|
||||
return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000));
|
||||
|
||||
/*
|
||||
for all other cases involving underflow/overflow, we don't need to
|
||||
do actual tests; we just return 'vlx'.
|
||||
*/
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 5:
|
||||
case 7:
|
||||
case 8:
|
||||
case 9:
|
||||
case 10:
|
||||
case 11:
|
||||
case 12:
|
||||
case 13:
|
||||
case 14:
|
||||
case 15:
|
||||
case 16:
|
||||
case 17:
|
||||
case 18:
|
||||
case 19:
|
||||
case 40:
|
||||
case 41:
|
||||
case 42:
|
||||
case 43:
|
||||
case 44:
|
||||
case 45:
|
||||
case 46:
|
||||
case 47:
|
||||
case 48:
|
||||
case 49:
|
||||
return static_cast<sf16>(vlx);
|
||||
|
||||
/*
|
||||
for normal numbers, 'vlx' is the difference between the FP32 value of a number and the
|
||||
FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is
|
||||
baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away
|
||||
from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero.
|
||||
for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even
|
||||
except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */
|
||||
|
||||
/* normal number, all rounding modes except round-to-nearest-even: */
|
||||
case 30:
|
||||
case 31:
|
||||
case 32:
|
||||
case 34:
|
||||
case 35:
|
||||
case 36:
|
||||
case 37:
|
||||
case 39:
|
||||
return static_cast<sf16>((inp + vlx) >> 13);
|
||||
|
||||
/* normal number, round-to-nearest-even. */
|
||||
case 33:
|
||||
case 38:
|
||||
p = inp + vlx;
|
||||
p += (inp >> 13) & 1;
|
||||
return static_cast<sf16>(p >> 13);
|
||||
|
||||
/*
|
||||
the various denormal cases. These are not expected to be common, so their performance is a bit
|
||||
less important. For each of these cases, we need to extract an exponent and a mantissa
|
||||
(including the implicit '1'!), and then right-shift the mantissa by a shift-amount that
|
||||
depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the
|
||||
sign of the resulting denormal number.
|
||||
*/
|
||||
case 21:
|
||||
case 22:
|
||||
case 25:
|
||||
case 27:
|
||||
/* denormal, round towards zero. */
|
||||
p = 126 - ((inp >> 23) & 0xFF);
|
||||
return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx);
|
||||
case 20:
|
||||
case 26:
|
||||
/* denormal, round away from zero. */
|
||||
p = 126 - ((inp >> 23) & 0xFF);
|
||||
return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
|
||||
case 24:
|
||||
case 29:
|
||||
/* denormal, round to nearest-away */
|
||||
p = 126 - ((inp >> 23) & 0xFF);
|
||||
return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
|
||||
case 23:
|
||||
case 28:
|
||||
/* denormal, round to nearest-even. */
|
||||
p = 126 - ((inp >> 23) & 0xFF);
|
||||
return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* convert from soft-float to native-float */
|
||||
float sf16_to_float(uint16_t p)
|
||||
{
|
||||
if32 i;
|
||||
i.u = sf16_to_sf32(p);
|
||||
return i.f;
|
||||
}
|
||||
|
||||
/* convert from native-float to soft-float */
|
||||
uint16_t float_to_sf16(float p)
|
||||
{
|
||||
if32 i;
|
||||
i.f = p;
|
||||
return sf32_to_sf16(i.u, SF_NEARESTEVEN);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,481 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions for generating partition tables on demand.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */
|
||||
#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64)
|
||||
|
||||
/**
|
||||
* @brief Generate a canonical representation of a partition pattern.
|
||||
*
|
||||
* The returned value stores two bits per texel, for up to 6x6x6 texels, where the two bits store
|
||||
* the remapped texel index. Remapping ensures that we only match on the partition pattern,
|
||||
* independent of the partition order generated by the hash.
|
||||
*
|
||||
* @param texel_count The number of texels in the block.
|
||||
* @param partition_of_texel The partition assignments, in hash order.
|
||||
* @param[out] bit_pattern The output bit pattern representation.
|
||||
*/
|
||||
static void generate_canonical_partitioning(
|
||||
unsigned int texel_count,
|
||||
const uint8_t* partition_of_texel,
|
||||
uint64_t bit_pattern[BIT_PATTERN_WORDS]
|
||||
) {
|
||||
// Clear the pattern
|
||||
for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++)
|
||||
{
|
||||
bit_pattern[i] = 0;
|
||||
}
|
||||
|
||||
// Store a mapping to reorder the raw partitions so that the partitions are ordered such
|
||||
// that the lowest texel index in partition N is smaller than the lowest texel index in
|
||||
// partition N + 1.
|
||||
int mapped_index[BLOCK_MAX_PARTITIONS];
|
||||
int map_weight_count = 0;
|
||||
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
|
||||
{
|
||||
mapped_index[i] = -1;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < texel_count; i++)
|
||||
{
|
||||
int index = partition_of_texel[i];
|
||||
if (mapped_index[index] < 0)
|
||||
{
|
||||
mapped_index[index] = map_weight_count++;
|
||||
}
|
||||
|
||||
uint64_t xlat_index = mapped_index[index];
|
||||
bit_pattern[i >> 5] |= xlat_index << (2 * (i & 0x1F));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compare two canonical patterns to see if they are the same.
|
||||
*
|
||||
* @param part1 The first canonical bit pattern to check.
|
||||
* @param part2 The second canonical bit pattern to check.
|
||||
*
|
||||
* @return @c true if the patterns are the same, @c false otherwise.
|
||||
*/
|
||||
static bool compare_canonical_partitionings(
|
||||
const uint64_t part1[BIT_PATTERN_WORDS],
|
||||
const uint64_t part2[BIT_PATTERN_WORDS]
|
||||
) {
|
||||
return (part1[0] == part2[0])
|
||||
#if BIT_PATTERN_WORDS > 1
|
||||
&& (part1[1] == part2[1])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 2
|
||||
&& (part1[2] == part2[2])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 3
|
||||
&& (part1[3] == part2[3])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 4
|
||||
&& (part1[4] == part2[4])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 5
|
||||
&& (part1[5] == part2[5])
|
||||
#endif
|
||||
#if BIT_PATTERN_WORDS > 6
|
||||
&& (part1[6] == part2[6])
|
||||
#endif
|
||||
;
|
||||
}
|
||||
|
||||
/**
 * @brief Integer mixing function used for procedural partition assignment.
 *
 * The sequence of shifts, xors, the multiply and the add is fixed; all
 * arithmetic wraps modulo 2^32.
 *
 * @param inp   The hash seed.
 *
 * @return The hashed value.
 */
static uint32_t hash52(
    uint32_t inp
) {
    uint32_t mixed = inp ^ (inp >> 15);

    // (2^4 + 1) * (2^7 + 1) * (2^17 - 1)
    mixed *= 0xEEDE0891;

    mixed ^= mixed >> 5;
    mixed += mixed << 16;
    mixed ^= mixed >> 7;
    mixed ^= mixed >> 3;
    mixed ^= mixed << 6;
    mixed ^= mixed >> 17;

    return mixed;
}
|
||||
|
||||
/**
|
||||
* @brief Select texel assignment for a single coordinate.
|
||||
*
|
||||
* @param seed The seed - the partition index from the block.
|
||||
* @param x The texel X coordinate in the block.
|
||||
* @param y The texel Y coordinate in the block.
|
||||
* @param z The texel Z coordinate in the block.
|
||||
* @param partition_count The total partition count of this encoding.
|
||||
* @param small_block @c true if the block has fewer than 32 texels.
|
||||
*
|
||||
* @return The assigned partition index for this texel.
|
||||
*/
|
||||
static uint8_t select_partition(
|
||||
int seed,
|
||||
int x,
|
||||
int y,
|
||||
int z,
|
||||
int partition_count,
|
||||
bool small_block
|
||||
) {
|
||||
// For small blocks bias the coordinates to get better distribution
|
||||
if (small_block)
|
||||
{
|
||||
x <<= 1;
|
||||
y <<= 1;
|
||||
z <<= 1;
|
||||
}
|
||||
|
||||
seed += (partition_count - 1) * 1024;
|
||||
|
||||
uint32_t rnum = hash52(seed);
|
||||
|
||||
uint8_t seed1 = rnum & 0xF;
|
||||
uint8_t seed2 = (rnum >> 4) & 0xF;
|
||||
uint8_t seed3 = (rnum >> 8) & 0xF;
|
||||
uint8_t seed4 = (rnum >> 12) & 0xF;
|
||||
uint8_t seed5 = (rnum >> 16) & 0xF;
|
||||
uint8_t seed6 = (rnum >> 20) & 0xF;
|
||||
uint8_t seed7 = (rnum >> 24) & 0xF;
|
||||
uint8_t seed8 = (rnum >> 28) & 0xF;
|
||||
uint8_t seed9 = (rnum >> 18) & 0xF;
|
||||
uint8_t seed10 = (rnum >> 22) & 0xF;
|
||||
uint8_t seed11 = (rnum >> 26) & 0xF;
|
||||
uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
|
||||
|
||||
// Squaring all the seeds in order to bias their distribution towards lower values.
|
||||
seed1 *= seed1;
|
||||
seed2 *= seed2;
|
||||
seed3 *= seed3;
|
||||
seed4 *= seed4;
|
||||
seed5 *= seed5;
|
||||
seed6 *= seed6;
|
||||
seed7 *= seed7;
|
||||
seed8 *= seed8;
|
||||
seed9 *= seed9;
|
||||
seed10 *= seed10;
|
||||
seed11 *= seed11;
|
||||
seed12 *= seed12;
|
||||
|
||||
int sh1, sh2;
|
||||
if (seed & 1)
|
||||
{
|
||||
sh1 = (seed & 2 ? 4 : 5);
|
||||
sh2 = (partition_count == 3 ? 6 : 5);
|
||||
}
|
||||
else
|
||||
{
|
||||
sh1 = (partition_count == 3 ? 6 : 5);
|
||||
sh2 = (seed & 2 ? 4 : 5);
|
||||
}
|
||||
|
||||
int sh3 = (seed & 0x10) ? sh1 : sh2;
|
||||
|
||||
seed1 >>= sh1;
|
||||
seed2 >>= sh2;
|
||||
seed3 >>= sh1;
|
||||
seed4 >>= sh2;
|
||||
seed5 >>= sh1;
|
||||
seed6 >>= sh2;
|
||||
seed7 >>= sh1;
|
||||
seed8 >>= sh2;
|
||||
|
||||
seed9 >>= sh3;
|
||||
seed10 >>= sh3;
|
||||
seed11 >>= sh3;
|
||||
seed12 >>= sh3;
|
||||
|
||||
int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
|
||||
int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
|
||||
int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
|
||||
int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
|
||||
|
||||
// Apply the saw
|
||||
a &= 0x3F;
|
||||
b &= 0x3F;
|
||||
c &= 0x3F;
|
||||
d &= 0x3F;
|
||||
|
||||
// Remove some of the components if we are to output < 4 partitions.
|
||||
if (partition_count <= 3)
|
||||
{
|
||||
d = 0;
|
||||
}
|
||||
|
||||
if (partition_count <= 2)
|
||||
{
|
||||
c = 0;
|
||||
}
|
||||
|
||||
if (partition_count <= 1)
|
||||
{
|
||||
b = 0;
|
||||
}
|
||||
|
||||
uint8_t partition;
|
||||
if (a >= b && a >= c && a >= d)
|
||||
{
|
||||
partition = 0;
|
||||
}
|
||||
else if (b >= c && b >= d)
|
||||
{
|
||||
partition = 1;
|
||||
}
|
||||
else if (c >= d)
|
||||
{
|
||||
partition = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
partition = 3;
|
||||
}
|
||||
|
||||
return partition;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Generate a single partition info structure.
|
||||
*
|
||||
* @param[out] bsd The block size information.
|
||||
* @param partition_count The partition count of this partitioning.
|
||||
* @param partition_index The partition index / seed of this partitioning.
|
||||
* @param partition_remap_index The remapped partition index of this partitioning.
|
||||
* @param[out] pi The partition info structure to populate.
|
||||
*
|
||||
* @return True if this is a useful partition index, False if we can skip it.
|
||||
*/
|
||||
static bool generate_one_partition_info_entry(
|
||||
block_size_descriptor& bsd,
|
||||
unsigned int partition_count,
|
||||
unsigned int partition_index,
|
||||
unsigned int partition_remap_index,
|
||||
partition_info& pi
|
||||
) {
|
||||
int texels_per_block = bsd.texel_count;
|
||||
bool small_block = texels_per_block < 32;
|
||||
|
||||
uint8_t *partition_of_texel = pi.partition_of_texel;
|
||||
|
||||
// Assign texels to partitions
|
||||
int texel_idx = 0;
|
||||
int counts[BLOCK_MAX_PARTITIONS] { 0 };
|
||||
for (unsigned int z = 0; z < bsd.zdim; z++)
|
||||
{
|
||||
for (unsigned int y = 0; y < bsd.ydim; y++)
|
||||
{
|
||||
for (unsigned int x = 0; x < bsd.xdim; x++)
|
||||
{
|
||||
uint8_t part = select_partition(partition_index, x, y, z, partition_count, small_block);
|
||||
pi.texels_of_partition[part][counts[part]++] = static_cast<uint8_t>(texel_idx++);
|
||||
*partition_of_texel++ = part;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fill loop tail so we can overfetch later
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
size_t ptex_count = counts[i];
|
||||
size_t ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count);
|
||||
for (size_t j = ptex_count; j < ptex_count_simd; j++)
|
||||
{
|
||||
pi.texels_of_partition[i][j] = pi.texels_of_partition[i][ptex_count - 1];
|
||||
}
|
||||
}
|
||||
|
||||
// Populate the actual procedural partition count
|
||||
if (counts[0] == 0)
|
||||
{
|
||||
pi.partition_count = 0;
|
||||
}
|
||||
else if (counts[1] == 0)
|
||||
{
|
||||
pi.partition_count = 1;
|
||||
}
|
||||
else if (counts[2] == 0)
|
||||
{
|
||||
pi.partition_count = 2;
|
||||
}
|
||||
else if (counts[3] == 0)
|
||||
{
|
||||
pi.partition_count = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
pi.partition_count = 4;
|
||||
}
|
||||
|
||||
// Populate the partition index
|
||||
pi.partition_index = static_cast<uint16_t>(partition_index);
|
||||
|
||||
// Populate the coverage bitmaps for 2/3/4 partitions
|
||||
uint64_t* bitmaps { nullptr };
|
||||
if (partition_count == 2)
|
||||
{
|
||||
bitmaps = bsd.coverage_bitmaps_2[partition_remap_index];
|
||||
}
|
||||
else if (partition_count == 3)
|
||||
{
|
||||
bitmaps = bsd.coverage_bitmaps_3[partition_remap_index];
|
||||
}
|
||||
else if (partition_count == 4)
|
||||
{
|
||||
bitmaps = bsd.coverage_bitmaps_4[partition_remap_index];
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++)
|
||||
{
|
||||
pi.partition_texel_count[i] = static_cast<uint8_t>(counts[i]);
|
||||
}
|
||||
|
||||
// Valid partitionings have texels in all of the requested partitions
|
||||
bool valid = pi.partition_count == partition_count;
|
||||
|
||||
if (bitmaps)
|
||||
{
|
||||
// Populate the partition coverage bitmap
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
bitmaps[i] = 0ULL;
|
||||
}
|
||||
|
||||
unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
|
||||
for (unsigned int i = 0; i < texels_to_process; i++)
|
||||
{
|
||||
unsigned int idx = bsd.kmeans_texels[i];
|
||||
bitmaps[pi.partition_of_texel[idx]] |= 1ULL << i;
|
||||
}
|
||||
}
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
/**
 * @brief Build the partition table for one partition count.
 *
 * The seeds are walked in up to two passes. Pass 0 stores, in seed order,
 * every partitioning that is both useful (texels in every requested
 * partition) and not a canonical duplicate of an earlier stored entry. Pass 1
 * appends the remaining seeds after them; it is skipped when partitionings
 * may be omitted.
 *
 * @param[in,out] bsd                      The block size information; the packed index and
 *                                         count tables for this partition count are written.
 * @param         can_omit_partitionings   True if unselected partitionings can be left out.
 * @param         partition_count_cutoff   The largest partition count that must be built when
 *                                         partitionings can be omitted.
 * @param         partition_count          The partition count to build the table for.
 * @param[out]    ptab                     The partition info table to populate.
 * @param[out]    canonical_patterns       Scratch storage for canonical patterns, indexed in
 *                                         units of BIT_PATTERN_WORDS per table entry.
 */
static void build_partition_table_for_one_partition_count(
    block_size_descriptor& bsd,
    bool can_omit_partitionings,
    unsigned int partition_count_cutoff,
    unsigned int partition_count,
    partition_info* ptab,
    uint64_t* canonical_patterns
) {
    bsd.partitioning_count_selected[partition_count - 1] = 0;
    bsd.partitioning_count_all[partition_count - 1] = 0;

    // Skip tables larger than config max partition count if we can omit modes
    if (can_omit_partitionings && (partition_count > partition_count_cutoff))
    {
        return;
    }

    // Pass 0 keeps the selected partitionings; pass 1 keeps the non-selected
    // ones and only runs when nothing may be omitted
    const unsigned int pass_count = can_omit_partitionings ? 1 : 2;

    // Seeds that were stored during pass 0
    uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 };

    unsigned int next_index = 0;
    for (unsigned int pass = 0; pass < pass_count; pass++)
    {
        for (unsigned int seed = 0; seed < BLOCK_MAX_PARTITIONINGS; seed++)
        {
            // Don't include things we built in the first pass
            if ((pass == 1) && build[seed])
            {
                continue;
            }

            bool keep_useful = generate_one_partition_info_entry(bsd, partition_count, seed, next_index, ptab[next_index]);
            if ((pass == 0) && !keep_useful)
            {
                continue;
            }

            // Canonicalize the candidate and look for an earlier stored
            // entry with the same pattern
            uint64_t* candidate = canonical_patterns + next_index * BIT_PATTERN_WORDS;
            generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, candidate);

            bool keep_canonical = true;
            for (unsigned int prev = 0; prev < next_index; prev++)
            {
                if (compare_canonical_partitionings(candidate, canonical_patterns + prev * BIT_PATTERN_WORDS))
                {
                    keep_canonical = false;
                    break;
                }
            }

            const bool selected = keep_useful && keep_canonical;
            if (selected && (pass == 0))
            {
                bsd.partitioning_packed_index[partition_count - 2][seed] = static_cast<uint16_t>(next_index);
                bsd.partitioning_count_selected[partition_count - 1]++;
                bsd.partitioning_count_all[partition_count - 1]++;
                build[seed] = 1;
                next_index++;
            }
            else if (!selected && (pass == 1))
            {
                bsd.partitioning_packed_index[partition_count - 2][seed] = static_cast<uint16_t>(next_index);
                bsd.partitioning_count_all[partition_count - 1]++;
                next_index++;
            }
        }
    }
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void init_partition_tables(
|
||||
block_size_descriptor& bsd,
|
||||
bool can_omit_partitionings,
|
||||
unsigned int partition_count_cutoff
|
||||
) {
|
||||
partition_info* par_tab2 = bsd.partitionings;
|
||||
partition_info* par_tab3 = par_tab2 + BLOCK_MAX_PARTITIONINGS;
|
||||
partition_info* par_tab4 = par_tab3 + BLOCK_MAX_PARTITIONINGS;
|
||||
partition_info* par_tab1 = par_tab4 + BLOCK_MAX_PARTITIONINGS;
|
||||
|
||||
generate_one_partition_info_entry(bsd, 1, 0, 0, *par_tab1);
|
||||
bsd.partitioning_count_selected[0] = 1;
|
||||
bsd.partitioning_count_all[0] = 1;
|
||||
|
||||
uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS];
|
||||
|
||||
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 2, par_tab2, canonical_patterns);
|
||||
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 3, par_tab3, canonical_patterns);
|
||||
build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 4, par_tab4, canonical_patterns);
|
||||
|
||||
delete[] canonical_patterns;
|
||||
}
|
||||
@@ -0,0 +1,166 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2021 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Platform-specific function implementations.
|
||||
*
|
||||
* This module contains functions for querying the host extended ISA support.
|
||||
*/
|
||||
|
||||
// Include before the defines below to pick up any auto-setup based on compiler
|
||||
// built-in config, if not being set explicitly by the build system
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#if (ASTCENC_SSE > 0) || (ASTCENC_AVX > 0) || \
|
||||
(ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
|
||||
|
||||
/** Has detect_cpu_isa() run? The flags below are only meaningful once this is true. */
static bool g_init { false };
|
||||
|
||||
/** Does this CPU support SSE 4.1? Defaults to false until detection has run. */
|
||||
static bool g_cpu_has_sse41 { false };
|
||||
|
||||
/** Does this CPU support AVX2? Defaults to false until detection has run. */
|
||||
static bool g_cpu_has_avx2 { false };
|
||||
|
||||
/** Does this CPU support POPCNT? Defaults to false until detection has run. */
|
||||
static bool g_cpu_has_popcnt { false };
|
||||
|
||||
/** Does this CPU support F16C? Defaults to false until detection has run. */
|
||||
static bool g_cpu_has_f16c { false };
|
||||
|
||||
/* ============================================================================
|
||||
Platform code for Visual Studio
|
||||
============================================================================ */
|
||||
#if !defined(__clang__) && defined(_MSC_VER)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <Windows.h>
|
||||
#include <intrin.h>
|
||||
|
||||
/**
|
||||
* @brief Detect platform CPU ISA support and update global trackers.
|
||||
*/
|
||||
static void detect_cpu_isa()
|
||||
{
|
||||
int data[4];
|
||||
|
||||
__cpuid(data, 0);
|
||||
int num_id = data[0];
|
||||
|
||||
if (num_id >= 1)
|
||||
{
|
||||
__cpuidex(data, 1, 0);
|
||||
// SSE41 = Bank 1, ECX, bit 19
|
||||
g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
|
||||
// POPCNT = Bank 1, ECX, bit 23
|
||||
g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
|
||||
// F16C = Bank 1, ECX, bit 29
|
||||
g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
|
||||
}
|
||||
|
||||
if (num_id >= 7)
|
||||
{
|
||||
__cpuidex(data, 7, 0);
|
||||
// AVX2 = Bank 7, EBX, bit 5
|
||||
g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
|
||||
}
|
||||
|
||||
// Ensure state bits are updated before init flag is updated
|
||||
MemoryBarrier();
|
||||
g_init = true;
|
||||
}
|
||||
|
||||
/* ============================================================================
|
||||
Platform code for GCC and Clang
|
||||
============================================================================ */
|
||||
#else
|
||||
#include <cpuid.h>
|
||||
|
||||
/**
|
||||
* @brief Detect platform CPU ISA support and update global trackers.
|
||||
*/
|
||||
static void detect_cpu_isa()
|
||||
{
|
||||
unsigned int data[4];
|
||||
|
||||
if (__get_cpuid_count(1, 0, &data[0], &data[1], &data[2], &data[3]))
|
||||
{
|
||||
// SSE41 = Bank 1, ECX, bit 19
|
||||
g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
|
||||
// POPCNT = Bank 1, ECX, bit 23
|
||||
g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
|
||||
// F16C = Bank 1, ECX, bit 29
|
||||
g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
|
||||
}
|
||||
|
||||
g_cpu_has_avx2 = 0;
|
||||
if (__get_cpuid_count(7, 0, &data[0], &data[1], &data[2], &data[3]))
|
||||
{
|
||||
// AVX2 = Bank 7, EBX, bit 5
|
||||
g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
|
||||
}
|
||||
|
||||
// Ensure state bits are updated before init flag is updated
|
||||
__sync_synchronize();
|
||||
g_init = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* See header for documentation. */
|
||||
bool cpu_supports_popcnt()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
detect_cpu_isa();
|
||||
}
|
||||
|
||||
return g_cpu_has_popcnt;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
bool cpu_supports_f16c()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
detect_cpu_isa();
|
||||
}
|
||||
|
||||
return g_cpu_has_f16c;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
bool cpu_supports_sse41()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
detect_cpu_isa();
|
||||
}
|
||||
|
||||
return g_cpu_has_sse41;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
bool cpu_supports_avx2()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
detect_cpu_isa();
|
||||
}
|
||||
|
||||
return g_cpu_has_avx2;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,903 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions and data tables for numeric quantization.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
|
||||
// Not scrambled, starts from QUANT_6
|
||||
const uint8_t color_unquant_to_uquant_tables[17][512] {
|
||||
{ // QUANT_6
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
|
||||
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
|
||||
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
|
||||
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 102, 102, 102, 102, 102, 102,
|
||||
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
|
||||
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
|
||||
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102,
|
||||
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
|
||||
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
|
||||
153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
|
||||
153, 153, 153, 153, 153, 153, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
|
||||
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
|
||||
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
|
||||
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_8
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
|
||||
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
|
||||
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
|
||||
73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
|
||||
73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 109, 109, 109, 109, 109, 109, 109, 109, 109,
|
||||
109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
|
||||
109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
|
||||
146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
|
||||
146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
|
||||
146, 146, 146, 146, 146, 146, 146, 146, 146, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
|
||||
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182,
|
||||
182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
|
||||
219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219,
|
||||
219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 255, 255, 255, 255, 255,
|
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_10
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 28, 28,
|
||||
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
|
||||
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56,
|
||||
56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56,
|
||||
56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
|
||||
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
|
||||
84, 84, 84, 84, 84, 84, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
|
||||
113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
|
||||
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142,
|
||||
142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 171, 171, 171, 171, 171, 171,
|
||||
171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171,
|
||||
171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
|
||||
199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199,
|
||||
199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
|
||||
227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227,
|
||||
227, 227, 227, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_12
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23, 23, 23, 23, 23, 23, 23, 23,
|
||||
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
|
||||
23, 23, 23, 23, 23, 23, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46,
|
||||
46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
|
||||
69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69,
|
||||
69, 69, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92,
|
||||
92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
|
||||
116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116,
|
||||
139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139,
|
||||
139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163,
|
||||
163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 186, 186,
|
||||
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
|
||||
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209,
|
||||
209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 232, 232, 232, 232, 232, 232,
|
||||
232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232,
|
||||
232, 232, 232, 232, 232, 232, 232, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_16
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
|
||||
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
|
||||
34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
|
||||
51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 68, 68, 68, 68, 68, 68, 68, 68,
|
||||
68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 85, 85, 85, 85, 85, 85,
|
||||
85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 102, 102, 102, 102,
|
||||
102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119, 119,
|
||||
119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119,
|
||||
136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
|
||||
136, 136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153,
|
||||
153, 153, 153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170,
|
||||
170, 170, 170, 170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
|
||||
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221,
|
||||
221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238,
|
||||
238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_20
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||
13, 13, 13, 13, 13, 13, 13, 13, 13, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
|
||||
27, 27, 27, 27, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 54,
|
||||
54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 67, 67, 67, 67, 67, 67,
|
||||
67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
|
||||
80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 94,
|
||||
94, 94, 94, 94, 94, 94, 94, 94, 94, 94, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107,
|
||||
107, 107, 107, 107, 107, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
|
||||
134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 148, 148, 148, 148, 148,
|
||||
148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161,
|
||||
161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 161, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175,
|
||||
175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 175, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188,
|
||||
188, 188, 188, 188, 188, 188, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201,
|
||||
201, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 228, 228, 228, 228,
|
||||
228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 242, 242, 242, 242, 242, 242, 242, 242, 242,
|
||||
242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_24
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
||||
11, 11, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33,
|
||||
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
|
||||
44, 44, 44, 44, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 66, 66, 66, 66, 66, 66,
|
||||
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,
|
||||
77, 77, 77, 77, 77, 77, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 99, 99, 99, 99,
|
||||
99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110,
|
||||
110, 110, 110, 110, 110, 110, 110, 110, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121,
|
||||
134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 145, 145, 145, 145, 145, 145, 145, 145,
|
||||
145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156,
|
||||
156, 156, 156, 156, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 178, 178, 178, 178, 178, 178,
|
||||
178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 178, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189,
|
||||
189, 189, 189, 189, 189, 189, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 211, 211, 211, 211,
|
||||
211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222,
|
||||
222, 222, 222, 222, 222, 222, 222, 222, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 244, 244,
|
||||
244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_32
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16,
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 33, 33, 33, 33, 33, 33,
|
||||
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 49, 49, 49, 49, 49,
|
||||
49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 66, 66, 66, 66,
|
||||
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 82, 82, 82,
|
||||
82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 99, 99,
|
||||
99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 115,
|
||||
115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 115, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
|
||||
132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140, 140,
|
||||
140, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156,
|
||||
156, 156, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 165, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173, 173,
|
||||
173, 173, 173, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 181, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189,
|
||||
189, 189, 189, 189, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 198, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206,
|
||||
206, 206, 206, 206, 206, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222,
|
||||
222, 222, 222, 222, 222, 222, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 231, 239, 239, 239, 239, 239, 239, 239, 239, 239,
|
||||
239, 239, 239, 239, 239, 239, 239, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 247, 255, 255, 255, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_40
|
||||
0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||
13, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 32, 32, 32, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 32, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
|
||||
45, 45, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 65, 65, 65, 65,
|
||||
65, 65, 65, 65, 65, 65, 65, 65, 65, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
|
||||
78, 78, 78, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 97, 97, 97,
|
||||
97, 97, 97, 97, 97, 97, 97, 97, 97, 97, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 104, 110, 110, 110, 110, 110, 110, 110, 110, 110,
|
||||
110, 110, 110, 110, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123, 123,
|
||||
132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 132, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 138, 145, 145, 145, 145,
|
||||
145, 145, 145, 145, 145, 145, 145, 145, 145, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 151, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158,
|
||||
158, 158, 158, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 177, 177, 177,
|
||||
177, 177, 177, 177, 177, 177, 177, 177, 177, 177, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 184, 190, 190, 190, 190, 190, 190, 190, 190, 190,
|
||||
190, 190, 190, 190, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 197, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 203, 210, 210,
|
||||
210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 210, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 216, 223, 223, 223, 223, 223, 223, 223, 223,
|
||||
223, 223, 223, 223, 223, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 229, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 236, 242,
|
||||
242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 242, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 255, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_48
|
||||
0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 16, 16, 16,
|
||||
16, 16, 16, 16, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 32, 32, 32, 32,
|
||||
32, 32, 32, 32, 32, 32, 32, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 48, 48, 48, 48,
|
||||
48, 48, 48, 48, 48, 48, 48, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 65, 65, 65,
|
||||
65, 65, 65, 65, 65, 65, 65, 65, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 81, 81,
|
||||
81, 81, 81, 81, 81, 81, 81, 81, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 97, 97,
|
||||
97, 97, 97, 97, 97, 97, 97, 97, 97, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 103, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 113, 113,
|
||||
113, 113, 113, 113, 113, 113, 113, 113, 113, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
|
||||
131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 142, 142, 142, 142, 142, 142, 142, 142, 142,
|
||||
142, 142, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 158, 158, 158, 158, 158, 158, 158, 158, 158,
|
||||
158, 158, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 169, 169, 169, 169, 169, 169, 169, 169, 169, 169, 169, 174, 174, 174, 174, 174, 174, 174, 174,
|
||||
174, 174, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 179, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 190, 190, 190, 190, 190, 190, 190, 190,
|
||||
190, 190, 190, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 207, 207, 207, 207, 207, 207, 207,
|
||||
207, 207, 207, 207, 212, 212, 212, 212, 212, 212, 212, 212, 212, 212, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 217, 223, 223, 223, 223, 223, 223, 223,
|
||||
223, 223, 223, 223, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 234, 239, 239, 239, 239, 239, 239,
|
||||
239, 239, 239, 239, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250, 255, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_64
|
||||
0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 16, 16, 16,
|
||||
16, 16, 16, 16, 16, 20, 20, 20, 20, 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32,
|
||||
32, 32, 32, 32, 32, 36, 36, 36, 36, 36, 36, 36, 36, 40, 40, 40, 40, 40, 40, 40, 40, 44, 44, 44, 44, 44, 44, 44, 44, 48, 48, 48,
|
||||
48, 48, 48, 48, 48, 52, 52, 52, 52, 52, 52, 52, 52, 56, 56, 56, 56, 56, 56, 56, 56, 60, 60, 60, 60, 60, 60, 60, 60, 60, 65, 65,
|
||||
65, 65, 65, 65, 65, 65, 65, 69, 69, 69, 69, 69, 69, 69, 69, 73, 73, 73, 73, 73, 73, 73, 73, 77, 77, 77, 77, 77, 77, 77, 77, 81,
|
||||
81, 81, 81, 81, 81, 81, 81, 85, 85, 85, 85, 85, 85, 85, 85, 89, 89, 89, 89, 89, 89, 89, 89, 93, 93, 93, 93, 93, 93, 93, 93, 97,
|
||||
97, 97, 97, 97, 97, 97, 97, 101, 101, 101, 101, 101, 101, 101, 101, 105, 105, 105, 105, 105, 105, 105, 105, 109, 109, 109, 109, 109, 109, 109, 109, 113,
|
||||
113, 113, 113, 113, 113, 113, 113, 117, 117, 117, 117, 117, 117, 117, 117, 121, 121, 121, 121, 121, 121, 121, 121, 125, 125, 125, 125, 125, 125, 125, 125, 125,
|
||||
130, 130, 130, 130, 130, 130, 130, 130, 130, 134, 134, 134, 134, 134, 134, 134, 134, 138, 138, 138, 138, 138, 138, 138, 138, 142, 142, 142, 142, 142, 142, 142,
|
||||
142, 146, 146, 146, 146, 146, 146, 146, 146, 150, 150, 150, 150, 150, 150, 150, 150, 154, 154, 154, 154, 154, 154, 154, 154, 158, 158, 158, 158, 158, 158, 158,
|
||||
158, 162, 162, 162, 162, 162, 162, 162, 162, 166, 166, 166, 166, 166, 166, 166, 166, 170, 170, 170, 170, 170, 170, 170, 170, 174, 174, 174, 174, 174, 174, 174,
|
||||
174, 178, 178, 178, 178, 178, 178, 178, 178, 182, 182, 182, 182, 182, 182, 182, 182, 186, 186, 186, 186, 186, 186, 186, 186, 190, 190, 190, 190, 190, 190, 190,
|
||||
190, 190, 195, 195, 195, 195, 195, 195, 195, 195, 195, 199, 199, 199, 199, 199, 199, 199, 199, 203, 203, 203, 203, 203, 203, 203, 203, 207, 207, 207, 207, 207,
|
||||
207, 207, 207, 211, 211, 211, 211, 211, 211, 211, 211, 215, 215, 215, 215, 215, 215, 215, 215, 219, 219, 219, 219, 219, 219, 219, 219, 223, 223, 223, 223, 223,
|
||||
223, 223, 223, 227, 227, 227, 227, 227, 227, 227, 227, 231, 231, 231, 231, 231, 231, 231, 231, 235, 235, 235, 235, 235, 235, 235, 235, 239, 239, 239, 239, 239,
|
||||
239, 239, 239, 243, 243, 243, 243, 243, 243, 243, 243, 247, 247, 247, 247, 247, 247, 247, 247, 251, 251, 251, 251, 251, 251, 251, 251, 255, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_80
|
||||
0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 9, 9, 9, 9, 9, 9, 9, 13, 13, 13, 13, 13, 13, 13, 16, 16,
|
||||
16, 16, 16, 16, 19, 19, 19, 19, 19, 19, 22, 22, 22, 22, 22, 22, 25, 25, 25, 25, 25, 25, 25, 29, 29, 29, 29, 29, 29, 29, 32, 32,
|
||||
32, 32, 32, 32, 35, 35, 35, 35, 35, 35, 38, 38, 38, 38, 38, 38, 38, 42, 42, 42, 42, 42, 42, 42, 45, 45, 45, 45, 45, 45, 48, 48,
|
||||
48, 48, 48, 48, 51, 51, 51, 51, 51, 51, 54, 54, 54, 54, 54, 54, 54, 58, 58, 58, 58, 58, 58, 58, 61, 61, 61, 61, 61, 61, 64, 64,
|
||||
64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 67, 71, 71, 71, 71, 71, 71, 71, 74, 74, 74, 74, 74, 74, 77, 77, 77, 77, 77, 77, 80, 80,
|
||||
80, 80, 80, 80, 83, 83, 83, 83, 83, 83, 83, 87, 87, 87, 87, 87, 87, 87, 90, 90, 90, 90, 90, 90, 93, 93, 93, 93, 93, 93, 96, 96,
|
||||
96, 96, 96, 96, 96, 100, 100, 100, 100, 100, 100, 100, 103, 103, 103, 103, 103, 103, 106, 106, 106, 106, 106, 106, 109, 109, 109, 109, 109, 109, 112, 112,
|
||||
112, 112, 112, 112, 112, 116, 116, 116, 116, 116, 116, 116, 119, 119, 119, 119, 119, 119, 122, 122, 122, 122, 122, 122, 125, 125, 125, 125, 125, 125, 125, 125,
|
||||
130, 130, 130, 130, 130, 130, 130, 130, 133, 133, 133, 133, 133, 133, 136, 136, 136, 136, 136, 136, 139, 139, 139, 139, 139, 139, 139, 143, 143, 143, 143, 143,
|
||||
143, 143, 146, 146, 146, 146, 146, 146, 149, 149, 149, 149, 149, 149, 152, 152, 152, 152, 152, 152, 155, 155, 155, 155, 155, 155, 155, 159, 159, 159, 159, 159,
|
||||
159, 159, 162, 162, 162, 162, 162, 162, 165, 165, 165, 165, 165, 165, 168, 168, 168, 168, 168, 168, 168, 172, 172, 172, 172, 172, 172, 172, 175, 175, 175, 175,
|
||||
175, 175, 178, 178, 178, 178, 178, 178, 181, 181, 181, 181, 181, 181, 184, 184, 184, 184, 184, 184, 184, 188, 188, 188, 188, 188, 188, 188, 191, 191, 191, 191,
|
||||
191, 191, 194, 194, 194, 194, 194, 194, 197, 197, 197, 197, 197, 197, 197, 201, 201, 201, 201, 201, 201, 201, 204, 204, 204, 204, 204, 204, 207, 207, 207, 207,
|
||||
207, 207, 210, 210, 210, 210, 210, 210, 213, 213, 213, 213, 213, 213, 213, 217, 217, 217, 217, 217, 217, 217, 220, 220, 220, 220, 220, 220, 223, 223, 223, 223,
|
||||
223, 223, 226, 226, 226, 226, 226, 226, 226, 230, 230, 230, 230, 230, 230, 230, 233, 233, 233, 233, 233, 233, 236, 236, 236, 236, 236, 236, 239, 239, 239, 239,
|
||||
239, 239, 242, 242, 242, 242, 242, 242, 242, 246, 246, 246, 246, 246, 246, 246, 249, 249, 249, 249, 249, 249, 252, 252, 252, 252, 252, 252, 255, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_96
|
||||
0, 0, 0, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 13, 13, 13, 13, 13, 13, 16, 16,
|
||||
16, 16, 16, 18, 18, 18, 18, 18, 21, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 26, 26, 26, 26, 26, 29, 29, 29, 29, 29, 29, 32, 32,
|
||||
32, 32, 32, 32, 35, 35, 35, 35, 35, 37, 37, 37, 37, 37, 40, 40, 40, 40, 40, 40, 43, 43, 43, 43, 43, 45, 45, 45, 45, 45, 48, 48,
|
||||
48, 48, 48, 48, 51, 51, 51, 51, 51, 53, 53, 53, 53, 53, 56, 56, 56, 56, 56, 56, 59, 59, 59, 59, 59, 61, 61, 61, 61, 61, 64, 64,
|
||||
64, 64, 64, 64, 67, 67, 67, 67, 67, 67, 70, 70, 70, 70, 70, 72, 72, 72, 72, 72, 75, 75, 75, 75, 75, 75, 78, 78, 78, 78, 78, 80,
|
||||
80, 80, 80, 80, 83, 83, 83, 83, 83, 83, 86, 86, 86, 86, 86, 88, 88, 88, 88, 88, 91, 91, 91, 91, 91, 91, 94, 94, 94, 94, 94, 96,
|
||||
96, 96, 96, 96, 99, 99, 99, 99, 99, 99, 102, 102, 102, 102, 102, 104, 104, 104, 104, 104, 107, 107, 107, 107, 107, 107, 110, 110, 110, 110, 110, 112,
|
||||
112, 112, 112, 112, 115, 115, 115, 115, 115, 115, 118, 118, 118, 118, 118, 120, 120, 120, 120, 120, 123, 123, 123, 123, 123, 123, 126, 126, 126, 126, 126, 126,
|
||||
129, 129, 129, 129, 129, 129, 132, 132, 132, 132, 132, 132, 135, 135, 135, 135, 135, 137, 137, 137, 137, 137, 140, 140, 140, 140, 140, 140, 143, 143, 143, 143,
|
||||
143, 145, 145, 145, 145, 145, 148, 148, 148, 148, 148, 148, 151, 151, 151, 151, 151, 153, 153, 153, 153, 153, 156, 156, 156, 156, 156, 156, 159, 159, 159, 159,
|
||||
159, 161, 161, 161, 161, 161, 164, 164, 164, 164, 164, 164, 167, 167, 167, 167, 167, 169, 169, 169, 169, 169, 172, 172, 172, 172, 172, 172, 175, 175, 175, 175,
|
||||
175, 177, 177, 177, 177, 177, 180, 180, 180, 180, 180, 180, 183, 183, 183, 183, 183, 185, 185, 185, 185, 185, 188, 188, 188, 188, 188, 188, 191, 191, 191, 191,
|
||||
191, 191, 194, 194, 194, 194, 194, 196, 196, 196, 196, 196, 199, 199, 199, 199, 199, 199, 202, 202, 202, 202, 202, 204, 204, 204, 204, 204, 207, 207, 207, 207,
|
||||
207, 207, 210, 210, 210, 210, 210, 212, 212, 212, 212, 212, 215, 215, 215, 215, 215, 215, 218, 218, 218, 218, 218, 220, 220, 220, 220, 220, 223, 223, 223, 223,
|
||||
223, 223, 226, 226, 226, 226, 226, 226, 229, 229, 229, 229, 229, 231, 231, 231, 231, 231, 234, 234, 234, 234, 234, 234, 237, 237, 237, 237, 237, 239, 239, 239,
|
||||
239, 239, 242, 242, 242, 242, 242, 242, 245, 245, 245, 245, 245, 247, 247, 247, 247, 247, 250, 250, 250, 250, 250, 250, 253, 253, 253, 253, 253, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_128
|
||||
0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, 10, 10, 10, 10, 12, 12, 12, 12, 14, 14, 14, 14, 16,
|
||||
16, 16, 16, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22, 22, 24, 24, 24, 24, 26, 26, 26, 26, 28, 28, 28, 28, 30, 30, 30, 30, 32,
|
||||
32, 32, 32, 34, 34, 34, 34, 36, 36, 36, 36, 38, 38, 38, 38, 40, 40, 40, 40, 42, 42, 42, 42, 44, 44, 44, 44, 46, 46, 46, 46, 48,
|
||||
48, 48, 48, 50, 50, 50, 50, 52, 52, 52, 52, 54, 54, 54, 54, 56, 56, 56, 56, 58, 58, 58, 58, 60, 60, 60, 60, 62, 62, 62, 62, 64,
|
||||
64, 64, 64, 66, 66, 66, 66, 68, 68, 68, 68, 70, 70, 70, 70, 72, 72, 72, 72, 74, 74, 74, 74, 76, 76, 76, 76, 78, 78, 78, 78, 80,
|
||||
80, 80, 80, 82, 82, 82, 82, 84, 84, 84, 84, 86, 86, 86, 86, 88, 88, 88, 88, 90, 90, 90, 90, 92, 92, 92, 92, 94, 94, 94, 94, 96,
|
||||
96, 96, 96, 98, 98, 98, 98, 100, 100, 100, 100, 102, 102, 102, 102, 104, 104, 104, 104, 106, 106, 106, 106, 108, 108, 108, 108, 110, 110, 110, 110, 112,
|
||||
112, 112, 112, 114, 114, 114, 114, 116, 116, 116, 116, 118, 118, 118, 118, 120, 120, 120, 120, 122, 122, 122, 122, 124, 124, 124, 124, 126, 126, 126, 126, 126,
|
||||
129, 129, 129, 129, 129, 131, 131, 131, 131, 133, 133, 133, 133, 135, 135, 135, 135, 137, 137, 137, 137, 139, 139, 139, 139, 141, 141, 141, 141, 143, 143, 143,
|
||||
143, 145, 145, 145, 145, 147, 147, 147, 147, 149, 149, 149, 149, 151, 151, 151, 151, 153, 153, 153, 153, 155, 155, 155, 155, 157, 157, 157, 157, 159, 159, 159,
|
||||
159, 161, 161, 161, 161, 163, 163, 163, 163, 165, 165, 165, 165, 167, 167, 167, 167, 169, 169, 169, 169, 171, 171, 171, 171, 173, 173, 173, 173, 175, 175, 175,
|
||||
175, 177, 177, 177, 177, 179, 179, 179, 179, 181, 181, 181, 181, 183, 183, 183, 183, 185, 185, 185, 185, 187, 187, 187, 187, 189, 189, 189, 189, 191, 191, 191,
|
||||
191, 193, 193, 193, 193, 195, 195, 195, 195, 197, 197, 197, 197, 199, 199, 199, 199, 201, 201, 201, 201, 203, 203, 203, 203, 205, 205, 205, 205, 207, 207, 207,
|
||||
207, 209, 209, 209, 209, 211, 211, 211, 211, 213, 213, 213, 213, 215, 215, 215, 215, 217, 217, 217, 217, 219, 219, 219, 219, 221, 221, 221, 221, 223, 223, 223,
|
||||
223, 225, 225, 225, 225, 227, 227, 227, 227, 229, 229, 229, 229, 231, 231, 231, 231, 233, 233, 233, 233, 235, 235, 235, 235, 237, 237, 237, 237, 239, 239, 239,
|
||||
239, 241, 241, 241, 241, 243, 243, 243, 243, 245, 245, 245, 245, 247, 247, 247, 247, 249, 249, 249, 249, 251, 251, 251, 251, 253, 253, 253, 253, 255, 255, 255
|
||||
},
|
||||
{ // QUANT_160
|
||||
0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 9, 9, 9, 11, 11, 11, 12, 12, 12, 14, 14, 14, 14, 16,
|
||||
16, 16, 17, 17, 17, 19, 19, 19, 20, 20, 20, 22, 22, 22, 22, 24, 24, 24, 25, 25, 25, 27, 27, 27, 28, 28, 28, 30, 30, 30, 30, 32,
|
||||
32, 32, 33, 33, 33, 35, 35, 35, 36, 36, 36, 38, 38, 38, 38, 40, 40, 40, 41, 41, 41, 43, 43, 43, 44, 44, 44, 46, 46, 46, 46, 48,
|
||||
48, 48, 49, 49, 49, 51, 51, 51, 52, 52, 52, 54, 54, 54, 54, 56, 56, 56, 57, 57, 57, 59, 59, 59, 60, 60, 60, 62, 62, 62, 62, 64,
|
||||
64, 64, 65, 65, 65, 67, 67, 67, 68, 68, 68, 70, 70, 70, 70, 72, 72, 72, 73, 73, 73, 75, 75, 75, 76, 76, 76, 78, 78, 78, 78, 80,
|
||||
80, 80, 81, 81, 81, 83, 83, 83, 84, 84, 84, 86, 86, 86, 86, 88, 88, 88, 89, 89, 89, 91, 91, 91, 92, 92, 92, 94, 94, 94, 94, 96,
|
||||
96, 96, 97, 97, 97, 99, 99, 99, 100, 100, 100, 102, 102, 102, 102, 104, 104, 104, 105, 105, 105, 107, 107, 107, 108, 108, 108, 110, 110, 110, 110, 112,
|
||||
112, 112, 113, 113, 113, 115, 115, 115, 116, 116, 116, 118, 118, 118, 118, 120, 120, 120, 121, 121, 121, 123, 123, 123, 124, 124, 124, 126, 126, 126, 126, 126,
|
||||
129, 129, 129, 129, 129, 131, 131, 131, 132, 132, 132, 134, 134, 134, 135, 135, 135, 137, 137, 137, 137, 139, 139, 139, 140, 140, 140, 142, 142, 142, 143, 143,
|
||||
143, 145, 145, 145, 145, 147, 147, 147, 148, 148, 148, 150, 150, 150, 151, 151, 151, 153, 153, 153, 153, 155, 155, 155, 156, 156, 156, 158, 158, 158, 159, 159,
|
||||
159, 161, 161, 161, 161, 163, 163, 163, 164, 164, 164, 166, 166, 166, 167, 167, 167, 169, 169, 169, 169, 171, 171, 171, 172, 172, 172, 174, 174, 174, 175, 175,
|
||||
175, 177, 177, 177, 177, 179, 179, 179, 180, 180, 180, 182, 182, 182, 183, 183, 183, 185, 185, 185, 185, 187, 187, 187, 188, 188, 188, 190, 190, 190, 191, 191,
|
||||
191, 193, 193, 193, 193, 195, 195, 195, 196, 196, 196, 198, 198, 198, 199, 199, 199, 201, 201, 201, 201, 203, 203, 203, 204, 204, 204, 206, 206, 206, 207, 207,
|
||||
207, 209, 209, 209, 209, 211, 211, 211, 212, 212, 212, 214, 214, 214, 215, 215, 215, 217, 217, 217, 217, 219, 219, 219, 220, 220, 220, 222, 222, 222, 223, 223,
|
||||
223, 225, 225, 225, 225, 227, 227, 227, 228, 228, 228, 230, 230, 230, 231, 231, 231, 233, 233, 233, 233, 235, 235, 235, 236, 236, 236, 238, 238, 238, 239, 239,
|
||||
239, 241, 241, 241, 241, 243, 243, 243, 244, 244, 244, 246, 246, 246, 247, 247, 247, 249, 249, 249, 249, 251, 251, 251, 252, 252, 252, 254, 254, 254, 255, 255
|
||||
},
|
||||
{ // QUANT_192
|
||||
0, 0, 1, 1, 2, 2, 2, 4, 4, 4, 5, 5, 6, 6, 6, 8, 8, 8, 9, 9, 10, 10, 10, 12, 12, 12, 13, 13, 14, 14, 14, 16,
|
||||
16, 16, 17, 17, 18, 18, 18, 20, 20, 20, 21, 21, 22, 22, 22, 24, 24, 24, 25, 25, 26, 26, 26, 28, 28, 28, 29, 29, 30, 30, 30, 32,
|
||||
32, 32, 33, 33, 34, 34, 34, 36, 36, 36, 37, 37, 38, 38, 38, 40, 40, 40, 41, 41, 42, 42, 42, 44, 44, 44, 45, 45, 46, 46, 46, 48,
|
||||
48, 48, 49, 49, 50, 50, 50, 52, 52, 52, 53, 53, 54, 54, 54, 56, 56, 56, 57, 57, 58, 58, 58, 60, 60, 60, 61, 61, 62, 62, 62, 64,
|
||||
64, 64, 65, 65, 66, 66, 66, 68, 68, 68, 69, 69, 70, 70, 70, 72, 72, 72, 73, 73, 74, 74, 74, 76, 76, 76, 77, 77, 78, 78, 78, 80,
|
||||
80, 80, 81, 81, 82, 82, 82, 84, 84, 84, 85, 85, 86, 86, 86, 88, 88, 88, 89, 89, 90, 90, 90, 92, 92, 92, 93, 93, 94, 94, 94, 96,
|
||||
96, 96, 97, 97, 98, 98, 98, 100, 100, 100, 101, 101, 102, 102, 102, 104, 104, 104, 105, 105, 106, 106, 106, 108, 108, 108, 109, 109, 110, 110, 110, 112,
|
||||
112, 112, 113, 113, 114, 114, 114, 116, 116, 116, 117, 117, 118, 118, 118, 120, 120, 120, 121, 121, 122, 122, 122, 124, 124, 124, 125, 125, 126, 126, 126, 126,
|
||||
129, 129, 129, 129, 130, 130, 131, 131, 131, 133, 133, 133, 134, 134, 135, 135, 135, 137, 137, 137, 138, 138, 139, 139, 139, 141, 141, 141, 142, 142, 143, 143,
|
||||
143, 145, 145, 145, 146, 146, 147, 147, 147, 149, 149, 149, 150, 150, 151, 151, 151, 153, 153, 153, 154, 154, 155, 155, 155, 157, 157, 157, 158, 158, 159, 159,
|
||||
159, 161, 161, 161, 162, 162, 163, 163, 163, 165, 165, 165, 166, 166, 167, 167, 167, 169, 169, 169, 170, 170, 171, 171, 171, 173, 173, 173, 174, 174, 175, 175,
|
||||
175, 177, 177, 177, 178, 178, 179, 179, 179, 181, 181, 181, 182, 182, 183, 183, 183, 185, 185, 185, 186, 186, 187, 187, 187, 189, 189, 189, 190, 190, 191, 191,
|
||||
191, 193, 193, 193, 194, 194, 195, 195, 195, 197, 197, 197, 198, 198, 199, 199, 199, 201, 201, 201, 202, 202, 203, 203, 203, 205, 205, 205, 206, 206, 207, 207,
|
||||
207, 209, 209, 209, 210, 210, 211, 211, 211, 213, 213, 213, 214, 214, 215, 215, 215, 217, 217, 217, 218, 218, 219, 219, 219, 221, 221, 221, 222, 222, 223, 223,
|
||||
223, 225, 225, 225, 226, 226, 227, 227, 227, 229, 229, 229, 230, 230, 231, 231, 231, 233, 233, 233, 234, 234, 235, 235, 235, 237, 237, 237, 238, 238, 239, 239,
|
||||
239, 241, 241, 241, 242, 242, 243, 243, 243, 245, 245, 245, 246, 246, 247, 247, 247, 249, 249, 249, 250, 250, 251, 251, 251, 253, 253, 253, 254, 254, 255, 255
|
||||
},
|
||||
{ // QUANT_256
|
||||
0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15,
|
||||
16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31,
|
||||
32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47,
|
||||
48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63,
|
||||
64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
|
||||
80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87, 88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 95, 95,
|
||||
96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 103, 104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
|
||||
112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119, 120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127,
|
||||
128, 128, 129, 129, 130, 130, 131, 131, 132, 132, 133, 133, 134, 134, 135, 135, 136, 136, 137, 137, 138, 138, 139, 139, 140, 140, 141, 141, 142, 142, 143, 143,
|
||||
144, 144, 145, 145, 146, 146, 147, 147, 148, 148, 149, 149, 150, 150, 151, 151, 152, 152, 153, 153, 154, 154, 155, 155, 156, 156, 157, 157, 158, 158, 159, 159,
|
||||
160, 160, 161, 161, 162, 162, 163, 163, 164, 164, 165, 165, 166, 166, 167, 167, 168, 168, 169, 169, 170, 170, 171, 171, 172, 172, 173, 173, 174, 174, 175, 175,
|
||||
176, 176, 177, 177, 178, 178, 179, 179, 180, 180, 181, 181, 182, 182, 183, 183, 184, 184, 185, 185, 186, 186, 187, 187, 188, 188, 189, 189, 190, 190, 191, 191,
|
||||
192, 192, 193, 193, 194, 194, 195, 195, 196, 196, 197, 197, 198, 198, 199, 199, 200, 200, 201, 201, 202, 202, 203, 203, 204, 204, 205, 205, 206, 206, 207, 207,
|
||||
208, 208, 209, 209, 210, 210, 211, 211, 212, 212, 213, 213, 214, 214, 215, 215, 216, 216, 217, 217, 218, 218, 219, 219, 220, 220, 221, 221, 222, 222, 223, 223,
|
||||
224, 224, 225, 225, 226, 226, 227, 227, 228, 228, 229, 229, 230, 230, 231, 231, 232, 232, 233, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238, 239, 239,
|
||||
240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255, 255
|
||||
},
|
||||
};
|
||||
|
||||
// Starts from QUANT_6
|
||||
// Scrambled
|
||||
const uint8_t color_uquant_to_scrambled_pquant_tables[17][256] {
|
||||
{ // QUANT_6
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
|
||||
},
|
||||
{ // QUANT_8
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7,
|
||||
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
|
||||
},
|
||||
{ // QUANT_10
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7,
|
||||
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
|
||||
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
|
||||
},
|
||||
{ // QUANT_12
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
||||
11, 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7, 7,
|
||||
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
|
||||
},
|
||||
{ // QUANT_16
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
|
||||
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
|
||||
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
|
||||
9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
||||
11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
||||
12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
|
||||
13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15
|
||||
},
|
||||
{ // QUANT_20
|
||||
0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4,
|
||||
4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16,
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 10, 10, 10, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 14, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
|
||||
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11,
|
||||
11, 11, 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7,
|
||||
7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
|
||||
17, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 9, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, 5,
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1
|
||||
},
|
||||
{ // QUANT_24
|
||||
0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
|
||||
8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 10, 10,
|
||||
10, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12,
|
||||
12, 12, 12, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 6, 6,
|
||||
6, 6, 6, 6, 6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 14,
|
||||
14, 14, 14, 14, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
|
||||
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7, 7, 7, 7, 7,
|
||||
7, 7, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 13, 13, 13,
|
||||
13, 13, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5, 5, 5, 5, 5,
|
||||
5, 5, 5, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 11,
|
||||
11, 11, 11, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3, 3, 3, 3,
|
||||
3, 3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 9,
|
||||
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1
|
||||
},
|
||||
{ // QUANT_32
|
||||
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
|
||||
4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
|
||||
6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8,
|
||||
8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10,
|
||||
10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
|
||||
12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14,
|
||||
14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
|
||||
18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
|
||||
19, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21,
|
||||
21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23,
|
||||
23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25,
|
||||
25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
|
||||
27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29,
|
||||
29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31
|
||||
},
|
||||
{ // QUANT_40
|
||||
0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16,
|
||||
24, 24, 24, 24, 24, 24, 24, 32, 32, 32, 32, 32, 32, 32, 2, 2,
|
||||
2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 18, 18, 18, 18, 18, 18,
|
||||
18, 26, 26, 26, 26, 26, 26, 34, 34, 34, 34, 34, 34, 34, 4, 4,
|
||||
4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 20, 20, 20, 20, 20,
|
||||
20, 28, 28, 28, 28, 28, 28, 28, 36, 36, 36, 36, 36, 36, 36, 6,
|
||||
6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 22, 22, 22, 22, 22,
|
||||
22, 22, 30, 30, 30, 30, 30, 30, 38, 38, 38, 38, 38, 38, 38, 38,
|
||||
39, 39, 39, 39, 39, 39, 39, 39, 31, 31, 31, 31, 31, 31, 23, 23,
|
||||
23, 23, 23, 23, 23, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7,
|
||||
7, 37, 37, 37, 37, 37, 37, 37, 29, 29, 29, 29, 29, 29, 29, 21,
|
||||
21, 21, 21, 21, 21, 13, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5,
|
||||
5, 5, 35, 35, 35, 35, 35, 35, 35, 27, 27, 27, 27, 27, 27, 19,
|
||||
19, 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3,
|
||||
3, 3, 33, 33, 33, 33, 33, 33, 33, 25, 25, 25, 25, 25, 25, 25,
|
||||
17, 17, 17, 17, 17, 17, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1
|
||||
},
|
||||
{ // QUANT_48
|
||||
0, 0, 0, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 2, 2,
|
||||
2, 2, 2, 18, 18, 18, 18, 18, 34, 34, 34, 34, 34, 34, 4, 4,
|
||||
4, 4, 4, 20, 20, 20, 20, 20, 20, 36, 36, 36, 36, 36, 6, 6,
|
||||
6, 6, 6, 22, 22, 22, 22, 22, 22, 38, 38, 38, 38, 38, 38, 8,
|
||||
8, 8, 8, 8, 24, 24, 24, 24, 24, 40, 40, 40, 40, 40, 40, 10,
|
||||
10, 10, 10, 10, 26, 26, 26, 26, 26, 42, 42, 42, 42, 42, 42, 12,
|
||||
12, 12, 12, 12, 28, 28, 28, 28, 28, 28, 44, 44, 44, 44, 44, 14,
|
||||
14, 14, 14, 14, 30, 30, 30, 30, 30, 30, 46, 46, 46, 46, 46, 46,
|
||||
47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 15, 15, 15, 15,
|
||||
15, 45, 45, 45, 45, 45, 29, 29, 29, 29, 29, 29, 13, 13, 13, 13,
|
||||
13, 43, 43, 43, 43, 43, 43, 27, 27, 27, 27, 27, 11, 11, 11, 11,
|
||||
11, 41, 41, 41, 41, 41, 41, 25, 25, 25, 25, 25, 9, 9, 9, 9,
|
||||
9, 39, 39, 39, 39, 39, 39, 23, 23, 23, 23, 23, 23, 7, 7, 7,
|
||||
7, 7, 37, 37, 37, 37, 37, 21, 21, 21, 21, 21, 21, 5, 5, 5,
|
||||
5, 5, 35, 35, 35, 35, 35, 35, 19, 19, 19, 19, 19, 3, 3, 3,
|
||||
3, 3, 33, 33, 33, 33, 33, 33, 17, 17, 17, 17, 17, 1, 1, 1
|
||||
},
|
||||
{ // QUANT_64
|
||||
0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,
|
||||
4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8,
|
||||
8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12,
|
||||
12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 15, 16,
|
||||
16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20,
|
||||
20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24,
|
||||
24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 28,
|
||||
28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 31,
|
||||
32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
|
||||
36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39,
|
||||
40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43,
|
||||
44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47,
|
||||
47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51,
|
||||
51, 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55,
|
||||
55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59,
|
||||
59, 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63
|
||||
},
|
||||
{ // QUANT_80
|
||||
0, 0, 16, 16, 16, 32, 32, 32, 48, 48, 48, 64, 64, 64, 64, 2,
|
||||
2, 2, 18, 18, 18, 34, 34, 34, 50, 50, 50, 66, 66, 66, 66, 4,
|
||||
4, 4, 20, 20, 20, 36, 36, 36, 52, 52, 52, 52, 68, 68, 68, 6,
|
||||
6, 6, 22, 22, 22, 38, 38, 38, 54, 54, 54, 54, 70, 70, 70, 8,
|
||||
8, 8, 24, 24, 24, 40, 40, 40, 40, 56, 56, 56, 72, 72, 72, 10,
|
||||
10, 10, 26, 26, 26, 42, 42, 42, 42, 58, 58, 58, 74, 74, 74, 12,
|
||||
12, 12, 28, 28, 28, 28, 44, 44, 44, 60, 60, 60, 76, 76, 76, 14,
|
||||
14, 14, 30, 30, 30, 30, 46, 46, 46, 62, 62, 62, 78, 78, 78, 78,
|
||||
79, 79, 79, 79, 63, 63, 63, 47, 47, 47, 31, 31, 31, 31, 15, 15,
|
||||
15, 77, 77, 77, 61, 61, 61, 45, 45, 45, 29, 29, 29, 29, 13, 13,
|
||||
13, 75, 75, 75, 59, 59, 59, 43, 43, 43, 43, 27, 27, 27, 11, 11,
|
||||
11, 73, 73, 73, 57, 57, 57, 41, 41, 41, 41, 25, 25, 25, 9, 9,
|
||||
9, 71, 71, 71, 55, 55, 55, 55, 39, 39, 39, 23, 23, 23, 7, 7,
|
||||
7, 69, 69, 69, 53, 53, 53, 53, 37, 37, 37, 21, 21, 21, 5, 5,
|
||||
5, 67, 67, 67, 67, 51, 51, 51, 35, 35, 35, 19, 19, 19, 3, 3,
|
||||
3, 65, 65, 65, 65, 49, 49, 49, 33, 33, 33, 17, 17, 17, 1, 1
|
||||
},
|
||||
{ // QUANT_96
|
||||
0, 32, 32, 32, 64, 64, 64, 2, 2, 34, 34, 34, 66, 66, 66, 4,
|
||||
4, 36, 36, 36, 68, 68, 68, 6, 6, 38, 38, 38, 70, 70, 70, 8,
|
||||
8, 8, 40, 40, 72, 72, 72, 10, 10, 10, 42, 42, 74, 74, 74, 12,
|
||||
12, 12, 44, 44, 76, 76, 76, 14, 14, 14, 46, 46, 78, 78, 78, 16,
|
||||
16, 16, 48, 48, 48, 80, 80, 80, 18, 18, 50, 50, 50, 82, 82, 82,
|
||||
20, 20, 52, 52, 52, 84, 84, 84, 22, 22, 54, 54, 54, 86, 86, 86,
|
||||
24, 24, 56, 56, 56, 88, 88, 88, 26, 26, 58, 58, 58, 90, 90, 90,
|
||||
28, 28, 60, 60, 60, 92, 92, 92, 30, 30, 62, 62, 62, 94, 94, 94,
|
||||
95, 95, 95, 63, 63, 63, 31, 31, 93, 93, 93, 61, 61, 61, 29, 29,
|
||||
91, 91, 91, 59, 59, 59, 27, 27, 89, 89, 89, 57, 57, 57, 25, 25,
|
||||
87, 87, 87, 55, 55, 55, 23, 23, 85, 85, 85, 53, 53, 53, 21, 21,
|
||||
83, 83, 83, 51, 51, 51, 19, 19, 81, 81, 81, 49, 49, 49, 17, 17,
|
||||
17, 79, 79, 79, 47, 47, 15, 15, 15, 77, 77, 77, 45, 45, 13, 13,
|
||||
13, 75, 75, 75, 43, 43, 11, 11, 11, 73, 73, 73, 41, 41, 9, 9,
|
||||
9, 71, 71, 71, 39, 39, 39, 7, 7, 69, 69, 69, 37, 37, 37, 5,
|
||||
5, 67, 67, 67, 35, 35, 35, 3, 3, 65, 65, 65, 33, 33, 33, 1
|
||||
},
|
||||
{ // QUANT_128
|
||||
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
|
||||
8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16,
|
||||
16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24,
|
||||
24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32,
|
||||
32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40,
|
||||
40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48,
|
||||
48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56,
|
||||
56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 63,
|
||||
64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71,
|
||||
72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79,
|
||||
80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87,
|
||||
88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 95, 95,
|
||||
96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 103,
|
||||
104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111,
|
||||
112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119,
|
||||
120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127
|
||||
},
|
||||
{ // QUANT_160
|
||||
0, 32, 64, 64, 96, 128, 128, 128, 2, 34, 66, 66, 98, 130, 130, 130,
|
||||
4, 36, 68, 68, 100, 132, 132, 132, 6, 38, 70, 70, 102, 134, 134, 134,
|
||||
8, 40, 72, 72, 104, 136, 136, 136, 10, 42, 74, 74, 106, 138, 138, 138,
|
||||
12, 44, 76, 76, 108, 140, 140, 140, 14, 46, 78, 78, 110, 142, 142, 142,
|
||||
16, 48, 80, 80, 112, 144, 144, 144, 18, 50, 82, 82, 114, 146, 146, 146,
|
||||
20, 52, 84, 84, 116, 148, 148, 148, 22, 54, 86, 86, 118, 150, 150, 150,
|
||||
24, 56, 88, 88, 120, 152, 152, 152, 26, 58, 90, 90, 122, 154, 154, 154,
|
||||
28, 60, 92, 92, 124, 156, 156, 156, 30, 62, 94, 94, 126, 158, 158, 158,
|
||||
159, 159, 159, 127, 95, 95, 63, 31, 157, 157, 157, 125, 93, 93, 61, 29,
|
||||
155, 155, 155, 123, 91, 91, 59, 27, 153, 153, 153, 121, 89, 89, 57, 25,
|
||||
151, 151, 151, 119, 87, 87, 55, 23, 149, 149, 149, 117, 85, 85, 53, 21,
|
||||
147, 147, 147, 115, 83, 83, 51, 19, 145, 145, 145, 113, 81, 81, 49, 17,
|
||||
143, 143, 143, 111, 79, 79, 47, 15, 141, 141, 141, 109, 77, 77, 45, 13,
|
||||
139, 139, 139, 107, 75, 75, 43, 11, 137, 137, 137, 105, 73, 73, 41, 9,
|
||||
135, 135, 135, 103, 71, 71, 39, 7, 133, 133, 133, 101, 69, 69, 37, 5,
|
||||
131, 131, 131, 99, 67, 67, 35, 3, 129, 129, 129, 97, 65, 65, 33, 1
|
||||
},
|
||||
{ // QUANT_192
|
||||
0, 64, 128, 128, 2, 66, 130, 130, 4, 68, 132, 132, 6, 70, 134, 134,
|
||||
8, 72, 136, 136, 10, 74, 138, 138, 12, 76, 140, 140, 14, 78, 142, 142,
|
||||
16, 80, 144, 144, 18, 82, 146, 146, 20, 84, 148, 148, 22, 86, 150, 150,
|
||||
24, 88, 152, 152, 26, 90, 154, 154, 28, 92, 156, 156, 30, 94, 158, 158,
|
||||
32, 96, 160, 160, 34, 98, 162, 162, 36, 100, 164, 164, 38, 102, 166, 166,
|
||||
40, 104, 168, 168, 42, 106, 170, 170, 44, 108, 172, 172, 46, 110, 174, 174,
|
||||
48, 112, 176, 176, 50, 114, 178, 178, 52, 116, 180, 180, 54, 118, 182, 182,
|
||||
56, 120, 184, 184, 58, 122, 186, 186, 60, 124, 188, 188, 62, 126, 190, 190,
|
||||
191, 191, 127, 63, 189, 189, 125, 61, 187, 187, 123, 59, 185, 185, 121, 57,
|
||||
183, 183, 119, 55, 181, 181, 117, 53, 179, 179, 115, 51, 177, 177, 113, 49,
|
||||
175, 175, 111, 47, 173, 173, 109, 45, 171, 171, 107, 43, 169, 169, 105, 41,
|
||||
167, 167, 103, 39, 165, 165, 101, 37, 163, 163, 99, 35, 161, 161, 97, 33,
|
||||
159, 159, 95, 31, 157, 157, 93, 29, 155, 155, 91, 27, 153, 153, 89, 25,
|
||||
151, 151, 87, 23, 149, 149, 85, 21, 147, 147, 83, 19, 145, 145, 81, 17,
|
||||
143, 143, 79, 15, 141, 141, 77, 13, 139, 139, 75, 11, 137, 137, 73, 9,
|
||||
135, 135, 71, 7, 133, 133, 69, 5, 131, 131, 67, 3, 129, 129, 65, 1
|
||||
},
|
||||
{ // QUANT_256
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
||||
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
|
||||
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
|
||||
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
|
||||
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
|
||||
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
|
||||
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
|
||||
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
|
||||
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
|
||||
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
|
||||
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
|
||||
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
|
||||
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
// Starts from QUANT_6
|
||||
// Scrambled
|
||||
// 6-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 51, 204, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q6[6] {
|
||||
0, 255, 51, 204, 102, 153
|
||||
};
|
||||
|
||||
// 8-entry table of 8-bit values, stored in ascending order from 0 to 255.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
static const uint8_t color_scrambled_pquant_to_uquant_q8[8] {
|
||||
0, 36, 73, 109, 146, 182, 219, 255
|
||||
};
|
||||
|
||||
// 10-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 28, 227, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q10[10] {
|
||||
0, 255, 28, 227, 56, 199, 84, 171, 113, 142
|
||||
};
|
||||
|
||||
// 12-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 69, 186, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q12[12] {
|
||||
0, 255, 69, 186, 23, 232, 92, 163, 46, 209, 116, 139
|
||||
};
|
||||
|
||||
// 16-entry table of 8-bit values, stored in ascending order; entry i holds 17 * i.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
static const uint8_t color_scrambled_pquant_to_uquant_q16[16] {
|
||||
0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255
|
||||
};
|
||||
|
||||
// 20-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 67, 188, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q20[20] {
|
||||
0, 255, 67, 188, 13, 242, 80, 175, 27, 228, 94, 161, 40, 215, 107, 148,
|
||||
54, 201, 121, 134
|
||||
};
|
||||
|
||||
// 24-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 33, 222, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q24[24] {
|
||||
0, 255, 33, 222, 66, 189, 99, 156, 11, 244, 44, 211, 77, 178, 110, 145,
|
||||
22, 233, 55, 200, 88, 167, 121, 134
|
||||
};
|
||||
|
||||
// 32-entry table of 8-bit values, stored in ascending order from 0 to 255.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
static const uint8_t color_scrambled_pquant_to_uquant_q32[32] {
|
||||
0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123,
|
||||
132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255
|
||||
};
|
||||
|
||||
// 40-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 32, 223, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q40[40] {
|
||||
0, 255, 32, 223, 65, 190, 97, 158, 6, 249, 39, 216, 71, 184, 104, 151,
|
||||
13, 242, 45, 210, 78, 177, 110, 145, 19, 236, 52, 203, 84, 171, 117, 138,
|
||||
26, 229, 58, 197, 91, 164, 123, 132
|
||||
};
|
||||
|
||||
// 48-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 16, 239, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q48[48] {
|
||||
0, 255, 16, 239, 32, 223, 48, 207, 65, 190, 81, 174, 97, 158, 113, 142,
|
||||
5, 250, 21, 234, 38, 217, 54, 201, 70, 185, 86, 169, 103, 152, 119, 136,
|
||||
11, 244, 27, 228, 43, 212, 59, 196, 76, 179, 92, 163, 108, 147, 124, 131
|
||||
};
|
||||
|
||||
// 64-entry table of 8-bit values, stored in ascending order from 0 to 255.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
static const uint8_t color_scrambled_pquant_to_uquant_q64[64] {
|
||||
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
|
||||
65, 69, 73, 77, 81, 85, 89, 93, 97, 101, 105, 109, 113, 117, 121, 125,
|
||||
130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190,
|
||||
195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255,
|
||||
};
|
||||
|
||||
// 80-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 16, 239, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q80[80] {
|
||||
0, 255, 16, 239, 32, 223, 48, 207, 64, 191, 80, 175, 96, 159, 112, 143,
|
||||
3, 252, 19, 236, 35, 220, 51, 204, 67, 188, 83, 172, 100, 155, 116, 139,
|
||||
6, 249, 22, 233, 38, 217, 54, 201, 71, 184, 87, 168, 103, 152, 119, 136,
|
||||
9, 246, 25, 230, 42, 213, 58, 197, 74, 181, 90, 165, 106, 149, 122, 133,
|
||||
13, 242, 29, 226, 45, 210, 61, 194, 77, 178, 93, 162, 109, 146, 125, 130
|
||||
};
|
||||
|
||||
// 96-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 8, 247, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q96[96] {
|
||||
0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
|
||||
64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
|
||||
2, 253, 10, 245, 18, 237, 26, 229, 35, 220, 43, 212, 51, 204, 59, 196,
|
||||
67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
|
||||
5, 250, 13, 242, 21, 234, 29, 226, 37, 218, 45, 210, 53, 202, 61, 194,
|
||||
70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129
|
||||
};
|
||||
|
||||
// 128-entry table of 8-bit values, stored in ascending order from 0 to 255.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
static const uint8_t color_scrambled_pquant_to_uquant_q128[128] {
|
||||
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
|
||||
32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
|
||||
64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
|
||||
96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
|
||||
129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
|
||||
161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
|
||||
193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
|
||||
225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255
|
||||
};
|
||||
|
||||
// 160-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 8, 247, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q160[160] {
|
||||
0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199,
|
||||
64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135,
|
||||
1, 254, 9, 246, 17, 238, 25, 230, 33, 222, 41, 214, 49, 206, 57, 198,
|
||||
65, 190, 73, 182, 81, 174, 89, 166, 97, 158, 105, 150, 113, 142, 121, 134,
|
||||
3, 252, 11, 244, 19, 236, 27, 228, 35, 220, 43, 212, 51, 204, 59, 196,
|
||||
67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132,
|
||||
4, 251, 12, 243, 20, 235, 28, 227, 36, 219, 44, 211, 52, 203, 60, 195,
|
||||
68, 187, 76, 179, 84, 171, 92, 163, 100, 155, 108, 147, 116, 139, 124, 131,
|
||||
6, 249, 14, 241, 22, 233, 30, 225, 38, 217, 46, 209, 54, 201, 62, 193,
|
||||
70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129
|
||||
};
|
||||
|
||||
// 192-entry table of 8-bit values.
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
// Entries alternate between a low value and its 255-complement (0, 255, 4, 251, ...).
static const uint8_t color_scrambled_pquant_to_uquant_q192[192] {
|
||||
0, 255, 4, 251, 8, 247, 12, 243, 16, 239, 20, 235, 24, 231, 28, 227,
|
||||
32, 223, 36, 219, 40, 215, 44, 211, 48, 207, 52, 203, 56, 199, 60, 195,
|
||||
64, 191, 68, 187, 72, 183, 76, 179, 80, 175, 84, 171, 88, 167, 92, 163,
|
||||
96, 159, 100, 155, 104, 151, 108, 147, 112, 143, 116, 139, 120, 135, 124, 131,
|
||||
1, 254, 5, 250, 9, 246, 13, 242, 17, 238, 21, 234, 25, 230, 29, 226,
|
||||
33, 222, 37, 218, 41, 214, 45, 210, 49, 206, 53, 202, 57, 198, 61, 194,
|
||||
65, 190, 69, 186, 73, 182, 77, 178, 81, 174, 85, 170, 89, 166, 93, 162,
|
||||
97, 158, 101, 154, 105, 150, 109, 146, 113, 142, 117, 138, 121, 134, 125, 130,
|
||||
2, 253, 6, 249, 10, 245, 14, 241, 18, 237, 22, 233, 26, 229, 30, 225,
|
||||
34, 221, 38, 217, 42, 213, 46, 209, 50, 205, 54, 201, 58, 197, 62, 193,
|
||||
66, 189, 70, 185, 74, 181, 78, 177, 82, 173, 86, 169, 90, 165, 94, 161,
|
||||
98, 157, 102, 153, 106, 149, 110, 145, 114, 141, 118, 137, 122, 133, 126, 129
|
||||
};
|
||||
|
||||
// 256-entry table of 8-bit values; this is the identity mapping (entry i holds the value i).
// NOTE(review): by its name this maps a scrambled packed-quant index to the unquantized
// 0..255 color value - confirm against the code that indexes it.
static const uint8_t color_scrambled_pquant_to_uquant_q256[256] {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
||||
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
|
||||
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
|
||||
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
|
||||
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
|
||||
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
|
||||
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
|
||||
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
|
||||
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
|
||||
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
|
||||
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
|
||||
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
|
||||
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
|
||||
};
|
||||
|
||||
// Pointers to the 17 per-level color_scrambled_pquant_to_uquant_q* tables, listed in order
// from the 6-entry table to the 256-entry table. Unlike the tables it points to, this array
// is not declared static.
// NOTE(review): the index is presumably (quant level - QUANT_6), so entry 0 is the 6-level
// table - confirm against the code that indexes it.
const uint8_t* color_scrambled_pquant_to_uquant_tables[17] {
|
||||
color_scrambled_pquant_to_uquant_q6,
|
||||
color_scrambled_pquant_to_uquant_q8,
|
||||
color_scrambled_pquant_to_uquant_q10,
|
||||
color_scrambled_pquant_to_uquant_q12,
|
||||
color_scrambled_pquant_to_uquant_q16,
|
||||
color_scrambled_pquant_to_uquant_q20,
|
||||
color_scrambled_pquant_to_uquant_q24,
|
||||
color_scrambled_pquant_to_uquant_q32,
|
||||
color_scrambled_pquant_to_uquant_q40,
|
||||
color_scrambled_pquant_to_uquant_q48,
|
||||
color_scrambled_pquant_to_uquant_q64,
|
||||
color_scrambled_pquant_to_uquant_q80,
|
||||
color_scrambled_pquant_to_uquant_q96,
|
||||
color_scrambled_pquant_to_uquant_q128,
|
||||
color_scrambled_pquant_to_uquant_q160,
|
||||
color_scrambled_pquant_to_uquant_q192,
|
||||
color_scrambled_pquant_to_uquant_q256
|
||||
};
|
||||
|
||||
// The quant_mode_table[integer_count/2][bits] gives us the quantization level for a given integer
|
||||
// count and number of bits that the integer may fit into.
|
||||
// Layout as visible in the initializer: 10 rows of 128 entries each. Row 0 is entirely -1,
// and in every other row r the first 2 * r entries are -1; all remaining entries lie in the
// range 0..20 and never decrease along a row.
// NOTE(review): -1 presumably marks "no quantization level fits in this many bits", and the
// non-negative values are presumably quant_method enum values - confirm against callers.
const int8_t quant_mode_table[10][128] {
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
|
||||
},
|
||||
{
|
||||
-1, -1, 0, 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
|
||||
},
|
||||
{
|
||||
-1, -1, -1, -1, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7,
|
||||
8, 8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
|
||||
},
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
|
||||
4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
|
||||
12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
|
||||
},
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, 1, 1,
|
||||
2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7,
|
||||
8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13,
|
||||
14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
|
||||
},
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0,
|
||||
1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5,
|
||||
5, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10,
|
||||
10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14,
|
||||
15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
|
||||
},
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
|
||||
4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
|
||||
8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11,
|
||||
12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
|
||||
16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
|
||||
},
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2,
|
||||
2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6,
|
||||
6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9,
|
||||
9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13,
|
||||
13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
|
||||
16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19,
|
||||
20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
|
||||
},
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
|
||||
2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
|
||||
5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7,
|
||||
8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10,
|
||||
11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13,
|
||||
14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
|
||||
17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19
|
||||
},
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
|
||||
1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4,
|
||||
4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,
|
||||
6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9,
|
||||
9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
|
||||
12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
|
||||
14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,544 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions for converting between symbolic and physical encodings.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
/**
 * @brief Mirror the bit order of an 8-bit value.
 *
 * Only the low 8 bits of the input contribute to the result; bit 0 trades places with bit 7,
 * bit 1 with bit 6, and so on.
 *
 * @param p   The value to reverse.
 *
 * @return The reversed result, in the range 0 to 255.
 */
static inline int bitrev8(int p)
{
    // Exchange the two nibbles
    int nibbles = ((p >> 4) & 0x0F) | ((p & 0x0F) << 4);

    // Exchange the bit pairs inside each nibble
    int pairs = ((nibbles >> 2) & 0x33) | ((nibbles & 0x33) << 2);

    // Exchange the neighbouring bits inside each pair
    return ((pairs >> 1) & 0x55) | ((pairs & 0x55) << 1);
}
|
||||
|
||||
|
||||
/**
 * @brief Read a small bit field starting at an arbitrary bit offset.
 *
 * The offset is split into a whole-byte part, which advances the pointer, and a 0 to 7 bit
 * remainder. The field is then extracted from a 16-bit little-endian window built from the
 * two bytes at that position, so the field plus the remainder must fit in 16 bits. Both
 * bytes are always read, even when the field lies entirely in the first one.
 *
 * @param bitcount    The number of bits to read.
 * @param bitoffset   The bit offset to read from, relative to @c ptr.
 * @param ptr         The data pointer to read from.
 *
 * @return The read value.
 */
static inline int read_bits(
    int bitcount,
    int bitoffset,
    const uint8_t* ptr
) {
    // Step to the byte that holds the first requested bit
    const uint8_t* base = ptr + (bitoffset >> 3);
    int shift = bitoffset & 7;

    // Two-byte little-endian window covering the field
    int window = (base[1] << 8) | base[0];

    return (window >> shift) & ((1 << bitcount) - 1);
}
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
|
||||
/**
 * @brief Write a small bit field at an arbitrary bit offset.
 *
 * The offset is split into a whole-byte part, which advances the pointer, and a 0 to 7 bit
 * remainder. The field is merged into the two bytes at that position, so the field plus the
 * remainder must fit in 16 bits; anything beyond that is dropped. Bits outside the field are
 * preserved. Both bytes are always read and written back, even when the field lies entirely
 * in the first one.
 *
 * All mask arithmetic is done on unsigned values, so inverting and right-shifting the mask
 * never relies on the implementation-defined right shift of a negative signed integer.
 *
 * @param         value       The value to write; only the low @c bitcount bits are used.
 * @param         bitcount    The number of bits to write, starting from LSB.
 * @param         bitoffset   The bit offset to store at, relative to @c ptr.
 * @param[in,out] ptr         The data pointer to write to.
 */
static inline void write_bits(
    int value,
    int bitcount,
    int bitoffset,
    uint8_t* ptr
) {
    unsigned int field_mask = (1u << bitcount) - 1u;

    // Implicit int -> unsigned conversion is well defined (modulo 2^N) for negative inputs
    unsigned int field = value;
    field &= field_mask;

    ptr += bitoffset >> 3;
    unsigned int shift = bitoffset & 7;

    field <<= shift;
    unsigned int keep_mask = ~(field_mask << shift);

    // Low byte of the 16-bit window
    ptr[0] &= keep_mask;
    ptr[0] |= field;

    // High byte of the 16-bit window
    ptr[1] &= keep_mask >> 8;
    ptr[1] |= field >> 8;
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void symbolic_to_physical(
|
||||
const block_size_descriptor& bsd,
|
||||
const symbolic_compressed_block& scb,
|
||||
uint8_t pcb[16]
|
||||
) {
|
||||
assert(scb.block_type != SYM_BTYPE_ERROR);
|
||||
|
||||
// Constant color block using UNORM16 colors
|
||||
if (scb.block_type == SYM_BTYPE_CONST_U16)
|
||||
{
|
||||
// There is currently no attempt to coalesce larger void-extents
|
||||
static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
|
||||
for (unsigned int i = 0; i < 8; i++)
|
||||
{
|
||||
pcb[i] = cbytes[i];
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
|
||||
{
|
||||
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Constant color block using FP16 colors
|
||||
if (scb.block_type == SYM_BTYPE_CONST_F16)
|
||||
{
|
||||
// There is currently no attempt to coalesce larger void-extents
|
||||
static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
|
||||
for (unsigned int i = 0; i < 8; i++)
|
||||
{
|
||||
pcb[i] = cbytes[i];
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++)
|
||||
{
|
||||
pcb[2 * i + 8] = scb.constant_color[i] & 0xFF;
|
||||
pcb[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned int partition_count = scb.partition_count;
|
||||
|
||||
// Compress the weights.
|
||||
// They are encoded as an ordinary integer-sequence, then bit-reversed
|
||||
uint8_t weightbuf[16] { 0 };
|
||||
|
||||
const auto& bm = bsd.get_block_mode(scb.block_mode);
|
||||
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
int weight_count = di.weight_count;
|
||||
quant_method weight_quant_method = bm.get_weight_quant_mode();
|
||||
float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method));
|
||||
int is_dual_plane = bm.is_dual_plane;
|
||||
|
||||
const auto& qat = quant_and_xfer_tables[weight_quant_method];
|
||||
|
||||
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
|
||||
|
||||
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
|
||||
|
||||
uint8_t weights[64];
|
||||
if (is_dual_plane)
|
||||
{
|
||||
for (int i = 0; i < weight_count; i++)
|
||||
{
|
||||
float uqw = static_cast<float>(scb.weights[i]);
|
||||
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
|
||||
int qwi = static_cast<int>(qw + 0.5f);
|
||||
weights[2 * i] = qat.scramble_map[qwi];
|
||||
|
||||
uqw = static_cast<float>(scb.weights[i + WEIGHTS_PLANE2_OFFSET]);
|
||||
qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
|
||||
qwi = static_cast<int>(qw + 0.5f);
|
||||
weights[2 * i + 1] = qat.scramble_map[qwi];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < weight_count; i++)
|
||||
{
|
||||
float uqw = static_cast<float>(scb.weights[i]);
|
||||
float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f);
|
||||
int qwi = static_cast<int>(qw + 0.5f);
|
||||
weights[i] = qat.scramble_map[qwi];
|
||||
}
|
||||
}
|
||||
|
||||
encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0);
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
pcb[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i]));
|
||||
}
|
||||
|
||||
write_bits(scb.block_mode, 11, 0, pcb);
|
||||
write_bits(partition_count - 1, 2, 11, pcb);
|
||||
|
||||
int below_weights_pos = 128 - bits_for_weights;
|
||||
|
||||
// Encode partition index and color endpoint types for blocks with 2+ partitions
|
||||
if (partition_count > 1)
|
||||
{
|
||||
write_bits(scb.partition_index, 6, 13, pcb);
|
||||
write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb);
|
||||
|
||||
if (scb.color_formats_matched)
|
||||
{
|
||||
write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Check endpoint types for each partition to determine the lowest class present
|
||||
int low_class = 4;
|
||||
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
int class_of_format = scb.color_formats[i] >> 2;
|
||||
low_class = astc::min(class_of_format, low_class);
|
||||
}
|
||||
|
||||
if (low_class == 3)
|
||||
{
|
||||
low_class = 2;
|
||||
}
|
||||
|
||||
int encoded_type = low_class + 1;
|
||||
int bitpos = 2;
|
||||
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
int classbit_of_format = (scb.color_formats[i] >> 2) - low_class;
|
||||
encoded_type |= classbit_of_format << bitpos;
|
||||
bitpos++;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < partition_count; i++)
|
||||
{
|
||||
int lowbits_of_format = scb.color_formats[i] & 3;
|
||||
encoded_type |= lowbits_of_format << bitpos;
|
||||
bitpos += 2;
|
||||
}
|
||||
|
||||
int encoded_type_lowpart = encoded_type & 0x3F;
|
||||
int encoded_type_highpart = encoded_type >> 6;
|
||||
int encoded_type_highpart_size = (3 * partition_count) - 4;
|
||||
int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size;
|
||||
write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb);
|
||||
write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb);
|
||||
below_weights_pos -= encoded_type_highpart_size;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
write_bits(scb.color_formats[0], 4, 13, pcb);
|
||||
}
|
||||
|
||||
// In dual-plane mode, encode the color component of the second plane of weights
|
||||
if (is_dual_plane)
|
||||
{
|
||||
write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb);
|
||||
}
|
||||
|
||||
// Encode the color components
|
||||
uint8_t values_to_encode[32];
|
||||
int valuecount_to_encode = 0;
|
||||
|
||||
const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
|
||||
for (unsigned int i = 0; i < scb.partition_count; i++)
|
||||
{
|
||||
int vals = 2 * (scb.color_formats[i] >> 2) + 2;
|
||||
assert(vals <= 8);
|
||||
for (int j = 0; j < vals; j++)
|
||||
{
|
||||
values_to_encode[j + valuecount_to_encode] = pack_table[scb.color_values[i][j]];
|
||||
}
|
||||
valuecount_to_encode += vals;
|
||||
}
|
||||
|
||||
encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb,
|
||||
scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* See header for documentation. */
|
||||
void physical_to_symbolic(
|
||||
const block_size_descriptor& bsd,
|
||||
const uint8_t pcb[16],
|
||||
symbolic_compressed_block& scb
|
||||
) {
|
||||
uint8_t bswapped[16];
|
||||
|
||||
scb.block_type = SYM_BTYPE_NONCONST;
|
||||
|
||||
// Extract header fields
|
||||
int block_mode = read_bits(11, 0, pcb);
|
||||
if ((block_mode & 0x1FF) == 0x1FC)
|
||||
{
|
||||
// Constant color block
|
||||
|
||||
// Check what format the data has
|
||||
if (block_mode & 0x200)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_CONST_F16;
|
||||
}
|
||||
else
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_CONST_U16;
|
||||
}
|
||||
|
||||
scb.partition_count = 0;
|
||||
for (int i = 0; i < 4; i++)
|
||||
{
|
||||
scb.constant_color[i] = pcb[2 * i + 8] | (pcb[2 * i + 9] << 8);
|
||||
}
|
||||
|
||||
// Additionally, check that the void-extent
|
||||
if (bsd.zdim == 1)
|
||||
{
|
||||
// 2D void-extent
|
||||
int rsvbits = read_bits(2, 10, pcb);
|
||||
if (rsvbits != 3)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// Low values span 3 bytes so need two read_bits calls
|
||||
int vx_low_s = read_bits(8, 12, pcb) | (read_bits(5, 12 + 8, pcb) << 8);
|
||||
int vx_high_s = read_bits(13, 25, pcb);
|
||||
int vx_low_t = read_bits(8, 38, pcb) | (read_bits(5, 38 + 8, pcb) << 8);
|
||||
int vx_high_t = read_bits(13, 51, pcb);
|
||||
|
||||
int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF &&
|
||||
vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
|
||||
|
||||
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// 3D void-extent
|
||||
int vx_low_s = read_bits(9, 10, pcb);
|
||||
int vx_high_s = read_bits(9, 19, pcb);
|
||||
int vx_low_t = read_bits(9, 28, pcb);
|
||||
int vx_high_t = read_bits(9, 37, pcb);
|
||||
int vx_low_r = read_bits(9, 46, pcb);
|
||||
int vx_high_r = read_bits(9, 55, pcb);
|
||||
|
||||
int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF &&
|
||||
vx_low_t == 0x1FF && vx_high_t == 0x1FF &&
|
||||
vx_low_r == 0x1FF && vx_high_r == 0x1FF;
|
||||
|
||||
if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_r >= vx_high_r) && !all_ones)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
unsigned int packed_index = bsd.block_mode_packed_index[block_mode];
|
||||
if (packed_index == BLOCK_BAD_BLOCK_MODE)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
const auto& bm = bsd.get_block_mode(block_mode);
|
||||
const auto& di = bsd.get_decimation_info(bm.decimation_mode);
|
||||
|
||||
int weight_count = di.weight_count;
|
||||
promise(weight_count > 0);
|
||||
|
||||
quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
|
||||
int is_dual_plane = bm.is_dual_plane;
|
||||
|
||||
int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count;
|
||||
|
||||
int partition_count = read_bits(2, 11, pcb) + 1;
|
||||
promise(partition_count > 0);
|
||||
|
||||
scb.block_mode = static_cast<uint16_t>(block_mode);
|
||||
scb.partition_count = static_cast<uint8_t>(partition_count);
|
||||
|
||||
for (int i = 0; i < 16; i++)
|
||||
{
|
||||
bswapped[i] = static_cast<uint8_t>(bitrev8(pcb[15 - i]));
|
||||
}
|
||||
|
||||
int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method);
|
||||
|
||||
int below_weights_pos = 128 - bits_for_weights;
|
||||
|
||||
uint8_t indices[64];
|
||||
const auto& qat = quant_and_xfer_tables[weight_quant_method];
|
||||
|
||||
decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0);
|
||||
|
||||
if (is_dual_plane)
|
||||
{
|
||||
for (int i = 0; i < weight_count; i++)
|
||||
{
|
||||
scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]];
|
||||
scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < weight_count; i++)
|
||||
{
|
||||
scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]];
|
||||
}
|
||||
}
|
||||
|
||||
if (is_dual_plane && partition_count == 4)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
scb.color_formats_matched = 0;
|
||||
|
||||
// Determine the format of each endpoint pair
|
||||
int color_formats[BLOCK_MAX_PARTITIONS];
|
||||
int encoded_type_highpart_size = 0;
|
||||
if (partition_count == 1)
|
||||
{
|
||||
color_formats[0] = read_bits(4, 13, pcb);
|
||||
scb.partition_index = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
encoded_type_highpart_size = (3 * partition_count) - 4;
|
||||
below_weights_pos -= encoded_type_highpart_size;
|
||||
int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb) |
|
||||
(read_bits(encoded_type_highpart_size, below_weights_pos, pcb) << 6);
|
||||
int baseclass = encoded_type & 0x3;
|
||||
if (baseclass == 0)
|
||||
{
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
color_formats[i] = (encoded_type >> 2) & 0xF;
|
||||
}
|
||||
|
||||
below_weights_pos += encoded_type_highpart_size;
|
||||
scb.color_formats_matched = 1;
|
||||
encoded_type_highpart_size = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
int bitpos = 2;
|
||||
baseclass--;
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2;
|
||||
bitpos++;
|
||||
}
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
color_formats[i] |= (encoded_type >> bitpos) & 3;
|
||||
bitpos += 2;
|
||||
}
|
||||
}
|
||||
scb.partition_index = static_cast<uint16_t>(read_bits(10, 13, pcb));
|
||||
}
|
||||
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
scb.color_formats[i] = static_cast<uint8_t>(color_formats[i]);
|
||||
}
|
||||
|
||||
// Determine number of color endpoint integers
|
||||
int color_integer_count = 0;
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
int endpoint_class = color_formats[i] >> 2;
|
||||
color_integer_count += (endpoint_class + 1) * 2;
|
||||
}
|
||||
|
||||
if (color_integer_count > 18)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// Determine the color endpoint format to use
|
||||
static const int color_bits_arr[5] { -1, 115 - 4, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS };
|
||||
int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size;
|
||||
if (is_dual_plane)
|
||||
{
|
||||
color_bits -= 2;
|
||||
}
|
||||
|
||||
if (color_bits < 0)
|
||||
{
|
||||
color_bits = 0;
|
||||
}
|
||||
|
||||
int color_quant_level = quant_mode_table[color_integer_count >> 1][color_bits];
|
||||
if (color_quant_level < QUANT_6)
|
||||
{
|
||||
scb.block_type = SYM_BTYPE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// Unpack the integer color values and assign to endpoints
|
||||
scb.quant_mode = static_cast<quant_method>(color_quant_level);
|
||||
|
||||
uint8_t values_to_decode[32];
|
||||
decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb,
|
||||
values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
|
||||
|
||||
int valuecount_to_decode = 0;
|
||||
const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
|
||||
for (int i = 0; i < partition_count; i++)
|
||||
{
|
||||
int vals = 2 * (color_formats[i] >> 2) + 2;
|
||||
for (int j = 0; j < vals; j++)
|
||||
{
|
||||
scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
|
||||
}
|
||||
valuecount_to_decode += vals;
|
||||
}
|
||||
|
||||
// Fetch component for second-plane in the case of dual plane of weights.
|
||||
scb.plane2_component = -1;
|
||||
if (is_dual_plane)
|
||||
{
|
||||
scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,608 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2019-2025 Arm Limited
|
||||
// Copyright 2008 Jose Fonseca
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/*
|
||||
* This module implements vector support for floats, ints, and vector lane
|
||||
* control masks. It provides access to both explicit vector width types, and
|
||||
* flexible N-wide types where N can be determined at compile time.
|
||||
*
|
||||
* The design of this module encourages use of vector length agnostic code, via
|
||||
* the vint, vfloat, and vmask types. These will take on the widest SIMD vector
|
||||
* width that is available at compile time. The current vector width is
|
||||
* accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
|
||||
*
|
||||
* Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
|
||||
* These are provided primarily for prototyping and algorithm debug of VLA
|
||||
* implementations.
|
||||
*
|
||||
* Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
|
||||
* types. These are provided for use by VLA code, but are also expected to be
|
||||
* used as a fixed-width type and will support a reference C++ fallback for
|
||||
* use on platforms without SIMD intrinsics.
|
||||
*
|
||||
* Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
|
||||
* types. These are provided for use by VLA code, and are not expected to be
|
||||
* used as a fixed-width type in normal code. No reference C implementation is
|
||||
* provided on platforms without underlying SIMD intrinsics.
|
||||
*
|
||||
* With the current implementation ISA support is provided for:
|
||||
*
|
||||
* * 1-wide for scalar reference
|
||||
* * 4-wide for Armv8-A NEON
|
||||
* * 4-wide for x86-64 SSE2
|
||||
* * 4-wide for x86-64 SSE4.1
|
||||
* * 8-wide for Armv8-A SVE
|
||||
* * 8-wide for x86-64 AVX2
|
||||
*/
|
||||
|
||||
#ifndef ASTC_VECMATHLIB_H_INCLUDED
|
||||
#define ASTC_VECMATHLIB_H_INCLUDED
|
||||
|
||||
#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#if ASTCENC_SVE != 0
|
||||
#include <arm_sve.h>
|
||||
#include <arm_neon_sve_bridge.h>
|
||||
#endif
|
||||
|
||||
#if ASTCENC_NEON != 0
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if !defined(__clang__) && defined(_MSC_VER)
|
||||
#define ASTCENC_SIMD_INLINE __forceinline
|
||||
#define ASTCENC_NO_INLINE
|
||||
#elif defined(__GNUC__) && !defined(__clang__)
|
||||
#define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline
|
||||
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
|
||||
#else
|
||||
#define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline
|
||||
#define ASTCENC_NO_INLINE __attribute__ ((noinline))
|
||||
#endif
|
||||
|
||||
template<typename T> T gatherf_byte_inds(const float* base, const uint8_t* indices);
|
||||
|
||||
#if ASTCENC_AVX >= 2
|
||||
// If we have AVX2 expose 8-wide VLA.
|
||||
#include "astcenc_vecmathlib_sse_4.h"
|
||||
#include "astcenc_vecmathlib_common_4.h"
|
||||
#include "astcenc_vecmathlib_avx2_8.h"
|
||||
|
||||
#define ASTCENC_SIMD_WIDTH 8
|
||||
|
||||
using vfloat = vfloat8;
|
||||
|
||||
#if defined(ASTCENC_NO_INVARIANCE)
|
||||
using vfloatacc = vfloat8;
|
||||
#else
|
||||
using vfloatacc = vfloat4;
|
||||
#endif
|
||||
|
||||
using vint = vint8;
|
||||
using vmask = vmask8;
|
||||
|
||||
using vtable_16x8 = vtable8_16x8;
|
||||
using vtable_32x8 = vtable8_32x8;
|
||||
using vtable_64x8 = vtable8_64x8;
|
||||
|
||||
constexpr auto loada = vfloat8::loada;
|
||||
constexpr auto load1 = vfloat8::load1;
|
||||
constexpr auto vint_from_size = vint8_from_size;
|
||||
|
||||
#elif ASTCENC_SSE >= 20
|
||||
// If we have SSE expose 4-wide VLA, and 4-wide fixed width.
|
||||
#include "astcenc_vecmathlib_sse_4.h"
|
||||
#include "astcenc_vecmathlib_common_4.h"
|
||||
|
||||
#define ASTCENC_SIMD_WIDTH 4
|
||||
|
||||
using vfloat = vfloat4;
|
||||
using vfloatacc = vfloat4;
|
||||
using vint = vint4;
|
||||
using vmask = vmask4;
|
||||
|
||||
using vtable_16x8 = vtable4_16x8;
|
||||
using vtable_32x8 = vtable4_32x8;
|
||||
using vtable_64x8 = vtable4_64x8;
|
||||
|
||||
constexpr auto loada = vfloat4::loada;
|
||||
constexpr auto load1 = vfloat4::load1;
|
||||
constexpr auto vint_from_size = vint4_from_size;
|
||||
|
||||
#elif ASTCENC_SVE == 8
|
||||
// Check the compiler is configured with fixed-length 256-bit SVE.
|
||||
#if !defined(__ARM_FEATURE_SVE_BITS) || (__ARM_FEATURE_SVE_BITS != 256)
|
||||
#error "__ARM_FEATURE_SVE_BITS is not set to 256 bits"
|
||||
#endif
|
||||
|
||||
// If we have SVE configured as 8-wide, expose 8-wide VLA.
|
||||
#include "astcenc_vecmathlib_neon_4.h"
|
||||
#include "astcenc_vecmathlib_common_4.h"
|
||||
#include "astcenc_vecmathlib_sve_8.h"
|
||||
|
||||
#define ASTCENC_SIMD_WIDTH 8
|
||||
|
||||
using vfloat = vfloat8;
|
||||
|
||||
#if defined(ASTCENC_NO_INVARIANCE)
|
||||
using vfloatacc = vfloat8;
|
||||
#else
|
||||
using vfloatacc = vfloat4;
|
||||
#endif
|
||||
|
||||
using vint = vint8;
|
||||
using vmask = vmask8;
|
||||
|
||||
using vtable_16x8 = vtable8_16x8;
|
||||
using vtable_32x8 = vtable8_32x8;
|
||||
using vtable_64x8 = vtable8_64x8;
|
||||
|
||||
constexpr auto loada = vfloat8::loada;
|
||||
constexpr auto load1 = vfloat8::load1;
|
||||
constexpr auto vint_from_size = vint8_from_size;
|
||||
|
||||
#elif ASTCENC_NEON > 0
|
||||
// If we have NEON expose 4-wide VLA.
|
||||
#include "astcenc_vecmathlib_neon_4.h"
|
||||
#include "astcenc_vecmathlib_common_4.h"
|
||||
|
||||
#define ASTCENC_SIMD_WIDTH 4
|
||||
|
||||
using vfloat = vfloat4;
|
||||
using vfloatacc = vfloat4;
|
||||
using vint = vint4;
|
||||
using vmask = vmask4;
|
||||
|
||||
using vtable_16x8 = vtable4_16x8;
|
||||
using vtable_32x8 = vtable4_32x8;
|
||||
using vtable_64x8 = vtable4_64x8;
|
||||
|
||||
constexpr auto loada = vfloat4::loada;
|
||||
constexpr auto load1 = vfloat4::load1;
|
||||
constexpr auto vint_from_size = vint4_from_size;
|
||||
|
||||
#else
|
||||
// If we have nothing expose 4-wide VLA, and 4-wide fixed width.
|
||||
|
||||
// Note: We no longer expose the 1-wide scalar fallback because it is not
|
||||
// invariant with the 4-wide path due to algorithms that use horizontal
|
||||
// operations that accumulate a local vector sum before accumulating into
|
||||
// a running sum.
|
||||
//
|
||||
// For 4 items adding into an accumulator using 1-wide vectors the sum is:
|
||||
//
|
||||
// result = ((((sum + l0) + l1) + l2) + l3)
|
||||
//
|
||||
// ... whereas the accumulator for a 4-wide vector sum is:
|
||||
//
|
||||
// result = sum + ((l0 + l2) + (l1 + l3))
|
||||
//
|
||||
// In "normal maths" this is the same, but the floating point reassociation
|
||||
// differences mean that these will not produce the same result.
|
||||
|
||||
#include "astcenc_vecmathlib_none_4.h"
|
||||
#include "astcenc_vecmathlib_common_4.h"
|
||||
|
||||
#define ASTCENC_SIMD_WIDTH 4
|
||||
|
||||
using vfloat = vfloat4;
|
||||
using vfloatacc = vfloat4;
|
||||
using vint = vint4;
|
||||
using vmask = vmask4;
|
||||
|
||||
using vtable_16x8 = vtable4_16x8;
|
||||
using vtable_32x8 = vtable4_32x8;
|
||||
using vtable_64x8 = vtable4_64x8;
|
||||
|
||||
constexpr auto loada = vfloat4::loada;
|
||||
constexpr auto load1 = vfloat4::load1;
|
||||
constexpr auto vint_from_size = vint4_from_size;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Round a count down to the largest multiple of the SIMD width.
|
||||
*
|
||||
* Assumption that the vector width is a power of two ...
|
||||
*
|
||||
* @param count The unrounded value.
|
||||
*
|
||||
* @return The rounded value.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE size_t round_down_to_simd_multiple_vla(size_t count)
|
||||
{
|
||||
return count & static_cast<size_t>(~(ASTCENC_SIMD_WIDTH - 1));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Round a count up to the largest multiple of the SIMD width.
|
||||
*
|
||||
* Assumption that the vector width is a power of two ...
|
||||
*
|
||||
* @param count The unrounded value.
|
||||
*
|
||||
* @return The rounded value.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE size_t round_up_to_simd_multiple_vla(size_t count)
|
||||
{
|
||||
size_t multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
|
||||
return multiples * ASTCENC_SIMD_WIDTH;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return @c a with lanes negated if the @c b lane is negative.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
|
||||
{
|
||||
vint ia = float_as_int(a);
|
||||
vint ib = float_as_int(b);
|
||||
vint sign_mask(static_cast<int>(0x80000000));
|
||||
vint r = ia ^ (ib & sign_mask);
|
||||
return int_as_float(r);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return fast, but approximate, vector atan(x).
|
||||
*
|
||||
* Max error of this implementation is 0.004883.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
|
||||
{
|
||||
vmask c = abs(x) > vfloat(1.0f);
|
||||
vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
|
||||
vfloat y = select(x, vfloat(1.0f) / x, c);
|
||||
y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
|
||||
return select(y, z - y, c);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return fast, but approximate, vector atan2(x, y).
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
|
||||
{
|
||||
vfloat z = atan(abs(y / x));
|
||||
vmask xmask = x < vfloat::zero();
|
||||
return change_sign(select(z, vfloat(astc::PI) - z, xmask), y);
|
||||
}
|
||||
|
||||
/*
|
||||
* @brief Factory that returns a unit length 4 component vfloat4.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 unit4()
|
||||
{
|
||||
return vfloat4(0.5f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a unit length 3 component vfloat4.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 unit3()
|
||||
{
|
||||
float val = 0.577350258827209473f;
|
||||
return vfloat4(val, val, val, 0.0f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a unit length 2 component vfloat4.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 unit2()
|
||||
{
|
||||
float val = 0.707106769084930420f;
|
||||
return vfloat4(val, val, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a 3 component vfloat4.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
|
||||
{
|
||||
return vfloat4(a, b, c, 0.0f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Factory that returns a 2 component vfloat4.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
|
||||
{
|
||||
return vfloat4(a, b, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Normalize a non-zero length vector to unit length.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
|
||||
{
|
||||
vfloat4 length = dot(a, a);
|
||||
return a / sqrt(length);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Normalize a vector, returning @c safe if len is zero.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
|
||||
{
|
||||
vfloat4 length = dot(a, a);
|
||||
if (length.lane<0>() != 0.0f)
|
||||
{
|
||||
return a / sqrt(length);
|
||||
}
|
||||
|
||||
return safe;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
 * Polynomial evaluation helpers using Horner's scheme.
 *
 * POLYn(x, c0, ..., cn) evaluates c0 + c1 * x + ... + cn * x^n. Every macro
 * argument is parenthesized in the expansion so that compound expressions,
 * for example POLY1(a + b, c0, c1), evaluate as expected. Note that @c x is
 * expanded once per polynomial degree, so it must not have side effects.
 */
#define POLY0(x, c0)                     (c0)
#define POLY1(x, c0, c1)                 ((POLY0((x), (c1)) * (x)) + (c0))
#define POLY2(x, c0, c1, c2)             ((POLY1((x), (c1), (c2)) * (x)) + (c0))
#define POLY3(x, c0, c1, c2, c3)         ((POLY2((x), (c1), (c2), (c3)) * (x)) + (c0))
#define POLY4(x, c0, c1, c2, c3, c4)     ((POLY3((x), (c1), (c2), (c3), (c4)) * (x)) + (c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4((x), (c1), (c2), (c3), (c4), (c5)) * (x)) + (c0))
|
||||
|
||||
/**
|
||||
* @brief Compute an approximate exp2(x) for each lane in the vector.
|
||||
*
|
||||
* Based on 5th degree minimax polynomials, ported from this blog
|
||||
* https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
|
||||
{
|
||||
x = clamp(-126.99999f, 129.0f, x);
|
||||
|
||||
vint4 ipart = float_to_int(x - 0.5f);
|
||||
vfloat4 fpart = x - int_to_float(ipart);
|
||||
|
||||
// Integer contrib, using 1 << ipart
|
||||
vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
|
||||
|
||||
// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
|
||||
vfloat4 fexp = POLY5(fpart,
|
||||
9.9999994e-1f,
|
||||
6.9315308e-1f,
|
||||
2.4015361e-1f,
|
||||
5.5826318e-2f,
|
||||
8.9893397e-3f,
|
||||
1.8775767e-3f);
|
||||
|
||||
return iexp * fexp;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute an approximate log2(x) for each lane in the vector.
|
||||
*
|
||||
* Based on 5th degree minimax polynomials, ported from this blog
|
||||
* https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
|
||||
{
|
||||
vint4 exp(0x7F800000);
|
||||
vint4 mant(0x007FFFFF);
|
||||
vint4 one(0x3F800000);
|
||||
|
||||
vint4 i = float_as_int(x);
|
||||
|
||||
vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
|
||||
|
||||
vfloat4 m = int_as_float((i & mant) | one);
|
||||
|
||||
// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
|
||||
vfloat4 p = POLY4(m,
|
||||
2.8882704548164776201f,
|
||||
-2.52074962577807006663f,
|
||||
1.48116647521213171641f,
|
||||
-0.465725644288844778798f,
|
||||
0.0596515482674574969533f);
|
||||
|
||||
// Increases the polynomial degree, but ensures that log2(1) == 0
|
||||
p = p * (m - 1.0f);
|
||||
|
||||
return p + e;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute an approximate pow(x, y) for each lane in the vector.
|
||||
*
|
||||
* Power function based on the exp2(log2(x) * y) transform.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
|
||||
{
|
||||
vmask4 zero_mask = y == vfloat4(0.0f);
|
||||
vfloat4 estimate = exp2(log2(x) * y);
|
||||
|
||||
// Guarantee that y == 0 returns exactly 1.0f
|
||||
return select(estimate, vfloat4(1.0f), zero_mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Count the leading zeros for each lane in @c a.
|
||||
*
|
||||
* Valid for all data values of @c a; will return a per-lane value [0, 32].
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
|
||||
{
|
||||
// This function is a horrible abuse of floating point exponents to convert
|
||||
// the original integer value into a 2^N encoding we can recover easily.
|
||||
|
||||
// Convert to float without risk of rounding up by keeping only top 8 bits.
|
||||
// This trick is is guaranteed to keep top 8 bits and clear the 9th.
|
||||
a = (~lsr<8>(a)) & a;
|
||||
a = float_as_int(int_to_float(a));
|
||||
|
||||
// Extract and unbias exponent
|
||||
a = vint4(127 + 31) - lsr<23>(a);
|
||||
|
||||
// Clamp result to a valid 32-bit range
|
||||
return clamp(0, 32, a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return lanewise 2^a for each lane in @c a.
|
||||
*
|
||||
* Use of signed int means that this is only valid for values in range [0, 31].
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
|
||||
{
|
||||
// 2^30 is the largest signed number than can be represented
|
||||
assert(all(a < vint4(31)));
|
||||
|
||||
// This function is a horrible abuse of floating point to use the exponent
|
||||
// and float conversion to generate a 2^N multiple.
|
||||
|
||||
// Bias the exponent
|
||||
vint4 exp = a + 127;
|
||||
exp = lsl<23>(exp);
|
||||
|
||||
// Reinterpret the bits as a float, and then convert to an int
|
||||
vfloat4 f = int_as_float(exp);
|
||||
return float_to_int(f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
|
||||
{
|
||||
vint4 fp16_one = vint4(0x3C00);
|
||||
vint4 fp16_small = lsl<8>(p);
|
||||
|
||||
vmask4 is_one = p == vint4(0xFFFF);
|
||||
vmask4 is_small = p < vint4(4);
|
||||
|
||||
// Manually inline clz() on Visual Studio to avoid release build codegen bug
|
||||
// see https://github.com/ARM-software/astc-encoder/issues/259
|
||||
#if !defined(__clang__) && defined(_MSC_VER)
|
||||
vint4 a = (~lsr<8>(p)) & p;
|
||||
a = float_as_int(int_to_float(a));
|
||||
a = vint4(127 + 31) - lsr<23>(a);
|
||||
vint4 lz = clamp(0, 32, a) - 16;
|
||||
#else
|
||||
vint4 lz = clz(p) - 16;
|
||||
#endif
|
||||
|
||||
p = p * two_to_the_n(lz + 1);
|
||||
p = p & vint4(0xFFFF);
|
||||
|
||||
p = lsr<6>(p);
|
||||
|
||||
p = p | lsl<10>(vint4(14) - lz);
|
||||
|
||||
vint4 r = select(p, fp16_one, is_one);
|
||||
r = select(r, fp16_small, is_small);
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Convert 16-bit LNS to float16.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
|
||||
{
|
||||
vint4 mc = p & 0x7FF;
|
||||
vint4 ec = lsr<11>(p);
|
||||
|
||||
vint4 mc_512 = mc * 3;
|
||||
vmask4 mask_512 = mc < vint4(512);
|
||||
|
||||
vint4 mc_1536 = mc * 4 - 512;
|
||||
vmask4 mask_1536 = mc < vint4(1536);
|
||||
|
||||
vint4 mc_else = mc * 5 - 2048;
|
||||
|
||||
vint4 mt = mc_else;
|
||||
mt = select(mt, mc_1536, mask_1536);
|
||||
mt = select(mt, mc_512, mask_512);
|
||||
|
||||
vint4 res = lsl<10>(ec) | lsr<3>(mt);
|
||||
return min(res, vint4(0x7BFF));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Extract mantissa and exponent of a float value.
|
||||
*
|
||||
* @param a The input value.
|
||||
* @param[out] exp The output exponent.
|
||||
*
|
||||
* @return The mantissa.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
|
||||
{
|
||||
// Interpret the bits as an integer
|
||||
vint4 ai = float_as_int(a);
|
||||
|
||||
// Extract and unbias the exponent
|
||||
exp = (lsr<23>(ai) & 0xFF) - 126;
|
||||
|
||||
// Extract and unbias the mantissa
|
||||
vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
|
||||
return int_as_float(manti);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Convert float to 16-bit LNS.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
|
||||
{
|
||||
vint4 exp;
|
||||
vfloat4 mant = frexp(a, exp);
|
||||
|
||||
// Do these early before we start messing about ...
|
||||
vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
|
||||
vmask4 mask_infinity = a >= vfloat4(65536.0f);
|
||||
|
||||
// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
|
||||
vmask4 exp_lt_m13 = exp < vint4(-13);
|
||||
|
||||
vfloat4 a1a = a * 33554432.0f;
|
||||
vint4 expa = vint4::zero();
|
||||
|
||||
vfloat4 a1b = (mant - 0.5f) * 4096;
|
||||
vint4 expb = exp + 14;
|
||||
|
||||
a = select(a1b, a1a, exp_lt_m13);
|
||||
exp = select(expb, expa, exp_lt_m13);
|
||||
|
||||
vmask4 a_lt_384 = a < vfloat4(384.0f);
|
||||
vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
|
||||
|
||||
vfloat4 a2a = a * (4.0f / 3.0f);
|
||||
vfloat4 a2b = a + 128.0f;
|
||||
vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
|
||||
|
||||
a = a2c;
|
||||
a = select(a, a2b, a_lt_1408);
|
||||
a = select(a, a2a, a_lt_384);
|
||||
|
||||
a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
|
||||
|
||||
a = select(a, vfloat4(65535.0f), mask_infinity);
|
||||
a = select(a, vfloat4::zero(), mask_underflow_nan);
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
namespace astc
|
||||
{
|
||||
|
||||
static ASTCENC_SIMD_INLINE float pow(float x, float y)
|
||||
{
|
||||
return pow(vfloat4(x), vfloat4(y)).lane<0>();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
|
||||
@@ -0,0 +1,421 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2025 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Generic 4x32-bit vector functions.
|
||||
*
|
||||
* This module implements generic 4-wide vector functions that are valid for
|
||||
* all instruction sets, typically implemented using lower level 4-wide
|
||||
* operations that are ISA-specific.
|
||||
*/
|
||||
|
||||
#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
|
||||
#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
|
||||
|
||||
#ifndef ASTCENC_SIMD_INLINE
|
||||
#error "Include astcenc_vecmathlib.h, do not include directly"
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <limits>
|
||||
|
||||
// ============================================================================
|
||||
// vint4 operators and functions
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar addition.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b)
|
||||
{
|
||||
return a + vint4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector incremental addition.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b)
|
||||
{
|
||||
a = a + b;
|
||||
return a;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar subtraction.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b)
|
||||
{
|
||||
return a - vint4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar multiplication.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b)
|
||||
{
|
||||
return a * vint4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar bitwise or.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b)
|
||||
{
|
||||
return a | vint4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar bitwise and.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b)
|
||||
{
|
||||
return a & vint4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar bitwise xor.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b)
|
||||
{
|
||||
return a ^ vint4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the clamped value between min and max.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a)
|
||||
{
|
||||
return min(max(a, vint4(minv)), vint4(maxv));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal sum of RGB vector lanes as a scalar.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
|
||||
{
|
||||
return a.lane<0>() + a.lane<1>() + a.lane<2>();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal minimum of a vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
|
||||
{
|
||||
return hmin(a).lane<0>();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Generate a vint4 from a size_t.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vint4 vint4_from_size(size_t a)
|
||||
{
|
||||
assert(a <= std::numeric_limits<int>::max());
|
||||
return vint4(static_cast<int>(a));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal maximum of a vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
|
||||
{
|
||||
return hmax(a).lane<0>();
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// vfloat4 operators and functions
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by vector incremental addition.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b)
|
||||
{
|
||||
a = a + b;
|
||||
return a;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar addition.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b)
|
||||
{
|
||||
return a + vfloat4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar subtraction.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b)
|
||||
{
|
||||
return a - vfloat4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar multiplication.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b)
|
||||
{
|
||||
return a * vfloat4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: scalar by vector multiplication.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b)
|
||||
{
|
||||
return vfloat4(a) * b;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: vector by scalar division.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b)
|
||||
{
|
||||
return a / vfloat4(b);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Overload: scalar by vector division.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b)
|
||||
{
|
||||
return vfloat4(a) / b;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the min vector of a vector and a scalar.
|
||||
*
|
||||
* If either lane value is NaN, @c b will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b)
|
||||
{
|
||||
return min(a, vfloat4(b));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the max vector of a vector and a scalar.
|
||||
*
|
||||
* If either lane value is NaN, @c b will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b)
|
||||
{
|
||||
return max(a, vfloat4(b));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the clamped value between min and max.
|
||||
*
|
||||
* It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN
|
||||
* then @c min will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
|
||||
{
|
||||
// Do not reorder - second operand will return if either is NaN
|
||||
return min(max(a, minv), maxv);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the clamped value between 0.0f and 1.0f.
|
||||
*
|
||||
* If @c a is NaN then zero will be returned for that lane.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a)
|
||||
{
|
||||
// Do not reorder - second operand will return if either is NaN
|
||||
return min(max(a, vfloat4::zero()), 1.0f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal minimum of a vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a)
|
||||
{
|
||||
return hmin(a).lane<0>();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal min of RGB vector lanes as a scalar.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a)
|
||||
{
|
||||
a.set_lane<3>(a.lane<0>());
|
||||
return hmin_s(a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal maximum of a vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a)
|
||||
{
|
||||
return hmax(a).lane<0>();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Accumulate lane-wise sums for a vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a)
|
||||
{
|
||||
accum = accum + a;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Accumulate lane-wise sums for a masked vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m)
|
||||
{
|
||||
a = select(vfloat4::zero(), a, m);
|
||||
haccumulate(accum, a);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the horizontal sum of RGB vector lanes as a scalar.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a)
|
||||
{
|
||||
return a.lane<0>() + a.lane<1>() + a.lane<2>();
|
||||
}
|
||||
|
||||
#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT)
|
||||
|
||||
/**
|
||||
* @brief Return the dot product for the full 4 lanes, returning scalar.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
vfloat4 m = a * b;
|
||||
return hadd_s(m);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the dot product for the full 4 lanes, returning vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
vfloat4 m = a * b;
|
||||
return vfloat4(hadd_s(m));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the dot product for the bottom 3 lanes, returning scalar.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
vfloat4 m = a * b;
|
||||
return hadd_rgb_s(m);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the dot product for the bottom 3 lanes, returning vector.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b)
|
||||
{
|
||||
vfloat4 m = a * b;
|
||||
float d3 = hadd_rgb_s(m);
|
||||
return vfloat4(d3, d3, d3, 0.0f);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(ASTCENC_USE_NATIVE_POPCOUNT)
|
||||
|
||||
/**
 * @brief Population bit count.
 *
 * Portable bit-parallel implementation: bit counts are summed in 2-bit,
 * 4-bit and then 8-bit fields, and a final multiply gathers the per-byte
 * counts into the top byte.
 *
 * @param v   The value to population count.
 *
 * @return The number of 1 bits.
 */
static inline int popcount(uint64_t v)
{
    const uint64_t pair_mask   = 0x5555555555555555ULL;
    const uint64_t nibble_mask = 0x3333333333333333ULL;
    const uint64_t byte_mask   = 0x0F0F0F0F0F0F0F0FULL;
    const uint64_t byte_ones   = 0x0101010101010101ULL;

    // Count per 2-bit field
    v = v - ((v >> 1) & pair_mask);

    // Count per 4-bit field
    v = (v & nibble_mask) + ((v >> 2) & nibble_mask);

    // Count per byte
    v = (v + (v >> 4)) & byte_mask;

    // Sum all byte counts into the most significant byte and extract it
    return static_cast<int>((v * byte_ones) >> 56);
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Apply signed bit transfer.
|
||||
*
|
||||
* @param input0 The first encoded endpoint.
|
||||
* @param input1 The second encoded endpoint.
|
||||
*/
|
||||
static ASTCENC_SIMD_INLINE void bit_transfer_signed(
|
||||
vint4& input0,
|
||||
vint4& input1
|
||||
) {
|
||||
input1 = lsr<1>(input1) | (input0 & 0x80);
|
||||
input0 = lsr<1>(input0) & 0x3F;
|
||||
|
||||
vmask4 mask = (input0 & 0x20) != vint4::zero();
|
||||
input0 = select(input0, input0 - 0x40, mask);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of ints.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vint4 a)
|
||||
{
|
||||
ASTCENC_ALIGNAS int v[4];
|
||||
storea(a, v);
|
||||
printf("v4_i32:\n %8d %8d %8d %8d\n",
|
||||
v[0], v[1], v[2], v[3]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of ints.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void printx(vint4 a)
|
||||
{
|
||||
ASTCENC_ALIGNAS int v[4];
|
||||
storea(a, v);
|
||||
|
||||
unsigned int uv[4];
|
||||
std::memcpy(uv, v, sizeof(int) * 4);
|
||||
|
||||
printf("v4_i32:\n %08x %08x %08x %08x\n",
|
||||
uv[0], uv[1], uv[2], uv[3]);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of floats.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vfloat4 a)
|
||||
{
|
||||
ASTCENC_ALIGNAS float v[4];
|
||||
storea(a, v);
|
||||
printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n",
|
||||
static_cast<double>(v[0]), static_cast<double>(v[1]),
|
||||
static_cast<double>(v[2]), static_cast<double>(v[3]));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Debug function to print a vector of masks.
|
||||
*/
|
||||
ASTCENC_SIMD_INLINE void print(vmask4 a)
|
||||
{
|
||||
print(select(vint4(0), vint4(1), a));
|
||||
}
|
||||
|
||||
#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED
|
||||
@@ -0,0 +1,496 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#if !defined(ASTCENC_DECOMPRESS_ONLY)
|
||||
|
||||
/**
|
||||
* @brief Functions for angular-sum algorithm for weight alignment.
|
||||
*
|
||||
* This algorithm works as follows:
|
||||
* - we compute a complex number P as (cos s*i, sin s*i) for each weight,
|
||||
* where i is the input value and s is a scaling factor based on the spacing between the weights.
|
||||
* - we then add together complex numbers for all the weights.
|
||||
* - we then compute the length and angle of the resulting sum.
|
||||
*
|
||||
* This should produce the following results:
|
||||
* - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs
|
||||
* - even distribution results in a vector of length 0.
|
||||
* - all samples identical results in perfect alignment for every scaling.
|
||||
*
|
||||
* For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This
|
||||
* should then result in some scalings standing out as having particularly good alignment factors;
|
||||
* we can use this to produce a set of candidate scale/shift values for various quantization levels;
|
||||
* we should then actually try them and see what happens.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
#include "astcenc_vecmathlib.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <cfloat>
|
||||
|
||||
static constexpr unsigned int ANGULAR_STEPS { 32 };
|
||||
|
||||
static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0,
|
||||
"ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH");
|
||||
|
||||
static_assert(ANGULAR_STEPS >= 32,
|
||||
"ANGULAR_STEPS must be at least max(steps_for_quant_level)");
|
||||
|
||||
// Store a reduced sin/cos table for 64 possible weight values; this causes
|
||||
// slight quality loss compared to using sin() and cos() directly. Must be 2^N.
|
||||
static constexpr unsigned int SINCOS_STEPS { 64 };
|
||||
|
||||
static const uint8_t steps_for_quant_level[12] {
|
||||
2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32
|
||||
};
|
||||
|
||||
ASTCENC_ALIGNAS static float sin_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS static float cos_table[SINCOS_STEPS][ANGULAR_STEPS];
|
||||
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
static bool print_once { true };
|
||||
#endif
|
||||
|
||||
/* See header for documentation. */
|
||||
void prepare_angular_tables()
|
||||
{
|
||||
for (unsigned int i = 0; i < ANGULAR_STEPS; i++)
|
||||
{
|
||||
float angle_step = static_cast<float>(i + 1);
|
||||
|
||||
for (unsigned int j = 0; j < SINCOS_STEPS; j++)
|
||||
{
|
||||
sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
|
||||
cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Compute the angular alignment factors and offsets.
|
||||
*
|
||||
* @param weight_count The number of (decimated) weights.
|
||||
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
|
||||
* @param max_angular_steps The maximum number of steps to be tested.
|
||||
* @param[out] offsets The output angular offsets array.
|
||||
*/
|
||||
static void compute_angular_offsets(
|
||||
unsigned int weight_count,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_angular_steps,
|
||||
float* offsets
|
||||
) {
|
||||
promise(weight_count > 0);
|
||||
promise(max_angular_steps > 0);
|
||||
|
||||
ASTCENC_ALIGNAS int isamplev[BLOCK_MAX_WEIGHTS];
|
||||
|
||||
// Precompute isample; arrays are always allocated 64 elements long
|
||||
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
// Ideal weight can be outside [0, 1] range, so clamp to fit table
|
||||
vfloat ideal_weight = clampzo(loada(dec_weight_ideal_value + i));
|
||||
|
||||
// Convert a weight to a sincos table index
|
||||
vfloat sample = ideal_weight * (SINCOS_STEPS - 1.0f);
|
||||
vint isample = float_to_int_rtn(sample);
|
||||
storea(isample, isamplev + i);
|
||||
}
|
||||
|
||||
// Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max
|
||||
vfloat mult(1.0f / (2.0f * astc::PI));
|
||||
|
||||
for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vfloat anglesum_x = vfloat::zero();
|
||||
vfloat anglesum_y = vfloat::zero();
|
||||
|
||||
for (unsigned int j = 0; j < weight_count; j++)
|
||||
{
|
||||
int isample = isamplev[j];
|
||||
anglesum_x += loada(cos_table[isample] + i);
|
||||
anglesum_y += loada(sin_table[isample] + i);
|
||||
}
|
||||
|
||||
vfloat angle = atan2(anglesum_y, anglesum_x);
|
||||
vfloat ofs = angle * mult;
|
||||
storea(ofs, offsets + i);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief For a given step size compute the lowest and highest weight.
|
||||
*
|
||||
* Compute the lowest and highest weight that results from quantizing using the given stepsize and
|
||||
* offset, and then compute the resulting error. The cut errors indicate the error that results from
|
||||
* forcing samples that should have had one weight value one step up or down.
|
||||
*
|
||||
* @param weight_count The number of (decimated) weights.
|
||||
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
|
||||
* @param max_angular_steps The maximum number of steps to be tested.
|
||||
* @param max_quant_steps The maximum quantization level to be tested.
|
||||
* @param offsets The angular offsets array.
|
||||
* @param[out] lowest_weight Per angular step, the lowest weight.
|
||||
* @param[out] weight_span Per angular step, the span between lowest and highest weight.
|
||||
* @param[out] error Per angular step, the error.
|
||||
* @param[out] cut_low_weight_error Per angular step, the low weight cut error.
|
||||
* @param[out] cut_high_weight_error Per angular step, the high weight cut error.
|
||||
*/
|
||||
static void compute_lowest_and_highest_weight(
|
||||
unsigned int weight_count,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_angular_steps,
|
||||
unsigned int max_quant_steps,
|
||||
const float* offsets,
|
||||
float* lowest_weight,
|
||||
int* weight_span,
|
||||
float* error,
|
||||
float* cut_low_weight_error,
|
||||
float* cut_high_weight_error
|
||||
) {
|
||||
promise(weight_count > 0);
|
||||
promise(max_angular_steps > 0);
|
||||
|
||||
vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
|
||||
|
||||
// Compute minimum/maximum weights in the weight array. Our remapping
|
||||
// is monotonic, so the min/max rounded weights relate to the min/max
|
||||
// unrounded weights in a straightforward way.
|
||||
vfloat min_weight(FLT_MAX);
|
||||
vfloat max_weight(-FLT_MAX);
|
||||
|
||||
vint lane_id = vint::lane_id();
|
||||
for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vmask active = lane_id < vint(weight_count);
|
||||
lane_id += vint(ASTCENC_SIMD_WIDTH);
|
||||
|
||||
vfloat weights = loada(dec_weight_ideal_value + i);
|
||||
min_weight = min(min_weight, select(min_weight, weights, active));
|
||||
max_weight = max(max_weight, select(max_weight, weights, active));
|
||||
}
|
||||
|
||||
min_weight = hmin(min_weight);
|
||||
max_weight = hmax(max_weight);
|
||||
|
||||
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
|
||||
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
|
||||
{
|
||||
vfloat errval = vfloat::zero();
|
||||
vfloat cut_low_weight_err = vfloat::zero();
|
||||
vfloat cut_high_weight_err = vfloat::zero();
|
||||
vfloat offset = loada(offsets + sp);
|
||||
|
||||
// We know the min and max weight values, so we can figure out
|
||||
// the corresponding indices before we enter the loop.
|
||||
vfloat minidx = round(min_weight * rcp_stepsize - offset);
|
||||
vfloat maxidx = round(max_weight * rcp_stepsize - offset);
|
||||
|
||||
for (unsigned int j = 0; j < weight_count; j++)
|
||||
{
|
||||
vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset;
|
||||
vfloat svalrte = round(sval);
|
||||
vfloat diff = sval - svalrte;
|
||||
errval += diff * diff;
|
||||
|
||||
// Accumulate errors for minimum index
|
||||
vmask mask = svalrte == minidx;
|
||||
vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff;
|
||||
cut_low_weight_err = select(cut_low_weight_err, accum, mask);
|
||||
|
||||
// Accumulate errors for maximum index
|
||||
mask = svalrte == maxidx;
|
||||
accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff;
|
||||
cut_high_weight_err = select(cut_high_weight_err, accum, mask);
|
||||
}
|
||||
|
||||
// Write out min weight and weight span; clamp span to a usable range
|
||||
vint span = float_to_int(maxidx - minidx + vfloat(1));
|
||||
span = min(span, vint(max_quant_steps + 3));
|
||||
span = max(span, vint(2));
|
||||
storea(minidx, lowest_weight + sp);
|
||||
storea(span, weight_span + sp);
|
||||
|
||||
// The cut_(lowest/highest)_weight_error indicate the error that results from forcing
|
||||
// samples that should have had the weight value one step (up/down).
|
||||
vfloat ssize = 1.0f / rcp_stepsize;
|
||||
vfloat errscale = ssize * ssize;
|
||||
storea(errval * errscale, error + sp);
|
||||
storea(cut_low_weight_err * errscale, cut_low_weight_error + sp);
|
||||
storea(cut_high_weight_err * errscale, cut_high_weight_error + sp);
|
||||
|
||||
rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief The main function for the angular algorithm.
|
||||
*
|
||||
* @param weight_count The number of (decimated) weights.
|
||||
* @param dec_weight_ideal_value The ideal decimated unquantized weight values.
|
||||
* @param max_quant_level The maximum quantization level to be tested.
|
||||
* @param[out] low_value Per angular step, the lowest weight value.
|
||||
* @param[out] high_value Per angular step, the highest weight value.
|
||||
*/
|
||||
static void compute_angular_endpoints_for_quant_levels(
|
||||
unsigned int weight_count,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_quant_level,
|
||||
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
|
||||
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
|
||||
) {
|
||||
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
|
||||
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
|
||||
|
||||
ASTCENC_ALIGNAS float angular_offsets[ANGULAR_STEPS];
|
||||
|
||||
compute_angular_offsets(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, angular_offsets);
|
||||
|
||||
ASTCENC_ALIGNAS float lowest_weight[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS int32_t weight_span[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float error[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS];
|
||||
ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS];
|
||||
|
||||
compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value,
|
||||
max_angular_steps, max_quant_steps,
|
||||
angular_offsets, lowest_weight, weight_span, error,
|
||||
cut_low_weight_error, cut_high_weight_error);
|
||||
|
||||
// For each quantization level, find the best error terms. Use packed vectors so data-dependent
|
||||
// branches can become selects. This involves some integer to float casts, but the values are
|
||||
// small enough so they never round the wrong way.
|
||||
vfloat4 best_results[36];
|
||||
|
||||
// Initialize the array to some safe defaults
|
||||
promise(max_quant_steps > 0);
|
||||
for (unsigned int i = 0; i < (max_quant_steps + 4); i++)
|
||||
{
|
||||
// Lane<0> = Best error
|
||||
// Lane<1> = Best scale; -1 indicates no solution found
|
||||
// Lane<2> = Cut low weight
|
||||
best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f);
|
||||
}
|
||||
|
||||
promise(max_angular_steps > 0);
|
||||
for (unsigned int i = 0; i < max_angular_steps; i++)
|
||||
{
|
||||
float i_flt = static_cast<float>(i);
|
||||
|
||||
int idx_span = weight_span[i];
|
||||
|
||||
float error_cut_low = error[i] + cut_low_weight_error[i];
|
||||
float error_cut_high = error[i] + cut_high_weight_error[i];
|
||||
float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i];
|
||||
|
||||
// Check best error against record N
|
||||
vfloat4 best_result = best_results[idx_span];
|
||||
vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f);
|
||||
vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]);
|
||||
best_results[idx_span] = select(best_result, new_result, mask);
|
||||
|
||||
// Check best error against record N-1 with either cut low or cut high
|
||||
best_result = best_results[idx_span - 1];
|
||||
|
||||
new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f);
|
||||
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low);
|
||||
best_result = select(best_result, new_result, mask);
|
||||
|
||||
new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f);
|
||||
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high);
|
||||
best_results[idx_span - 1] = select(best_result, new_result, mask);
|
||||
|
||||
// Check best error against record N-2 with both cut low and high
|
||||
best_result = best_results[idx_span - 2];
|
||||
new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f);
|
||||
mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high);
|
||||
best_results[idx_span - 2] = select(best_result, new_result, mask);
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i <= max_quant_level; i++)
|
||||
{
|
||||
unsigned int q = steps_for_quant_level[i];
|
||||
int bsi = static_cast<int>(best_results[q].lane<1>());
|
||||
|
||||
// Did we find anything?
|
||||
#if defined(ASTCENC_DIAGNOSTICS)
|
||||
if ((bsi < 0) && print_once)
|
||||
{
|
||||
print_once = false;
|
||||
printf("INFO: Unable to find full encoding within search error limit.\n\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
bsi = astc::max(0, bsi);
|
||||
|
||||
float lwi = lowest_weight[bsi] + best_results[q].lane<2>();
|
||||
float hwi = lwi + static_cast<float>(q) - 1.0f;
|
||||
|
||||
float stepsize = 1.0f / (1.0f + static_cast<float>(bsi));
|
||||
low_value[i] = (angular_offsets[bsi] + lwi) * stepsize;
|
||||
high_value[i] = (angular_offsets[bsi] + hwi) * stepsize;
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_angular_endpoints_1plane(
|
||||
bool only_always,
|
||||
const block_size_descriptor& bsd,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_weight_quant,
|
||||
compression_working_buffers& tmpbuf
|
||||
) {
|
||||
float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
|
||||
float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
|
||||
|
||||
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
|
||||
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
|
||||
|
||||
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
|
||||
: bsd.decimation_mode_count_selected;
|
||||
promise(max_decimation_modes > 0);
|
||||
for (unsigned int i = 0; i < max_decimation_modes; i++)
|
||||
{
|
||||
const decimation_mode& dm = bsd.decimation_modes[i];
|
||||
if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
|
||||
|
||||
unsigned int max_precision = dm.maxprec_1plane;
|
||||
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
|
||||
{
|
||||
max_precision = TUNE_MAX_ANGULAR_QUANT;
|
||||
}
|
||||
|
||||
if (max_precision > max_weight_quant)
|
||||
{
|
||||
max_precision = max_weight_quant;
|
||||
}
|
||||
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values[i], high_values[i]);
|
||||
}
|
||||
|
||||
unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
|
||||
: bsd.block_mode_count_1plane_selected;
|
||||
promise(max_block_modes > 0);
|
||||
for (unsigned int i = 0; i < max_block_modes; i++)
|
||||
{
|
||||
const block_mode& bm = bsd.block_modes[i];
|
||||
assert(!bm.is_dual_plane);
|
||||
|
||||
unsigned int quant_mode = bm.quant_mode;
|
||||
unsigned int decim_mode = bm.decimation_mode;
|
||||
|
||||
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
|
||||
{
|
||||
low_value[i] = low_values[decim_mode][quant_mode];
|
||||
high_value[i] = high_values[decim_mode][quant_mode];
|
||||
}
|
||||
else
|
||||
{
|
||||
low_value[i] = 0.0f;
|
||||
high_value[i] = 1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void compute_angular_endpoints_2planes(
|
||||
const block_size_descriptor& bsd,
|
||||
const float* dec_weight_ideal_value,
|
||||
unsigned int max_weight_quant,
|
||||
compression_working_buffers& tmpbuf
|
||||
) {
|
||||
float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
|
||||
float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
|
||||
float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
|
||||
float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
|
||||
|
||||
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
|
||||
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
|
||||
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
|
||||
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
|
||||
|
||||
promise(bsd.decimation_mode_count_selected > 0);
|
||||
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
|
||||
{
|
||||
const decimation_mode& dm = bsd.decimation_modes[i];
|
||||
if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
|
||||
|
||||
unsigned int max_precision = dm.maxprec_2planes;
|
||||
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
|
||||
{
|
||||
max_precision = TUNE_MAX_ANGULAR_QUANT;
|
||||
}
|
||||
|
||||
if (max_precision > max_weight_quant)
|
||||
{
|
||||
max_precision = max_weight_quant;
|
||||
}
|
||||
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
|
||||
max_precision, low_values1[i], high_values1[i]);
|
||||
|
||||
compute_angular_endpoints_for_quant_levels(
|
||||
weight_count,
|
||||
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
|
||||
max_precision, low_values2[i], high_values2[i]);
|
||||
}
|
||||
|
||||
unsigned int start = bsd.block_mode_count_1plane_selected;
|
||||
unsigned int end = bsd.block_mode_count_1plane_2plane_selected;
|
||||
for (unsigned int i = start; i < end; i++)
|
||||
{
|
||||
const block_mode& bm = bsd.block_modes[i];
|
||||
unsigned int quant_mode = bm.quant_mode;
|
||||
unsigned int decim_mode = bm.decimation_mode;
|
||||
|
||||
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
|
||||
{
|
||||
low_value1[i] = low_values1[decim_mode][quant_mode];
|
||||
high_value1[i] = high_values1[decim_mode][quant_mode];
|
||||
low_value2[i] = low_values2[decim_mode][quant_mode];
|
||||
high_value2[i] = high_values2[decim_mode][quant_mode];
|
||||
}
|
||||
else
|
||||
{
|
||||
low_value1[i] = 0.0f;
|
||||
high_value1[i] = 1.0f;
|
||||
low_value2[i] = 0.0f;
|
||||
high_value2[i] = 1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,147 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2021 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Data tables for quantization transfer.
|
||||
*/
|
||||
|
||||
#include "astcenc_internal.h"
|
||||
|
||||
#define _ 0 // Using _ to indicate an entry that will not be used.
|
||||
|
||||
const quant_and_transfer_table quant_and_xfer_tables[12] {
|
||||
// QUANT2, range 0..1
|
||||
{
|
||||
{0, 64},
|
||||
{0, 1},
|
||||
{0, 64},
|
||||
{0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
0x4000}
|
||||
},
|
||||
// QUANT_3, range 0..2
|
||||
{
|
||||
{0, 32, 64},
|
||||
{0, 1, 2},
|
||||
{0, 32, 64},
|
||||
{0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,0x4020}
|
||||
},
|
||||
// QUANT_4, range 0..3
|
||||
{
|
||||
{0, 21, 43, 64},
|
||||
{0, 1, 2, 3},
|
||||
{0, 21, 43, 64},
|
||||
{0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_,
|
||||
_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,_,_,_,_,0x402b}
|
||||
},
|
||||
//QUANT_5, range 0..4
|
||||
{
|
||||
{0, 16, 32, 48, 64},
|
||||
{0, 1, 2, 3, 4},
|
||||
{0, 16, 32, 48, 64},
|
||||
{0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_,
|
||||
_,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_,
|
||||
_,_,_,_,_,_,_,_,_,_,_,_,0x4030}
|
||||
},
|
||||
// QUANT_6, range 0..5
|
||||
{
|
||||
{0, 12, 25, 39, 52, 64},
|
||||
{0, 2, 4, 5, 3, 1},
|
||||
{0, 64, 12, 52, 25, 39},
|
||||
{0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_,
|
||||
0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_,
|
||||
_,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034}
|
||||
},
|
||||
// QUANT_8, range 0..7
|
||||
{
|
||||
{0, 9, 18, 27, 37, 46, 55, 64},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7},
|
||||
{0, 9, 18, 27, 37, 46, 55, 64},
|
||||
{0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_,
|
||||
_,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_,
|
||||
0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037}
|
||||
},
|
||||
// QUANT_10, range 0..9
|
||||
{
|
||||
{0, 7, 14, 21, 28, 36, 43, 50, 57, 64},
|
||||
{0, 2, 4, 6, 8, 9, 7, 5, 3, 1},
|
||||
{0, 64, 7, 57, 14, 50, 21, 43, 28, 36},
|
||||
{0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_,
|
||||
0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_,
|
||||
_,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_,
|
||||
_,0x4039}
|
||||
},
|
||||
// QUANT_12, range 0..11
|
||||
{
|
||||
{0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64},
|
||||
{0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1},
|
||||
{0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36},
|
||||
{0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_,
|
||||
0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_,
|
||||
0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_,
|
||||
0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b}
|
||||
},
|
||||
// QUANT_16, range 0..15
|
||||
{
|
||||
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64},
|
||||
{0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_,
|
||||
0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_,
|
||||
_,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_,
|
||||
_,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c}
|
||||
},
|
||||
// QUANT_20, range 0..19
|
||||
{
|
||||
{0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64},
|
||||
{0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1},
|
||||
{0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35},
|
||||
{0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_,
|
||||
0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_,
|
||||
0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_,
|
||||
0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_,
|
||||
0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d}
|
||||
},
|
||||
// QUANT_24, range 0..23
|
||||
{
|
||||
{0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64},
|
||||
{0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1},
|
||||
{0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34},
|
||||
{0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08,
|
||||
_,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_,
|
||||
0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_,
|
||||
0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d,
|
||||
_,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_,
|
||||
0x403b,_,0x403e}
|
||||
},
|
||||
// QUANT_32, range 0..31
|
||||
{
|
||||
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64},
|
||||
{0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_,
|
||||
0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_,
|
||||
0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_,
|
||||
0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_,
|
||||
0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_,
|
||||
0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_,
|
||||
0x403c,_,0x403e}
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,316 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2020-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Application entry point.
|
||||
*
|
||||
* This module contains the first command line entry point veneer, used to
|
||||
* validate that the host extended ISA availability matches the tool build.
|
||||
* It is compiled without any extended ISA support so it's guaranteed to be
|
||||
* executable without any invalid instruction errors.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
/**
|
||||
* @brief The main veneer entry point.
|
||||
*
|
||||
* @param argc The number of arguments.
|
||||
* @param argv The vector of arguments.
|
||||
*
|
||||
* @return 0 on success, non-zero otherwise.
|
||||
*/
|
||||
int astcenc_main_veneer(
|
||||
int argc,
|
||||
char **argv);
|
||||
|
||||
// x86-64 builds
|
||||
#if (ASTCENC_SSE > 20) || (ASTCENC_AVX > 0) || \
|
||||
(ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0)
|
||||
|
||||
/** Have the ISA trackers below been populated? Set to true at the end of detect_cpu_isa(). */
static bool g_init { false };
|
||||
|
||||
/** Does this CPU support SSE 4.1? Reads false until detect_cpu_isa() has run. */
|
||||
static bool g_cpu_has_sse41 { false };
|
||||
|
||||
/** Does this CPU support AVX2? Reads false until detect_cpu_isa() has run. */
|
||||
static bool g_cpu_has_avx2 { false };
|
||||
|
||||
/** Does this CPU support POPCNT? Reads false until detect_cpu_isa() has run. */
|
||||
static bool g_cpu_has_popcnt { false };
|
||||
|
||||
/** Does this CPU support F16C? Reads false until detect_cpu_isa() has run. */
|
||||
static bool g_cpu_has_f16c { false };
|
||||
|
||||
/* ============================================================================
|
||||
Platform code for Visual Studio
|
||||
============================================================================ */
|
||||
#if !defined(__clang__) && defined(_MSC_VER)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#include <intrin.h>
|
||||
|
||||
/**
|
||||
* @brief Detect platform CPU ISA support and update global trackers.
|
||||
*/
|
||||
static void detect_cpu_isa()
|
||||
{
|
||||
int data[4];
|
||||
|
||||
__cpuid(data, 0);
|
||||
int num_id = data[0];
|
||||
|
||||
if (num_id >= 1)
|
||||
{
|
||||
__cpuidex(data, 1, 0);
|
||||
// SSE41 = Bank 1, ECX, bit 19
|
||||
g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
|
||||
// POPCNT = Bank 1, ECX, bit 23
|
||||
g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
|
||||
// F16C = Bank 1, ECX, bit 29
|
||||
g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
|
||||
}
|
||||
|
||||
if (num_id >= 7)
|
||||
{
|
||||
__cpuidex(data, 7, 0);
|
||||
// AVX2 = Bank 7, EBX, bit 5
|
||||
g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
|
||||
}
|
||||
|
||||
// Ensure state bits are updated before init flag is updated
|
||||
MemoryBarrier();
|
||||
g_init = true;
|
||||
}
|
||||
|
||||
/* ============================================================================
|
||||
Platform code for GCC and Clang
|
||||
============================================================================ */
|
||||
#else
|
||||
#include <cpuid.h>
|
||||
|
||||
/**
|
||||
* @brief Detect platform CPU ISA support and update global trackers.
|
||||
*/
|
||||
static void detect_cpu_isa()
|
||||
{
|
||||
unsigned int data[4];
|
||||
|
||||
if (__get_cpuid_count(1, 0, &data[0], &data[1], &data[2], &data[3]))
|
||||
{
|
||||
// SSE41 = Bank 1, ECX, bit 19
|
||||
g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false;
|
||||
// POPCNT = Bank 1, ECX, bit 23
|
||||
g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false;
|
||||
// F16C = Bank 1, ECX, bit 29
|
||||
g_cpu_has_f16c = data[2] & (1 << 29) ? true : false;
|
||||
}
|
||||
|
||||
g_cpu_has_avx2 = 0;
|
||||
if (__get_cpuid_count(7, 0, &data[0], &data[1], &data[2], &data[3]))
|
||||
{
|
||||
// AVX2 = Bank 7, EBX, bit 5
|
||||
g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false;
|
||||
}
|
||||
|
||||
// Ensure state bits are updated before init flag is updated
|
||||
__sync_synchronize();
|
||||
g_init = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_POPCNT > 0
|
||||
/**
|
||||
* @brief Run-time detection if the host CPU supports the POPCNT extension.
|
||||
*
|
||||
* @return @c true if supported, @c false if not.
|
||||
*/
|
||||
static bool cpu_supports_popcnt()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
detect_cpu_isa();
|
||||
}
|
||||
|
||||
return g_cpu_has_popcnt;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_F16C > 0
|
||||
/**
|
||||
* @brief Run-time detection if the host CPU supports F16C extension.
|
||||
*
|
||||
* @return @c true if supported, @c false if not.
|
||||
*/
|
||||
static bool cpu_supports_f16c()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
detect_cpu_isa();
|
||||
}
|
||||
|
||||
return g_cpu_has_f16c;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_SSE >= 41
|
||||
/**
|
||||
* @brief Run-time detection if the host CPU supports SSE 4.1 extension.
|
||||
*
|
||||
* @return @c true if supported, @c false if not.
|
||||
*/
|
||||
static bool cpu_supports_sse41()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
detect_cpu_isa();
|
||||
}
|
||||
|
||||
return g_cpu_has_sse41;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_AVX >= 2
|
||||
/**
|
||||
* @brief Run-time detection if the host CPU supports AVX 2 extension.
|
||||
*
|
||||
* @return @c true if supported, @c false if not.
|
||||
*/
|
||||
static bool cpu_supports_avx2()
|
||||
{
|
||||
if (!g_init)
|
||||
{
|
||||
detect_cpu_isa();
|
||||
}
|
||||
|
||||
return g_cpu_has_avx2;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
 * @brief Write a message to stderr verbatim.
 *
 * The message is passed as a "%s" argument, so any '%' characters in it are
 * printed literally rather than interpreted.
 *
 * @param format   The message to print.
 */
static inline void print_error(const char* format)
{
    std::fprintf(stderr, "%s", format);
}
|
||||
|
||||
/**
|
||||
* @brief Validate CPU ISA support meets the requirements of this build of the library.
|
||||
*
|
||||
* Each library build is statically compiled for a particular set of CPU ISA features, such as the
|
||||
* SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
|
||||
* actually supports everything this build needs.
|
||||
*
|
||||
* @return Return @c true if validated, @c false otherwise.
|
||||
*/
|
||||
static bool validate_cpu_isa()
|
||||
{
|
||||
#if ASTCENC_AVX >= 2
|
||||
if (!cpu_supports_avx2())
|
||||
{
|
||||
print_error("ERROR: Host does not support AVX2 ISA extension\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_F16C >= 1
|
||||
if (!cpu_supports_f16c())
|
||||
{
|
||||
print_error("ERROR: Host does not support F16C ISA extension\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_SSE >= 41
|
||||
if (!cpu_supports_sse41())
|
||||
{
|
||||
print_error("ERROR: Host does not support SSE4.1 ISA extension\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ASTCENC_POPCNT >= 1
|
||||
if (!cpu_supports_popcnt())
|
||||
{
|
||||
print_error("ERROR: Host does not support POPCNT ISA extension\n");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Validate Arm SVE availability
|
||||
#elif ASTCENC_SVE != 0
|
||||
|
||||
#include <sys/auxv.h>
|
||||
static bool cpu_supports_sve()
|
||||
{
|
||||
long hwcaps = getauxval(AT_HWCAP);
|
||||
return (hwcaps & HWCAP_SVE) != 0;
|
||||
}
|
||||
|
||||
/**
 * @brief Write a message to stderr verbatim.
 *
 * The message is passed as a "%s" argument, so any '%' characters in it are
 * printed literally rather than interpreted.
 *
 * @param format   The message to print.
 */
static inline void print_error(const char* format)
{
    std::fprintf(stderr, "%s", format);
}
|
||||
|
||||
/**
|
||||
* @brief Validate that SVE is supported.
|
||||
*
|
||||
* Note that this function checks that SVE is supported, but because it
|
||||
* runs in the veneer which is compiled without SVE support, we cannot
|
||||
* check the SVE width is correct. This is checked later.
|
||||
*/
|
||||
static bool validate_cpu_isa()
|
||||
{
|
||||
if (!cpu_supports_sve())
|
||||
{
|
||||
print_error("ERROR: Host does not support SVE ISA extension\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/**
 * @brief Fallback validation for builds with no dynamic ISA availability checks.
 *
 * @return Always @c true.
 */
static bool validate_cpu_isa()
{
    constexpr bool host_is_compatible = true;
    return host_is_compatible;
}
|
||||
|
||||
#endif
|
||||
|
||||
int main(
|
||||
int argc,
|
||||
char **argv
|
||||
) {
|
||||
if (!validate_cpu_isa())
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
return astcenc_main_veneer(argc, argv);
|
||||
}
|
||||
@@ -0,0 +1,73 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Application entry point second veneer.
|
||||
*
|
||||
* This module contains the second command line entry point veneer, used to
|
||||
* validate that Arm SVE vector width matches the tool build. When used, it is
|
||||
* compiled with SVE ISA support but without any vector length override, so it
|
||||
* will see the native SVE vector length exposed to the application.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#if ASTCENC_SVE != 0
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief The main entry point.
|
||||
*
|
||||
* @param argc The number of arguments.
|
||||
* @param argv The vector of arguments.
|
||||
*
|
||||
* @return 0 on success, non-zero otherwise.
|
||||
*/
|
||||
int astcenc_main(
|
||||
int argc,
|
||||
char **argv);
|
||||
|
||||
/**
 * @brief Print a printf-style formatted message to stderr.
 *
 * @param format     The printf-style format string.
 * @param fmt_args   The values consumed by the format string.
 */
template<typename... Args>
static inline void print_error(const char* format, Args... fmt_args)
{
    fprintf(stderr, format, fmt_args...);
}
|
||||
|
||||
int astcenc_main_veneer(
|
||||
int argc,
|
||||
char **argv
|
||||
) {
|
||||
// We don't need this check for 128-bit SVE, because that is compiled as
|
||||
// VLA code, using predicate masks in the augmented NEON.
|
||||
#if ASTCENC_SVE > 4
|
||||
// svcntw() returns compile-time length if used with -msve-vector-bits
|
||||
if (svcntw() != ASTCENC_SVE)
|
||||
{
|
||||
int bits = ASTCENC_SVE * 32;
|
||||
print_error("ERROR: Host SVE support is not a %u-bit implementation\n", bits);
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
return astcenc_main(argc, argv);
|
||||
}
|
||||
@@ -0,0 +1,413 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions for computing image error metrics.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
|
||||
#include "astcenccli_internal.h"
|
||||
|
||||
/**
 * @brief Four independent double-precision running sums, one per color channel.
 */
class error_accum4
{
public:
    /** @brief The running sum for the red channel. */
    double sum_r = 0.0;

    /** @brief The running sum for the green channel. */
    double sum_g = 0.0;

    /** @brief The running sum for the blue channel. */
    double sum_b = 0.0;

    /** @brief The running sum for the alpha channel. */
    double sum_a = 0.0;
};
|
||||
|
||||
/**
|
||||
* @brief Incremental addition operator for error accumulators.
|
||||
*
|
||||
* @param val The accumulator to increment
|
||||
* @param inc The increment to apply
|
||||
*
|
||||
* @return The updated accumulator
|
||||
*/
|
||||
static error_accum4& operator+=(
|
||||
error_accum4 &val,
|
||||
vfloat4 inc
|
||||
) {
|
||||
val.sum_r += static_cast<double>(inc.lane<0>());
|
||||
val.sum_g += static_cast<double>(inc.lane<1>());
|
||||
val.sum_b += static_cast<double>(inc.lane<2>());
|
||||
val.sum_a += static_cast<double>(inc.lane<3>());
|
||||
return val;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief mPSNR tone-mapping operator for HDR images.
|
||||
*
|
||||
* @param val The color value to tone map
|
||||
* @param fstop The exposure fstop; should be in range [-125, 125]
|
||||
*
|
||||
* @return The mapped color value in [0.0f, 255.0f] range
|
||||
*/
|
||||
static float mpsnr_operator(
|
||||
float val,
|
||||
int fstop
|
||||
) {
|
||||
if32 p;
|
||||
p.u = 0x3f800000 + (fstop << 23); // 0x3f800000 is 1.0f
|
||||
val *= p.f;
|
||||
val = powf(val, (1.0f / 2.2f));
|
||||
val *= 255.0f;
|
||||
|
||||
return astc::clamp(val, 0.0f, 255.0f);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief mPSNR difference between two values.
|
||||
*
|
||||
* Differences are given as "val1 - val2".
|
||||
*
|
||||
* @param val1 The first color value
|
||||
* @param val2 The second color value
|
||||
* @param fstop_lo The low exposure fstop; should be in range [-125, 125]
|
||||
* @param fstop_hi The high exposure fstop; should be in range [-125, 125]
|
||||
*
|
||||
* @return The summed mPSNR difference across all active fstop levels
|
||||
*/
|
||||
static float mpsnr_sumdiff(
|
||||
float val1,
|
||||
float val2,
|
||||
int fstop_lo,
|
||||
int fstop_hi
|
||||
) {
|
||||
float summa = 0.0f;
|
||||
for (int i = fstop_lo; i <= fstop_hi; i++)
|
||||
{
|
||||
float mval1 = mpsnr_operator(val1, i);
|
||||
float mval2 = mpsnr_operator(val2, i);
|
||||
float mdiff = mval1 - mval2;
|
||||
summa += mdiff * mdiff;
|
||||
}
|
||||
return summa;
|
||||
}
|
||||
|
||||
/* See header for documentation */
|
||||
void compute_error_metrics(
|
||||
bool compute_hdr_metrics,
|
||||
bool compute_normal_metrics,
|
||||
int input_components,
|
||||
const astcenc_image* img1,
|
||||
const astcenc_image* img2,
|
||||
int fstop_lo,
|
||||
int fstop_hi
|
||||
) {
|
||||
static const int componentmasks[5] { 0x00, 0x07, 0x0C, 0x07, 0x0F };
|
||||
int componentmask = componentmasks[input_components];
|
||||
|
||||
error_accum4 errorsum;
|
||||
error_accum4 alpha_scaled_errorsum;
|
||||
error_accum4 log_errorsum;
|
||||
error_accum4 mpsnr_errorsum;
|
||||
double mean_angular_errorsum = 0.0;
|
||||
double worst_angular_errorsum = 0.0;
|
||||
|
||||
unsigned int dim_x = astc::min(img1->dim_x, img2->dim_x);
|
||||
unsigned int dim_y = astc::min(img1->dim_y, img2->dim_y);
|
||||
unsigned int dim_z = astc::min(img1->dim_z, img2->dim_z);
|
||||
|
||||
if (img1->dim_x != img2->dim_x ||
|
||||
img1->dim_y != img2->dim_y ||
|
||||
img1->dim_z != img2->dim_z)
|
||||
{
|
||||
printf("WARNING: Only intersection of images will be compared:\n"
|
||||
" Image 1: %dx%dx%d\n"
|
||||
" Image 2: %dx%dx%d\n",
|
||||
img1->dim_x, img1->dim_y, img1->dim_z,
|
||||
img2->dim_x, img2->dim_y, img2->dim_z);
|
||||
}
|
||||
|
||||
double rgb_peak = 0.0;
|
||||
unsigned int xsize1 = img1->dim_x;
|
||||
unsigned int xsize2 = img2->dim_x;
|
||||
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
vfloat4 color1;
|
||||
vfloat4 color2;
|
||||
|
||||
if (img1->data_type == ASTCENC_TYPE_U8)
|
||||
{
|
||||
uint8_t* data8 = static_cast<uint8_t*>(img1->data[z]);
|
||||
|
||||
color1 = vfloat4(
|
||||
data8[(4 * xsize1 * y) + (4 * x )],
|
||||
data8[(4 * xsize1 * y) + (4 * x + 1)],
|
||||
data8[(4 * xsize1 * y) + (4 * x + 2)],
|
||||
data8[(4 * xsize1 * y) + (4 * x + 3)]);
|
||||
|
||||
color1 = color1 / 255.0f;
|
||||
}
|
||||
else if (img1->data_type == ASTCENC_TYPE_F16)
|
||||
{
|
||||
uint16_t* data16 = static_cast<uint16_t*>(img1->data[z]);
|
||||
|
||||
vint4 color1i = vint4(
|
||||
data16[(4 * xsize1 * y) + (4 * x )],
|
||||
data16[(4 * xsize1 * y) + (4 * x + 1)],
|
||||
data16[(4 * xsize1 * y) + (4 * x + 2)],
|
||||
data16[(4 * xsize1 * y) + (4 * x + 3)]);
|
||||
|
||||
color1 = float16_to_float(color1i);
|
||||
color1 = clamp(0, 65504.0f, color1);
|
||||
}
|
||||
else // if (img1->data_type == ASTCENC_TYPE_F32)
|
||||
{
|
||||
assert(img1->data_type == ASTCENC_TYPE_F32);
|
||||
float* data32 = static_cast<float*>(img1->data[z]);
|
||||
|
||||
color1 = vfloat4(
|
||||
data32[(4 * xsize1 * y) + (4 * x )],
|
||||
data32[(4 * xsize1 * y) + (4 * x + 1)],
|
||||
data32[(4 * xsize1 * y) + (4 * x + 2)],
|
||||
data32[(4 * xsize1 * y) + (4 * x + 3)]);
|
||||
|
||||
color1 = clamp(0, 65504.0f, color1);
|
||||
}
|
||||
|
||||
if (img2->data_type == ASTCENC_TYPE_U8)
|
||||
{
|
||||
uint8_t* data8 = static_cast<uint8_t*>(img2->data[z]);
|
||||
|
||||
color2 = vfloat4(
|
||||
data8[(4 * xsize2 * y) + (4 * x )],
|
||||
data8[(4 * xsize2 * y) + (4 * x + 1)],
|
||||
data8[(4 * xsize2 * y) + (4 * x + 2)],
|
||||
data8[(4 * xsize2 * y) + (4 * x + 3)]);
|
||||
|
||||
color2 = color2 / 255.0f;
|
||||
}
|
||||
else if (img2->data_type == ASTCENC_TYPE_F16)
|
||||
{
|
||||
uint16_t* data16 = static_cast<uint16_t*>(img2->data[z]);
|
||||
|
||||
vint4 color2i = vint4(
|
||||
data16[(4 * xsize2 * y) + (4 * x )],
|
||||
data16[(4 * xsize2 * y) + (4 * x + 1)],
|
||||
data16[(4 * xsize2 * y) + (4 * x + 2)],
|
||||
data16[(4 * xsize2 * y) + (4 * x + 3)]);
|
||||
|
||||
color2 = float16_to_float(color2i);
|
||||
color2 = clamp(0, 65504.0f, color2);
|
||||
}
|
||||
else // if (img2->data_type == ASTCENC_TYPE_F32)
|
||||
{
|
||||
assert(img2->data_type == ASTCENC_TYPE_F32);
|
||||
float* data32 = static_cast<float*>(img2->data[z]);
|
||||
|
||||
color2 = vfloat4(
|
||||
data32[(4 * xsize2 * y) + (4 * x )],
|
||||
data32[(4 * xsize2 * y) + (4 * x + 1)],
|
||||
data32[(4 * xsize2 * y) + (4 * x + 2)],
|
||||
data32[(4 * xsize2 * y) + (4 * x + 3)]);
|
||||
|
||||
color2 = clamp(0, 65504.0f, color2);
|
||||
}
|
||||
|
||||
rgb_peak = astc::max(static_cast<double>(color1.lane<0>()),
|
||||
static_cast<double>(color1.lane<1>()),
|
||||
static_cast<double>(color1.lane<2>()),
|
||||
rgb_peak);
|
||||
|
||||
vfloat4 diffcolor = color1 - color2;
|
||||
vfloat4 diffcolor_sq = diffcolor * diffcolor;
|
||||
errorsum += diffcolor_sq;
|
||||
|
||||
vfloat4 alpha_scaled_diffcolor = vfloat4(
|
||||
diffcolor.lane<0>() * color1.lane<3>(),
|
||||
diffcolor.lane<1>() * color1.lane<3>(),
|
||||
diffcolor.lane<2>() * color1.lane<3>(),
|
||||
diffcolor.lane<3>());
|
||||
|
||||
vfloat4 alpha_scaled_diffcolor_sq = alpha_scaled_diffcolor * alpha_scaled_diffcolor;
|
||||
alpha_scaled_errorsum += alpha_scaled_diffcolor_sq;
|
||||
|
||||
if (compute_hdr_metrics)
|
||||
{
|
||||
vfloat4 log_input_color1 = log2(color1);
|
||||
vfloat4 log_input_color2 = log2(color2);
|
||||
|
||||
vfloat4 log_diffcolor = log_input_color1 - log_input_color2;
|
||||
|
||||
log_errorsum += log_diffcolor * log_diffcolor;
|
||||
|
||||
vfloat4 mpsnr_error = vfloat4(
|
||||
mpsnr_sumdiff(color1.lane<0>(), color2.lane<0>(), fstop_lo, fstop_hi),
|
||||
mpsnr_sumdiff(color1.lane<1>(), color2.lane<1>(), fstop_lo, fstop_hi),
|
||||
mpsnr_sumdiff(color1.lane<2>(), color2.lane<2>(), fstop_lo, fstop_hi),
|
||||
mpsnr_sumdiff(color1.lane<3>(), color2.lane<3>(), fstop_lo, fstop_hi));
|
||||
|
||||
mpsnr_errorsum += mpsnr_error;
|
||||
}
|
||||
|
||||
if (compute_normal_metrics)
|
||||
{
|
||||
// Decode the normal vector
|
||||
vfloat4 normal1 = (color1 - 0.5f) * 2.0f;
|
||||
normal1 = normalize_safe(normal1.swz<0, 1, 2>(), unit3());
|
||||
|
||||
vfloat4 normal2 = (color2 - 0.5f) * 2.0f;
|
||||
normal2 = normalize_safe(normal2.swz<0, 1, 2>(), unit3());
|
||||
|
||||
// Float error can push this outside of valid range for acos, so clamp to avoid NaN issues
|
||||
float normal_cos = clamp(-1.0f, 1.0f, dot3(normal1, normal2)).lane<0>();
|
||||
float rad_to_degrees = 180.0f / astc::PI;
|
||||
double error_degrees = std::acos(static_cast<double>(normal_cos)) * static_cast<double>(rad_to_degrees);
|
||||
|
||||
mean_angular_errorsum += error_degrees / (dim_x * dim_y * dim_z);
|
||||
worst_angular_errorsum = astc::max(worst_angular_errorsum, error_degrees);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double pixels = static_cast<double>(dim_x * dim_y * dim_z);
|
||||
double samples = 0.0;
|
||||
|
||||
double num = 0.0;
|
||||
double alpha_num = 0.0;
|
||||
double log_num = 0.0;
|
||||
double mpsnr_num = 0.0;
|
||||
|
||||
if (componentmask & 1)
|
||||
{
|
||||
num += errorsum.sum_r;
|
||||
alpha_num += alpha_scaled_errorsum.sum_r;
|
||||
log_num += log_errorsum.sum_r;
|
||||
mpsnr_num += mpsnr_errorsum.sum_r;
|
||||
samples += pixels;
|
||||
}
|
||||
|
||||
if (componentmask & 2)
|
||||
{
|
||||
num += errorsum.sum_g;
|
||||
alpha_num += alpha_scaled_errorsum.sum_g;
|
||||
log_num += log_errorsum.sum_g;
|
||||
mpsnr_num += mpsnr_errorsum.sum_g;
|
||||
samples += pixels;
|
||||
}
|
||||
|
||||
if (componentmask & 4)
|
||||
{
|
||||
num += errorsum.sum_b;
|
||||
alpha_num += alpha_scaled_errorsum.sum_b;
|
||||
log_num += log_errorsum.sum_b;
|
||||
mpsnr_num += mpsnr_errorsum.sum_b;
|
||||
samples += pixels;
|
||||
}
|
||||
|
||||
if (componentmask & 8)
|
||||
{
|
||||
num += errorsum.sum_a;
|
||||
alpha_num += alpha_scaled_errorsum.sum_a;
|
||||
samples += pixels;
|
||||
}
|
||||
|
||||
double denom = samples;
|
||||
double stopcount = static_cast<double>(fstop_hi - fstop_lo + 1);
|
||||
double mpsnr_denom = pixels * 3.0 * stopcount * 255.0 * 255.0;
|
||||
|
||||
double psnr;
|
||||
if (num == 0.0)
|
||||
{
|
||||
psnr = 999.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
psnr = 10.0 * log10(denom / num);
|
||||
}
|
||||
|
||||
double rgb_psnr = psnr;
|
||||
|
||||
printf("Quality metrics\n");
|
||||
printf("===============\n\n");
|
||||
|
||||
if (componentmask & 8)
|
||||
{
|
||||
printf(" PSNR (LDR-RGBA): %9.4f dB\n", psnr);
|
||||
|
||||
double alpha_psnr;
|
||||
if (alpha_num == 0.0)
|
||||
{
|
||||
alpha_psnr = 999.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
alpha_psnr = 10.0 * log10(denom / alpha_num);
|
||||
}
|
||||
printf(" Alpha-weighted PSNR: %9.4f dB\n", alpha_psnr);
|
||||
|
||||
double rgb_num = errorsum.sum_r + errorsum.sum_g + errorsum.sum_b;
|
||||
if (rgb_num == 0.0)
|
||||
{
|
||||
rgb_psnr = 999.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
rgb_psnr = 10.0 * log10(pixels * 3.0 / rgb_num);
|
||||
}
|
||||
printf(" PSNR (LDR-RGB): %9.4f dB\n", rgb_psnr);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(" PSNR (LDR-RGB): %9.4f dB\n", psnr);
|
||||
}
|
||||
|
||||
if (compute_hdr_metrics)
|
||||
{
|
||||
printf(" PSNR (RGB norm to peak): %9.4f dB (peak %f)\n",
|
||||
rgb_psnr + 20.0 * log10(rgb_peak), rgb_peak);
|
||||
|
||||
double mpsnr;
|
||||
if (mpsnr_num == 0.0)
|
||||
{
|
||||
mpsnr = 999.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
mpsnr = 10.0 * log10(mpsnr_denom / mpsnr_num);
|
||||
}
|
||||
|
||||
printf(" mPSNR (RGB): %9.4f dB (fstops %+d to %+d)\n",
|
||||
mpsnr, fstop_lo, fstop_hi);
|
||||
|
||||
double logrmse = sqrt(log_num / pixels);
|
||||
printf(" LogRMSE (RGB): %9.4f\n", logrmse);
|
||||
}
|
||||
|
||||
if (compute_normal_metrics)
|
||||
{
|
||||
printf(" Mean Angular Error: %9.4f degrees\n", mean_angular_errorsum);
|
||||
printf(" Worst Angular Error: %9.4f degrees\n", worst_angular_errorsum);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
@@ -0,0 +1,377 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2022 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions for creating in-memory ASTC image structures.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
|
||||
#include "astcenccli_internal.h"
|
||||
|
||||
/* See header for documentation. */
|
||||
astcenc_image *alloc_image(
|
||||
unsigned int bitness,
|
||||
unsigned int dim_x,
|
||||
unsigned int dim_y,
|
||||
unsigned int dim_z
|
||||
) {
|
||||
astcenc_image *img = new astcenc_image;
|
||||
img->dim_x = dim_x;
|
||||
img->dim_y = dim_y;
|
||||
img->dim_z = dim_z;
|
||||
|
||||
void** data = new void*[dim_z];
|
||||
img->data = data;
|
||||
|
||||
if (bitness == 8)
|
||||
{
|
||||
img->data_type = ASTCENC_TYPE_U8;
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
data[z] = new uint8_t[dim_x * dim_y * 4];
|
||||
}
|
||||
}
|
||||
else if (bitness == 16)
|
||||
{
|
||||
img->data_type = ASTCENC_TYPE_F16;
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
data[z] = new uint16_t[dim_x * dim_y * 4];
|
||||
}
|
||||
}
|
||||
else // if (bitness == 32)
|
||||
{
|
||||
assert(bitness == 32);
|
||||
img->data_type = ASTCENC_TYPE_F32;
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
data[z] = new float[dim_x * dim_y * 4];
|
||||
}
|
||||
}
|
||||
|
||||
return img;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
void free_image(astcenc_image * img)
|
||||
{
|
||||
if (img == nullptr)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
for (unsigned int z = 0; z < img->dim_z; z++)
|
||||
{
|
||||
delete[] reinterpret_cast<char*>(img->data[z]);
|
||||
}
|
||||
|
||||
delete[] img->data;
|
||||
delete img;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
int determine_image_components(const astcenc_image * img)
|
||||
{
|
||||
unsigned int dim_x = img->dim_x;
|
||||
unsigned int dim_y = img->dim_y;
|
||||
unsigned int dim_z = img->dim_z;
|
||||
|
||||
// Scan through the image data to determine how many color components the image has
|
||||
bool is_luma = true;
|
||||
bool has_alpha = false;
|
||||
|
||||
if (img->data_type == ASTCENC_TYPE_U8)
|
||||
{
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
uint8_t* data8 = static_cast<uint8_t*>(img->data[z]);
|
||||
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
int r = data8[(4 * dim_x * y) + (4 * x )];
|
||||
int g = data8[(4 * dim_x * y) + (4 * x + 1)];
|
||||
int b = data8[(4 * dim_x * y) + (4 * x + 2)];
|
||||
int a = data8[(4 * dim_x * y) + (4 * x + 3)];
|
||||
|
||||
is_luma = is_luma && (r == g) && (r == b);
|
||||
has_alpha = has_alpha || (a != 0xFF);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (img->data_type == ASTCENC_TYPE_F16)
|
||||
{
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
uint16_t* data16 = static_cast<uint16_t*>(img->data[z]);
|
||||
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
int r = data16[(4 * dim_x * y) + (4 * x )];
|
||||
int g = data16[(4 * dim_x * y) + (4 * x + 1)];
|
||||
int b = data16[(4 * dim_x * y) + (4 * x + 2)];
|
||||
int a = data16[(4 * dim_x * y) + (4 * x + 3)];
|
||||
|
||||
is_luma = is_luma && (r == g) && (r == b);
|
||||
has_alpha = has_alpha || ((a ^ 0xC3FF) != 0xFFFF);
|
||||
// a ^ 0xC3FF returns FFFF if and only if the input is 1.0
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if (img->data_type == ASTCENC_TYPE_F32)
|
||||
{
|
||||
assert(img->data_type == ASTCENC_TYPE_F32);
|
||||
|
||||
for (unsigned int z = 0; z < dim_z; z++)
|
||||
{
|
||||
float* data32 = static_cast<float*>(img->data[z]);
|
||||
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
float r = data32[(4 * dim_x * y) + (4 * x )];
|
||||
float g = data32[(4 * dim_x * y) + (4 * x + 1)];
|
||||
float b = data32[(4 * dim_x * y) + (4 * x + 2)];
|
||||
float a = data32[(4 * dim_x * y) + (4 * x + 3)];
|
||||
|
||||
is_luma = is_luma && (r == g) && (r == b);
|
||||
has_alpha = has_alpha || (a != 1.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int image_components = 1 + (is_luma == 0 ? 2 : 0) + (has_alpha ? 1 : 0);
|
||||
return image_components;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
astcenc_image* astc_img_from_floatx4_array(
|
||||
const float* data,
|
||||
unsigned int dim_x,
|
||||
unsigned int dim_y,
|
||||
bool y_flip
|
||||
) {
|
||||
astcenc_image* img = alloc_image(16, dim_x, dim_y, 1);
|
||||
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
uint16_t* data16 = static_cast<uint16_t*>(img->data[0]);
|
||||
unsigned int y_src = y_flip ? (dim_y - y - 1) : y;
|
||||
const float* src = data + 4 * dim_x * y_src;
|
||||
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
vint4 colorf16 = float_to_float16(vfloat4(
|
||||
src[4 * x ],
|
||||
src[4 * x + 1],
|
||||
src[4 * x + 2],
|
||||
src[4 * x + 3]
|
||||
));
|
||||
|
||||
data16[(4 * dim_x * y) + (4 * x )] = static_cast<uint16_t>(colorf16.lane<0>());
|
||||
data16[(4 * dim_x * y) + (4 * x + 1)] = static_cast<uint16_t>(colorf16.lane<1>());
|
||||
data16[(4 * dim_x * y) + (4 * x + 2)] = static_cast<uint16_t>(colorf16.lane<2>());
|
||||
data16[(4 * dim_x * y) + (4 * x + 3)] = static_cast<uint16_t>(colorf16.lane<3>());
|
||||
}
|
||||
}
|
||||
|
||||
return img;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
astcenc_image* astc_img_from_unorm8x4_array(
|
||||
const uint8_t* data,
|
||||
unsigned int dim_x,
|
||||
unsigned int dim_y,
|
||||
bool y_flip
|
||||
) {
|
||||
astcenc_image* img = alloc_image(8, dim_x, dim_y, 1);
|
||||
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
uint8_t* data8 = static_cast<uint8_t*>(img->data[0]);
|
||||
unsigned int y_src = y_flip ? (dim_y - y - 1) : y;
|
||||
const uint8_t* src = data + 4 * dim_x * y_src;
|
||||
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
data8[(4 * dim_x * y) + (4 * x )] = src[4 * x ];
|
||||
data8[(4 * dim_x * y) + (4 * x + 1)] = src[4 * x + 1];
|
||||
data8[(4 * dim_x * y) + (4 * x + 2)] = src[4 * x + 2];
|
||||
data8[(4 * dim_x * y) + (4 * x + 3)] = src[4 * x + 3];
|
||||
}
|
||||
}
|
||||
|
||||
return img;
|
||||
}
|
||||
|
||||
// initialize a flattened array of float values from an ASTC codec image
|
||||
// The returned array is allocated with new[] and must be deleted with delete[].
|
||||
/* See header for documentation. */
|
||||
float* floatx4_array_from_astc_img(
|
||||
const astcenc_image* img,
|
||||
bool y_flip,
|
||||
unsigned int z_index
|
||||
) {
|
||||
unsigned int dim_x = img->dim_x;
|
||||
unsigned int dim_y = img->dim_y;
|
||||
float *buf = new float[4 * dim_x * dim_y];
|
||||
|
||||
assert(z_index < img->dim_z);
|
||||
|
||||
if (img->data_type == ASTCENC_TYPE_U8)
|
||||
{
|
||||
uint8_t* data8 = static_cast<uint8_t*>(img->data[z_index]);
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
|
||||
float* dst = buf + y * dim_x * 4;
|
||||
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
dst[4 * x ] = data8[(4 * dim_x * ymod) + (4 * x )] * (1.0f / 255.0f);
|
||||
dst[4 * x + 1] = data8[(4 * dim_x * ymod) + (4 * x + 1)] * (1.0f / 255.0f);
|
||||
dst[4 * x + 2] = data8[(4 * dim_x * ymod) + (4 * x + 2)] * (1.0f / 255.0f);
|
||||
dst[4 * x + 3] = data8[(4 * dim_x * ymod) + (4 * x + 3)] * (1.0f / 255.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (img->data_type == ASTCENC_TYPE_F16)
|
||||
{
|
||||
uint16_t* data16 = static_cast<uint16_t*>(img->data[z_index]);
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
|
||||
float *dst = buf + y * dim_x * 4;
|
||||
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
vint4 colori(
|
||||
data16[(4 * dim_x * ymod) + (4 * x )],
|
||||
data16[(4 * dim_x * ymod) + (4 * x + 1)],
|
||||
data16[(4 * dim_x * ymod) + (4 * x + 2)],
|
||||
data16[(4 * dim_x * ymod) + (4 * x + 3)]
|
||||
);
|
||||
|
||||
vfloat4 color = float16_to_float(colori);
|
||||
store(color, dst + 4 * x);
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if (img->data_type == ASTCENC_TYPE_F32)
|
||||
{
|
||||
assert(img->data_type == ASTCENC_TYPE_F32);
|
||||
float* data32 = static_cast<float*>(img->data[z_index]);
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
|
||||
float *dst = buf + y * dim_x * 4;
|
||||
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
dst[4 * x ] = data32[(4 * dim_x * ymod) + (4 * x )];
|
||||
dst[4 * x + 1] = data32[(4 * dim_x * ymod) + (4 * x + 1)];
|
||||
dst[4 * x + 2] = data32[(4 * dim_x * ymod) + (4 * x + 2)];
|
||||
dst[4 * x + 3] = data32[(4 * dim_x * ymod) + (4 * x + 3)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
/* See header for documentation. */
|
||||
uint8_t* unorm8x4_array_from_astc_img(
|
||||
const astcenc_image* img,
|
||||
bool y_flip
|
||||
) {
|
||||
unsigned int dim_x = img->dim_x;
|
||||
unsigned int dim_y = img->dim_y;
|
||||
uint8_t* buf = new uint8_t[4 * dim_x * dim_y];
|
||||
|
||||
if (img->data_type == ASTCENC_TYPE_U8)
|
||||
{
|
||||
uint8_t* data8 = static_cast<uint8_t*>(img->data[0]);
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
|
||||
uint8_t* dst = buf + y * dim_x * 4;
|
||||
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
dst[4 * x ] = data8[(4 * dim_x * ymod) + (4 * x )];
|
||||
dst[4 * x + 1] = data8[(4 * dim_x * ymod) + (4 * x + 1)];
|
||||
dst[4 * x + 2] = data8[(4 * dim_x * ymod) + (4 * x + 2)];
|
||||
dst[4 * x + 3] = data8[(4 * dim_x * ymod) + (4 * x + 3)];
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (img->data_type == ASTCENC_TYPE_F16)
|
||||
{
|
||||
uint16_t* data16 = static_cast<uint16_t*>(img->data[0]);
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
|
||||
uint8_t* dst = buf + y * dim_x * 4;
|
||||
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
vint4 colori(
|
||||
data16[(4 * dim_x * ymod) + (4 * x )],
|
||||
data16[(4 * dim_x * ymod) + (4 * x + 1)],
|
||||
data16[(4 * dim_x * ymod) + (4 * x + 2)],
|
||||
data16[(4 * dim_x * ymod) + (4 * x + 3)]
|
||||
);
|
||||
|
||||
vfloat4 color = float16_to_float(colori);
|
||||
color = clamp(0.0f, 1.0f, color) * 255.0f;
|
||||
|
||||
colori = float_to_int_rtn(color);
|
||||
pack_and_store_low_bytes(colori, dst + 4 * x);
|
||||
}
|
||||
}
|
||||
}
|
||||
else // if (img->data_type == ASTCENC_TYPE_F32)
|
||||
{
|
||||
assert(img->data_type == ASTCENC_TYPE_F32);
|
||||
float* data32 = static_cast<float*>(img->data[0]);
|
||||
for (unsigned int y = 0; y < dim_y; y++)
|
||||
{
|
||||
unsigned int ymod = y_flip ? dim_y - y - 1 : y;
|
||||
uint8_t* dst = buf + y * dim_x * 4;
|
||||
|
||||
for (unsigned int x = 0; x < dim_x; x++)
|
||||
{
|
||||
dst[4 * x ] = static_cast<uint8_t>(astc::flt2int_rtn(astc::clamp1f(data32[(4 * dim_x * ymod) + (4 * x )]) * 255.0f));
|
||||
dst[4 * x + 1] = static_cast<uint8_t>(astc::flt2int_rtn(astc::clamp1f(data32[(4 * dim_x * ymod) + (4 * x + 1)]) * 255.0f));
|
||||
dst[4 * x + 2] = static_cast<uint8_t>(astc::flt2int_rtn(astc::clamp1f(data32[(4 * dim_x * ymod) + (4 * x + 2)]) * 255.0f));
|
||||
dst[4 * x + 3] = static_cast<uint8_t>(astc::flt2int_rtn(astc::clamp1f(data32[(4 * dim_x * ymod) + (4 * x + 3)]) * 255.0f));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
@@ -0,0 +1,174 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions for building the implementation of stb_image and tinyexr.
|
||||
*/
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
||||
#include "astcenccli_internal.h"
|
||||
|
||||
// Configure the STB image write library build.
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#define STB_IMAGE_WRITE_IMPLEMENTATION
|
||||
#define STBI_NO_GIF
|
||||
#define STBI_NO_PIC
|
||||
#define STBI_NO_PNM
|
||||
#define STBI_NO_PNG
|
||||
#define STBI_NO_PSD
|
||||
|
||||
// Configure the TinyEXR library build.
|
||||
#define TINYEXR_IMPLEMENTATION
|
||||
|
||||
// Configure the Wuffs library build.
|
||||
#define WUFFS_IMPLEMENTATION
|
||||
#define WUFFS_CONFIG__MODULES
|
||||
#define WUFFS_CONFIG__MODULE__ADLER32
|
||||
#define WUFFS_CONFIG__MODULE__BASE
|
||||
#define WUFFS_CONFIG__MODULE__CRC32
|
||||
#define WUFFS_CONFIG__MODULE__DEFLATE
|
||||
#define WUFFS_CONFIG__MODULE__PNG
|
||||
#define WUFFS_CONFIG__MODULE__ZLIB
|
||||
#include "wuffs-v0.3.c"
|
||||
|
||||
// For both libraries force asserts (which can be triggered by corrupt input
|
||||
// images) to be handled at runtime in release builds to avoid security issues.
|
||||
#define STBI_ASSERT(x) astcenc_runtime_assert(x)
|
||||
#define TEXR_ASSERT(x) astcenc_runtime_assert(x)
|
||||
|
||||
/**
|
||||
* @brief Trap image load failures and convert into a runtime error.
|
||||
*/
|
||||
static void astcenc_runtime_assert(bool condition)
|
||||
{
|
||||
if (!condition)
|
||||
{
|
||||
print_error("ERROR: Corrupt input image\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
#include "ThirdParty/stb_image.h"
|
||||
#include "ThirdParty/stb_image_write.h"
|
||||
#include "ThirdParty/tinyexr.h"
|
||||
|
||||
/**
|
||||
* @brief Load an image using Wuffs to provide the loader.
|
||||
*
|
||||
* @param filename The name of the file to load.
|
||||
* @param y_flip Should the image be vertically flipped?
|
||||
* @param[out] is_hdr Is this an HDR image load?
|
||||
* @param[out] component_count The number of components in the data.
|
||||
*
|
||||
* @return The loaded image data in a canonical 4 channel format, or @c nullptr on error.
|
||||
*/
|
||||
astcenc_image* load_png_with_wuffs(
|
||||
const char* filename,
|
||||
bool y_flip,
|
||||
bool& is_hdr,
|
||||
unsigned int& component_count
|
||||
) {
|
||||
is_hdr = false;
|
||||
component_count = 4;
|
||||
|
||||
std::ifstream file(filename, std::ios::binary | std::ios::ate);
|
||||
if (!file)
|
||||
{
|
||||
print_error("ERROR: Failed to load image %s (can't fopen)\n", filename);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::streamsize size = file.tellg();
|
||||
file.seekg(0, std::ios::beg);
|
||||
|
||||
std::vector<uint8_t> buffer(size);
|
||||
file.read((char*)buffer.data(), size);
|
||||
|
||||
wuffs_png__decoder *dec = wuffs_png__decoder__alloc();
|
||||
if (!dec)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
wuffs_base__image_config ic;
|
||||
wuffs_base__io_buffer src = wuffs_base__ptr_u8__reader(buffer.data(), size, true);
|
||||
wuffs_base__status status = wuffs_png__decoder__decode_image_config(dec, &ic, &src);
|
||||
if (status.repr)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
uint32_t dim_x = wuffs_base__pixel_config__width(&ic.pixcfg);
|
||||
uint32_t dim_y = wuffs_base__pixel_config__height(&ic.pixcfg);
|
||||
size_t num_pixels = dim_x * dim_y;
|
||||
if (num_pixels > (SIZE_MAX / 4))
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Override the image's native pixel format to be RGBA_NONPREMUL
|
||||
wuffs_base__pixel_config__set(
|
||||
&ic.pixcfg,
|
||||
WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL,
|
||||
WUFFS_BASE__PIXEL_SUBSAMPLING__NONE,
|
||||
dim_x, dim_y);
|
||||
|
||||
// Configure the work buffer
|
||||
size_t workbuf_len = wuffs_png__decoder__workbuf_len(dec).max_incl;
|
||||
if (workbuf_len > SIZE_MAX)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
wuffs_base__slice_u8 workbuf_slice = wuffs_base__make_slice_u8((uint8_t*)malloc(workbuf_len), workbuf_len);
|
||||
if (!workbuf_slice.ptr)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
wuffs_base__slice_u8 pixbuf_slice = wuffs_base__make_slice_u8((uint8_t*)malloc(num_pixels * 4), num_pixels * 4);
|
||||
if (!pixbuf_slice.ptr)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
wuffs_base__pixel_buffer pb;
|
||||
status = wuffs_base__pixel_buffer__set_from_slice(&pb, &ic.pixcfg, pixbuf_slice);
|
||||
if (status.repr)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Decode the pixels
|
||||
status = wuffs_png__decoder__decode_frame(dec, &pb, &src, WUFFS_BASE__PIXEL_BLEND__SRC, workbuf_slice, NULL);
|
||||
if (status.repr)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
astcenc_image* img = astc_img_from_unorm8x4_array(pixbuf_slice.ptr, dim_x, dim_y, y_flip);
|
||||
|
||||
free(pixbuf_slice.ptr);
|
||||
free(workbuf_slice.ptr);
|
||||
free(dec);
|
||||
|
||||
return img;
|
||||
}
|
||||
@@ -0,0 +1,422 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2023 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Functions and data declarations.
|
||||
*/
|
||||
|
||||
#ifndef ASTCENCCLI_INTERNAL_INCLUDED
|
||||
#define ASTCENCCLI_INTERNAL_INCLUDED
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "astcenc.h"
|
||||
#include "astcenc_mathlib.h"
|
||||
|
||||
/**
 * @brief Description and payload of a compressed ASTC image.
 *
 * All block and image dimensions are expressed in texels.
 */
struct astc_compressed_image
{
	/** @brief Width of one compressed block. */
	unsigned int block_x;

	/** @brief Height of one compressed block. */
	unsigned int block_y;

	/** @brief Depth of one compressed block. */
	unsigned int block_z;

	/** @brief Width of the whole image. */
	unsigned int dim_x;

	/** @brief Height of the whole image. */
	unsigned int dim_y;

	/** @brief Depth of the whole image. */
	unsigned int dim_z;

	/** @brief Pointer to the compressed binary payload. */
	uint8_t* data;

	/** @brief Size of the binary payload, in bytes. */
	size_t data_len;
};
|
||||
|
||||
/**
|
||||
* @brief Config options that have been read from command line.
|
||||
*/
|
||||
struct cli_config_options
|
||||
{
|
||||
/** @brief The number of threads to use for processing. */
|
||||
unsigned int thread_count;
|
||||
|
||||
/** @brief The number of repeats to execute for benchmarking. */
|
||||
unsigned int repeat_count;
|
||||
|
||||
/** @brief The number of image slices to load for a 3D image. */
|
||||
unsigned int array_size;
|
||||
|
||||
/** @brief @c true if running in silent mode with minimal output. */
|
||||
bool silentmode;
|
||||
|
||||
/** @brief @c true if the images should be y-flipped. */
|
||||
bool y_flip;
|
||||
|
||||
/** @brief @c true if diagnostic images should be stored. */
|
||||
bool diagnostic_images;
|
||||
|
||||
/** @brief The low exposure fstop for error computation. */
|
||||
int low_fstop;
|
||||
|
||||
/** @brief The high exposure fstop for error computation. */
|
||||
int high_fstop;
|
||||
|
||||
/** @brief The pre-encode swizzle. */
|
||||
astcenc_swizzle swz_encode;
|
||||
|
||||
/** @brief The post-decode swizzle. */
|
||||
astcenc_swizzle swz_decode;
|
||||
};
|
||||
|
||||
/**
 * @brief Write a plain string to stderr.
 *
 * The text is emitted verbatim; it is never interpreted as a format string.
 *
 * @param format   The string to print.
 */
static inline void print_error(
	const char* format
) {
	fputs(format, stderr);
}
|
||||
|
||||
/**
 * @brief Write a printf-style formatted string to stderr.
 *
 * @param format   The printf-style format string.
 * @param args     The values consumed by the format string.
 */
template <typename... Args>
static inline void print_error(
	const char* format,
	Args... args
) {
	fprintf(stderr, format, args...);
}
|
||||
|
||||
/**
|
||||
* @brief Load uncompressed image.
|
||||
*
|
||||
* @param filename The file path on disk.
|
||||
* @param y_flip Should this image be Y flipped?
|
||||
* @param[out] is_hdr Is the loaded image HDR?
|
||||
* @param[out] component_count The number of components in the loaded image.
|
||||
*
|
||||
* @return The astc image file, or nullptr on error.
|
||||
*/
|
||||
astcenc_image* load_ncimage(
|
||||
const char* filename,
|
||||
bool y_flip,
|
||||
bool& is_hdr,
|
||||
unsigned int& component_count);
|
||||
|
||||
/**
|
||||
* @brief Load uncompressed PNG image.
|
||||
*
|
||||
* @param filename The file path on disk.
|
||||
* @param y_flip Should this image be Y flipped?
|
||||
* @param[out] is_hdr Is the loaded image HDR?
|
||||
* @param[out] component_count The number of components in the loaded image.
|
||||
*
|
||||
* @return The astc image file, or nullptr on error.
|
||||
*/
|
||||
astcenc_image* load_png_with_wuffs(
|
||||
const char* filename,
|
||||
bool y_flip,
|
||||
bool& is_hdr,
|
||||
unsigned int& component_count);
|
||||
|
||||
/**
|
||||
* @brief Save an uncompressed image.
|
||||
*
|
||||
* @param img The source data for the image.
|
||||
* @param filename The name of the file to save.
|
||||
* @param y_flip Should the image be vertically flipped?
|
||||
*
|
||||
* @return @c true if the image saved OK, @c false on error.
|
||||
*/
|
||||
bool store_ncimage(
|
||||
const astcenc_image* img,
|
||||
const char* filename,
|
||||
int y_flip);
|
||||
|
||||
/**
|
||||
* @brief Check if the output file type requires a specific bitness.
|
||||
*
|
||||
 * @param filename The file name, containing the extension to check.
|
||||
*
|
||||
* @return Valid values are:
|
||||
* * -1 - error - unknown file type.
|
||||
* * 0 - no enforced bitness.
|
||||
* * 8 - enforced 8-bit UNORM.
|
||||
* * 16 - enforced 16-bit FP16.
|
||||
*/
|
||||
int get_output_filename_enforced_bitness(
|
||||
const char* filename);
|
||||
|
||||
/**
|
||||
* @brief Allocate a new image in a canonical format.
|
||||
*
|
||||
* Allocated images must be freed with a @c free_image() call.
|
||||
*
|
||||
* @param bitness The number of bits per component (8, 16, or 32).
|
||||
* @param dim_x The width of the image, in texels.
|
||||
* @param dim_y The height of the image, in texels.
|
||||
* @param dim_z The depth of the image, in texels.
|
||||
*
|
||||
* @return The allocated image, or @c nullptr on error.
|
||||
*/
|
||||
astcenc_image* alloc_image(
|
||||
unsigned int bitness,
|
||||
unsigned int dim_x,
|
||||
unsigned int dim_y,
|
||||
unsigned int dim_z);
|
||||
|
||||
/**
|
||||
* @brief Free an image.
|
||||
*
|
||||
* @param img The image to free.
|
||||
*/
|
||||
void free_image(
|
||||
astcenc_image* img);
|
||||
|
||||
/**
|
||||
* @brief Determine the number of active components in an image.
|
||||
*
|
||||
* @param img The image to analyze.
|
||||
*
|
||||
* @return The number of active components in the image.
|
||||
*/
|
||||
int determine_image_components(
|
||||
const astcenc_image* img);
|
||||
|
||||
/**
|
||||
* @brief Load a compressed .astc image.
|
||||
*
|
||||
* @param filename The file to load.
|
||||
* @param img The image to populate with loaded data.
|
||||
*
|
||||
* @return Non-zero on error, zero on success.
|
||||
*/
|
||||
int load_cimage(
|
||||
const char* filename,
|
||||
astc_compressed_image& img);
|
||||
|
||||
/**
|
||||
* @brief Store a compressed .astc image.
|
||||
*
|
||||
* @param img The image to store.
|
||||
* @param filename The file to save.
|
||||
*
|
||||
* @return Non-zero on error, zero on success.
|
||||
*/
|
||||
int store_cimage(
|
||||
const astc_compressed_image& img,
|
||||
const char* filename);
|
||||
|
||||
/**
|
||||
* @brief Load a compressed .ktx image.
|
||||
*
|
||||
* @param filename The file to load.
|
||||
* @param is_srgb Is this an sRGB encoded file?
|
||||
* @param img The image to populate with loaded data.
|
||||
*
|
||||
* @return Non-zero on error, zero on success.
|
||||
*/
|
||||
bool load_ktx_compressed_image(
|
||||
const char* filename,
|
||||
bool& is_srgb,
|
||||
astc_compressed_image& img) ;
|
||||
|
||||
/**
|
||||
* @brief Store a compressed .ktx image.
|
||||
*
|
||||
* @param img The image to store.
|
||||
* @param filename The file to store.
|
||||
* @param is_srgb Is this an sRGB encoded file?
|
||||
*
|
||||
* @return Non-zero on error, zero on success.
|
||||
*/
|
||||
bool store_ktx_compressed_image(
|
||||
const astc_compressed_image& img,
|
||||
const char* filename,
|
||||
bool is_srgb);
|
||||
|
||||
/**
|
||||
* @brief Create an image from a 2D float data array.
|
||||
*
|
||||
* @param data The raw input data.
|
||||
* @param dim_x The width of the image, in texels.
|
||||
* @param dim_y The height of the image, in texels.
|
||||
* @param y_flip Should this image be vertically flipped?
|
||||
*
|
||||
* @return The populated image.
|
||||
*/
|
||||
astcenc_image* astc_img_from_floatx4_array(
|
||||
const float* data,
|
||||
unsigned int dim_x,
|
||||
unsigned int dim_y,
|
||||
bool y_flip);
|
||||
|
||||
/**
|
||||
* @brief Create an image from a 2D byte data array.
|
||||
*
|
||||
* @param data The raw input data.
|
||||
* @param dim_x The width of the image, in texels.
|
||||
* @param dim_y The height of the image, in texels.
|
||||
* @param y_flip Should this image be vertically flipped?
|
||||
*
|
||||
* @return The populated image.
|
||||
*/
|
||||
astcenc_image* astc_img_from_unorm8x4_array(
|
||||
const uint8_t* data,
|
||||
unsigned int dim_x,
|
||||
unsigned int dim_y,
|
||||
bool y_flip);
|
||||
|
||||
/**
|
||||
* @brief Create a flattened RGBA FLOAT32 data array for a single slice from an image structure.
|
||||
*
|
||||
* The returned data array is allocated with @c new[] and must be freed with a @c delete[] call.
|
||||
*
|
||||
* @param img The input image.
|
||||
* @param y_flip Should the data in the array be Y flipped?
|
||||
* @param z_index The slice index to convert.
|
||||
*
|
||||
* @return The data array.
|
||||
*/
|
||||
float* floatx4_array_from_astc_img(
|
||||
const astcenc_image* img,
|
||||
bool y_flip,
|
||||
unsigned int z_index);
|
||||
|
||||
/**
|
||||
* @brief Create a flattened RGBA UNORM8 data array from an image structure.
|
||||
*
|
||||
* The returned data array is allocated with @c new[] and must be freed with a @c delete[] call.
|
||||
*
|
||||
* @param img The input image.
|
||||
* @param y_flip Should the data in the array be Y flipped?
|
||||
*
|
||||
* @return The data array.
|
||||
*/
|
||||
uint8_t* unorm8x4_array_from_astc_img(
|
||||
const astcenc_image* img,
|
||||
bool y_flip);
|
||||
|
||||
/* ============================================================================
|
||||
Functions for printing build info and help messages
|
||||
============================================================================ */
|
||||
|
||||
/**
|
||||
* @brief Print the tool copyright and version header to stdout.
|
||||
*/
|
||||
void astcenc_print_header();
|
||||
|
||||
/**
|
||||
* @brief Print the tool copyright, version, and short-form help to stdout.
|
||||
*/
|
||||
void astcenc_print_shorthelp();
|
||||
|
||||
/**
|
||||
* @brief Print the tool copyright, version, and long-form help to stdout.
|
||||
*/
|
||||
void astcenc_print_longhelp();
|
||||
|
||||
/**
|
||||
* @brief Compute error metrics comparing two images.
|
||||
*
|
||||
* @param compute_hdr_metrics True if HDR metrics should be computed.
|
||||
* @param compute_normal_metrics True if normal map metrics should be computed.
|
||||
* @param input_components The number of input color components.
|
||||
* @param img1 The original image.
|
||||
* @param img2 The compressed image.
|
||||
* @param fstop_lo The low exposure fstop (HDR only).
|
||||
* @param fstop_hi The high exposure fstop (HDR only).
|
||||
*/
|
||||
void compute_error_metrics(
|
||||
bool compute_hdr_metrics,
|
||||
bool compute_normal_metrics,
|
||||
int input_components,
|
||||
const astcenc_image* img1,
|
||||
const astcenc_image* img2,
|
||||
int fstop_lo,
|
||||
int fstop_hi);
|
||||
|
||||
/**
|
||||
* @brief Get the current time.
|
||||
*
|
||||
* @return The current time in seconds since arbitrary epoch.
|
||||
*/
|
||||
double get_time();
|
||||
|
||||
/**
|
||||
* @brief Get the number of CPU cores.
|
||||
*
|
||||
* @return The number of online or onlineable CPU cores in the system.
|
||||
*/
|
||||
int get_cpu_count();
|
||||
|
||||
/**
|
||||
* @brief Launch N worker threads and wait for them to complete.
|
||||
*
|
||||
* All threads run the same thread function, and have the same thread payload, but are given a
|
||||
* unique thread ID (0 .. N-1) as a parameter to the run function to allow thread-specific behavior.
|
||||
*
|
||||
* @param operation The name of the operation for this async task.
|
||||
* @param thread_count The number of threads to spawn.
|
||||
* @param func The function to execute. Must have the signature:
|
||||
* void (int thread_count, int thread_id, void* payload)
|
||||
* @param payload Pointer to an opaque thread payload object.
|
||||
*/
|
||||
void launch_threads(
|
||||
const char* operation,
|
||||
int thread_count,
|
||||
void (*func)(int, int, void*),
|
||||
void *payload);
|
||||
|
||||
/**
|
||||
* @brief Set the current thread name to a string value.
|
||||
*
|
||||
* For portability strings should be no longer than 16 characters.
|
||||
*
|
||||
* @param name The thread name.
|
||||
*/
|
||||
void set_thread_name(
|
||||
const char* name);
|
||||
|
||||
/**
|
||||
* @brief The main entry point.
|
||||
*
|
||||
* @param argc The number of arguments.
|
||||
* @param argv The vector of arguments.
|
||||
*
|
||||
* @return 0 on success, non-zero otherwise.
|
||||
*/
|
||||
int astcenc_main(
|
||||
int argc,
|
||||
char **argv);
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,309 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// ----------------------------------------------------------------------------
|
||||
// Copyright 2011-2024 Arm Limited
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at:
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations
|
||||
// under the License.
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* @brief Platform-specific function implementations.
|
||||
*
|
||||
* This module contains functions with strongly OS-dependent implementations:
|
||||
*
|
||||
* * CPU count queries
|
||||
* * Threading
|
||||
* * Time
|
||||
*
|
||||
* In addition to the basic thread abstraction (which is native pthreads on
|
||||
* all platforms, except Windows where it is an emulation of pthreads), a
|
||||
* utility function to create N threads and wait for them to complete a batch
|
||||
* task has also been provided.
|
||||
*/
|
||||
|
||||
#include "astcenccli_internal.h"
|
||||
|
||||
/* ============================================================================
|
||||
Platform code for Windows using the Win32 APIs.
|
||||
============================================================================ */
|
||||
#if defined(_WIN32) && !defined(__CYGWIN__)
|
||||
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#define NOMINMAX
|
||||
#include <windows.h>
|
||||
#include <Processthreadsapi.h>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
|
||||
/** @brief Alias pthread_t to one of the internal Windows types. */
|
||||
typedef HANDLE pthread_t;
|
||||
|
||||
/** @brief Alias pthread_attr_t to one of the internal Windows types. */
|
||||
typedef int pthread_attr_t;
|
||||
|
||||
/**
|
||||
* @brief Proxy Windows @c CreateThread underneath a pthreads-like wrapper.
|
||||
*/
|
||||
static int pthread_create(
|
||||
pthread_t* thread,
|
||||
const pthread_attr_t* attribs,
|
||||
void* (*threadfunc)(void*),
|
||||
void* thread_arg
|
||||
) {
|
||||
static_cast<void>(attribs);
|
||||
LPTHREAD_START_ROUTINE func = reinterpret_cast<LPTHREAD_START_ROUTINE>(threadfunc);
|
||||
*thread = CreateThread(nullptr, 0, func, thread_arg, 0, nullptr);
|
||||
|
||||
// Ensure we return 0 on success, non-zero on error
|
||||
if (*thread == NULL)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Manually set CPU group and thread affinity.
|
||||
*
|
||||
* This is needed on Windows 10 or older to allow benefit from large core count
|
||||
* systems with more than 64 logical CPUs. The assignment is skipped on systems
|
||||
* with a single processor group, as it is not necessary.
|
||||
*/
|
||||
static void set_group_affinity(
|
||||
pthread_t thread,
|
||||
int thread_index
|
||||
) {
|
||||
// Skip thread assignment for hardware with a single CPU group
|
||||
int group_count = GetActiveProcessorGroupCount();
|
||||
if (group_count == 1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Ensure we have a valid assign if user creates more threads than cores
|
||||
int assign_index = thread_index % get_cpu_count();
|
||||
int assign_group { 0 };
|
||||
int assign_group_cpu_count { 0 };
|
||||
|
||||
// Determine which core group and core in the group to use for this thread
|
||||
int group_cpu_count_sum { 0 };
|
||||
for (int group = 0; group < group_count; group++)
|
||||
{
|
||||
int group_cpu_count = static_cast<int>(GetMaximumProcessorCount(group));
|
||||
group_cpu_count_sum += group_cpu_count;
|
||||
|
||||
if (assign_index < group_cpu_count_sum)
|
||||
{
|
||||
assign_group = group;
|
||||
assign_group_cpu_count = group_cpu_count;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Set the affinity to the assigned group, and all supported cores
|
||||
GROUP_AFFINITY affinity {};
|
||||
affinity.Mask = (1 << assign_group_cpu_count) - 1;
|
||||
affinity.Group = assign_group;
|
||||
SetThreadGroupAffinity(thread, &affinity, nullptr);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Proxy Windows @c WaitForSingleObject underneath a pthreads-like wrapper.
|
||||
*/
|
||||
static int pthread_join(
|
||||
pthread_t thread,
|
||||
void** value
|
||||
) {
|
||||
static_cast<void>(value);
|
||||
WaitForSingleObject(thread, INFINITE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* See header for documentation */
|
||||
int get_cpu_count()
|
||||
{
|
||||
DWORD cpu_count = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
|
||||
return static_cast<int>(cpu_count);
|
||||
}
|
||||
|
||||
/* See header for documentation */
|
||||
double get_time()
|
||||
{
|
||||
FILETIME tv;
|
||||
GetSystemTimePreciseAsFileTime(&tv);
|
||||
unsigned long long ticks = tv.dwHighDateTime;
|
||||
ticks = (ticks << 32) | tv.dwLowDateTime;
|
||||
return static_cast<double>(ticks) / 1.0e7;
|
||||
}
|
||||
|
||||
/* See header for documentation */
|
||||
void set_thread_name(
|
||||
const char* name
|
||||
) {
|
||||
// Names are limited to 16 characters
|
||||
wchar_t wname [16] { 0 };
|
||||
size_t name_len = std::strlen(name);
|
||||
size_t clamp_len = std::min<size_t>(name_len, 15);
|
||||
|
||||
// We know we only have basic 7-bit ASCII so just widen
|
||||
for (size_t i = 0; i < clamp_len; i++)
|
||||
{
|
||||
wname[i] = static_cast<wchar_t>(name[i]);
|
||||
}
|
||||
|
||||
SetThreadDescription(GetCurrentThread(), wname);
|
||||
}
|
||||
|
||||
/* ============================================================================
|
||||
Platform code for a platform using POSIX APIs.
|
||||
============================================================================ */
|
||||
#else
|
||||
|
||||
#include <pthread.h>
|
||||
#include <sys/time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/* See header for documentation */
int get_cpu_count()
{
	// sysconf() returns -1 on failure; report a single core in that case so
	// that callers never receive a negative or zero count
	long online_count = sysconf(_SC_NPROCESSORS_ONLN);
	if (online_count < 1)
	{
		return 1;
	}

	return static_cast<int>(online_count);
}
|
||||
|
||||
/* See header for documentation */
double get_time()
{
	timeval now;
	gettimeofday(&now, 0);

	// Whole seconds plus the microsecond remainder scaled to seconds
	double whole_seconds = static_cast<double>(now.tv_sec);
	double microseconds = static_cast<double>(now.tv_usec);
	return whole_seconds + microseconds * 1.0e-6;
}
|
||||
|
||||
/* See header for documentation */
void set_thread_name(
	const char* name
) {
	// No standard mechanism, so be defensive here
#if defined(__linux__)
	// Linux limits thread names to 16 bytes including the terminator and
	// rejects anything longer, so truncate instead of silently losing the name
	char short_name[16] = { 0 };
	for (unsigned int i = 0; i < 15 && name[i] != '\0'; i++)
	{
		short_name[i] = name[i];
	}

	pthread_setname_np(pthread_self(), short_name);
#elif defined(__APPLE__)
	// The Apple variant can only name the calling thread
	pthread_setname_np(name);
#else
	// No known naming API on this platform
	(void)name;
#endif
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
 * @brief Per-thread launch record handed to each worker started by launch_threads.
 */
struct launch_desc
{
	/** @brief Native handle of the spawned thread. */
	pthread_t thread_handle;

	/** @brief Number of threads requested for the thread pool. */
	int thread_count;

	/** @brief Index of this thread within the thread pool. */
	int thread_id;

	/** @brief User entry point, invoked as func(thread_count, thread_id, payload). */
	void (*func)(int, int, void*);

	/** @brief Opaque user data forwarded unchanged to @c func. */
	void* payload;
};
|
||||
|
||||
/**
|
||||
* @brief Helper function to translate thread entry points.
|
||||
*
|
||||
* Convert a (void*) thread entry to an (int, void*) thread entry, where the
|
||||
* integer contains the thread ID in the thread pool.
|
||||
*
|
||||
* @param p The thread launch helper payload.
|
||||
*/
|
||||
static void* launch_threads_helper(
|
||||
void *p
|
||||
) {
|
||||
launch_desc* ltd = reinterpret_cast<launch_desc*>(p);
|
||||
ltd->func(ltd->thread_count, ltd->thread_id, ltd->payload);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/* See header for documentation */
|
||||
void launch_threads(
|
||||
const char* operation,
|
||||
int thread_count,
|
||||
void (*func)(int, int, void*),
|
||||
void *payload
|
||||
) {
|
||||
// Directly execute single threaded workloads on this thread
|
||||
if (thread_count <= 1)
|
||||
{
|
||||
func(1, 0, payload);
|
||||
return;
|
||||
}
|
||||
|
||||
// Otherwise spawn worker threads
|
||||
launch_desc *thread_descs = new launch_desc[thread_count];
|
||||
int actual_thread_count { 0 };
|
||||
|
||||
for (int i = 0; i < thread_count; i++)
|
||||
{
|
||||
thread_descs[actual_thread_count].thread_count = thread_count;
|
||||
thread_descs[actual_thread_count].thread_id = actual_thread_count;
|
||||
thread_descs[actual_thread_count].payload = payload;
|
||||
thread_descs[actual_thread_count].func = func;
|
||||
|
||||
// Handle pthread_create failing by simply using fewer threads
|
||||
int error = pthread_create(
|
||||
&(thread_descs[actual_thread_count].thread_handle),
|
||||
nullptr,
|
||||
launch_threads_helper,
|
||||
reinterpret_cast<void*>(thread_descs + actual_thread_count));
|
||||
|
||||
// Track how many threads we actually created
|
||||
if (!error)
|
||||
{
|
||||
// Windows needs explicit thread assignment to handle large core count systems
|
||||
#if defined(_WIN32) && !defined(__CYGWIN__)
|
||||
set_group_affinity(
|
||||
thread_descs[actual_thread_count].thread_handle,
|
||||
actual_thread_count);
|
||||
#endif
|
||||
|
||||
actual_thread_count++;
|
||||
}
|
||||
}
|
||||
|
||||
// If we did not create thread_count threads then emit a warning
|
||||
if (actual_thread_count != thread_count)
|
||||
{
|
||||
int log_count = actual_thread_count == 0 ? 1 : actual_thread_count;
|
||||
const char* log_s = log_count == 1 ? "" : "s";
|
||||
printf("WARNING: %s using %d thread%s due to thread creation error\n\n",
|
||||
operation, log_count, log_s);
|
||||
}
|
||||
|
||||
// If we managed to spawn any threads wait for them to complete
|
||||
if (actual_thread_count != 0)
|
||||
{
|
||||
for (int i = 0; i < actual_thread_count; i++)
|
||||
{
|
||||
pthread_join(thread_descs[i].thread_handle, nullptr);
|
||||
}
|
||||
}
|
||||
// Else fall back to using this thread
|
||||
else
|
||||
{
|
||||
func(1, 0, payload);
|
||||
}
|
||||
|
||||
delete[] thread_descs;
|
||||
}
|
||||