From 0bbdd4af2a282c027fabd1a1de1958d80b519fff Mon Sep 17 00:00:00 2001 From: Pavel Martishevsky Date: Wed, 18 Feb 2026 01:26:45 +0900 Subject: [PATCH] Re-works how LDS regions are defined and how their total size is computed to use for HLSL groupshared definition. --- .../Shaders/ZstdGpuDecompressLiterals.hlsl | 2 +- .../Shaders/ZstdGpuDecompressSequences.hlsl | 21 +- zstd/zstdgpu/Shaders/ZstdGpuInitFseTable.hlsl | 19 +- .../Shaders/ZstdGpuInitHuffmanTable.hlsl | 2 +- ...InitHuffmanTableAndDecompressLiterals.hlsl | 2 +- zstd/zstdgpu/zstdgpu_lds.h | 54 ++--- zstd/zstdgpu/zstdgpu_lds_decl_base.h | 30 +++ zstd/zstdgpu/zstdgpu_lds_decl_size.h | 21 ++ zstd/zstdgpu/zstdgpu_lds_decl_undef.h | 19 ++ zstd/zstdgpu/zstdgpu_lds_hlsl.h | 14 +- zstd/zstdgpu/zstdgpu_shaders.h | 202 +++++++++++++----- zstd/zstdgpu/zstdgpu_structs.h | 23 +- 12 files changed, 271 insertions(+), 138 deletions(-) create mode 100644 zstd/zstdgpu/zstdgpu_lds_decl_base.h create mode 100644 zstd/zstdgpu/zstdgpu_lds_decl_size.h create mode 100644 zstd/zstdgpu/zstdgpu_lds_decl_undef.h diff --git a/zstd/zstdgpu/Shaders/ZstdGpuDecompressLiterals.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuDecompressLiterals.hlsl index dc6b3a6..71b4515 100644 --- a/zstd/zstdgpu/Shaders/ZstdGpuDecompressLiterals.hlsl +++ b/zstd/zstdgpu/Shaders/ZstdGpuDecompressLiterals.hlsl @@ -28,7 +28,7 @@ ZSTDGPU_DECOMPRESS_LITERALS_SRT() #include "../zstdgpu_srt_decl_undef.h" // WARN(pamartis): Wasteful, need only uint8_t but HLSL doesn't support it -groupshared uint32_t GS_Lds[kzstdgpu_MaxCount_HuffmanTableExpandedUInts]; +groupshared uint32_t GS_Lds[kzstdgpu_DecompressLiterals_LdsSize]; #define ZSTDGPU_LDS GS_Lds #include "../zstdgpu_lds_hlsl.h" diff --git a/zstd/zstdgpu/Shaders/ZstdGpuDecompressSequences.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuDecompressSequences.hlsl index 3c32f9b..3158f8e 100644 --- a/zstd/zstdgpu/Shaders/ZstdGpuDecompressSequences.hlsl +++ b/zstd/zstdgpu/Shaders/ZstdGpuDecompressSequences.hlsl @@ -15,12 +15,6 @@ * Author(s): Pavel 
Martishevsky (pamartis@microsoft.com) */ -#include "../zstdgpu_shaders.h" - -#include "../zstdgpu_srt_decl_bind.h" -ZSTDGPU_DECOMPRESS_SEQUENCES_SRT() -#include "../zstdgpu_srt_decl_undef.h" - #ifdef __XBOX_SCARLETT #define __XBOX_ENABLE_WAVE32 1 #endif @@ -31,15 +25,24 @@ ZSTDGPU_DECOMPRESS_SEQUENCES_SRT() //#define USE_LDS_OUT_CACHE 1 #endif +#ifdef USE_LDS_OUT_CACHE +#define SEQ_CACHE_LEN 128 +#endif + +#include "../zstdgpu_shaders.h" + +#include "../zstdgpu_srt_decl_bind.h" +ZSTDGPU_DECOMPRESS_SEQUENCES_SRT() +#include "../zstdgpu_srt_decl_undef.h" + #ifdef USE_LDS_FSE_CACHE -groupshared uint32_t Lds[kzstdgpu_FseElemMaxCount_LLen + kzstdgpu_FseElemMaxCount_MLen + kzstdgpu_FseElemMaxCount_Offs]; +groupshared uint32_t Lds[kzstdgpu_DecompressSequences_LdsFseCache_LdsSize]; #define ZSTDGPU_LDS Lds #include "../zstdgpu_lds_hlsl.h" #endif #ifdef USE_LDS_OUT_CACHE -#define SEQ_CACHE_LEN 128 -groupshared uint32_t Lds[kzstdgpu_TgSizeX_DecompressSequences * (SEQ_CACHE_LEN + 1) * 3]; +groupshared uint32_t Lds[kzstdgpu_DecompressSequences_LdsOutCache_LdsSize]; #define ZSTDGPU_LDS Lds #include "../zstdgpu_lds_hlsl.h" #endif diff --git a/zstd/zstdgpu/Shaders/ZstdGpuInitFseTable.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuInitFseTable.hlsl index 73d99a7..1842065 100644 --- a/zstd/zstdgpu/Shaders/ZstdGpuInitFseTable.hlsl +++ b/zstd/zstdgpu/Shaders/ZstdGpuInitFseTable.hlsl @@ -53,20 +53,11 @@ ConstantBuffer Constants : register(b0); ZSTDGPU_INIT_FSE_TABLE_SRT() #include "../zstdgpu_srt_decl_undef.h" -groupshared uint32_t Lds[ - 0 + - #if (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_DEFAULT) || (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_REFERENCE) - + kzstdgpu_MaxCount_FseProbs - + kzstdgpu_MaxCount_FseElemsAllDigitBits - #elif ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_EXPERIMENTAL - + kzstdgpu_MaxCount_FseElems * 2 - + kzstdgpu_MaxCount_FseElemsOneDigitBits * 2 // kzstdgpu_MaxCount_FseElemsOneDigitBits - masks, kzstdgpu_MaxCount_FseElemsOneDigitBits 
- ones prefix - #endif - - #if IS_MULTI_WAVE - + kzstdgpu_WaveCountMax_InitFseTable * 3 + 3 - #endif -]; +#if (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_DEFAULT) || (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_REFERENCE) +groupshared uint32_t Lds[kzstdgpu_InitFseTable_Default_LdsSize]; +#elif ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_EXPERIMENTAL +groupshared uint32_t Lds[kzstdgpu_InitFseTable_Experimental_LdsSize]; +#endif #define ZSTDGPU_LDS Lds #include "../zstdgpu_lds_hlsl.h" diff --git a/zstd/zstdgpu/Shaders/ZstdGpuInitHuffmanTable.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuInitHuffmanTable.hlsl index d75a94a..7746fe3 100644 --- a/zstd/zstdgpu/Shaders/ZstdGpuInitHuffmanTable.hlsl +++ b/zstd/zstdgpu/Shaders/ZstdGpuInitHuffmanTable.hlsl @@ -29,7 +29,7 @@ ZSTDGPU_INIT_HUFFMAN_TABLE_SRT() #include "../zstdgpu_srt_decl_undef.h" // WARN(pamartis): Wasteful, need only uint8_t but HLSL doesn't support it -groupshared uint32_t GS_Lds[kzstdgpu_MaxCount_HuffmanWeights + kzstdgpu_MaxCount_HuffmanWeightsAllDigitBits + kzstdgpu_MaxCount_HuffmanWeightRanks * 3 + 2]; +groupshared uint32_t GS_Lds[kzstdgpu_InitHuffmanTable_LdsSize]; #define ZSTDGPU_LDS GS_Lds #include "../zstdgpu_lds_hlsl.h" diff --git a/zstd/zstdgpu/Shaders/ZstdGpuInitHuffmanTableAndDecompressLiterals.hlsl b/zstd/zstdgpu/Shaders/ZstdGpuInitHuffmanTableAndDecompressLiterals.hlsl index 9ca38ff..97fbca9 100644 --- a/zstd/zstdgpu/Shaders/ZstdGpuInitHuffmanTableAndDecompressLiterals.hlsl +++ b/zstd/zstdgpu/Shaders/ZstdGpuInitHuffmanTableAndDecompressLiterals.hlsl @@ -33,7 +33,7 @@ ZSTDGPU_INIT_HUFFMAN_TABLE_AND_DECOMPRESS_LITERALS_SRT() #include "../zstdgpu_srt_decl_undef.h" // WARN(pamartis): Wasteful, need only uint8_t but HLSL doesn't support it -groupshared uint32_t GS_Lds[kzstdgpu_MaxCount_HuffmanWeights * 2 + kzstdgpu_MaxCount_HuffmanWeightsAllDigitBits + kzstdgpu_MaxCount_HuffmanWeightRanks * 3 + 2 + kzstdgpu_MaxCount_HuffmanTableExpandedUInts]; +groupshared uint32_t GS_Lds[kzstdgpu_InitHuffmanTableAndDecompressLiterals_LdsSize]; #define ZSTDGPU_LDS 
GS_Lds #include "../zstdgpu_lds_hlsl.h" diff --git a/zstd/zstdgpu/zstdgpu_lds.h b/zstd/zstdgpu/zstdgpu_lds.h index ee926c9..6344f0a 100644 --- a/zstd/zstdgpu/zstdgpu_lds.h +++ b/zstd/zstdgpu/zstdgpu_lds.h @@ -22,79 +22,53 @@ #ifdef __hlsl_dx_compiler -#ifndef ZSTDGPU_START_GROUPSHARED -#define ZSTDGPU_START_GROUPSHARED() \ - uint32_t GS_Region = 0; -#endif - -#ifndef ZSTDGPU_DECLARE_GROUPSHARED -#define ZSTDGPU_DECLARE_GROUPSHARED(name, size) \ - static const uint32_t GS_##name##_Size = size; \ - const uint32_t GS_##name = GS_Region; \ - GS_Region += GS_##name##_Size; -#endif - // For HLSL this file contains only definitions that don't contain references to the LDS because // HLSL doesn't support passing `groupshared` variables into functions #pragma dxc push #pragma dxc diagnostic ignored "-Wundefined-internal" -static uint32_t zstdgpu_LdsLoadU32(uint32_t offsetInUInt32); -static void zstdgpu_LdsStoreU32(uint32_t offsetInUInt32, uint32_t x); -static void zstdgpu_LdsAtomicAddU32(uint32_t offsetInUInt32, uint32_t x); -static void zstdgpu_LdsAtomicMaxU32(uint32_t offsetInUInt32, uint32_t x); -static void zstdgpu_LdsAtomicMinU32(uint32_t offsetInUInt32, uint32_t x); -static void zstdgpu_LdsAtomicAndU32(uint32_t offsetInUInt32, uint32_t x); -static void zstdgpu_LdsAtomicOrU32(uint32_t offsetInUInt32, uint32_t x); +static uint32_t zstdgpu_LdsLoadU32(zstdgpu_lds_const_uintptr_t offsetInUInt32); +static void zstdgpu_LdsStoreU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x); +static void zstdgpu_LdsAtomicAddU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x); +static void zstdgpu_LdsAtomicMaxU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x); +static void zstdgpu_LdsAtomicMinU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x); +static void zstdgpu_LdsAtomicAndU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x); +static void zstdgpu_LdsAtomicOrU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x); #pragma dxc pop #else -#ifndef ZSTDGPU_START_GROUPSHARED 
-#define ZSTDGPU_START_GROUPSHARED() \ - uint32_t GS_Region = 0; - -#endif - -#ifndef ZSTDGPU_DECLARE_GROUPSHARED -#define ZSTDGPU_DECLARE_GROUPSHARED(name, size) \ - static const uint32_t GS_##name##_Size = size; \ - GS_Region += size; \ - uint32_t GS_##name[size]; \ - /*ASSERT(GS_Remain < &GS_Region[GS_Region_Size]);*/ -#endif - -static inline uint32_t zstdgpu_LdsLoadU32(const uint32_t *address) +static inline uint32_t zstdgpu_LdsLoadU32(zstdgpu_lds_const_uintptr_t address) { return *address; } -static inline void zstdgpu_LdsStoreU32(uint32_t *address, uint32_t x) +static inline void zstdgpu_LdsStoreU32(zstdgpu_lds_uintptr_t address, uint32_t x) { *address = x; } -static inline void zstdgpu_LdsAtomicAddU32(uint32_t *address, uint32_t x) +static inline void zstdgpu_LdsAtomicAddU32(zstdgpu_lds_uintptr_t address, uint32_t x) { *address += x; } -static inline void zstdgpu_LdsAtomicMaxU32(uint32_t *address, uint32_t x) +static inline void zstdgpu_LdsAtomicMaxU32(zstdgpu_lds_uintptr_t address, uint32_t x) { *address = zstdgpu_MaxU32(*address, x); } -static inline void zstdgpu_LdsAtomicMinU32(uint32_t *address, uint32_t x) +static inline void zstdgpu_LdsAtomicMinU32(zstdgpu_lds_uintptr_t address, uint32_t x) { *address = zstdgpu_MinU32(*address, x); } -static inline void zstdgpu_LdsAtomicAndU32(uint32_t *address, uint32_t x) +static inline void zstdgpu_LdsAtomicAndU32(zstdgpu_lds_uintptr_t address, uint32_t x) { *address &= x; } -static inline void zstdgpu_LdsAtomicOrU32(uint32_t *address, uint32_t x) +static inline void zstdgpu_LdsAtomicOrU32(zstdgpu_lds_uintptr_t address, uint32_t x) { *address |= x; } diff --git a/zstd/zstdgpu/zstdgpu_lds_decl_base.h b/zstd/zstdgpu/zstdgpu_lds_decl_base.h new file mode 100644 index 0000000..4a5f98a --- /dev/null +++ b/zstd/zstdgpu/zstdgpu_lds_decl_base.h @@ -0,0 +1,30 @@ +/** + * Copyright (c) Microsoft. All rights reserved. + * This code is licensed under the MIT License (MIT). 
+ * THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF + * ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY + * IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR + * PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. + * + * Advanced Technology Group (ATG) + * Author(s): Pavel Martishevsky (pamartis@microsoft.com) + * + * zstdgpu_lds_decl_base.h + * + * Defines ZSTDGPU_LDS_* macros that help to declare the base of LDS partition defined by a macrolist + * and the base for all subregions. + * Include this file before invoking a macrolist defining LDS partition, + * then include zstdgpu_lds_decl_undef.h afterwards to clean up. + */ + +#ifndef __hlsl_dx_compiler +#define ZSTDGPU_LDS_SIZE(size) uint32_t GS_Storage[kzstdgpu_##size##_LdsSize]; +#define ZSTDGPU_LDS_BASE(base) zstdgpu_lds_uintptr_t GS_Base = (0 != (base)) ? (base) : &GS_Storage[0]; +#else +#define ZSTDGPU_LDS_SIZE(size) +#define ZSTDGPU_LDS_BASE(base) zstdgpu_lds_uintptr_t GS_Base = (base); +#endif + +#define ZSTDGPU_LDS_REGION(name, size) zstdgpu_lds_uintptr_t GS_##name = GS_Base; \ + GS_Base += (size); + diff --git a/zstd/zstdgpu/zstdgpu_lds_decl_size.h b/zstd/zstdgpu/zstdgpu_lds_decl_size.h new file mode 100644 index 0000000..ec6ef2e --- /dev/null +++ b/zstd/zstdgpu/zstdgpu_lds_decl_size.h @@ -0,0 +1,21 @@ +/** + * Copyright (c) Microsoft. All rights reserved. + * This code is licensed under the MIT License (MIT). + * THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF + * ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY + * IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR + * PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. + * + * Advanced Technology Group (ATG) + * Author(s): Pavel Martishevsky (pamartis@microsoft.com) + * + * zstdgpu_lds_decl_size.h + * + * Defines ZSTDGPU_LDS_* macros that help to declare the total size of LDS partition defined by a macrolist + * Include this file before invoking a macrolist defining LDS partition, + * then include zstdgpu_lds_decl_undef.h afterwards to clean up. 
+ */ + +#define ZSTDGPU_LDS_SIZE(size) static const uint32_t kzstdgpu_##size##_LdsSize = +#define ZSTDGPU_LDS_BASE(base) (base) +#define ZSTDGPU_LDS_REGION(name, size) + (size) diff --git a/zstd/zstdgpu/zstdgpu_lds_decl_undef.h b/zstd/zstdgpu/zstdgpu_lds_decl_undef.h new file mode 100644 index 0000000..441ac96 --- /dev/null +++ b/zstd/zstdgpu/zstdgpu_lds_decl_undef.h @@ -0,0 +1,19 @@ +/** + * Copyright (c) Microsoft. All rights reserved. + * This code is licensed under the MIT License (MIT). + * THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF + * ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY + * IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR + * PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT. + * + * Advanced Technology Group (ATG) + * Author(s): Pavel Martishevsky (pamartis@microsoft.com) + * + * zstdgpu_lds_decl_undef.h + * + * Undefs the ZSTDGPU_LDS_* macros defined by zstdgpu_lds_decl_size.h or zstdgpu_lds_decl_base.h. + */ + +#undef ZSTDGPU_LDS_REGION +#undef ZSTDGPU_LDS_BASE +#undef ZSTDGPU_LDS_SIZE diff --git a/zstd/zstdgpu/zstdgpu_lds_hlsl.h b/zstd/zstdgpu/zstdgpu_lds_hlsl.h index 8a92b87..9011d85 100644 --- a/zstd/zstdgpu/zstdgpu_lds_hlsl.h +++ b/zstd/zstdgpu/zstdgpu_lds_hlsl.h @@ -38,37 +38,37 @@ # error ZSTDGPU_LDS must be defined as `#define ZSTDGPU_LDS YourLdsRegion` #endif -static uint32_t zstdgpu_LdsLoadU32(uint32_t offsetInUInt32) +static uint32_t zstdgpu_LdsLoadU32(zstdgpu_lds_const_uintptr_t offsetInUInt32) { return ZSTDGPU_LDS[offsetInUInt32]; } -static void zstdgpu_LdsStoreU32(uint32_t offsetInUInt32, uint32_t x) +static void zstdgpu_LdsStoreU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x) { ZSTDGPU_LDS[offsetInUInt32] = x; } -static void zstdgpu_LdsAtomicAddU32(uint32_t offsetInUInt32, uint32_t x) +static void zstdgpu_LdsAtomicAddU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x) { InterlockedAdd(ZSTDGPU_LDS[offsetInUInt32], x); } -static void zstdgpu_LdsAtomicMaxU32(uint32_t offsetInUInt32, uint32_t x) +static void 
zstdgpu_LdsAtomicMaxU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x) { InterlockedMax(ZSTDGPU_LDS[offsetInUInt32], x); } -static void zstdgpu_LdsAtomicMinU32(uint32_t offsetInUInt32, uint32_t x) +static void zstdgpu_LdsAtomicMinU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x) { InterlockedMin(ZSTDGPU_LDS[offsetInUInt32], x); } -static void zstdgpu_LdsAtomicAndU32(uint32_t offsetInUInt32, uint32_t x) +static void zstdgpu_LdsAtomicAndU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x) { InterlockedAnd(ZSTDGPU_LDS[offsetInUInt32], x); } -static void zstdgpu_LdsAtomicOrU32(uint32_t offsetInUInt32, uint32_t x) +static void zstdgpu_LdsAtomicOrU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x) { InterlockedOr(ZSTDGPU_LDS[offsetInUInt32], x); } diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h index ef759c5..0a2f2da 100644 --- a/zstd/zstdgpu/zstdgpu_shaders.h +++ b/zstd/zstdgpu/zstdgpu_shaders.h @@ -1592,14 +1592,53 @@ static void zstdgpu_ShaderEntry_ParseCompressedBlocks(ZSTDGPU_PARAM_INOUT(zstdgp srt.inoutCompressedBlocks[threadId] = outBlockData; } -static void zstdgpu_ShaderEntry_InitFseTable(ZSTDGPU_PARAM_INOUT(zstdgpu_InitFseTable_SRT) srt, uint32_t groupId, uint32_t i) -{ +// LDS partitioning macro list for FSE Table Initialisation (default shader) +#define ZSTDGPU_INIT_FSE_TABLE_LDS_DEFAULT(base, size) \ + ZSTDGPU_LDS_SIZE(size) \ + ZSTDGPU_LDS_BASE(base) \ + ZSTDGPU_LDS_REGION(CompactedPositiveFrqPrefixSumAndSymbols , kzstdgpu_MaxCount_FseProbs) \ + ZSTDGPU_LDS_REGION(SymbolBitMasks , kzstdgpu_MaxCount_FseElemsAllDigitBits) + +// LDS partitioning macro list for FSE Table Initialisation (experimental shader) +#define ZSTDGPU_INIT_FSE_TABLE_LDS_EXPERIMENTAL(base, size) \ + ZSTDGPU_LDS_SIZE(size) \ + ZSTDGPU_LDS_BASE(base) \ + ZSTDGPU_LDS_REGION(CompactedPositiveFrqPrefixSumAndSymbols , kzstdgpu_MaxCount_FseElems) \ + ZSTDGPU_LDS_REGION(SymbolShuffleScratch , kzstdgpu_MaxCount_FseElems) \ + ZSTDGPU_LDS_REGION(SymbolBitMasks , 
kzstdgpu_MaxCount_FseElemsOneDigitBits * 2) + +// LDS partitioning macro list tail for FSE Table Initialisation (when threadgroup contains multiple waves) +#define ZSTDGPU_INIT_FSE_TABLE_LDS_MULTI_WAVE() \ + ZSTDGPU_LDS_REGION(PerWaveDword0 , kzstdgpu_WaveCountMax_InitFseTable) \ + ZSTDGPU_LDS_REGION(PerWaveDword1 , kzstdgpu_WaveCountMax_InitFseTable) \ + ZSTDGPU_LDS_REGION(PerWaveDword2 , kzstdgpu_WaveCountMax_InitFseTable) \ + ZSTDGPU_LDS_REGION(PerGroupDword0 , 1) \ + ZSTDGPU_LDS_REGION(PerGroupDword1 , 1) \ + ZSTDGPU_LDS_REGION(PerGroupDword2 , 1) #ifndef IS_MULTI_WAVE #define IS_MULTI_WAVE 0 #define IS_MULTI_WAVE_UNDEF 1 #endif +#include "zstdgpu_lds_decl_size.h" +ZSTDGPU_INIT_FSE_TABLE_LDS_DEFAULT(0, InitFseTable_Default) +#if IS_MULTI_WAVE +ZSTDGPU_INIT_FSE_TABLE_LDS_MULTI_WAVE() +#endif +; + +ZSTDGPU_INIT_FSE_TABLE_LDS_EXPERIMENTAL(0, InitFseTable_Experimental) +#if IS_MULTI_WAVE +ZSTDGPU_INIT_FSE_TABLE_LDS_MULTI_WAVE() +#endif +; + +#include "zstdgpu_lds_decl_undef.h" + +static void zstdgpu_ShaderEntry_InitFseTable(ZSTDGPU_PARAM_INOUT(zstdgpu_InitFseTable_SRT) srt, uint32_t groupId, uint32_t i) +{ + #ifndef ZSTD_BITCNT_NSTATE_METHOD_REFERENCE #define ZSTD_BITCNT_NSTATE_METHOD_REFERENCE 0 #define ZSTD_BITCNT_NSTATE_METHOD_REFERENCE_UNDEF 1 @@ -1633,24 +1672,16 @@ static void zstdgpu_ShaderEntry_InitFseTable(ZSTDGPU_PARAM_INOUT(zstdgpu_InitFse const uint32_t waveIdx = WaveReadLaneFirst(i / laneCnt); //const uint32_t laneIdx = i % laneCnt; - ZSTDGPU_START_GROUPSHARED() - -#if (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_DEFAULT) || (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_REFERENCE) - ZSTDGPU_DECLARE_GROUPSHARED(CompactedPositiveFrqPrefixSumAndSymbols , kzstdgpu_MaxCount_FseProbs); //< WARN(pamartis): Wasteful, need only uint8_t but HLSL doesn't support it - ZSTDGPU_DECLARE_GROUPSHARED(SymbolBitMasks , kzstdgpu_MaxCount_FseElemsAllDigitBits); -#elif ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_EXPERIMENTAL - 
ZSTDGPU_DECLARE_GROUPSHARED(CompactedPositiveFrqPrefixSumAndSymbols , kzstdgpu_MaxCount_FseElems); - ZSTDGPU_DECLARE_GROUPSHARED(SymbolShuffleScratch , kzstdgpu_MaxCount_FseElems); - ZSTDGPU_DECLARE_GROUPSHARED(SymbolBitMasks , kzstdgpu_MaxCount_FseElemsOneDigitBits * 2); -#endif -#if IS_MULTI_WAVE - ZSTDGPU_DECLARE_GROUPSHARED(PerWaveDword0 , kzstdgpu_WaveCountMax_InitFseTable); - ZSTDGPU_DECLARE_GROUPSHARED(PerWaveDword1 , kzstdgpu_WaveCountMax_InitFseTable); - ZSTDGPU_DECLARE_GROUPSHARED(PerWaveDword2 , kzstdgpu_WaveCountMax_InitFseTable); - ZSTDGPU_DECLARE_GROUPSHARED(PerGroupDword0 , 1); - ZSTDGPU_DECLARE_GROUPSHARED(PerGroupDword1 , 1); - ZSTDGPU_DECLARE_GROUPSHARED(PerGroupDword2 , 1); -#endif + #include "zstdgpu_lds_decl_base.h" + #if (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_DEFAULT) || (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_REFERENCE) + ZSTDGPU_INIT_FSE_TABLE_LDS_DEFAULT(0, InitFseTable_Default) + #elif ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_EXPERIMENTAL + ZSTDGPU_INIT_FSE_TABLE_LDS_EXPERIMENTAL(0, InitFseTable_Experimental) + #endif + #if IS_MULTI_WAVE + ZSTDGPU_INIT_FSE_TABLE_LDS_MULTI_WAVE() + #endif + #include "zstdgpu_lds_decl_undef.h" const uint32_t tableStartIndex = srt.tableStartIndex + groupId; const uint32_t frqDataOffset = tableStartIndex * kzstdgpu_MaxCount_FseProbs; @@ -2450,6 +2481,22 @@ static void zstdgpu_ShaderEntry_DecodeHuffmanWeights(ZSTDGPU_PARAM_INOUT(zstdgpu } } +// LDS partitioning macro lists for Huffman Table pre-initialisation (sub-regions within PreInit). 
+#define ZSTDGPU_PRE_INIT_HUFFMAN_TABLE_LDS(base, size) \ + ZSTDGPU_LDS_SIZE(size) \ + ZSTDGPU_LDS_BASE(base) \ + ZSTDGPU_LDS_REGION(Bits , kzstdgpu_MaxCount_HuffmanWeights) \ + ZSTDGPU_LDS_REGION(BitsMask , kzstdgpu_MaxCount_HuffmanWeightsAllDigitBits) \ + ZSTDGPU_LDS_REGION(RankCount , kzstdgpu_MaxCount_HuffmanWeightRanks) \ + ZSTDGPU_LDS_REGION(RankCountPrefix , kzstdgpu_MaxCount_HuffmanWeightRanks) \ + ZSTDGPU_LDS_REGION(WeightSum , 1) \ + ZSTDGPU_LDS_REGION(BitsMax , 1) + + +#include "zstdgpu_lds_decl_size.h" +ZSTDGPU_PRE_INIT_HUFFMAN_TABLE_LDS(0, PreInitHuffmanTable); +#include "zstdgpu_lds_decl_undef.h" + static void zstdgpu_PreInitHuffmanTableToLds(ZSTDGPU_RO_TYPED_BUFFER(uint32_t, uint8_t) HuffmanWeights, ZSTDGPU_RO_TYPED_BUFFER(uint32_t, uint8_t) HuffmanWeightCount, uint32_t tableId, @@ -2480,12 +2527,9 @@ static void zstdgpu_PreInitHuffmanTableToLds(ZSTDGPU_RO_TYPED_BUFFER(uint32_t, u { const uint32_t weightCntMinusOne = HuffmanWeightCount[tableId]; - ZSTDGPU_DECLARE_GROUPSHARED(Bits , kzstdgpu_MaxCount_HuffmanWeights); //< WARN(pamartis): Wasteful, need only uint8_t but HLSL doesn't support it - ZSTDGPU_DECLARE_GROUPSHARED(BitsMask , kzstdgpu_MaxCount_HuffmanWeightsAllDigitBits); - ZSTDGPU_DECLARE_GROUPSHARED(RankCount , kzstdgpu_MaxCount_HuffmanWeightRanks); - ZSTDGPU_DECLARE_GROUPSHARED(RankCountPrefix , kzstdgpu_MaxCount_HuffmanWeightRanks); - ZSTDGPU_DECLARE_GROUPSHARED(WeightSum , 1); - ZSTDGPU_DECLARE_GROUPSHARED(BitsMax , 1); + #include "zstdgpu_lds_decl_base.h" + ZSTDGPU_PRE_INIT_HUFFMAN_TABLE_LDS(GS_Region, PreInitHuffmanTable) + #include "zstdgpu_lds_decl_undef.h" ZSTDGPU_FOR_WORK_ITEMS(i, 1, threadId, threadCnt) { @@ -2684,17 +2728,23 @@ static void zstdgpu_PreInitHuffmanTableToLds(ZSTDGPU_RO_TYPED_BUFFER(uint32_t, u outCodeTableSize = zstdgpu_LdsLoadU32(GS_RankCountPrefix + outBitsMax); } +// LDS partitioning macro lists for Huffman Table Initialisation (standalone shader) +#define ZSTDGPU_INIT_HUFFMAN_TABLE_LDS(base, size) \ + 
ZSTDGPU_LDS_SIZE(size) \ + ZSTDGPU_LDS_BASE(base) \ + ZSTDGPU_LDS_REGION(PreInit , kzstdgpu_PreInitHuffmanTable_LdsSize) \ + ZSTDGPU_LDS_REGION(RankIndex , kzstdgpu_MaxCount_HuffmanWeightRanks) \ + ZSTDGPU_LDS_REGION(CodeAndSymbol , 1) + +#include "zstdgpu_lds_decl_size.h" +ZSTDGPU_INIT_HUFFMAN_TABLE_LDS(0, InitHuffmanTable); +#include "zstdgpu_lds_decl_undef.h" + static void zstdgpu_ShaderEntry_InitHuffmanTable(ZSTDGPU_PARAM_INOUT(zstdgpu_InitHuffmanTable_SRT) srt, uint32_t groupId, uint32_t threadId, uint32_t tgSize) { - ZSTDGPU_START_GROUPSHARED() - ZSTDGPU_DECLARE_GROUPSHARED(PreInit , kzstdgpu_MaxCount_HuffmanWeights - + kzstdgpu_MaxCount_HuffmanWeightsAllDigitBits - + kzstdgpu_MaxCount_HuffmanWeightRanks - + kzstdgpu_MaxCount_HuffmanWeightRanks - + 2); - - ZSTDGPU_DECLARE_GROUPSHARED(RankIndex , kzstdgpu_MaxCount_HuffmanWeightRanks); - ZSTDGPU_DECLARE_GROUPSHARED(CodeAndSymbol , 1); + #include "zstdgpu_lds_decl_base.h" + ZSTDGPU_INIT_HUFFMAN_TABLE_LDS(0, InitHuffmanTable); + #include "zstdgpu_lds_decl_undef.h" uint32_t bitsMax = 0; uint32_t codeTableSize = 0; @@ -2785,6 +2835,16 @@ static void zstdgpu_ConvertThreadgroupIdToDecompressLiteralsInputs(ZSTDGPU_RO_BU } } +// LDS partitioning macro list for Literal Decompression (standalone shader) +#define ZSTDGPU_DECOMPRESS_LITERALS_LDS(base, size) \ + ZSTDGPU_LDS_SIZE(size) \ + ZSTDGPU_LDS_BASE(base) \ + ZSTDGPU_LDS_REGION(HuffmanTable, kzstdgpu_MaxCount_HuffmanTableExpandedUInts) + +#include "zstdgpu_lds_decl_size.h" +ZSTDGPU_DECOMPRESS_LITERALS_LDS(0, DecompressLiterals); +#include "zstdgpu_lds_decl_undef.h" + static void zstdgpu_ShaderEntry_DecompressLiterals(ZSTDGPU_PARAM_INOUT(zstdgpu_DecompressLiterals_SRT) srt, uint32_t groupId, uint32_t threadId, uint32_t tgSize) { uint32_t htIndex = 0; @@ -2804,8 +2864,9 @@ static void zstdgpu_ShaderEntry_DecompressLiterals(ZSTDGPU_PARAM_INOUT(zstdgpu_D ); - ZSTDGPU_START_GROUPSHARED() - ZSTDGPU_DECLARE_GROUPSHARED(HuffmanTable, 
kzstdgpu_MaxCount_HuffmanTableExpandedUInts); + #include "zstdgpu_lds_decl_base.h" + ZSTDGPU_DECOMPRESS_LITERALS_LDS(0, DecompressLiterals); + #include "zstdgpu_lds_decl_undef.h" const uint32_t htInfo = WaveReadLaneFirst(srt.inHuffmanTableInfo[htIndex]); const uint32_t bitsMax = htInfo >> 16; @@ -2841,6 +2902,19 @@ static void zstdgpu_ShaderEntry_DecompressLiterals(ZSTDGPU_PARAM_INOUT(zstdgpu_D ); } +// LDS partitioning macro lists for combined Huffman Table Initialisation + Literal Decompression +#define ZSTDGPU_INIT_HUFFMAN_TABLE_AND_DECOMPRESS_LITERALS_LDS(base, size) \ + ZSTDGPU_LDS_SIZE(size) \ + ZSTDGPU_LDS_BASE(base) \ + ZSTDGPU_LDS_REGION(CodeAndSymbol , kzstdgpu_MaxCount_HuffmanWeights) \ + ZSTDGPU_LDS_REGION(PreInit , kzstdgpu_PreInitHuffmanTable_LdsSize) \ + ZSTDGPU_LDS_REGION(RankIndex , kzstdgpu_MaxCount_HuffmanWeightRanks) \ + ZSTDGPU_LDS_REGION(HuffmanTable , kzstdgpu_MaxCount_HuffmanTableExpandedUInts) + +#include "zstdgpu_lds_decl_size.h" +ZSTDGPU_INIT_HUFFMAN_TABLE_AND_DECOMPRESS_LITERALS_LDS(0, InitHuffmanTableAndDecompressLiterals); +#include "zstdgpu_lds_decl_undef.h" + static void zstdgpu_ShaderEntry_InitHuffmanTable_And_DecompressLiterals(ZSTDGPU_PARAM_INOUT(zstdgpu_InitHuffmanTable_And_DecompressLiterals_SRT) srt, uint32_t groupId, uint32_t threadId) { uint32_t htIndex = 0; @@ -2862,16 +2936,9 @@ static void zstdgpu_ShaderEntry_InitHuffmanTable_And_DecompressLiterals(ZSTDGPU_ // // The start of the Huffman Table initialisation // - ZSTDGPU_START_GROUPSHARED() - ZSTDGPU_DECLARE_GROUPSHARED(CodeAndSymbol , kzstdgpu_MaxCount_HuffmanWeights); - ZSTDGPU_DECLARE_GROUPSHARED(PreInit , kzstdgpu_MaxCount_HuffmanWeights - + kzstdgpu_MaxCount_HuffmanWeightsAllDigitBits - + kzstdgpu_MaxCount_HuffmanWeightRanks - + kzstdgpu_MaxCount_HuffmanWeightRanks - + 2); - - ZSTDGPU_DECLARE_GROUPSHARED(RankIndex , kzstdgpu_MaxCount_HuffmanWeightRanks); - ZSTDGPU_DECLARE_GROUPSHARED(HuffmanTable , kzstdgpu_MaxCount_HuffmanTableExpandedUInts); + #include 
"zstdgpu_lds_decl_base.h" + ZSTDGPU_INIT_HUFFMAN_TABLE_AND_DECOMPRESS_LITERALS_LDS(0, InitHuffmanTableAndDecompressLiterals); + #include "zstdgpu_lds_decl_undef.h" ZSTDGPU_RW_BUFFER(uint32_t) dummyBuffer; #ifndef __hlsl_dx_compiler @@ -3407,6 +3474,18 @@ static void zstdgpu_ShaderEntry_DecompressSequences(ZSTDGPU_PARAM_INOUT(zstdgpu_ //ZSTDGPU_ASSERT(bitBuffer.hadlastrefill && bitBuffer.bitcnt == 0); } +// LDS partitioning macro lists for sequence decompression with in-LDS caching +#define ZSTDGPU_DECOMPRESS_SEQUENCES_LDS_FSE_CACHE_LDS(base, size) \ + ZSTDGPU_LDS_SIZE(size) \ + ZSTDGPU_LDS_BASE(base) \ + ZSTDGPU_LDS_REGION(FsePackedLLen , kzstdgpu_FseElemMaxCount_LLen) \ + ZSTDGPU_LDS_REGION(FsePackedMLen , kzstdgpu_FseElemMaxCount_MLen) \ + ZSTDGPU_LDS_REGION(FsePackedOffs , kzstdgpu_FseElemMaxCount_Offs) + +#include "zstdgpu_lds_decl_size.h" +ZSTDGPU_DECOMPRESS_SEQUENCES_LDS_FSE_CACHE_LDS(0, DecompressSequences_LdsFseCache); +#include "zstdgpu_lds_decl_undef.h" + static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_INOUT(zstdgpu_DecompressSequences_SRT) srt, uint32_t groupId, uint32_t threadId, uint32_t tgSize) { ZSTDGPU_UNUSED(threadId); @@ -3466,10 +3545,9 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN const uint32_t outputStart = dst.offs; const uint32_t outputEnd = outputStart + dst.size; - ZSTDGPU_START_GROUPSHARED() - ZSTDGPU_DECLARE_GROUPSHARED(FsePackedLLen, kzstdgpu_FseElemMaxCount_LLen); - ZSTDGPU_DECLARE_GROUPSHARED(FsePackedMLen, kzstdgpu_FseElemMaxCount_MLen); - ZSTDGPU_DECLARE_GROUPSHARED(FsePackedOffs, kzstdgpu_FseElemMaxCount_Offs); + #include "zstdgpu_lds_decl_base.h" + ZSTDGPU_DECOMPRESS_SEQUENCES_LDS_FSE_CACHE_LDS(0, DecompressSequences_LdsFseCache); + #include "zstdgpu_lds_decl_undef.h" #define ZSTDGPU_PRELOAD_FSE_INTO_LDS(name) \ if (seqRef.fse##name < kzstdgpu_FseProbTableIndex_MinRLE) \ @@ -3595,6 +3673,21 @@ static void 
zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN ZSTDGPU_ASSERT(bitBuffer.hadlastrefill && bitBuffer.bitcnt == 0); } +// LDS partitioning macro list for sequence decompression with in-LDS output caching +#define ZSTDGPU_DECOMPRESS_SEQUENCES_LDS_OUT_CACHE_LDS(base, size) \ + ZSTDGPU_LDS_SIZE(size) \ + ZSTDGPU_LDS_BASE(base) \ + ZSTDGPU_LDS_REGION(OutputCache, kzstdgpu_TgSizeX_DecompressSequences * (SEQ_CACHE_LEN + 1) * 3) + +#ifndef SEQ_CACHE_LEN +#define SEQ_CACHE_LEN 128 +#define SEQ_CACHE_LEN_UNDEF 1 +#endif + +#include "zstdgpu_lds_decl_size.h" +ZSTDGPU_DECOMPRESS_SEQUENCES_LDS_OUT_CACHE_LDS(0, DecompressSequences_LdsOutCache); +#include "zstdgpu_lds_decl_undef.h" + static void zstdgpu_ShaderEntry_DecompressSequences_LdsOutCache(ZSTDGPU_PARAM_INOUT(zstdgpu_DecompressSequences_SRT) srt, uint32_t groupId, uint32_t threadId) { const uint32_t globalSeqBase = groupId * kzstdgpu_TgSizeX_DecompressSequences; @@ -3624,12 +3717,11 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsOutCache(ZSTDGPU_PARAM_IN #ifndef SEQ_CACHE_LEN #define SEQ_CACHE_LEN 128 - #else - #error `SEQ_CACHE_LEN` must not be defined. #endif - ZSTDGPU_START_GROUPSHARED() - ZSTDGPU_DECLARE_GROUPSHARED(OutputCache, kzstdgpu_TgSizeX_DecompressSequences * (SEQ_CACHE_LEN + 1) * 3); + #include "zstdgpu_lds_decl_base.h" + ZSTDGPU_DECOMPRESS_SEQUENCES_LDS_OUT_CACHE_LDS(0, DecompressSequences_LdsOutCache); + #include "zstdgpu_lds_decl_undef.h" const uint32_t seqLdsLLenStart = 0; const uint32_t seqLdsMLenStart = kzstdgpu_TgSizeX_DecompressSequences * (SEQ_CACHE_LEN + 1); diff --git a/zstd/zstdgpu/zstdgpu_structs.h b/zstd/zstdgpu/zstdgpu_structs.h index 460640d..5809794 100644 --- a/zstd/zstdgpu/zstdgpu_structs.h +++ b/zstd/zstdgpu/zstdgpu_structs.h @@ -111,20 +111,23 @@ # endif #endif +// Opaque LDS address types: offset on HLSL, pointer on C++. 
+// Using these instead of raw uint32_t / uint32_t* prevents accidental type +// mismatches when storing intermediate LDS addresses in local variables. +#ifdef __hlsl_dx_compiler + typedef uint32_t zstdgpu_lds_uintptr_t; + typedef uint32_t zstdgpu_lds_const_uintptr_t; +#else + typedef uint32_t * zstdgpu_lds_uintptr_t; + typedef const uint32_t* zstdgpu_lds_const_uintptr_t; +#endif + #ifndef ZSTDGPU_PARAM_LDS_IN -# ifdef __hlsl_dx_compiler -# define ZSTDGPU_PARAM_LDS_IN(type) type -# else -# define ZSTDGPU_PARAM_LDS_IN(type) const type * -# endif +# define ZSTDGPU_PARAM_LDS_IN(type) zstdgpu_lds_const_uintptr_t #endif #ifndef ZSTDGPU_PARAM_LDS_INOUT -# ifdef __hlsl_dx_compiler -# define ZSTDGPU_PARAM_LDS_INOUT(type) type -# else -# define ZSTDGPU_PARAM_LDS_INOUT(type) type * -# endif +# define ZSTDGPU_PARAM_LDS_INOUT(type) zstdgpu_lds_uintptr_t #endif #ifndef ZSTDGPU_BRANCH