Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion zstd/zstdgpu/Shaders/ZstdGpuDecompressLiterals.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ ZSTDGPU_DECOMPRESS_LITERALS_SRT()
#include "../zstdgpu_srt_decl_undef.h"

// WARN(pamartis): Wasteful, need only uint8_t but HLSL doesn't support it
groupshared uint32_t GS_Lds[kzstdgpu_MaxCount_HuffmanTableExpandedUInts];
groupshared uint32_t GS_Lds[kzstdgpu_DecompressLiterals_LdsSize];
#define ZSTDGPU_LDS GS_Lds
#include "../zstdgpu_lds_hlsl.h"

Expand Down
21 changes: 12 additions & 9 deletions zstd/zstdgpu/Shaders/ZstdGpuDecompressSequences.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,6 @@
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*/

#include "../zstdgpu_shaders.h"

#include "../zstdgpu_srt_decl_bind.h"
ZSTDGPU_DECOMPRESS_SEQUENCES_SRT()
#include "../zstdgpu_srt_decl_undef.h"

#ifdef __XBOX_SCARLETT
#define __XBOX_ENABLE_WAVE32 1
#endif
Expand All @@ -31,15 +25,24 @@ ZSTDGPU_DECOMPRESS_SEQUENCES_SRT()
//#define USE_LDS_OUT_CACHE 1
#endif

#ifdef USE_LDS_OUT_CACHE
#define SEQ_CACHE_LEN 128
#endif

#include "../zstdgpu_shaders.h"

#include "../zstdgpu_srt_decl_bind.h"
ZSTDGPU_DECOMPRESS_SEQUENCES_SRT()
#include "../zstdgpu_srt_decl_undef.h"

#ifdef USE_LDS_FSE_CACHE
groupshared uint32_t Lds[kzstdgpu_FseElemMaxCount_LLen + kzstdgpu_FseElemMaxCount_MLen + kzstdgpu_FseElemMaxCount_Offs];
groupshared uint32_t Lds[kzstdgpu_DecompressSequences_LdsFseCache_LdsSize];
#define ZSTDGPU_LDS Lds
#include "../zstdgpu_lds_hlsl.h"
#endif

#ifdef USE_LDS_OUT_CACHE
#define SEQ_CACHE_LEN 128
groupshared uint32_t Lds[kzstdgpu_TgSizeX_DecompressSequences * (SEQ_CACHE_LEN + 1) * 3];
groupshared uint32_t Lds[kzstdgpu_DecompressSequences_LdsOutCache_LdsSize];
#define ZSTDGPU_LDS Lds
#include "../zstdgpu_lds_hlsl.h"
#endif
Expand Down
19 changes: 5 additions & 14 deletions zstd/zstdgpu/Shaders/ZstdGpuInitFseTable.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -53,20 +53,11 @@ ConstantBuffer<Consts> Constants : register(b0);
ZSTDGPU_INIT_FSE_TABLE_SRT()
#include "../zstdgpu_srt_decl_undef.h"

groupshared uint32_t Lds[
0 +
#if (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_DEFAULT) || (ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_REFERENCE)
+ kzstdgpu_MaxCount_FseProbs
+ kzstdgpu_MaxCount_FseElemsAllDigitBits
#elif ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_EXPERIMENTAL
+ kzstdgpu_MaxCount_FseElems * 2
+ kzstdgpu_MaxCount_FseElemsOneDigitBits * 2 // kzstdgpu_MaxCount_FseElemsOneDigitBits - masks, kzstdgpu_MaxCount_FseElemsOneDigitBits - ones prefix
#endif

#if IS_MULTI_WAVE
+ kzstdgpu_WaveCountMax_InitFseTable * 3 + 3
#endif
];
// Declares the groupshared array `Lds`, sized by the constant that matches the selected
// ZSTD_BITCNT_NSTATE_METHOD.
// NOTE(review): no branch declares `Lds` for any other ZSTD_BITCNT_NSTATE_METHOD value -- confirm
// that no other value is expected to reach this point.
#if ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_DEFAULT
groupshared uint32_t Lds[kzstdgpu_InitFseTable_Default_LdsSize];
#elif ZSTD_BITCNT_NSTATE_METHOD == ZSTD_BITCNT_NSTATE_METHOD_EXPERIMENTAL
groupshared uint32_t Lds[kzstdgpu_InitFseTable_Experimental_LdsSize];
#endif

#define ZSTDGPU_LDS Lds
#include "../zstdgpu_lds_hlsl.h"
Expand Down
2 changes: 1 addition & 1 deletion zstd/zstdgpu/Shaders/ZstdGpuInitHuffmanTable.hlsl
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ ZSTDGPU_INIT_HUFFMAN_TABLE_SRT()
#include "../zstdgpu_srt_decl_undef.h"

// WARN(pamartis): Wasteful, need only uint8_t but HLSL doesn't support it
groupshared uint32_t GS_Lds[kzstdgpu_MaxCount_HuffmanWeights + kzstdgpu_MaxCount_HuffmanWeightsAllDigitBits + kzstdgpu_MaxCount_HuffmanWeightRanks * 3 + 2];
groupshared uint32_t GS_Lds[kzstdgpu_InitHuffmanTable_LdsSize];
#define ZSTDGPU_LDS GS_Lds
#include "../zstdgpu_lds_hlsl.h"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ ZSTDGPU_INIT_HUFFMAN_TABLE_AND_DECOMPRESS_LITERALS_SRT()
#include "../zstdgpu_srt_decl_undef.h"

// WARN(pamartis): Wasteful, need only uint8_t but HLSL doesn't support it
groupshared uint32_t GS_Lds[kzstdgpu_MaxCount_HuffmanWeights * 2 + kzstdgpu_MaxCount_HuffmanWeightsAllDigitBits + kzstdgpu_MaxCount_HuffmanWeightRanks * 3 + 2 + kzstdgpu_MaxCount_HuffmanTableExpandedUInts];
groupshared uint32_t GS_Lds[kzstdgpu_InitHuffmanTableAndDecompressLiterals_LdsSize];
#define ZSTDGPU_LDS GS_Lds
#include "../zstdgpu_lds_hlsl.h"

Expand Down
54 changes: 14 additions & 40 deletions zstd/zstdgpu/zstdgpu_lds.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,79 +22,53 @@

#ifdef __hlsl_dx_compiler

// HLSL branch: expands to the declaration of a uint32_t named GS_Region initialised to 0.
// Only defined when the includer has not already supplied ZSTDGPU_START_GROUPSHARED.
#ifndef ZSTDGPU_START_GROUPSHARED
#define ZSTDGPU_START_GROUPSHARED() \
uint32_t GS_Region = 0;
#endif

// HLSL branch: for a region `name` of `size`, expands to three statements:
//   - GS_<name>_Size : constant holding `size`
//   - GS_<name>      : constant holding the value GS_Region had before this expansion
//   - GS_Region is then advanced by GS_<name>_Size
// Requires GS_Region to be in scope at the expansion site.
// Only defined when the includer has not already supplied ZSTDGPU_DECLARE_GROUPSHARED.
#ifndef ZSTDGPU_DECLARE_GROUPSHARED
#define ZSTDGPU_DECLARE_GROUPSHARED(name, size) \
static const uint32_t GS_##name##_Size = size; \
const uint32_t GS_##name = GS_Region; \
GS_Region += GS_##name##_Size;
#endif

// For HLSL this file contains only definitions that don't contain references to the LDS because
// HLSL doesn't support passing `groupshared` variables into functions

// Forward declarations of the LDS load/store/atomic helpers. Nothing is defined here;
// the "-Wundefined-internal" diagnostic is ignored between the push/pop pair that
// surrounds the declarations.
// NOTE(review): matching definitions appear in zstdgpu_lds_hlsl.h, which reads ZSTDGPU_LDS --
// presumably that header is included after the groupshared array is declared; confirm at call sites.
#pragma dxc push
#pragma dxc diagnostic ignored "-Wundefined-internal"
static uint32_t zstdgpu_LdsLoadU32(uint32_t offsetInUInt32);
static void zstdgpu_LdsStoreU32(uint32_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicAddU32(uint32_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicMaxU32(uint32_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicMinU32(uint32_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicAndU32(uint32_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicOrU32(uint32_t offsetInUInt32, uint32_t x);
static uint32_t zstdgpu_LdsLoadU32(zstdgpu_lds_const_uintptr_t offsetInUInt32);
static void zstdgpu_LdsStoreU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicAddU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicMaxU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicMinU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicAndU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x);
static void zstdgpu_LdsAtomicOrU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x);
#pragma dxc pop

#else

// Non-HLSL branch: expands to the declaration of a uint32_t named GS_Region initialised to 0.
// Only defined when the includer has not already supplied ZSTDGPU_START_GROUPSHARED.
#ifndef ZSTDGPU_START_GROUPSHARED
#define ZSTDGPU_START_GROUPSHARED() \
uint32_t GS_Region = 0;

#endif

// Non-HLSL branch: for a region `name` of `size`, expands to:
//   - GS_<name>_Size : constant holding `size`
//   - GS_Region advanced by `size`
//   - GS_<name>      : a uint32_t array of `size` elements declared at the expansion site
// Requires GS_Region to be in scope at the expansion site.
// NOTE(review): the commented-out ASSERT inside the macro mentions GS_Remain, which none of the
// macros visible here declare -- it looks stale; confirm before re-enabling it.
#ifndef ZSTDGPU_DECLARE_GROUPSHARED
#define ZSTDGPU_DECLARE_GROUPSHARED(name, size) \
static const uint32_t GS_##name##_Size = size; \
GS_Region += size; \
uint32_t GS_##name[size]; \
/*ASSERT(GS_Remain < &GS_Region[GS_Region_Size]);*/
#endif

// Non-HLSL branch: returns the 32-bit value that `address` points to.
static inline uint32_t zstdgpu_LdsLoadU32(const uint32_t *address)
static inline uint32_t zstdgpu_LdsLoadU32(zstdgpu_lds_const_uintptr_t address)
{
return *address;
}

// Non-HLSL branch: writes `x` to the location that `address` points to.
static inline void zstdgpu_LdsStoreU32(uint32_t *address, uint32_t x)
static inline void zstdgpu_LdsStoreU32(zstdgpu_lds_uintptr_t address, uint32_t x)
{
*address = x;
}

// Non-HLSL branch: adds `x` to the value that `address` points to.
// Despite the name, this is an ordinary `+=` with no explicit atomic primitive.
static inline void zstdgpu_LdsAtomicAddU32(uint32_t *address, uint32_t x)
static inline void zstdgpu_LdsAtomicAddU32(zstdgpu_lds_uintptr_t address, uint32_t x)
{
*address += x;
}

// Non-HLSL branch: replaces the value at `address` with zstdgpu_MaxU32(*address, x)
// (presumably the larger of the two -- zstdgpu_MaxU32 is defined elsewhere).
// Despite the name, this is an ordinary read-then-write with no explicit atomic primitive.
static inline void zstdgpu_LdsAtomicMaxU32(uint32_t *address, uint32_t x)
static inline void zstdgpu_LdsAtomicMaxU32(zstdgpu_lds_uintptr_t address, uint32_t x)
{
*address = zstdgpu_MaxU32(*address, x);
}

// Non-HLSL branch: replaces the value at `address` with zstdgpu_MinU32(*address, x)
// (presumably the smaller of the two -- zstdgpu_MinU32 is defined elsewhere).
// Despite the name, this is an ordinary read-then-write with no explicit atomic primitive.
static inline void zstdgpu_LdsAtomicMinU32(uint32_t *address, uint32_t x)
static inline void zstdgpu_LdsAtomicMinU32(zstdgpu_lds_uintptr_t address, uint32_t x)
{
*address = zstdgpu_MinU32(*address, x);
}

// Non-HLSL branch: bitwise-ANDs `x` into the value that `address` points to.
// Despite the name, this is an ordinary `&=` with no explicit atomic primitive.
static inline void zstdgpu_LdsAtomicAndU32(uint32_t *address, uint32_t x)
static inline void zstdgpu_LdsAtomicAndU32(zstdgpu_lds_uintptr_t address, uint32_t x)
{
*address &= x;
}

// Non-HLSL branch: bitwise-ORs `x` into the value that `address` points to.
// Despite the name, this is an ordinary `|=` with no explicit atomic primitive.
static inline void zstdgpu_LdsAtomicOrU32(uint32_t *address, uint32_t x)
static inline void zstdgpu_LdsAtomicOrU32(zstdgpu_lds_uintptr_t address, uint32_t x)
{
*address |= x;
}
Expand Down
30 changes: 30 additions & 0 deletions zstd/zstdgpu/zstdgpu_lds_decl_base.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/**
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*
* zstdgpu_lds_decl_base.h
*
* Defines the ZSTDGPU_LDS_* macros used to declare the base of an LDS partition described by a macro list,
* as well as the base of each of its subregions.
* Include this file before invoking the macro list that defines the LDS partition,
* then include zstdgpu_lds_decl_undef.h afterwards to clean up.
*/

// "Base" flavour of the ZSTDGPU_LDS_* macros: expanding SIZE, BASE and then one REGION per
// subregion leaves a GS_<name> variable for every subregion plus a running GS_Base cursor.
#ifdef __hlsl_dx_compiler

// HLSL: SIZE declares nothing; the cursor GS_Base starts at the supplied value.
#define ZSTDGPU_LDS_SIZE(partition)
#define ZSTDGPU_LDS_BASE(start) zstdgpu_lds_uintptr_t GS_Base = (start);

#else

// Non-HLSL: SIZE declares a local backing array GS_Storage with kzstdgpu_<partition>_LdsSize
// elements; BASE uses the supplied value when it is non-zero and falls back to the first
// element of GS_Storage otherwise.
#define ZSTDGPU_LDS_SIZE(partition) uint32_t GS_Storage[kzstdgpu_##partition##_LdsSize];
#define ZSTDGPU_LDS_BASE(start) zstdgpu_lds_uintptr_t GS_Base = ((start) != 0) ? (start) : &GS_Storage[0];

#endif

// Records the current cursor as GS_<name>, then advances the cursor by `length`.
// Expands to two statements, so it must be used where declarations are allowed.
#define ZSTDGPU_LDS_REGION(name, length) \
    zstdgpu_lds_uintptr_t GS_##name = GS_Base; \
    GS_Base += (length);

21 changes: 21 additions & 0 deletions zstd/zstdgpu/zstdgpu_lds_decl_size.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*
* zstdgpu_lds_decl_size.h
*
* Defines the ZSTDGPU_LDS_* macros used to declare the total size of an LDS partition described by a macro list.
* Include this file before invoking the macro list that defines the LDS partition,
* then include zstdgpu_lds_decl_undef.h afterwards to clean up.
*/

// "Size" flavour of the ZSTDGPU_LDS_* macros: SIZE, BASE and the REGION entries together
// expand to one initialiser of the form
//     static const uint32_t kzstdgpu_<size>_LdsSize = (base) + (size0) + (size1) ...
// None of these macros emits the terminating ';'.
// NOTE(review): `base` becomes the first term of the sum, so it is presumably 0 when sizes
// are computed -- confirm against the macro lists.

// Each subregion contributes one "+ (size)" term.
#define ZSTDGPU_LDS_REGION(name, size) + (size)
// The base contributes the leading term of the sum.
#define ZSTDGPU_LDS_BASE(base) (base)
// Opens the constant's declaration; the sum follows.
#define ZSTDGPU_LDS_SIZE(size) static const uint32_t kzstdgpu_##size##_LdsSize =
19 changes: 19 additions & 0 deletions zstd/zstdgpu/zstdgpu_lds_decl_undef.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/**
* Copyright (c) Microsoft. All rights reserved.
* This code is licensed under the MIT License (MIT).
* THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
* ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
* IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
* PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
*
* Advanced Technology Group (ATG)
* Author(s): Pavel Martishevsky (pamartis@microsoft.com)
*
* zstdgpu_lds_decl_undef.h
*
* Undefs the ZSTDGPU_LDS_* macros defined by zstdgpu_lds_decl_size.h or zstdgpu_lds_decl_base.h.
*/

// Drop all three ZSTDGPU_LDS_* helper macros so that a different flavour can be defined next.
#undef ZSTDGPU_LDS_SIZE
#undef ZSTDGPU_LDS_BASE
#undef ZSTDGPU_LDS_REGION
14 changes: 7 additions & 7 deletions zstd/zstdgpu/zstdgpu_lds_hlsl.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,37 +38,37 @@
# error ZSTDGPU_LDS must be defined as `#define ZSTDGPU_LDS YourLdsRegion`
#endif

// Returns element `offsetInUInt32` of the array that ZSTDGPU_LDS names.
static uint32_t zstdgpu_LdsLoadU32(uint32_t offsetInUInt32)
static uint32_t zstdgpu_LdsLoadU32(zstdgpu_lds_const_uintptr_t offsetInUInt32)
{
return ZSTDGPU_LDS[offsetInUInt32];
}

// Writes `x` to element `offsetInUInt32` of the array that ZSTDGPU_LDS names.
static void zstdgpu_LdsStoreU32(uint32_t offsetInUInt32, uint32_t x)
static void zstdgpu_LdsStoreU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x)
{
ZSTDGPU_LDS[offsetInUInt32] = x;
}

// Applies InterlockedAdd with `x` to element `offsetInUInt32` of the array that ZSTDGPU_LDS names.
// The previous value is not returned.
static void zstdgpu_LdsAtomicAddU32(uint32_t offsetInUInt32, uint32_t x)
static void zstdgpu_LdsAtomicAddU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x)
{
InterlockedAdd(ZSTDGPU_LDS[offsetInUInt32], x);
}

// Applies InterlockedMax with `x` to element `offsetInUInt32` of the array that ZSTDGPU_LDS names.
// The previous value is not returned.
static void zstdgpu_LdsAtomicMaxU32(uint32_t offsetInUInt32, uint32_t x)
static void zstdgpu_LdsAtomicMaxU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x)
{
InterlockedMax(ZSTDGPU_LDS[offsetInUInt32], x);
}

// Applies InterlockedMin with `x` to element `offsetInUInt32` of the array that ZSTDGPU_LDS names.
// The previous value is not returned.
static void zstdgpu_LdsAtomicMinU32(uint32_t offsetInUInt32, uint32_t x)
static void zstdgpu_LdsAtomicMinU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x)
{
InterlockedMin(ZSTDGPU_LDS[offsetInUInt32], x);
}

// Applies InterlockedAnd with `x` to element `offsetInUInt32` of the array that ZSTDGPU_LDS names.
// The previous value is not returned.
static void zstdgpu_LdsAtomicAndU32(uint32_t offsetInUInt32, uint32_t x)
static void zstdgpu_LdsAtomicAndU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x)
{
InterlockedAnd(ZSTDGPU_LDS[offsetInUInt32], x);
}

// Applies InterlockedOr with `x` to element `offsetInUInt32` of the array that ZSTDGPU_LDS names.
// The previous value is not returned.
static void zstdgpu_LdsAtomicOrU32(uint32_t offsetInUInt32, uint32_t x)
static void zstdgpu_LdsAtomicOrU32(zstdgpu_lds_uintptr_t offsetInUInt32, uint32_t x)
{
InterlockedOr(ZSTDGPU_LDS[offsetInUInt32], x);
}
Expand Down
Loading