
Fused Softmax::P3::CUDA Kernel

After unsuccessful attempts to optimize the Triton kernel, let's see what we can do with the CUDA version.

As with Triton, I highly suggest you get your hands dirty with kernel writing. It's OK to ask for help from ChatGPT or your preferred AI.

As you can see in the following benchmark, my naive CUDA version is even worse than my naive Torch version.

[Figure: softmax-p3-naive benchmark]

If you look at my code you’ll know why.

softmax_kernel.cu
__global__ void softmax_kernel(const float *x, float *out, int input_row_stride, int output_row_stride,
                               int n_rows, int n_cols)
{
    int row = blockIdx.x;
    if (row >= n_rows)
        return;
    const float *row_in = x + row * input_row_stride;
    float *row_out = out + row * output_row_stride;
    // Step 1: find max value for numerical stability
    float max_val = -FLT_MAX;
    for (int j = 0; j < n_cols; ++j)
        if (row_in[j] > max_val)
            max_val = row_in[j];
    // Step 2: compute sum of exp(x - max)
    float sum_exp = 0.0f;
    for (int j = 0; j < n_cols; ++j)
        sum_exp += expf(row_in[j] - max_val);
    // Step 3: normalize
    for (int j = 0; j < n_cols; ++j)
        row_out[j] = expf(row_in[j] - max_val) / sum_exp;
}

If you didn’t catch it, look at the following wrapper code.

softmax.cpp
at::Tensor softmax_cuda(at::Tensor x)
{
    // Same as Triton, pre-allocate the output memory
    // torch::zeros_like(x) is the alternative, but slower
    auto out = torch::empty_like(x);
    int64_t M = x.size(0), N = x.size(1);
    int threads = 1;
    int64_t block_count = M;
    int blocks = static_cast<int>(block_count);
    // data_ptr<T>() extracts the actual memory address;
    // CUDA kernels need raw memory pointers
    const float *x_ptr = x.data_ptr<float>();
    float *out_ptr = out.data_ptr<float>();
    cudaStream_t stream = c10::cuda::getCurrentCUDAStream();
    launch_softmax_kernel(x_ptr, out_ptr, x.stride(0), out.stride(0), M, N, blocks, threads, stream);
    C10_CUDA_CHECK(cudaGetLastError());
    return out;
}

We're using just one thread per block and assigning each row to a single block. Basically the GPU is sitting mostly idle, begging to be worked for its money's worth.
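
The launcher itself isn't shown in the post. For completeness, here is a minimal sketch of what launch_softmax_kernel might look like, assuming it simply forwards the launch configuration inferred from the call above (the body is an illustration, not the repo's actual code):

// Hypothetical sketch of the launcher -- not the actual code from the repo.
// It just forwards the <<<blocks, threads>>> configuration to the kernel.
void launch_softmax_kernel(const float *x, float *out,
                           int64_t in_stride, int64_t out_stride,
                           int64_t n_rows, int64_t n_cols,
                           int blocks, int threads, cudaStream_t stream)
{
    softmax_kernel<<<blocks, threads, 0, stream>>>(
        x, out,
        static_cast<int>(in_stride), static_cast<int>(out_stride),
        static_cast<int>(n_rows), static_cast<int>(n_cols));
}

With blocks = M and threads = 1, each row gets an entire SM-worth of scheduling slots but only a single active thread, which is exactly why the numbers look so bad.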

multi-threaded & shared memory

Now let's go multi-threaded. We will still have one block per row, just multiple threads per block, which means several threads now cooperate on each row and we need sync logic after each major step.

We define BLOCK_SIZE (which must equal the threads launch parameter); otherwise we would get the following error:

error: a variable length array cannot have static storage duration
    float shared_sum_exp[blockDim.x];
softmax_kernel.cu
__global__ void softmax_kernel_step1(const float *x, float *out, int input_row_stride,
                                     int output_row_stride, int n_rows, int n_cols)
{
#define BLOCK_SIZE 256
    int row = blockIdx.x;
    if (row >= n_rows)
        return;
    const float *row_in = x + row * input_row_stride;
    float *row_out = out + row * output_row_stride;
    // step 1: each thread finds its local max
    float local_max = -FLT_MAX;
    for (int j = threadIdx.x; j < n_cols; j += blockDim.x)
        local_max = fmaxf(local_max, row_in[j]);
    // only one thread writes the final max
    __shared__ float max_val;
    __shared__ float shared_max[BLOCK_SIZE];
    shared_max[threadIdx.x] = local_max;
    __syncthreads(); // make sure all writes are done
    if (threadIdx.x == 0)
    {
        float global_max = -FLT_MAX;
        for (int t = 0; t < blockDim.x; ++t)
            global_max = fmaxf(global_max, shared_max[t]);
        max_val = global_max;
    }
    __syncthreads();
    // step 2: each thread accumulates its local sum of exp(x - max)
    float local_sum = 0.f;
    for (int j = threadIdx.x; j < n_cols; j += blockDim.x)
        local_sum += expf(row_in[j] - max_val);
    __shared__ float sum_exp;
    __shared__ float shared_sum_exp[BLOCK_SIZE];
    shared_sum_exp[threadIdx.x] = local_sum;
    __syncthreads();
    if (threadIdx.x == 0)
    {
        float total = 0.f;
        for (int t = 0; t < blockDim.x; ++t)
            total += shared_sum_exp[t];
        sum_exp = total;
    }
    __syncthreads();
    // step 3: normalize
    for (int j = threadIdx.x; j < n_cols; j += blockDim.x)
        row_out[j] = expf(row_in[j] - max_val) / sum_exp;
}
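
One caveat with the statically sized shared arrays: the kernel only works if it is launched with exactly BLOCK_SIZE threads per block, otherwise shared_max[threadIdx.x] and shared_sum_exp[threadIdx.x] write out of bounds. A minimal launch sketch under that assumption (the launcher name here is illustrative, not the repo's actual code):

// Sketch: threads per block must match BLOCK_SIZE (256) in the kernel above.
void launch_softmax_kernel_step1(const float *x, float *out,
                                 int in_stride, int out_stride,
                                 int n_rows, int n_cols, cudaStream_t stream)
{
    const int threads = 256;   // must equal BLOCK_SIZE
    const int blocks = n_rows; // still one block per row
    softmax_kernel_step1<<<blocks, threads, 0, stream>>>(
        x, out, in_stride, out_stride, n_rows, n_cols);
}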

Now look at the results:

[Figure: softmax-p3-step1 benchmark]

Haha, at least it's better than the naive version. But this uses fixed-size shared arrays; with dynamically sized shared memory we can drop the compile-time BLOCK_SIZE.

softmax_kernel.cu
__global__ void softmax_kernel_step2(const float *x, float *out, int input_row_stride,
                                     int output_row_stride, int n_rows, int n_cols)
{
    int row = blockIdx.x;
    if (row >= n_rows)
        return;
    extern __shared__ float shared[];
    const float *row_in = x + row * input_row_stride;
    float *row_out = out + row * output_row_stride;
    // step 1: each thread finds its local max
    float local_max = -FLT_MAX;
    for (int j = threadIdx.x; j < n_cols; j += blockDim.x)
        local_max = fmaxf(local_max, row_in[j]);
    shared[threadIdx.x] = local_max;
    __syncthreads(); // make sure all writes are done
    // --- Parallel reduction for max (assumes blockDim.x is a power of two)
    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        if (threadIdx.x < offset)
            shared[threadIdx.x] = fmaxf(shared[threadIdx.x], shared[threadIdx.x + offset]);
        __syncthreads();
    }
    float max_val = shared[0];
    __syncthreads();
    // step 2: each thread accumulates its local sum of exp(x - max)
    float local_sum = 0.f;
    for (int j = threadIdx.x; j < n_cols; j += blockDim.x)
        local_sum += expf(row_in[j] - max_val);
    shared[threadIdx.x] = local_sum; // reuse same buffer
    __syncthreads();
    // --- Parallel reduction for sum
    for (int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        if (threadIdx.x < offset)
            shared[threadIdx.x] += shared[threadIdx.x + offset];
        __syncthreads();
    }
    float sum_exp = shared[0];
    __syncthreads();
    // step 3: normalize
    for (int j = threadIdx.x; j < n_cols; j += blockDim.x)
        row_out[j] = expf(row_in[j] - max_val) / sum_exp;
}
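
What the kernel body doesn't show is that the size of the extern __shared__ buffer has to be supplied as the third launch parameter. Here is a sketch of how that launch might look (launcher name and signature are assumptions, not the post's actual code):

// Sketch: one float of dynamic shared memory per thread, sized at launch time.
void launch_softmax_kernel_step2(const float *x, float *out,
                                 int in_stride, int out_stride,
                                 int n_rows, int n_cols,
                                 int blocks, int threads, cudaStream_t stream)
{
    size_t shmem_bytes = threads * sizeof(float); // reused for both reductions
    softmax_kernel_step2<<<blocks, threads, shmem_bytes, stream>>>(
        x, out, in_stride, out_stride, n_rows, n_cols);
}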

I did not observe any meaningful perf gains here, because the solution is essentially the same as the fixed-size shared arrays; the dynamic allocation just removes the compile-time BLOCK_SIZE constraint.

memory coalescing & vectorized float4 loads

You probably remember vectorized float4 loads from the Add kernel. They gave us a good perf boost there, so let's see how they work here.

[Figure: softmax-p3-step2 benchmark, threads=256]

The above diagram is what we get for threads=256, and the following is what we get for threads=1024.

[Figure: softmax-p3-step2 benchmark, threads=1024]

We can marry the best of both worlds with this condition:

int threads = 512; // or 256
if (N > 14080)
    threads = 1024;

which gives us:

[Figure: softmax-p3-mix benchmark]

Voilà, we are not in a bad position at all. We have surpassed the naive version, and the Triton kernel for N > 14080.

Here is the code for vectorized memory access:

softmax_kernel.cu
__global__ void softmax_kernel_step3_vec4(
    const float *__restrict__ x,
    float *__restrict__ out,
    int in_stride, int out_stride,
    int n_rows, int n_cols)
{
    int row = blockIdx.x;
    if (row >= n_rows)
        return;
    extern __shared__ float shared[]; // reuse the same buffer
    const float *__restrict__ row_in = x + row * in_stride;
    float *__restrict__ row_out = out + row * out_stride;
    // pass 1, local max, vectorized
    float local_max = -FLT_MAX;
    int n_vec = n_cols >> 2; // n_cols / 4
    for (int j4 = threadIdx.x; j4 < n_vec; j4 += blockDim.x)
    {
        float4 v = reinterpret_cast<const float4 *>(row_in)[j4];
        local_max = fmaxf(local_max, v.x);
        local_max = fmaxf(local_max, v.y);
        local_max = fmaxf(local_max, v.z);
        local_max = fmaxf(local_max, v.w);
    }
    // tail, if any, handle with first few threads
    int tail_start = n_vec << 2;
    for (int j = tail_start + threadIdx.x; j < n_cols; j += blockDim.x)
    {
        local_max = fmaxf(local_max, row_in[j]);
    }
    shared[threadIdx.x] = local_max;
    __syncthreads();
    // parallel reduction for max
    for (int off = blockDim.x >> 1; off > 0; off >>= 1)
    {
        if (threadIdx.x < off)
            shared[threadIdx.x] = fmaxf(shared[threadIdx.x], shared[threadIdx.x + off]);
        __syncthreads();
    }
    float max_val = shared[0];
    __syncthreads();
    // pass 2, write exp(x - max) to out, accumulate sum, vectorized
    float local_sum = 0.f;
    for (int j4 = threadIdx.x; j4 < n_vec; j4 += blockDim.x)
    {
        float4 v = reinterpret_cast<const float4 *>(row_in)[j4];
        v.x = expf(v.x - max_val);
        v.y = expf(v.y - max_val);
        v.z = expf(v.z - max_val);
        v.w = expf(v.w - max_val);
        local_sum += v.x + v.y + v.z + v.w;
        reinterpret_cast<float4 *>(row_out)[j4] = v; // store unnormalized exp
    }
    for (int j = tail_start + threadIdx.x; j < n_cols; j += blockDim.x)
    {
        float e = expf(row_in[j] - max_val);
        local_sum += e;
        row_out[j] = e;
    }
    shared[threadIdx.x] = local_sum;
    __syncthreads();
    // parallel reduction for sum
    for (int off = blockDim.x >> 1; off > 0; off >>= 1)
    {
        if (threadIdx.x < off)
            shared[threadIdx.x] += shared[threadIdx.x + off];
        __syncthreads();
    }
    float sum_exp = shared[0];
    __syncthreads();
    // pass 3, normalize, vectorized
    for (int j4 = threadIdx.x; j4 < n_vec; j4 += blockDim.x)
    {
        float4 v = reinterpret_cast<const float4 *>(row_out)[j4];
        v.x /= sum_exp;
        v.y /= sum_exp;
        v.z /= sum_exp;
        v.w /= sum_exp;
        reinterpret_cast<float4 *>(row_out)[j4] = v;
    }
    for (int j = tail_start + threadIdx.x; j < n_cols; j += blockDim.x)
    {
        row_out[j] /= sum_exp;
    }
}
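
One subtlety with the reinterpret_cast to float4: it assumes every row starts on a 16-byte boundary, which holds when the base pointer is 16-byte aligned and the row stride is a multiple of 4 floats. A hedged host-side helper one could use to guard the dispatch (illustrative, not from the post):

#include <cstdint>

// Illustrative helper: returns true only if rows addressed as
// ptr + row * row_stride are 16-byte aligned, i.e. safe for float4 access.
static bool rows_are_float4_aligned(const float *ptr, int64_t row_stride)
{
    return (reinterpret_cast<std::uintptr_t>(ptr) % 16 == 0) &&
           (row_stride % 4 == 0);
}

In the wrapper, one would check both the input and output pointers with something like this before dispatching to softmax_kernel_step3_vec4, and fall back to the scalar kernel otherwise. For contiguous row-major tensors from torch::empty_like this condition is normally satisfied.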

warp-optimized reduction

I got this from ChatGPT and included it in the code, but didn't see much benefit. I include the result here FYI.

[Figure: softmax-p3-step4-warp benchmark]
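
The kernel itself isn't reproduced here, but the core trick is to reduce within each warp through shuffle intrinsics instead of shared memory. A typical sketch of such warp reductions (illustrative, not necessarily the exact code in my repo):

// Illustrative warp-level reductions: the 32 lanes of a warp exchange values
// directly through registers, so per-warp partials never touch shared memory.
// Only one shared-memory round is needed afterwards to combine warp results.
__inline__ __device__ float warp_reduce_max(float val)
{
    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
        val = fmaxf(val, __shfl_down_sync(0xffffffff, val, offset));
    return val;
}

__inline__ __device__ float warp_reduce_sum(float val)
{
    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}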

