Histogram

A histogram is a display of the count of occurrences of data values in a dataset. The data items are often grouped into ranges or bins (e.g. a specific color in an image, or a letter (or group of letters) in a text). Histograms are used whenever a large volume of data needs to be analyzed to distill interesting events (think of feature extraction in computer vision).

Parallelization Strategy:

  • Each thread processes a portion of the input array.
  • Atomic operations are used to update the global histogram to avoid race conditions.
__global__ void histogram_kernel(const char *__restrict__ data,
                                 unsigned int *__restrict__ histogram,
                                 const unsigned int length) {
  const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;

  if (i < length) {
    const int alphabet_position = data[i] - FIRST_CHAR;
    if (alphabet_position >= 0 && alphabet_position < ALPHABET_SIZE)
      atomicAdd(&(histogram[alphabet_position / CHAR_PER_BIN]), 1);
  }
}
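
A minimal host-side sketch of how this kernel could be launched. The constants FIRST_CHAR, ALPHABET_SIZE, CHAR_PER_BIN, and BIN_NUM come from the kernels in this section; the values chosen here (lowercase letters, 4 letters per bin) are only illustrative, as is the helper name run_histogram.

// Illustrative definitions; the real values depend on the chosen binning scheme.
#define FIRST_CHAR 'a'
#define ALPHABET_SIZE 26
#define CHAR_PER_BIN 4
#define BIN_NUM ((ALPHABET_SIZE + CHAR_PER_BIN - 1) / CHAR_PER_BIN)

void run_histogram(const char *h_data, const unsigned int length, unsigned int *h_histogram) {
  char *d_data;
  unsigned int *d_histogram;
  cudaMalloc(&d_data, length * sizeof(char));
  cudaMalloc(&d_histogram, BIN_NUM * sizeof(unsigned int));
  cudaMemcpy(d_data, h_data, length * sizeof(char), cudaMemcpyHostToDevice);
  cudaMemset(d_histogram, 0, BIN_NUM * sizeof(unsigned int));

  // One thread per input element.
  const unsigned int block_size = 256;
  const unsigned int grid_size  = (length + block_size - 1) / block_size;
  histogram_kernel<<<grid_size, block_size>>>(d_data, d_histogram, length);

  cudaMemcpy(h_histogram, d_histogram, BIN_NUM * sizeof(unsigned int), cudaMemcpyDeviceToHost);
  cudaFree(d_data);
  cudaFree(d_histogram);
}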

Optimizations

Coarsening

Each thread processes multiple elements via a grid-stride loop, so the grid can be sized independently of the input length.

__global__ void histogram_kernel(const char *__restrict__ data,
                                 unsigned int *__restrict__ histogram,
                                 const unsigned int length) {
  const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const unsigned int stride = blockDim.x * gridDim.x;
  
  // All threads in a block handle consecutive elements in each iteration
  for (unsigned int i = tid; i < length; i += stride) {
    const int alphabet_position = data[i] - FIRST_CHAR;
    if (alphabet_position >= 0 && alphabet_position < ALPHABET_SIZE)
      atomicAdd(&(histogram[alphabet_position / CHAR_PER_BIN]), 1);
  }
}
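
Because of the grid-stride loop, the grid no longer needs one thread per element. A minimal launch sketch, assuming the same device buffers as before; the use of the occupancy API to size the grid is one possible choice, not part of the original code.

void run_histogram_coarsened(const char *d_data, unsigned int *d_histogram, const unsigned int length) {
  const unsigned int block_size = 256;
  int num_blocks_per_sm = 0;
  // How many resident blocks of this kernel fit on one SM (no dynamic shared memory is used).
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, histogram_kernel, block_size, 0);
  int device = 0, sm_count = 0;
  cudaGetDevice(&device);
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);
  // Launch just enough blocks to fill the GPU; the grid-stride loop covers the rest of the input.
  const unsigned int grid_size = sm_count * num_blocks_per_sm;
  histogram_kernel<<<grid_size, block_size>>>(d_data, d_histogram, length);
}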

Privatization

Each block (or each thread) maintains a private histogram that is later merged into the global one. We can privatize at different levels:

  • shared memory (one private copy per block)
  • registers (one private copy per thread)
  • or even separate regions of global memory, which can further reduce contention (a sketch of this option appears at the end of this subsection)
__global__ void histogram_kernel(const char *__restrict__ data,
                                 unsigned int *__restrict__ histogram,
                                 const unsigned int length) {
  
  const unsigned int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int stride = blockDim.x * gridDim.x;
  // Privatized bins
  __shared__ unsigned int histo_s[BIN_NUM];
#pragma unroll
  for (unsigned int binIdx = threadIdx.x; binIdx < BIN_NUM; binIdx += blockDim.x) { histo_s[binIdx] = 0; }
  __syncthreads();
  // Histogram
  for (unsigned int i = tid; i < length; i += stride) {
    const int alphabet_position = data[i] - FIRST_CHAR;
    if (alphabet_position >= 0 && alphabet_position < ALPHABET_SIZE)
      atomicAdd(&(histo_s[alphabet_position / CHAR_PER_BIN]), 1);
  }
  __syncthreads();
// Commit to global memory
#pragma unroll
  for (unsigned int binIdx = threadIdx.x; binIdx < BIN_NUM; binIdx += blockDim.x) {
    const unsigned int binValue = histo_s[binIdx];
    if (binValue > 0) {
      atomicAdd(&(histogram[binIdx]), binValue);
    }
  }
}

This approach builds a per-block private histogram in shared memory before committing it to the global histogram, which reduces contention on the global bins. The next variant privatizes one level further: each thread first accumulates into its own per-thread copy (ideally held in registers), then merges into shared memory, and finally into global memory.

__global__ void histogram_kernel(const char *__restrict__ data,
                                 unsigned int *__restrict__ histogram,
                                 const unsigned int length) {
  const unsigned int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int stride = blockDim.x * gridDim.x;
  
  // Privatized bins
  unsigned int histo_p[BIN_NUM];
  
  __shared__ unsigned int histo_s[BIN_NUM];
#pragma unroll
  for (unsigned int i = threadIdx.x; i < BIN_NUM; i += blockDim.x) {
    histo_s[i] = 0;
  }
  __syncthreads();
 
#pragma unroll
  for (unsigned int i = 0; i < BIN_NUM; i++) histo_p[i] = 0u;
  // Histogram
  for (unsigned int i = tid; i < length; i += stride) {
    const int alphabet_position = data[i] - FIRST_CHAR;
    if (alphabet_position >= 0 && alphabet_position < ALPHABET_SIZE)
      histo_p[alphabet_position / CHAR_PER_BIN] += 1;
  }
  // Commit to shared memory
#pragma unroll
  for (unsigned int binIdx = 0; binIdx < BIN_NUM; binIdx++) {
    const unsigned int binValue = histo_p[binIdx];
    if (binValue > 0) {
      atomicAdd(&(histo_s[binIdx]), binValue);
    }
  }
 
  __syncthreads(); // Wait until every thread has committed its private counts to shared memory
 
// Commit to global memory
#pragma unroll
  for (unsigned int binIdx = threadIdx.x; binIdx < BIN_NUM; binIdx += blockDim.x) {
    const unsigned int binValue = histo_s[binIdx];
    if (binValue > 0) {
      atomicAdd(&(histogram[binIdx]), binValue);
    }
  }
}
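
The third option from the list above, privatizing into separate regions of global memory, is not covered by the kernels shown so far. A minimal sketch under the assumption that each block owns its own sub-histogram in a zero-initialized buffer of gridDim.x * BIN_NUM counters; the kernel names and the separate reduction pass are illustrative, not part of the original code.

__global__ void histogram_kernel_gmem_private(const char *__restrict__ data,
                                              unsigned int *__restrict__ histogram_private,
                                              const unsigned int length) {
  const unsigned int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int stride = blockDim.x * gridDim.x;
  // Each block updates only its own copy of the bins, so blocks never contend with each other.
  unsigned int *block_bins = histogram_private + blockIdx.x * BIN_NUM;
  for (unsigned int i = tid; i < length; i += stride) {
    const int alphabet_position = data[i] - FIRST_CHAR;
    if (alphabet_position >= 0 && alphabet_position < ALPHABET_SIZE)
      atomicAdd(&block_bins[alphabet_position / CHAR_PER_BIN], 1);
  }
}

// Second pass: reduce the per-block copies into the final histogram (one thread per bin).
__global__ void histogram_reduce_kernel(const unsigned int *__restrict__ histogram_private,
                                        unsigned int *__restrict__ histogram,
                                        const unsigned int num_copies) {
  const unsigned int bin = blockIdx.x * blockDim.x + threadIdx.x;
  if (bin < BIN_NUM) {
    unsigned int sum = 0;
    for (unsigned int copy = 0; copy < num_copies; ++copy)
      sum += histogram_private[copy * BIN_NUM + bin];
    histogram[bin] = sum;
  }
}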

Aggregation

Aggregating consecutive updates that target the same bin reduces the number of atomic operations. Each thread keeps a running accumulator and issues an atomicAdd only when the destination bin changes (and once more after its loop ends), which pays off when the input contains long runs of identical values and contention on a few bins is high.

__global__ void histogram_kernel(const char *__restrict__ data,
                                 unsigned int *__restrict__ histogram,
                                 const unsigned int length) {
  const unsigned int tid    = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int stride = blockDim.x * gridDim.x;
  // Privatized bins
  __shared__ unsigned int histo_s[BIN_NUM];
#pragma unroll
  for (unsigned int binIdx = threadIdx.x; binIdx < BIN_NUM; binIdx += blockDim.x) { histo_s[binIdx] = 0u; }
  __syncthreads();
  
  // Histogram
  unsigned int accumulator = 0;
  int prevBinIdx           = -1;
  for (unsigned int i = tid; i < length; i += stride) {
    int alphabet_position = data[i] - FIRST_CHAR;
    if (alphabet_position >= 0 && alphabet_position < ALPHABET_SIZE) {
      const int bin = alphabet_position / CHAR_PER_BIN;
      if (bin == prevBinIdx) {
        ++accumulator;
      } else {
        if (accumulator > 0) {
          atomicAdd(&(histo_s[prevBinIdx]), accumulator);
        }
        accumulator = 1;
        prevBinIdx  = bin;
      }
    }
  }
  if (accumulator > 0) {
    atomicAdd(&(histo_s[prevBinIdx]), accumulator);
  }
  __syncthreads();
// Commit to global memory
#pragma unroll
  for (unsigned int binIdx = threadIdx.x; binIdx < BIN_NUM; binIdx += blockDim.x) {
    const unsigned int binValue = histo_s[binIdx];
    if (binValue > 0) {
      atomicAdd(&(histogram[binIdx]), binValue);
    }
  }
}
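
Regardless of the variant used, a small host-side check against a sequential reference is a convenient way to validate the results; the helper names below are illustrative.

#include <cassert>
#include <vector>

// Sequential reference implementation of the same binning scheme.
void histogram_cpu(const char *data, const unsigned int length, unsigned int *histogram) {
  for (unsigned int i = 0; i < length; ++i) {
    const int alphabet_position = data[i] - FIRST_CHAR;
    if (alphabet_position >= 0 && alphabet_position < ALPHABET_SIZE)
      histogram[alphabet_position / CHAR_PER_BIN] += 1;
  }
}

void check_histogram(const char *h_data, const unsigned int length, const unsigned int *h_gpu_result) {
  std::vector<unsigned int> reference(BIN_NUM, 0);
  histogram_cpu(h_data, length, reference.data());
  for (unsigned int bin = 0; bin < BIN_NUM; ++bin)
    assert(reference[bin] == h_gpu_result[bin]);
}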