I use this for all my CUDA code - Stolen from TF 1DKern.cu
Posted: Sat Feb 03, 2024 4:37 pm
It's a basic iterator that just takes the hassle out of CUDA. (It's in the TensorFlow source code.)
- Code: Select all
#include <algorithm>
#include <complex>
#include <iostream>
#include <math.h>
#include <vector>
/*Begin 1DKern definition */
/*This is a direct copy of TensorFlow's 1DKern code*/
namespace detail {

// Device-side range over the values begin, begin + delta, begin + 2 * delta,
// ... intended for range-based for loops. Iteration continues for as long as
// the current value compares less than `end`.
template <typename T>
class GpuGridRange {
  // Cursor over the range. An Iterator constructed with a zero delta serves as
  // the end marker (see GpuGridRange::end()).
  struct Iterator {
    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}

    // Current value of the cursor.
    __device__ T operator*() const { return index_; }

    // Advances the cursor by one step of size delta_.
    __device__ Iterator& operator++() {
      index_ += delta_;
      return *this;
    }

    // Inequality that treats every position at or beyond an end marker as
    // equal to that marker, so a loop stops even when the step jumps over
    // `end` instead of landing exactly on it.
    __device__ bool operator!=(const Iterator& rhs) const {
      const bool above = index_ > rhs.index_;
      const bool below = index_ < rhs.index_;
      // rhs is an end marker  -> still "different" only while below it.
      // *this is an end marker -> "different" only while above rhs.
      // Neither is an end marker -> ordinary inequality.
      return !rhs.delta_ ? below : (!delta_ ? above : (below || above));
    }

   private:
    T index_;        // current position
    const T delta_;  // step size; zero marks an end iterator
  };

 public:
  // begin: first value visited; delta: step between values; end: exclusive
  // upper bound.
  __device__ GpuGridRange(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

  // Cursor positioned at the first value, stepping by delta_.
  __device__ Iterator begin() const { return Iterator(begin_, delta_); }

  // End marker: positioned at end_ with a zero step.
  __device__ Iterator end() const { return Iterator(end_, 0); }

 private:
  T begin_;
  T delta_;
  T end_;
};

}  // namespace detail
// Returns the range of x-axis indices the calling thread visits when the grid
// cooperatively covers `count` elements: it starts at the thread's global x
// index (blockIdx.x * blockDim.x + threadIdx.x), advances by the number of
// threads launched along x (gridDim.x * blockDim.x), and ends before `count`.
//
// T is the index type. The first operand of each product is cast to T so the
// multiplication is carried out in T's width; the CUDA built-ins are 32-bit
// unsigned, and multiplying them directly would wrap before the result is
// widened to a 64-bit T on very large grids.
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeX(T count) {
  return detail::GpuGridRange<T>(
      /*begin*/ static_cast<T>(blockIdx.x) * blockDim.x + threadIdx.x,
      /*delta*/ static_cast<T>(gridDim.x) * blockDim.x,
      /*end*/ count);
}
// Returns the range of y-axis indices the calling thread visits when the grid
// cooperatively covers `count` elements: it starts at the thread's global y
// index (blockIdx.y * blockDim.y + threadIdx.y), advances by the number of
// threads launched along y (gridDim.y * blockDim.y), and ends before `count`.
//
// T is the index type. The first operand of each product is cast to T so the
// multiplication is carried out in T's width; the CUDA built-ins are 32-bit
// unsigned, and multiplying them directly would wrap before the result is
// widened to a 64-bit T on very large grids.
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeY(T count) {
  return detail::GpuGridRange<T>(
      /*begin*/ static_cast<T>(blockIdx.y) * blockDim.y + threadIdx.y,
      /*delta*/ static_cast<T>(gridDim.y) * blockDim.y,
      /*end*/ count);
}
// Returns the range of z-axis indices the calling thread visits when the grid
// cooperatively covers `count` elements: it starts at the thread's global z
// index (blockIdx.z * blockDim.z + threadIdx.z), advances by the number of
// threads launched along z (gridDim.z * blockDim.z), and ends before `count`.
//
// T is the index type. The first operand of each product is cast to T so the
// multiplication is carried out in T's width; the CUDA built-ins are 32-bit
// unsigned, and multiplying them directly would wrap before the result is
// widened to a 64-bit T on very large grids.
template <typename T>
__device__ detail::GpuGridRange<T> GpuGridRangeZ(T count) {
  return detail::GpuGridRange<T>(
      /*begin*/ static_cast<T>(blockIdx.z) * blockDim.z + threadIdx.z,
      /*delta*/ static_cast<T>(gridDim.z) * blockDim.z,
      /*end*/ count);
}
// Loop header over the indices handed out by GpuGridRange<axis> for `n`
// elements; `axis` is pasted onto the helper name and must be X, Y or Z.
// The loop variable `i` is declared as int. Usage (device code only):
//   GPU_AXIS_KERNEL_LOOP(row, num_rows, Y) { ... }
#define GPU_AXIS_KERNEL_LOOP(i, n, axis) \
  for (int i : ::GpuGridRange##axis<int>(n))
// Shorthand for the x axis:
//   GPU_1D_KERN_LOOP(i, n) { out[i] = in[i]; }
#define GPU_1D_KERN_LOOP(i, n) GPU_AXIS_KERNEL_LOOP(i, n, X)
/*End 1DKern definition*/