cuda_runtime.h 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. /*
  2. * Minimum CUDA compatibility definitions header
  3. *
  4. * Copyright (c) 2019 rcombs
  5. *
  6. * This file is part of FFmpeg.
  7. *
  8. * FFmpeg is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * FFmpeg is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with FFmpeg; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #ifndef COMPAT_CUDA_CUDA_RUNTIME_H
  23. #define COMPAT_CUDA_CUDA_RUNTIME_H
  24. // Common macros
  25. #define __global__ __attribute__((global))
  26. #define __device__ __attribute__((device))
  27. #define __device_builtin__ __attribute__((device_builtin))
  28. #define __align__(N) __attribute__((aligned(N)))
  29. #define __inline__ __inline__ __attribute__((always_inline))
  30. #define max(a, b) ((a) > (b) ? (a) : (b))
  31. #define min(a, b) ((a) < (b) ? (a) : (b))
  32. #define abs(x) ((x) < 0 ? -(x) : (x))
  33. #define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))
  34. // Basic typedefs
  35. typedef __device_builtin__ unsigned long long cudaTextureObject_t;
  36. typedef struct __device_builtin__ __align__(2) uchar2
  37. {
  38. unsigned char x, y;
  39. } uchar2;
  40. typedef struct __device_builtin__ __align__(4) ushort2
  41. {
  42. unsigned short x, y;
  43. } ushort2;
  44. typedef struct __device_builtin__ __align__(8) float2
  45. {
  46. float x, y;
  47. } float2;
  48. typedef struct __device_builtin__ __align__(8) int2
  49. {
  50. int x, y;
  51. } int2;
  52. typedef struct __device_builtin__ uint3
  53. {
  54. unsigned int x, y, z;
  55. } uint3;
  56. typedef struct uint3 dim3;
  57. typedef struct __device_builtin__ __align__(4) uchar4
  58. {
  59. unsigned char x, y, z, w;
  60. } uchar4;
  61. typedef struct __device_builtin__ __align__(8) ushort4
  62. {
  63. unsigned short x, y, z, w;
  64. } ushort4;
  65. typedef struct __device_builtin__ __align__(16) int4
  66. {
  67. int x, y, z, w;
  68. } int4;
  69. typedef struct __device_builtin__ __align__(16) float4
  70. {
  71. float x, y, z, w;
  72. } float4;
  73. // Accessors for special registers
  74. #define GETCOMP(reg, comp) \
  75. asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
  76. ret.comp = tmp;
  77. #define GET(name, reg) static inline __device__ uint3 name() {\
  78. uint3 ret; \
  79. unsigned tmp; \
  80. GETCOMP(reg, x) \
  81. GETCOMP(reg, y) \
  82. GETCOMP(reg, z) \
  83. return ret; \
  84. }
  85. GET(getBlockIdx, ctaid)
  86. GET(getBlockDim, ntid)
  87. GET(getThreadIdx, tid)
  88. // Instead of externs for these registers, we turn access to them into calls into trivial ASM
  89. #define blockIdx (getBlockIdx())
  90. #define blockDim (getBlockDim())
  91. #define threadIdx (getThreadIdx())
  92. // Basic initializers (simple macros rather than inline functions)
  93. #define make_int2(a, b) ((int2){.x = a, .y = b})
  94. #define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
  95. #define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
  96. #define make_float2(a, b) ((float2){.x = a, .y = b})
  97. #define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d})
  98. #define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
  99. #define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
  100. #define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d})
  101. // Conversions from the tex instruction's 4-register output to various types
  102. #define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
  103. TEX2D(unsigned char, a & 0xFF)
  104. TEX2D(unsigned short, a & 0xFFFF)
  105. TEX2D(float, a)
  106. TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
  107. TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
  108. TEX2D(float2, make_float2(a, b))
  109. TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
  110. TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
  111. TEX2D(float4, make_float4(a, b, c, d))
  112. // Template calling tex instruction and converting the output to the selected type
  113. template<typename T>
  114. inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
  115. {
  116. T ret;
  117. unsigned ret1, ret2, ret3, ret4;
  118. asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
  119. "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :
  120. "l"(texObject), "f"(x), "f"(y));
  121. conv(&ret, ret1, ret2, ret3, ret4);
  122. return ret;
  123. }
  124. template<>
  125. inline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y)
  126. {
  127. float4 ret;
  128. asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
  129. "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) :
  130. "l"(texObject), "f"(x), "f"(y));
  131. return ret;
  132. }
  133. template<>
  134. inline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y)
  135. {
  136. return tex2D<float4>(texObject, x, y).x;
  137. }
  138. template<>
  139. inline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y)
  140. {
  141. float4 ret = tex2D<float4>(texObject, x, y);
  142. return make_float2(ret.x, ret.y);
  143. }
  144. // Math helper functions
  145. static inline __device__ float floorf(float a) { return __builtin_floorf(a); }
  146. static inline __device__ float floor(float a) { return __builtin_floorf(a); }
  147. static inline __device__ double floor(double a) { return __builtin_floor(a); }
  148. static inline __device__ float ceilf(float a) { return __builtin_ceilf(a); }
  149. static inline __device__ float ceil(float a) { return __builtin_ceilf(a); }
  150. static inline __device__ double ceil(double a) { return __builtin_ceil(a); }
  151. static inline __device__ float truncf(float a) { return __builtin_truncf(a); }
  152. static inline __device__ float trunc(float a) { return __builtin_truncf(a); }
  153. static inline __device__ double trunc(double a) { return __builtin_trunc(a); }
  154. static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
  155. static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
  156. static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
  157. static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); }
  158. static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); }
  159. static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
  160. static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }
  161. static inline __device__ float __expf(float a) { return __nvvm_ex2_approx_f(a * (float)__builtin_log2(__builtin_exp(1))); }
  162. static inline __device__ float __powf(float a, float b) { return __nvvm_ex2_approx_f(__nvvm_lg2_approx_f(a) * b); }
  163. #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */