gpu.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. #ifndef _GPU_H
  2. #define _GPU_H
  3. #include <isl/ast.h>
  4. #include <isl/id.h>
  5. #include <isl/id_to_ast_expr.h>
  6. #include <pet.h>
  7. #include "ppcg.h"
  8. #include "ppcg_options.h"
  9. /* An access to an outer array element or an iterator.
  10. * Accesses to iterators have an access relation that maps to an unnamed space.
  11. * An access may be both read and write.
  12. * If the access relation is empty, then the output dimension may
  13. * not be equal to the dimension of the corresponding array.
  14. */
  15. struct gpu_stmt_access {
  16. /* Access reads elements */
  17. int read;
  18. /* Access writes elements */
  19. int write;
  20. /* All writes are definite writes. */
  21. int exact_write;
  22. /* Is a single, fixed element being accessed? */
  23. isl_bool fixed_element;
  24. /* The number of index expressions specified in the access. */
  25. int n_index;
  26. /* May access relation */
  27. isl_map *access;
  28. /* May access relation with as domain a mapping from iteration domain
  29. * to a reference identifier.
  30. */
  31. isl_map *tagged_access;
  32. /* The reference id of the corresponding pet_expr. */
  33. isl_id *ref_id;
  34. struct gpu_stmt_access *next;
  35. };
  36. /* A representation of a user statement.
  37. * "stmt" points to the corresponding pet statement.
  38. * "id" is the identifier of the instance set of the statement.
  39. * "accesses" is a linked list of accesses performed by the statement.
  40. * If the statement has been killed, i.e., if it will not be scheduled,
  41. * then this linked list may be empty even if the actual statement does
  42. * perform accesses.
  43. */
  44. struct gpu_stmt {
  45. isl_id *id;
  46. struct pet_stmt *stmt;
  47. struct gpu_stmt_access *accesses;
  48. };
  49. /* Represents an outer array possibly accessed by a gpu_prog.
  50. */
  51. struct gpu_array_info {
  52. /* The array data space. */
  53. isl_space *space;
  54. /* Element type. */
  55. char *type;
  56. /* Element size. */
  57. int size;
  58. /* Name of the array. */
  59. char *name;
  60. /* Declared extent of original array. */
  61. isl_set *declared_extent;
  62. /* AST expression for declared size of original array. */
  63. isl_ast_expr *declared_size;
  64. /* Extent of the array that needs to be copied. */
  65. isl_set *extent;
  66. /* Number of indices. */
  67. unsigned n_index;
  68. /* For each index, a bound on "extent" in that direction. */
  69. isl_multi_pw_aff *bound;
  70. /* The corresponding access AST expression, if the array needs
  71. * to be allocated on the device.
  72. */
  73. isl_ast_expr *bound_expr;
  74. /* All references to this array; point to elements of a linked list. */
  75. int n_ref;
  76. struct gpu_stmt_access **refs;
  77. /* Is this array accessed at all by the program? */
  78. int accessed;
  79. /* Is this a scalar that is read-only within the entire program? */
  80. int read_only_scalar;
  81. /* Are the elements of the array structures? */
  82. int has_compound_element;
  83. /* Are the elements only accessed through constant index expressions? */
  84. int only_fixed_element;
  85. /* Is the array local to the scop? */
  86. int local;
  87. /* Is the array local and should it be declared on the host? */
  88. int declare_local;
  89. /* Is the corresponding global device memory accessed in any way? */
  90. int global;
  91. /* Should the array be linearized? */
  92. int linearize;
  93. /* Order dependences on this array.
  94. * Only used if live_range_reordering option is set.
  95. * It is set to NULL otherwise.
  96. */
  97. isl_union_map *dep_order;
  98. void *user;
  99. };
  100. /* Represents an outer array accessed by a ppcg_kernel, localized
  101. * to the context of this kernel.
  102. *
  103. * "array" points to the corresponding array in the gpu_prog.
  104. * The "n_group" "groups" are the reference groups associated to the array.
  105. * If "force_private" is set, then the array (in practice a scalar)
  106. * must be mapped to a register.
  107. * "global" is set if the global device memory corresponding
  108. * to this array is accessed by the kernel.
  109. * "bound" is equal to array->bound specialized to the current kernel.
  110. * "bound_expr" is the corresponding access AST expression.
  111. */
  112. struct gpu_local_array_info {
  113. struct gpu_array_info *array;
  114. int n_group;
  115. struct gpu_array_ref_group **groups;
  116. int force_private;
  117. int global;
  118. unsigned n_index;
  119. isl_multi_pw_aff *bound;
  120. isl_ast_expr *bound_expr;
  121. };
  122. __isl_give isl_ast_expr *gpu_local_array_info_linearize_index(
  123. struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr);
  124. /* A sequence of "n" names of types.
  125. */
  126. struct gpu_types {
  127. int n;
  128. char **name;
  129. };
  130. /* "read" and "write" contain the original access relations, possibly
  131. * involving member accesses.
  132. *
  133. * The elements of "array", as well as the ranges of "copy_in" and "copy_out"
  134. * only refer to the outer arrays of any possible member accesses.
  135. */
  136. struct gpu_prog {
  137. isl_ctx *ctx;
  138. struct ppcg_scop *scop;
  139. /* Set of parameter values */
  140. isl_set *context;
  141. /* All potential read accesses in the entire program */
  142. isl_union_map *read;
  143. /* All potential write accesses in the entire program */
  144. isl_union_map *may_write;
  145. /* All definite write accesses in the entire program */
  146. isl_union_map *must_write;
  147. /* All tagged definite kills in the entire program */
  148. isl_union_map *tagged_must_kill;
  149. /* The set of inner array elements that may be preserved. */
  150. isl_union_set *may_persist;
  151. /* A mapping from all innermost arrays to their outer arrays. */
  152. isl_union_map *to_outer;
  153. /* A mapping from the outer arrays to all corresponding inner arrays. */
  154. isl_union_map *to_inner;
  155. /* A mapping from all intermediate arrays to their outer arrays,
  156. * including an identity mapping from the anonymous 1D space to itself.
  157. */
  158. isl_union_map *any_to_outer;
  159. /* Order dependences on non-scalars. */
  160. isl_union_map *array_order;
  161. /* Array of statements */
  162. int n_stmts;
  163. struct gpu_stmt *stmts;
  164. int n_array;
  165. struct gpu_array_info *array;
  166. };
  167. struct gpu_gen {
  168. isl_ctx *ctx;
  169. struct ppcg_options *options;
  170. /* Callback for printing of AST in appropriate format. */
  171. __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
  172. struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
  173. struct gpu_types *types, void *user);
  174. void *print_user;
  175. isl_id_to_ast_expr *(*build_ast_expr)(void *stmt,
  176. isl_ast_build *build,
  177. isl_multi_pw_aff *(*fn_index)(
  178. __isl_take isl_multi_pw_aff *mpa, isl_id *id,
  179. void *user),
  180. void *user_index,
  181. isl_ast_expr *(*fn_expr)(isl_ast_expr *expr,
  182. isl_id *id, void *user),
  183. void *user_expr);
  184. struct gpu_prog *prog;
  185. /* The generated AST. */
  186. isl_ast_node *tree;
  187. /* The sequence of types for which a definition has been printed. */
  188. struct gpu_types types;
  189. /* User specified tile, grid and block sizes for each kernel */
  190. isl_union_map *sizes;
  191. /* Effectively used tile, grid and block sizes for each kernel */
  192. isl_union_map *used_sizes;
  193. /* Identifier of the next kernel. */
  194. int kernel_id;
  195. };
  196. enum ppcg_group_access_type {
  197. ppcg_access_global,
  198. ppcg_access_shared,
  199. ppcg_access_private
  200. };
  201. enum ppcg_kernel_stmt_type {
  202. ppcg_kernel_copy,
  203. ppcg_kernel_domain,
  204. ppcg_kernel_sync
  205. };
  206. /* Representation of special statements, in particular copy statements
  207. * and __syncthreads statements, inside a kernel.
  208. *
  209. * type represents the kind of statement
  210. *
  211. *
  212. * for ppcg_kernel_copy statements we have
  213. *
  214. * read is set if the statement should copy data from global memory
  215. * to shared memory or registers.
  216. *
  217. * index expresses an access to the array element that needs to be copied
  218. * local_index expresses the corresponding element in the tile
  219. *
  220. * array refers to the original array being copied
  221. * local_array is a pointer to the appropriate element in the "array"
  222. * array of the ppcg_kernel to which this copy access belongs
  223. *
  224. *
  225. * for ppcg_kernel_domain statements we have
  226. *
  227. * stmt is the corresponding input statement
  228. *
  229. * n_access is the number of accesses in stmt
  230. * access is an array of local information about the accesses
  231. */
  232. struct ppcg_kernel_stmt {
  233. enum ppcg_kernel_stmt_type type;
  234. union {
  235. struct {
  236. int read;
  237. isl_ast_expr *index;
  238. isl_ast_expr *local_index;
  239. struct gpu_array_info *array;
  240. struct gpu_local_array_info *local_array;
  241. } c;
  242. struct {
  243. struct gpu_stmt *stmt;
  244. isl_id_to_ast_expr *ref2expr;
  245. } d;
  246. } u;
  247. };
  248. /* Representation of a local variable in a kernel.
  249. */
  250. struct ppcg_kernel_var {
  251. struct gpu_array_info *array;
  252. enum ppcg_group_access_type type;
  253. char *name;
  254. isl_vec *size;
  255. };
  256. /* Representation of a kernel.
  257. *
  258. * prog describes the original code from which the kernel is extracted.
  259. *
  260. * id is the sequence number of the kernel.
  261. *
  262. * block_ids contains the list of block identifiers for this kernel.
  263. * thread_ids contains the list of thread identifiers for this kernel.
  264. *
  265. * the first n_grid elements of grid_dim represent the specified size
  266. * of the grid.
  267. * the first n_block elements of block_dim represent the specified or
  268. * effective size of the block.
  269. * Note that in the input file, the sizes of the grid and the blocks
  270. * are specified in the order x, y, z, but internally, the sizes
  271. * are stored in reverse order, so that the last element always
  272. * refers to the x dimension.
  273. *
  274. * grid_size reflects the effective grid size.
  275. * grid_size_expr contains a corresponding access AST expression, built within
  276. * the context where the launch appears.
  277. *
  278. * context contains the values of the parameters and outer schedule dimensions
  279. * for which any statement instance in this kernel needs to be executed.
  280. *
  281. * n_sync is the number of synchronization operations that have
  282. * been introduced in the schedule tree corresponding to this kernel (so far).
  283. *
  284. * core contains the spaces of the statement domains that form
  285. * the core computation of the kernel. It is used to navigate
  286. * the tree during the construction of the device part of the schedule
  287. * tree in gpu_create_kernel.
  288. *
  289. * expanded_domain contains the original statement instances,
  290. * i.e., those that appear in the domains of access relations,
  291. * that are involved in the kernel.
  292. * contraction maps those original statement instances to
  293. * the statement instances that are active at the point
  294. * in the schedule tree where the kernel is created.
  295. *
  296. * arrays is the set of possibly accessed outer array elements.
  297. *
  298. * space is the schedule space of the AST context. That is, it represents
  299. * the loops of the generated host code containing the kernel launch.
  300. *
  301. * n_array is the total number of arrays in the input program and also
  302. * the number of element in the array array.
  303. * array contains information about each array that is local
  304. * to the current kernel. If an array is not used in a kernel,
  305. * then the corresponding entry does not contain any information.
  306. *
  307. * any_force_private is set if any array in the kernel is marked force_private
  308. *
  309. * block_filter contains constraints on the domain elements in the kernel
  310. * that encode the mapping to block identifiers, where the block identifiers
  311. * are represented by "n_grid" parameters with as names the elements
  312. * of "block_ids".
  313. *
  314. * thread_filter contains constraints on the domain elements in the kernel
  315. * that encode the mapping to thread identifiers, where the thread identifiers
  316. * are represented by "n_block" parameters with as names the elements
  317. * of "thread_ids".
  318. *
  319. * copy_schedule corresponds to the schedule dimensions of
  320. * the (tiled) schedule for this kernel that have been taken into account
  321. * for computing private/shared memory tiles.
  322. * The domain corresponds to the original statement instances, i.e.,
  323. * those that appear in the leaves of the schedule tree.
  324. * copy_schedule_dim is the dimension of this schedule.
  325. *
  326. * sync_writes contains write references that require synchronization.
  327. * Each reference is represented by a universe set in a space [S[i,j] -> R[]]
  328. * with S[i,j] the statement instance space and R[] the array reference.
  329. */
  330. struct ppcg_kernel {
  331. isl_ctx *ctx;
  332. struct ppcg_options *options;
  333. struct gpu_prog *prog;
  334. int id;
  335. isl_id_list *block_ids;
  336. isl_id_list *thread_ids;
  337. int n_grid;
  338. int n_block;
  339. int grid_dim[2];
  340. int block_dim[3];
  341. isl_multi_pw_aff *grid_size;
  342. isl_ast_expr *grid_size_expr;
  343. isl_set *context;
  344. int n_sync;
  345. isl_union_set *core;
  346. isl_union_set *arrays;
  347. isl_union_pw_multi_aff *contraction;
  348. isl_union_set *expanded_domain;
  349. isl_space *space;
  350. int n_array;
  351. struct gpu_local_array_info *array;
  352. int n_var;
  353. struct ppcg_kernel_var *var;
  354. int any_force_private;
  355. isl_union_set *block_filter;
  356. isl_union_set *thread_filter;
  357. isl_union_pw_multi_aff *copy_schedule;
  358. int copy_schedule_dim;
  359. isl_union_set *sync_writes;
  360. isl_ast_node *tree;
  361. };
  362. int gpu_array_is_scalar(struct gpu_array_info *array);
  363. int gpu_array_is_read_only_scalar(struct gpu_array_info *array);
  364. int gpu_array_requires_device_allocation(struct gpu_array_info *array);
  365. __isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array);
  366. isl_bool gpu_array_can_be_private(struct gpu_array_info *array);
  367. struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop);
  368. void *gpu_prog_free(struct gpu_prog *prog);
  369. int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i);
  370. int generate_gpu(isl_ctx *ctx, const char *input, FILE *out,
  371. struct ppcg_options *options,
  372. __isl_give isl_printer *(*print)(__isl_take isl_printer *p,
  373. struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
  374. struct gpu_types *types, void *user), void *user);
  375. __isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen,
  376. __isl_take isl_schedule_node *node, int scale,
  377. __isl_keep isl_multi_val *sizes);
  378. __isl_give isl_schedule *get_schedule(struct gpu_gen *gen);
  379. int has_any_permutable_node(__isl_keep isl_schedule *schedule);
  380. __isl_give isl_schedule *map_to_device(struct gpu_gen *gen,
  381. __isl_take isl_schedule *schedule,
  382. int to_from_device);
  383. __isl_give isl_ast_node *generate_code(struct gpu_gen *gen,
  384. __isl_take isl_schedule *schedule);
  385. __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog);
  386. void collect_references(struct gpu_prog *prog, struct gpu_array_info *array);
  387. void collect_order_dependences(struct gpu_prog *prog);
  388. isl_bool only_fixed_element_accessed(struct gpu_array_info *array);
  389. #endif