#ifndef _GPU_H #define _GPU_H #include #include #include #include #include "ppcg.h" #include "ppcg_options.h" /* An access to an outer array element or an iterator. * Accesses to iterators have an access relation that maps to an unnamed space. * An access may be both read and write. * If the access relation is empty, then the output dimension may * not be equal to the dimension of the corresponding array. */ struct gpu_stmt_access { /* Access reads elements */ int read; /* Access writes elements */ int write; /* All writes are definite writes. */ int exact_write; /* Is a single, fixed element being accessed? */ isl_bool fixed_element; /* The number of index expressions specified in the access. */ int n_index; /* May access relation */ isl_map *access; /* May access relation with as domain a mapping from iteration domain * to a reference identifier. */ isl_map *tagged_access; /* The reference id of the corresponding pet_expr. */ isl_id *ref_id; struct gpu_stmt_access *next; }; /* A representation of a user statement. * "stmt" points to the corresponding pet statement. * "id" is the identifier of the instance set of the statement. * "accesses" is a linked list of accesses performed by the statement. * If the statement has been killed, i.e., if it will not be scheduled, * then this linked list may be empty even if the actual statement does * perform accesses. */ struct gpu_stmt { isl_id *id; struct pet_stmt *stmt; struct gpu_stmt_access *accesses; }; /* Represents an outer array possibly accessed by a gpu_prog. */ struct gpu_array_info { /* The array data space. */ isl_space *space; /* Element type. */ char *type; /* Element size. */ int size; /* Name of the array. */ char *name; /* Declared extent of original array. */ isl_set *declared_extent; /* AST expression for declared size of original array. */ isl_ast_expr *declared_size; /* Extent of the array that needs to be copied. */ isl_set *extent; /* Number of indices. */ unsigned n_index; /* For each index, a bound on "extent" in that direction. */ isl_multi_pw_aff *bound; /* The corresponding access AST expression, if the array needs * to be allocated on the device. */ isl_ast_expr *bound_expr; /* All references to this array; point to elements of a linked list. */ int n_ref; struct gpu_stmt_access **refs; /* Is this array accessed at all by the program? */ int accessed; /* Is this a scalar that is read-only within the entire program? */ int read_only_scalar; /* Are the elements of the array structures? */ int has_compound_element; /* Are the elements only accessed through constant index expressions? */ int only_fixed_element; /* Is the array local to the scop? */ int local; /* Is the array local and should it be declared on the host? */ int declare_local; /* Is the corresponding global device memory accessed in any way? */ int global; /* Should the array be linearized? */ int linearize; /* Order dependences on this array. * Only used if live_range_reordering option is set. * It is set to NULL otherwise. */ isl_union_map *dep_order; void *user; }; /* Represents an outer array accessed by a ppcg_kernel, localized * to the context of this kernel. * * "array" points to the corresponding array in the gpu_prog. * The "n_group" "groups" are the reference groups associated to the array. * If "force_private" is set, then the array (in practice a scalar) * must be mapped to a register. * "global" is set if the global device memory corresponding * to this array is accessed by the kernel. * "bound" is equal to array->bound specialized to the current kernel. * "bound_expr" is the corresponding access AST expression. */ struct gpu_local_array_info { struct gpu_array_info *array; int n_group; struct gpu_array_ref_group **groups; int force_private; int global; unsigned n_index; isl_multi_pw_aff *bound; isl_ast_expr *bound_expr; }; __isl_give isl_ast_expr *gpu_local_array_info_linearize_index( struct gpu_local_array_info *array, __isl_take isl_ast_expr *expr); /* A sequence of "n" names of types. */ struct gpu_types { int n; char **name; }; /* "read" and "write" contain the original access relations, possibly * involving member accesses. * * The elements of "array", as well as the ranges of "copy_in" and "copy_out" * only refer to the outer arrays of any possible member accesses. */ struct gpu_prog { isl_ctx *ctx; struct ppcg_scop *scop; /* Set of parameter values */ isl_set *context; /* All potential read accesses in the entire program */ isl_union_map *read; /* All potential write accesses in the entire program */ isl_union_map *may_write; /* All definite write accesses in the entire program */ isl_union_map *must_write; /* All tagged definite kills in the entire program */ isl_union_map *tagged_must_kill; /* The set of inner array elements that may be preserved. */ isl_union_set *may_persist; /* A mapping from all innermost arrays to their outer arrays. */ isl_union_map *to_outer; /* A mapping from the outer arrays to all corresponding inner arrays. */ isl_union_map *to_inner; /* A mapping from all intermediate arrays to their outer arrays, * including an identity mapping from the anonymous 1D space to itself. */ isl_union_map *any_to_outer; /* Order dependences on non-scalars. */ isl_union_map *array_order; /* Array of statements */ int n_stmts; struct gpu_stmt *stmts; int n_array; struct gpu_array_info *array; }; struct gpu_gen { isl_ctx *ctx; struct ppcg_options *options; /* Callback for printing of AST in appropriate format. */ __isl_give isl_printer *(*print)(__isl_take isl_printer *p, struct gpu_prog *prog, __isl_keep isl_ast_node *tree, struct gpu_types *types, void *user); void *print_user; isl_id_to_ast_expr *(*build_ast_expr)(void *stmt, isl_ast_build *build, isl_multi_pw_aff *(*fn_index)( __isl_take isl_multi_pw_aff *mpa, isl_id *id, void *user), void *user_index, isl_ast_expr *(*fn_expr)(isl_ast_expr *expr, isl_id *id, void *user), void *user_expr); struct gpu_prog *prog; /* The generated AST. */ isl_ast_node *tree; /* The sequence of types for which a definition has been printed. */ struct gpu_types types; /* User specified tile, grid and block sizes for each kernel */ isl_union_map *sizes; /* Effectively used tile, grid and block sizes for each kernel */ isl_union_map *used_sizes; /* Identifier of the next kernel. */ int kernel_id; }; enum ppcg_group_access_type { ppcg_access_global, ppcg_access_shared, ppcg_access_private }; enum ppcg_kernel_stmt_type { ppcg_kernel_copy, ppcg_kernel_domain, ppcg_kernel_sync }; /* Representation of special statements, in particular copy statements * and __syncthreads statements, inside a kernel. * * type represents the kind of statement * * * for ppcg_kernel_copy statements we have * * read is set if the statement should copy data from global memory * to shared memory or registers. * * index expresses an access to the array element that needs to be copied * local_index expresses the corresponding element in the tile * * array refers to the original array being copied * local_array is a pointer to the appropriate element in the "array" * array of the ppcg_kernel to which this copy access belongs * * * for ppcg_kernel_domain statements we have * * stmt is the corresponding input statement * * n_access is the number of accesses in stmt * access is an array of local information about the accesses */ struct ppcg_kernel_stmt { enum ppcg_kernel_stmt_type type; union { struct { int read; isl_ast_expr *index; isl_ast_expr *local_index; struct gpu_array_info *array; struct gpu_local_array_info *local_array; } c; struct { struct gpu_stmt *stmt; isl_id_to_ast_expr *ref2expr; } d; } u; }; /* Representation of a local variable in a kernel. */ struct ppcg_kernel_var { struct gpu_array_info *array; enum ppcg_group_access_type type; char *name; isl_vec *size; }; /* Representation of a kernel. * * prog describes the original code from which the kernel is extracted. * * id is the sequence number of the kernel. * * block_ids contains the list of block identifiers for this kernel. * thread_ids contains the list of thread identifiers for this kernel. * * the first n_grid elements of grid_dim represent the specified size * of the grid. * the first n_block elements of block_dim represent the specified or * effective size of the block. * Note that in the input file, the sizes of the grid and the blocks * are specified in the order x, y, z, but internally, the sizes * are stored in reverse order, so that the last element always * refers to the x dimension. * * grid_size reflects the effective grid size. * grid_size_expr contains a corresponding access AST expression, built within * the context where the launch appears. * * context contains the values of the parameters and outer schedule dimensions * for which any statement instance in this kernel needs to be executed. * * n_sync is the number of synchronization operations that have * been introduced in the schedule tree corresponding to this kernel (so far). * * core contains the spaces of the statement domains that form * the core computation of the kernel. It is used to navigate * the tree during the construction of the device part of the schedule * tree in gpu_create_kernel. * * expanded_domain contains the original statement instances, * i.e., those that appear in the domains of access relations, * that are involved in the kernel. * contraction maps those original statement instances to * the statement instances that are active at the point * in the schedule tree where the kernel is created. * * arrays is the set of possibly accessed outer array elements. * * space is the schedule space of the AST context. That is, it represents * the loops of the generated host code containing the kernel launch. * * n_array is the total number of arrays in the input program and also * the number of element in the array array. * array contains information about each array that is local * to the current kernel. If an array is not used in a kernel, * then the corresponding entry does not contain any information. * * any_force_private is set if any array in the kernel is marked force_private * * block_filter contains constraints on the domain elements in the kernel * that encode the mapping to block identifiers, where the block identifiers * are represented by "n_grid" parameters with as names the elements * of "block_ids". * * thread_filter contains constraints on the domain elements in the kernel * that encode the mapping to thread identifiers, where the thread identifiers * are represented by "n_block" parameters with as names the elements * of "thread_ids". * * copy_schedule corresponds to the schedule dimensions of * the (tiled) schedule for this kernel that have been taken into account * for computing private/shared memory tiles. * The domain corresponds to the original statement instances, i.e., * those that appear in the leaves of the schedule tree. * copy_schedule_dim is the dimension of this schedule. * * sync_writes contains write references that require synchronization. * Each reference is represented by a universe set in a space [S[i,j] -> R[]] * with S[i,j] the statement instance space and R[] the array reference. */ struct ppcg_kernel { isl_ctx *ctx; struct ppcg_options *options; struct gpu_prog *prog; int id; isl_id_list *block_ids; isl_id_list *thread_ids; int n_grid; int n_block; int grid_dim[2]; int block_dim[3]; isl_multi_pw_aff *grid_size; isl_ast_expr *grid_size_expr; isl_set *context; int n_sync; isl_union_set *core; isl_union_set *arrays; isl_union_pw_multi_aff *contraction; isl_union_set *expanded_domain; isl_space *space; int n_array; struct gpu_local_array_info *array; int n_var; struct ppcg_kernel_var *var; int any_force_private; isl_union_set *block_filter; isl_union_set *thread_filter; isl_union_pw_multi_aff *copy_schedule; int copy_schedule_dim; isl_union_set *sync_writes; isl_ast_node *tree; }; int gpu_array_is_scalar(struct gpu_array_info *array); int gpu_array_is_read_only_scalar(struct gpu_array_info *array); int gpu_array_requires_device_allocation(struct gpu_array_info *array); __isl_give isl_set *gpu_array_positive_size_guard(struct gpu_array_info *array); isl_bool gpu_array_can_be_private(struct gpu_array_info *array); struct gpu_prog *gpu_prog_alloc(isl_ctx *ctx, struct ppcg_scop *scop); void *gpu_prog_free(struct gpu_prog *prog); int ppcg_kernel_requires_array_argument(struct ppcg_kernel *kernel, int i); int generate_gpu(isl_ctx *ctx, const char *input, FILE *out, struct ppcg_options *options, __isl_give isl_printer *(*print)(__isl_take isl_printer *p, struct gpu_prog *prog, __isl_keep isl_ast_node *tree, struct gpu_types *types, void *user), void *user); __isl_give isl_schedule_node *gpu_create_kernel(struct gpu_gen *gen, __isl_take isl_schedule_node *node, int scale, __isl_keep isl_multi_val *sizes); __isl_give isl_schedule *get_schedule(struct gpu_gen *gen); int has_any_permutable_node(__isl_keep isl_schedule *schedule); __isl_give isl_schedule *map_to_device(struct gpu_gen *gen, __isl_take isl_schedule *schedule, int to_from_device); __isl_give isl_ast_node *generate_code(struct gpu_gen *gen, __isl_take isl_schedule *schedule); __isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog); void collect_references(struct gpu_prog *prog, struct gpu_array_info *array); void collect_order_dependences(struct gpu_prog *prog); isl_bool only_fixed_element_accessed(struct gpu_array_info *array); #endif