/* * Copyright 2013 Ecole Normale Superieure * Copyright 2015 Sven Verdoolaege * * Use of this software is governed by the MIT license * * Written by Sven Verdoolaege, * Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France */ #include #include #include #include #include #include "hybrid.h" #include "gpu_hybrid.h" #include "gpu_tree.h" #include "schedule.h" #include "util.h" /* Have all domain elements been filtered out before reaching * the "node" position in the schedule tree? */ static isl_bool has_empty_domain(__isl_keep isl_schedule_node *node) { isl_union_set *domain; isl_bool empty; domain = isl_schedule_node_get_domain(node); empty = isl_union_set_is_empty(domain); isl_union_set_free(domain); return empty; } /* Given a pointer to a phase in the result of hybrid tiling, * map the phase to the device, provided the phase is non-empty. * Empty phases can occur if the input schedule domain can be * covered by a small number of hexagons that all belong to the same phase. * * The input has the following form: * * M - CT - P - C - ... * * with M the phase marker, CT the space tiling, P the original * parent band and C the original child band. * The (outer dimensions of the) C band need to be mapped to threads. * The (outer dimension of the) CT band needs to be mapped to blocks. * The mapping to shared memory needs to be computed between the CT and * the P band. * * The C band is first shifted to start at zero. * Then the appropriate markers are introduced and a kernel is * created for the tree rooted at CT. * If the "unroll_gpu_tile" option is set, then the AST generator * is instructed to unroll the P and C bands. */ static __isl_give isl_schedule_node *update_phase( __isl_take isl_schedule_node *node, void *user) { struct gpu_gen *gen = user; int depth0, depth; isl_ctx *ctx; isl_id *id; isl_bool empty_domain; ppcg_ht_phase *phase; empty_domain = has_empty_domain(node); if (empty_domain < 0) return isl_schedule_node_free(node); if (empty_domain) return node; if (!node) return NULL; ctx = isl_schedule_node_get_ctx(node); phase = ppcg_ht_phase_extract_from_mark(node); depth0 = isl_schedule_node_get_tree_depth(node); node = isl_schedule_node_child(node, 0); node = isl_schedule_node_child(node, 0); node = isl_schedule_node_child(node, 0); node = ppcg_ht_phase_shift_space_point(phase, node); if (gen->options->unroll_gpu_tile) node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); id = isl_id_alloc(ctx, "thread", NULL); node = isl_schedule_node_insert_mark(node, id); node = isl_schedule_node_parent(node); if (gen->options->unroll_gpu_tile) node = ppcg_set_schedule_node_type(node, isl_ast_loop_unroll); id = isl_id_alloc(ctx, "shared", NULL); node = isl_schedule_node_insert_mark(node, id); node = isl_schedule_node_parent(node); node = gpu_create_kernel(gen, node, 0, NULL); depth = isl_schedule_node_get_tree_depth(node); node = isl_schedule_node_ancestor(node, depth - depth0); return node; } /* Apply hybrid tiling on "node" and its parent based on the (valid) * bounds on the relative dependence distances "bounds" and * the tile sizes in "tile_sizes". * The number of elements in "tile_sizes" is at least as large * as the sum of the dimensions of the parent and the child node. * * Convert the tile_sizes to an isl_multi_val in the right space, * insert the hybrid tiling and then create a kernel inside each phase. * Finally, remove the phase marks. */ __isl_give isl_schedule_node *gpu_hybrid_tile(struct gpu_gen *gen, __isl_take isl_schedule_node *node, __isl_take ppcg_ht_bounds *bounds, int *tile_sizes) { isl_multi_val *mv; isl_space *space, *space2; if (!node || !bounds) goto error; space2 = isl_schedule_node_band_get_space(node); node = isl_schedule_node_parent(node); space = isl_schedule_node_band_get_space(node); space = isl_space_product(space, space2); mv = ppcg_multi_val_from_int_list(space, tile_sizes); node = ppcg_ht_bounds_insert_tiling(bounds, mv, node, gen->options); node = hybrid_tile_foreach_phase(node, &update_phase, gen); node = hybrid_tile_drop_phase_marks(node); return node; error: isl_schedule_node_free(node); ppcg_ht_bounds_free(bounds); return NULL; }