unsharp_opencl.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. /*
  2. * Copyright (C) 2013 Wei Gao <weigao@multicorewareinc.com>
  3. * Copyright (C) 2013 Lenny Wang
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. /**
  22. * @file
  23. * unsharp input video
  24. */
  25. #include "unsharp_opencl.h"
  26. #include "libavutil/common.h"
  27. #include "libavutil/opencl_internal.h"
  28. #define PLANE_NUM 3
  29. #define ROUND_TO_16(a) (((((a) - 1)/16)+1)*16)
  30. static inline void add_mask_counter(uint32_t *dst, uint32_t *counter1, uint32_t *counter2, int len)
  31. {
  32. int i;
  33. for (i = 0; i < len; i++) {
  34. dst[i] = counter1[i] + counter2[i];
  35. }
  36. }
  37. static int compute_mask(int step, uint32_t *mask)
  38. {
  39. int i, z, ret = 0;
  40. int counter_size = sizeof(uint32_t) * (2 * step + 1);
  41. uint32_t *temp1_counter, *temp2_counter, **counter;
  42. temp1_counter = av_mallocz(counter_size);
  43. if (!temp1_counter) {
  44. ret = AVERROR(ENOMEM);
  45. goto end;
  46. }
  47. temp2_counter = av_mallocz(counter_size);
  48. if (!temp2_counter) {
  49. ret = AVERROR(ENOMEM);
  50. goto end;
  51. }
  52. counter = av_mallocz_array(2 * step + 1, sizeof(uint32_t *));
  53. if (!counter) {
  54. ret = AVERROR(ENOMEM);
  55. goto end;
  56. }
  57. for (i = 0; i < 2 * step + 1; i++) {
  58. counter[i] = av_mallocz(counter_size);
  59. if (!counter[i]) {
  60. ret = AVERROR(ENOMEM);
  61. goto end;
  62. }
  63. }
  64. for (i = 0; i < 2 * step + 1; i++) {
  65. memset(temp1_counter, 0, counter_size);
  66. temp1_counter[i] = 1;
  67. for (z = 0; z < step * 2; z += 2) {
  68. add_mask_counter(temp2_counter, counter[z], temp1_counter, step * 2);
  69. memcpy(counter[z], temp1_counter, counter_size);
  70. add_mask_counter(temp1_counter, counter[z + 1], temp2_counter, step * 2);
  71. memcpy(counter[z + 1], temp2_counter, counter_size);
  72. }
  73. }
  74. memcpy(mask, temp1_counter, counter_size);
  75. end:
  76. av_freep(&temp1_counter);
  77. av_freep(&temp2_counter);
  78. for (i = 0; i < 2 * step + 1; i++) {
  79. av_freep(&counter[i]);
  80. }
  81. av_freep(&counter);
  82. return ret;
  83. }
  84. static int copy_separable_masks(cl_mem cl_mask_x, cl_mem cl_mask_y, int step_x, int step_y)
  85. {
  86. int ret = 0;
  87. uint32_t *mask_x, *mask_y;
  88. size_t size_mask_x = sizeof(uint32_t) * (2 * step_x + 1);
  89. size_t size_mask_y = sizeof(uint32_t) * (2 * step_y + 1);
  90. mask_x = av_mallocz_array(2 * step_x + 1, sizeof(uint32_t));
  91. if (!mask_x) {
  92. ret = AVERROR(ENOMEM);
  93. goto end;
  94. }
  95. mask_y = av_mallocz_array(2 * step_y + 1, sizeof(uint32_t));
  96. if (!mask_y) {
  97. ret = AVERROR(ENOMEM);
  98. goto end;
  99. }
  100. ret = compute_mask(step_x, mask_x);
  101. if (ret < 0)
  102. goto end;
  103. ret = compute_mask(step_y, mask_y);
  104. if (ret < 0)
  105. goto end;
  106. ret = av_opencl_buffer_write(cl_mask_x, (uint8_t *)mask_x, size_mask_x);
  107. ret = av_opencl_buffer_write(cl_mask_y, (uint8_t *)mask_y, size_mask_y);
  108. end:
  109. av_freep(&mask_x);
  110. av_freep(&mask_y);
  111. return ret;
  112. }
  113. static int generate_mask(AVFilterContext *ctx)
  114. {
  115. cl_mem masks[4];
  116. cl_mem mask_matrix[2];
  117. int i, ret = 0, step_x[2], step_y[2];
  118. UnsharpContext *unsharp = ctx->priv;
  119. mask_matrix[0] = unsharp->opencl_ctx.cl_luma_mask;
  120. mask_matrix[1] = unsharp->opencl_ctx.cl_chroma_mask;
  121. masks[0] = unsharp->opencl_ctx.cl_luma_mask_x;
  122. masks[1] = unsharp->opencl_ctx.cl_luma_mask_y;
  123. masks[2] = unsharp->opencl_ctx.cl_chroma_mask_x;
  124. masks[3] = unsharp->opencl_ctx.cl_chroma_mask_y;
  125. step_x[0] = unsharp->luma.steps_x;
  126. step_x[1] = unsharp->chroma.steps_x;
  127. step_y[0] = unsharp->luma.steps_y;
  128. step_y[1] = unsharp->chroma.steps_y;
  129. /* use default kernel if any matrix dim larger than 8 due to limited local mem size */
  130. if (step_x[0]>8 || step_x[1]>8 || step_y[0]>8 || step_y[1]>8)
  131. unsharp->opencl_ctx.use_fast_kernels = 0;
  132. else
  133. unsharp->opencl_ctx.use_fast_kernels = 1;
  134. if (!masks[0] || !masks[1] || !masks[2] || !masks[3]) {
  135. av_log(ctx, AV_LOG_ERROR, "Luma mask and chroma mask should not be NULL\n");
  136. return AVERROR(EINVAL);
  137. }
  138. if (!mask_matrix[0] || !mask_matrix[1]) {
  139. av_log(ctx, AV_LOG_ERROR, "Luma mask and chroma mask should not be NULL\n");
  140. return AVERROR(EINVAL);
  141. }
  142. for (i = 0; i < 2; i++) {
  143. ret = copy_separable_masks(masks[2*i], masks[2*i+1], step_x[i], step_y[i]);
  144. if (ret < 0)
  145. return ret;
  146. }
  147. return ret;
  148. }
  149. int ff_opencl_apply_unsharp(AVFilterContext *ctx, AVFrame *in, AVFrame *out)
  150. {
  151. int ret;
  152. AVFilterLink *link = ctx->inputs[0];
  153. UnsharpContext *unsharp = ctx->priv;
  154. cl_int status;
  155. FFOpenclParam kernel1 = {0};
  156. FFOpenclParam kernel2 = {0};
  157. int width = link->w;
  158. int height = link->h;
  159. int cw = FF_CEIL_RSHIFT(link->w, unsharp->hsub);
  160. int ch = FF_CEIL_RSHIFT(link->h, unsharp->vsub);
  161. size_t globalWorkSize1d = width * height + 2 * ch * cw;
  162. size_t globalWorkSize2dLuma[2];
  163. size_t globalWorkSize2dChroma[2];
  164. size_t localWorkSize2d[2] = {16, 16};
  165. if (unsharp->opencl_ctx.use_fast_kernels) {
  166. globalWorkSize2dLuma[0] = (size_t)ROUND_TO_16(width);
  167. globalWorkSize2dLuma[1] = (size_t)ROUND_TO_16(height);
  168. globalWorkSize2dChroma[0] = (size_t)ROUND_TO_16(cw);
  169. globalWorkSize2dChroma[1] = (size_t)(2*ROUND_TO_16(ch));
  170. kernel1.ctx = ctx;
  171. kernel1.kernel = unsharp->opencl_ctx.kernel_luma;
  172. ret = avpriv_opencl_set_parameter(&kernel1,
  173. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf),
  174. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf),
  175. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask_x),
  176. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask_y),
  177. FF_OPENCL_PARAM_INFO(unsharp->luma.amount),
  178. FF_OPENCL_PARAM_INFO(unsharp->luma.scalebits),
  179. FF_OPENCL_PARAM_INFO(unsharp->luma.halfscale),
  180. FF_OPENCL_PARAM_INFO(in->linesize[0]),
  181. FF_OPENCL_PARAM_INFO(out->linesize[0]),
  182. FF_OPENCL_PARAM_INFO(width),
  183. FF_OPENCL_PARAM_INFO(height),
  184. NULL);
  185. if (ret < 0)
  186. return ret;
  187. kernel2.ctx = ctx;
  188. kernel2.kernel = unsharp->opencl_ctx.kernel_chroma;
  189. ret = avpriv_opencl_set_parameter(&kernel2,
  190. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf),
  191. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf),
  192. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask_x),
  193. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask_y),
  194. FF_OPENCL_PARAM_INFO(unsharp->chroma.amount),
  195. FF_OPENCL_PARAM_INFO(unsharp->chroma.scalebits),
  196. FF_OPENCL_PARAM_INFO(unsharp->chroma.halfscale),
  197. FF_OPENCL_PARAM_INFO(in->linesize[0]),
  198. FF_OPENCL_PARAM_INFO(in->linesize[1]),
  199. FF_OPENCL_PARAM_INFO(out->linesize[0]),
  200. FF_OPENCL_PARAM_INFO(out->linesize[1]),
  201. FF_OPENCL_PARAM_INFO(link->w),
  202. FF_OPENCL_PARAM_INFO(link->h),
  203. FF_OPENCL_PARAM_INFO(cw),
  204. FF_OPENCL_PARAM_INFO(ch),
  205. NULL);
  206. if (ret < 0)
  207. return ret;
  208. status = clEnqueueNDRangeKernel(unsharp->opencl_ctx.command_queue,
  209. unsharp->opencl_ctx.kernel_luma, 2, NULL,
  210. globalWorkSize2dLuma, localWorkSize2d, 0, NULL, NULL);
  211. status |=clEnqueueNDRangeKernel(unsharp->opencl_ctx.command_queue,
  212. unsharp->opencl_ctx.kernel_chroma, 2, NULL,
  213. globalWorkSize2dChroma, localWorkSize2d, 0, NULL, NULL);
  214. if (status != CL_SUCCESS) {
  215. av_log(ctx, AV_LOG_ERROR, "OpenCL run kernel error occurred: %s\n", av_opencl_errstr(status));
  216. return AVERROR_EXTERNAL;
  217. }
  218. } else { /* use default kernel */
  219. kernel1.ctx = ctx;
  220. kernel1.kernel = unsharp->opencl_ctx.kernel_default;
  221. ret = avpriv_opencl_set_parameter(&kernel1,
  222. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf),
  223. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf),
  224. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask),
  225. FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask),
  226. FF_OPENCL_PARAM_INFO(unsharp->luma.amount),
  227. FF_OPENCL_PARAM_INFO(unsharp->chroma.amount),
  228. FF_OPENCL_PARAM_INFO(unsharp->luma.steps_x),
  229. FF_OPENCL_PARAM_INFO(unsharp->luma.steps_y),
  230. FF_OPENCL_PARAM_INFO(unsharp->chroma.steps_x),
  231. FF_OPENCL_PARAM_INFO(unsharp->chroma.steps_y),
  232. FF_OPENCL_PARAM_INFO(unsharp->luma.scalebits),
  233. FF_OPENCL_PARAM_INFO(unsharp->chroma.scalebits),
  234. FF_OPENCL_PARAM_INFO(unsharp->luma.halfscale),
  235. FF_OPENCL_PARAM_INFO(unsharp->chroma.halfscale),
  236. FF_OPENCL_PARAM_INFO(in->linesize[0]),
  237. FF_OPENCL_PARAM_INFO(in->linesize[1]),
  238. FF_OPENCL_PARAM_INFO(out->linesize[0]),
  239. FF_OPENCL_PARAM_INFO(out->linesize[1]),
  240. FF_OPENCL_PARAM_INFO(link->h),
  241. FF_OPENCL_PARAM_INFO(link->w),
  242. FF_OPENCL_PARAM_INFO(ch),
  243. FF_OPENCL_PARAM_INFO(cw),
  244. NULL);
  245. if (ret < 0)
  246. return ret;
  247. status = clEnqueueNDRangeKernel(unsharp->opencl_ctx.command_queue,
  248. unsharp->opencl_ctx.kernel_default, 1, NULL,
  249. &globalWorkSize1d, NULL, 0, NULL, NULL);
  250. if (status != CL_SUCCESS) {
  251. av_log(ctx, AV_LOG_ERROR, "OpenCL run kernel error occurred: %s\n", av_opencl_errstr(status));
  252. return AVERROR_EXTERNAL;
  253. }
  254. }
  255. //blocking map is suffficient, no need for clFinish
  256. //clFinish(unsharp->opencl_ctx.command_queue);
  257. return av_opencl_buffer_read_image(out->data, unsharp->opencl_ctx.out_plane_size,
  258. unsharp->opencl_ctx.plane_num, unsharp->opencl_ctx.cl_outbuf,
  259. unsharp->opencl_ctx.cl_outbuf_size);
  260. }
  261. int ff_opencl_unsharp_init(AVFilterContext *ctx)
  262. {
  263. int ret = 0;
  264. char build_opts[96];
  265. UnsharpContext *unsharp = ctx->priv;
  266. ret = av_opencl_init(NULL);
  267. if (ret < 0)
  268. return ret;
  269. ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask,
  270. sizeof(uint32_t) * (2 * unsharp->luma.steps_x + 1) * (2 * unsharp->luma.steps_y + 1),
  271. CL_MEM_READ_ONLY, NULL);
  272. if (ret < 0)
  273. return ret;
  274. ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask,
  275. sizeof(uint32_t) * (2 * unsharp->chroma.steps_x + 1) * (2 * unsharp->chroma.steps_y + 1),
  276. CL_MEM_READ_ONLY, NULL);
  277. // separable filters
  278. if (ret < 0)
  279. return ret;
  280. ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask_x,
  281. sizeof(uint32_t) * (2 * unsharp->luma.steps_x + 1),
  282. CL_MEM_READ_ONLY, NULL);
  283. if (ret < 0)
  284. return ret;
  285. ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask_y,
  286. sizeof(uint32_t) * (2 * unsharp->luma.steps_y + 1),
  287. CL_MEM_READ_ONLY, NULL);
  288. if (ret < 0)
  289. return ret;
  290. ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask_x,
  291. sizeof(uint32_t) * (2 * unsharp->chroma.steps_x + 1),
  292. CL_MEM_READ_ONLY, NULL);
  293. if (ret < 0)
  294. return ret;
  295. ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask_y,
  296. sizeof(uint32_t) * (2 * unsharp->chroma.steps_y + 1),
  297. CL_MEM_READ_ONLY, NULL);
  298. if (ret < 0)
  299. return ret;
  300. ret = generate_mask(ctx);
  301. if (ret < 0)
  302. return ret;
  303. unsharp->opencl_ctx.plane_num = PLANE_NUM;
  304. unsharp->opencl_ctx.command_queue = av_opencl_get_command_queue();
  305. if (!unsharp->opencl_ctx.command_queue) {
  306. av_log(ctx, AV_LOG_ERROR, "Unable to get OpenCL command queue in filter 'unsharp'\n");
  307. return AVERROR(EINVAL);
  308. }
  309. snprintf(build_opts, 96, "-D LU_RADIUS_X=%d -D LU_RADIUS_Y=%d -D CH_RADIUS_X=%d -D CH_RADIUS_Y=%d",
  310. 2*unsharp->luma.steps_x+1, 2*unsharp->luma.steps_y+1, 2*unsharp->chroma.steps_x+1, 2*unsharp->chroma.steps_y+1);
  311. unsharp->opencl_ctx.program = av_opencl_compile("unsharp", build_opts);
  312. if (!unsharp->opencl_ctx.program) {
  313. av_log(ctx, AV_LOG_ERROR, "OpenCL failed to compile program 'unsharp'\n");
  314. return AVERROR(EINVAL);
  315. }
  316. if (unsharp->opencl_ctx.use_fast_kernels) {
  317. if (!unsharp->opencl_ctx.kernel_luma) {
  318. unsharp->opencl_ctx.kernel_luma = clCreateKernel(unsharp->opencl_ctx.program, "unsharp_luma", &ret);
  319. if (ret != CL_SUCCESS) {
  320. av_log(ctx, AV_LOG_ERROR, "OpenCL failed to create kernel 'unsharp_luma'\n");
  321. return ret;
  322. }
  323. }
  324. if (!unsharp->opencl_ctx.kernel_chroma) {
  325. unsharp->opencl_ctx.kernel_chroma = clCreateKernel(unsharp->opencl_ctx.program, "unsharp_chroma", &ret);
  326. if (ret < 0) {
  327. av_log(ctx, AV_LOG_ERROR, "OpenCL failed to create kernel 'unsharp_chroma'\n");
  328. return ret;
  329. }
  330. }
  331. }
  332. else {
  333. if (!unsharp->opencl_ctx.kernel_default) {
  334. unsharp->opencl_ctx.kernel_default = clCreateKernel(unsharp->opencl_ctx.program, "unsharp_default", &ret);
  335. if (ret < 0) {
  336. av_log(ctx, AV_LOG_ERROR, "OpenCL failed to create kernel 'unsharp_default'\n");
  337. return ret;
  338. }
  339. }
  340. }
  341. return ret;
  342. }
  343. void ff_opencl_unsharp_uninit(AVFilterContext *ctx)
  344. {
  345. UnsharpContext *unsharp = ctx->priv;
  346. av_opencl_buffer_release(&unsharp->opencl_ctx.cl_inbuf);
  347. av_opencl_buffer_release(&unsharp->opencl_ctx.cl_outbuf);
  348. av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask);
  349. av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask);
  350. av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask_x);
  351. av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask_x);
  352. av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask_y);
  353. av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask_y);
  354. clReleaseKernel(unsharp->opencl_ctx.kernel_default);
  355. clReleaseKernel(unsharp->opencl_ctx.kernel_luma);
  356. clReleaseKernel(unsharp->opencl_ctx.kernel_chroma);
  357. clReleaseProgram(unsharp->opencl_ctx.program);
  358. unsharp->opencl_ctx.command_queue = NULL;
  359. av_opencl_uninit();
  360. }
  361. int ff_opencl_unsharp_process_inout_buf(AVFilterContext *ctx, AVFrame *in, AVFrame *out)
  362. {
  363. int ret = 0;
  364. AVFilterLink *link = ctx->inputs[0];
  365. UnsharpContext *unsharp = ctx->priv;
  366. int ch = FF_CEIL_RSHIFT(link->h, unsharp->vsub);
  367. if ((!unsharp->opencl_ctx.cl_inbuf) || (!unsharp->opencl_ctx.cl_outbuf)) {
  368. unsharp->opencl_ctx.in_plane_size[0] = (in->linesize[0] * in->height);
  369. unsharp->opencl_ctx.in_plane_size[1] = (in->linesize[1] * ch);
  370. unsharp->opencl_ctx.in_plane_size[2] = (in->linesize[2] * ch);
  371. unsharp->opencl_ctx.out_plane_size[0] = (out->linesize[0] * out->height);
  372. unsharp->opencl_ctx.out_plane_size[1] = (out->linesize[1] * ch);
  373. unsharp->opencl_ctx.out_plane_size[2] = (out->linesize[2] * ch);
  374. unsharp->opencl_ctx.cl_inbuf_size = unsharp->opencl_ctx.in_plane_size[0] +
  375. unsharp->opencl_ctx.in_plane_size[1] +
  376. unsharp->opencl_ctx.in_plane_size[2];
  377. unsharp->opencl_ctx.cl_outbuf_size = unsharp->opencl_ctx.out_plane_size[0] +
  378. unsharp->opencl_ctx.out_plane_size[1] +
  379. unsharp->opencl_ctx.out_plane_size[2];
  380. if (!unsharp->opencl_ctx.cl_inbuf) {
  381. ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_inbuf,
  382. unsharp->opencl_ctx.cl_inbuf_size,
  383. CL_MEM_READ_ONLY, NULL);
  384. if (ret < 0)
  385. return ret;
  386. }
  387. if (!unsharp->opencl_ctx.cl_outbuf) {
  388. ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_outbuf,
  389. unsharp->opencl_ctx.cl_outbuf_size,
  390. CL_MEM_READ_WRITE, NULL);
  391. if (ret < 0)
  392. return ret;
  393. }
  394. }
  395. return av_opencl_buffer_write_image(unsharp->opencl_ctx.cl_inbuf,
  396. unsharp->opencl_ctx.cl_inbuf_size,
  397. 0, in->data, unsharp->opencl_ctx.in_plane_size,
  398. unsharp->opencl_ctx.plane_num);
  399. }