/*
 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/ppc/int_altivec.c
 * miscellaneous integer operations
 */

#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"
#include "types_altivec.h"
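
/**
 * Sum of squared differences between an int8_t and an int16_t array.
 * Processes 16 elements per AltiVec iteration and falls back to scalar
 * code for the remaining size % 16 elements.
 */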
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size) {
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    union {
        vector signed int vscore;
        int32_t score[4];
    } u;
    u.vscore = vec_splat_s32(0);
//
//XXX lazy way, fix it later
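
/* Classic AltiVec unaligned-load idiom: vec_ld() only loads from 16-byte
 * aligned addresses, so load the two aligned blocks straddling b and use
 * vec_perm() with the vec_lvsl() shuffle mask to pick out the 16 bytes
 * that start at b. */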
#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0, b), vec_ld(15, b), vec_lvsl(0, b));

    size16 = size >> 4;
    while (size16) {
        // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        // load pix1 and the first batch of pix2
        vpix1  = vec_unaligned_load(pix1);
        vpix2  = vec_unaligned_load(pix2);
        pix2  += 8;
        // unpack
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2  = vec_unaligned_load(pix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff  = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1  += 16;
        pix2  += 8;
        size16--;
    }
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
    }
    return u.score[3];
}
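
/**
 * v1[i] += v2[i], 8 samples per AltiVec iteration.
 * Assumes order is a multiple of 8 and v1 is 16-byte aligned;
 * v2 may be unaligned (handled with vec_perm/vec_lvsl).
 */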
static void add_int16_altivec(int16_t *v1, int16_t *v2, int order)
{
    int i;
    register vec_s16 vec, *pv;
    for (i = 0; i < order; i += 8) {
        pv  = (vec_s16 *) v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}
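
/**
 * v1[i] -= v2[i], 8 samples per AltiVec iteration.
 * Same alignment requirements as add_int16_altivec().
 */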
static void sub_int16_altivec(int16_t *v1, int16_t *v2, int order)
{
    int i;
    register vec_s16 vec, *pv;
    for (i = 0; i < order; i += 8) {
        pv  = (vec_s16 *) v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}
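
/**
 * Scalar product of v1 and v2: each vec_msum() partial sum is shifted
 * right by 'shift' bits before being accumulated into the result.
 * Assumes order is a multiple of 8 and v2 is 16-byte aligned;
 * v1 may be unaligned.
 */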
static int32_t scalarproduct_int16_altivec(int16_t *v1, int16_t *v2, int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    DECLARE_ALIGNED_16(int32_t, ires);

    shifts = zero_u32v;
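    /* Build the splatted shift count from 5-bit literals: vec_splat_u32()
     * only accepts immediates in the range -16..15, so the 16 contribution
     * of 'shift' is produced as 8 << 1. */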
    if (shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if (shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if (shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if (shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if (shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for (i = 0; i < order; i += 8) {
        pv   = (vec_s16 *) v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t    = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t    = vec_sr(t, shifts);
        res  = vec_sums(t, res);
        v1  += 8;
        v2  += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);

    return ires;
}

void int_init_altivec(DSPContext *c, AVCodecContext *avctx)
{
    c->ssd_int8_vs_int16   = ssd_int8_vs_int16_altivec;
    c->add_int16           = add_int16_altivec;
    c->sub_int16           = sub_int16_altivec;
    c->scalarproduct_int16 = scalarproduct_int16_altivec;
}