//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory.  It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function, when a mem-copy loop, like the one above, is detected.
  .text

// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// Inputs:
//   r0: dest       (advanced by 4 bytes for every word stored)
//   r1: src        (advanced by 4 bytes for every word loaded)
//   r2: num_words  (number of 32-bit words still to be copied)
// Returns: nothing; control goes back to the caller through r31.
// Modifies: r0-r5, p0, and the hardware-loop state set up by loop0/loop1.
//
// The copy is done in three phases, each preceded by an l2fetch of the
// source data that the phase is about to read:
//   1. Prolog: copy word by word until src reaches a 4096-byte page
//              boundary, or until num_words is exhausted.
//   2. Main:   for every whole page left (1024 words), load 512
//              doublewords and store each one as two separate words.
//   3. Epilog: copy the remaining (fewer than 1024) words word by word.
//
// Instructions grouped in one { } packet all read the register values
// from before the packet.  Several packets below rely on this to read the
// old value of a register that a sibling instruction overwrites.
//
// Prefetch descriptors passed to l2fetch:
//   0x202000 = (32 << 16) | (32 << 8), with the low byte OR-ed in at run
//              time as a count of 32-byte blocks;
//   0x202080 = the same, with the low byte preset to 128
//              (128 * 32 bytes = one 4096-byte page).
// NOTE(review): the fields are taken to be stride[31:16], width[15:8] and
// height[7:0], per the l2fetch description in the Hexagon Programmer's
// Reference Manual -- confirm against the manual for the target core.

  .globl  hexagon_memcpy_forward_vp4cp4n2
  .balign 32
  .type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

  // Compute r3 to be the number of words remaining in the current page.
  // At the same time, compute r4 to be the number of 32-byte blocks
  // remaining in the page (for prefetch).
  {
    r3 = sub(##4096, r1)        // Low 12 bits = bytes from src to page end.
    r5 = lsr(r2, #3)            // r5 = num_words / 8 = whole 32-byte blocks.
  }
  {
    // The byte count before end-of-page is in the 12 lowest bits of r3.
    // (If the address in r1 was already page-aligned, the bits are 0.)
    // Both extracts read the r3 produced by the previous packet.
    r3 = extractu(r3, #10, #2)  // Bits 11:2 = words before end-of-page.
    r4 = extractu(r3, #7, #5)   // Bits 11:5 = 32-byte blocks before it.
  }
  {
    r3 = minu(r2, r3)           // Do not copy more than num_words.
    r4 = minu(r5, r4)           // Do not prefetch past num_words either.
  }
  {
    r4 = or(r4, ##2105344)      // 2105344 = 0x202000
    p0 = cmp.eq(r3, #0)         // No words to copy before the page end?
    if (p0.new) jump:nt .Lskipprolog
  }
    l2fetch(r1, r4)
  {
    loop0(.Lprolog, r3)
    r2 = sub(r2, r3)            // r2 = number of words left after the prolog.
  }
  .falign
.Lprolog:
  {
    // Copy one word; the store takes the value loaded in this same packet.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0
.Lskipprolog:
  {
    // Let r3 = number of whole pages left (page = 1024 words).
    r3 = lsr(r2, #10)
    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
  }
  // The main loop is only reached with r1 on a page boundary: either the
  // prolog ran up to the end of the page, or it was skipped because r1 was
  // already page-aligned.  (When the prolog stops early because num_words
  // ran out, r2 is 0 and the branch above is taken.)  This is what makes
  // the doubleword loads below properly aligned.
  {
    loop1(.Lout, r3)            // Outer loop: one iteration per page.
    r2 = extractu(r2, #10, #0)  // r2 = r2 & 1023
    r3 = ##2105472              // r3 = 0x202080 (prefetch info)
  }
    // Iterate over pages.
  .falign
.Lout:
    // Prefetch each individual page.
    l2fetch(r1, r3)
    loop0(.Lpage, #512)         // 512 doublewords = 1024 words = one page.
  .falign
.Lpage:
    r5:4 = memd(r1++#8)
  {
    // Both stores use the r0 from before this packet: the low word goes to
    // r0, the high word to r0 + 4, and r0 then advances by 8.  The data is
    // written as two word stores, so dest only has to be word-aligned.
    memw(r0++#8) = r4
    memw(r0+#4) = r5
  } :endloop0:endloop1
.Lskipmain:
  {
    r3 = ##2105344              // r3 = 0x202000 (prefetch info)
    r4 = lsr(r2, #3)            // r4 = number of 32-byte blocks remaining.
    p0 = cmp.eq(r2, #0)
    if (p0.new) jumpr:nt r31    // Nothing left to copy: return.
  }
  {
    r3 = or(r3, r4)             // Put the block count in the low byte.
    loop0(.Lepilog, r2)
  }
    l2fetch(r1, r3)
  .falign
.Lepilog:
  {
    // Copy one word; the store takes the value loaded in this same packet.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0

    jumpr r31

.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2