You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

319 lines
6.0 KiB

  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build gc
  5. #define NOSPLIT 4
  6. #define RODATA 8
  // castagnoliSSE42 folds the bytes of p into a running CRC using the SSE 4.2
  // CRC32 instruction. The crc passed in and the crc returned are both in
  // non-inverted form.
  //
  // func castagnoliSSE42(crc uint32, p []byte) uint32
  //
  // Register roles:
  //   AX = running crc
  //   SI = read cursor into p
  //   CX = number of bytes not yet consumed
  //   BX = scratch: byte count needed to reach an 8-byte boundary
  TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
      MOVL crc+0(FP), AX
      MOVQ p+8(FP), SI
      MOVQ p_len+16(FP), CX

      // Inputs shorter than 8 bytes are handled entirely by the tail code.
      CMPQ CX, $8
      JL tail

      // Already 8-byte aligned? Then CX >= 8 is known and the loop can start.
      MOVQ SI, BX
      ANDQ $7, BX
      JZ qword_loop

      // BX = (-BX) & 7 = 8 - (SI & 7): bytes up to the next boundary (1..7).
      // Each set bit of BX selects one 1-, 2- or 4-byte step below.
      NEGQ BX
      ANDQ $7, BX

      TESTQ $1, BX
      JZ head_2
      CRC32B (SI), AX
      SUBQ $1, CX
      ADDQ $1, SI

  head_2:
      TESTQ $2, BX
      JZ head_4
      // CRC32W (SI), AX
      BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
      SUBQ $2, CX
      ADDQ $2, SI

  head_4:
      TESTQ $4, BX
      JZ head_done
      // CRC32L (SI), AX
      BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
      SUBQ $4, CX
      ADDQ $4, SI

  head_done:
      // SI is 8-byte aligned here, but aligning may have left fewer than
      // 8 bytes, so re-check before entering the loop.
      CMPQ CX, $8
      JL tail

  qword_loop:
      // Invariant on entry: CX >= 8.
      CRC32Q (SI), AX
      ADDQ $8, SI
      SUBQ $8, CX
      CMPQ CX, $8
      JGE qword_loop

  tail:
      // 0..7 bytes remain; bits 2, 1 and 0 of CX select a 4-, 2- and 1-byte step.
      TESTQ $4, CX
      JZ tail_2
      // CRC32L (SI), AX
      BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
      ADDQ $4, SI

  tail_2:
      TESTQ $2, CX
      JZ tail_1
      // CRC32W (SI), AX
      BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
      ADDQ $2, SI

  tail_1:
      TESTQ $1, CX
      JZ done
      CRC32B (SI), AX

  done:
      MOVL AX, ret+32(FP)
      RET
  // castagnoliSSE42Triple updates three independent (non-inverted) crcs, each
  // with (24*rounds) bytes from its own buffer. The three CRC32Q chains are
  // interleaved so that they do not depend on one another.
  //
  // func castagnoliSSE42Triple(
  // crcA, crcB, crcC uint32,
  // a, b, c []byte,
  // rounds uint32,
  // ) (retA uint32, retB uint32, retC uint32)
  //
  // Only the data pointers of a, b and c are read; their lengths are not
  // checked here, so each buffer must hold at least 24*rounds bytes.
  // When rounds is 0 the three crcs are returned unchanged.
  //
  // Register roles:
  //   AX, CX, DX   = running crcs for a, b, c
  //   R8, R9, R10  = read cursors into a, b, c
  //   R11          = rounds still to run
  TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
      MOVL crcA+0(FP), AX
      MOVL crcB+4(FP), CX
      MOVL crcC+8(FP), DX
      MOVQ a+16(FP), R8 // data pointer
      MOVQ b+40(FP), R9 // data pointer
      MOVQ c+64(FP), R10 // data pointer
      MOVL rounds+88(FP), R11 // 32-bit load zero-extends into R11

      // Without this guard, rounds == 0 would make DECQ wrap to a non-zero
      // value and the loop would run far past the end of the buffers.
      TESTL R11, R11
      JZ triple_done

  loop:
      // One round: 3 x 8 bytes from each buffer.
      CRC32Q (R8), AX
      CRC32Q (R9), CX
      CRC32Q (R10), DX
      CRC32Q 8(R8), AX
      CRC32Q 8(R9), CX
      CRC32Q 8(R10), DX
      CRC32Q 16(R8), AX
      CRC32Q 16(R9), CX
      CRC32Q 16(R10), DX
      ADDQ $24, R8
      ADDQ $24, R9
      ADDQ $24, R10
      DECQ R11
      JNZ loop

  triple_done:
      MOVL AX, retA+96(FP)
      MOVL CX, retB+100(FP)
      MOVL DX, retC+104(FP)
      RET
  // haveSSE42 reports bit 20 of ECX from CPUID leaf 1, the SSE 4.2 feature flag.
  // CPUID overwrites AX, BX, CX and DX.
  //
  // func haveSSE42() bool
  TEXT ·haveSSE42(SB), NOSPLIT, $0
      MOVL $1, AX // CPUID leaf 1: feature information
      CPUID
      SHRL $20, CX // move feature bit 20 down to bit 0
      ANDL $1, CX
      MOVB CX, ret+0(FP)
      RET
  // haveCLMUL reports bit 1 of ECX from CPUID leaf 1, the PCLMULQDQ feature flag.
  // CPUID overwrites AX, BX, CX and DX.
  //
  // func haveCLMUL() bool
  TEXT ·haveCLMUL(SB), NOSPLIT, $0
      MOVL $1, AX // CPUID leaf 1: feature information
      CPUID
      SHRL $1, CX // move feature bit 1 down to bit 0
      ANDL $1, CX
      MOVB CX, ret+0(FP)
      RET
  // haveSSE41 reports bit 19 of ECX from CPUID leaf 1, the SSE 4.1 feature flag.
  // CPUID overwrites AX, BX, CX and DX.
  //
  // func haveSSE41() bool
  TEXT ·haveSSE41(SB), NOSPLIT, $0
      MOVL $1, AX // CPUID leaf 1: feature information
      CPUID
      SHRL $19, CX // move feature bit 19 down to bit 0
      ANDL $1, CX
      MOVB CX, ret+0(FP)
      RET
  // CRC32 polynomial data
  //
  // The constants below are taken from the Linux kernel. They were chosen
  // because they avoid the costly PSHUFB 16-byte reversal that the original
  // Intel paper proposes.
  //
  // Each table is file-local (<>) and read-only. The 16-byte tables are
  // loaded whole into an XMM register by ieeeCLMUL; r5kp is 8 bytes.

  // r2r1kp: constant pair for the 64-bytes-per-iteration folding loop.
  DATA r2r1kp<>+0(SB)/8, $0x154442bd4
  DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
  GLOBL r2r1kp<>(SB), RODATA, $16

  // r4r3kp: constant pair for folding 16 bytes at a time.
  DATA r4r3kp<>+0(SB)/8, $0x1751997d0
  DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
  GLOBL r4r3kp<>(SB), RODATA, $16

  // rupolykp: constant pair used in the last step of the 32-bit reduction.
  DATA rupolykp<>+0(SB)/8, $0x1db710641
  DATA rupolykp<>+8(SB)/8, $0x1f7011641
  GLOBL rupolykp<>(SB), RODATA, $16

  // r5kp: single constant used while narrowing the folded value to 32 bits.
  DATA r5kp<>+0(SB)/8, $0x163cd6124
  GLOBL r5kp<>(SB), RODATA, $8
  // ieeeCLMUL updates crc with the bytes of p using carry-less multiplication
  // (PCLMULQDQ) folding.
  //
  // Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  //
  // len(p) must be at least 64, and must be a multiple of 16. Neither
  // condition is checked here: the first 64 bytes are loaded unconditionally.
  //
  // func ieeeCLMUL(crc uint32, p []byte) uint32
  //
  // Register roles:
  //   SI        = read cursor into p
  //   CX        = bytes not yet consumed (compared as an unsigned quantity)
  //   X0        = initial crc, then the folding constants currently in use
  //   X1..X4    = four 16-byte accumulators
  //   X5..X8    = scratch copies for the high-half products
  //   X10       = next 16 input bytes (16-byte loop)
  //   X11..X14  = next 64 input bytes (64-byte loop)
  TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
      MOVL crc+0(FP), X0 // Initial CRC value
      MOVQ p+8(FP), SI // data pointer
      MOVQ p_len+16(FP), CX // len(p)

      // Seed the four accumulators with the first 64 bytes and mix the
      // initial crc into the first one.
      MOVOU (SI), X1
      MOVOU 16(SI), X2
      MOVOU 32(SI), X3
      MOVOU 48(SI), X4
      PXOR X0, X1
      ADDQ $64, SI // buf+=64
      SUBQ $64, CX // len-=64
      CMPQ CX, $64 // Less than 64 bytes left
      JB remain64

      MOVOA r2r1kp<>+0(SB), X0

  loopback64:
      // Invariant on entry: CX >= 64, so the 64-byte load below is in bounds.
      MOVOA X1, X5
      MOVOA X2, X6
      MOVOA X3, X7
      MOVOA X4, X8

      PCLMULQDQ $0, X0, X1
      PCLMULQDQ $0, X0, X2
      PCLMULQDQ $0, X0, X3
      PCLMULQDQ $0, X0, X4

      // Load next early
      MOVOU (SI), X11
      MOVOU 16(SI), X12
      MOVOU 32(SI), X13
      MOVOU 48(SI), X14

      PCLMULQDQ $0x11, X0, X5
      PCLMULQDQ $0x11, X0, X6
      PCLMULQDQ $0x11, X0, X7
      PCLMULQDQ $0x11, X0, X8

      // Combine both products of each accumulator...
      PXOR X5, X1
      PXOR X6, X2
      PXOR X7, X3
      PXOR X8, X4

      // ...and fold in the freshly loaded input.
      PXOR X11, X1
      PXOR X12, X2
      PXOR X13, X3
      PXOR X14, X4

      ADDQ $64, SI // buf+=64
      SUBQ $64, CX // len-=64
      CMPQ CX, $64 // Less than 64 bytes left?
      JAE loopback64

      // Fold result into a single register (X1)
  remain64:
      MOVOA r4r3kp<>+0(SB), X0

      // X1 = fold(X1) ^ X2
      MOVOA X1, X5
      PCLMULQDQ $0, X0, X1
      PCLMULQDQ $0x11, X0, X5
      PXOR X5, X1
      PXOR X2, X1

      // X1 = fold(X1) ^ X3
      MOVOA X1, X5
      PCLMULQDQ $0, X0, X1
      PCLMULQDQ $0x11, X0, X5
      PXOR X5, X1
      PXOR X3, X1

      // X1 = fold(X1) ^ X4
      MOVOA X1, X5
      PCLMULQDQ $0, X0, X1
      PCLMULQDQ $0x11, X0, X5
      PXOR X5, X1
      PXOR X4, X1

      // If there is less than 16 bytes left we are done
      CMPQ CX, $16
      JB finish

      // Encode 16 bytes
  remain16:
      // Invariant on entry: CX >= 16.
      MOVOU (SI), X10
      MOVOA X1, X5
      PCLMULQDQ $0, X0, X1
      PCLMULQDQ $0x11, X0, X5
      PXOR X5, X1
      PXOR X10, X1
      SUBQ $16, CX
      ADDQ $16, SI
      CMPQ CX, $16
      JAE remain16

  finish:
      // Fold final result into 32 bits and return it.
      // X0 still holds r4r3kp here; X3 becomes all ones to build a mask.
      PCMPEQB X3, X3
      PCLMULQDQ $1, X1, X0
      PSRLDQ $8, X1
      PXOR X0, X1

      MOVOA X1, X2
      MOVQ r5kp<>+0(SB), X0

      // Creates 32 bit mask. Note that we don't care about upper half.
      PSRLQ $32, X3

      PSRLDQ $4, X2
      PAND X3, X1
      PCLMULQDQ $0, X0, X1
      PXOR X2, X1

      MOVOA rupolykp<>+0(SB), X0

      MOVOA X1, X2
      PAND X3, X1
      PCLMULQDQ $0x10, X0, X1
      PAND X3, X1
      PCLMULQDQ $0, X0, X1
      PXOR X2, X1

      // PEXTRD $1, X1, AX (SSE 4.1)
      BYTE $0x66; BYTE $0x0f; BYTE $0x3a
      BYTE $0x16; BYTE $0xc8; BYTE $0x01
      MOVL AX, ret+32(FP)
      RET