| 1 | ;; ********************************************************************* |
|---|
| 2 | ;; Intel64 mpn_submul_1 -- Multiply a limb vector with a limb and |
|---|
| 3 | ;; subtract the result from a second limb vector. |
|---|
| 4 | ;; |
|---|
| 5 | ;; Copyright (C) 2006 Jason Worth Martin <jason.worth.martin@gmail.com> |
|---|
| 6 | ;; |
|---|
| 7 | ;; This program is free software; you can redistribute it and/or modify |
|---|
| 8 | ;; it under the terms of the GNU General Public License as published by |
|---|
| 9 | ;; the Free Software Foundation; either version 2 of the License, or |
|---|
| 10 | ;; (at your option) any later version. |
|---|
| 11 | ;; |
|---|
| 12 | ;; This program is distributed in the hope that it will be useful, |
|---|
| 13 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 14 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 15 | ;; GNU General Public License for more details. |
|---|
| 16 | ;; |
|---|
| 17 | ;; You should have received a copy of the GNU General Public License along |
|---|
| 18 | ;; with this program; if not, write to the Free Software Foundation, Inc., |
|---|
| 19 | ;; 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
|---|
| 20 | ;; |
|---|
| 21 | ;; ************************************************************************** |
|---|
| 22 | ;; |
|---|
| 23 | ;; |
|---|
| 24 | ;; CREDITS |
|---|
| 25 | ;; |
|---|
| 26 | ;; This code is based largely on Pierrick Gaudry's excellent assembly |
|---|
| 27 | ;; support for the AMD64 architecture. (Note that Intel64 and AMD64, |
|---|
| 28 | ;; while using the same instruction set, have very different |
|---|
| 29 | ;; microarchitectures. So, this code performs very poorly on AMD64 |
|---|
| 30 | ;; machines even though it is near-optimal on Intel64.) |
|---|
| 31 | ;; |
|---|
| 32 | ;; Roger Golliver works for Intel and provided insightful improvements |
|---|
| 33 | ;; particularly in using the "lea" instruction to perform additions |
|---|
| 34 | ;; and register-to-register moves. |
|---|
| 35 | ;; |
|---|
| 36 | ;; Eric Bainville has a brilliant exposition of optimizing arithmetic for |
|---|
| 37 | ;; AMD64 (http://www.bealto.it). I adapted many of the ideas he |
|---|
| 38 | ;; describes to Intel64. |
|---|
| 39 | ;; |
|---|
| 40 | ;; Agner Fog is a demigod in the x86 world. If you are reading assembly |
|---|
| 41 | ;; code files and you haven't heard of Agner Fog, then take a minute to |
|---|
| 42 | ;; look over his software optimization manuals (http://www.agner.org/). |
|---|
| 43 | ;; They are superb. |
|---|
| 44 | ;; |
|---|
| 45 | ;; ********************************************************************* |
|---|
| 46 | |
|---|
| 47 | |
|---|
| 48 | ;; With a 4-way unroll the code has |
|---|
| 49 | ;; |
|---|
| 50 | ;; cycles/limb |
|---|
| 51 | ;; Hammer: 4.6 |
|---|
| 52 | ;; Woodcrest: 4.6 |
|---|
| 53 | ;; |
|---|
| 54 | ;; With increased unrolling, it appears to converge to 4 cycles/limb |
|---|
| 55 | ;; on Intel Core 2 machines. I believe that this is optimal, however |
|---|
| 56 | ;; it requires such absurd unrolling that it becomes unusable for all |
|---|
| 57 | ;; but the largest inputs. A 4-way unroll seems like a good balance |
|---|
| 58 | ;; to me because then commonly used input sizes (e.g. 1024bit Public |
|---|
| 59 | ;; keys) still benifit from the speed up. |
|---|
| 60 | |
|---|
| 61 | ;; |
|---|
| 62 | ;; This is just a check to see if we are in my code testing sandbox |
|---|
| 63 | ;; or if we are actually in the GMP source tree |
|---|
| 64 | ;; |
|---|
| 65 | %ifdef __JWM_Test_Code__ |
|---|
| 66 | %include 'yasm_mac.inc' |
|---|
| 67 | %else |
|---|
| 68 | %include '../yasm_mac.inc' |
|---|
| 69 | %endif |
|---|
| 70 | |
|---|
| 71 | |
|---|
| 72 | |
|---|
| 73 | ;; ********************************************************************* |
|---|
| 74 | ;; ********************************************************************* |
|---|
| 75 | ;; |
|---|
| 76 | ;; Here are the important macro parameters for the code |
|---|
| 77 | ;; |
|---|
| 78 | ;; BpL is Bytes per Limb (8 since this is 64bit code) |
|---|
| 79 | ;; |
|---|
| 80 | ;; UNROLL_SIZE is a power of 2 for which we unroll the code. |
|---|
| 81 | ;; possible values are 2,4,8,15,..., 256. A reasonable |
|---|
| 82 | ;; value is probably 4. If really large inputs |
|---|
| 83 | ;; are expected, then 16 is probably good. Larger |
|---|
| 84 | ;; values are really only useful for flashy |
|---|
| 85 | ;; benchmarks and testing asymptotic behavior. |
|---|
| 86 | ;; |
|---|
| 87 | ;; THRESHOLD is the minimum number of limbs needed before we bother |
|---|
| 88 | ;; using the complicated loop. A reasonable value is |
|---|
| 89 | ;; 2*UNROLL_SIZE + 6 |
|---|
| 90 | ;; |
|---|
| 91 | ;; ********************************************************************* |
|---|
| 92 | ;; ********************************************************************* |
|---|
| 93 | %define BpL 8 |
|---|
| 94 | %define UNROLL_SIZE 4 |
|---|
| 95 | %define UNROLL_MASK UNROLL_SIZE-1 |
|---|
| 96 | %define THRESHOLD 2*UNROLL_SIZE+6 |
|---|
| 97 | |
|---|
| 98 | ;; Here is a convenient Macro for addressing |
|---|
| 99 | ;; memory. Entries of the form |
|---|
| 100 | ;; |
|---|
| 101 | ;; ADDR(ptr,index,displacement) |
|---|
| 102 | ;; |
|---|
| 103 | ;; get converted to |
|---|
| 104 | ;; |
|---|
| 105 | ;; [displacement*BpL + ptr + index*BpL] |
|---|
| 106 | ;; |
|---|
| 107 | %define ADDR(a,b,c) [c*BpL+a+b*BpL] |
|---|
| 108 | |
|---|
| 109 | |
|---|
| 110 | ;; Register Usage |
|---|
| 111 | ;; -------- ----- |
|---|
| 112 | ;; rax low word from mul |
|---|
| 113 | ;; rbx* |
|---|
| 114 | ;; rcx s2limb |
|---|
| 115 | ;; rdx high word from mul |
|---|
| 116 | ;; rsi s1p |
|---|
| 117 | ;; rdi rp |
|---|
| 118 | ;; rbp* Base Pointer |
|---|
| 119 | ;; rsp* Stack Pointer |
|---|
| 120 | ;; r8 A_x |
|---|
| 121 | ;; r9 A_y |
|---|
| 122 | ;; r10 A_z |
|---|
| 123 | ;; r11 B_x |
|---|
| 124 | ;; r12* B_y |
|---|
| 125 | ;; r13* B_z |
|---|
| 126 | ;; r14* temp |
|---|
| 127 | ;; r15* index |
|---|
| 128 | ;; |
|---|
| 129 | ;; * indicates that the register must be |
|---|
| 130 | ;; preserved for the caller. |
|---|
| 131 | %define s2limb rcx |
|---|
| 132 | %define s1p rsi |
|---|
| 133 | %define rp rdi |
|---|
| 134 | %define A_x r8 |
|---|
| 135 | %define A_y r9 |
|---|
| 136 | %define A_z r10 |
|---|
| 137 | %define B_x r11 |
|---|
| 138 | %define B_y r12 |
|---|
| 139 | %define B_z r13 |
|---|
| 140 | %define temp r14 |
|---|
| 141 | %define index r15 |
|---|
| 142 | |
|---|
| 143 | |
|---|
| 144 | ;; INPUT PARAMETERS |
|---|
| 145 | ;; rp rdi |
|---|
| 146 | ;; s1p rsi |
|---|
| 147 | ;; n rdx |
|---|
| 148 | ;; s2limb rcx |
|---|
| 149 | BITS 64 |
|---|
| 150 | G_EXPORT mpn_submul_1 |
|---|
| 151 | ;; Compare the limb count |
|---|
| 152 | ;; with the threshold value. |
|---|
| 153 | ;; If the limb count is small |
|---|
| 154 | ;; we just use the small loop, |
|---|
| 155 | ;; otherwise we jump to the |
|---|
| 156 | ;; more complicated loop. |
|---|
| 157 | cmp rdx,THRESHOLD |
|---|
| 158 | jge L_mpn_submul_1_main_loop_prep |
|---|
| 159 | mov r11,rdx |
|---|
| 160 | lea rsi,[rsi+rdx*8] |
|---|
| 161 | lea rdi,[rdi+rdx*8] |
|---|
| 162 | neg r11 |
|---|
| 163 | xor r8, r8 |
|---|
| 164 | xor r10, r10 |
|---|
| 165 | jmp L_mpn_submul_1_small_loop |
|---|
| 166 | |
|---|
| 167 | align 16 |
|---|
| 168 | L_mpn_submul_1_small_loop: |
|---|
| 169 | mov rax,[rsi+r11*8] |
|---|
| 170 | mul rcx |
|---|
| 171 | add rax,r8 |
|---|
| 172 | adc rdx,r10 |
|---|
| 173 | sub [rdi+r11*8],rax |
|---|
| 174 | mov r8,r10 |
|---|
| 175 | adc r8,rdx |
|---|
| 176 | inc r11 |
|---|
| 177 | jne L_mpn_submul_1_small_loop |
|---|
| 178 | |
|---|
| 179 | mov rax,r8 |
|---|
| 180 | ret |
|---|
| 181 | |
|---|
| 182 | L_mpn_submul_1_main_loop_prep: |
|---|
| 183 | push r15 |
|---|
| 184 | push r14 |
|---|
| 185 | push r13 |
|---|
| 186 | push r12 |
|---|
| 187 | ;; If n is even, we need to do three |
|---|
| 188 | ;; pre-multiplies, if n is odd we only |
|---|
| 189 | ;; need to do two. |
|---|
| 190 | mov temp,rdx |
|---|
| 191 | mov index,0 |
|---|
| 192 | mov A_x,0 |
|---|
| 193 | mov A_y,0 |
|---|
| 194 | and rdx,1 |
|---|
| 195 | jnz L_mpn_submul_1_odd_n |
|---|
| 196 | |
|---|
| 197 | ;; Case n is even |
|---|
| 198 | mov rax,ADDR(s1p,index,0) |
|---|
| 199 | mul s2limb |
|---|
| 200 | sub ADDR(rp,index,0),rax |
|---|
| 201 | adc A_x,rdx |
|---|
| 202 | add index,1 |
|---|
| 203 | ;; At this point |
|---|
| 204 | ;; temp = n (even) |
|---|
| 205 | ;; index = 1 |
|---|
| 206 | |
|---|
| 207 | L_mpn_submul_1_odd_n: |
|---|
| 208 | ;; Now |
|---|
| 209 | ;; temp = n |
|---|
| 210 | ;; index = 1 if n even |
|---|
| 211 | ;; = 0 if n odd |
|---|
| 212 | ;; |
|---|
| 213 | mov rax,ADDR(s1p,index,0) |
|---|
| 214 | mul s2limb |
|---|
| 215 | mov A_z,ADDR(rp,index,0) |
|---|
| 216 | add A_x,rax |
|---|
| 217 | adc A_y,rdx |
|---|
| 218 | |
|---|
| 219 | mov rax,ADDR(s1p,index,1) |
|---|
| 220 | mul s2limb |
|---|
| 221 | mov B_z,ADDR(rp,index,1) |
|---|
| 222 | mov B_x,rax |
|---|
| 223 | mov B_y,rdx |
|---|
| 224 | mov rax,ADDR(s1p,index,2) |
|---|
| 225 | |
|---|
| 226 | add index,3 |
|---|
| 227 | lea s1p,ADDR(s1p,temp,-1) |
|---|
| 228 | lea rp,ADDR(rp,temp,-1) |
|---|
| 229 | neg temp |
|---|
| 230 | add index,temp |
|---|
| 231 | ;; At this point: |
|---|
| 232 | ;; s1p = address of last s1limb |
|---|
| 233 | ;; rp = address of last rplimb |
|---|
| 234 | ;; temp = -n |
|---|
| 235 | ;; index = 4 - n if n even |
|---|
| 236 | ;; = 3 - n if n odd |
|---|
| 237 | ;; |
|---|
| 238 | ;; So, index is a (negative) even |
|---|
| 239 | ;; number. |
|---|
| 240 | ;; |
|---|
| 241 | ;; ***************************************** |
|---|
| 242 | ;; ATTENTION: |
|---|
| 243 | ;; |
|---|
| 244 | ;; From here on, I will use array |
|---|
| 245 | ;; indexing notation in the comments |
|---|
| 246 | ;; because it is convenient. So, I |
|---|
| 247 | ;; will pretend that index is positive |
|---|
| 248 | ;; because then a comment like |
|---|
| 249 | ;; B_z = rp[index-1] |
|---|
| 250 | ;; is easier to read. |
|---|
| 251 | ;; However, keep in mind that index is |
|---|
| 252 | ;; actually a negative number indexing |
|---|
| 253 | ;; back from the end of the array. |
|---|
| 254 | ;; This is a common trick to remove one |
|---|
| 255 | ;; compare operation from the main loop. |
|---|
| 256 | ;; ***************************************** |
|---|
| 257 | |
|---|
| 258 | ;; |
|---|
| 259 | ;; Now we enter a spin-up loop the |
|---|
| 260 | ;; will make sure that the index is |
|---|
| 261 | ;; a multiple of UNROLL_SIZE before |
|---|
| 262 | ;; going to our main unrolled loop. |
|---|
| 263 | mov temp,index |
|---|
| 264 | neg temp |
|---|
| 265 | and temp,UNROLL_MASK |
|---|
| 266 | jz L_mpn_submul_1_main_loop |
|---|
| 267 | shr temp,1 |
|---|
| 268 | L_mpn_submul_1_main_loop_spin_up: |
|---|
| 269 | ;; At this point we should have: |
|---|
| 270 | ;; |
|---|
| 271 | ;; A_x = low_mul[index-2] + carry_in |
|---|
| 272 | ;; A_y = high_mul[index-2] + CF |
|---|
| 273 | ;; A_z = rp[index-2] |
|---|
| 274 | ;; |
|---|
| 275 | ;; B_x = low_mul[index-1] |
|---|
| 276 | ;; B_y = high_mul[index-1] |
|---|
| 277 | ;; B_z = rp[index-1] |
|---|
| 278 | ;; |
|---|
| 279 | ;; rax = s1p[index] |
|---|
| 280 | mul s2limb |
|---|
| 281 | sub A_z,A_x |
|---|
| 282 | lea A_x,[rax] |
|---|
| 283 | mov rax,ADDR(s1p,index,1) |
|---|
| 284 | adc B_x,A_y |
|---|
| 285 | mov ADDR(rp,index,-2),A_z |
|---|
| 286 | mov A_z,ADDR(rp,index,0) |
|---|
| 287 | adc B_y,0 |
|---|
| 288 | lea A_y,[rdx] |
|---|
| 289 | ;; At this point we should have: |
|---|
| 290 | ;; |
|---|
| 291 | ;; B_x = low_mul[index-1] + carry_in |
|---|
| 292 | ;; B_y = high_mul[index-1] + CF |
|---|
| 293 | ;; B_z = rp[index-1] |
|---|
| 294 | ;; |
|---|
| 295 | ;; A_x = low_mul[index] |
|---|
| 296 | ;; A_y = high_mul[index] |
|---|
| 297 | ;; A_z = rp[index] |
|---|
| 298 | ;; |
|---|
| 299 | ;; rax = s1p[index+1] |
|---|
| 300 | mul s2limb |
|---|
| 301 | sub B_z,B_x |
|---|
| 302 | lea B_x,[rax] |
|---|
| 303 | mov rax,ADDR(s1p,index,2) |
|---|
| 304 | adc A_x,B_y |
|---|
| 305 | mov ADDR(rp,index,-1),B_z |
|---|
| 306 | mov B_z,ADDR(rp,index,1) |
|---|
| 307 | adc A_y,0 |
|---|
| 308 | lea B_y,[rdx] |
|---|
| 309 | |
|---|
| 310 | add index,2 |
|---|
| 311 | sub temp,1 |
|---|
| 312 | jnz L_mpn_submul_1_main_loop_spin_up |
|---|
| 313 | |
|---|
| 314 | jmp L_mpn_submul_1_main_loop |
|---|
| 315 | |
|---|
| 316 | align 16 |
|---|
| 317 | L_mpn_submul_1_main_loop: |
|---|
| 318 | ;; The code here is really the same |
|---|
| 319 | ;; logic as the spin-up loop. It's |
|---|
| 320 | ;; just been unrolled. |
|---|
| 321 | %assign unroll_index 0 |
|---|
| 322 | %rep UNROLL_SIZE/2 |
|---|
| 323 | mul s2limb |
|---|
| 324 | sub A_z,A_x |
|---|
| 325 | lea A_x,[rax] |
|---|
| 326 | mov rax,ADDR(s1p,index,(2*unroll_index+1)) |
|---|
| 327 | adc B_x,A_y |
|---|
| 328 | mov ADDR(rp,index,(2*unroll_index-2)),A_z |
|---|
| 329 | mov A_z,ADDR(rp,index,(2*unroll_index)) |
|---|
| 330 | adc B_y,0 |
|---|
| 331 | lea A_y,[rdx] |
|---|
| 332 | |
|---|
| 333 | mul s2limb |
|---|
| 334 | sub B_z,B_x |
|---|
| 335 | lea B_x,[rax] |
|---|
| 336 | mov rax,ADDR(s1p,index,(2*unroll_index+2)) |
|---|
| 337 | adc A_x,B_y |
|---|
| 338 | mov ADDR(rp,index,(2*unroll_index-1)),B_z |
|---|
| 339 | mov B_z,ADDR(rp,index,(2*unroll_index+1)) |
|---|
| 340 | adc A_y,0 |
|---|
| 341 | lea B_y,[rdx] |
|---|
| 342 | %assign unroll_index unroll_index+1 |
|---|
| 343 | %endrep |
|---|
| 344 | |
|---|
| 345 | |
|---|
| 346 | add index,UNROLL_SIZE |
|---|
| 347 | jnz L_mpn_submul_1_main_loop |
|---|
| 348 | |
|---|
| 349 | L_mpn_submul_1_finish: |
|---|
| 350 | ;; At this point we should have: |
|---|
| 351 | ;; |
|---|
| 352 | ;; index = n-1 |
|---|
| 353 | ;; |
|---|
| 354 | ;; A_x = low_mul[index-2] + carry_in |
|---|
| 355 | ;; A_y = high_mul[index-2] + CF |
|---|
| 356 | ;; A_z = rp[index-2] |
|---|
| 357 | ;; |
|---|
| 358 | ;; B_x = low_mul[index-1] |
|---|
| 359 | ;; B_y = high_mul[index-1] |
|---|
| 360 | ;; B_z = rp[index-1] |
|---|
| 361 | ;; |
|---|
| 362 | ;; rax = s1p[index] |
|---|
| 363 | mul s2limb |
|---|
| 364 | sub A_z,A_x |
|---|
| 365 | mov A_x,rax |
|---|
| 366 | mov ADDR(rp,index,-2),A_z |
|---|
| 367 | mov A_z,ADDR(rp,index,0) |
|---|
| 368 | adc B_x,A_y |
|---|
| 369 | adc B_y,0 |
|---|
| 370 | mov A_y,rdx |
|---|
| 371 | ;; At this point we should have: |
|---|
| 372 | ;; |
|---|
| 373 | ;; index = n-1 |
|---|
| 374 | ;; |
|---|
| 375 | ;; B_x = low_mul[index-1] + carry_in |
|---|
| 376 | ;; B_y = high_mul[index-1] + CF |
|---|
| 377 | ;; B_z = rp[index-1] |
|---|
| 378 | ;; |
|---|
| 379 | ;; A_x = low_mul[index] |
|---|
| 380 | ;; A_y = high_mul[index] |
|---|
| 381 | ;; A_z = rp[index] |
|---|
| 382 | sub B_z,B_x |
|---|
| 383 | mov ADDR(rp,index,-1),B_z |
|---|
| 384 | adc A_x,B_y |
|---|
| 385 | adc A_y,0 |
|---|
| 386 | ;; At this point we should have: |
|---|
| 387 | ;; |
|---|
| 388 | ;; index = n-1 |
|---|
| 389 | ;; |
|---|
| 390 | ;; A_x = low_mul[index] + carry_in |
|---|
| 391 | ;; A_y = high_mul[index] + CF |
|---|
| 392 | ;; A_z = rp[index] |
|---|
| 393 | ;; |
|---|
| 394 | sub A_z,A_x |
|---|
| 395 | mov ADDR(rp,index,0),A_z |
|---|
| 396 | adc A_y,0 |
|---|
| 397 | |
|---|
| 398 | mov rax,A_y |
|---|
| 399 | pop r12 |
|---|
| 400 | pop r13 |
|---|
| 401 | pop r14 |
|---|
| 402 | pop r15 |
|---|
| 403 | ret |
|---|