Ticket #1: add_n.yasm

File add_n.yasm, 5.3 KB (added by wbhart, 4 years ago)
Line 
1;; **************************************************************************
2;;  x86_64 mpn_add_n -- Add two limb vectors of the same length > 0 and store
3;;  sum in a third limb vector.
4;;
5;;  Copyright (C) 2006  Jason Worth Martin <jason.worth.martin@gmail.com>
6;;
7;;  This program is free software; you can redistribute it and/or modify
8;;  it under the terms of the GNU General Public License as published by
9;;  the Free Software Foundation; either version 2 of the License, or
10;;  (at your option) any later version.
11;;
12;;  This program is distributed in the hope that it will be useful,
13;;  but WITHOUT ANY WARRANTY; without even the implied warranty of
14;;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15;;  GNU General Public License for more details.
16;;
17;;  You should have received a copy of the GNU General Public License along
18;;  with this program; if not, write to the Free Software Foundation, Inc.,
19;;  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20;;
21;; **************************************************************************
22;;
23;;
24;; CREDITS
25;;
26;; This code is based largely on Pierrick Gaudry's excellent assembly
27;; support for the AMD64 architecture.  (Note that Intel64 and AMD64,
28;; while using the same instruction set, have very different
29;; microarchitectures.  So, this code performs very poorly on AMD64
30;; machines even though it is near-optimal on Intel64.)
31;;
32;; Roger Golliver works for Intel and provided insightful improvements
33;; particularly in using the "lea" instruction to perform additions
34;; and register-to-register moves.
35;;
36;; Eric Bainville has a brilliant exposition of optimizing arithmetic for
37;; AMD64 (http://www.bealto.it).  I adapted many of the ideas he
38;; describes to Intel64.
39;;
40;; Agner Fog is a demigod in the x86 world.  If you are reading assembly
41;; code files and you haven't heard of Agner Fog, then take a minute to
42;; look over his software optimization manuals (http://www.agner.org/).
43;; They are superb.
44;;
45;; *********************************************************************
46
47%ifdef  __JWM_Test_Code__
48%include 'yasm_mac.inc'
49%else
50%include '../yasm_mac.inc'
51%endif
52
53;;
54;; If YASM supports lahf and sahf instructions, then we'll get rid
55;; of this.
56;;
57%define save_CF_to_reg_a  db    0x9f
58%define get_CF_from_reg_a db    0x9e
59;; define(`save_CF_to_reg_a',`setc      %al')
60;; define(`get_CF_from_reg_a',`bt       `$'0x0,%rax')
61
62
63;;         cycles/limb
64;; Hammer:     2.5 (for 1024 limbs)
65;; Woodcrest:  2.6 (for 1024 limbs)     
66
67;; INPUT PARAMETERS
68;; rp   rdi
69;; up   rsi
70;; vp   rdx
71;; n    rcx
72        BITS    64
73G_EXPORT mpn_add_n
74        push    rbp             ;; Save off callee-save registers
75        push    rbx
76        push    r12
77        push    r13
78        push    r14
79        push    r15
80
81        xor     r15,r15                 ;; r15 will be our index, so
82                                        ;; I'll call it i here after
83        save_CF_to_reg_a                ;; Save CF
84
85        mov     r9,rcx
86        sub     r9,4                    ;; r9 = n-(i+4)
87
88        align   16                      ;; aligning for loop
89L_mpn_add_n_main_loop:
90        ;; The goal of our main unrolled loop is to keep all the
91        ;; execution units as busy as possible.  Since
92        ;; there are three ALUs, we try to perform three
93        ;; adds at a time.  Of course, we will have the
94        ;; carry dependency, so there is at least one
95        ;; clock cycle between each adc.  However, we'll
96        ;; try to keep the other execution units busy
97        ;; with loads and stores at the same time so that
98        ;; our net throughput is close to one add per clock
99        ;; cycle.  Hopefully this function will have asymptotic
100        ;; behavior of taking 3*n clock cycles where n is the
101        ;; number of limbs to add.
102        ;;
103        ;; Note that I'm using FOUR adds at a time, this is just
104        ;; because I wanted to use up all available registers since
105        ;; I'm hoping the out-of-order and loop-pipeline logic in
106        ;; the Xeon will help us out.
107
108        ;; See if we are still looping
109        jle     L_mpn_add_n_loop_done
110
111        get_CF_from_reg_a               ;; recover CF
112       
113        ;; Load inputs into rbx and r8
114        ;; add with carry, and put result in r8
115        ;; then store r8 to output.
116        mov     rbx,[rsi+r15*8]
117        mov     r8,[rdx+r15*8]
118        adc     r8,rbx
119        mov     [rdi+r15*8],r8
120       
121        ;; Load inputs into r9 and r10
122        ;; add with carry, and put result in r10
123        ;; then store r10 to output.
124        mov     r9,[8+rsi+r15*8]
125        mov     r10,[8+rdx+r15*8]
126        adc     r10,r9
127        mov     [8+rdi+r15*8],r10
128       
129        ;; Load inputs into r11 and r12
130        ;; add with carry, and put result in r12
131        ;; then store r12 to output.
132        mov     r11,[16+rsi+r15*8]
133        mov     r12,[16+rdx+r15*8]
134        adc     r12,r11
135        mov     [16+rdi+r15*8],r12
136
137        ;; Load inputs into r13 and r14
138        ;; add with carry, and put result in r14
139        ;; then store r14 to output.
140        mov     r13,[24+rsi+r15*8]
141        mov     r14,[24+rdx+r15*8]
142        adc     r14,r13
143        mov     [24+rdi+r15*8],r14
144
145        save_CF_to_reg_a                ;; save CF
146
147        mov     r10,r15
148        add     r10,8
149        add     r15,4           ;; increment by 4.
150       
151        mov     r9,rcx
152        sub     r9,r10          ;; r9 = n-(i+4)
153        jmp     L_mpn_add_n_main_loop
154
155L_mpn_add_n_loop_done:
156        mov     r15,rcx         ;;
157        sub     r15,r9          ;; r15 = n-(n-(i+4))=i+4
158        sub     r15,4           ;; r15 = i
159        cmp     r15,rcx
160L_mpn_add_n_post_loop:
161        je      L_mpn_add_n_exit
162        get_CF_from_reg_a               ;; recover CF
163       
164        ;; Load inputs into rbx and r8
165        ;; add with carry, and put result in r8
166        ;; then store r8 to output.
167        mov     rbx,[rsi+r15*8]
168        mov     r8,[rdx+r15*8]
169        adc     r8,rbx
170        mov     [rdi+r15*8],r8
171        save_CF_to_reg_a                ;; save CF
172        add     r15,1
173        cmp     r15,rcx
174        jmp     L_mpn_add_n_post_loop
175       
176       
177L_mpn_add_n_exit:
178        xor     rcx,rcx
179        get_CF_from_reg_a       ;; recover the CF
180        mov     rax,rcx         ;; Clears rax without affecting carry flag
181        adc     rax,rax         ;; returns carry status.
182       
183        pop     r15             ;; restore callee-save registers
184        pop     r14
185        pop     r13
186        pop     r12
187        pop     rbx
188        pop     rbp
189        ret