-rw-r--r-- 6466 high-ctidh-20210504/fp512.S
/* DO NOT EDIT! generated by ./autogen */

.intel_syntax noprefix

#include "uintbig_namespace.h"
#include "fp_namespace.h"

.section .rodata

.set pbits,511
.set pbytes,64
.set plimbs,8

.inv_min_p_mod_r: /* -p^-1 mod 2^64 */
    .quad 0x66c1301f632e294d

.global fp_0
fp_0: .zero 64

.global fp_1
fp_1: /* 2^512 mod p */
    .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1
    .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80

.global fp_2
fp_2: /* 2^513 mod p */
    .quad 0x767762e5fd1e1599, 0x33c5743a49a0b6f6, 0x68fc0c0364c77443, 0xb9aa1e24f83f56db
    .quad 0x3914101f20520efb, 0x7b1ed6d95b1542b4, 0x114a8be928c8828a, 0x03793732bbb24f40

.r_squared_mod_p: /* (2^512)^2 mod p */
    .quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1
    .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e

.section .data

.global fp_mulsq_count
fp_mulsq_count: .quad 0

.global fp_sq_count
fp_sq_count: .quad 0

.global fp_addsub_count
fp_addsub_count: .quad 0

.section .text
.p2align 4,,15

.global fp_copy
fp_copy:
    cld
    mov rcx, plimbs
    rep movsq
    ret

.global fp_cmov
fp_cmov:
    movzx rax, dl
    neg rax
    .set k, 0
    .rept plimbs
    mov rcx, [rdi + 8*k]
    mov rdx, [rsi + 8*k]
    xor rdx, rcx
    and rdx, rax
    xor rcx, rdx
    mov [rdi + 8*k], rcx
    .set k, k+1
    .endr
    ret

.global fp_cswap
fp_cswap:
    movzx rax, dl
    neg rax
    .set k, 0
    .rept plimbs
    mov rcx, [rdi + 8*k]
    mov rdx, [rsi + 8*k]
    mov r8, rcx
    xor r8, rdx
    and r8, rax
    xor rcx, r8
    xor rdx, r8
    mov [rdi + 8*k], rcx
    mov [rsi + 8*k], rdx
    .set k, k+1
    .endr
    ret

.reduce_once:
    push rbp
    mov rbp, rdi

    mov rdi, [rbp +  0]
    sub rdi, [rip + uintbig_p +  0]
    mov rsi, [rbp +  8]
    sbb rsi, [rip + uintbig_p +  8]
    mov rdx, [rbp + 16]
    sbb rdx, [rip + uintbig_p + 16]
    mov rcx, [rbp + 24]
    sbb rcx, [rip + uintbig_p + 24]
    mov r8,  [rbp + 32]
    sbb r8,  [rip + uintbig_p + 32]
    mov r9,  [rbp + 40]
    sbb r9,  [rip + uintbig_p + 40]
    mov r10, [rbp + 48]
    sbb r10, [rip + uintbig_p + 48]
    mov r11, [rbp + 56]
    sbb r11, [rip + uintbig_p + 56]

    setnc al
    movzx rax, al
    neg rax

.macro cswap2, r, m
    xor \r, \m
    and \r, rax
    xor \m, \r
.endm

    cswap2 rdi, [rbp +  0]
    cswap2 rsi, [rbp +  8]
    cswap2 rdx, [rbp + 16]
    cswap2 rcx, [rbp + 24]
    cswap2 r8,  [rbp + 32]
    cswap2 r9,  [rbp + 40]
    cswap2 r10, [rbp + 48]
    cswap2 r11, [rbp + 56]

    pop rbp
    ret

.global fp_add2
fp_add2:
    mov rdx, rdi
.global fp_add3
fp_add3:
    addq [fp_addsub_count+rip],1
    push rdi
    call uintbig_add3
    pop rdi
    jmp .reduce_once

.global fp_sub2
fp_sub2:
    mov rdx, rdi
    xchg rsi, rdx
.global fp_sub3
fp_sub3:
    addq [fp_addsub_count+rip],1
    push rdi
    call uintbig_sub3
    pop rdi
    neg rax

    sub rsp, pbytes

    mov rcx, [rip + uintbig_p + 0]
    and rcx, rax
    mov [rsp + 0], rcx
    .set k, 1
    .rept plimbs-1
    mov rcx, [rip + uintbig_p + 8*k]
    and rcx, rax
    mov [rsp + 8*k], rcx
    .set k, k+1
    .endr

    mov rcx, [rsp + 0]
    add rcx, [rdi + 0]
    mov [rdi + 0], rcx
    .set k, 1
    .rept plimbs-1
    mov rcx, [rsp + 8*k]
    adc rcx, [rdi + 8*k]
    mov [rdi + 8*k], rcx
    .set k, k+1
    .endr

    add rsp, pbytes
    ret

/* Montgomery arithmetic */

.global fp_mul2
fp_mul2:
    mov rdx, rdi
.global fp_mul3
fp_mul3:
    push rbp
    push rbx
    push r12
    push r13
    push r14
    push r15
    push rdi

    addq [fp_mulsq_count+rip],1

    mov rdi, rsi
    mov rsi, rdx

    xor r8, r8
    xor r9, r9
    xor r10, r10
    xor r11, r11
    xor r12, r12
    xor r13, r13
    xor r14, r14
    xor r15, r15
    xor rbp, rbp

    /* flags are already cleared */

.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8

    mov rdx, [rsi + 0]
    mulx rcx, rdx, [rdi + 8*\k]
    add rdx, \r0
    mulx rcx, rdx, [rip + .inv_min_p_mod_r]

    xor rax, rax /* clear flags */

    mulx rbx, rax, [rip + uintbig_p + 0]
    adox \r0, rax

    mulx rcx, rax, [rip + uintbig_p + 8]
    adcx \r1, rbx
    adox \r1, rax

    mulx rbx, rax, [rip + uintbig_p + 16]
    adcx \r2, rcx
    adox \r2, rax

    mulx rcx, rax, [rip + uintbig_p + 24]
    adcx \r3, rbx
    adox \r3, rax

    mulx rbx, rax, [rip + uintbig_p + 32]
    adcx \r4, rcx
    adox \r4, rax

    mulx rcx, rax, [rip + uintbig_p + 40]
    adcx \r5, rbx
    adox \r5, rax

    mulx rbx, rax, [rip + uintbig_p + 48]
    adcx \r6, rcx
    adox \r6, rax

    mulx rcx, rax, [rip + uintbig_p + 56]
    adcx \r7, rbx
    adox \r7, rax

    mov rax, 0
    adcx \r8, rcx
    adox \r8, rax

    mov rdx, [rdi + 8*\k]

    xor rax, rax /* clear flags */

    mulx rbx, rax, [rsi + 0]
    adox \r0, rax

    mulx rcx, rax, [rsi + 8]
    adcx \r1, rbx
    adox \r1, rax

    mulx rbx, rax, [rsi + 16]
    adcx \r2, rcx
    adox \r2, rax

    mulx rcx, rax, [rsi + 24]
    adcx \r3, rbx
    adox \r3, rax

    mulx rbx, rax, [rsi + 32]
    adcx \r4, rcx
    adox \r4, rax

    mulx rcx, rax, [rsi + 40]
    adcx \r5, rbx
    adox \r5, rax

    mulx rbx, rax, [rsi + 48]
    adcx \r6, rcx
    adox \r6, rax

    mulx rcx, rax, [rsi + 56]
    adcx \r7, rbx
    adox \r7, rax

    mov rax, 0
    adcx \r8, rcx
    adox \r8, rax

.endm

    MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp
    MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8
    MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9
    MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10
    MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11
    MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12
    MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13
    MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14

    pop rdi

    mov [rdi +  0], rbp
    mov [rdi +  8], r8
    mov [rdi + 16], r9
    mov [rdi + 24], r10
    mov [rdi + 32], r11
    mov [rdi + 40], r12
    mov [rdi + 48], r13
    mov [rdi + 56], r14

    pop r15
    pop r14
    pop r13
    pop r12
    pop rbx
    pop rbp

    jmp .reduce_once

.global fp_sq1
fp_sq1:
    mov rsi, rdi
.global fp_sq2
fp_sq2:
    /* TODO implement optimized Montgomery squaring */
    mov rdx, rsi
    addq [fp_sq_count+rip],1
    jmp fp_mul3
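
The MULSTEP macro above is one iteration of word-by-word Montgomery multiplication (CIOS style): each pass folds one limb a[k] of the multiplier into the nine-limb accumulator r0..r8 and performs one reduction step, using mulx/adcx/adox to run the q*p and a[k]*b carry chains on independent flags. The accumulator's low limb becomes zero after each step, so dropping it divides by 2^64; the assembly gets that drop for free by rotating the register roles across the eight MULSTEP invocations. Field elements stay in Montgomery form throughout (hence fp_1 = 2^512 mod p), and fp_mul3 ends with the single conditional subtraction in .reduce_once. As a reading aid, here is a minimal C model of the same loop; it is not part of the package, the function and parameter names are chosen here for illustration, and it assumes a GCC/Clang-style unsigned __int128. p[] and minv correspond to uintbig_p and .inv_min_p_mod_r.

#include <stdint.h>

#define PLIMBS 8  /* matches .set plimbs,8 */

typedef unsigned __int128 u128;

/* Computes c = a*b / 2^512 mod p (a Montgomery product), up to one final
 * conditional subtraction of p, which .reduce_once performs in the assembly.
 * minv is -p^-1 mod 2^64. */
static void fp_mul_model(uint64_t c[PLIMBS],
                         const uint64_t a[PLIMBS],
                         const uint64_t b[PLIMBS],
                         const uint64_t p[PLIMBS],
                         uint64_t minv)
{
    uint64_t t[PLIMBS + 1] = {0};       /* r0..r8 in MULSTEP */

    for (int k = 0; k < PLIMBS; k++) {
        /* q = (t0 + a[k]*b[0]) * (-p^-1) mod 2^64, chosen so the low limb
         * of the accumulator cancels after both chains below. */
        uint64_t q = (t[0] + a[k] * b[0]) * minv;

        u128 carry = 0;                 /* t += q*p (first mulx chain) */
        for (int i = 0; i < PLIMBS; i++) {
            u128 s = (u128)q * p[i] + t[i] + (uint64_t)carry;
            t[i] = (uint64_t)s;
            carry = s >> 64;
        }
        t[PLIMBS] += (uint64_t)carry;

        carry = 0;                      /* t += a[k]*b (second mulx chain) */
        for (int i = 0; i < PLIMBS; i++) {
            u128 s = (u128)a[k] * b[i] + t[i] + (uint64_t)carry;
            t[i] = (uint64_t)s;
            carry = s >> 64;
        }
        t[PLIMBS] += (uint64_t)carry;

        /* t[0] == 0 now; shifting down one limb divides by 2^64.  The
         * assembly realizes this shift as the register rotation between
         * consecutive MULSTEP invocations. */
        for (int i = 0; i < PLIMBS; i++) t[i] = t[i + 1];
        t[PLIMBS] = 0;
    }
    for (int i = 0; i < PLIMBS; i++) c[i] = t[i];
}

Since p has 511 bits (p < 2^511), the accumulator never exceeds nine limbs and the result is below 2p, so the one masked subtraction in .reduce_once suffices; that routine keeps the code constant-time by selecting between x and x-p with a mask rather than a branch, the same trick used in fp_cmov and fp_cswap.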