-rw-r--r-- 12360 high-ctidh-20210504/fp1024.S
/* DO NOT EDIT! generated by ./autogen */
/*
 * Field arithmetic on 16-limb (plimbs) little-endian values of 64-bit limbs,
 * 128 bytes (pbytes) each, written for GNU as in Intel syntax and run through
 * the C preprocessor (see the #include lines below).
 *
 * Register usage follows the order rdi, rsi, rdx for the first three pointer
 * or flag arguments, and every routine ends in ret or a tail jump.
 *
 * The multiplier uses MULX, ADCX and ADOX, so the target CPU needs the BMI2
 * and ADX extensions.
 *
 * External symbols referenced but not defined here: uintbig_p, uintbig_add3,
 * uintbig_sub3.
 *
 * NOTE(review): the header says this file is generated by ./autogen, so
 * comments added here would presumably need to be ported to the generator to
 * survive regeneration -- confirm.
 * NOTE(review): no .note.GNU-stack section directive appears in this file;
 * confirm the build marks the stack non-executable some other way.
 */
.intel_syntax noprefix
/* NOTE(review): these headers presumably remap the fp_ and uintbig_ symbol
   names via macros -- confirm against the headers. */
#include "uintbig_namespace.h"
#include "fp_namespace.h"

.section .rodata
.set pbits,1020
.set pbytes,128
.set plimbs,16

/* Montgomery constant used by MULSTEP to derive the per-step multiplier. */
.inv_min_p_mod_r: /* -p^-1 mod 2^64 */
    .quad 0xd2c2c24160038025

/* 128 zero bytes: the all-zero 16-limb value. */
.global fp_0
fp_0: .zero 128

.global fp_1
fp_1: /* 2^1024 mod p */
    .quad 0x65e7ee6590e6567d, 0x40a5f2587fef86d4, 0x99f9e607b99d62f2, 0x1089df50f4f8f26d
    .quad 0x592890dd02bb585a, 0xe1b6be68b969ecb9, 0xaebe3c10395f33c3, 0x5ef9652396531f1b
    .quad 0x28d37db76b7a1b7f, 0x86d089fa474b4a3f, 0xdbce120cc7a4fff2, 0x08b3f947137340ac
    .quad 0x913f3e7c71b37ce5, 0xc7d1b17b09ec4577, 0x9d834aff6f7956b6, 0x044c4b3e968ec2b8

.global fp_2
fp_2: /* 2^1025 mod p */
    .quad 0xcbcfdccb21ccacfa, 0x814be4b0ffdf0da8, 0x33f3cc0f733ac5e4, 0x2113bea1e9f1e4db
    .quad 0xb25121ba0576b0b4, 0xc36d7cd172d3d972, 0x5d7c782072be6787, 0xbdf2ca472ca63e37
    .quad 0x51a6fb6ed6f436fe, 0x0da113f48e96947e, 0xb79c24198f49ffe5, 0x1167f28e26e68159
    .quad 0x227e7cf8e366f9ca, 0x8fa362f613d88aef, 0x3b0695fedef2ad6d, 0x0898967d2d1d8571

/* Not referenced by any routine in this file. */
.r_squared_mod_p: /* (2^1024)^2 mod p */
    .quad 0xd6b8f146ec5055af, 0x68ac5d7707ccb03a, 0x1322c9b9837dca17, 0x4f2940830c1d2b35
    .quad 0x8c1a56e5bf96471a, 0x6cdde00636c4f801, 0x9365ec4fa327c9ac, 0xa0056a67c1de0e82
    .quad 0x8aa6fa7e6811faa8, 0x9aad9631bb760403, 0x156b34c683839b9d, 0xa5ae047480992b2c
    .quad 0xc124d930289048b5, 0x4f8a8344bbe56288, 0xe1a2eb1d838b8237, 0x057162f911ca93a3

/*
 * Operation counters (writable, exported).
 *   fp_mulsq_count  - incremented once per entry to fp_mul3; fp_mul2, fp_sq1
 *                     and fp_sq2 all reach fp_mul3, so they are counted too.
 *   fp_sq_count     - incremented once per entry to fp_sq2 (fp_sq1 falls in).
 *   fp_addsub_count - incremented once per entry to fp_add3 or fp_sub3.
 * The increments are plain, non-atomic memory adds.
 */
.section .data
.global fp_mulsq_count
fp_mulsq_count: .quad 0
.global fp_sq_count
fp_sq_count: .quad 0
.global fp_addsub_count
fp_addsub_count: .quad 0

.section .text
.p2align 4,,15

/*
 * fp_copy: copy plimbs qwords from [rsi] to [rdi].
 * Clobbers rcx, rsi, rdi; clears the direction flag.
 */
.global fp_copy
fp_copy:
    cld
    mov rcx, plimbs
    rep movsq
    ret

/*
 * fp_cmov: branch-free conditional move of a 16-limb value.
 *   rdi = destination, rsi = source, dl = selector.
 * rax = -(zero-extended dl), so dl = 1 gives an all-ones mask (destination
 * becomes a copy of source) and dl = 0 gives a zero mask (destination
 * unchanged). Any other dl value yields a partial mask, so the selector must
 * be exactly 0 or 1. Every limb is read and written in both cases.
 * Clobbers rax, rcx, rdx, flags.
 */
.global fp_cmov
fp_cmov:
    movzx rax, dl
    neg rax
    .set k, 0
    .rept plimbs
    mov rcx, [rdi + 8*k]
    mov rdx, [rsi + 8*k]
    xor rdx, rcx            /* rdx = dst ^ src */
    and rdx, rax            /* keep the difference only when selected */
    xor rcx, rdx            /* rcx = selected ? src : dst */
    mov [rdi + 8*k], rcx
    .set k, k+1
    .endr
    ret

/*
 * fp_cswap: branch-free conditional swap of two 16-limb values.
 *   rdi, rsi = the two values, dl = selector (must be 0 or 1, same mask
 *   construction as fp_cmov).
 * When dl = 1 the contents of [rdi] and [rsi] are exchanged; when dl = 0 both
 * are rewritten with their own contents.
 * Clobbers rax, rcx, rdx, r8, flags.
 */
.global fp_cswap
fp_cswap:
    movzx rax, dl
    neg rax
    .set k, 0
    .rept plimbs
    mov rcx, [rdi + 8*k]
    mov rdx, [rsi + 8*k]
    mov r8, rcx
    xor r8, rdx
    and r8, rax             /* r8 = (a ^ b) when selected, else 0 */
    xor rcx, r8
    xor rdx, r8
    mov [rdi + 8*k], rcx
    mov [rsi + 8*k], rdx
    .set k, k+1
    .endr
    ret

/*
 * .reduce_once (file-local): one conditional subtraction of uintbig_p.
 *   rdi = pointer to a 16-limb value x, updated in place.
 * Computes x - uintbig_p over all 16 limbs; if the subtraction does not
 * borrow (x >= uintbig_p as unsigned integers) x is replaced by the
 * difference, otherwise x keeps its value. Selection is done with a mask, no
 * branches.
 * uintbig_p is defined outside this file; presumably it holds the modulus p
 * as 16 limbs -- confirm.
 * Uses 64 bytes of stack for the low eight difference limbs; rbp is saved and
 * restored. Clobbers rax, rcx, rdx, rsi, rdi, r8-r11, flags.
 */
.reduce_once:
    push rbp
    sub rsp, 64
    mov rbp, rdi            /* rbp = x; rdi is reused as a limb register */

    /* Low eight limbs of x - p: spilled to the stack because the same eight
       registers are needed again for the high limbs. mov does not touch
       flags, so the borrow chain runs through all 16 limbs. */
    mov rdi, [rbp + 0]
    sub rdi, [rip + uintbig_p + 0]
    movq [rsp + 0], rdi
    mov rsi, [rbp + 8]
    sbb rsi, [rip + uintbig_p + 8]
    movq [rsp + 8], rsi
    mov rdx, [rbp + 16]
    sbb rdx, [rip + uintbig_p + 16]
    movq [rsp + 16], rdx
    mov rcx, [rbp + 24]
    sbb rcx, [rip + uintbig_p + 24]
    movq [rsp + 24], rcx
    mov r8, [rbp + 32]
    sbb r8, [rip + uintbig_p + 32]
    movq [rsp + 32], r8
    mov r9, [rbp + 40]
    sbb r9, [rip + uintbig_p + 40]
    movq [rsp + 40], r9
    mov r10, [rbp + 48]
    sbb r10, [rip + uintbig_p + 48]
    movq [rsp + 48], r10
    mov r11, [rbp + 56]
    sbb r11, [rip + uintbig_p + 56]
    movq [rsp + 56], r11

    /* High eight limbs of x - p stay in registers. */
    mov rdi, [rbp + 64]
    sbb rdi, [rip + uintbig_p + 64]
    mov rsi, [rbp + 72]
    sbb rsi, [rip + uintbig_p + 72]
    mov rdx, [rbp + 80]
    sbb rdx, [rip + uintbig_p + 80]
    mov rcx, [rbp + 88]
    sbb rcx, [rip + uintbig_p + 88]
    mov r8, [rbp + 96]
    sbb r8, [rip + uintbig_p + 96]
    mov r9, [rbp + 104]
    sbb r9, [rip + uintbig_p + 104]
    mov r10, [rbp + 112]
    sbb r10, [rip + uintbig_p + 112]
    mov r11, [rbp + 120]
    sbb r11, [rip + uintbig_p + 120]

    /* rax = all ones when the subtraction did not borrow, else zero. */
    setnc al
    movzx rax, al
    neg rax

    /* cswap2 r, m: m = (rax all ones) ? r : m. Despite the name, only the
       memory operand ends up holding a selected value; the register is left
       with (r ^ m) & rax. */
    .macro cswap2, r, m
    xor \r, \m
    and \r, rax
    xor \m, \r
    .endm

    cswap2 rdi, [rbp + 64]
    cswap2 rsi, [rbp + 72]
    cswap2 rdx, [rbp + 80]
    cswap2 rcx, [rbp + 88]
    cswap2 r8, [rbp + 96]
    cswap2 r9, [rbp + 104]
    cswap2 r10, [rbp + 112]
    cswap2 r11, [rbp + 120]

    /* Reload the spilled low limbs and select them the same way. */
    movq rdi, [rsp + 0]
    cswap2 rdi, [rbp + 0]
    movq rsi, [rsp + 8]
    cswap2 rsi, [rbp + 8]
    movq rdx, [rsp + 16]
    cswap2 rdx, [rbp + 16]
    movq rcx, [rsp + 24]
    cswap2 rcx, [rbp + 24]
    movq r8, [rsp + 32]
    cswap2 r8, [rbp + 32]
    movq r9, [rsp + 40]
    cswap2 r9, [rbp + 40]
    movq r10, [rsp + 48]
    cswap2 r10, [rbp + 48]
    movq r11, [rsp + 56]
    cswap2 r11, [rbp + 56]

    add rsp, 64
    pop rbp
    ret

/*
 * fp_add2: two-operand form. Sets rdx = rdi and falls through to fp_add3, so
 * the call made is fp_add3(rdi, rsi, rdi).
 */
.global fp_add2
fp_add2:
    mov rdx, rdi
/*
 * fp_add3: rdi = result pointer, rsi and rdx = operand pointers.
 * Bumps fp_addsub_count, calls uintbig_add3 with the three arguments
 * unchanged, ignores its return value, then tail-jumps to .reduce_once on the
 * result pointer.
 * NOTE(review): assumes uintbig_add3 stores the 16-limb sum at [rdi] and that
 * the sum of two reduced operands needs at most one subtraction of p --
 * confirm against uintbig_add3 and the size of p.
 * The push of rdi both preserves the result pointer across the call and makes
 * rsp 16-byte aligned at the call.
 */
.global fp_add3
fp_add3:
    addq [fp_addsub_count+rip],1
    push rdi
    call uintbig_add3
    pop rdi
    jmp .reduce_once

/*
 * fp_sub2: two-operand form. After the mov and xchg, rsi = old rdi and
 * rdx = old rsi, so the call made is fp_sub3(rdi, rdi, old rsi).
 */
.global fp_sub2
fp_sub2:
    mov rdx, rdi
    xchg rsi, rdx
/*
 * fp_sub3: rdi = result pointer, rsi and rdx = operand pointers.
 * Bumps fp_addsub_count and calls uintbig_sub3 with the arguments unchanged.
 * The return value in rax is negated to form a mask; uintbig_p ANDed with that
 * mask is then added to the value at [rdi], and the carry out of the top limb
 * is dropped.
 * NOTE(review): assumes uintbig_sub3 stores the 16-limb difference at [rdi]
 * and returns the borrow as 0 or 1 in rax, so that p is added back exactly
 * when the subtraction wrapped -- confirm against uintbig_sub3.
 * Uses pbytes of stack for the masked copy of p.
 * Clobbers rax, rcx, flags, plus whatever uintbig_sub3 clobbers.
 */
.global fp_sub3
fp_sub3:
    addq [fp_addsub_count+rip],1
    push rdi
    call uintbig_sub3
    pop rdi

    neg rax                 /* 0 stays 0, 1 becomes all ones */

    /* Build (uintbig_p & mask) on the stack. */
    sub rsp, pbytes
    mov rcx, [rip + uintbig_p + 0]
    and rcx, rax
    mov [rsp + 0],rcx
    .set k, 1
    .rept plimbs-1
    mov rcx, [rip + uintbig_p + 8*k]
    and rcx, rax
    mov [rsp + 8*k], rcx
    .set k, k+1
    .endr

    /* [rdi] += masked p, as one add/adc carry chain over 16 limbs. */
    mov rcx, [rsp + 0]
    add rcx, [rdi + 0]
    mov [rdi + 0], rcx
    .set k, 1
    .rept plimbs-1
    mov rcx, [rsp + 8*k]
    adc rcx, [rdi + 8*k]
    mov [rdi + 8*k], rcx
    .set k, k+1
    .endr

    add rsp, pbytes
    ret

/* Montgomery arithmetic */

/*
 * fp_mul2: two-operand form. Sets rdx = rdi and falls through to fp_mul3, so
 * the call made is fp_mul3(rdi, rsi, rdi).
 */
.global fp_mul2
fp_mul2:
    mov rdx, rdi
/*
 * fp_mul3: word-by-word Montgomery multiplication.
 *   rdi = result pointer, rsi = first operand a, rdx = second operand b.
 * Stack frame (144 bytes):
 *   [rsp+0 .. rsp+128]  17-qword accumulator, used as a rotating buffer
 *   [rsp+136]           saved result pointer
 * After the prologue rdi = a and rsi = b. The accumulator is built entirely
 * on the stack and copied out at the end, so the result may alias either
 * operand. The function finishes by tail-jumping to .reduce_once on the
 * result pointer.
 * rbx is saved and restored because MULSTEP uses it. rbp is saved and
 * restored but not otherwise referenced in this function.
 * Clobbers rax, rcx, rdx, rsi, rdi, r11, flags, plus what .reduce_once
 * clobbers.
 */
.global fp_mul3
fp_mul3:
    push rbp
    push rbx

    addq [fp_mulsq_count+rip],1

    sub rsp,144
    mov [rsp+136],rdi
    mov rdi,rsi
    mov rsi,rdx

    /* XXX: put directly into output */
    /* Zero all 17 accumulator slots. */
    xor rax,rax
    mov [rsp+0],rax
    mov [rsp+8],rax
    mov [rsp+16],rax
    mov [rsp+24],rax
    mov [rsp+32],rax
    mov [rsp+40],rax
    mov [rsp+48],rax
    mov [rsp+56],rax
    mov [rsp+64],rax
    mov [rsp+72],rax
    mov [rsp+80],rax
    mov [rsp+88],rax
    mov [rsp+96],rax
    mov [rsp+104],rax
    mov [rsp+112],rax
    mov [rsp+120],rax
    mov [rsp+128],rax

/*
 * MULSTEP k, I0..I16: one outer iteration for limb k of a.
 *   k       index of the limb of a (at [rdi + 8*k]) handled by this step
 *   I0..I16 stack offsets of the 17 accumulator slots, lowest limb first
 * Phase 1: form rdx = (low64(b[0] * a[k]) + acc[I0]) * (-p^-1) mod 2^64 and
 *          add rdx * uintbig_p into the accumulator.
 * Phase 2: add a[k] * b into the accumulator.
 * The multiplier is chosen so that the sum of both additions is zero in slot
 * I0; the next invocation therefore passes the offsets rotated by one, reusing
 * the old I0 slot as its top slot I16.
 * Each phase runs two independent carry chains at once: ADCX (carry flag)
 * folds in the high half of the previous product, ADOX (overflow flag) folds
 * in the low half of the current one. Everything between the flag-clearing
 * xor and the end of a phase must leave CF and OF alone, which is why the
 * final zero is loaded with mov rather than xor.
 * rbx and rcx alternate as the high-half register so the previous high half
 * is still live when the next mulx issues.
 */
.macro MULSTEP, k, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, I16

    /* Phase 1: derive the multiplier. The rcx written by these two mulx
       instructions is discarded. */
    mov r11,[rsp+\I0]
    mov rdx, [rsi + 0]
    mulx rcx, rdx, [rdi + 8*\k]
    add rdx, r11
    mulx rcx, rdx, [rip + .inv_min_p_mod_r]

    xor rax, rax /* clear flags */

    /* acc += rdx * uintbig_p */
    mulx rbx, rax, [rip + uintbig_p + 0]
    adox r11, rax
    mov [rsp+\I0],r11

    mov r11,[rsp+\I1]
    mulx rcx, rax, [rip + uintbig_p + 8]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I1],r11

    mov r11,[rsp+\I2]
    mulx rbx, rax, [rip + uintbig_p + 16]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I2],r11

    mov r11,[rsp+\I3]
    mulx rcx, rax, [rip + uintbig_p + 24]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I3],r11

    mov r11,[rsp+\I4]
    mulx rbx, rax, [rip + uintbig_p + 32]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I4],r11

    mov r11,[rsp+\I5]
    mulx rcx, rax, [rip + uintbig_p + 40]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I5],r11

    mov r11,[rsp+\I6]
    mulx rbx, rax, [rip + uintbig_p + 48]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I6],r11

    mov r11,[rsp+\I7]
    mulx rcx, rax, [rip + uintbig_p + 56]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I7],r11

    mov r11,[rsp+\I8]
    mulx rbx, rax, [rip + uintbig_p + 64]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I8],r11

    mov r11,[rsp+\I9]
    mulx rcx, rax, [rip + uintbig_p + 72]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I9],r11

    mov r11,[rsp+\I10]
    mulx rbx, rax, [rip + uintbig_p + 80]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I10],r11

    mov r11,[rsp+\I11]
    mulx rcx, rax, [rip + uintbig_p + 88]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I11],r11

    mov r11,[rsp+\I12]
    mulx rbx, rax, [rip + uintbig_p + 96]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I12],r11

    mov r11,[rsp+\I13]
    mulx rcx, rax, [rip + uintbig_p + 104]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I13],r11

    mov r11,[rsp+\I14]
    mulx rbx, rax, [rip + uintbig_p + 112]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I14],r11

    mov r11,[rsp+\I15]
    mulx rcx, rax, [rip + uintbig_p + 120]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I15],r11

    /* Top slot: absorb the last high half and the pending OF carry. mov keeps
       CF and OF intact; xor here would wipe both chains. */
    mov r11,[rsp+\I16]
    mov rax, 0
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I16],r11

    /* Phase 2: acc += a[k] * b */
    mov rdx, [rdi + 8*\k]

    xor rax, rax /* clear flags */

    mov r11,[rsp+\I0]
    mulx rbx, rax, [rsi + 0]
    adox r11, rax
    mov [rsp+\I0],r11

    mov r11,[rsp+\I1]
    mulx rcx, rax, [rsi + 8]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I1],r11

    mov r11,[rsp+\I2]
    mulx rbx, rax, [rsi + 16]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I2],r11

    mov r11,[rsp+\I3]
    mulx rcx, rax, [rsi + 24]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I3],r11

    mov r11,[rsp+\I4]
    mulx rbx, rax, [rsi + 32]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I4],r11

    mov r11,[rsp+\I5]
    mulx rcx, rax, [rsi + 40]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I5],r11

    mov r11,[rsp+\I6]
    mulx rbx, rax, [rsi + 48]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I6],r11

    mov r11,[rsp+\I7]
    mulx rcx, rax, [rsi + 56]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I7],r11

    mov r11,[rsp+\I8]
    mulx rbx, rax, [rsi + 64]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I8],r11

    mov r11,[rsp+\I9]
    mulx rcx, rax, [rsi + 72]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I9],r11

    mov r11,[rsp+\I10]
    mulx rbx, rax, [rsi + 80]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I10],r11

    mov r11,[rsp+\I11]
    mulx rcx, rax, [rsi + 88]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I11],r11

    mov r11,[rsp+\I12]
    mulx rbx, rax, [rsi + 96]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I12],r11

    mov r11,[rsp+\I13]
    mulx rcx, rax, [rsi + 104]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I13],r11

    mov r11,[rsp+\I14]
    mulx rbx, rax, [rsi + 112]
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I14],r11

    mov r11,[rsp+\I15]
    mulx rcx, rax, [rsi + 120]
    adcx r11, rbx
    adox r11, rax
    mov [rsp+\I15],r11

    /* Top slot, same flag-preserving zero as in phase 1. */
    mov r11,[rsp+\I16]
    mov rax, 0
    adcx r11, rcx
    adox r11, rax
    mov [rsp+\I16],r11

.endm

    /* Sixteen steps, one per limb of a. The offset list rotates left by one
       slot each step; after step 15 the 16 result limbs sit at offsets
       0..120 and the slot at offset 128 is the cleared low slot. */
    MULSTEP 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 0
    MULSTEP 1, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 0, 8
    MULSTEP 2, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 0, 8, 16
    MULSTEP 3, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 0, 8, 16, 24
    MULSTEP 4, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 0, 8, 16, 24, 32
    MULSTEP 5, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 0, 8, 16, 24, 32, 40
    MULSTEP 6, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 0, 8, 16, 24, 32, 40, 48
    MULSTEP 7, 64, 72, 80, 88, 96, 104, 112, 120, 128, 0, 8, 16, 24, 32, 40, 48, 56
    MULSTEP 8, 72, 80, 88, 96, 104, 112, 120, 128, 0, 8, 16, 24, 32, 40, 48, 56, 64
    MULSTEP 9, 80, 88, 96, 104, 112, 120, 128, 0, 8, 16, 24, 32, 40, 48, 56, 64, 72
    MULSTEP 10, 88, 96, 104, 112, 120, 128, 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80
    MULSTEP 11, 96, 104, 112, 120, 128, 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88
    MULSTEP 12, 104, 112, 120, 128, 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96
    MULSTEP 13, 112, 120, 128, 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104
    MULSTEP 14, 120, 128, 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112
    MULSTEP 15, 128, 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120

    /* Copy the 16 result limbs from the stack to the saved result pointer. */
    mov rdi,[rsp+136]

    mov r11,[rsp+0]
    mov [rdi+0],r11
    mov r11,[rsp+8]
    mov [rdi+8],r11
    mov r11,[rsp+16]
    mov [rdi+16],r11
    mov r11,[rsp+24]
    mov [rdi+24],r11
    mov r11,[rsp+32]
    mov [rdi+32],r11
    mov r11,[rsp+40]
    mov [rdi+40],r11
    mov r11,[rsp+48]
    mov [rdi+48],r11
    mov r11,[rsp+56]
    mov [rdi+56],r11
    mov r11,[rsp+64]
    mov [rdi+64],r11
    mov r11,[rsp+72]
    mov [rdi+72],r11
    mov r11,[rsp+80]
    mov [rdi+80],r11
    mov r11,[rsp+88]
    mov [rdi+88],r11
    mov r11,[rsp+96]
    mov [rdi+96],r11
    mov r11,[rsp+104]
    mov [rdi+104],r11
    mov r11,[rsp+112]
    mov [rdi+112],r11
    mov r11,[rsp+120]
    mov [rdi+120],r11

    add rsp,144

    pop rbx
    pop rbp
    /* Tail jump: rdi still holds the result pointer. */
    jmp .reduce_once

/*
 * fp_sq1: in-place square. Sets rsi = rdi and falls through to fp_sq2.
 */
.global fp_sq1
fp_sq1:
    mov rsi, rdi
/*
 * fp_sq2: rdi = result pointer, rsi = operand pointer.
 * Sets rdx = rsi, bumps fp_sq_count and tail-jumps to fp_mul3, i.e. the square
 * is computed as fp_mul3(rdi, rsi, rsi). fp_mul3 bumps fp_mulsq_count as
 * well, so a squaring is counted in both counters.
 */
.global fp_sq2
fp_sq2:
    /* TODO implement optimized Montgomery squaring */
    mov rdx, rsi
    addq [fp_sq_count+rip],1
    jmp fp_mul3