mirror of
https://github.com/facebook/zstd.git
synced 2025-03-07 01:10:04 +02:00
Mark that `huf_decompress_amd64.S` supports BTI and PAC, which it trivially does because it is empty for aarch64. The issue only requested BTI markings, but it also makes sense to mark PAC, which is the only other feature. Also run add a test for this mode to the ARM64 QEMU test. Before this PR it warns on `huf_decompress_amd64.S`, after it doesn't. Fixes Issue #3841.
596 lines
15 KiB
ArmAsm
596 lines
15 KiB
ArmAsm
/*
|
|
* Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
* All rights reserved.
|
|
*
|
|
* This source code is licensed under both the BSD-style license (found in the
|
|
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
|
|
* in the COPYING file in the root directory of this source tree).
|
|
* You may select, at your option, one of the above-listed licenses.
|
|
*/
|
|
|
|
#include "../common/portability_macros.h"
|
|
|
|
#if defined(__ELF__) && defined(__GNUC__)
|
|
/* Stack marking
|
|
* ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
|
|
*/
|
|
.section .note.GNU-stack,"",%progbits
|
|
|
|
#if defined(__aarch64__)
|
|
/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
|
|
* See: https://github.com/facebook/zstd/issues/3841
|
|
* See: https://gcc.godbolt.org/z/sqr5T4ffK
|
|
* See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
|
|
* See: https://reviews.llvm.org/D62609
|
|
*/
|
|
.pushsection .note.gnu.property, "a"
|
|
.p2align 3
|
|
.long 4 /* size of the name - "GNU\0" */
|
|
.long 0x10 /* size of descriptor */
|
|
.long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */
|
|
.asciz "GNU"
|
|
.long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
|
|
.long 4 /* pr_datasz - 4 bytes */
|
|
.long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
|
|
.p2align 3 /* pr_padding - bring everything to 8 byte alignment */
|
|
.popsection
|
|
#endif
|
|
|
|
#endif
|
|
|
|
#if ZSTD_ENABLE_ASM_X86_64_BMI2
|
|
|
|
/* Calling convention:
|
|
*
|
|
* %rdi contains the first argument: HUF_DecompressAsmArgs*.
|
|
* %rbp isn't maintained (no frame pointer).
|
|
* %rsp contains the stack pointer that grows down.
|
|
* No red-zone is assumed, only addresses >= %rsp are used.
|
|
* All register contents are preserved.
|
|
*
|
|
* TODO: Support Windows calling convention.
|
|
*/
|
|
|
|
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
|
|
ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
|
|
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
|
|
ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
|
|
.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
|
|
.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
|
|
.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
|
|
.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
|
|
.text
|
|
|
|
/* Sets up register mappings for clarity.
|
|
* op[], bits[], dtable & ip[0] each get their own register.
|
|
* ip[1,2,3] & olimit alias var[].
|
|
* %rax is a scratch register.
|
|
*/
|
|
|
|
#define op0 rsi
|
|
#define op1 rbx
|
|
#define op2 rcx
|
|
#define op3 rdi
|
|
|
|
#define ip0 r8
|
|
#define ip1 r9
|
|
#define ip2 r10
|
|
#define ip3 r11
|
|
|
|
#define bits0 rbp
|
|
#define bits1 rdx
|
|
#define bits2 r12
|
|
#define bits3 r13
|
|
#define dtable r14
|
|
#define olimit r15
|
|
|
|
/* var[] aliases ip[1,2,3] & olimit
|
|
* ip[1,2,3] are saved every iteration.
|
|
* olimit is only used in compute_olimit.
|
|
*/
|
|
#define var0 r15
|
|
#define var1 r9
|
|
#define var2 r10
|
|
#define var3 r11
|
|
|
|
/* 32-bit var registers */
|
|
#define vard0 r15d
|
|
#define vard1 r9d
|
|
#define vard2 r10d
|
|
#define vard3 r11d
|
|
|
|
/* Calls X(N) for each stream 0, 1, 2, 3. */
|
|
#define FOR_EACH_STREAM(X) \
|
|
X(0); \
|
|
X(1); \
|
|
X(2); \
|
|
X(3)
|
|
|
|
/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
|
|
#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
|
|
X(0, idx); \
|
|
X(1, idx); \
|
|
X(2, idx); \
|
|
X(3, idx)
|
|
|
|
/* Define both _HUF_* & HUF_* symbols because MacOS
|
|
* C symbols are prefixed with '_' & Linux symbols aren't.
|
|
*/
|
|
_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
|
|
HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
|
|
ZSTD_CET_ENDBRANCH
|
|
/* Save all registers - even if they are callee saved for simplicity. */
|
|
push %rax
|
|
push %rbx
|
|
push %rcx
|
|
push %rdx
|
|
push %rbp
|
|
push %rsi
|
|
push %rdi
|
|
push %r8
|
|
push %r9
|
|
push %r10
|
|
push %r11
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
|
|
/* Read HUF_DecompressAsmArgs* args from %rax */
|
|
movq %rdi, %rax
|
|
movq 0(%rax), %ip0
|
|
movq 8(%rax), %ip1
|
|
movq 16(%rax), %ip2
|
|
movq 24(%rax), %ip3
|
|
movq 32(%rax), %op0
|
|
movq 40(%rax), %op1
|
|
movq 48(%rax), %op2
|
|
movq 56(%rax), %op3
|
|
movq 64(%rax), %bits0
|
|
movq 72(%rax), %bits1
|
|
movq 80(%rax), %bits2
|
|
movq 88(%rax), %bits3
|
|
movq 96(%rax), %dtable
|
|
push %rax /* argument */
|
|
push 104(%rax) /* ilowest */
|
|
push 112(%rax) /* oend */
|
|
push %olimit /* olimit space */
|
|
|
|
subq $24, %rsp
|
|
|
|
.L_4X1_compute_olimit:
|
|
/* Computes how many iterations we can do safely
|
|
* %r15, %rax may be clobbered
|
|
* rbx, rdx must be saved
|
|
* op3 & ip0 mustn't be clobbered
|
|
*/
|
|
movq %rbx, 0(%rsp)
|
|
movq %rdx, 8(%rsp)
|
|
|
|
movq 32(%rsp), %rax /* rax = oend */
|
|
subq %op3, %rax /* rax = oend - op3 */
|
|
|
|
/* r15 = (oend - op3) / 5 */
|
|
movabsq $-3689348814741910323, %rdx
|
|
mulq %rdx
|
|
movq %rdx, %r15
|
|
shrq $2, %r15
|
|
|
|
movq %ip0, %rax /* rax = ip0 */
|
|
movq 40(%rsp), %rdx /* rdx = ilowest */
|
|
subq %rdx, %rax /* rax = ip0 - ilowest */
|
|
movq %rax, %rbx /* rbx = ip0 - ilowest */
|
|
|
|
/* rdx = (ip0 - ilowest) / 7 */
|
|
movabsq $2635249153387078803, %rdx
|
|
mulq %rdx
|
|
subq %rdx, %rbx
|
|
shrq %rbx
|
|
addq %rbx, %rdx
|
|
shrq $2, %rdx
|
|
|
|
/* r15 = min(%rdx, %r15) */
|
|
cmpq %rdx, %r15
|
|
cmova %rdx, %r15
|
|
|
|
/* r15 = r15 * 5 */
|
|
leaq (%r15, %r15, 4), %r15
|
|
|
|
/* olimit = op3 + r15 */
|
|
addq %op3, %olimit
|
|
|
|
movq 8(%rsp), %rdx
|
|
movq 0(%rsp), %rbx
|
|
|
|
/* If (op3 + 20 > olimit) */
|
|
movq %op3, %rax /* rax = op3 */
|
|
cmpq %rax, %olimit /* op3 == olimit */
|
|
je .L_4X1_exit
|
|
|
|
/* If (ip1 < ip0) go to exit */
|
|
cmpq %ip0, %ip1
|
|
jb .L_4X1_exit
|
|
|
|
/* If (ip2 < ip1) go to exit */
|
|
cmpq %ip1, %ip2
|
|
jb .L_4X1_exit
|
|
|
|
/* If (ip3 < ip2) go to exit */
|
|
cmpq %ip2, %ip3
|
|
jb .L_4X1_exit
|
|
|
|
/* Reads top 11 bits from bits[n]
|
|
* Loads dt[bits[n]] into var[n]
|
|
*/
|
|
#define GET_NEXT_DELT(n) \
|
|
movq $53, %var##n; \
|
|
shrxq %var##n, %bits##n, %var##n; \
|
|
movzwl (%dtable,%var##n,2),%vard##n
|
|
|
|
/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
|
|
* Moves var[n] to %rax
|
|
* bits[n] <<= var[n] & 63
|
|
* op[n][idx] = %rax >> 8
|
|
* %ah is a way to access bits [8, 16) of %rax
|
|
*/
|
|
#define DECODE_FROM_DELT(n, idx) \
|
|
movq %var##n, %rax; \
|
|
shlxq %var##n, %bits##n, %bits##n; \
|
|
movb %ah, idx(%op##n)
|
|
|
|
/* Assumes GET_NEXT_DELT has been called.
|
|
* Calls DECODE_FROM_DELT then GET_NEXT_DELT
|
|
*/
|
|
#define DECODE_AND_GET_NEXT(n, idx) \
|
|
DECODE_FROM_DELT(n, idx); \
|
|
GET_NEXT_DELT(n) \
|
|
|
|
/* // ctz & nbBytes is stored in bits[n]
|
|
* // nbBits is stored in %rax
|
|
* ctz = CTZ[bits[n]]
|
|
* nbBits = ctz & 7
|
|
* nbBytes = ctz >> 3
|
|
* op[n] += 5
|
|
* ip[n] -= nbBytes
|
|
* // Note: x86-64 is little-endian ==> no bswap
|
|
* bits[n] = MEM_readST(ip[n]) | 1
|
|
* bits[n] <<= nbBits
|
|
*/
|
|
#define RELOAD_BITS(n) \
|
|
bsfq %bits##n, %bits##n; \
|
|
movq %bits##n, %rax; \
|
|
andq $7, %rax; \
|
|
shrq $3, %bits##n; \
|
|
leaq 5(%op##n), %op##n; \
|
|
subq %bits##n, %ip##n; \
|
|
movq (%ip##n), %bits##n; \
|
|
orq $1, %bits##n; \
|
|
shlx %rax, %bits##n, %bits##n
|
|
|
|
/* Store clobbered variables on the stack */
|
|
movq %olimit, 24(%rsp)
|
|
movq %ip1, 0(%rsp)
|
|
movq %ip2, 8(%rsp)
|
|
movq %ip3, 16(%rsp)
|
|
|
|
/* Call GET_NEXT_DELT for each stream */
|
|
FOR_EACH_STREAM(GET_NEXT_DELT)
|
|
|
|
.p2align 6
|
|
|
|
.L_4X1_loop_body:
|
|
/* Decode 5 symbols in each of the 4 streams (20 total)
|
|
* Must have called GET_NEXT_DELT for each stream
|
|
*/
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
|
|
|
|
/* Load ip[1,2,3] from stack (var[] aliases them)
|
|
* ip[] is needed for RELOAD_BITS
|
|
* Each will be stored back to the stack after RELOAD
|
|
*/
|
|
movq 0(%rsp), %ip1
|
|
movq 8(%rsp), %ip2
|
|
movq 16(%rsp), %ip3
|
|
|
|
/* Reload each stream & fetch the next table entry
|
|
* to prepare for the next iteration
|
|
*/
|
|
RELOAD_BITS(0)
|
|
GET_NEXT_DELT(0)
|
|
|
|
RELOAD_BITS(1)
|
|
movq %ip1, 0(%rsp)
|
|
GET_NEXT_DELT(1)
|
|
|
|
RELOAD_BITS(2)
|
|
movq %ip2, 8(%rsp)
|
|
GET_NEXT_DELT(2)
|
|
|
|
RELOAD_BITS(3)
|
|
movq %ip3, 16(%rsp)
|
|
GET_NEXT_DELT(3)
|
|
|
|
/* If op3 < olimit: continue the loop */
|
|
cmp %op3, 24(%rsp)
|
|
ja .L_4X1_loop_body
|
|
|
|
/* Reload ip[1,2,3] from stack */
|
|
movq 0(%rsp), %ip1
|
|
movq 8(%rsp), %ip2
|
|
movq 16(%rsp), %ip3
|
|
|
|
/* Re-compute olimit */
|
|
jmp .L_4X1_compute_olimit
|
|
|
|
#undef GET_NEXT_DELT
|
|
#undef DECODE_FROM_DELT
|
|
#undef DECODE
|
|
#undef RELOAD_BITS
|
|
.L_4X1_exit:
|
|
addq $24, %rsp
|
|
|
|
/* Restore stack (oend & olimit) */
|
|
pop %rax /* olimit */
|
|
pop %rax /* oend */
|
|
pop %rax /* ilowest */
|
|
pop %rax /* arg */
|
|
|
|
/* Save ip / op / bits */
|
|
movq %ip0, 0(%rax)
|
|
movq %ip1, 8(%rax)
|
|
movq %ip2, 16(%rax)
|
|
movq %ip3, 24(%rax)
|
|
movq %op0, 32(%rax)
|
|
movq %op1, 40(%rax)
|
|
movq %op2, 48(%rax)
|
|
movq %op3, 56(%rax)
|
|
movq %bits0, 64(%rax)
|
|
movq %bits1, 72(%rax)
|
|
movq %bits2, 80(%rax)
|
|
movq %bits3, 88(%rax)
|
|
|
|
/* Restore registers */
|
|
pop %r15
|
|
pop %r14
|
|
pop %r13
|
|
pop %r12
|
|
pop %r11
|
|
pop %r10
|
|
pop %r9
|
|
pop %r8
|
|
pop %rdi
|
|
pop %rsi
|
|
pop %rbp
|
|
pop %rdx
|
|
pop %rcx
|
|
pop %rbx
|
|
pop %rax
|
|
ret
|
|
|
|
_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
|
|
HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
|
|
ZSTD_CET_ENDBRANCH
|
|
/* Save all registers - even if they are callee saved for simplicity. */
|
|
push %rax
|
|
push %rbx
|
|
push %rcx
|
|
push %rdx
|
|
push %rbp
|
|
push %rsi
|
|
push %rdi
|
|
push %r8
|
|
push %r9
|
|
push %r10
|
|
push %r11
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
|
|
movq %rdi, %rax
|
|
movq 0(%rax), %ip0
|
|
movq 8(%rax), %ip1
|
|
movq 16(%rax), %ip2
|
|
movq 24(%rax), %ip3
|
|
movq 32(%rax), %op0
|
|
movq 40(%rax), %op1
|
|
movq 48(%rax), %op2
|
|
movq 56(%rax), %op3
|
|
movq 64(%rax), %bits0
|
|
movq 72(%rax), %bits1
|
|
movq 80(%rax), %bits2
|
|
movq 88(%rax), %bits3
|
|
movq 96(%rax), %dtable
|
|
push %rax /* argument */
|
|
push %rax /* olimit */
|
|
push 104(%rax) /* ilowest */
|
|
|
|
movq 112(%rax), %rax
|
|
push %rax /* oend3 */
|
|
|
|
movq %op3, %rax
|
|
push %rax /* oend2 */
|
|
|
|
movq %op2, %rax
|
|
push %rax /* oend1 */
|
|
|
|
movq %op1, %rax
|
|
push %rax /* oend0 */
|
|
|
|
/* Scratch space */
|
|
subq $8, %rsp
|
|
|
|
.L_4X2_compute_olimit:
|
|
/* Computes how many iterations we can do safely
|
|
* %r15, %rax may be clobbered
|
|
* rdx must be saved
|
|
* op[1,2,3,4] & ip0 mustn't be clobbered
|
|
*/
|
|
movq %rdx, 0(%rsp)
|
|
|
|
/* We can consume up to 7 input bytes each iteration. */
|
|
movq %ip0, %rax /* rax = ip0 */
|
|
movq 40(%rsp), %rdx /* rdx = ilowest */
|
|
subq %rdx, %rax /* rax = ip0 - ilowest */
|
|
movq %rax, %r15 /* r15 = ip0 - ilowest */
|
|
|
|
/* rdx = rax / 7 */
|
|
movabsq $2635249153387078803, %rdx
|
|
mulq %rdx
|
|
subq %rdx, %r15
|
|
shrq %r15
|
|
addq %r15, %rdx
|
|
shrq $2, %rdx
|
|
|
|
/* r15 = (ip0 - ilowest) / 7 */
|
|
movq %rdx, %r15
|
|
|
|
/* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
|
|
movq 8(%rsp), %rax /* rax = oend0 */
|
|
subq %op0, %rax /* rax = oend0 - op0 */
|
|
movq 16(%rsp), %rdx /* rdx = oend1 */
|
|
subq %op1, %rdx /* rdx = oend1 - op1 */
|
|
|
|
cmpq %rax, %rdx
|
|
cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
|
|
|
|
movq 24(%rsp), %rax /* rax = oend2 */
|
|
subq %op2, %rax /* rax = oend2 - op2 */
|
|
|
|
cmpq %rax, %rdx
|
|
cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
|
|
|
|
movq 32(%rsp), %rax /* rax = oend3 */
|
|
subq %op3, %rax /* rax = oend3 - op3 */
|
|
|
|
cmpq %rax, %rdx
|
|
cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
|
|
|
|
movabsq $-3689348814741910323, %rax
|
|
mulq %rdx
|
|
shrq $3, %rdx /* rdx = rdx / 10 */
|
|
|
|
/* r15 = min(%rdx, %r15) */
|
|
cmpq %rdx, %r15
|
|
cmova %rdx, %r15
|
|
|
|
/* olimit = op3 + 5 * r15 */
|
|
movq %r15, %rax
|
|
leaq (%op3, %rax, 4), %olimit
|
|
addq %rax, %olimit
|
|
|
|
movq 0(%rsp), %rdx
|
|
|
|
/* If (op3 + 10 > olimit) */
|
|
movq %op3, %rax /* rax = op3 */
|
|
cmpq %rax, %olimit /* op3 == olimit */
|
|
je .L_4X2_exit
|
|
|
|
/* If (ip1 < ip0) go to exit */
|
|
cmpq %ip0, %ip1
|
|
jb .L_4X2_exit
|
|
|
|
/* If (ip2 < ip1) go to exit */
|
|
cmpq %ip1, %ip2
|
|
jb .L_4X2_exit
|
|
|
|
/* If (ip3 < ip2) go to exit */
|
|
cmpq %ip2, %ip3
|
|
jb .L_4X2_exit
|
|
|
|
#define DECODE(n, idx) \
|
|
movq %bits##n, %rax; \
|
|
shrq $53, %rax; \
|
|
movzwl 0(%dtable,%rax,4),%r8d; \
|
|
movzbl 2(%dtable,%rax,4),%r15d; \
|
|
movzbl 3(%dtable,%rax,4),%eax; \
|
|
movw %r8w, (%op##n); \
|
|
shlxq %r15, %bits##n, %bits##n; \
|
|
addq %rax, %op##n
|
|
|
|
#define RELOAD_BITS(n) \
|
|
bsfq %bits##n, %bits##n; \
|
|
movq %bits##n, %rax; \
|
|
shrq $3, %bits##n; \
|
|
andq $7, %rax; \
|
|
subq %bits##n, %ip##n; \
|
|
movq (%ip##n), %bits##n; \
|
|
orq $1, %bits##n; \
|
|
shlxq %rax, %bits##n, %bits##n
|
|
|
|
|
|
movq %olimit, 48(%rsp)
|
|
|
|
.p2align 6
|
|
|
|
.L_4X2_loop_body:
|
|
/* We clobber r8, so store it on the stack */
|
|
movq %r8, 0(%rsp)
|
|
|
|
/* Decode 5 symbols from each of the 4 streams (20 symbols total). */
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
|
|
FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
|
|
|
|
/* Reload r8 */
|
|
movq 0(%rsp), %r8
|
|
|
|
FOR_EACH_STREAM(RELOAD_BITS)
|
|
|
|
cmp %op3, 48(%rsp)
|
|
ja .L_4X2_loop_body
|
|
jmp .L_4X2_compute_olimit
|
|
|
|
#undef DECODE
|
|
#undef RELOAD_BITS
|
|
.L_4X2_exit:
|
|
addq $8, %rsp
|
|
/* Restore stack (oend & olimit) */
|
|
pop %rax /* oend0 */
|
|
pop %rax /* oend1 */
|
|
pop %rax /* oend2 */
|
|
pop %rax /* oend3 */
|
|
pop %rax /* ilowest */
|
|
pop %rax /* olimit */
|
|
pop %rax /* arg */
|
|
|
|
/* Save ip / op / bits */
|
|
movq %ip0, 0(%rax)
|
|
movq %ip1, 8(%rax)
|
|
movq %ip2, 16(%rax)
|
|
movq %ip3, 24(%rax)
|
|
movq %op0, 32(%rax)
|
|
movq %op1, 40(%rax)
|
|
movq %op2, 48(%rax)
|
|
movq %op3, 56(%rax)
|
|
movq %bits0, 64(%rax)
|
|
movq %bits1, 72(%rax)
|
|
movq %bits2, 80(%rax)
|
|
movq %bits3, 88(%rax)
|
|
|
|
/* Restore registers */
|
|
pop %r15
|
|
pop %r14
|
|
pop %r13
|
|
pop %r12
|
|
pop %r11
|
|
pop %r10
|
|
pop %r9
|
|
pop %r8
|
|
pop %rdi
|
|
pop %rsi
|
|
pop %rbp
|
|
pop %rdx
|
|
pop %rcx
|
|
pop %rbx
|
|
pop %rax
|
|
ret
|
|
|
|
#endif
|