diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 89e093f6d1277..d1831aac96c9b 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -5008,6 +5008,40 @@ assert(vector_len == AVX_128bit? VM_Version::supports_avx() : emit_int16(0x04, (0xC0 | encode)); } +void Assembler::evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + evpmadd52luq(dst, k0, src1, src2, false, vector_len); +} + +void Assembler::evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + assert(VM_Version::supports_avx512ifma(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xB4, (0xC0 | encode)); +} + +void Assembler::evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + evpmadd52huq(dst, k0, src1, src2, false, vector_len); +} + +void Assembler::evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + assert(VM_Version::supports_avx512ifma(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xB5, (0xC0 | encode)); +} + void Assembler::evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(VM_Version::supports_avx512_vnni(), "must support vnni"); @@ -5425,6 +5459,42 @@ void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) { emit_int16(0x6C, (0xC0 | encode)); } +void Assembler::evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + evpunpcklqdq(dst, k0, src1, src2, false, vector_len); +} + +void Assembler::evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x6C, (0xC0 | encode)); +} + +void Assembler::evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + evpunpckhqdq(dst, k0, src1, src2, false, vector_len); +} + +void Assembler::evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + assert(VM_Version::supports_evex(), 
"requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x6D, (0xC0 | encode)); +} + void Assembler::push(int32_t imm32) { // in 64bits we push 64bits onto the stack but only // take a 32bit immediate @@ -5869,6 +5939,18 @@ void Assembler::shrdl(Register dst, Register src, int8_t imm8) { emit_int32(0x0F, (unsigned char)0xAC, (0xC0 | encode), imm8); } +#ifdef _LP64 +void Assembler::shldq(Register dst, Register src, int8_t imm8) { + int encode = prefixq_and_encode(src->encoding(), dst->encoding()); + emit_int32(0x0F, (unsigned char)0xA4, (0xC0 | encode), imm8); +} + +void Assembler::shrdq(Register dst, Register src, int8_t imm8) { + int encode = prefixq_and_encode(src->encoding(), dst->encoding()); + emit_int32(0x0F, (unsigned char)0xAC, (0xC0 | encode), imm8); +} +#endif + // copies a single word from [esi] to [edi] void Assembler::smovl() { emit_int8((unsigned char)0xA5); @@ -7740,11 +7822,12 @@ void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_ emit_operand(dst, src, 0); } -void Assembler::vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int16((unsigned char)0xDB, (0xC0 | encode)); +void Assembler::evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + evpandq(dst, k0, nds, src, false, vector_len); +} + +void Assembler::evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + evpandq(dst, k0, nds, src, false, vector_len); } //Variable Shift packed integers logically left. 
@@ -7857,13 +7940,13 @@ void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_l emit_operand(dst, src, 0); } -void Assembler::vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int16((unsigned char)0xEB, (0xC0 | encode)); +void Assembler::evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + evporq(dst, k0, nds, src, false, vector_len); } +void Assembler::evporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + evporq(dst, k0, nds, src, false, vector_len); +} void Assembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); @@ -8004,7 +8087,8 @@ void Assembler::evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, Addres } void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { - assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_is_evex_instruction(); attributes.set_embedded_opmask_register_specifier(mask); @@ -8016,7 +8100,8 @@ void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMReg } void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { - assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), ""); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit); @@ -8031,7 +8116,8 @@ void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Addres } void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { - assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_is_evex_instruction(); attributes.set_embedded_opmask_register_specifier(mask); @@ -8043,7 +8129,8 @@ void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegi } void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { - assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), ""); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionMark im(this); InstructionAttr 
attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit); @@ -8201,8 +8288,8 @@ void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address } void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len) { - assert(VM_Version::supports_evex(), "requires EVEX support"); - assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support"); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src3->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); @@ -8211,6 +8298,20 @@ void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegis emit_int8(imm8); } +void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len) { + assert(VM_Version::supports_evex(), "requires EVEX support"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support"); + assert(dst != xnoreg, "sanity"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); + vex_prefix(src3, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x25); + emit_operand(dst, src3, 1); + emit_int8(imm8); +} + void Assembler::evexpandps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), ""); @@ -13452,6 +13553,13 @@ void Assembler::vzeroupper() { emit_copy(code_section(), vzup_code, vzup_len); } +void Assembler::vzeroall() { + assert(VM_Version::supports_avx(), "requires AVX"); + InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + (void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8(0x77); +} + void Assembler::pushq(Address src) { InstructionMark im(this); emit_int16(get_prefixq(src), (unsigned char)0xFF); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index c7316ae01fc7e..04dbb7907bee1 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -1891,6 +1891,10 @@ class Assembler : public AbstractAssembler { void pmaddwd(XMMRegister dst, XMMRegister src); void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); + void evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpmadd52huq(XMMRegister dst, 
KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); // Multiply add accumulate void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -1990,6 +1994,11 @@ class Assembler : public AbstractAssembler { // Interleave Low Quadwords void punpcklqdq(XMMRegister dst, XMMRegister src); + void evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); + void evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); + // Vector sum of absolute difference. void vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -2092,6 +2101,10 @@ class Assembler : public AbstractAssembler { void shldl(Register dst, Register src, int8_t imm8); void shrdl(Register dst, Register src); void shrdl(Register dst, Register src, int8_t imm8); +#ifdef _LP64 + void shldq(Register dst, Register src, int8_t imm8); + void shrdq(Register dst, Register src, int8_t imm8); +#endif void shll(Register dst, int imm8); void shll(Register dst); @@ -2616,7 +2629,8 @@ class Assembler : public AbstractAssembler { void pand(XMMRegister dst, XMMRegister src); void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len); - void vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Andn packed integers void pandn(XMMRegister dst, XMMRegister src); @@ -2626,7 +2640,8 @@ class Assembler : public AbstractAssembler { void por(XMMRegister dst, XMMRegister src); void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); - void vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Xor packed integers void pxor(XMMRegister dst, XMMRegister src); @@ -2640,6 +2655,7 @@ class Assembler : public AbstractAssembler { void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len); void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len); void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len); + void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len); // Vector compress/expand instructions. void evpcompressb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len); @@ -2753,6 +2769,8 @@ class Assembler : public AbstractAssembler { // runtime code and native libraries. 
void vzeroupper(); + void vzeroall(); + // Vector double compares void vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); void evcmppd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index 2154e867d929b..c3a001d4bac2a 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -5279,7 +5279,7 @@ void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMReg // Get the reverse bit sequence of lower nibble of each byte. vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); - vpandq(dst, xtmp2, src, vec_enc); + evpandq(dst, xtmp2, src, vec_enc); vpshufb(dst, xtmp1, dst, vec_enc); vpsllq(dst, dst, 4, vec_enc); @@ -5290,7 +5290,7 @@ void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMReg // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. - vporq(xtmp2, dst, xtmp2, vec_enc); + evporq(xtmp2, dst, xtmp2, vec_enc); vector_reverse_byte(bt, dst, xtmp2, vec_enc); } else if(vec_enc == Assembler::AVX_512bit) { @@ -5345,11 +5345,11 @@ void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, X void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, Register rtmp, int vec_enc) { vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); - vpandq(dst, xtmp1, src, vec_enc); + evpandq(dst, xtmp1, src, vec_enc); vpsllq(dst, dst, nbits, vec_enc); vpandn(xtmp1, xtmp1, src, vec_enc); vpsrlq(xtmp1, xtmp1, nbits, vec_enc); - vporq(dst, dst, xtmp1, vec_enc); + evporq(dst, dst, xtmp1, vec_enc); } void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index aade92a2aba5f..3026109d979dc 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -1217,6 +1217,19 @@ void MacroAssembler::andptr(Register dst, int32_t imm32) { LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); } +#ifdef _LP64 +void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) { + assert(rscratch != noreg || always_reachable(src), "missing"); + + if (reachable(src)) { + andq(dst, as_Address(src)); + } else { + lea(rscratch, src); + andq(dst, Address(rscratch, 0)); + } +} +#endif + void MacroAssembler::atomic_incl(Address counter_addr) { lock(); incrementl(counter_addr); @@ -9105,6 +9118,40 @@ void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMM fatal("Unexpected type argument %s", type2name(type)); break; } } + +void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { + assert(rscratch != noreg || always_reachable(src), "missing"); + + if (reachable(src)) { + evpandq(dst, nds, as_Address(src), vector_len); + } else { + lea(rscratch, src); + evpandq(dst, nds, Address(rscratch, 0), vector_len); + } +} + +void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { + assert(rscratch != noreg || always_reachable(src), 
"missing"); + + if (reachable(src)) { + evporq(dst, nds, as_Address(src), vector_len); + } else { + lea(rscratch, src); + evporq(dst, nds, Address(rscratch, 0), vector_len); + } +} + +void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) { + assert(rscratch != noreg || always_reachable(src3), "missing"); + + if (reachable(src3)) { + vpternlogq(dst, imm8, src2, as_Address(src3), vector_len); + } else { + lea(rscratch, src3); + vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len); + } +} + #if COMPILER2_OR_JVMCI void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask, diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 3318c5669b41f..5a0a3d8c9a1ee 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -730,6 +730,11 @@ class MacroAssembler: public Assembler { void andptr(Register dst, int32_t src); void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; } +#ifdef _LP64 + using Assembler::andq; + void andq(Register dst, AddressLiteral src, Register rscratch = noreg); +#endif + void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg); // renamed to drag out the casting of address to int32_t/intptr_t @@ -1754,6 +1759,15 @@ class MacroAssembler: public Assembler { void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc); void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc); + using Assembler::evpandq; + void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg); + + using Assembler::evporq; + void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg); + + using Assembler::vpternlogq; + void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg); + void alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch); void anytrue(Register dst, uint masklen, KRegister src, KRegister kscratch); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 77be6f9e871dc..c3e0b79dc46af 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -2519,7 +2519,7 @@ address StubGenerator::generate_base64_decodeBlock() { // Decode all bytes within our merged input __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit); __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit); - __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit); + __ evporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit); // Check for error. Compare (decoded | initial) to all invalid. // If any bytes have their high-order bit set, then we have an error. 
@@ -3709,6 +3709,10 @@ void StubGenerator::generate_initial() { StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); } + if (UsePolyIntrinsics) { + StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); + } + if (UseCRC32CIntrinsics) { bool supports_clmul = VM_Version::supports_clmul(); StubRoutines::x86::generate_CRC32C_table(supports_clmul); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index 7d5e25de381a9..5e97e1e9a4456 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -387,6 +387,24 @@ class StubGenerator: public StubCodeGenerator { // Ghash single and multi block operations using AVX instructions address generate_avx_ghash_processBlocks(); + // Poly1305 multiblock using IFMA instructions + address generate_poly1305_processBlocks(); + void poly1305_process_blocks_avx512(const Register input, const Register length, + const Register A0, const Register A1, const Register A2, + const Register R0, const Register R1, const Register C1); + void poly1305_multiply_scalar(const Register a0, const Register a1, const Register a2, + const Register r0, const Register r1, const Register c1, bool only128, + const Register t0, const Register t1, const Register t2, + const Register mulql, const Register mulqh); + void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, + const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, + const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H, + const XMMRegister TMP, const Register rscratch); + void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, const Register t0, const Register t1); + void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs, const Register t0, const Register t1); + void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, + const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, + const XMMRegister TMP, const Register rscratch); // BASE64 stubs diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp new file mode 100644 index 0000000000000..97f9f6ccc470b --- /dev/null +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -0,0 +1,1027 @@ +/* + * Copyright (c) 2022, Intel Corporation. All rights reserved. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "macroAssembler_x86.hpp" +#include "stubGenerator_x86_64.hpp" + +#define __ _masm-> + +// References: +// - (Normative) RFC7539 - ChaCha20 and Poly1305 for IETF Protocols +// - M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code" +// - "The design of Poly1305" https://loup-vaillant.fr/tutorials/poly1305-design + +// Explanation for the 'well-known' modular arithmetic optimization, reduction by the pseudo-Mersenne prime 2^130-5: +// +// Reduction by 2^130-5 can be expressed as follows: +// ( a×2^130 + b ) mod 2^130-5 //i.e. number split along the 130-bit boundary +// = ( a×2^130 - 5×a + 5×a + b ) mod 2^130-5 +// = ( a×(2^130 - 5) + 5×a + b ) mod 2^130-5 // i.e. adding multiples of modulus is a noop +// = ( 5×a + b ) mod 2^130-5 +// QED: this shows mathematically the well-known algorithm of 'split the number down the middle, multiply the upper half by 5, and add' +// This is particularly useful to understand when combining with 'odd-sized' limbs that might cause misalignment +// +// Pseudocode for this file (in general): +// * used for poly1305_multiply_scalar +// × used for poly1305_multiply8_avx512 +// lower-case variables are scalar numbers in 3×44-bit limbs (in gprs) +// upper-case variables are 8-element vector numbers in 3×44-bit limbs (in zmm registers) +// [ ] used to denote vector numbers (with their elements) + +// Constant Pool: +ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_PAD_MSG[] = { + 0x0000010000000000, 0x0000010000000000, + 0x0000010000000000, 0x0000010000000000, + 0x0000010000000000, 0x0000010000000000, + 0x0000010000000000, 0x0000010000000000, +}; +static address poly1305_pad_msg() { + return (address)POLY1305_PAD_MSG; +} + +ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_MASK42[] = { + 0x000003ffffffffff, 0x000003ffffffffff, + 0x000003ffffffffff, 0x000003ffffffffff, + 0x000003ffffffffff, 0x000003ffffffffff, + 0x000003ffffffffff, 0x000003ffffffffff +}; +static address poly1305_mask42() { + return (address)POLY1305_MASK42; +} + +ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_MASK44[] = { + 0x00000fffffffffff, 0x00000fffffffffff, + 0x00000fffffffffff, 0x00000fffffffffff, + 0x00000fffffffffff, 0x00000fffffffffff, + 0x00000fffffffffff, 0x00000fffffffffff, +}; +static address poly1305_mask44() { + return (address)POLY1305_MASK44; +} + +// Compute product for 8 16-byte message blocks, +// i.e. for each block, compute [a2 a1 a0] = [a2 a1 a0] × [r2 r1 r0] +// +// Each block/number is represented by 3 44-bit limb digits; start with the multiplication +// +// a2 a1 a0 +// × r2 r1 r0 +// ---------------------------------- +// a2×r0 a1×r0 a0×r0 +// + a1×r1 a0×r1 5×a2×r1' (r1' = r1<<2) +// + a0×r2 5×a2×r2' 5×a1×r2' (r2' = r2<<2) +// ---------------------------------- +// p2 p1 p0 +// +// Then, propagate the carry (bits after bit 44) from lower limbs into higher limbs. +// Then, modular reduction from upper limb wrapped to lower limbs +// +// Math Note 1: 'carry propagation' from p2 to p0 involves multiplication by 5 (i.e. slightly modified modular reduction from above): +// ( p2×2^88 ) mod 2^130-5 +// = ( p2'×2^88 + p2''×2^130) mod 2^130-5 // Split on 130-bit boundary +// = ( p2'×2^88 + p2''×2^130 - 5×p2'' + 5×p2'') mod 2^130-5 +// = ( p2'×2^88 + p2''×(2^130 - 5) + 5×p2'') mod 2^130-5 // i.e.
adding multiples of modulus is a noop +// = ( p2'×2^88 + 5×p2'') mod 2^130-5 +// +// Math Note 2: R1P = 4*5*R1 and R2P = 4*5*R2; this precomputation allows simultaneous reduction and multiplication. +// This is not the standard 'multiply-upper-by-5'; here is why the factor is 4*5 instead of 5. +// For example, partial product (a2×r2): +// (a2×2^88)×(r2×2^88) mod 2^130-5 +// = (a2×r2 × 2^176) mod 2^130-5 +// = (a2×r2 × 2^46×2^130) mod 2^130-5 +// = (a2×r2×2^46 × 2^130 - 5×a2×r2×2^46 + 5×a2×r2×2^46) mod 2^130-5 +// = (a2×r2×2^46 × (2^130 - 5) + 5×a2×r2×2^46) mod 2^130-5 // i.e. adding multiples of modulus is a noop +// = (5×a2×r2×2^46) mod 2^130-5 +// = (a2×5×r2×2^2 × 2^44) mod 2^130-5 // Align to limb boundary +// = (a2×[5×r2×4] × 2^44) mod 2^130-5 +// = (a2×R2P × 2^44) mod 2^130-5 // i.e. R2P = 4*5*R2 +// +void StubGenerator::poly1305_multiply8_avx512( + const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, + const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, + const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H, + const XMMRegister TMP, const Register rscratch) +{ + + // Reset partial sums + __ evpxorq(P0L, P0L, P0L, Assembler::AVX_512bit); + __ evpxorq(P0H, P0H, P0H, Assembler::AVX_512bit); + __ evpxorq(P1L, P1L, P1L, Assembler::AVX_512bit); + __ evpxorq(P1H, P1H, P1H, Assembler::AVX_512bit); + __ evpxorq(P2L, P2L, P2L, Assembler::AVX_512bit); + __ evpxorq(P2H, P2H, P2H, Assembler::AVX_512bit); + + // Calculate partial products + // p0 = a2×r1' + // p1 = a2×r2' + // p2 = a2×r0 + __ evpmadd52luq(P0L, A2, R1P, Assembler::AVX_512bit); + __ evpmadd52huq(P0H, A2, R1P, Assembler::AVX_512bit); + __ evpmadd52luq(P1L, A2, R2P, Assembler::AVX_512bit); + __ evpmadd52huq(P1H, A2, R2P, Assembler::AVX_512bit); + __ evpmadd52luq(P2L, A2, R0, Assembler::AVX_512bit); + __ evpmadd52huq(P2H, A2, R0, Assembler::AVX_512bit); + + // p0 += a0×r0 + // p1 += a0×r1 + // p2 += a0×r2 + __ evpmadd52luq(P1L, A0, R1, Assembler::AVX_512bit); + __ evpmadd52huq(P1H, A0, R1, Assembler::AVX_512bit); + __ evpmadd52luq(P2L, A0, R2, Assembler::AVX_512bit); + __ evpmadd52huq(P2H, A0, R2, Assembler::AVX_512bit); + __ evpmadd52luq(P0L, A0, R0, Assembler::AVX_512bit); + __ evpmadd52huq(P0H, A0, R0, Assembler::AVX_512bit); + + // p0 += a1×r2' + // p1 += a1×r0 + // p2 += a1×r1 + __ evpmadd52luq(P0L, A1, R2P, Assembler::AVX_512bit); + __ evpmadd52huq(P0H, A1, R2P, Assembler::AVX_512bit); + __ evpmadd52luq(P1L, A1, R0, Assembler::AVX_512bit); + __ evpmadd52huq(P1H, A1, R0, Assembler::AVX_512bit); + __ evpmadd52luq(P2L, A1, R1, Assembler::AVX_512bit); + __ evpmadd52huq(P2H, A1, R1, Assembler::AVX_512bit); + + // Carry propagation: + // (Not quite aligned) | More mathematically correct: + // P2L P1L P0L | P2L×2^88 + P1L×2^44 + P0L×2^0 + // + P2H P1H P0H | + P2H×2^140 + P1H×2^96 + P0H×2^52 + // --------------------------- | ----------------------------------------------- + // = P2H A2 A1 A0 | = P2H×2^130 + A2×2^88 + A1×2^44 + A0×2^0 + // + __ vpsrlq(TMP, P0L, 44, Assembler::AVX_512bit); + __ evpandq(A0, P0L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits + + __ vpsllq(P0H, P0H, 8, Assembler::AVX_512bit); + __ vpaddq(P0H, P0H, TMP, Assembler::AVX_512bit); + __ vpaddq(P1L, P1L, P0H, Assembler::AVX_512bit); + __ evpandq(A1, P1L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits + + __ vpsrlq(TMP, P1L,
44, Assembler::AVX_512bit); + __ vpsllq(P1H, P1H, 8, Assembler::AVX_512bit); + __ vpaddq(P1H, P1H, TMP, Assembler::AVX_512bit); + __ vpaddq(P2L, P2L, P1H, Assembler::AVX_512bit); + __ evpandq(A2, P2L, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits + + __ vpsrlq(TMP, P2L, 42, Assembler::AVX_512bit); + __ vpsllq(P2H, P2H, 10, Assembler::AVX_512bit); + __ vpaddq(P2H, P2H, TMP, Assembler::AVX_512bit); + + // Reduction: p2->a0->a1 + // Multiply by 5 the highest bits (p2 is above 130 bits) + __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit); + __ vpsllq(P2H, P2H, 2, Assembler::AVX_512bit); + __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit); + __ vpsrlq(TMP, A0, 44, Assembler::AVX_512bit); + __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); + __ vpaddq(A1, A1, TMP, Assembler::AVX_512bit); +} + +// Compute product for a single 16-byte message block +// - Assumes that r = [r1 r0] is only 128 bits (not 130) +// - Input [a2 a1 a0]; when only128 is set, input is 128 bits (i.e. a2==0) +// - Output [a2 a1 a0] is at least 130 bits (i.e. a2 is used regardless of only128) +// +// Note 1: a2 here is only two bits so anything above is subject to reduction. +// Note 2: Constant c1 = 5×r1/4 = r1 + (r1 >> 2) (exact because clamping makes r1 divisible by 4) simplifies the multiply with fewer operations +// +// Flow of the code below is as follows: +// +// a2 a1 a0 +// x r1 r0 +// ----------------------------- +// a2×r0 a1×r0 a0×r0 +// + a0×r1 +// + 5xa2xr1 5xa1xr1 +// ----------------------------- +// [0|L2L] [L1H|L1L] [L0H|L0L] +// +// Registers: t2:t1 t0:a0 +// +// Completing the multiply and adding (with carry) 3x128-bit limbs into +// 192-bits again (3x64-bits): +// a0 = L0L +// a1 = L0H + L1L +// t2 = L1H + L2L +void StubGenerator::poly1305_multiply_scalar( + const Register a0, const Register a1, const Register a2, + const Register r0, const Register r1, const Register c1, bool only128, + const Register t0, const Register t1, const Register t2, + const Register mulql, const Register mulqh) +{ + // mulq instruction requires/clobbers rax, rdx (mulql, mulqh) + + // t2:t1 = (a0 * r1) + __ movq(rax, r1); + __ mulq(a0); + __ movq(t1, rax); + __ movq(t2, rdx); + + // t0:a0 = (a0 * r0) + __ movq(rax, r0); + __ mulq(a0); + __ movq(a0, rax); // a0 not used in other operations + __ movq(t0, rdx); + + // t2:t1 += (a1 * r0) + __ movq(rax, r0); + __ mulq(a1); + __ addq(t1, rax); + __ adcq(t2, rdx); + + // t0:a0 += (a1 * r1x5) + __ movq(rax, c1); + __ mulq(a1); + __ addq(a0, rax); + __ adcq(t0, rdx); + + // Note: a2 is clamped to 2-bits, + // r1/r0 are clamped to 60-bits, + // their product is less than 2^64. + + if (only128) { // Accumulator only 128 bits, i.e. a2 == 0 + // just move and add t0-t1 to a1 + __ movq(a1, t0); + __ addq(a1, t1); + __ adcq(t2, 0); + } else { + // t2:t1 += (a2 * r1x5) + __ movq(a1, a2); // use a1 for a2 + __ imulq(a1, c1); + __ addq(t1, a1); + __ adcq(t2, 0); + + __ movq(a1, t0); // t0:a0 => a1:a0 + + // t2:a1 += (a2 * r0):t1 + __ imulq(a2, r0); + __ addq(a1, t1); + __ adcq(t2, a2); + } + + // At this point, 3 64-bit limbs are in t2:a1:a0 + // t2 can span over more than 2 bits so final partial reduction step is needed.
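+ // For example, if t2 = 0b111 then a2 keeps the two low bits (0b11) and the remaining high bit, at weight 2^130, folds back in multiplied by 5 (4 + 1 = 5), per the reduction identity at the top of this file.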
+ // + // Partial reduction (just to fit into 130 bits) + // a2 = t2 & 3 + // k = (t2 & ~3) + (t2 >> 2) + // = Y×4 + Y×1 = 5×Y, where Y = t2 >> 2 + // a2:a1:a0 += k + // + // Result will be in a2:a1:a0 + __ movq(t0, t2); + __ movl(a2, t2); // DWORD + __ andq(t0, ~3); + __ shrq(t2, 2); + __ addq(t0, t2); + __ andl(a2, 3); // DWORD + + // a2:a1:a0 += k (kept in t0) + __ addq(a0, t0); + __ adcq(a1, 0); + __ adcl(a2, 0); // DWORD +} + +// Convert array of 128-bit numbers in quadwords (in D0:D1) into 128-bit numbers across 44-bit limbs (in L0:L1:L2) +// Optionally pad all the numbers (i.e. add 2^128) +// +// +-------------------------+-------------------------+ +// D0:D1 | h0 h1 g0 g1 f0 f1 e0 e1 | d0 d1 c0 c1 b0 b1 a0 a1 | +// +-------------------------+-------------------------+ +// +-------------------------+ +// L2 | h2 d2 g2 c2 f2 b2 e2 a2 | +// +-------------------------+ +// +-------------------------+ +// L1 | h1 d1 g1 c1 f1 b1 e1 a1 | +// +-------------------------+ +// +-------------------------+ +// L0 | h0 d0 g0 c0 f0 b0 e0 a0 | +// +-------------------------+ +// +void StubGenerator::poly1305_limbs_avx512( + const XMMRegister D0, const XMMRegister D1, + const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, + const XMMRegister TMP, const Register rscratch) +{ + // Interleave blocks of data + __ evpunpckhqdq(TMP, D0, D1, Assembler::AVX_512bit); + __ evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit); + + // Highest 42-bit limbs of new blocks + __ vpsrlq(L2, TMP, 24, Assembler::AVX_512bit); + if (padMSG) { + __ evporq(L2, L2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_512bit, rscratch); // Add 2^128 to all 8 final qwords of the message + } + + // Middle 44-bit limbs of new blocks + __ vpsrlq(L1, L0, 44, Assembler::AVX_512bit); + __ vpsllq(TMP, TMP, 20, Assembler::AVX_512bit); + __ vpternlogq(L1, 0xA8, TMP, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // L1 = (L1 | TMP) & mask44 + + // Lowest 44-bit limbs of new blocks + __ evpandq(L0, L0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); +} + +/** + * Copy 5×26-bit (unreduced) limbs stored at Register limbs into a2:a1:a0 (3×64-bit limbs) + * + * a2 is optional: when a2 == noreg, the limbs are expected to fit into 128 bits (i.e.
a1:a0, such as the clamped R). */ +void StubGenerator::poly1305_limbs( + const Register limbs, const Register a0, const Register a1, const Register a2, + const Register t0, const Register t1) +{ + __ movq(a0, Address(limbs, 0)); + __ movq(t0, Address(limbs, 8)); + __ shlq(t0, 26); + __ addq(a0, t0); + __ movq(t0, Address(limbs, 16)); + __ movq(t1, Address(limbs, 24)); + __ movq(a1, t0); + __ shlq(t0, 52); + __ shrq(a1, 12); + __ shlq(t1, 14); + __ addq(a0, t0); + __ adcq(a1, t1); + __ movq(t0, Address(limbs, 32)); + if (a2 != noreg) { + __ movq(a2, t0); + __ shrq(a2, 24); + } + __ shlq(t0, 40); + __ addq(a1, t0); + if (a2 != noreg) { + __ adcq(a2, 0); + + // One round of reduction + // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 + __ movq(t0, a2); + __ andq(t0, ~3); + __ andq(a2, 3); + __ movq(t1, t0); + __ shrq(t1, 2); + __ addq(t0, t1); + + __ addq(a0, t0); + __ adcq(a1, 0); + __ adcq(a2, 0); + } +} + +/** + * Break 3×64-bit a2:a1:a0 limbs into 5×26-bit limbs and store out into 5 quadwords at address `limbs` + */ +void StubGenerator::poly1305_limbs_out( + const Register a0, const Register a1, const Register a2, + const Register limbs, + const Register t0, const Register t1) +{ + // Extra round of reduction + // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 + __ movq(t0, a2); + __ andq(t0, ~3); + __ andq(a2, 3); + __ movq(t1, t0); + __ shrq(t1, 2); + __ addq(t0, t1); + + __ addq(a0, t0); + __ adcq(a1, 0); + __ adcq(a2, 0); + + // Chop a2:a1:a0 into 26-bit limbs + __ movl(t0, a0); + __ andl(t0, 0x3ffffff); + __ movq(Address(limbs, 0), t0); + + __ shrq(a0, 26); + __ movl(t0, a0); + __ andl(t0, 0x3ffffff); + __ movq(Address(limbs, 8), t0); + + __ shrq(a0, 26); // 12 bits left in a0, concatenate 14 from a1 + __ movl(t0, a1); + __ shll(t0, 12); + __ addl(t0, a0); + __ andl(t0, 0x3ffffff); + __ movq(Address(limbs, 16), t0); + + __ shrq(a1, 14); // already used up 14 bits + __ shlq(a2, 50); // a2 contains 2 bits when reduced, but $Element.limbs don't have to be fully reduced + __ addq(a1, a2); // put remaining bits into a1 + + __ movl(t0, a1); + __ andl(t0, 0x3ffffff); + __ movq(Address(limbs, 24), t0); + + __ shrq(a1, 26); + __ movl(t0, a1); + //andl(t0, 0x3ffffff); doesn't have to be fully reduced, leave remaining bit(s) + __ movq(Address(limbs, 32), t0); +} + +// This function consumes as many whole 16*16-byte blocks as available in input +// After execution, input and length will point at remaining (unprocessed) data +// and [a2 a1 a0] will contain the current accumulator value +// +// Math Note: +// Main loop in this function multiplies each message block by r^16; and some glue before and after. +// Proof (for brevity, split into 4 'rows' instead of 16): +// +// hash = ((((m1*r + m2)*r + m3)*r ... mn)*r +// = m1*r^n + m2*r^(n-1) + ... + mn_1*r^2 + mn*r // Horner's rule (mn_k denotes the block with index n-k) +// +// = m1*r^n + m5*r^(n-4) + m9*r^(n-8) ... // split into 4 groups for brevity, same applies to 16 blocks +// + m2*r^(n-1) + m6*r^(n-5) + m10*r^(n-9) ... +// + m3*r^(n-2) + m7*r^(n-6) + m11*r^(n-10) ... +// + m4*r^(n-3) + m8*r^(n-7) + m12*r^(n-11) ... +// +// = r^4 * (m1*r^(n-4) + m5*r^(n-8) + m9 *r^(n-12) ... + mn_3) // factor out r^4..r; same applies to 16 but r^16..r factors +// + r^3 * (m2*r^(n-4) + m6*r^(n-8) + m10*r^(n-12) ... + mn_2) +// + r^2 * (m3*r^(n-4) + m7*r^(n-8) + m11*r^(n-12) ... + mn_1) +// + r^1 * (m4*r^(n-4) + m8*r^(n-8) + m12*r^(n-12) ... + mn_0) // Note last column: message group has no multiplier +// +// = (((m1*r^4 + m5)*r^4 + m9 )*r^4 ...
+ mn_3) * r^4 // reverse Horner's rule, for each group +// + (((m2*r^4 + m6)*r^4 + m10)*r^4 ... + mn_2) * r^3 // each column is multiplied by r^4, except last +// + (((m3*r^4 + m7)*r^4 + m11)*r^4 ... + mn_1) * r^2 +// + (((m4*r^4 + m8)*r^4 + m12)*r^4 ... + mn_0) * r^1 +// +// Also see M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code" +// +// Pseudocode: +// * used for poly1305_multiply_scalar +// × used for poly1305_multiply8_avx512 +// lower-case variables are scalar numbers in 3×44-bit limbs (in gprs) +// upper-case variables are 8- and 16-element vector numbers in 3×44-bit limbs (in zmm registers) +// +// CL = a // [0 0 0 0 0 0 0 a] +// AL = poly1305_limbs_avx512(input) +// AH = poly1305_limbs_avx512(input+8) +// AL = AL + CL +// input+=16, length-=16 +// +// a = r +// a = a*r +// r^2 = a +// a = a*r +// r^3 = a +// r = a*r +// r^4 = a +// +// T = r^4 || r^3 || r^2 || r +// B = limbs(T) // [r^4 0 r^3 0 r^2 0 r^1 0 ] +// CL = B >> 1 // [ 0 r^4 0 r^3 0 r^2 0 r^1] +// R = r^4 || r^4 || .. // [r^4 r^4 r^4 r^4 r^4 r^4 r^4 r^4] +// B = B×R // [r^8 0 r^7 0 r^6 0 r^5 0 ] +// B = B | CL // [r^8 r^4 r^7 r^3 r^6 r^2 r^5 r^1] +// CL = B +// R = r^8 || r^8 || .. // [r^8 r^8 r^8 r^8 r^8 r^8 r^8 r^8] +// B = B × R // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9] +// CH = B +// R = r^16 || r^16 || .. // [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16] +// +// for (;length>=16; input+=16, length-=16) +// BL = poly1305_limbs_avx512(input) +// BH = poly1305_limbs_avx512(input+8) +// AL = AL × R +// AH = AH × R +// AL = AL + BL +// AH = AH + BH +// +// AL = AL × CL +// AH = AH × CH +// A = AL + AH // 16->8 blocks +// T = A >> 4 // 8 ->4 blocks +// A = A + T +// T = A >> 2 // 4 ->2 blocks +// A = A + T +// T = A >> 1 // 2 ->1 blocks +// A = A + T +// a = A +// +// Register Map: +// GPRs: +// input = rdi +// length = rbx +// accumulator = rcx +// R = r8 +// a0 = rsi +// a1 = r9 +// a2 = r10 +// r0 = r11 +// r1 = r12 +// c1 = r8 +// t0 = r13 +// t1 = r14 +// t2 = r15 +// rscratch = r13 +// stack(rsp, rbp) +// mulq(rax, rdx) in poly1305_multiply_scalar +// +// ZMMs: +// D: xmm0-1 +// TMP: xmm2 +// T: xmm3-8 +// A: xmm9-14 +// B: xmm15-20 +// C: xmm21-26 +// R: xmm27-31 +void StubGenerator::poly1305_process_blocks_avx512( + const Register input, const Register length, + const Register a0, const Register a1, const Register a2, + const Register r0, const Register r1, const Register c1) +{ + Label L_process256Loop, L_process256LoopDone; + const Register t0 = r13; + const Register t1 = r14; + const Register t2 = r15; + const Register rscratch = r13; + const Register mulql = rax; + const Register mulqh = rdx; + + const XMMRegister D0 = xmm0; + const XMMRegister D1 = xmm1; + const XMMRegister TMP = xmm2; + + const XMMRegister T0 = xmm3; + const XMMRegister T1 = xmm4; + const XMMRegister T2 = xmm5; + const XMMRegister T3 = xmm6; + const XMMRegister T4 = xmm7; + const XMMRegister T5 = xmm8; + + const XMMRegister A0 = xmm9; + const XMMRegister A1 = xmm10; + const XMMRegister A2 = xmm11; + const XMMRegister A3 = xmm12; + const XMMRegister A4 = xmm13; + const XMMRegister A5 = xmm14; + + const XMMRegister B0 = xmm15; + const XMMRegister B1 = xmm16; + const XMMRegister B2 = xmm17; + const XMMRegister B3 = xmm18; + const XMMRegister B4 = xmm19; + const XMMRegister B5 = xmm20; + + const XMMRegister C0 = xmm21; + const XMMRegister C1 = xmm22; + const XMMRegister C2 = xmm23; + const XMMRegister C3 = xmm24; + const XMMRegister C4 = xmm25; + const XMMRegister C5 = xmm26; + + const XMMRegister R0 = xmm27; + const
XMMRegister R1 = xmm28; + const XMMRegister R2 = xmm29; + const XMMRegister R1P = xmm30; + const XMMRegister R2P = xmm31; + + // Spread accumulator into 44-bit limbs in quadwords C0,C1,C2 + __ movq(t0, a0); + __ andq(t0, ExternalAddress(poly1305_mask44()), rscratch); // First limb (Acc[43:0]) + __ movq(C0, t0); + + __ movq(t0, a1); + __ shrdq(a0, t0, 44); + __ andq(a0, ExternalAddress(poly1305_mask44()), rscratch); // Second limb (Acc[87:44]) + __ movq(C1, a0); + + __ shrdq(a1, a2, 24); + __ andq(a1, ExternalAddress(poly1305_mask42()), rscratch); // Third limb (Acc[129:88]) + __ movq(C2, a1); + + // To add accumulator, we must unroll first loop iteration + + // Load first block of data (128 bytes) and pad + // A0 to have bits 0-43 of all 8 blocks in 8 qwords + // A1 to have bits 87-44 of all 8 blocks in 8 qwords + // A2 to have bits 127-88 of all 8 blocks in 8 qwords + __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit); + __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit); + poly1305_limbs_avx512(D0, D1, A0, A1, A2, true, TMP, rscratch); + + // Add accumulator to the first message block + __ vpaddq(A0, A0, C0, Assembler::AVX_512bit); + __ vpaddq(A1, A1, C1, Assembler::AVX_512bit); + __ vpaddq(A2, A2, C2, Assembler::AVX_512bit); + + // Load next blocks of data (128 bytes) and pad + // A3 to have bits 0-43 of all 8 blocks in 8 qwords + // A4 to have bits 87-44 of all 8 blocks in 8 qwords + // A5 to have bits 127-88 of all 8 blocks in 8 qwords + __ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit); + __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit); + poly1305_limbs_avx512(D0, D1, A3, A4, A5, true, TMP, rscratch); + + __ subl(length, 16*16); + __ lea(input, Address(input,16*16)); + + // Compute the powers of R^1..R^4 and form 44-bit limbs of each + // T0 to have bits 0-127 in 4 quadword pairs + // T1 to have bits 128-129 in alternating 8 qwords + __ vpxorq(T1, T1, T1, Assembler::AVX_512bit); + __ movq(T2, r0); + __ vpinsrq(T2, T2, r1, 1); + __ vinserti32x4(T0, T0, T2, 3); + + // Calculate R^2 + __ movq(a0, r0); + __ movq(a1, r1); + // "Clever": a2 not set because poly1305_multiply_scalar has a flag to indicate 128-bit accumulator + poly1305_multiply_scalar(a0, a1, a2, + r0, r1, c1, true, + t0, t1, t2, mulql, mulqh); + + __ movq(T2, a0); + __ vpinsrq(T2, T2, a1, 1); + __ vinserti32x4(T0, T0, T2, 2); + __ movq(T2, a2); + __ vinserti32x4(T1, T1, T2, 2); + + // Calculate R^3 + poly1305_multiply_scalar(a0, a1, a2, + r0, r1, c1, false, + t0, t1, t2, mulql, mulqh); + + __ movq(T2, a0); + __ vpinsrq(T2, T2, a1, 1); + __ vinserti32x4(T0, T0, T2, 1); + __ movq(T2, a2); + __ vinserti32x4(T1, T1, T2, 1); + + // Calculate R^4 + poly1305_multiply_scalar(a0, a1, a2, + r0, r1, c1, false, + t0, t1, t2, mulql, mulqh); + + __ movq(T2, a0); + __ vpinsrq(T2, T2, a1, 1); + __ vinserti32x4(T0, T0, T2, 0); + __ movq(T2, a2); + __ vinserti32x4(T1, T1, T2, 0); + + // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty) + // B0 to have bits 0-43 of all 4 blocks in alternating 8 qwords + // B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords + // B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords + __ vpxorq(T2, T2, T2, Assembler::AVX_512bit); + poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, TMP, rscratch); + + // T1 contains the 2 highest bits of the powers of R + __ vpsllq(T1, T1, 40, Assembler::AVX_512bit); + __ evporq(B2, B2, T1, Assembler::AVX_512bit); + + // Broadcast 44-bit limbs of R^4 into R0,R1,R2 + __ mov(t0, a0); + __ andq(t0,
ExternalAddress(poly1305_mask44()), rscratch); // First limb (R^4[43:0]) + __ evpbroadcastq(R0, t0, Assembler::AVX_512bit); + + __ movq(t0, a1); + __ shrdq(a0, t0, 44); + __ andq(a0, ExternalAddress(poly1305_mask44()), rscratch); // Second limb (R^4[87:44]) + __ evpbroadcastq(R1, a0, Assembler::AVX_512bit); + + __ shrdq(a1, a2, 24); + __ andq(a1, ExternalAddress(poly1305_mask42()), rscratch); // Third limb (R^4[129:88]) + __ evpbroadcastq(R2, a1, Assembler::AVX_512bit); + + // Generate 4*5*R^4 into {R2P,R1P} + // Used as multiplier in poly1305_multiply8_avx512 so we can + // ignore the bottom limb and carry propagation + __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit); // 4*R^4 + __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit); // 5*R^4 + __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^4 + __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); + + // Move R^4..R^1 one element over + __ vpslldq(C0, B0, 8, Assembler::AVX_512bit); + __ vpslldq(C1, B1, 8, Assembler::AVX_512bit); + __ vpslldq(C2, B2, 8, Assembler::AVX_512bit); + + // Calculate R^8-R^5 + poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^4..R^1 + R0, R1, R2, R1P, R2P, // R^4..R^4, 4*5*R^4 + T0, T1, T2, T3, T4, T5, TMP, rscratch); + + // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R + __ evporq(B0, B0, C0, Assembler::AVX_512bit); + __ evporq(B1, B1, C1, Assembler::AVX_512bit); + __ evporq(B2, B2, C2, Assembler::AVX_512bit); + + // Store R^8-R for later use + __ evmovdquq(C0, B0, Assembler::AVX_512bit); + __ evmovdquq(C1, B1, Assembler::AVX_512bit); + __ evmovdquq(C2, B2, Assembler::AVX_512bit); + + // Broadcast R^8 + __ vpbroadcastq(R0, B0, Assembler::AVX_512bit); + __ vpbroadcastq(R1, B1, Assembler::AVX_512bit); + __ vpbroadcastq(R2, B2, Assembler::AVX_512bit); + + // Generate 4*5*R^8 + __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit); + __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit); // 5*R^8 + __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^8 + __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); + + // Calculate R^16-R^9 + poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^8..R^1 + R0, R1, R2, R1P, R2P, // R^8..R^8, 4*5*R^8 + T0, T1, T2, T3, T4, T5, TMP, rscratch); + + // Store R^16-R^9 for later use + __ evmovdquq(C3, B0, Assembler::AVX_512bit); + __ evmovdquq(C4, B1, Assembler::AVX_512bit); + __ evmovdquq(C5, B2, Assembler::AVX_512bit); + + // Broadcast R^16 + __ vpbroadcastq(R0, B0, Assembler::AVX_512bit); + __ vpbroadcastq(R1, B1, Assembler::AVX_512bit); + __ vpbroadcastq(R2, B2, Assembler::AVX_512bit); + + // Generate 4*5*R^16 + __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit); + __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit); // 5*R^16 + __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^16 + __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); + + // VECTOR LOOP: process 16 * 16-byte message blocks at a time + __ bind(L_process256Loop); + __ cmpl(length, 16*16); + __ jcc(Assembler::less, L_process256LoopDone); + + // Load and interleave next block of data (128 bytes) + __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit); + __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit); + poly1305_limbs_avx512(D0, D1, B0, B1, B2, true, TMP, rscratch); + + // Load and interleave next block of data (128 bytes) + __ evmovdquq(D0, Address(input,
64*2), Assembler::AVX_512bit); + __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit); + poly1305_limbs_avx512(D0, D1, B3, B4, B5, true, TMP, rscratch); + + poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks + R0, R1, R2, R1P, R2P, // R^16..R^16, 4*5*R^16 + T0, T1, T2, T3, T4, T5, TMP, rscratch); + poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks + R0, R1, R2, R1P, R2P, // R^16..R^16, 4*5*R^16 + T0, T1, T2, T3, T4, T5, TMP, rscratch); + + __ vpaddq(A0, A0, B0, Assembler::AVX_512bit); // Add lowest 44-bit limbs from new blocks to accumulator + __ vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator + __ vpaddq(A2, A2, B2, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator + __ vpaddq(A3, A3, B3, Assembler::AVX_512bit); // Add lowest 44-bit limbs from new blocks to accumulator + __ vpaddq(A4, A4, B4, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator + __ vpaddq(A5, A5, B5, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator + + __ subl(length, 16*16); + __ lea(input, Address(input,16*16)); + __ jmp(L_process256Loop); + + __ bind(L_process256LoopDone); + + // Tail processing: Need to multiply ACC by R^16..R^1 and add it all up into a single scalar value + // Generate 4*5*[R^16..R^9] (ignore lowest limb) + // Use D0 ~ R1P, D1 ~ R2P for higher powers + __ vpsllq(R1P, C4, 2, Assembler::AVX_512bit); + __ vpsllq(R2P, C5, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, C4, Assembler::AVX_512bit); // 5*[R^16..R^9] + __ vpaddq(R2P, R2P, C5, Assembler::AVX_512bit); + __ vpsllq(D0, R1P, 2, Assembler::AVX_512bit); // 4*5*[R^16..R^9] + __ vpsllq(D1, R2P, 2, Assembler::AVX_512bit); + + // Generate 4*5*[R^8..R^1] (ignore lowest limb) + __ vpsllq(R1P, C1, 2, Assembler::AVX_512bit); + __ vpsllq(R2P, C2, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, C1, Assembler::AVX_512bit); // 5*[R^8..R^1] + __ vpaddq(R2P, R2P, C2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*[R^8..R^1] + __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); + + poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks + C3, C4, C5, D0, D1, // R^16-R^9, R1P, R2P + T0, T1, T2, T3, T4, T5, TMP, rscratch); + poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks + C0, C1, C2, R1P, R2P, // R^8-R, R1P, R2P + T0, T1, T2, T3, T4, T5, TMP, rscratch); + + // Add all blocks (horizontally) + // 16->8 blocks + __ vpaddq(A0, A0, A3, Assembler::AVX_512bit); + __ vpaddq(A1, A1, A4, Assembler::AVX_512bit); + __ vpaddq(A2, A2, A5, Assembler::AVX_512bit); + + // 8 -> 4 blocks + __ vextracti64x4(T0, A0, 1); + __ vextracti64x4(T1, A1, 1); + __ vextracti64x4(T2, A2, 1); + __ vpaddq(A0, A0, T0, Assembler::AVX_256bit); + __ vpaddq(A1, A1, T1, Assembler::AVX_256bit); + __ vpaddq(A2, A2, T2, Assembler::AVX_256bit); + + // 4 -> 2 blocks + __ vextracti32x4(T0, A0, 1); + __ vextracti32x4(T1, A1, 1); + __ vextracti32x4(T2, A2, 1); + __ vpaddq(A0, A0, T0, Assembler::AVX_128bit); + __ vpaddq(A1, A1, T1, Assembler::AVX_128bit); + __ vpaddq(A2, A2, T2, Assembler::AVX_128bit); + + // 2 -> 1 blocks + __ vpsrldq(T0, A0, 8, Assembler::AVX_128bit); + __ vpsrldq(T1, A1, 8, Assembler::AVX_128bit); + __ vpsrldq(T2, A2, 8, Assembler::AVX_128bit); + + // Finish folding and clear second qword + __ mov64(t0, 0xfd); + __ kmovql(k1, t0); + __ evpaddq(A0, k1, A0, T0, false, Assembler::AVX_512bit); + __ evpaddq(A1, k1, A1, T1, false, Assembler::AVX_512bit); + __ evpaddq(A2, k1, A2, T2, false, Assembler::AVX_512bit); + + // Carry
propagation + __ vpsrlq(D0, A0, 44, Assembler::AVX_512bit); + __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits + __ vpaddq(A1, A1, D0, Assembler::AVX_512bit); + __ vpsrlq(D0, A1, 44, Assembler::AVX_512bit); + __ evpandq(A1, A1, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits + __ vpaddq(A2, A2, D0, Assembler::AVX_512bit); + __ vpsrlq(D0, A2, 42, Assembler::AVX_512bit); + __ evpandq(A2, A2, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits + __ vpsllq(D1, D0, 2, Assembler::AVX_512bit); + __ vpaddq(D0, D0, D1, Assembler::AVX_512bit); + __ vpaddq(A0, A0, D0, Assembler::AVX_512bit); + + // Put together A (accumulator) + __ movq(a0, A0); + + __ movq(t0, A1); + __ movq(t1, t0); + __ shlq(t1, 44); + __ shrq(t0, 20); + + __ movq(a2, A2); + __ movq(a1, a2); + __ shlq(a1, 24); + __ shrq(a2, 40); + + __ addq(a0, t1); + __ adcq(a1, t0); + __ adcq(a2, 0); + + // Cleanup + // Zero out zmm0-zmm31 (vzeroall covers zmm0-zmm15; the explicit loop clears zmm16-zmm31). + __ vzeroall(); + for (XMMRegister rxmm = xmm16; rxmm->is_valid(); rxmm = rxmm->successor()) { + __ vpxorq(rxmm, rxmm, rxmm, Assembler::AVX_512bit); + } +} + +// This function consumes as many whole 16-byte blocks as available in input +// After execution, input and length will point at remaining (unprocessed) data +// and accumulator will point to the current accumulator value +address StubGenerator::generate_poly1305_processBlocks() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); + address start = __ pc(); + __ enter(); + + // Save all 'SOE' registers + __ push(rbx); + #ifdef _WIN64 + __ push(rsi); + __ push(rdi); + #endif + __ push(r12); + __ push(r13); + __ push(r14); + __ push(r15); + + // Register Map + const Register input = rdi; + const Register length = rbx; + const Register accumulator = rcx; + const Register R = r8; + + const Register a0 = rsi; // [in/out] accumulator bits 63..0 + const Register a1 = r9; // [in/out] accumulator bits 127..64 + const Register a2 = r10; // [in/out] accumulator bits 191..128 + const Register r0 = r11; // R constant bits 63..0 + const Register r1 = r12; // R constant bits 127..64 + const Register c1 = r8; // 5*R (upper limb only) + const Register t0 = r13; + const Register t1 = r14; + const Register t2 = r15; + const Register mulql = rax; + const Register mulqh = rdx; + + // Normalize input + // pseudo-signature: void poly1305_processBlocks(byte[] input, int length, long[5] accumulator, long[5] R) + // input, a, r pointers point at first array element + // java headers bypassed in LibraryCallKit::inline_poly1305_processBlocks + #ifdef _WIN64 + // c_rarg0 - rcx + // c_rarg1 - rdx + // c_rarg2 - r8 + // c_rarg3 - r9 + __ mov(input, c_rarg0); + __ mov(length, c_rarg1); + __ mov(accumulator, c_rarg2); + __ mov(R, c_rarg3); + #else + // c_rarg0 - rdi + // c_rarg1 - rsi + // c_rarg2 - rdx + // c_rarg3 - rcx + // don't clobber R, args copied out-of-order + __ mov(length, c_rarg1); + __ mov(R, c_rarg3); + __ mov(accumulator, c_rarg2); + #endif + + Label L_process16Loop, L_process16LoopDone; + + // Load R into r1:r0 + poly1305_limbs(R, r0, r1, noreg, t0, t1); + + // Compute 5*R (Upper limb only) + __ movq(c1, r1); + __ shrq(c1, 2); + __ addq(c1, r1); // c1 = r1 + (r1 >> 2) = 5*r1/4, exact since r1's bottom two bits are clamped to zero + + // Load accumulator into a2:a1:a0 + poly1305_limbs(accumulator, a0, a1, a2, t0, t1); + + // VECTOR LOOP: Minimum of 256 bytes to run vectorized code + __ cmpl(length, 16*16); + __ jcc(Assembler::less,
+
+  poly1305_process_blocks_avx512(input, length,
+                                 a0, a1, a2,
+                                 r0, r1, c1);
+
+  // SCALAR LOOP: process one 16-byte message block at a time
+  __ bind(L_process16Loop);
+  __ cmpl(length, 16);
+  __ jcc(Assembler::less, L_process16LoopDone);
+
+  __ addq(a0, Address(input, 0));
+  __ adcq(a1, Address(input, 8));
+  __ adcq(a2, 1);                  // add the 2^128 pad bit of the block
+  poly1305_multiply_scalar(a0, a1, a2,
+                           r0, r1, c1, false,
+                           t0, t1, t2, mulql, mulqh);
+
+  __ subl(length, 16);
+  __ lea(input, Address(input, 16));
+  __ jmp(L_process16Loop);
+  __ bind(L_process16LoopDone);
+
+  // Write output
+  poly1305_limbs_out(a0, a1, a2, accumulator, t0, t1);
+
+  __ pop(r15);
+  __ pop(r14);
+  __ pop(r13);
+  __ pop(r12);
+  #ifdef _WIN64
+  __ pop(rdi);
+  __ pop(rsi);
+  #endif
+  __ pop(rbx);
+
+  __ leave();
+  __ ret(0);
+  return start;
+}
diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp
index 989536da2a552..bb98fcf46cd37 100644
--- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp
+++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp
@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_
 enum platform_dependent_constants {
   code_size1 = 20000 LP64_ONLY(+10000),                    // simply increase if too small (assembler will crash if too small)
-  code_size2 = 35300 LP64_ONLY(+35000) WINDOWS_ONLY(+2048) // simply increase if too small (assembler will crash if too small)
+  code_size2 = 35300 LP64_ONLY(+45000) WINDOWS_ONLY(+2048) // simply increase if too small (assembler will crash if too small)
 };

 class x86 {
diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp
index c8ecc68468469..377ff2a456614 100644
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp
@@ -950,6 +950,7 @@ void VM_Version::get_processor_features() {
     _features &= ~CPU_AVX512_VBMI;
     _features &= ~CPU_AVX512_VBMI2;
     _features &= ~CPU_AVX512_BITALG;
+    _features &= ~CPU_AVX512_IFMA;
   }

   if (UseAVX < 2)
@@ -981,6 +982,7 @@ void VM_Version::get_processor_features() {
       _features &= ~CPU_FLUSHOPT;
       _features &= ~CPU_GFNI;
       _features &= ~CPU_AVX512_BITALG;
+      _features &= ~CPU_AVX512_IFMA;
     }
   }

@@ -1333,6 +1335,18 @@ void VM_Version::get_processor_features() {
   }
 #endif // COMPILER2 && ASSERT

+#ifdef _LP64
+  if (supports_avx512ifma() && supports_avx512vlbw() && MaxVectorSize >= 64) {
+    if (FLAG_IS_DEFAULT(UsePolyIntrinsics)) {
+      FLAG_SET_DEFAULT(UsePolyIntrinsics, true);
+    }
+  } else
+#endif
+  if (UsePolyIntrinsics) {
+    warning("Intrinsics for Poly1305 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UsePolyIntrinsics, false);
+  }
+
 #ifdef _LP64
   if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     UseMultiplyToLenIntrinsic = true;
@@ -2897,6 +2911,8 @@ uint64_t VM_Version::feature_flags() {
     result |= CPU_AVX512CD;
   if (_cpuid_info.sef_cpuid7_ebx.bits.avx512dq != 0)
     result |= CPU_AVX512DQ;
+  if (_cpuid_info.sef_cpuid7_ebx.bits.avx512ifma != 0)
+    result |= CPU_AVX512_IFMA;
   if (_cpuid_info.sef_cpuid7_ebx.bits.avx512pf != 0)
     result |= CPU_AVX512PF;
   if (_cpuid_info.sef_cpuid7_ebx.bits.avx512er != 0)
diff --git a/src/hotspot/cpu/x86/vm_version_x86.hpp b/src/hotspot/cpu/x86/vm_version_x86.hpp
index 1f4cfd39e8d0f..9213d42bc572f 100644
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp
@@ -223,7 +223,9 @@ class VM_Version : public Abstract_VM_Version {
                avx512dq   : 1,
                           : 1,
                adx        : 1,
-                          : 3,
+                          : 1,
+               avx512ifma : 1,
+                          : 1,
                clflushopt : 1,
                clwb       : 1,
                           : 1,
@@ -387,7 +389,8 @@ class VM_Version : public
Abstract_VM_Version { decl(PKU, "pku", 54) /* Protection keys for user-mode pages */ \ decl(OSPKE, "ospke", 55) /* OS enables protection keys */ \ decl(CET_IBT, "cet_ibt", 56) /* Control Flow Enforcement - Indirect Branch Tracking */ \ - decl(CET_SS, "cet_ss", 57) /* Control Flow Enforcement - Shadow Stack */ + decl(CET_SS, "cet_ss", 57) /* Control Flow Enforcement - Shadow Stack */ \ + decl(AVX512_IFMA, "avx512_ifma", 58) /* Integer Vector FMA instructions*/ #define DECLARE_CPU_FEATURE_FLAG(id, name, bit) CPU_##id = (1ULL << bit), CPU_FEATURE_FLAGS(DECLARE_CPU_FEATURE_FLAG) @@ -667,6 +670,7 @@ class VM_Version : public Abstract_VM_Version { static bool supports_adx() { return (_features & CPU_ADX) != 0; } static bool supports_evex() { return (_features & CPU_AVX512F) != 0; } static bool supports_avx512dq() { return (_features & CPU_AVX512DQ) != 0; } + static bool supports_avx512ifma() { return (_features & CPU_AVX512_IFMA) != 0; } static bool supports_avx512pf() { return (_features & CPU_AVX512PF) != 0; } static bool supports_avx512er() { return (_features & CPU_AVX512ER) != 0; } static bool supports_avx512cd() { return (_features & CPU_AVX512CD) != 0; } diff --git a/src/hotspot/share/classfile/vmIntrinsics.cpp b/src/hotspot/share/classfile/vmIntrinsics.cpp index 67596bac13af6..4cd4073535974 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.cpp +++ b/src/hotspot/share/classfile/vmIntrinsics.cpp @@ -479,6 +479,9 @@ bool vmIntrinsics::disabled_by_jvm_flags(vmIntrinsics::ID id) { case vmIntrinsics::_base64_decodeBlock: if (!UseBASE64Intrinsics) return true; break; + case vmIntrinsics::_poly1305_processBlocks: + if (!UsePolyIntrinsics) return true; + break; case vmIntrinsics::_updateBytesCRC32C: case vmIntrinsics::_updateDirectByteBufferCRC32C: if (!UseCRC32CIntrinsics) return true; diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 03c47b7fbdc9a..9b0cd3f366f67 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -519,7 +519,7 @@ class methodHandle; do_class(java_util_Base64_Decoder, "java/util/Base64$Decoder") \ do_intrinsic(_base64_decodeBlock, java_util_Base64_Decoder, decodeBlock_name, decodeBlock_signature, F_R) \ do_name(decodeBlock_name, "decodeBlock") \ - do_signature(decodeBlock_signature, "([BII[BIZZ)I") \ + do_signature(decodeBlock_signature, "([BII[BIZZ)I") \ \ /* support for com.sun.crypto.provider.GHASH */ \ do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \ @@ -527,6 +527,11 @@ class methodHandle; do_name(processBlocks_name, "processBlocks") \ do_signature(ghash_processBlocks_signature, "([BII[J[J)V") \ \ + /* support for com.sun.crypto.provider.Poly1305 */ \ + do_class(com_sun_crypto_provider_Poly1305, "com/sun/crypto/provider/Poly1305") \ + do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, ghash_processBlocks_signature, F_R) \ + do_name(processMultipleBlocks_name, "processMultipleBlocks") \ + \ /* support for java.util.zip */ \ do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \ do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \ diff --git a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp index 7aeedff86f9cf..a683b259b9203 100644 --- a/src/hotspot/share/opto/c2compiler.cpp +++ b/src/hotspot/share/opto/c2compiler.cpp @@ -739,6 +739,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool 
is_virt case vmIntrinsics::_ghash_processBlocks: case vmIntrinsics::_base64_encodeBlock: case vmIntrinsics::_base64_decodeBlock: + case vmIntrinsics::_poly1305_processBlocks: case vmIntrinsics::_updateCRC32: case vmIntrinsics::_updateBytesCRC32: case vmIntrinsics::_updateByteBufferCRC32: diff --git a/src/hotspot/share/opto/escape.cpp b/src/hotspot/share/opto/escape.cpp index abb8ad59a7a66..69b2bcd59e8f9 100644 --- a/src/hotspot/share/opto/escape.cpp +++ b/src/hotspot/share/opto/escape.cpp @@ -1168,6 +1168,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { strcmp(call->as_CallLeaf()->_name, "electronicCodeBook_decryptAESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "galoisCounterMode_AESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "poly1305_processBlocks") == 0 || strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 || strcmp(call->as_CallLeaf()->_name, "encodeBlock") == 0 || strcmp(call->as_CallLeaf()->_name, "decodeBlock") == 0 || diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 60a25a43e5175..274d86f2ebc39 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -612,6 +612,8 @@ bool LibraryCallKit::try_to_inline(int predicate) { return inline_base64_encodeBlock(); case vmIntrinsics::_base64_decodeBlock: return inline_base64_decodeBlock(); + case vmIntrinsics::_poly1305_processBlocks: + return inline_poly1305_processBlocks(); case vmIntrinsics::_encodeISOArray: case vmIntrinsics::_encodeByteISOArray: @@ -6962,6 +6964,42 @@ bool LibraryCallKit::inline_base64_decodeBlock() { return true; } +bool LibraryCallKit::inline_poly1305_processBlocks() { + address stubAddr; + const char *stubName; + assert(UsePolyIntrinsics, "need Poly intrinsics support"); + assert(callee()->signature()->size() == 5, "poly1305_processBlocks has %d parameters", callee()->signature()->size()); + stubAddr = StubRoutines::poly1305_processBlocks(); + stubName = "poly1305_processBlocks"; + + if (!stubAddr) return false; + null_check_receiver(); // null-check receiver + if (stopped()) return true; + + Node* input = argument(1); + Node* input_offset = argument(2); + Node* len = argument(3); + Node* alimbs = argument(4); + Node* rlimbs = argument(5); + + input = must_be_not_null(input, true); + alimbs = must_be_not_null(alimbs, true); + rlimbs = must_be_not_null(rlimbs, true); + + Node* input_start = array_element_address(input, input_offset, T_BYTE); + assert(input_start, "input array is NULL"); + Node* acc_start = array_element_address(alimbs, intcon(0), T_LONG); + assert(acc_start, "acc array is NULL"); + Node* r_start = array_element_address(rlimbs, intcon(0), T_LONG); + assert(r_start, "r array is NULL"); + + Node* call = make_runtime_call(RC_LEAF | RC_NO_FP, + OptoRuntime::poly1305_processBlocks_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + input_start, len, acc_start, r_start); + return true; +} + //------------------------------inline_digestBase_implCompress----------------------- // // Calculate MD5 for single-block byte[] array. 
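Side note on the contract being wired up above: the `poly1305_processBlocks` runtime call must agree, block for block, with the pure-Java fallback. A minimal BigInteger sketch of that per-block recurrence, plus the radix-2^44 (44/44/42-bit) limb split the AVX-512 stub uses internally; this is illustrative only, not part of the patch, and the class and method names are made up:

```java
import java.math.BigInteger;

public class Poly1305BlockReference {
    // Poly1305 works modulo the prime 2^130 - 5.
    static final BigInteger P = BigInteger.ONE.shiftLeft(130).subtract(BigInteger.valueOf(5));

    // acc = ((acc + block + 2^128) * r) mod p, for each whole 16-byte block.
    static BigInteger processBlocks(byte[] msg, BigInteger acc, BigInteger r) {
        for (int off = 0; off + 16 <= msg.length; off += 16) {
            BigInteger blk = BigInteger.ZERO;
            for (int i = 15; i >= 0; i--) {            // 16-byte little-endian load
                blk = blk.shiftLeft(8).or(BigInteger.valueOf(msg[off + i] & 0xff));
            }
            acc = acc.add(blk.setBit(128)).multiply(r).mod(P);  // setBit(128) is the pad bit
        }
        return acc;
    }

    // The 44/44/42-bit limb split used inside the AVX-512 stub ("Put together A" reverses this).
    static BigInteger[] toLimbs44(BigInteger a) {
        BigInteger m44 = BigInteger.ONE.shiftLeft(44).subtract(BigInteger.ONE);
        return new BigInteger[] { a.and(m44), a.shiftRight(44).and(m44), a.shiftRight(88) };
    }
}
```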
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index 07235497740aa..f104c9bc88faf 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ -293,6 +293,7 @@ class LibraryCallKit : public GraphKit { bool inline_ghash_processBlocks(); bool inline_base64_encodeBlock(); bool inline_base64_decodeBlock(); + bool inline_poly1305_processBlocks(); bool inline_digestBase_implCompress(vmIntrinsics::ID id); bool inline_digestBase_implCompressMB(int predicate); bool inline_digestBase_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass, diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index 68408e198c74d..7b08cab6748f8 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ b/src/hotspot/share/opto/runtime.cpp @@ -1266,6 +1266,26 @@ const TypeFunc* OptoRuntime::base64_decodeBlock_Type() { return TypeFunc::make(domain, range); } +// Poly1305 processMultipleBlocks function +const TypeFunc* OptoRuntime::poly1305_processBlocks_Type() { + int argcnt = 4; + + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // input array + fields[argp++] = TypeInt::INT; // input length + fields[argp++] = TypePtr::NOTNULL; // accumulator array + fields[argp++] = TypePtr::NOTNULL; // r array + assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms + 0] = NULL; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + //------------- Interpreter state access for on stack replacement const TypeFunc* OptoRuntime::osr_end_Type() { // create input type (domain) diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp index 43e4cff52287f..1de8ffb18fb97 100644 --- a/src/hotspot/share/opto/runtime.hpp +++ b/src/hotspot/share/opto/runtime.hpp @@ -280,6 +280,7 @@ class OptoRuntime : public AllStatic { static const TypeFunc* ghash_processBlocks_Type(); static const TypeFunc* base64_encodeBlock_Type(); static const TypeFunc* base64_decodeBlock_Type(); + static const TypeFunc* poly1305_processBlocks_Type(); static const TypeFunc* updateBytesCRC32_Type(); static const TypeFunc* updateBytesCRC32C_Type(); diff --git a/src/hotspot/share/runtime/globals.hpp b/src/hotspot/share/runtime/globals.hpp index f76ae8d8d5e19..c5750d6d68e83 100644 --- a/src/hotspot/share/runtime/globals.hpp +++ b/src/hotspot/share/runtime/globals.hpp @@ -238,6 +238,9 @@ const int ObjectAlignmentInBytes = 8; product(bool, UseBASE64Intrinsics, false, \ "Use intrinsics for java.util.Base64") \ \ + product(bool, UsePolyIntrinsics, false, DIAGNOSTIC, \ + "Use intrinsics for sun.security.util.math.intpoly") \ + \ product(size_t, LargePageSizeInBytes, 0, \ "Maximum large page size used (0 will use the default large " \ "page size for the environment as the maximum)") \ diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp index 93927ad0f89cf..9418b758387d3 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -130,6 +130,7 @@ address StubRoutines::_galoisCounterMode_AESCrypt = NULL; address StubRoutines::_ghash_processBlocks = NULL; address StubRoutines::_base64_encodeBlock = NULL; address StubRoutines::_base64_decodeBlock = 
NULL; +address StubRoutines::_poly1305_processBlocks = NULL; address StubRoutines::_md5_implCompress = NULL; address StubRoutines::_md5_implCompressMB = NULL; diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index 30f58519ea9e4..f4cec54aa7f3c 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -211,6 +211,7 @@ class StubRoutines: AllStatic { static address _ghash_processBlocks; static address _base64_encodeBlock; static address _base64_decodeBlock; + static address _poly1305_processBlocks; static address _md5_implCompress; static address _md5_implCompressMB; @@ -384,6 +385,7 @@ class StubRoutines: AllStatic { static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; } static address electronicCodeBook_encryptAESCrypt() { return _electronicCodeBook_encryptAESCrypt; } static address electronicCodeBook_decryptAESCrypt() { return _electronicCodeBook_decryptAESCrypt; } + static address poly1305_processBlocks() { return _poly1305_processBlocks; } static address counterMode_AESCrypt() { return _counterMode_AESCrypt; } static address ghash_processBlocks() { return _ghash_processBlocks; } static address base64_encodeBlock() { return _base64_encodeBlock; } diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index d339b0a1ee1c6..bd06ed29010f7 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -544,6 +544,7 @@ static_field(StubRoutines, _ghash_processBlocks, address) \ static_field(StubRoutines, _base64_encodeBlock, address) \ static_field(StubRoutines, _base64_decodeBlock, address) \ + static_field(StubRoutines, _poly1305_processBlocks, address) \ static_field(StubRoutines, _updateBytesCRC32, address) \ static_field(StubRoutines, _crc_table_adr, address) \ static_field(StubRoutines, _crc32c_table_addr, address) \ diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index cd78df84bede0..d24b29cedbfdf 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -34,6 +34,8 @@ import sun.security.util.math.*; import sun.security.util.math.intpoly.*; +import jdk.internal.vm.annotation.IntrinsicCandidate; +import jdk.internal.vm.annotation.ForceInline; /** * This class represents the Poly1305 function defined in RFC 7539. 
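Before the `engineUpdate` hunk below: the patched path splits the input into a whole-block prefix handed to the (possibly intrinsified) `processMultipleBlocks` and a sub-block tail that stays in the internal buffer. A small standalone sketch of the masking arithmetic (illustrative, not from the patch):

```java
// BLOCK_LENGTH is 16 in Poly1305; masking with ~(BLOCK_LENGTH - 1) rounds len
// down to a multiple of the block size (valid because 16 is a power of two).
public class BlockMaskDemo {
    static final int BLOCK_LENGTH = 16;

    public static void main(String[] args) {
        for (int len : new int[] {15, 16, 17, 255, 256, 4100}) {
            int whole = len & ~(BLOCK_LENGTH - 1);   // bytes handed to processMultipleBlocks
            int tail  = len - whole;                 // bytes left for the block buffer
            System.out.println(len + " -> " + whole + " processed + " + tail + " buffered");
        }
    }
}
```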
@@ -59,8 +61,10 @@ final class Poly1305 {
     private IntegerModuloP s;
     private MutableIntegerModuloP a;
     private final MutableIntegerModuloP n = ipl1305.get1().mutable();
+    private final boolean checkWeakKey;

-    Poly1305() { }
+    Poly1305() { this(true); }
+    Poly1305(boolean checkKey) { checkWeakKey = checkKey; }

     /**
      * Initialize the Poly1305 object
@@ -165,11 +169,15 @@ void engineUpdate(byte[] input, int offset, int len) {
                 blockOffset = 0;
             }
         }
-        while (len >= BLOCK_LENGTH) {
-            processBlock(input, offset, BLOCK_LENGTH);
-            offset += BLOCK_LENGTH;
-            len -= BLOCK_LENGTH;
-        }
+
+        int blockMultipleLength = len & (~(BLOCK_LENGTH-1));
+        long[] aLimbs = a.getLimbs();
+        long[] rLimbs = r.getLimbs();
+        processMultipleBlocksCheck(input, offset, blockMultipleLength, aLimbs, rLimbs);
+        processMultipleBlocks(input, offset, blockMultipleLength, aLimbs, rLimbs);
+        offset += blockMultipleLength;
+        len -= blockMultipleLength;
+
         if (len > 0) { // and len < BLOCK_LENGTH
             System.arraycopy(input, offset, block, 0, len);
             blockOffset = len;
@@ -235,12 +243,35 @@ private void processBlock(byte[] block, int offset, int length) {
         a.setProduct(r);          // a = (a * r) % p
     }

+    // This method is intrinsified: aLimbs and rLimbs are unused by this Java fallback but are read by the intrinsic.
+    // They correspond to this.a and this.r, respectively.
+    @ForceInline
+    @IntrinsicCandidate
+    private void processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) {
+        while (length >= BLOCK_LENGTH) {
+            processBlock(input, offset, BLOCK_LENGTH);
+            offset += BLOCK_LENGTH;
+            length -= BLOCK_LENGTH;
+        }
+    }
+
+    private static void processMultipleBlocksCheck(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) {
+        Objects.checkFromIndexSize(offset, length, input.length);
+        final int numLimbs = 5; // Intrinsic expects exactly 5 limbs
+        if (aLimbs.length != numLimbs) {
+            throw new RuntimeException("invalid accumulator length: " + aLimbs.length);
+        }
+        if (rLimbs.length != numLimbs) {
+            throw new RuntimeException("invalid R length: " + rLimbs.length);
+        }
+    }
+
     /**
      * Partition the authentication key into the R and S components, clamp
      * the R value, and instantiate IntegerModuloP objects to R and S's
      * numeric values.
      */
-    private void setRSVals() {
+    private void setRSVals() throws InvalidKeyException {
         // Clamp the bytes in the "r" half of the key.
keyBytes[3] &= 15; keyBytes[7] &= 15; @@ -250,6 +281,24 @@ private void setRSVals() { keyBytes[8] &= (byte)252; keyBytes[12] &= (byte)252; + if (checkWeakKey) { + byte keyIsZero = 0; + for (int i = 0; i < RS_LENGTH; i++) { + keyIsZero |= keyBytes[i]; + } + if (keyIsZero == 0) { + throw new InvalidKeyException("R is set to zero"); + } + + keyIsZero = 0; + for (int i = RS_LENGTH; i < 2*RS_LENGTH; i++) { + keyIsZero |= keyBytes[i]; + } + if (keyIsZero == 0) { + throw new InvalidKeyException("S is set to zero"); + } + } + // Create IntegerModuloP elements from the r and s values r = ipl1305.getElement(keyBytes, 0, RS_LENGTH, (byte)0); s = ipl1305.getElement(keyBytes, RS_LENGTH, RS_LENGTH, (byte)0); diff --git a/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java b/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java index 7084776bfa166..2373bf903fc56 100644 --- a/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java +++ b/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java @@ -153,6 +153,11 @@ default byte[] asByteArray(int len) { */ void asByteArray(byte[] result); + /** + * Break encapsulation, used for IntrinsicCandidate functions + */ + long[] getLimbs(); + /** * Compute the multiplicative inverse of this field element. * diff --git a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java index 810c3fb3b86f8..693d88bcc76af 100644 --- a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java +++ b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java @@ -626,6 +626,10 @@ public void asByteArray(byte[] result) { } limbsToByteArray(limbs, result); } + + public long[] getLimbs() { + return limbs; + } } protected class MutableElement extends Element diff --git a/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java b/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java index 272bb67041816..2165b2cc03085 100644 --- a/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java +++ b/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java @@ -231,6 +231,7 @@ public enum CPUFeature implements CPUFeatureName { OSPKE, CET_IBT, CET_SS, + AVX512_IFMA, } private final EnumSet features; diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java index df80d7d9de81d..34bb118155f32 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -28,3 +28,41 @@ * @run main java.base/com.sun.crypto.provider.Poly1305UnitTest * @summary Unit test for com.sun.crypto.provider.Poly1305. */ + +/* + * @test + * @key randomness + * @modules java.base/com.sun.crypto.provider + * @run main java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest + * @summary Unit test for com.sun.crypto.provider.Poly1305. 
+ */ + +/* + * @test + * @modules java.base/com.sun.crypto.provider + * @run main java.base/com.sun.crypto.provider.Poly1305KAT + * @summary Unit test for com.sun.crypto.provider.Poly1305. + */ + +/* + * @test + * @key randomness + * @modules java.base/com.sun.crypto.provider + * @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305. + * @run main/othervm -Xcomp -XX:-TieredCompilation java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest + */ + +/* + * @test + * @modules java.base/com.sun.crypto.provider + * @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305. + * @run main/othervm -Xcomp -XX:-TieredCompilation java.base/com.sun.crypto.provider.Poly1305KAT + */ + +package com.sun.crypto.provider.Cipher.ChaCha20; + +public class Poly1305UnitTestDriver { + static public void main(String[] args) { + System.out.println("Passed"); + } +} diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java new file mode 100644 index 0000000000000..3e7ecbad62e0a --- /dev/null +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022, Intel Corporation. All rights reserved. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */
+
+package com.sun.crypto.provider;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+import javax.crypto.spec.SecretKeySpec;
+
+// This test case relies on the fact that the single-byte Poly1305.engineUpdate(byte) does not have an intrinsic,
+// so we can compare whether the intrinsic and the pure-Java implementation produce the same result.
+// This test case is NOT entirely deterministic; it uses a random seed for the pseudo-random number generator.
+// If a failure occurs, hardcode the seed to make the test case deterministic.
+public class Poly1305IntrinsicFuzzTest {
+    public static void main(String[] args) throws Exception {
+        // Note: it might be useful to increase this number during development of new Poly1305 intrinsics
+        final int repeat = 100;
+        for (int i = 0; i < repeat; i++) {
+            run();
+        }
+        System.out.println("Fuzz Success");
+    }
+
+    public static void run() throws Exception {
+        java.util.Random rnd = new java.util.Random();
+        long seed = rnd.nextLong();
+        rnd.setSeed(seed);
+
+        byte[] key = new byte[32];
+        rnd.nextBytes(key);
+        int msgLen = rnd.nextInt(128, 4096); // x86_64 intrinsic requires 256 bytes minimum
+        byte[] message = new byte[msgLen];
+
+        Poly1305 authenticator = new Poly1305();
+        Poly1305 authenticatorSlow = new Poly1305();
+        if (authenticator.engineGetMacLength() != 16) {
+            throw new RuntimeException("The length of the Poly1305 MAC must be 16 bytes.");
+        }
+
+        authenticator.engineInit(new SecretKeySpec(key, 0, 32, "Poly1305"), null);
+        authenticatorSlow.engineInit(new SecretKeySpec(key, 0, 32, "Poly1305"), null);
+
+        if (rnd.nextBoolean()) {
+            // Prime just the buffer and/or accumulator (the buffer can keep at most 16 bytes from a previous engineUpdate)
+            int initDataLen = rnd.nextInt(8, 24);
+            authenticator.engineUpdate(message, 0, initDataLen);
+            slowUpdate(authenticatorSlow, message, 0, initDataLen);
+        }
+
+        if (rnd.nextBoolean()) {
+            // Multiple calls to engineUpdate
+            authenticator.engineUpdate(message, 0, message.length);
+            slowUpdate(authenticatorSlow, message, 0, message.length);
+        }
+
+        authenticator.engineUpdate(message, 0, message.length);
+        slowUpdate(authenticatorSlow, message, 0, message.length);
+
+        byte[] tag = authenticator.engineDoFinal();
+        byte[] tagSlow = authenticatorSlow.engineDoFinal();
+
+        if (!Arrays.equals(tag, tagSlow)) {
+            throw new RuntimeException("[Seed " + seed + "] Tag mismatch: " + Arrays.toString(tag) + " != " + Arrays.toString(tagSlow));
+        }
+    }
+
+    static void slowUpdate(Poly1305 authenticator, byte[] message, int offset, int len) {
+        len = Math.min(message.length, offset + len);
+        for (int i = offset; i < len; i++) {
+            authenticator.engineUpdate(message[i]);
+        }
+    }
+}
diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java
new file mode 100644
index 0000000000000..649d1888c70b7
--- /dev/null
+++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2022, Intel Corporation. All rights reserved.
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package com.sun.crypto.provider; + +import java.util.*; +import java.nio.ByteBuffer; +import java.util.Arrays; + +import javax.crypto.spec.SecretKeySpec; + +public class Poly1305KAT { + public static class TestData { + public TestData(String name, String keyStr, String inputStr, String outStr) { + HexFormat hex = HexFormat.of(); + testName = Objects.requireNonNull(name); + key = hex.parseHex(Objects.requireNonNull(keyStr)); + input = hex.parseHex(Objects.requireNonNull(inputStr)); + expOutput = hex.parseHex(Objects.requireNonNull(outStr)); + } + + public final String testName; + public final byte[] key; + public final byte[] input; + public final byte[] expOutput; + } + + public static final List testList = new LinkedList() {{ + add(new TestData("RFC 7539 A.3 Test Vector #1", + "0000000000000000000000000000000000000000000000000000000000000000", + "0000000000000000000000000000000000000000000000000000000000000000" + + "0000000000000000000000000000000000000000000000000000000000000000", + "00000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #2", + "0000000000000000000000000000000036e5f6b5c5e06070f0efca96227a863e", + "416e79207375626d697373696f6e20746f20746865204945544620696e74656e" + + "6465642062792074686520436f6e7472696275746f7220666f72207075626c69" + + "636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" + + "20496e7465726e65742d4472616674206f722052464320616e6420616e792073" + + "746174656d656e74206d6164652077697468696e2074686520636f6e74657874" + + "206f6620616e204945544620616374697669747920697320636f6e7369646572" + + "656420616e20224945544620436f6e747269627574696f6e222e205375636820" + + "73746174656d656e747320696e636c756465206f72616c2073746174656d656e" + + "747320696e20494554462073657373696f6e732c2061732077656c6c20617320" + + "7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" + + "74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" + + "207768696368206172652061646472657373656420746f", + "36e5f6b5c5e06070f0efca96227a863e")); + add(new TestData("RFC 7539 A.3 Test Vector #3", + "36e5f6b5c5e06070f0efca96227a863e00000000000000000000000000000000", + "416e79207375626d697373696f6e20746f20746865204945544620696e74656e" + + "6465642062792074686520436f6e7472696275746f7220666f72207075626c69" + + "636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" + + "20496e7465726e65742d4472616674206f722052464320616e6420616e792073" + + "746174656d656e74206d6164652077697468696e2074686520636f6e74657874" + + "206f6620616e204945544620616374697669747920697320636f6e7369646572" + + "656420616e20224945544620436f6e747269627574696f6e222e205375636820" + + "73746174656d656e747320696e636c756465206f72616c2073746174656d656e" + + "747320696e20494554462073657373696f6e732c2061732077656c6c20617320" + + 
"7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" + + "74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" + + "207768696368206172652061646472657373656420746f", + "f3477e7cd95417af89a6b8794c310cf0")); + add(new TestData("RFC 7539 A.3 Test Vector #4", + "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0", + "2754776173206272696c6c69672c20616e642074686520736c6974687920746f" + + "7665730a446964206779726520616e642067696d626c6520696e207468652077" + + "6162653a0a416c6c206d696d737920776572652074686520626f726f676f7665" + + "732c0a416e6420746865206d6f6d65207261746873206f757467726162652e", + "4541669a7eaaee61e708dc7cbcc5eb62")); + add(new TestData("RFC 7539 A.3 Test Vector #5: If one uses 130-bit partial reduction, does the code handle the case where partially reducedfinal result is not fully reduced?", + "0200000000000000000000000000000000000000000000000000000000000000", + "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", + "03000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #6: What happens if addition of s overflows modulo 2^128?", + "02000000000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", + "02000000000000000000000000000000", + "03000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #7: What happens if data limb is all ones and there is carry from lower limb?", + "0100000000000000000000000000000000000000000000000000000000000000", + "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF" + + "11000000000000000000000000000000", + "05000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #8: What happens if final result from polynomial part is exactly 2^130-5?", + "0100000000000000000000000000000000000000000000000000000000000000", + "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFEFEFEFEFEFEFEFEFEFEFEFEFEFEFE" + + "01010101010101010101010101010101", + "00000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #9: What happens if final result from polynomial part is exactly 2^130-6?", + "0200000000000000000000000000000000000000000000000000000000000000", + "FDFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", + "FAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF")); + add(new TestData("RFC 7539 A.3 Test Vector #10: What happens if 5*H+L-type reduction produces 131-bit intermediate result?", + "0100000000000000040000000000000000000000000000000000000000000000", + "E33594D7505E43B900000000000000003394D7505E4379CD0100000000000000" + + "0000000000000000000000000000000001000000000000000000000000000000", + "14000000000000005500000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #11: What happens if 5*H+L-type reduction produces 131-bit final result?", + "0100000000000000040000000000000000000000000000000000000000000000", + "E33594D7505E43B900000000000000003394D7505E4379CD0100000000000000" + + "00000000000000000000000000000000", + "13000000000000000000000000000000")); + }}; + + public static void main(String args[]) throws Exception { + int testsPassed = 0; + int testNumber = 0; + + for (TestData test : testList) { + System.out.println("*** Test " + ++testNumber + ": " + + test.testName); + if (runSingleTest(test)) { + testsPassed++; + } + } + System.out.println(); + + if (testsPassed != testNumber) { + throw new RuntimeException("One or more tests failed. 
" + + "Check output for details"); + } + } + + private static boolean runSingleTest(TestData testData) throws Exception { + Poly1305 authenticator = new Poly1305(false); + authenticator.engineInit(new SecretKeySpec(testData.key, 0, testData.key.length, "Poly1305"), null); + authenticator.engineUpdate(testData.input, 0, testData.input.length); + byte[] tag = authenticator.engineDoFinal(); + if (!Arrays.equals(tag, testData.expOutput)) { + System.out.println("ERROR - Output Mismatch!"); + System.out.println("Expected:\n" + + dumpHexBytes(testData.expOutput, testData.expOutput.length, "\n", " ")); + System.out.println("Actual:\n" + + dumpHexBytes(tag, tag.length, "\n", " ")); + System.out.println(); + return false; + } + return true; + } + + /** + * Dump the hex bytes of a buffer into string form. + * + * @param data The array of bytes to dump to stdout. + * @param itemsPerLine The number of bytes to display per line + * if the {@code lineDelim} character is blank then all bytes + * will be printed on a single line. + * @param lineDelim The delimiter between lines + * @param itemDelim The delimiter between bytes + * + * @return The hexdump of the byte array + */ + private static String dumpHexBytes(byte[] data, int itemsPerLine, + String lineDelim, String itemDelim) { + return dumpHexBytes(ByteBuffer.wrap(data), itemsPerLine, lineDelim, + itemDelim); + } + + private static String dumpHexBytes(ByteBuffer data, int itemsPerLine, + String lineDelim, String itemDelim) { + StringBuilder sb = new StringBuilder(); + if (data != null) { + data.mark(); + int i = 0; + while (data.remaining() > 0) { + if (i % itemsPerLine == 0 && i != 0) { + sb.append(lineDelim); + } + sb.append(String.format("%02X", data.get())).append(itemDelim); + i++; + } + data.reset(); + } + + return sb.toString(); + } +} + diff --git a/test/lib-test/jdk/test/whitebox/CPUInfoTest.java b/test/lib-test/jdk/test/whitebox/CPUInfoTest.java index b5b8274b2a4f6..b41329d126db6 100644 --- a/test/lib-test/jdk/test/whitebox/CPUInfoTest.java +++ b/test/lib-test/jdk/test/whitebox/CPUInfoTest.java @@ -65,7 +65,7 @@ public class CPUInfoTest { "avx512_vbmi2", "avx512_vbmi", "rdtscp", "rdpid", "hv", "fsrm", "avx512_bitalg", "gfni", "f16c", "pku", "ospke", "cet_ibt", - "cet_ss" + "cet_ss", "avx512_ifma" ); // @formatter:on // Checkstyle: resume diff --git a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java new file mode 100644 index 0000000000000..aa45aa2e398a0 --- /dev/null +++ b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022, Intel Corporation. All rights reserved. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.javax.crypto.full; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Setup; + +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.reflect.Method; +import java.lang.reflect.Constructor; +import java.security.Key; +import java.security.spec.AlgorithmParameterSpec; +import javax.crypto.spec.SecretKeySpec; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.annotations.Measurement; + +@Measurement(iterations = 3, time = 10) +@Warmup(iterations = 3, time = 10) +@Fork(value = 1, jvmArgsAppend = {"--add-opens", "java.base/com.sun.crypto.provider=ALL-UNNAMED"}) +public class Poly1305DigestBench extends CryptoBase { + public static final int SET_SIZE = 128; + + @Param({"64", "256", "1024", "" + 16*1024, "" + 1024*1024}) + int dataSize; + + private byte[][] data; + int index = 0; + private static MethodHandle polyEngineInit, polyEngineUpdate, polyEngineFinal; + private static Object polyObj; + + static { + try { + MethodHandles.Lookup lookup = MethodHandles.lookup(); + Class polyClazz = Class.forName("com.sun.crypto.provider.Poly1305"); + Constructor constructor = polyClazz.getDeclaredConstructor(); + constructor.setAccessible(true); + polyObj = constructor.newInstance(); + + Method m = polyClazz.getDeclaredMethod("engineInit", Key.class, AlgorithmParameterSpec.class); + m.setAccessible(true); + polyEngineInit = lookup.unreflect(m); + + m = polyClazz.getDeclaredMethod("engineUpdate", byte[].class, int.class, int.class); + m.setAccessible(true); + polyEngineUpdate = lookup.unreflect(m); + + m = polyClazz.getDeclaredMethod("engineDoFinal"); + m.setAccessible(true); + polyEngineFinal = lookup.unreflect(m); + } catch (Throwable ex) { + throw new RuntimeException(ex); + } + } + + @Setup + public void setup() { + setupProvider(); + data = fillRandom(new byte[SET_SIZE][dataSize]); + } + + @Benchmark + public byte[] digest() { + try { + byte[] d = data[index]; + index = (index +1) % SET_SIZE; + polyEngineInit.invoke(polyObj, new SecretKeySpec(d, 0, 32, "Poly1305"), null); + polyEngineUpdate.invoke(polyObj, d, 0, d.length); + return (byte[])polyEngineFinal.invoke(polyObj); + } catch (Throwable ex) { + throw new RuntimeException(ex); + } + } +}
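A closing usage note: since `com.sun.crypto.provider.Poly1305` is package-private, the easiest way to exercise the new stub outside the included jtreg and JMH tests is through the ChaCha20-Poly1305 AEAD cipher. A hedged smoke-test sketch follows; it is not part of the patch, and the flag usage is an assumption based on the DIAGNOSTIC declaration in globals.hpp above (on AVX-512 IFMA hardware the patch turns the flag on by default, elsewhere manual enabling needs `-XX:+UnlockDiagnosticVMOptions`):

```java
import javax.crypto.Cipher;
import javax.crypto.spec.IvParameterSpec;
import javax.crypto.spec.SecretKeySpec;
import java.security.SecureRandom;

// Run with, e.g., -XX:+UnlockDiagnosticVMOptions -XX:+UsePolyIntrinsics
public class Poly1305Smoke {
    public static void main(String[] args) throws Exception {
        byte[] key = new byte[32];
        byte[] nonce = new byte[12];              // ChaCha20-Poly1305 uses a 96-bit nonce
        SecureRandom rnd = new SecureRandom();
        rnd.nextBytes(key);
        rnd.nextBytes(nonce);

        Cipher c = Cipher.getInstance("ChaCha20-Poly1305");
        c.init(Cipher.ENCRYPT_MODE, new SecretKeySpec(key, "ChaCha20"),
               new IvParameterSpec(nonce));
        byte[] ct = c.doFinal(new byte[1 << 20]); // 1 MiB: well past the 256-byte vector threshold
        System.out.println("ciphertext+tag bytes: " + ct.length);
    }
}
```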