|
54 | 54 | #include "runtime/stubRoutines.hpp"
|
55 | 55 | #include "utilities/align.hpp"
|
56 | 56 | #include "utilities/checkedCast.hpp"
|
| 57 | +#include "utilities/debug.hpp" |
57 | 58 | #include "utilities/globalDefinitions.hpp"
|
| 59 | +#include "utilities/intpow.hpp" |
58 | 60 | #include "utilities/powerOfTwo.hpp"
|
59 | 61 | #ifdef COMPILER2
|
60 | 62 | #include "opto/runtime.hpp"
|
@@ -5320,6 +5322,307 @@ class StubGenerator: public StubCodeGenerator {
|
5320 | 5322 | return entry;
|
5321 | 5323 | }
|
5322 | 5324 |
|
  // Generates a stub that computes the 31-polynomial hash code of a large
  // primitive array, i.e. the value the scalar loop
  //   for (i = 0; i < cnt; i++) result = 31 * result + ary[i];
  // would produce, vectorized over 4-lane (S-sized) NEON accumulators with a
  // 4x-unrolled main loop.
  //
  // result = r0 - return value. Contains initial hashcode value on entry.
  // ary = r1 - array address
  // cnt = r2 - elements count
  // Clobbers: v0-v13, rscratch1, rscratch2
  address generate_large_arrays_hashcode(BasicType eltype) {
    const Register result = r0, ary = r1, cnt = r2;
    // Data vectors loaded each iteration and their matching accumulators.
    const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
    const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
    const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
    const FloatRegister vpowm = v13; // per-iteration accumulator multiplier(s)

    ARRAYS_HASHCODE_REGISTERS;

    Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;

    // Pick the load shape for the element type. Sub-word types are loaded
    // eight at a time (8B is widened to 8H before accumulation), so their
    // vectors are consumed in two 4-lane halves (multiply_by_halves).
    unsigned int vf; // vectorization factor
    bool multiply_by_halves;
    Assembler::SIMD_Arrangement load_arrangement;
    switch (eltype) {
    case T_BOOLEAN:
    case T_BYTE:
      load_arrangement = Assembler::T8B;
      multiply_by_halves = true;
      vf = 8;
      break;
    case T_CHAR:
    case T_SHORT:
      load_arrangement = Assembler::T8H;
      multiply_by_halves = true;
      vf = 8;
      break;
    case T_INT:
      load_arrangement = Assembler::T4S;
      multiply_by_halves = false;
      vf = 4;
      break;
    default:
      ShouldNotReachHere();
    }

    // Unroll factor
    const unsigned uf = 4;

    // Effective vectorization factor: elements consumed per LARGE_LOOP iteration.
    const unsigned evf = vf * uf;

    __ align(CodeEntryAlignment);

    const char *mark_name = "";
    switch (eltype) {
    case T_BOOLEAN:
      mark_name = "_large_arrays_hashcode_boolean";
      break;
    case T_BYTE:
      mark_name = "_large_arrays_hashcode_byte";
      break;
    case T_CHAR:
      mark_name = "_large_arrays_hashcode_char";
      break;
    case T_SHORT:
      mark_name = "_large_arrays_hashcode_short";
      break;
    case T_INT:
      mark_name = "_large_arrays_hashcode_int";
      break;
    default:
      mark_name = "_large_arrays_hashcode_incorrect_type";
      __ should_not_reach_here();
    };

    StubCodeMark mark(this, "StubRoutines", mark_name);

    address entry = __ pc();
    __ enter();

    // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
    // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
    // value shouldn't change throughout both loops.
    __ movw(rscratch1, intpow(31U, 3));
    __ mov(vpow, Assembler::S, 0, rscratch1);
    __ movw(rscratch1, intpow(31U, 2));
    __ mov(vpow, Assembler::S, 1, rscratch1);
    __ movw(rscratch1, intpow(31U, 1));
    __ mov(vpow, Assembler::S, 2, rscratch1);
    __ movw(rscratch1, intpow(31U, 0));
    __ mov(vpow, Assembler::S, 3, rscratch1);

    // Seed the accumulator: lanes 0-2 are zeroed, lane 3 carries the incoming
    // hash. Lane 3 is scaled by 31^0 in the epilogue reduction, so the initial
    // value ends up multiplied by exactly the per-iteration factors applied in
    // the loops, as the scalar recurrence requires.
    __ mov(vmul0, Assembler::T16B, 0);
    __ mov(vmul0, Assembler::S, 3, result);

    // cnt % evf rounded down to a multiple of vf: the element count the SMALL
    // loop must consume before the unrolled LARGE loop can take over.
    __ andr(rscratch2, cnt, (uf - 1) * vf);
    __ cbz(rscratch2, LARGE_LOOP_PREHEADER);

    // Per-iteration accumulator multiplier for the SMALL loop: 31^(vf/2) when
    // vectors are processed in two halves, 31^vf otherwise.
    __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
    __ mov(vpowm, Assembler::S, 0, rscratch1);

    // SMALL LOOP
    __ bind(SMALL_LOOP);

    __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
    __ subsw(rscratch2, rscratch2, vf); // also sets the flags tested by br(HI) below

    if (load_arrangement == Assembler::T8B) {
      // Extend 8B to 8H to be able to use vector multiply
      // instructions
      assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
      if (is_signed_subword_type(eltype)) {
        __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      } else {
        __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      }
    }

    // Accumulate the (lower half of the) data vector into the 4S accumulator,
    // widening sub-word lanes as needed.
    switch (load_arrangement) {
    case Assembler::T4S:
      __ addv(vmul0, load_arrangement, vmul0, vdata0);
      break;
    case Assembler::T8B:
    case Assembler::T8H:
      assert(is_subword_type(eltype), "subword type expected");
      if (is_signed_subword_type(eltype)) {
        __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      } else {
        __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      }
      break;
    default:
      __ should_not_reach_here();
    }

    // Process the upper half of a vector
    if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
      __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
      if (is_signed_subword_type(eltype)) {
        __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      } else {
        __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      }
    }

    __ br(Assembler::HI, SMALL_LOOP);

    // SMALL LOOP'S EPILOGUE
    // Skip the reduction if there are enough elements left for the LARGE loop.
    __ lsr(rscratch2, cnt, exact_log2(evf));
    __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);

    // Reduce the four lanes to a scalar: scale by <31^3, ..., 31^0> and sum.
    __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
    __ addv(vmul0, Assembler::T4S, vmul0);
    __ umov(result, vmul0, Assembler::S, 0);

    // TAIL
    __ bind(TAIL);

    // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
    // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
    assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
    __ andr(rscratch2, cnt, vf - 1);
    __ bind(TAIL_SHORTCUT);
    __ adr(rscratch1, BR_BASE);
    // Each load + madd pair below is 2 insns = 8 bytes, hence the shift by 3.
    __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
    __ movw(rscratch2, 0x1f); // 31, the multiplier for the scalar madd below
    __ br(rscratch1);

    // Computed-goto target table: vf - 1 scalar "result = 31 * result + elem"
    // steps; the branch above lands so that exactly cnt % vf of them run.
    for (size_t i = 0; i < vf - 1; ++i) {
      __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
              eltype);
      __ maddw(result, result, rscratch2, rscratch1);
    }
    __ bind(BR_BASE);

    __ leave();
    __ ret(lr);

    // LARGE LOOP
    __ bind(LARGE_LOOP_PREHEADER);

    // Number of LARGE_LOOP iterations: cnt / evf.
    __ lsr(rscratch2, cnt, exact_log2(evf));

    if (multiply_by_halves) {
      // 31^4 - multiplier between lower and upper parts of a register
      __ movw(rscratch1, intpow(31U, vf / 2));
      __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
      __ movw(rscratch1, intpow(31U, evf - vf / 2));
      __ mov(vpowm, Assembler::S, 0, rscratch1);
    } else {
      // 31^16
      __ movw(rscratch1, intpow(31U, evf));
      __ mov(vpowm, Assembler::S, 0, rscratch1);
    }

    // Zero the three extra accumulators used by the unrolled loop; vmul0
    // carries over whatever the SMALL loop already accumulated.
    __ mov(vmul3, Assembler::T16B, 0);
    __ mov(vmul2, Assembler::T16B, 0);
    __ mov(vmul1, Assembler::T16B, 0);

    __ bind(LARGE_LOOP);

    // Scale all four accumulators by the per-iteration power of 31.
    __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
    __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
    __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);

    __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
           Address(__ post(ary, evf * type2aelembytes(eltype))));

    if (load_arrangement == Assembler::T8B) {
      // Extend 8B to 8H to be able to use vector multiply
      // instructions
      assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
      if (is_signed_subword_type(eltype)) {
        __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
        __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
        __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
        __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      } else {
        __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
        __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
        __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
        __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      }
    }

    // Accumulate the (lower halves of the) data vectors, widening sub-word
    // lanes as needed.
    switch (load_arrangement) {
    case Assembler::T4S:
      __ addv(vmul3, load_arrangement, vmul3, vdata3);
      __ addv(vmul2, load_arrangement, vmul2, vdata2);
      __ addv(vmul1, load_arrangement, vmul1, vdata1);
      __ addv(vmul0, load_arrangement, vmul0, vdata0);
      break;
    case Assembler::T8B:
    case Assembler::T8H:
      assert(is_subword_type(eltype), "subword type expected");
      if (is_signed_subword_type(eltype)) {
        __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
        __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
        __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
        __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      } else {
        __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
        __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
        __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
        __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      }
      break;
    default:
      __ should_not_reach_here();
    }

    // Process the upper half of a vector
    if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
      // Scale by vpowm lane 1 (31^(vf/2)) before folding in the upper halves.
      __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
      __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
      __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
      __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
      if (is_signed_subword_type(eltype)) {
        __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
        __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
        __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
        __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      } else {
        __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
        __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
        __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
        __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      }
    }

    __ subsw(rscratch2, rscratch2, 1);
    __ br(Assembler::HI, LARGE_LOOP);

    // LARGE LOOP'S EPILOGUE: reduce each accumulator with vpow, then chain the
    // four partial results together, scaling by 31^vf between neighbors.
    __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
    __ addv(vmul3, Assembler::T4S, vmul3);
    __ umov(result, vmul3, Assembler::S, 0);

    __ mov(rscratch2, intpow(31U, vf)); // multiplier between adjacent groups

    __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
    __ addv(vmul2, Assembler::T4S, vmul2);
    __ umov(rscratch1, vmul2, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
    __ addv(vmul1, Assembler::T4S, vmul1);
    __ umov(rscratch1, vmul1, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
    __ addv(vmul0, Assembler::T4S, vmul0);
    __ umov(rscratch1, vmul0, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    // Handle the remaining cnt % vf elements with the scalar TAIL sequence.
    __ andr(rscratch2, cnt, vf - 1);
    __ cbnz(rscratch2, TAIL_SHORTCUT);

    __ leave();
    __ ret(lr);

    return entry;
  }
| 5625 | + |
5323 | 5626 | address generate_dsin_dcos(bool isCos) {
|
5324 | 5627 | __ align(CodeEntryAlignment);
|
5325 | 5628 | StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
|
@@ -8361,6 +8664,13 @@ class StubGenerator: public StubCodeGenerator {
|
8361 | 8664 | StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
|
8362 | 8665 | }
|
8363 | 8666 |
|
| 8667 | + // arrays_hashcode stub for large arrays. |
| 8668 | + StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); |
| 8669 | + StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); |
| 8670 | + StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); |
| 8671 | + StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); |
| 8672 | + StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); |
| 8673 | + |
8364 | 8674 | // byte_array_inflate stub for large arrays.
|
8365 | 8675 | StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
|
8366 | 8676 |
|
|
0 commit comments