8292758: put support for UNSIGNED5 format into its own header file

Reviewed-by: dlong, coleenp
openjdk · Sep 8, 2022 · 8d3399b · 8d3399b · openjdk-notifier · Sep 8, 2022
1 parent 6677227
commit 8d3399b
Show file tree

Hide file tree

Showing 9 changed files with 1,129 additions and 157 deletions.
diff --git a/src/hotspot/share/code/compressedStream.cpp b/src/hotspot/share/code/compressedStream.cpp
@@ -27,36 +27,45 @@
 #include "utilities/ostream.hpp"
 #include "utilities/moveBits.hpp"
 
-// 32-bit self-inverse encoding of float bits
-// converts trailing zeroes (common in floats) to leading zeroes
-inline juint CompressedStream::reverse_int(juint i) {
-  return reverse_bits(i);
-}
-
 jint CompressedReadStream::read_signed_int() {
-  return decode_sign(read_int());
+  return UNSIGNED5::decode_sign(read_int());
 }
 
 // Compressing floats is simple, because the only common pattern
 // is trailing zeroes.  (Compare leading sign bits on ints.)
 // Since floats are left-justified, as opposed to right-justified
 // ints, we can bit-reverse them in order to take advantage of int
-// compression.
-
+// compression.  Since bit reversal converts trailing zeroes to
+// leading zeroes, the effect is better compression of those common
+// 32-bit float values, such as integers or integers divided by
+// powers of two, that have many trailing zeroes.
 jfloat CompressedReadStream::read_float() {
   int rf = read_int();
-  int f  = reverse_int(rf);
+  int f  = reverse_bits(rf);
   return jfloat_cast(f);
 }
 
+// The treatment of doubles is similar.  We could bit-reverse each
+// entire 64-bit word, but it is almost as effective to bit-reverse
+// the individual halves.  Since we are going to encode them
+// separately as 32-bit halves anyway, it seems slightly simpler
+// to reverse after splitting, and when reading reverse each
+// half before joining them together.
 jdouble CompressedReadStream::read_double() {
   jint rh = read_int();
   jint rl = read_int();
-  jint h  = reverse_int(rh);
-  jint l  = reverse_int(rl);
+  jint h  = reverse_bits(rh);
+  jint l  = reverse_bits(rl);
   return jdouble_cast(jlong_from(h, l));
 }
 
+// A 64-bit long is encoded into distinct 32-bit halves.  This saves
+// us from having to define a 64-bit encoding and is almost as
+// effective.  A modified LEB128 could encode longs into 9 bytes, and
+// this technique maxes out at 10 bytes, so, if we didn't mind the
+// extra complexity of another coding system, we could process 64-bit
+// values as single units.  But, the complexity does not seem
+// worthwhile.
 jlong CompressedReadStream::read_long() {
   jint low  = read_signed_int();
   jint high = read_signed_int();
@@ -70,26 +79,31 @@ CompressedWriteStream::CompressedWriteStream(int initial_size) : CompressedStrea
 }
 
 void CompressedWriteStream::grow() {
-  u_char* _new_buffer = NEW_RESOURCE_ARRAY(u_char, _size * 2);
+  int nsize = _size * 2;  // default policy: double the current capacity
+  const int min_expansion = UNSIGNED5::MAX_LENGTH;
+  if (nsize < min_expansion*2) {  // floor: new size is never below 2*MAX_LENGTH bytes
+    nsize = min_expansion*2;
+  }
+  u_char* _new_buffer = NEW_RESOURCE_ARRAY(u_char, nsize);
   memcpy(_new_buffer, _buffer, _position);
   _buffer = _new_buffer;
-  _size   = _size * 2;
+  _size   = nsize;  // record the capacity actually allocated above
 }
 
 void CompressedWriteStream::write_float(jfloat value) {
   juint f = jint_cast(value);
-  juint rf = reverse_int(f);
-  assert(f == reverse_int(rf), "can re-read same bits");
+  juint rf = reverse_bits(f);
+  assert(f == reverse_bits(rf), "can re-read same bits");
   write_int(rf);
 }
 
 void CompressedWriteStream::write_double(jdouble value) {
   juint h  = high(jlong_cast(value));
   juint l  = low( jlong_cast(value));
-  juint rh = reverse_int(h);
-  juint rl = reverse_int(l);
-  assert(h == reverse_int(rh), "can re-read same bits");
-  assert(l == reverse_int(rl), "can re-read same bits");
+  juint rh = reverse_bits(h);
+  juint rl = reverse_bits(l);
+  assert(h == reverse_bits(rh), "can re-read same bits");
+  assert(l == reverse_bits(rl), "can re-read same bits");
   write_int(rh);
   write_int(rl);
 }

diff --git a/src/hotspot/share/code/compressedStream.hpp b/src/hotspot/share/code/compressedStream.hpp
@@ -26,6 +26,7 @@
 #define SHARE_CODE_COMPRESSEDSTREAM_HPP
 
 #include "memory/allocation.hpp"
+#include "utilities/unsigned5.hpp"
 
 // Simple interface for filing out and filing in basic types
 // Used for writing out and reading in debugging information.
@@ -36,18 +37,6 @@ class CompressedStream : public ResourceObj {
   u_char* _buffer;
   int     _position;
 
-  enum {
-    // Constants for UNSIGNED5 coding of Pack200
-    lg_H = 6, H = 1<<lg_H,    // number of high codes (64)
-    L = (1<<BitsPerByte)-H,   // number of low codes (192)
-    MAX_i = 4                 // bytes are numbered in (0..4), max 5 bytes
-  };
-
-  // 32-bit one-to-one sign encoding taken from Pack200
-  // converts leading sign bits into leading zeroes with trailing sign bit
-  static juint encode_sign(jint  value) { return (value << 1) ^ (value >> 31); }
-  static jint  decode_sign(juint value) { return (value >> 1) ^ -(jint)(value & 1); }
-  static juint reverse_int(juint i);   // to trim trailing float 0's
  public:
   CompressedStream(u_char* buffer, int position = 0) {
     _buffer   = buffer;
@@ -66,41 +55,6 @@ class CompressedReadStream : public CompressedStream {
  private:
   inline u_char read()                 { return _buffer[_position++]; }
 
-  // This encoding, called UNSIGNED5, is taken from J2SE Pack200.
-  // It assumes that most values have lots of leading zeroes.
-  // Very small values, in the range [0..191], code in one byte.
-  // Any 32-bit value (including negatives) can be coded, in
-  // up to five bytes.  The grammar is:
-  //    low_byte  = [0..191]
-  //    high_byte = [192..255]
-  //    any_byte  = low_byte | high_byte
-  //    coding = low_byte
-  //           | high_byte low_byte
-  //           | high_byte high_byte low_byte
-  //           | high_byte high_byte high_byte low_byte
-  //           | high_byte high_byte high_byte high_byte any_byte
-  // Each high_byte contributes six bits of payload.
-  // The encoding is one-to-one (except for integer overflow)
-  // and easy to parse and unparse.
-
-  jint read_int_mb(jint b0) {
-    int     pos = position() - 1;
-    u_char* buf = buffer() + pos;
-    assert(buf[0] == b0 && b0 >= L, "correctly called");
-    jint    sum = b0;
-    // must collect more bytes:  b[1]...b[4]
-    int lg_H_i = lg_H;
-    for (int i = 0; ; ) {
-      jint b_i = buf[++i]; // b_i = read(); ++i;
-      sum += b_i << lg_H_i;  // sum += b[i]*(64**i)
-      if (b_i < L || i == MAX_i) {
-        set_position(pos+i+1);
-        return sum;
-      }
-      lg_H_i += lg_H;
-    }
-  }
-
  public:
   CompressedReadStream(u_char* buffer, int position = 0)
   : CompressedStream(buffer, position) {}
@@ -109,14 +63,14 @@ class CompressedReadStream : public CompressedStream {
   jbyte    read_byte()                 { return (jbyte   ) read();      }
   jchar    read_char()                 { return (jchar   ) read_int();  }
   jshort   read_short()                { return (jshort  ) read_signed_int(); }
-  jint     read_int()                  { jint   b0 = read();
-                                         if (b0 < L)  return b0;
-                                         else         return read_int_mb(b0);
-                                       }
   jint     read_signed_int();
-  jfloat   read_float();               // jfloat_cast(reverse_int(read_int()))
-  jdouble  read_double();              // jdouble_cast(2*reverse_int(read_int))
+  jfloat   read_float();               // jfloat_cast(reverse_bits(read_int()))
+  jdouble  read_double();              // jdouble_cast(2*reverse_bits(read_int))
   jlong    read_long();                // jlong_from(2*read_signed_int())
+
+  jint     read_int() {
+    return UNSIGNED5::read_uint(_buffer, _position, 0);
+  }
 };
 
 
@@ -134,23 +88,6 @@ class CompressedWriteStream : public CompressedStream {
   }
   void grow();
 
-  // UNSIGNED5 coding, 1-5 byte cases
-  void write_int_mb(jint value) {
-    juint sum = value;
-    for (int i = 0; ; ) {
-      if (sum < L || i == MAX_i) {
-        // remainder is either a "low code" or the 5th byte
-        assert(sum == (u_char)sum, "valid byte");
-        write((u_char)sum);
-        break;
-      }
-      sum -= L;
-      int b_i = L + (sum % H);  // this is a "high code"
-      sum >>= lg_H;             // extracted 6 bits
-      write(b_i); ++i;
-    }
-  }
-
  protected:
   int _size;
 
@@ -163,13 +100,15 @@ class CompressedWriteStream : public CompressedStream {
   void write_byte(jbyte value)         { write(value);      }
   void write_char(jchar value)         { write_int(value); }
   void write_short(jshort value)       { write_signed_int(value);  }
-  void write_int(jint value)           { if ((juint)value < L && !full())
-                                               store((u_char)value);
-                                         else  write_int_mb(value);  }
-  void write_signed_int(jint value)    { write_int(encode_sign(value)); }
-  void write_float(jfloat value);      // write_int(reverse_int(jint_cast(v)))
-  void write_double(jdouble value);    // write_int(reverse_int(<low,high>))
+  void write_signed_int(jint value)    { write_int(UNSIGNED5::encode_sign(value)); }
+  void write_float(jfloat value);      // write_int(reverse_bits(jint_cast(v)))
+  void write_double(jdouble value);    // write_int(reverse_bits(<low,high>))
   void write_long(jlong value);        // write_signed_int(<low,high>)
+
+  void write_int(juint value) {
+    UNSIGNED5::write_uint_grow(value, _buffer, _position, _size,
+                               [&](int){ grow(); });
+  }
 };
 
 #endif // SHARE_CODE_COMPRESSEDSTREAM_HPP
diff --git a/src/hotspot/share/utilities/debug.cpp b/src/hotspot/share/utilities/debug.cpp
@@ -61,6 +61,7 @@
 #include "utilities/formatBuffer.hpp"
 #include "utilities/globalDefinitions.hpp"
 #include "utilities/macros.hpp"
+#include "utilities/unsigned5.hpp"
 #include "utilities/vmError.hpp"
 
 #include <stdio.h>
@@ -648,6 +649,37 @@ extern "C" JNIEXPORT void findbcp(intptr_t method, intptr_t bcp) {
   }
 }
 
+// Check and decode a single u5 value at addr; yields 0 if check_length fails.
+extern "C" JNIEXPORT u4 u5decode(intptr_t addr) {
+  Command c("u5decode");
+  u1* arr = (u1*)addr;
+  size_t off = 0, lim = 5;  // limit 5: an UNSIGNED5 coding is at most five bytes
+  if (!UNSIGNED5::check_length(arr, off, lim)) {
+    return 0;  // NOTE: indistinguishable from a successfully decoded value of 0
+  }
+  return UNSIGNED5::read_uint(arr, off, lim);
+}
+
+// Sets up a Reader from addr/limit and prints count items.
+// A limit of zero means no set limit; stop at the first null
+// or after count items are printed.
+// A count of zero or less is converted to -1, which means
+// there is no limit on the count of items printed; the
+// printing stops when a null is printed or at limit.
+// See documentation for UNSIGNED5::Reader::print(count).
+extern "C" JNIEXPORT intptr_t u5p(intptr_t addr,
+                                  intptr_t limit,
+                                  int count) {
+  Command c("u5p");
+  u1* arr = (u1*)addr;
+  if (limit && limit < addr)  limit = addr;  // clamp so (limit - addr) cannot go negative
+  size_t lim = !limit ? 0 : (limit - addr);  // NOTE(review): limit <= addr gives lim == 0, i.e. "no set limit" - confirm intended
+  size_t endpos = UNSIGNED5::print_count(count > 0 ? count : -1,
+                                         arr, (size_t)0, lim);
+  return addr + endpos;  // addr advanced by the end position print_count returned
+}
+
+
 // int versions of all methods to avoid having to type type casts in the debugger
 
 void pp(intptr_t p)          { pp((void*)p); }

diff --git a/src/hotspot/share/utilities/unsigned5.cpp b/src/hotspot/share/utilities/unsigned5.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "memory/allocation.hpp"
+#include "utilities/unsigned5.hpp"
+
+// Most of UNSIGNED5 is in the header file.
+// Let's put a few debug functions out-of-line here.
+
+// For the record, UNSIGNED5 was defined around 2001 and was first
+// published in the initial Pack200 spec.  See:
+// https://docs.oracle.com/en/java/javase/11/docs/specs/pack-spec.html
+// in Section 6.1, "Encoding of Small Whole Numbers".
+
+PRAGMA_DIAG_PUSH
+PRAGMA_FORMAT_NONLITERAL_IGNORED
+
+// For debugging, even in product builds (see debug.cpp).
+// Prints up to count values (no cap if count < 0) between left and right.
+template<typename ARR, typename OFF, typename GET>
+void UNSIGNED5::Reader<ARR,OFF,GET>::
+print_on(outputStream* st, int count,
+         const char* left,   // "U5: ["
+         const char* right   // "] (values=%d/length=%d)\n"
+         ) {
+  if (left == NULL)   left = "U5: [";
+  if (right == NULL)  right = "] (values=%d/length=%d)\n";
+  int printed = 0;
+  st->print("%s", left);
+  for (;;) {
+    if (count >= 0 && printed >= count)  break;
+    if (!has_next()) {
+      // A zero byte here is shown as "null" and counted as one printed item.
+      if ((_limit == 0 || _position < _limit) && _array[_position] == 0) {
+        st->print(" null");
+        ++_position;  // skip null byte
+        ++printed;
+        if (_limit != 0)  continue;  // keep going to explicit limit
+      }
+      break;
+    }
+    // value is an unsigned u4: %u keeps values >= 2^31 from printing negative
+    u4 value = next_uint();
+    st->print("%s%u", (printed == 0 ? "" : " "), value);
+    ++printed;
+  }
+  st->print(right,
+            // these arguments may or may not be used in the format string:
+            printed,
+            (int)_position);
+}
+
+PRAGMA_DIAG_POP
+
+// Explicit instantiation for supported types.
+template void UNSIGNED5::Reader<char*,int>::
+print_on(outputStream* st, int count, const char* left, const char* right);
+template void UNSIGNED5::Reader<u1*,int>::
+print_on(outputStream* st, int count, const char* left, const char* right);
+template void UNSIGNED5::Reader<address,size_t>::
+print_on(outputStream* st, int count, const char* left, const char* right);