Skip to content

Commit 7c991cc

Browse files
committed Sep 20, 2023
8296246: Update Unicode Data Files to Version 15.1.0
Reviewed-by: erikj, joehw, srl, rriggs
1 parent a021dbc commit 7c991cc

File tree

22 files changed

+1518
-231
lines changed

22 files changed

+1518
-231
lines changed
 

‎make/ToolsJdk.gmk

+2-2
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ TOOL_GENERATECACERTS = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_class
7676
-Dkeystore.pkcs12.macAlgorithm=NONE \
7777
build.tools.generatecacerts.GenerateCacerts
7878

79-
TOOL_GENERATEEMOJIDATA = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
80-
build.tools.generateemojidata.GenerateEmojiData
79+
TOOL_GENERATEEXTRAPROPERTIES = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
80+
build.tools.generateextraproperties.GenerateExtraProperties
8181

8282
TOOL_MAKEZIPREPRODUCIBLE = $(JAVA_SMALL) -cp $(BUILDTOOLS_OUTPUTDIR)/jdk_tools_classes \
8383
build.tools.makezipreproducible.MakeZipReproducible

‎make/jdk/src/classes/build/tools/generatecharacter/PropList.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ private PropList(File file, int plane) throws IOException {
6464

6565
int i, j;
6666
BufferedReader sbfr = new BufferedReader(new FileReader(file));
67-
Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s*;\\s+(\\w+)\\s+#.*").matcher("");
67+
Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s*;\\s+(\\w+)[;\\s].*").matcher("");
6868
String line = null;
6969
int lineNo = 0;
7070
while ((line = sbfr.readLine()) != null) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
/*
2+
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation. Oracle designates this
8+
* particular file as subject to the "Classpath" exception as provided
9+
* by Oracle in the LICENSE file that accompanied this code.
10+
*
11+
* This code is distributed in the hope that it will be useful, but WITHOUT
12+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14+
* version 2 for more details (a copy is included in the LICENSE file that
15+
* accompanied this code).
16+
*
17+
* You should have received a copy of the GNU General Public License version
18+
* 2 along with this work; if not, write to the Free Software Foundation,
19+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20+
*
21+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22+
* or visit www.oracle.com if you need additional information or have any
23+
* questions.
24+
*/
25+
package build.tools.generateextraproperties;
26+
27+
import java.io.IOException;
28+
import java.nio.file.Files;
29+
import java.nio.file.Paths;
30+
import java.nio.file.StandardOpenOption;
31+
import java.util.ArrayList;
32+
import java.util.Arrays;
33+
import java.util.HashMap;
34+
import java.util.List;
35+
import java.util.function.Predicate;
36+
import java.util.stream.Collectors;
37+
import java.util.stream.Stream;
38+
39+
/**
 * Parses extra properties files of UCD, and replaces the placeholders in
 * the given template source file with the generated conditions, then emits
 * .java files. For example, if the properties file has:
 * <blockquote><pre>
 *     0009..000D    ; Type (; Value)
 *     0020          ; Type (; Value)
 *     2000..200A    ; Type (; Value)
 * </pre></blockquote>
 * and the template file contains
 * <blockquote><pre>
 *     %%%Type(=Value)%%%
 * </pre></blockquote>
 * then the generated .java file would have the following in place:
 * <blockquote><pre>{@code
 *     (cp >= 0x0009 && cp <= 0x000D) ||
 *      cp == 0x0020 ||
 *     (cp >= 0x2000 && cp <= 0x200A);
 * }</pre></blockquote>
 * Note that those in parentheses in the properties file and the
 * template file are optional.
 *
 * Arguments to this utility:
 *    args[0]: Full path string to the template file
 *    args[1]: Full path string to the properties file
 *    args[2]: Full path string to the generated .java file
 *    args[3...]: Names of the property to generate the conditions
 */
public class GenerateExtraProperties {

    /**
     * Entry point of the tool; see the class description for the arguments.
     *
     * @param args template path, properties path, output path, followed by
     *             zero or more property names (optionally {@code Name=Value})
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws java.io.UncheckedIOException if reading the inputs or writing
     *         the output fails, so that the invoking build step fails instead
     *         of silently continuing without (or with a stale) generated file
     */
    public static void main(String[] args) {
        if (args.length < 3) {
            throw new IllegalArgumentException(
                "Usage: GenerateExtraProperties <template> <properties> <output> [property...]");
        }
        var templateFile = Paths.get(args[0]);
        var propertiesFile = Paths.get(args[1]);
        var gensrcFile = Paths.get(args[2]);
        var propertyNames = Arrays.copyOfRange(args, 3, args.length);
        var replacementMap = new HashMap<String, String>();

        try {
            for (var propertyName : propertyNames) {
                List<Range> ranges;
                // Files.lines keeps the file open until the stream is closed.
                try (Stream<String> lines = Files.lines(propertiesFile)) {
                    ranges = collectRanges(lines, propertyName);
                }

                replacementMap.put("%%%" + propertyName + "%%%",
                    ranges.stream()
                          .map(GenerateExtraProperties::rangeToString)
                          .collect(Collectors.joining(" ||\n", "", ";")));
            }

            // Generate .java file: a template line that (ignoring surrounding
            // whitespace) is exactly a placeholder is replaced as a whole,
            // every other line is copied verbatim.
            List<String> generated;
            try (Stream<String> template = Files.lines(templateFile)) {
                generated = template
                    .map(l -> replacementMap.getOrDefault(l.trim(), l))
                    .collect(Collectors.toList());
            }
            Files.write(gensrcFile, generated,
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        } catch (IOException e) {
            // Fully qualified to avoid touching the file's import block.
            throw new java.io.UncheckedIOException("Failed to generate " + gensrcFile, e);
        }
    }

    /**
     * Extracts the code point ranges of one property from the lines of a
     * properties file, sorted by start and with adjacent or overlapping
     * ranges merged into one.
     *
     * @param lines        lines of the properties file; consumed by this call
     * @param propertyName property name as given on the command line,
     *                     either {@code Type} or {@code Type=Value}
     * @return the merged ranges in ascending order, possibly empty
     */
    private static List<Range> collectRanges(Stream<String> lines, String propertyName) {
        // "Type=Value" on the command line appears as "; Type; Value" in the file
        var pn = "; " + propertyName.replaceFirst("=", "; ");

        List<Range> sorted = lines
            .filter(Predicate.not(l -> l.startsWith("#") || l.isBlank()))
            .filter(l -> l.contains(pn))
            .map(l -> new Range(l))
            .sorted()
            .collect(Collectors.toList());

        var merged = new ArrayList<Range>();
        for (Range r : sorted) {
            int lastIndex = merged.size() - 1;
            if (lastIndex >= 0 && r.start <= merged.get(lastIndex).last + 1) {
                // collapse consecutive (or overlapping) ranges into one
                Range prev = merged.get(lastIndex);
                merged.set(lastIndex, new Range(prev.start, Math.max(prev.last, r.last)));
            } else {
                merged.add(r);
            }
        }
        return merged;
    }

    /**
     * Renders a range as a Java boolean expression over the variable
     * {@code cp}. A single code point becomes one equality test, a range of
     * two code points becomes two equality tests, and anything longer becomes
     * a parenthesized bounds check. The leading indentation (12 columns, or
     * 11 plus the opening parenthesis) lines the operands up in the output.
     *
     * @param r the range to render
     * @return the expression text, without a trailing operator or semicolon
     */
    static String rangeToString(Range r) {
        if (r.start == r.last) {
            return " ".repeat(12) + "cp == 0x" + toHexString(r.start);
        } else if (r.start == r.last - 1) {
            return " ".repeat(12) + "cp == 0x" + toHexString(r.start) + " ||\n" +
                   " ".repeat(12) + "cp == 0x" + toHexString(r.last);
        } else {
            return " ".repeat(11) + "(cp >= 0x" + toHexString(r.start) +
                   " && cp <= 0x" + toHexString(r.last) + ")";
        }
    }

    /**
     * Parses a hexadecimal code point string.
     *
     * @param hexStr hexadecimal digits without any prefix
     * @return the parsed value
     * @throws NumberFormatException if {@code hexStr} is not valid hexadecimal
     */
    static int toInt(String hexStr) {
        return Integer.parseUnsignedInt(hexStr, 16);
    }

    /**
     * Formats a code point as upper-case hexadecimal, zero-padded to at
     * least four digits.
     *
     * @param cp the code point
     * @return the hexadecimal text, e.g. {@code "0009"} or {@code "1F600"}
     */
    static String toHexString(int cp) {
        String ret = Integer.toUnsignedString(cp, 16).toUpperCase();
        if (ret.length() < 4) {
            ret = "0".repeat(4 - ret.length()) + ret;
        }
        return ret;
    }

    /**
     * An inclusive range of code points, ordered by its start.
     */
    static class Range implements Comparable<Range> {
        final int start;
        final int last;

        Range (int start, int last) {
            this.start = start;
            this.last = last;
        }

        /**
         * Parses the code point field of a properties file line. Accepts
         * either the bare field ({@code "0020"}, {@code "0009..000D"}) or a
         * whole line; everything from the first {@code ';'} or {@code '#'}
         * (and the whitespace in front of it) is ignored.
         *
         * @param input the field or line to parse
         * @throws NumberFormatException if the field is not hexadecimal
         * @throws IllegalArgumentException if the range is reversed
         */
        Range (String input) {
            String field = input.replaceFirst("\\s*[;#].*", "").trim();
            int sep = field.indexOf("..");
            if (sep < 0) {
                start = toInt(field);
                last = start;
            } else {
                start = toInt(field.substring(0, sep));
                last = toInt(field.substring(sep + 2));
            }
            if (start > last) {
                throw new IllegalArgumentException("Reversed code point range: " + input);
            }
        }

        @Override
        public String toString() {
            return "Start: " + toHexString(start) + ", Last: " + toHexString(last);
        }

        @Override
        public int compareTo(Range other) {
            return Integer.compare(start, other.start);
        }
    }
}

‎make/modules/java.base/Gensrc.gmk

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ include gensrc/GensrcExceptions.gmk
3535
include gensrc/GensrcVarHandles.gmk
3636
include gensrc/GensrcModuleLoaderMap.gmk
3737
include gensrc/GensrcScopedMemoryAccess.gmk
38+
include gensrc/GensrcRegex.gmk
3839

3940
# GensrcLocaleData.gmk does not set TARGETS, so we must choose which targets
4041
# to include.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#
2+
# Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
3+
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
#
5+
# This code is free software; you can redistribute it and/or modify it
6+
# under the terms of the GNU General Public License version 2 only, as
7+
# published by the Free Software Foundation. Oracle designates this
8+
# particular file as subject to the "Classpath" exception as provided
9+
# by Oracle in the LICENSE file that accompanied this code.
10+
#
11+
# This code is distributed in the hope that it will be useful, but WITHOUT
12+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14+
# version 2 for more details (a copy is included in the LICENSE file that
15+
# accompanied this code).
16+
#
17+
# You should have received a copy of the GNU General Public License version
18+
# 2 along with this work; if not, write to the Free Software Foundation,
19+
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20+
#
21+
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22+
# or visit www.oracle.com if you need additional information or have any
23+
# questions.
24+
#
25+
26+
#
27+
# Rules to create java files under
28+
# $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/
29+
#
30+
31+
# Path of the IndicConjunctBreak.java source file produced by the rule below.
GENSRC_INDICCONJUNCTBREAK := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/IndicConjunctBreak.java
32+
33+
# Template file passed to the generator tool as its first argument.
INDICCONJUNCTBREAKTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/IndicConjunctBreak.java.template
34+
# Unicode data file passed to the generator tool as its second argument.
INDICCONJUNCTBREAKPROPS := $(MODULE_SRC)/share/data/unicodedata/DerivedCoreProperties.txt
35+
# Property=value names passed to the generator tool as its trailing arguments.
INDICCONJUNCTBREAKPARAMS := InCB=Linker InCB=Extend InCB=Consonant
36+
37+
# Regenerate the source file whenever the build tools, the template or the
# data file change. Arguments are given in the order: template, data file,
# output file, property names.
$(GENSRC_INDICCONJUNCTBREAK): $(BUILD_TOOLS_JDK) $(INDICCONJUNCTBREAKTEMP) $(INDICCONJUNCTBREAKPROPS)
38+
$(call LogInfo, Generating $@)
39+
$(call MakeTargetDir)
40+
$(TOOL_GENERATEEXTRAPROPERTIES) \
41+
$(INDICCONJUNCTBREAKTEMP) \
42+
$(INDICCONJUNCTBREAKPROPS) \
43+
$(GENSRC_INDICCONJUNCTBREAK) \
44+
$(INDICCONJUNCTBREAKPARAMS)
45+
46+
# Add the generated file to the list of targets.
TARGETS += $(GENSRC_INDICCONJUNCTBREAK)

‎src/java.base/share/classes/java/lang/Character.java

+29-13
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@
6363
* from the Unicode Consortium at
6464
* <a href="http://www.unicode.org">http://www.unicode.org</a>.
6565
* <p>
66-
* Character information is based on the Unicode Standard, version 15.0.
66+
* Character information is based on the Unicode Standard, version 15.1.
6767
* <p>
6868
* The Java platform has supported different versions of the Unicode
6969
* Standard over time. Upgrades to newer versions of the Unicode Standard
@@ -75,6 +75,8 @@
7575
* <th scope="col">Unicode version</th></tr>
7676
* </thead>
7777
* <tbody>
78+
* <tr><th scope="row" style="text-align:left">Java SE 22</th>
79+
* <td>Unicode 15.1</td></tr>
7880
* <tr><th scope="row" style="text-align:left">Java SE 20</th>
7981
* <td>Unicode 15.0</td></tr>
8082
* <tr><th scope="row" style="text-align:left">Java SE 19</th>
@@ -744,7 +746,7 @@ public static final class UnicodeBlock extends Subset {
744746
* It should be adjusted whenever the Unicode Character Database
745747
* is upgraded.
746748
*/
747-
private static final int NUM_ENTITIES = 756;
749+
private static final int NUM_ENTITIES = 759;
748750
private static Map<String, UnicodeBlock> map = HashMap.newHashMap(NUM_ENTITIES);
749751

750752
/**
@@ -3611,6 +3613,16 @@ private UnicodeBlock(String idName, String... aliases) {
36113613
"CJK UNIFIED IDEOGRAPHS EXTENSION H",
36123614
"CJKUNIFIEDIDEOGRAPHSEXTENSIONH");
36133615

3616+
/**
3617+
* Constant for the "CJK Unified Ideographs Extension I" Unicode
3618+
* character block.
3619+
* @since 22
3620+
*/
3621+
public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I =
3622+
new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I",
3623+
"CJK UNIFIED IDEOGRAPHS EXTENSION I",
3624+
"CJKUNIFIEDIDEOGRAPHSEXTENSIONI");
3625+
36143626
private static final int[] blockStarts = {
36153627
0x0000, // 0000..007F; Basic Latin
36163628
0x0080, // 0080..00FF; Latin-1 Supplement
@@ -3978,7 +3990,8 @@ private UnicodeBlock(String idName, String... aliases) {
39783990
0x2B740, // 2B740..2B81F; CJK Unified Ideographs Extension D
39793991
0x2B820, // 2B820..2CEAF; CJK Unified Ideographs Extension E
39803992
0x2CEB0, // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
3981-
0x2EBF0, // unassigned
3993+
0x2EBF0, // 2EBF0..2EE5F; CJK Unified Ideographs Extension I
3994+
0x2EE60, // unassigned
39823995
0x2F800, // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
39833996
0x2FA20, // unassigned
39843997
0x30000, // 30000..3134F; CJK Unified Ideographs Extension G
@@ -4359,6 +4372,7 @@ private UnicodeBlock(String idName, String... aliases) {
43594372
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
43604373
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
43614374
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
4375+
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I,
43624376
null,
43634377
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
43644378
null,
@@ -6057,9 +6071,7 @@ public static enum UnicodeScript {
60576071
0x2EF4, // 2EF4..2EFF; UNKNOWN
60586072
0x2F00, // 2F00..2FD5; HAN
60596073
0x2FD6, // 2FD6..2FEF; UNKNOWN
6060-
0x2FF0, // 2FF0..2FFB; COMMON
6061-
0x2FFC, // 2FFC..2FFF; UNKNOWN
6062-
0x3000, // 3000..3004; COMMON
6074+
0x2FF0, // 2FF0..3004; COMMON
60636075
0x3005, // 3005 ; HAN
60646076
0x3006, // 3006 ; COMMON
60656077
0x3007, // 3007 ; HAN
@@ -6088,7 +6100,8 @@ public static enum UnicodeScript {
60886100
0x3190, // 3190..319F; COMMON
60896101
0x31A0, // 31A0..31BF; BOPOMOFO
60906102
0x31C0, // 31C0..31E3; COMMON
6091-
0x31E4, // 31E4..31EF; UNKNOWN
6103+
0x31E4, // 31E4..31EE; UNKNOWN
6104+
0x31EF, // 31EF ; COMMON
60926105
0x31F0, // 31F0..31FF; KATAKANA
60936106
0x3200, // 3200..321E; HANGUL
60946107
0x321F, // 321F ; UNKNOWN
@@ -7028,7 +7041,9 @@ public static enum UnicodeScript {
70287041
0x2B820, // 2B820..2CEA1; HAN
70297042
0x2CEA2, // 2CEA2..2CEAF; UNKNOWN
70307043
0x2CEB0, // 2CEB0..2EBE0; HAN
7031-
0x2EBE1, // 2EBE1..2F7FF; UNKNOWN
7044+
0x2EBE1, // 2EBE1..2EBEF; UNKNOWN
7045+
0x2EBF0, // 2EBF0..2EE5D; HAN
7046+
0x2EE5E, // 2EE5E..2F7FF; UNKNOWN
70327047
0x2F800, // 2F800..2FA1D; HAN
70337048
0x2FA1E, // 2FA1E..2FFFF; UNKNOWN
70347049
0x30000, // 30000..3134A; HAN
@@ -7717,9 +7732,7 @@ public static enum UnicodeScript {
77177732
UNKNOWN, // 2EF4..2EFF
77187733
HAN, // 2F00..2FD5
77197734
UNKNOWN, // 2FD6..2FEF
7720-
COMMON, // 2FF0..2FFB
7721-
UNKNOWN, // 2FFC..2FFF
7722-
COMMON, // 3000..3004
7735+
COMMON, // 2FF0..3004
77237736
HAN, // 3005
77247737
COMMON, // 3006
77257738
HAN, // 3007
@@ -7748,7 +7761,8 @@ public static enum UnicodeScript {
77487761
COMMON, // 3190..319F
77497762
BOPOMOFO, // 31A0..31BF
77507763
COMMON, // 31C0..31E3
7751-
UNKNOWN, // 31E4..31EF
7764+
UNKNOWN, // 31E4..31EE
7765+
COMMON, // 31EF
77527766
KATAKANA, // 31F0..31FF
77537767
HANGUL, // 3200..321E
77547768
UNKNOWN, // 321F
@@ -8688,7 +8702,9 @@ public static enum UnicodeScript {
86888702
HAN, // 2B820..2CEA1
86898703
UNKNOWN, // 2CEA2..2CEAF
86908704
HAN, // 2CEB0..2EBE0
8691-
UNKNOWN, // 2EBE1..2F7FF
8705+
UNKNOWN, // 2EBE1..2EBEF
8706+
HAN, // 2EBF0..2EE5D
8707+
UNKNOWN, // 2EE5E..2F7FF
86928708
HAN, // 2F800..2FA1D
86938709
UNKNOWN, // 2FA1E..2FFFF
86948710
HAN, // 30000..3134A

0 commit comments

Comments
 (0)
Please sign in to comment.