Navigation Menu

Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8291660: Grapheme support in BreakIterator #9991

Closed
wants to merge 10 commits into from
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -94,25 +94,20 @@ private static void generateFiles() throws Exception {
rules = (ResourceBundle) Class.forName(
localizedBundleName("sun.text.resources", "BreakIteratorRules")).getDeclaredConstructor().newInstance();

if (info.containsKey("CharacterData")) {
generateDataFile(info.getString("CharacterData"),
rules.getString("CharacterBreakRules"),
classNames[0]);
}
if (info.containsKey("WordData")) {
generateDataFile(info.getString("WordData"),
rules.getString("WordBreakRules"),
classNames[1]);
classNames[0]);
}
if (info.containsKey("LineData")) {
generateDataFile(info.getString("LineData"),
rules.getString("LineBreakRules"),
classNames[2]);
classNames[1]);
}
if (info.containsKey("SentenceData")) {
generateDataFile(info.getString("SentenceData"),
rules.getString("SentenceBreakRules"),
classNames[3]);
classNames[2]);
}
}

Expand Down
6 changes: 3 additions & 3 deletions make/modules/java.base/gensrc/GensrcEmojiData.gmk
Expand Up @@ -24,12 +24,12 @@
#

#
# Rules to create $(SUPPORT_OUTPUTDIR)/gensrc/java.base/java/util/regex/EmojiData.java
# Rules to create $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/EmojiData.java
#

GENSRC_EMOJIDATA := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/java/util/regex/EmojiData.java
GENSRC_EMOJIDATA := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/EmojiData.java

EMOJIDATATEMP = $(MODULE_SRC)/share/classes/java/util/regex/EmojiData.java.template
EMOJIDATATEMP = $(MODULE_SRC)/share/classes/jdk/internal/util/regex/EmojiData.java.template
UNICODEDATA = $(MODULE_SRC)/share/data/unicodedata

$(GENSRC_EMOJIDATA): $(BUILD_TOOLS_JDK) $(EMOJIDATATEMP) $(UNICODEDATA)/emoji/emoji-data.txt
Expand Down
8 changes: 7 additions & 1 deletion src/java.base/share/classes/java/text/BreakIterator.java
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1996, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1996, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -99,6 +99,12 @@
* and a diacritical mark. What users consider to be a character can
* differ between languages.
*
* @implSpec The default implementation of the character boundary analysis
* conforms to the Unicode Consortium's Extended Grapheme Cluster breaks.
* For more detail, refer to the
* <a href="https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries">
* Grapheme Cluster Boundaries</a> section of the Unicode Standard Annex #29.
*
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say this is an @implSpec instead of an @implNote because we want applications to be able to rely on this behavior if they're using the default implementation. Also, please make the corresponding update to the CSR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

* <p>
* The {@code BreakIterator} instances returned by the factory methods
* of this class are intended for use with natural languages only, not for
Expand Down
1 change: 1 addition & 0 deletions src/java.base/share/classes/java/util/regex/Pattern.java
Expand Up @@ -44,6 +44,7 @@
import java.util.stream.StreamSupport;

import jdk.internal.util.ArraysSupport;
import jdk.internal.util.regex.Grapheme;

/**
* A compiled representation of a regular expression.
Expand Down
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand All @@ -23,7 +23,7 @@
* questions.
*/

package java.util.regex;
package jdk.internal.util.regex;

/**
* Holds data contained in the Unicode Technical Standard #51: Unicode
Expand Down
Expand Up @@ -23,11 +23,11 @@
* questions.
*/

package java.util.regex;
package jdk.internal.util.regex;

import java.util.Objects;

final class Grapheme {
public final class Grapheme {

/**
* Look for the next extended grapheme cluster boundary in a CharSequence.
Expand All @@ -43,7 +43,7 @@ final class Grapheme {
* @param limit limit offset in the src (exclusive)
* @return the next grapheme boundary
*/
static int nextBoundary(CharSequence src, int off, int limit) {
public static int nextBoundary(CharSequence src, int off, int limit) {
Objects.checkFromToIndex(off, limit, src.length());

int ch0 = Character.codePointAt(src, off);
Expand Down
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -50,15 +50,13 @@ protected final Object[][] getContents() {
// built-in type of BreakIterator
{"BreakIteratorClasses",
new String[] {
"RuleBasedBreakIterator", // character-break iterator class
"RuleBasedBreakIterator", // word-break iterator class
"RuleBasedBreakIterator", // line-break iterator class
"RuleBasedBreakIterator" // sentence-break iterator class
}
},

// Rules filename for each break-iterator
{"CharacterData", "CharacterBreakIteratorData"},
{"WordData", "WordBreakIteratorData"},
{"LineData", "LineBreakIteratorData"},
{"SentenceData", "SentenceBreakIteratorData"},
Expand Down
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -67,53 +67,6 @@
public class BreakIteratorRules extends ListResourceBundle {
protected final Object[][] getContents() {
return new Object[][] {
// rules describing how to break between logical characters
{ "CharacterBreakRules",

// ignore non-spacing marks and enclosing marks (since we never
// put a break before ignore characters, this keeps combining
// accents with the base characters they modify)
"<enclosing>=[:Mn::Me:];"

// other category definitions
+ "<choseong>=[\u1100-\u115f];"
+ "<jungseong>=[\u1160-\u11a7];"
+ "<jongseong>=[\u11a8-\u11ff];"
+ "<surr-hi>=[\ud800-\udbff];"
+ "<surr-lo>=[\udc00-\udfff];"

// break after every character, except as follows:
+ ".;"

// keep base and combining characters together
+ "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
+ "<base><enclosing><enclosing>*;"

// keep CRLF sequences together
+ "\r\n;"

// keep surrogate pairs together
+ "<surr-hi><surr-lo>;"

// keep Hangul syllables spelled out using conjoining jamo together
+ "<choseong>*<jungseong>*<jongseong>*;"

// various additions for Hindi support
+ "<nukta>=[\u093c];"
+ "<danda>=[\u0964\u0965];"
+ "<virama>=[\u094d];"
+ "<devVowelSign>=[\u093e-\u094c\u0962\u0963];"
+ "<devConsonant>=[\u0915-\u0939];"
+ "<devNuktaConsonant>=[\u0958-\u095f];"
+ "<devCharEnd>=[\u0902\u0903\u0951-\u0954];"
+ "<devCAMN>=(<devConsonant>{<nukta>});"
+ "<devConsonant1>=(<devNuktaConsonant>|<devCAMN>);"
+ "<zwj>=[\u200d];"
+ "<devConjunct>=({<devConsonant1><virama>{<zwj>}}<devConsonant1>);"
+ "<devConjunct>{<devVowelSign>}{<devCharEnd>};"
+ "<danda><nukta>;"
},

// default rules for finding word boundaries
{ "WordBreakRules",
// ignore non-spacing marks, enclosing marks, and format characters,
Expand Down