Skip to content

Commit 80e9797

Browse files
committedMar 20, 2023
8304433: cleanup sentence breaker code in DocTreeMaker
Reviewed-by: hannesw
1 parent c396f1e commit 80e9797

File tree

2 files changed

+211
-196
lines changed

2 files changed

+211
-196
lines changed
 

‎src/jdk.compiler/share/classes/com/sun/tools/javac/tree/DocTreeMaker.java

+211-195
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,8 @@
2727

2828
import java.text.BreakIterator;
2929
import java.util.ArrayList;
30-
import java.util.Collection;
3130
import java.util.Collections;
3231
import java.util.List;
33-
import java.util.ListIterator;
3432
import java.util.Set;
3533

3634
import javax.lang.model.element.Name;
@@ -110,10 +108,6 @@ public class DocTreeMaker implements DocTreeFactory {
110108
/** The context key for the tree factory. */
111109
protected static final Context.Key<DocTreeMaker> treeMakerKey = new Context.Key<>();
112110

113-
// A subset of block tags, which acts as sentence breakers, appearing
114-
// anywhere but the zero'th position in the first sentence.
115-
final Set<String> sentenceBreakTags;
116-
117111
/** Get the TreeMaker instance. */
118112
public static DocTreeMaker instance(Context context) {
119113
DocTreeMaker instance = context.get(treeMakerKey);
@@ -127,6 +121,7 @@ public static DocTreeMaker instance(Context context) {
127121
public int pos;
128122

129123
private final JavacTrees trees;
124+
private final SentenceBreaker breaker;
130125

131126
/** Utility class to parse reference signatures. */
132127
private final ReferenceParser referenceParser;
@@ -139,7 +134,7 @@ protected DocTreeMaker(Context context) {
139134
this.pos = Position.NOPOS;
140135
trees = JavacTrees.instance(context);
141136
referenceParser = new ReferenceParser(ParserFactory.instance(context));
142-
sentenceBreakTags = Set.of("H1", "H2", "H3", "H4", "H5", "H6", "PRE", "P");
137+
breaker = new SentenceBreaker(this);
143138
}
144139

145140
/** Reassign current position.
@@ -518,224 +513,245 @@ public List<DocTree> getFirstSentence(List<? extends DocTree> list) {
518513
return new ArrayList<>(pair.fst);
519514
}
520515

521-
/*
522-
* Breaks up the body tags into the first sentence and its successors.
523-
* The first sentence is determined with the presence of a period,
524-
* block tag, or a sentence break, as returned by the BreakIterator.
525-
* Trailing whitespaces are trimmed.
526-
*/
527-
private Pair<List<DCTree>, List<DCTree>> splitBody(Collection<? extends DocTree> list) {
528-
// pos is modified as we create trees, therefore
529-
// we save the pos and restore it later.
530-
final int savedpos = this.pos;
531-
try {
532-
ListBuffer<DCTree> body = new ListBuffer<>();
533-
// split body into first sentence and body
534-
ListBuffer<DCTree> fs = new ListBuffer<>();
516+
@SuppressWarnings("unchecked")
517+
private static List<DCTree> cast(List<? extends DocTree> list) {
518+
return (List<DCTree>) list;
519+
}
520+
521+
Pair<List<DCTree>, List<DCTree>> splitBody(List<? extends DocTree> list) {
522+
return breaker.splitBody(list);
523+
}
524+
525+
static class SentenceBreaker {
526+
final DocTreeMaker m;
527+
528+
// A subset of block tags, which acts as sentence breakers, appearing
529+
// anywhere but the zero'th position in the first sentence.
530+
static final Set<String> sentenceBreakTags = Set.of(
531+
"H1", "H2", "H3", "H4", "H5", "H6",
532+
"PRE", "P");
533+
534+
SentenceBreaker(DocTreeMaker m) {
535+
this.m = m;
536+
}
537+
538+
/*
539+
* Breaks up the body tags into the first sentence and its successors.
540+
* The first sentence is determined with the presence of a period,
541+
* block tag, or a sentence break, as returned by the BreakIterator.
542+
* Trailing whitespaces are trimmed.
543+
*/
544+
Pair<List<DCTree>, List<DCTree>> splitBody(List<? extends DocTree> list) {
535545
if (list.isEmpty()) {
536-
return new Pair<>(fs.toList(), body.toList());
546+
return new Pair<>(List.of(), List.of());
537547
}
538-
boolean foundFirstSentence = false;
539-
ArrayList<DocTree> alist = new ArrayList<>(list);
540-
ListIterator<DocTree> itr = alist.listIterator();
541-
while (itr.hasNext()) {
542-
boolean isFirst = !itr.hasPrevious();
543-
DocTree dt = itr.next();
544-
int spos = ((DCTree) dt).pos;
545-
if (foundFirstSentence) {
546-
body.add((DCTree) dt);
547-
continue;
548-
}
549-
switch (dt.getKind()) {
550-
case RETURN:
551-
case SUMMARY:
552-
foundFirstSentence = true;
553-
break;
554-
case TEXT:
555-
DCText tt = (DCText) dt;
556-
String s = tt.getBody();
557-
DocTree peekedNext = itr.hasNext()
558-
? alist.get(itr.nextIndex())
559-
: null;
560-
int sbreak = getSentenceBreak(s, peekedNext);
561-
if (sbreak > 0) {
562-
s = s.substring(0, sbreak).stripTrailing();
563-
DCText text = this.at(spos).newTextTree(s);
564-
fs.add(text);
548+
// pos is modified as we create trees, therefore
549+
// we save the pos and restore it later.
550+
final var savedPos = m.pos;
551+
try {
552+
// split list into first sentence and body
553+
var fs = new ListBuffer<DCTree>();
554+
var body = new ListBuffer<DCTree>();
555+
var alist = new ArrayList<>(cast(list)); // copy to allow indexed access for peeking
556+
var iter = alist.listIterator();
557+
var foundFirstSentence = false;
558+
while (iter.hasNext() && !foundFirstSentence) {
559+
boolean isFirst = !iter.hasPrevious();
560+
DCTree dt = iter.next();
561+
switch (dt.getKind()) {
562+
case RETURN, SUMMARY -> {
563+
fs.add(dt);
565564
foundFirstSentence = true;
566-
int nwPos = skipWhiteSpace(tt.getBody(), sbreak);
567-
if (nwPos > 0) {
568-
DCText text2 = this.at(spos + nwPos).newTextTree(tt.getBody().substring(nwPos));
569-
body.add(text2);
565+
}
566+
567+
case TEXT -> {
568+
var dtPos = dt.pos;
569+
var s = ((DCText) dt).getBody();
570+
var peekedNext = iter.hasNext()
571+
? alist.get(iter.nextIndex())
572+
: null;
573+
int sbreak = getSentenceBreak(s, peekedNext);
574+
if (sbreak > 0) {
575+
var fsPart = m.at(dtPos).newTextTree(s.substring(0, sbreak).stripTrailing());
576+
fs.add(fsPart);
577+
int offsetPos = skipWhiteSpace(s, sbreak);
578+
if (offsetPos > 0) {
579+
DCText bodyPart = m.at(dtPos + offsetPos).newTextTree(s.substring(offsetPos));
580+
body.add(bodyPart);
581+
}
582+
foundFirstSentence = true;
583+
} else if (peekedNext != null) {
584+
// if the next doctree is a break, remove trailing spaces
585+
if (isSentenceBreak(peekedNext, false)) {
586+
DCTree next = iter.next();
587+
DCText fsPart = m.at(dtPos).newTextTree(s.stripTrailing());
588+
fs.add(fsPart);
589+
body.add(next);
590+
foundFirstSentence = true;
591+
} else {
592+
fs.add(dt);
593+
}
594+
} else {
595+
fs.add(dt);
570596
}
571-
continue;
572-
} else if (itr.hasNext()) {
573-
// if the next doctree is a break, remove trailing spaces
574-
peekedNext = alist.get(itr.nextIndex());
575-
boolean sbrk = isSentenceBreak(peekedNext, false);
576-
if (sbrk) {
577-
DocTree next = itr.next();
578-
s = s.stripTrailing();
579-
DCText text = this.at(spos).newTextTree(s);
580-
fs.add(text);
581-
body.add((DCTree) next);
597+
}
598+
599+
default -> {
600+
// This ignores certain block tags if they appear first in the list,
601+
// allowing the content of that tag to provide the first sentence.
602+
// It would be better if other block tags always terminated the
603+
// first sentence as well, like lists and tables.
604+
if (isSentenceBreak(dt, isFirst)) {
605+
body.add(dt);
582606
foundFirstSentence = true;
583-
continue;
607+
} else {
608+
fs.add(dt);
584609
}
585610
}
611+
}
612+
}
613+
614+
// if there are remaining elements, then we have found the first
615+
// sentence, and remaining elements are for the body.
616+
while (iter.hasNext()) {
617+
body.add(iter.next());
618+
}
619+
620+
return new Pair<>(fs.toList(), body.toList());
621+
} finally {
622+
m.pos = savedPos;
623+
}
624+
}
625+
626+
/*
627+
* Computes the first sentence break, a simple dot-space algorithm.
628+
*/
629+
private int defaultSentenceBreak(String s) {
630+
// scan for period followed by whitespace
631+
int period = -1;
632+
for (int i = 0; i < s.length(); i++) {
633+
switch (s.charAt(i)) {
634+
case '.':
635+
period = i;
586636
break;
587-
default:
588-
if (isSentenceBreak(dt, isFirst)) {
589-
body.add((DCTree) dt);
590-
foundFirstSentence = true;
591-
continue;
637+
638+
case ' ':
639+
case '\f':
640+
case '\n':
641+
case '\r':
642+
case '\t':
643+
if (period >= 0) {
644+
return i;
592645
}
593646
break;
647+
648+
default:
649+
period = -1;
650+
break;
594651
}
595-
fs.add((DCTree) dt);
596652
}
597-
return new Pair<>(fs.toList(), body.toList());
598-
} finally {
599-
this.pos = savedpos;
653+
return -1;
600654
}
601-
}
602655

603-
private boolean isTextTree(DocTree tree) {
604-
return tree.getKind() == Kind.TEXT;
605-
}
606-
607-
/*
608-
* Computes the first sentence break, a simple dot-space algorithm.
609-
*/
610-
private int defaultSentenceBreak(String s) {
611-
// scan for period followed by whitespace
612-
int period = -1;
613-
for (int i = 0; i < s.length(); i++) {
614-
switch (s.charAt(i)) {
615-
case '.':
616-
period = i;
617-
break;
618-
619-
case ' ':
620-
case '\f':
621-
case '\n':
622-
case '\r':
623-
case '\t':
624-
if (period >= 0) {
625-
return i;
626-
}
627-
break;
656+
/*
657+
* Computes the first sentence, if using a default breaker,
658+
* the break is returned, if not then a -1, indicating that
659+
* more doctree elements are required to be examined.
660+
*
661+
* BreakIterator.next points to the start of the following sentence,
662+
* and does not provide an easy way to disambiguate between "sentence break",
663+
* "possible sentence break" and "not a sentence break" at the end of the input.
664+
* For example, BreakIterator.next returns the index for the end
665+
* of the string for all of these examples,
666+
* using vertical bars to delimit the bounds of the example text
667+
* |Abc| (not a valid end of sentence break, if followed by more text)
668+
* |Abc.| (maybe a valid end of sentence break, depending on the following text)
669+
* |Abc. | (maybe a valid end of sentence break, depending on the following text)
670+
* |"Abc." | (maybe a valid end of sentence break, depending on the following text)
671+
* |Abc. | (definitely a valid end of sentence break)
672+
* |"Abc." | (definitely a valid end of sentence break)
673+
* Therefore, we have to probe further to determine whether
674+
* there really is a sentence break or not at the end of this run of text.
675+
*/
676+
private int getSentenceBreak(String s, DCTree nextTree) {
677+
BreakIterator breakIterator = m.trees.getBreakIterator();
678+
if (breakIterator == null) {
679+
return defaultSentenceBreak(s);
680+
}
681+
breakIterator.setText(s);
682+
final int sbrk = breakIterator.next();
683+
// This is the last doctree, found the droid we are looking for
684+
if (nextTree == null) {
685+
return sbrk;
686+
}
628687

629-
default:
630-
period = -1;
631-
break;
688+
// If the break is well within the span of the string ie. not
689+
// at EOL, then we have a clear break.
690+
if (sbrk < s.length() - 1) {
691+
return sbrk;
632692
}
633-
}
634-
return -1;
635-
}
636693

637-
/*
638-
* Computes the first sentence, if using a default breaker,
639-
* the break is returned, if not then a -1, indicating that
640-
* more doctree elements are required to be examined.
641-
*
642-
* BreakIterator.next points to the start of the following sentence,
643-
* and does not provide an easy way to disambiguate between "sentence break",
644-
* "possible sentence break" and "not a sentence break" at the end of the input.
645-
* For example, BreakIterator.next returns the index for the end
646-
* of the string for all of these examples,
647-
* using vertical bars to delimit the bounds of the example text
648-
* |Abc| (not a valid end of sentence break, if followed by more text)
649-
* |Abc.| (maybe a valid end of sentence break, depending on the following text)
650-
* |Abc. | (maybe a valid end of sentence break, depending on the following text)
651-
* |"Abc." | (maybe a valid end of sentence break, depending on the following text)
652-
* |Abc. | (definitely a valid end of sentence break)
653-
* |"Abc." | (definitely a valid end of sentence break)
654-
* Therefore, we have to probe further to determine whether
655-
* there really is a sentence break or not at the end of this run of text.
656-
*/
657-
private int getSentenceBreak(String s, DocTree dt) {
658-
BreakIterator breakIterator = trees.getBreakIterator();
659-
if (breakIterator == null) {
660-
return defaultSentenceBreak(s);
661-
}
662-
breakIterator.setText(s);
663-
final int sbrk = breakIterator.next();
664-
// This is the last doctree, found the droid we are looking for
665-
if (dt == null) {
666-
return sbrk;
667-
}
694+
if (nextTree.getKind() == Kind.TEXT) {
695+
// Two adjacent text trees, a corner case, perhaps
696+
// produced by a tool synthesizing a doctree. In
697+
// this case, does the break lie within the first span,
698+
// then we have the droid, otherwise allow the callers
699+
// logic to handle the break in the adjacent doctree.
700+
TextTree ttnext = (TextTree) nextTree;
701+
String combined = s + ttnext.getBody();
702+
breakIterator.setText(combined);
703+
int sbrk2 = breakIterator.next();
704+
if (sbrk < sbrk2) {
705+
return sbrk;
706+
}
707+
}
668708

669-
// If the break is well within the span of the string ie. not
670-
// at EOL, then we have a clear break.
671-
if (sbrk < s.length() - 1) {
672-
return sbrk;
673-
}
709+
// Is the adjacent tree a sentence breaker ?
710+
if (isSentenceBreak(nextTree, false)) {
711+
return sbrk;
712+
}
674713

675-
if (isTextTree(dt)) {
676-
// Two adjacent text trees, a corner case, perhaps
677-
// produced by a tool synthesizing a doctree. In
678-
// this case, does the break lie within the first span,
679-
// then we have the droid, otherwise allow the callers
680-
// logic to handle the break in the adjacent doctree.
681-
TextTree ttnext = (TextTree) dt;
682-
String combined = s + ttnext.getBody();
714+
// At this point the adjacent tree is either a javadoc tag ({@..),
715+
// html tag (<..) or an entity (&..). Perform a litmus test, by
716+
// concatenating a sentence, to validate the break earlier identified.
717+
String combined = s + "Dummy Sentence.";
683718
breakIterator.setText(combined);
684719
int sbrk2 = breakIterator.next();
685-
if (sbrk < sbrk2) {
686-
return sbrk;
720+
if (sbrk2 <= sbrk) {
721+
return sbrk2;
687722
}
723+
return -1; // indeterminate at this time
688724
}
689725

690-
// Is the adjacent tree a sentence breaker ?
691-
if (isSentenceBreak(dt, false)) {
692-
return sbrk;
693-
}
694-
695-
// At this point the adjacent tree is either a javadoc tag ({@..),
696-
// html tag (<..) or an entity (&..). Perform a litmus test, by
697-
// concatenating a sentence, to validate the break earlier identified.
698-
String combined = s + "Dummy Sentence.";
699-
breakIterator.setText(combined);
700-
int sbrk2 = breakIterator.next();
701-
if (sbrk2 <= sbrk) {
702-
return sbrk2;
726+
private boolean isSentenceBreak(DCTree dt, boolean isFirstDocTree) {
727+
switch (dt.getKind()) {
728+
case START_ELEMENT:
729+
StartElementTree set = (StartElementTree) dt;
730+
return !isFirstDocTree && dt.pos > 1 && isSentenceBreak(set.getName());
731+
case END_ELEMENT:
732+
EndElementTree eet = (EndElementTree) dt;
733+
return !isFirstDocTree && dt.pos > 1 && isSentenceBreak(eet.getName());
734+
default:
735+
return false;
736+
}
703737
}
704-
return -1; // indeterminate at this time
705-
}
706-
707-
private boolean isSentenceBreak(Name tagName) {
708-
return sentenceBreakTags.contains(StringUtils.toUpperCase(tagName.toString()));
709-
}
710738

711-
private boolean isSentenceBreak(DocTree dt, boolean isFirstDocTree) {
712-
switch (dt.getKind()) {
713-
case START_ELEMENT:
714-
StartElementTree set = (StartElementTree)dt;
715-
return !isFirstDocTree && ((DCTree) dt).pos > 1 && isSentenceBreak(set.getName());
716-
case END_ELEMENT:
717-
EndElementTree eet = (EndElementTree)dt;
718-
return !isFirstDocTree && ((DCTree) dt).pos > 1 && isSentenceBreak(eet.getName());
719-
default:
720-
return false;
739+
private boolean isSentenceBreak(Name tagName) {
740+
return sentenceBreakTags.contains(StringUtils.toUpperCase(tagName.toString()));
721741
}
722-
}
723742

724-
/*
725-
* Returns the position of the first non-whitespace character.
726-
*/
727-
private int skipWhiteSpace(String s, int start) {
728-
for (int i = start; i < s.length(); i++) {
729-
char c = s.charAt(i);
730-
if (!Character.isWhitespace(c)) {
731-
return i;
743+
/*
744+
* Returns the position of the first non-whitespace character.
745+
*/
746+
private int skipWhiteSpace(String s, int start) {
747+
for (int i = start; i < s.length(); i++) {
748+
char c = s.charAt(i);
749+
if (!Character.isWhitespace(c)) {
750+
return i;
751+
}
732752
}
753+
return -1;
733754
}
734-
return -1;
735755
}
736756

737-
@SuppressWarnings("unchecked")
738-
private List<DCTree> cast(List<? extends DocTree> list) {
739-
return (List<DCTree>) list;
740-
}
741757
}

‎test/langtools/tools/javac/tree/SourceDocTreeScannerTest.java

-1
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@
6161
import com.sun.tools.javac.tree.DCTree.DCDocComment;
6262
import com.sun.tools.javac.tree.DCTree.DCReference;
6363
import com.sun.tools.javac.tree.JCTree.JCCompilationUnit;
64-
import com.sun.tools.javac.util.List;
6564
import com.sun.tools.javac.util.Pair;
6665

6766
public class SourceDocTreeScannerTest extends AbstractTreeScannerTest {

1 commit comments

Comments
 (1)

openjdk-notifier[bot] commented on Mar 20, 2023

@openjdk-notifier[bot]
Please sign in to comment.