27
27
28
28
import java .text .BreakIterator ;
29
29
import java .util .ArrayList ;
30
- import java .util .Collection ;
31
30
import java .util .Collections ;
32
31
import java .util .List ;
33
- import java .util .ListIterator ;
34
32
import java .util .Set ;
35
33
36
34
import javax .lang .model .element .Name ;
@@ -110,10 +108,6 @@ public class DocTreeMaker implements DocTreeFactory {
110
108
/** The context key for the tree factory. */
111
109
protected static final Context .Key <DocTreeMaker > treeMakerKey = new Context .Key <>();
112
110
113
- // A subset of block tags, which acts as sentence breakers, appearing
114
- // anywhere but the zero'th position in the first sentence.
115
- final Set <String > sentenceBreakTags ;
116
-
117
111
/** Get the TreeMaker instance. */
118
112
public static DocTreeMaker instance (Context context ) {
119
113
DocTreeMaker instance = context .get (treeMakerKey );
@@ -127,6 +121,7 @@ public static DocTreeMaker instance(Context context) {
127
121
public int pos ;
128
122
129
123
private final JavacTrees trees ;
124
+ private final SentenceBreaker breaker ;
130
125
131
126
/** Utility class to parse reference signatures. */
132
127
private final ReferenceParser referenceParser ;
@@ -139,7 +134,7 @@ protected DocTreeMaker(Context context) {
139
134
this .pos = Position .NOPOS ;
140
135
trees = JavacTrees .instance (context );
141
136
referenceParser = new ReferenceParser (ParserFactory .instance (context ));
142
- sentenceBreakTags = Set . of ( "H1" , "H2" , "H3" , "H4" , "H5" , "H6" , "PRE" , "P" );
137
+ breaker = new SentenceBreaker ( this );
143
138
}
144
139
145
140
/** Reassign current position.
@@ -518,224 +513,245 @@ public List<DocTree> getFirstSentence(List<? extends DocTree> list) {
518
513
return new ArrayList <>(pair .fst );
519
514
}
520
515
521
- /*
522
- * Breaks up the body tags into the first sentence and its successors.
523
- * The first sentence is determined with the presence of a period,
524
- * block tag, or a sentence break, as returned by the BreakIterator.
525
- * Trailing whitespaces are trimmed.
526
- */
527
- private Pair <List <DCTree >, List <DCTree >> splitBody (Collection <? extends DocTree > list ) {
528
- // pos is modified as we create trees, therefore
529
- // we save the pos and restore it later.
530
- final int savedpos = this .pos ;
531
- try {
532
- ListBuffer <DCTree > body = new ListBuffer <>();
533
- // split body into first sentence and body
534
- ListBuffer <DCTree > fs = new ListBuffer <>();
516
+ @ SuppressWarnings ("unchecked" )
517
+ private static List <DCTree > cast (List <? extends DocTree > list ) {
518
+ return (List <DCTree >) list ;
519
+ }
520
+
521
+ Pair <List <DCTree >, List <DCTree >> splitBody (List <? extends DocTree > list ) {
522
+ return breaker .splitBody (list );
523
+ }
524
+
525
+ static class SentenceBreaker {
526
+ final DocTreeMaker m ;
527
+
528
+ // A subset of block tags, which acts as sentence breakers, appearing
529
+ // anywhere but the zero'th position in the first sentence.
530
+ static final Set <String > sentenceBreakTags = Set .of (
531
+ "H1" , "H2" , "H3" , "H4" , "H5" , "H6" ,
532
+ "PRE" , "P" );
533
+
534
+ SentenceBreaker (DocTreeMaker m ) {
535
+ this .m = m ;
536
+ }
537
+
538
+ /*
539
+ * Breaks up the body tags into the first sentence and its successors.
540
+ * The first sentence is determined with the presence of a period,
541
+ * block tag, or a sentence break, as returned by the BreakIterator.
542
+ * Trailing whitespaces are trimmed.
543
+ */
544
+ Pair <List <DCTree >, List <DCTree >> splitBody (List <? extends DocTree > list ) {
535
545
if (list .isEmpty ()) {
536
- return new Pair <>(fs . toList (), body . toList ());
546
+ return new Pair <>(List . of (), List . of ());
537
547
}
538
- boolean foundFirstSentence = false ;
539
- ArrayList <DocTree > alist = new ArrayList <>(list );
540
- ListIterator <DocTree > itr = alist .listIterator ();
541
- while (itr .hasNext ()) {
542
- boolean isFirst = !itr .hasPrevious ();
543
- DocTree dt = itr .next ();
544
- int spos = ((DCTree ) dt ).pos ;
545
- if (foundFirstSentence ) {
546
- body .add ((DCTree ) dt );
547
- continue ;
548
- }
549
- switch (dt .getKind ()) {
550
- case RETURN :
551
- case SUMMARY :
552
- foundFirstSentence = true ;
553
- break ;
554
- case TEXT :
555
- DCText tt = (DCText ) dt ;
556
- String s = tt .getBody ();
557
- DocTree peekedNext = itr .hasNext ()
558
- ? alist .get (itr .nextIndex ())
559
- : null ;
560
- int sbreak = getSentenceBreak (s , peekedNext );
561
- if (sbreak > 0 ) {
562
- s = s .substring (0 , sbreak ).stripTrailing ();
563
- DCText text = this .at (spos ).newTextTree (s );
564
- fs .add (text );
548
+ // pos is modified as we create trees, therefore
549
+ // we save the pos and restore it later.
550
+ final var savedPos = m .pos ;
551
+ try {
552
+ // split list into first sentence and body
553
+ var fs = new ListBuffer <DCTree >();
554
+ var body = new ListBuffer <DCTree >();
555
+ var alist = new ArrayList <>(cast (list )); // copy to allow indexed access for peeking
556
+ var iter = alist .listIterator ();
557
+ var foundFirstSentence = false ;
558
+ while (iter .hasNext () && !foundFirstSentence ) {
559
+ boolean isFirst = !iter .hasPrevious ();
560
+ DCTree dt = iter .next ();
561
+ switch (dt .getKind ()) {
562
+ case RETURN , SUMMARY -> {
563
+ fs .add (dt );
565
564
foundFirstSentence = true ;
566
- int nwPos = skipWhiteSpace (tt .getBody (), sbreak );
567
- if (nwPos > 0 ) {
568
- DCText text2 = this .at (spos + nwPos ).newTextTree (tt .getBody ().substring (nwPos ));
569
- body .add (text2 );
565
+ }
566
+
567
+ case TEXT -> {
568
+ var dtPos = dt .pos ;
569
+ var s = ((DCText ) dt ).getBody ();
570
+ var peekedNext = iter .hasNext ()
571
+ ? alist .get (iter .nextIndex ())
572
+ : null ;
573
+ int sbreak = getSentenceBreak (s , peekedNext );
574
+ if (sbreak > 0 ) {
575
+ var fsPart = m .at (dtPos ).newTextTree (s .substring (0 , sbreak ).stripTrailing ());
576
+ fs .add (fsPart );
577
+ int offsetPos = skipWhiteSpace (s , sbreak );
578
+ if (offsetPos > 0 ) {
579
+ DCText bodyPart = m .at (dtPos + offsetPos ).newTextTree (s .substring (offsetPos ));
580
+ body .add (bodyPart );
581
+ }
582
+ foundFirstSentence = true ;
583
+ } else if (peekedNext != null ) {
584
+ // if the next doctree is a break, remove trailing spaces
585
+ if (isSentenceBreak (peekedNext , false )) {
586
+ DCTree next = iter .next ();
587
+ DCText fsPart = m .at (dtPos ).newTextTree (s .stripTrailing ());
588
+ fs .add (fsPart );
589
+ body .add (next );
590
+ foundFirstSentence = true ;
591
+ } else {
592
+ fs .add (dt );
593
+ }
594
+ } else {
595
+ fs .add (dt );
570
596
}
571
- continue ;
572
- } else if (itr .hasNext ()) {
573
- // if the next doctree is a break, remove trailing spaces
574
- peekedNext = alist .get (itr .nextIndex ());
575
- boolean sbrk = isSentenceBreak (peekedNext , false );
576
- if (sbrk ) {
577
- DocTree next = itr .next ();
578
- s = s .stripTrailing ();
579
- DCText text = this .at (spos ).newTextTree (s );
580
- fs .add (text );
581
- body .add ((DCTree ) next );
597
+ }
598
+
599
+ default -> {
600
+ // This ignores certain block tags if they appear first in the list,
601
+ // allowing the content of that tag to provide the first sentence.
602
+ // It would be better if other block tags always terminated the
603
+ // first sentence as well, like lists and tables.
604
+ if (isSentenceBreak (dt , isFirst )) {
605
+ body .add (dt );
582
606
foundFirstSentence = true ;
583
- continue ;
607
+ } else {
608
+ fs .add (dt );
584
609
}
585
610
}
611
+ }
612
+ }
613
+
614
+ // if there are remaining elements, then we have found the first
615
+ // sentence, and remaining elements are for the body.
616
+ while (iter .hasNext ()) {
617
+ body .add (iter .next ());
618
+ }
619
+
620
+ return new Pair <>(fs .toList (), body .toList ());
621
+ } finally {
622
+ m .pos = savedPos ;
623
+ }
624
+ }
625
+
626
+ /*
627
+ * Computes the first sentence break, a simple dot-space algorithm.
628
+ */
629
+ private int defaultSentenceBreak (String s ) {
630
+ // scan for period followed by whitespace
631
+ int period = -1 ;
632
+ for (int i = 0 ; i < s .length (); i ++) {
633
+ switch (s .charAt (i )) {
634
+ case '.' :
635
+ period = i ;
586
636
break ;
587
- default :
588
- if (isSentenceBreak (dt , isFirst )) {
589
- body .add ((DCTree ) dt );
590
- foundFirstSentence = true ;
591
- continue ;
637
+
638
+ case ' ' :
639
+ case '\f' :
640
+ case '\n' :
641
+ case '\r' :
642
+ case '\t' :
643
+ if (period >= 0 ) {
644
+ return i ;
592
645
}
593
646
break ;
647
+
648
+ default :
649
+ period = -1 ;
650
+ break ;
594
651
}
595
- fs .add ((DCTree ) dt );
596
652
}
597
- return new Pair <>(fs .toList (), body .toList ());
598
- } finally {
599
- this .pos = savedpos ;
653
+ return -1 ;
600
654
}
601
- }
602
655
603
- private boolean isTextTree (DocTree tree ) {
604
- return tree .getKind () == Kind .TEXT ;
605
- }
606
-
607
- /*
608
- * Computes the first sentence break, a simple dot-space algorithm.
609
- */
610
- private int defaultSentenceBreak (String s ) {
611
- // scan for period followed by whitespace
612
- int period = -1 ;
613
- for (int i = 0 ; i < s .length (); i ++) {
614
- switch (s .charAt (i )) {
615
- case '.' :
616
- period = i ;
617
- break ;
618
-
619
- case ' ' :
620
- case '\f' :
621
- case '\n' :
622
- case '\r' :
623
- case '\t' :
624
- if (period >= 0 ) {
625
- return i ;
626
- }
627
- break ;
656
+ /*
657
+ * Computes the first sentence, if using a default breaker,
658
+ * the break is returned, if not then a -1, indicating that
659
+ * more doctree elements are required to be examined.
660
+ *
661
+ * BreakIterator.next points to the start of the following sentence,
662
+ * and does not provide an easy way to disambiguate between "sentence break",
663
+ * "possible sentence break" and "not a sentence break" at the end of the input.
664
+ * For example, BreakIterator.next returns the index for the end
665
+ * of the string for all of these examples,
666
+ * using vertical bars to delimit the bounds of the example text
667
+ * |Abc| (not a valid end of sentence break, if followed by more text)
668
+ * |Abc.| (maybe a valid end of sentence break, depending on the following text)
669
+ * |Abc. | (maybe a valid end of sentence break, depending on the following text)
670
+ * |"Abc." | (maybe a valid end of sentence break, depending on the following text)
671
+ * |Abc. | (definitely a valid end of sentence break)
672
+ * |"Abc." | (definitely a valid end of sentence break)
673
+ * Therefore, we have to probe further to determine whether
674
+ * there really is a sentence break or not at the end of this run of text.
675
+ */
676
+ private int getSentenceBreak (String s , DCTree nextTree ) {
677
+ BreakIterator breakIterator = m .trees .getBreakIterator ();
678
+ if (breakIterator == null ) {
679
+ return defaultSentenceBreak (s );
680
+ }
681
+ breakIterator .setText (s );
682
+ final int sbrk = breakIterator .next ();
683
+ // This is the last doctree, found the droid we are looking for
684
+ if (nextTree == null ) {
685
+ return sbrk ;
686
+ }
628
687
629
- default :
630
- period = -1 ;
631
- break ;
688
+ // If the break is well within the span of the string ie. not
689
+ // at EOL, then we have a clear break.
690
+ if (sbrk < s .length () - 1 ) {
691
+ return sbrk ;
632
692
}
633
- }
634
- return -1 ;
635
- }
636
693
637
- /*
638
- * Computes the first sentence, if using a default breaker,
639
- * the break is returned, if not then a -1, indicating that
640
- * more doctree elements are required to be examined.
641
- *
642
- * BreakIterator.next points to the start of the following sentence,
643
- * and does not provide an easy way to disambiguate between "sentence break",
644
- * "possible sentence break" and "not a sentence break" at the end of the input.
645
- * For example, BreakIterator.next returns the index for the end
646
- * of the string for all of these examples,
647
- * using vertical bars to delimit the bounds of the example text
648
- * |Abc| (not a valid end of sentence break, if followed by more text)
649
- * |Abc.| (maybe a valid end of sentence break, depending on the following text)
650
- * |Abc. | (maybe a valid end of sentence break, depending on the following text)
651
- * |"Abc." | (maybe a valid end of sentence break, depending on the following text)
652
- * |Abc. | (definitely a valid end of sentence break)
653
- * |"Abc." | (definitely a valid end of sentence break)
654
- * Therefore, we have to probe further to determine whether
655
- * there really is a sentence break or not at the end of this run of text.
656
- */
657
- private int getSentenceBreak (String s , DocTree dt ) {
658
- BreakIterator breakIterator = trees .getBreakIterator ();
659
- if (breakIterator == null ) {
660
- return defaultSentenceBreak (s );
661
- }
662
- breakIterator .setText (s );
663
- final int sbrk = breakIterator .next ();
664
- // This is the last doctree, found the droid we are looking for
665
- if (dt == null ) {
666
- return sbrk ;
667
- }
694
+ if (nextTree .getKind () == Kind .TEXT ) {
695
+ // Two adjacent text trees, a corner case, perhaps
696
+ // produced by a tool synthesizing a doctree. In
697
+ // this case, does the break lie within the first span,
698
+ // then we have the droid, otherwise allow the callers
699
+ // logic to handle the break in the adjacent doctree.
700
+ TextTree ttnext = (TextTree ) nextTree ;
701
+ String combined = s + ttnext .getBody ();
702
+ breakIterator .setText (combined );
703
+ int sbrk2 = breakIterator .next ();
704
+ if (sbrk < sbrk2 ) {
705
+ return sbrk ;
706
+ }
707
+ }
668
708
669
- // If the break is well within the span of the string ie. not
670
- // at EOL, then we have a clear break.
671
- if (sbrk < s .length () - 1 ) {
672
- return sbrk ;
673
- }
709
+ // Is the adjacent tree a sentence breaker ?
710
+ if (isSentenceBreak (nextTree , false )) {
711
+ return sbrk ;
712
+ }
674
713
675
- if (isTextTree (dt )) {
676
- // Two adjacent text trees, a corner case, perhaps
677
- // produced by a tool synthesizing a doctree. In
678
- // this case, does the break lie within the first span,
679
- // then we have the droid, otherwise allow the callers
680
- // logic to handle the break in the adjacent doctree.
681
- TextTree ttnext = (TextTree ) dt ;
682
- String combined = s + ttnext .getBody ();
714
+ // At this point the adjacent tree is either a javadoc tag ({@..),
715
+ // html tag (<..) or an entity (&..). Perform a litmus test, by
716
+ // concatenating a sentence, to validate the break earlier identified.
717
+ String combined = s + "Dummy Sentence." ;
683
718
breakIterator .setText (combined );
684
719
int sbrk2 = breakIterator .next ();
685
- if (sbrk < sbrk2 ) {
686
- return sbrk ;
720
+ if (sbrk2 <= sbrk ) {
721
+ return sbrk2 ;
687
722
}
723
+ return -1 ; // indeterminate at this time
688
724
}
689
725
690
- // Is the adjacent tree a sentence breaker ?
691
- if (isSentenceBreak (dt , false )) {
692
- return sbrk ;
693
- }
694
-
695
- // At this point the adjacent tree is either a javadoc tag ({@..),
696
- // html tag (<..) or an entity (&..). Perform a litmus test, by
697
- // concatenating a sentence, to validate the break earlier identified.
698
- String combined = s + "Dummy Sentence." ;
699
- breakIterator .setText (combined );
700
- int sbrk2 = breakIterator .next ();
701
- if (sbrk2 <= sbrk ) {
702
- return sbrk2 ;
726
+ private boolean isSentenceBreak (DCTree dt , boolean isFirstDocTree ) {
727
+ switch (dt .getKind ()) {
728
+ case START_ELEMENT :
729
+ StartElementTree set = (StartElementTree ) dt ;
730
+ return !isFirstDocTree && dt .pos > 1 && isSentenceBreak (set .getName ());
731
+ case END_ELEMENT :
732
+ EndElementTree eet = (EndElementTree ) dt ;
733
+ return !isFirstDocTree && dt .pos > 1 && isSentenceBreak (eet .getName ());
734
+ default :
735
+ return false ;
736
+ }
703
737
}
704
- return -1 ; // indeterminate at this time
705
- }
706
-
707
- private boolean isSentenceBreak (Name tagName ) {
708
- return sentenceBreakTags .contains (StringUtils .toUpperCase (tagName .toString ()));
709
- }
710
738
711
- private boolean isSentenceBreak (DocTree dt , boolean isFirstDocTree ) {
712
- switch (dt .getKind ()) {
713
- case START_ELEMENT :
714
- StartElementTree set = (StartElementTree )dt ;
715
- return !isFirstDocTree && ((DCTree ) dt ).pos > 1 && isSentenceBreak (set .getName ());
716
- case END_ELEMENT :
717
- EndElementTree eet = (EndElementTree )dt ;
718
- return !isFirstDocTree && ((DCTree ) dt ).pos > 1 && isSentenceBreak (eet .getName ());
719
- default :
720
- return false ;
739
+ private boolean isSentenceBreak (Name tagName ) {
740
+ return sentenceBreakTags .contains (StringUtils .toUpperCase (tagName .toString ()));
721
741
}
722
- }
723
742
724
- /*
725
- * Returns the position of the first non-whitespace character.
726
- */
727
- private int skipWhiteSpace (String s , int start ) {
728
- for (int i = start ; i < s .length (); i ++) {
729
- char c = s .charAt (i );
730
- if (!Character .isWhitespace (c )) {
731
- return i ;
743
+ /*
744
+ * Returns the position of the first non-whitespace character.
745
+ */
746
+ private int skipWhiteSpace (String s , int start ) {
747
+ for (int i = start ; i < s .length (); i ++) {
748
+ char c = s .charAt (i );
749
+ if (!Character .isWhitespace (c )) {
750
+ return i ;
751
+ }
732
752
}
753
+ return -1 ;
733
754
}
734
- return -1 ;
735
755
}
736
756
737
- @ SuppressWarnings ("unchecked" )
738
- private List <DCTree > cast (List <? extends DocTree > list ) {
739
- return (List <DCTree >) list ;
740
- }
741
757
}
1 commit comments
openjdk-notifier[bot] commentedon Mar 20, 2023
Review
Issues